Example #1
 def test_variable_tracking(self):
     x = tf.Variable(1.)
     b = ForwardOnlyBijector(scale=x, validate_args=True)
     self.assertIsInstance(b, tf.Module)
     self.assertEqual((x, ), b.trainable_variables)
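
Note: the tracking behavior this test relies on comes from tf.Module, which auto-collects any tf.Variable assigned as an attribute. A minimal standalone sketch (the `Scaler` class here is hypothetical, not part of the code under test):

import tensorflow as tf

class Scaler(tf.Module):
    """Hypothetical module; tf.Module auto-tracks Variable attributes."""

    def __init__(self):
        super().__init__()
        self.scale = tf.Variable(1.)

scaler = Scaler()
assert scaler.trainable_variables == (scaler.scale,)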
Example #2
 def testAssertsPositiveDf(self):
     df = tf.Variable([1., 2., -3.])
     with self.assertRaisesOpError('Argument `df` must be positive.'):
         d = tfd.Chi2(df, validate_args=True)
         self.evaluate([v.initializer for v in d.variables])
         self.evaluate(d.sample())
Example #3
 def test_broadcasting_during_direct_setting_with_variables_fails(self):
     with self.assertRaisesRegex(ValueError, "passing a Variable"):
         _ = normalization.Normalization(axis=-1,
                                         mean=tf.Variable([1.0]),
                                         variance=tf.Variable([2.0]))
Example #4
    def set_model(self, model):
        """Sets Keras model and creates summary ops."""

        self.model = model
        self._init_writer(model)
        # histogram summaries only enabled in graph mode
        if not tf.executing_eagerly():
            self._make_histogram_ops(model)
            self.merged = tf.compat.v1.summary.merge_all()

        # If both embeddings_freq and embeddings_data are available, we will
        # visualize embeddings.
        if self.embeddings_freq and self.embeddings_data is not None:
            # Avoid circular dependency.
            from keras.engine import training_utils_v1  # pylint: disable=g-import-not-at-top

            self.embeddings_data = training_utils_v1.standardize_input_data(
                self.embeddings_data, model.input_names)

            # If embeddings_layer_names is not provided, get all of the embedding
            # layers from the model.
            embeddings_layer_names = self.embeddings_layer_names
            if not embeddings_layer_names:
                embeddings_layer_names = [
                    layer.name for layer in self.model.layers
                    if type(layer).__name__ == "Embedding"
                ]

            self.assign_embeddings = []
            embeddings_vars = {}

            self.batch_id = batch_id = tf.compat.v1.placeholder(tf.int32)
            self.step = step = tf.compat.v1.placeholder(tf.int32)

            for layer in self.model.layers:
                if layer.name in embeddings_layer_names:
                    embedding_input = self.model.get_layer(layer.name).output
                    embedding_size = np.prod(embedding_input.shape[1:])
                    embedding_input = tf.reshape(embedding_input,
                                                 (step, int(embedding_size)))
                    shape = (
                        self.embeddings_data[0].shape[0],
                        int(embedding_size),
                    )
                    embedding = tf.Variable(tf.zeros(shape),
                                            name=layer.name + "_embedding")
                    embeddings_vars[layer.name] = embedding
                    batch = tf.compat.v1.assign(
                        embedding[batch_id:batch_id + step], embedding_input)
                    self.assign_embeddings.append(batch)

            self.saver = tf.compat.v1.train.Saver(
                list(embeddings_vars.values()))

            # Create embeddings_metadata dictionary
            if isinstance(self.embeddings_metadata, str):
                embeddings_metadata = {
                    layer_name: self.embeddings_metadata
                    for layer_name in embeddings_vars.keys()
                }
            else:
                # If embedding_metadata is already a dictionary
                embeddings_metadata = self.embeddings_metadata

            try:
                from tensorboard.plugins import projector
            except ImportError:
                raise ImportError(
                    "Failed to import TensorBoard. Please make sure that "
                    "TensorBoard integration is complete.")

            # TODO(psv): Add integration tests to test embedding visualization
            # with TensorBoard callback. We are unable to write a unit test for this
            # because TensorBoard dependency assumes TensorFlow package is installed.
            config = projector.ProjectorConfig()
            for layer_name, tensor in embeddings_vars.items():
                embedding = config.embeddings.add()
                embedding.tensor_name = tensor.name

                if (embeddings_metadata is not None
                        and layer_name in embeddings_metadata):
                    embedding.metadata_path = embeddings_metadata[layer_name]

            projector.visualize_embeddings(self.writer, config)
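
Outside of a Keras callback, the projector wiring above reduces to a few lines. A hedged sketch (tensor name, metadata path, and log directory are illustrative; newer TensorBoard versions accept a logdir string instead of a writer):

from tensorboard.plugins import projector

config = projector.ProjectorConfig()
emb = config.embeddings.add()
emb.tensor_name = "my_embedding"    # illustrative variable name
emb.metadata_path = "metadata.tsv"  # illustrative metadata file
projector.visualize_embeddings("logs/", config)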
Example #5
    def test_latent_dirichlet_allocation(self):
        """Tests Latent Dirichlet Allocation joint model.

    The LDA generative process can be written as:

    ```none
    N[i] ~ Poisson(xi)
    theta[i] ~ Dirichlet(alpha)
    Z[i] ~ Multinomial(N[i], theta[i])
    for k in 1...K:
      X[i,k] ~ Multinomial(Z[i, k], beta[k])
    ```

    Typically `xi` is specified and `alpha`, `beta` are fit using type-II
    maximum likelihood estimators.

    Reference: http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf
    """

        # Hyperparameters.
        num_topics = 3
        num_words = 10
        avg_doc_length = 5
        u = tfd.Uniform(low=-1., high=1.)
        alpha = tfp.util.DeferredTensor(
            tf.math.softplus,
            tf.Variable(u.sample([num_topics]), name='raw_alpha'))
        beta = tf.Variable(u.sample([num_topics, num_words]), name='beta')

        # LDA Model.
        # Note near 1:1 with mathematical specification. The main distinction is the
        # use of Independent--this lets us easily aggregate multinomials across
        # topics (and in any "shape" of documents).
        lda = tfd.JointDistributionSequential([
            tfd.Poisson(rate=avg_doc_length),  # n
            tfd.Dirichlet(concentration=alpha),  # theta
            lambda theta, n: tfd.Multinomial(total_count=n, probs=theta),  # z
            lambda z: tfd.Independent(  # x  pylint: disable=g-long-lambda
                tfd.Multinomial(total_count=z, logits=beta),
                reinterpreted_batch_ndims=1),
        ])

        # Now, let's sample some "documents" and compute the log-prob of each.
        docs_shape = [2, 4]  # That is, 8 docs in the shape of [2, 4].
        [n, theta, z, x] = lda.sample(docs_shape)
        log_probs = lda.log_prob([n, theta, z, x])
        self.assertEqual(docs_shape, log_probs.shape)

        # Verify we correctly track trainable variables.
        self.assertLen(lda.trainable_variables, 2)
        self.assertIs(alpha.pretransformed_input, lda.trainable_variables[0])
        self.assertIs(beta, lda.trainable_variables[1])

        # Ensure we can compute gradients.
        with tf.GradientTape() as tape:
            # Note: The samples are not taped, hence implicitly "stop_gradient."
            negloglik = -lda.log_prob([n, theta, z, x])
        grads = tape.gradient(negloglik, lda.trainable_variables)

        self.assertLen(grads, 2)
        self.assertAllEqual((alpha.pretransformed_input.shape, beta.shape),
                            (grads[0].shape, grads[1].shape))
        self.assertAllNotNone(grads)
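
The docstring mentions fitting `alpha` and `beta` by type-II maximum likelihood. A minimal, illustrative fitting loop over the objects defined above (in practice you would optimize over fresh document samples rather than a single fixed draw):

optimizer = tf.keras.optimizers.Adam(learning_rate=0.05)
for _ in range(100):
    with tf.GradientTape() as tape:
        nll = -tf.reduce_sum(lda.log_prob([n, theta, z, x]))
    grads = tape.gradient(nll, lda.trainable_variables)
    optimizer.apply_gradients(zip(grads, lda.trainable_variables))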
Example #6
def main(_):
    # get logger
    save_path = FLAGS.save_dir
    if FLAGS.save_logs:
        if not tf.gfile.Exists(os.path.join(save_path, 'train.log')):
            tf.gfile.MakeDirs(save_path)
            write_mode = 'w'
        else:
            write_mode = 'a'
        stream = tf.gfile.Open(os.path.join(save_path, 'train.log'),
                               write_mode)
        log_handler = native_logging.StreamHandler(stream)
        print('Saving logs in {}'.format(save_path))
    else:
        log_handler = native_logging.StreamHandler(sys.stdout)
    formatter = native_logging.Formatter(
        '%(asctime)s %(levelname)-8s %(message)s')
    log_handler.setFormatter(formatter)
    log_handler.setLevel(logging.INFO)
    logger = logging.get_absl_logger()
    logger.addHandler(log_handler)

    # set up tf.summary
    train_log_dir = save_path + '/train'
    valid_log_dir = save_path + '/valid'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)
    valid_summary_writer = tf.summary.create_file_writer(valid_log_dir)

    # load data
    dataset_path = os.path.join(FLAGS.data_dir, FLAGS.dataset)
    dataset = DatasetClass(dataset_path, FLAGS.debug)
    sizes = dataset.get_shape()
    train_examples_reversed = dataset.get_examples('train')
    valid_examples = dataset.get_examples('valid')
    test_examples = dataset.get_examples('test')
    filters = dataset.get_filters()
    logging.info('\t Dataset shape: %s', (str(sizes)))

    # save config
    config_path = os.path.join(save_path, 'config.json')
    if FLAGS.save_logs and not tf.gfile.Exists(config_path):
        with tf.gfile.Open(config_path, 'w') as fjson:
            json.dump(train_utils.get_config_dict(CONFIG), fjson)

    # create and build model
    tf.keras.backend.set_floatx(FLAGS.dtype)
    model = getattr(models, FLAGS.model)(sizes, FLAGS)
    model.build(input_shape=(1, 2))
    trainable_params = train_utils.count_params(model)
    trainer = CFTrainer(sizes, FLAGS)
    logging.info('\t Total number of trainable parameters %s',
                 (trainable_params))

    # restore or create checkpoint
    if FLAGS.save_model:
        ckpt = tf.train.Checkpoint(step=tf.Variable(0),
                                   optimizer=trainer.optimizer,
                                   net=model)
        manager = tf.train.CheckpointManager(ckpt, save_path, max_to_keep=1)
        if manager.latest_checkpoint:
            ckpt.restore(manager.latest_checkpoint)
            logging.info('\t Restored from %s', (manager.latest_checkpoint))
        else:
            logging.info('\t Initializing from scratch.')
    else:
        logging.info('\t Initializing from scratch.')

    # train model
    logging.info('\t Start training')
    early_stopping_counter = 0
    best_mrr = None
    best_epoch = None
    best_weights = None
    if FLAGS.save_model:
        epoch = ckpt.step
    else:
        epoch = 0

    if int(epoch) < FLAGS.max_epochs:
        while int(epoch) < FLAGS.max_epochs:
            if FLAGS.save_model:
                epoch.assign_add(1)
            else:
                epoch += 1

            # Train step
            start = time.perf_counter()
            train_batch = train_examples_reversed.batch(FLAGS.batch_size)
            train_loss = trainer.train_step(model, train_batch).numpy()
            end = time.perf_counter()
            execution_time = (end - start)
            logging.info('\t Epoch %i | train loss: %.4f | total time: %.4f',
                         int(epoch), train_loss, execution_time)
            with train_summary_writer.as_default():
                tf.summary.scalar('loss', train_loss, step=epoch)

            if FLAGS.save_model and int(epoch) % FLAGS.checkpoint == 0:
                ckpt_path = manager.save()
                logging.info('\t Saved checkpoint for epoch %i: %s',
                             int(epoch), ckpt_path)

            if int(epoch) % FLAGS.valid == 0:
                # compute valid loss
                valid_batch = valid_examples.batch(FLAGS.batch_size)
                valid_loss = trainer.valid_step(model, valid_batch).numpy()
                logging.info('\t Epoch %i | average valid loss: %.4f',
                             int(epoch), valid_loss)
                with valid_summary_writer.as_default():
                    tf.summary.scalar('loss', valid_loss, step=epoch)

                # compute validation metrics
                valid = train_utils.metric_dict_full_and_random(
                    *model.random_eval(valid_examples, filters))
                logging.info(train_utils.format_metrics(valid, split='valid'))
                with valid_summary_writer.as_default():
                    tf.summary.scalar('mrs', valid['MR'], step=epoch)
                    tf.summary.scalar('mrrs', valid['MRR'], step=epoch)
                    tf.summary.scalar('hits@[1]',
                                      valid['hits@[1,3,10]'][1],
                                      step=epoch)
                    tf.summary.scalar('hits@[3]',
                                      valid['hits@[1,3,10]'][3],
                                      step=epoch)
                    tf.summary.scalar('hits@[10]',
                                      valid['hits@[1,3,10]'][10],
                                      step=epoch)
                    tf.summary.scalar('mrs_r', valid['MR_r'], step=epoch)
                    tf.summary.scalar('mrrs_r', valid['MRR_r'], step=epoch)
                    tf.summary.scalar('hits@[1]_r',
                                      valid['hits@[1,3,10]_r'][1],
                                      step=epoch)
                    tf.summary.scalar('hits@[3]_r',
                                      valid['hits@[1,3,10]_r'][3],
                                      step=epoch)
                    tf.summary.scalar('hits@[10]_r',
                                      valid['hits@[1,3,10]_r'][10],
                                      step=epoch)

                # early stopping
                valid_mrr = valid['MRR']
                if not best_mrr or valid_mrr > best_mrr:
                    best_mrr = valid_mrr
                    early_stopping_counter = 0
                    best_epoch = int(epoch)
                    best_weights = copy.copy(model.get_weights())
                else:
                    early_stopping_counter += 1
                    if early_stopping_counter == FLAGS.patience:
                        logging.info('\t Early stopping')
                        break

        logging.info('\t Optimization finished')
        logging.info('\t Evaluating best model from epoch %s', best_epoch)
        model.set_weights(best_weights)
        if FLAGS.save_model:
            model.save_weights(os.path.join(save_path, 'best_model.ckpt'))

        # validation metrics
        valid = train_utils.metric_dict_full_and_random(
            *model.random_eval(valid_examples, filters))
        logging.info(train_utils.format_metrics(valid, split='valid'))

        # test metrics
        test = train_utils.metric_dict_full_and_random(
            *model.random_eval(test_examples, filters))
        logging.info(train_utils.format_metrics(test, split='test'))
    else:
        logging.info('\t Training completed')
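
The checkpoint handling above follows the standard tf.train.Checkpoint / CheckpointManager pattern; a condensed standalone sketch (directory path illustrative):

import tensorflow as tf

step = tf.Variable(0)
ckpt = tf.train.Checkpoint(step=step)
manager = tf.train.CheckpointManager(ckpt, '/tmp/ckpts', max_to_keep=1)
ckpt.restore(manager.latest_checkpoint)  # No-op when no checkpoint exists yet.
step.assign_add(1)
manager.save()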
Example #7
 def make_integer_input(self, number):
     if self.use_static_shape:
         return number
     output = tf.Variable(number, dtype=tf.int32)
     self.evaluate(output.initializer)
     return output
Example #8
	flag = True
	indeces_ = []
	while flag:
		pair_ = [int(a) for a in list(input()) if a != ' ']
		if any(a > max_indeces for a in pair_):
			raise ValueError('Entered index exceeds the number of available parameters!')

		print(pair_)
		if pair_[0] == 0 or pair_[1] == 0:
			flag = False
		else:
			indeces_.append(pair_)

	return indeces_

#indeces_ = create_a_list_indeces(num_of_parameters)
#print([(iter_1, iter_2) \
#	for iter_1, iter_2 in indeces_])


markov_theory = Markovitz_theory()
markov_theory._initial_model_param_()

funct_obj = wrapper_fuzzy_constraints_(markov_theory)
x = tf.Variable(np.random.random(markov_theory.weights.shape[0]))
print(x.shape)
with tf.GradientTape() as g:
	g.watch(x)  # Redundant for a tf.Variable (watched automatically); needed only for plain Tensors.
	f = funct_obj(x)
grad_ = g.gradient(f, x)  # Query the gradient outside the recording context.
print(grad_.numpy())
Example #9
 def double_module_fn():
     w = tf.Variable([2.0, 4.0])
     x = tf.compat.v1.placeholder(dtype=tf.float32)
     hub.add_signature(inputs=x, outputs=x * w)
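
A module_fn like this is consumed through the TF1-style Hub API; a hedged usage sketch:

import tensorflow as tf
import tensorflow_hub as hub

spec = hub.create_module_spec(double_module_fn)
with tf.Graph().as_default():
    module = hub.Module(spec)
    out = module(tf.constant([1.0, 2.0]))
    with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        print(sess.run(out))  # approx. [2.0, 8.0]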
Example #10
    def __init__(self,
                 prior,
                 coding_rank,
                 compression=False,
                 tail_mass=2**-8,
                 range_coder_precision=12,
                 no_variables=False):
        """Initializer.

    Arguments:
      prior: A `tfp.distributions.Distribution` object. A density model fitting
        the marginal distribution of the bottleneck data with additive uniform
        noise, which is shared a priori between the sender and the receiver. For
        best results, the distribution should be flexible enough to have a
        unit-width uniform distribution as a special case, since this is the
        marginal distribution for bottleneck dimensions that are constant. The
        distribution parameters may not depend on data (they must be either
        variables or constants).
      coding_rank: Integer. Number of innermost dimensions considered a coding
        unit. Each coding unit is compressed to its own bit string, and the
        `bits()` method sums over each coding unit.
      compression: Boolean. If set to `True`, the range coding tables used by
        `compress()` and `decompress()` will be built on instantiation. If set
        to `False`, these two methods will not be accessible.
      tail_mass: Float. Approximate probability mass which is range encoded
        with less precision by using a Golomb-like code.
      range_coder_precision: Integer. Precision passed to the range coding op.
      no_variables: Boolean. If True, creates range coding tables as `Tensor`s
        rather than `Variable`s.

    Raises:
      RuntimeError: when attempting to instantiate an entropy model with
        `compression=True` and not in eager execution mode.
    """
        if coding_rank < prior.batch_shape.rank:
            raise ValueError(
                "`coding_rank` can't be smaller than batch rank of prior.")
        super().__init__(
            prior=prior,
            coding_rank=coding_rank,
            compression=compression,
            tail_mass=tail_mass,
            range_coder_precision=range_coder_precision,
            no_variables=no_variables,
        )

        quantization_offset = helpers.quantization_offset(prior)
        # Optimization: if the quantization offset is zero, we don't need to
        # subtract/add it when quantizing, and we don't need to serialize its value.
        # Note that this code will only work in eager mode.
        # TODO(jonycgn): Reconsider if this optimization is worth keeping once the
        # implementation is stable.
        if tf.executing_eagerly() and tf.reduce_all(
                tf.equal(quantization_offset, 0.)):
            quantization_offset = None
        else:
            quantization_offset = tf.broadcast_to(quantization_offset,
                                                  self.prior_shape_tensor)
            if self.compression and not self.no_variables:
                quantization_offset = tf.Variable(quantization_offset,
                                                  trainable=False,
                                                  name="quantization_offset")
        self._quantization_offset = quantization_offset
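
To see what the offset buys: quantization rounds relative to the offset so reconstruction centers line up with the prior. A toy sketch of the idea (not the library's exact routine):

import tensorflow as tf

def quantize(x, offset=None):
    # Rounding relative to the offset keeps reconstruction centers aligned.
    if offset is None:
        return tf.round(x)
    return tf.round(x - offset) + offset

x = tf.constant([0.4, 1.6, -0.7])
print(quantize(x, offset=tf.constant(0.25)).numpy())  # [ 0.25  1.25 -0.75]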
Example #11
    def _benchmark(self, gradient_type, num_gpus, mode, loss_scaling):
        """Benchmarks loss scaling.

    We run a simple model with several scalar variables. The loss is the sum of
    all variables. The model is simple because we want to measure only the
    performance of loss scaling, not the performance of the model itself.

    Args:
      gradient_type: "optimizer" or "gradient_tape". How gradients are computed.
        "optimizer" uses Optimizer.minimize. "gradient_tape" uses
        GradientTape.gradient along with LossScaleOptimizer.get_scaled_loss and
        LossScaleOptimizer.get_unscaled_gradients.
      num_gpus: The number of GPUs to use. Must be at least 1.
      mode: "eager" or "tf_function". "tf_function" causes all computations to
        be wrapped in a tf.function, while "eager" runs computations eagerly.
      loss_scaling: "fixed", "dynamic", or None. The type of loss scaling to
        use. None means use no loss scaling, which is useful as a baseline to
        see how much slower loss scaling is in comparison.
    """
        ls_str = loss_scaling or 'no_loss_scaling'
        name = '%s_%d_GPU_%s_%s' % (gradient_type, num_gpus, mode, ls_str)
        with tf.__internal__.eager_context.eager_mode(), _get_strategy(
                num_gpus).scope() as strategy:
            opt = adam.Adam()
            if loss_scaling == 'fixed':
                loss_scale = tf.mixed_precision.experimental.FixedLossScale(2.)
            elif loss_scaling == 'dynamic':
                # Make increment_period so high that it's effectively infinite. This
                # means the loss scale will never change. Any performance overhead
                # from increasing/decreasing the loss scale is typically negligible
                # since it happens infrequently, so we only benchmark the common case
                # of the loss scale not changing.
                increment_period = 1000000
                loss_scale = tf.mixed_precision.experimental.DynamicLossScale(
                    initial_loss_scale=2., increment_period=increment_period)
            else:
                assert loss_scaling is None
                loss_scale = None
            if loss_scale:
                opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale)

            num_vars = 200
            num_warmup_iters = 1
            num_iters = 20
            # By using scalar variables, we reduce overhead of the actual GPU work of
            # multiplying variables, dividing gradients, and checking gradients for
            # NaNs. Measuring these overheads isn't very useful as there is little we
            # can do to reduce them (one such way would be to fuse dividing gradients
            # and checking them for NaNs). We still have all other overheads, such as
            # all-reducing the `is_finite` values and having a tf.cond or
            # tf.while_loop based on whether gradients are NaNs. Currently, these
            # other overheads are much more significant than the GPU work.
            var_list = [
                tf.Variable(i, dtype='float32') for i in range(num_vars)
            ]

            def get_loss():
                return tf.add_n(var_list)

            if gradient_type == 'gradient_tape':
                if loss_scale is None:

                    def minimize_fn():
                        with tf.GradientTape() as tape:
                            loss = get_loss()
                        grads = tape.gradient(loss, var_list)
                        return opt.apply_gradients(zip(grads, var_list))
                else:

                    def minimize_fn():
                        with tf.GradientTape() as tape:
                            loss = get_loss()
                            scaled_loss = opt.get_scaled_loss(loss)
                        scaled_grads = tape.gradient(scaled_loss, var_list)
                        grads = opt.get_unscaled_gradients(scaled_grads)
                        return opt.apply_gradients(zip(grads, var_list))
            else:
                assert gradient_type == 'optimizer'

                def minimize_fn():
                    return opt.minimize(get_loss, var_list)

            def run_fn():
                strategy.run(minimize_fn)

            if mode == 'tf_function':
                run_fn = tf.function(run_fn)

            for _ in range(num_warmup_iters):
                run_fn()

            start = time.time()
            for _ in range(num_iters):
                run_fn()
            end = time.time()
            self.report_benchmark(iters=num_iters,
                                  wall_time=(end - start) / num_iters,
                                  name=name)
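
The gradient_tape branch above boils down to the following generic loss-scaling pattern (a sketch using the current tf.keras.mixed_precision API rather than the experimental one in the benchmark):

import tensorflow as tf

opt = tf.keras.mixed_precision.LossScaleOptimizer(tf.keras.optimizers.Adam())
v = tf.Variable(1.0)

with tf.GradientTape() as tape:
    loss = v * v
    scaled_loss = opt.get_scaled_loss(loss)   # Multiply loss by the loss scale.
scaled_grads = tape.gradient(scaled_loss, [v])
grads = opt.get_unscaled_gradients(scaled_grads)  # Divide gradients back down.
opt.apply_gradients(zip(grads, [v]))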
Example #12
 def __init__(self):
   self.v = tf.Variable(2.)
Example #13
 def __init__(self):
   super(ResourcesOpsModule, self).__init__()
   self.counter = tf.Variable(0.0)
Example #14
    def test_latent_dirichlet_allocation(self, jd_class):  # pylint: disable=g-doc-args
        """Tests Latent Dirichlet Allocation joint model.

    The LDA generative process can be written as:

    ```none
    N[i] ~ Poisson(xi)
    theta[i] ~ Dirichlet(alpha)
    Z[i] ~ Multinomial(N[i], theta[i])
    for k in 1...K:
      X[i,k] ~ Multinomial(Z[i, k], beta[k])
    ```

    Typically `xi` is specified and `alpha`, `beta` are fit using type-II
    maximum likelihood estimators.

    Reference: http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf
    """
        seed = test_util.test_seed_stream()
        # Hyperparameters.
        num_topics = 3
        num_words = 10
        avg_doc_length = 5
        u = tfd.Uniform(low=-1., high=1.)
        alpha = tfp.util.TransformedVariable(u.sample([num_topics],
                                                      seed=seed()),
                                             tfb.Softplus(),
                                             name='alpha')
        beta = tf.Variable(u.sample([num_topics, num_words], seed=seed()),
                           name='beta')

        # Note near 1:1 with mathematical specification. The main distinction is the
        # use of Independent--this lets us easily aggregate multinomials across
        # topics (and in any "shape" of documents).
        def lda_coroutine_model():
            n = yield Root(tfd.Poisson(rate=avg_doc_length))
            theta = yield Root(tfd.Dirichlet(concentration=alpha))
            z = yield tfd.Multinomial(total_count=n, probs=theta)
            yield tfd.Multinomial(total_count=z, logits=beta)

        if jd_class is tfd.JointDistributionCoroutineAutoBatched:
            model = lda_coroutine_model
        elif jd_class is tfd.JointDistributionSequentialAutoBatched:
            model = [
                tfd.Poisson(rate=avg_doc_length),  # n
                tfd.Dirichlet(concentration=alpha),  # theta
                lambda theta, n: tfd.Multinomial(total_count=n, probs=theta
                                                 ),  # z
                lambda z: tfd.Multinomial(total_count=z, logits=beta)
            ]
        elif jd_class is tfd.JointDistributionNamedAutoBatched:
            model = collections.OrderedDict((
                ('n', tfd.Poisson(rate=avg_doc_length)),
                ('theta', tfd.Dirichlet(concentration=alpha)),
                ('z',
                 lambda theta, n: tfd.Multinomial(total_count=n, probs=theta)),
                ('X', lambda z: tfd.Multinomial(total_count=z, logits=beta))))

        # TODO(b/159842104): Enable autovectorization for Multinomial sampling.
        lda = jd_class(model, validate_args=True, use_vectorized_map=False)

        # Now, let's sample some "documents" and compute the log-prob of each.
        docs_shape = [2, 4]  # That is, 8 docs in the shape of [2, 4].
        sample = lda.sample(docs_shape, seed=seed())
        log_probs = lda.log_prob(sample)
        self.assertEqual(docs_shape, log_probs.shape)

        # Verify we correctly track trainable variables.
        self.assertLen(lda.trainable_variables, 2)
        self.assertIs(alpha.pretransformed_input, lda.trainable_variables[0])
        self.assertIs(beta, lda.trainable_variables[1])

        # Ensure we can compute gradients.
        with tf.GradientTape() as tape:
            # Note: The samples are not taped, hence implicitly "stop_gradient."
            negloglik = -lda.log_prob(sample)
        grads = tape.gradient(negloglik, lda.trainable_variables)

        self.assertLen(grads, 2)
        self.assertAllEqual((alpha.pretransformed_input.shape, beta.shape),
                            (grads[0].shape, grads[1].shape))
        self.assertAllNotNone(grads)
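
tfp.util.TransformedVariable, used for `alpha` above, keeps an unconstrained variable and its constrained view in sync; a minimal standalone sketch:

import tensorflow as tf
import tensorflow_probability as tfp

alpha = tfp.util.TransformedVariable(1., tfp.bijectors.Softplus(), name='alpha')
print(tf.convert_to_tensor(alpha))  # Constrained value: 1.0.
print(alpha.pretransformed_input)   # Raw variable: softplus_inverse(1.0) ~ 0.5413.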
Example #15
def broadcasting_params(draw,
                        batch_shape,
                        params_event_ndims,
                        event_dim=None,
                        enable_vars=False,
                        constraint_fn_for=lambda param: identity_fn,
                        mutex_params=()):
    """Streategy for drawing parameters which jointly have the given batch shape.

  Specifically, the batch shapes of the returned parameters will broadcast to
  the requested batch shape.

  The dtypes of the returned parameters are determined by their respective
  constraint functions.

  Args:
    draw: Hypothesis strategy sampler supplied by `@hps.composite`.
    batch_shape: A `TensorShape`.  The returned parameters' batch shapes will
      broadcast to this.
    params_event_ndims: Python `dict` mapping the name of each parameter to a
      Python `int` giving the event ndims for that parameter.
    event_dim: Optional Python int giving the size of each parameter's event
      dimensions (except where overridden by any applicable constraint
      functions).  This is shared across all parameters, permitting square event
      matrices, compatible location and scale Tensors, etc. If omitted,
      Hypothesis will choose one.
    enable_vars: TODO(bjp): Make this `True` all the time and put variable
      initialization in slicing_test.  If `False`, the returned parameters are
      all Tensors, never Variables or DeferredTensor.
    constraint_fn_for: Python callable mapping parameter name to constraint
      function.  The latter is itself a Python callable which converts an
      unconstrained Tensor (currently with float32 values from -200 to +200)
      into one that meets the parameter's validity constraints.
    mutex_params: Python iterable of Python sets.  Each set gives a clique of
      mutually exclusive parameters (e.g., the 'probs' and 'logits' of a
      Categorical).  At most one parameter from each set will appear in the
      result.

  Returns:
    params: A Hypothesis strategy for drawing Python `dict`s mapping parameter
      name to a Tensor, Variable, or DeferredTensor.  The batch shapes of the
      returned parameters broadcast together to the supplied `batch_shape`.
      Only parameters whose names appear as keys in `params_event_ndims` will
      appear (but possibly not all of them, depending on `mutex_params`).
  """
    if event_dim is None:
        event_dim = draw(hps.integers(min_value=2, max_value=6))

    params_event_ndims = params_event_ndims or {}
    remaining_params = set(params_event_ndims.keys())
    params_to_use = []
    while remaining_params:
        param = draw(hps.sampled_from(sorted(remaining_params)))
        params_to_use.append(param)
        remaining_params.remove(param)
        for mutex_set in mutex_params:
            if param in mutex_set:
                remaining_params -= mutex_set

    param_batch_shapes = draw(
        broadcasting_named_shapes(batch_shape, params_to_use))
    params_kwargs = dict()
    for param in params_to_use:
        param_batch_shape = param_batch_shapes[param]
        param_event_rank = params_event_ndims[param]
        param_shape = (tensorshape_util.as_list(param_batch_shape) +
                       [event_dim] * param_event_rank)

        # Reduce our risk of exceeding TF kernel broadcast limits.
        hp.assume(len(param_shape) < 6)

        # TODO(axch): Can I replace `params_event_ndims` and `constraint_fn_for`
        # with a map from params to `Support`s, and use `tensors_in_support` here
        # instead of this explicit `constrained_tensors` function?
        param_strategy = constrained_tensors(constraint_fn_for(param),
                                             param_shape)
        params_kwargs[param] = tf.convert_to_tensor(draw(param_strategy),
                                                    dtype_hint=tf.float32,
                                                    name=param)
        if enable_vars and draw(hps.booleans()):
            params_kwargs[param] = tf.Variable(params_kwargs[param],
                                               name=param)
            alt_value = tf.convert_to_tensor(draw(param_strategy),
                                             dtype_hint=tf.float32,
                                             name='{}_alt_value'.format(param))
            setattr(params_kwargs[param], '_tfp_alt_value', alt_value)
            if draw(hps.booleans()):
                params_kwargs[param] = defer_and_count_usage(
                    params_kwargs[param])
    return params_kwargs
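
As a hedged usage sketch, a composite strategy like this is typically drawn inside a Hypothesis test via hps.data() (the parameter map below is illustrative):

import hypothesis as hp
from hypothesis import strategies as hps
import tensorflow as tf

@hp.given(hps.data())
def test_draws_broadcastable_params(data):
    params = data.draw(broadcasting_params(
        batch_shape=tf.TensorShape([2, 3]),
        params_event_ndims={'loc': 0, 'scale': 0}))
    for value in params.values():
        assert len(value.shape) <= 2  # Batch shapes broadcast to [2, 3].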
Example #16
def perform_evaluation(model, builder, eval_steps, ckpt, strategy):
    """Performs evaluation."""
    ds = data_lib.build_distributed_dataset(builder, FLAGS.eval_batch_size,
                                            strategy)

    summary_writer = tf.summary.create_file_writer(FLAGS.model_dir)

    # Build metrics.
    with strategy.scope():
        regularization_loss = tf.keras.metrics.Mean('eval/regularization_loss')
        label_top_1_accuracy = tf.keras.metrics.Accuracy(
            'eval/label_top_1_accuracy')
        all_metrics = [regularization_loss, label_top_1_accuracy]

        # Restore checkpoint.
        logging.info('Restoring from %s', ckpt)
        checkpoint = tf.train.Checkpoint(model=model,
                                         global_step=tf.Variable(
                                             0, dtype=tf.int64))
        checkpoint.restore(ckpt).expect_partial()
        global_step = checkpoint.global_step
        logging.info('Performing eval at step %d', global_step.numpy())

    def single_step(features, labels):
        supervised_head_outputs = model(features, training=False)
        outputs = supervised_head_outputs
        l = labels['labels']
        # Update metrics.
        label_top_1_accuracy.update_state(tf.argmax(l, 1),
                                          tf.argmax(outputs, axis=1))
        reg_loss = model_lib.add_weight_decay(model)
        regularization_loss.update_state(reg_loss)

    with strategy.scope():

        @tf.function
        def run_single_step(iterator):
            images, labels = next(iterator)
            features, labels = images, {'labels': labels}
            strategy.run(single_step, (features, labels))

        iterator = iter(ds)
        for i in range(eval_steps):
            run_single_step(iterator)
            logging.info('Completed eval for %d/%d steps', i + 1, eval_steps)
        logging.info('Finished eval for %s', ckpt)

    # Write summaries.
    cur_step = global_step.numpy()
    logging.info('Writing summaries for %d step', cur_step)
    with summary_writer.as_default():
        for metric in all_metrics:
            metric_value = metric.result().numpy().astype(float)
            logging.info('Step: [%d] %s = %f', cur_step, metric.name,
                         metric_value)
            tf.summary.scalar(metric.name, metric_value, step=cur_step)
        summary_writer.flush()

    # Record results as JSON.
    result_json_path = os.path.join(FLAGS.model_dir, 'result.json')
    result = {metric.name: metric.result().numpy() for metric in all_metrics}
    result['global_step'] = global_step.numpy()
    logging.info(result)
    with tf.io.gfile.GFile(result_json_path, 'w') as f:
        json.dump({k: float(v) for k, v in result.items()}, f)
    result_json_path = os.path.join(FLAGS.model_dir,
                                    'result_%d.json' % result['global_step'])
    with tf.io.gfile.GFile(result_json_path, 'w') as f:
        json.dump({k: float(v) for k, v in result.items()}, f)
    flag_json_path = os.path.join(FLAGS.model_dir, 'flags.json')
    with tf.io.gfile.GFile(flag_json_path, 'w') as f:
        serializable_flags = {}
        for key, val in FLAGS.flag_values_dict().items():
            # Some flag value types, e.g. datetime.timedelta, are not JSON
            # serializable; filter those out.
            if json_serializable(val):
                serializable_flags[key] = val

        json.dump(serializable_flags, f)

    modelsave.save(model, global_step=result['global_step'])

    return result
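
The metric bookkeeping in this function follows the standard Keras streaming pattern: update_state per batch, result once at the end. Condensed:

import tensorflow as tf

acc = tf.keras.metrics.Accuracy()
acc.update_state([1, 2, 3], [1, 2, 0])
print(float(acc.result()))  # 0.666... (two of three predictions match)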
Example #17
    def __init__(self,
                 max_tokens,
                 num_oov_indices,
                 mask_token,
                 oov_token,
                 vocabulary_dtype,
                 vocabulary=None,
                 idf_weights=None,
                 invert=False,
                 output_mode="int",
                 sparse=False,
                 pad_to_max_tokens=False,
                 **kwargs):
        # If max_tokens is set, the value must be greater than 1 - otherwise we
        # are creating a 0-element vocab, which doesn't make sense.
        if max_tokens is not None and max_tokens <= 1:
            raise ValueError(f"If set, `max_tokens` must be greater than 1. "
                             f"Received: max_tokens={max_tokens}")

        if pad_to_max_tokens and max_tokens is None:
            raise ValueError(
                f"If pad_to_max_tokens is True, must set `max_tokens`. "
                f"Received: max_tokens={max_tokens}")

        if num_oov_indices < 0:
            raise ValueError(
                f"`num_oov_indices` must be greater than or equal to 0. "
                f"Received: num_oov_indices={num_oov_indices}")

        # Support deprecated names for output_modes.
        if output_mode == "binary":
            output_mode = MULTI_HOT
        if output_mode == "tf-idf":
            output_mode = TF_IDF
        # 'output_mode' must be one of (INT, ONE_HOT, MULTI_HOT, COUNT, TF_IDF)
        layer_utils.validate_string_arg(output_mode,
                                        allowable_strings=(INT, ONE_HOT,
                                                           MULTI_HOT, COUNT,
                                                           TF_IDF),
                                        layer_name=self.__class__.__name__,
                                        arg_name="output_mode")

        if invert and output_mode != INT:
            raise ValueError(
                f"`output_mode` must be `'int'` when `invert` is true. "
                f"Received: output_mode={output_mode}")

        if sparse and output_mode == INT:
            raise ValueError(
                f"`sparse` may only be true if `output_mode` is "
                f"`'one_hot'`, `'multi_hot'`, `'count'` or `'tf_idf'`. "
                f"Received: sparse={sparse} and "
                f"output_mode={output_mode}")

        if idf_weights is not None and output_mode != TF_IDF:
            raise ValueError(
                f"`idf_weights` should only be set if `output_mode` is "
                f"`'tf_idf'`. Received: idf_weights={idf_weights} and "
                f"output_mode={output_mode}")

        self.invert = invert
        self.max_tokens = max_tokens
        self.num_oov_indices = num_oov_indices
        self.mask_token = mask_token
        self.oov_token = oov_token
        self.output_mode = output_mode
        self.sparse = sparse
        self.pad_to_max_tokens = pad_to_max_tokens
        self.vocabulary_dtype = vocabulary_dtype
        self._frozen_vocab_size = None

        self.input_vocabulary = vocabulary
        self.input_idf_weights = idf_weights
        # VocabularySavedModelSaver will clear the config vocabulary to restore the
        # lookup table ops directly. We persist this hidden option to record the
        # fact that we have a non-adaptable layer with a manually set vocab.
        self._has_input_vocabulary = kwargs.pop("has_input_vocabulary",
                                                (vocabulary is not None))

        # Drop deprecated config options.
        kwargs.pop("vocabulary_size", None)
        kwargs.pop("has_static_table", None)

        # By default, output int64 when output_mode='int' and floats otherwise.
        if "dtype" not in kwargs:
            kwargs[
                "dtype"] = tf.int64 if output_mode == INT else backend.floatx(
                )

        super().__init__(**kwargs)

        # Check dtype only after base layer parses it; dtype parsing is complex.
        if output_mode == INT and not tf.as_dtype(
                self.compute_dtype).is_integer:
            input_dtype = kwargs["dtype"]
            raise ValueError(
                "When `output_mode='int'`, `dtype` should be an integer "
                f"type. Received: dtype={input_dtype}")

        if invert:
            self._key_dtype = self.dtype if output_mode == INT else tf.int64
            self._value_dtype = tf.as_dtype(self.vocabulary_dtype)
            mask_key = 0
            mask_value = mask_token
            self._default_value = self.oov_token
        else:
            self._key_dtype = tf.as_dtype(self.vocabulary_dtype)
            self._value_dtype = self.dtype if output_mode == INT else tf.int64
            mask_key = mask_token
            # Masks should map to 0 for int output and be dropped otherwise. Max ints
            # will be dropped from the bincount op.
            mask_value = 0 if self.output_mode == INT else self._value_dtype.max
            if self.num_oov_indices == 0:
                # If there are no OOV indices, we map OOV tokens to -1 and error out
                # during call if we find a negative index.
                self._default_value = -1
            elif self.num_oov_indices == 1:
                # If there is only one OOV index, we can set that index as the default
                # value of the index_lookup table.
                self._default_value = self._oov_start_index()
            else:
                # If we have multiple OOV values, we need to do a further hashing step;
                # to make this easier, we set the OOV value to -1. (This lets us do a
                # vectorized add and cast to boolean to determine locations where we
                # need to do extra hashing.)
                self._default_value = -1
        if self.mask_token is not None:
            self._mask_key = tf.convert_to_tensor(mask_key, self._key_dtype)
            self._mask_value = tf.convert_to_tensor(mask_value,
                                                    self._value_dtype)

        if self.output_mode == TF_IDF:
            self.idf_weights = tf.Variable([0] * self._token_start_index(),
                                           shape=(None, ),
                                           dtype=self.compute_dtype,
                                           trainable=False)
            self.idf_weights_const = self.idf_weights.value()

        if vocabulary is not None:
            self.set_vocabulary(vocabulary, idf_weights)
        else:
            # When restoring from a keras SavedModel, the loading code will expect to
            # find and restore a lookup_table attribute on the layer. This table needs
            # to be uninitialized as a StaticHashTable cannot be initialized twice.
            self.lookup_table = self._uninitialized_lookup_table()

        # Only set up adapt state if we did not receive a vocab on construction.
        if not self._has_input_vocabulary:
            # Add a custom weight handler to return the layer's vocab as its weight.
            self._add_trackable(VocabWeightHandler(self), False)
            # Set adapt state.
            self.token_counts = tf.lookup.experimental.MutableHashTable(
                key_dtype=vocabulary_dtype,
                value_dtype=tf.int64,
                default_value=0)
            if self.output_mode == TF_IDF:
                self.token_document_counts = tf.lookup.experimental.MutableHashTable(
                    key_dtype=vocabulary_dtype,
                    value_dtype=tf.int64,
                    default_value=0)
                self.num_documents = tf.Variable(0,
                                                 dtype=tf.int64,
                                                 trainable=False)
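
This initializer underlies the public Keras lookup layers; a quick hedged illustration through tf.keras.layers.StringLookup:

import tensorflow as tf

layer = tf.keras.layers.StringLookup(vocabulary=['a', 'b', 'c'], num_oov_indices=1)
print(layer(tf.constant([['a', 'z']])).numpy())  # [[1 0]]: 'z' maps to the OOV index.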
Example #18
    def testLARSGradientMultiStep(self,
                                  use_resource=False,
                                  use_callable_params=False):
        for dtype in [tf.float16, tf.float32, tf.float64]:
            shape = [3, 3]
            var_np = np.ones(shape)
            grad_np = np.ones(shape)
            lr_np = 0.1
            m_np = 0.9
            wd_np = 0.1
            ep_np = 1e-5
            eeta = 0.1
            vel_np = np.zeros(shape)
            iterations = 10

            var = tf.Variable(var_np, dtype=dtype)
            grad = tf.Variable(grad_np, dtype=dtype)
            opt = lo.MomentumLARS(
                learning_rate=lr_np,
                momentum=m_np,
                eeta=eeta,
                weight_decay=wd_np,
                epsilon=ep_np,
            )

            if not tf.executing_eagerly():
                self.evaluate(tf.compat.v1.global_variables_initializer())
                # Fetch params to validate initial values
                var_tmp = np.ones(shape)
                self.assertAllClose(var_tmp, self.evaluate(var))

            # Build the update op and initialize variables for graph mode.
            if not tf.executing_eagerly():
                update = opt.apply_gradients([(grad, var)])
                self.evaluate(tf.compat.v1.global_variables_initializer())

            for _ in range(iterations):
                if tf.executing_eagerly():
                    opt.apply_gradients([(grad, var)])
                else:
                    self.evaluate(update)

                post_var = self.evaluate(var)

                # Check we have slots
                self.assertEqual(["momentum"], opt.get_slot_names())
                slot0 = opt.get_slot(var, "momentum")
                self.assertEqual(slot0.get_shape(), var.get_shape())
                post_vel = self.evaluate(opt.get_slot(var, "momentum"))

                w_norm = np.linalg.norm(var_np.flatten(), ord=2)
                g_norm = np.linalg.norm(grad_np.flatten(), ord=2)
                trust_ratio = eeta * w_norm / (g_norm + wd_np * w_norm + ep_np)
                scaled_lr = lr_np * trust_ratio
                grad_np = grad_np + wd_np * var_np

                vel_np = m_np * vel_np + scaled_lr * grad_np
                var_np -= vel_np

                self.assertAllClose(var_np, post_var, 1e-3, 1e-3)
                self.assertAllClose(vel_np, post_vel, 1e-3, 1e-3)
Example #19
 def variable_tracking_works(self):
   x = tf.Variable(1.)
   bijector = tfb.Kumaraswamy(
       concentration0=1., concentration1=x, validate_args=True)
   self.assertIsInstance(bijector, tf.Module)
   self.assertEqual((x,), bijector.trainable_variables)
Example #20
 def testAssertParamsAreFloats(self):
   df = tf.Variable(14, dtype=tf.int32)
   loc = tf.Variable(0, dtype=tf.int32)
   scale = tf.Variable(1, dtype=tf.int32)
   with self.assertRaisesRegex(ValueError, 'Expected floating point'):
     tfd.HalfStudentT(df=df, loc=loc, scale=scale, validate_args=True)
Example #21
def train_eval(
        root_dir,
        env_name='HalfCheetah-v2',
        # Training params
        num_iterations=20000,
        actor_fc_layers=(64, 64),
        value_fc_layers=(64, 64),
        learning_rate=3e-4,
        collect_sequence_length=2048,
        minibatch_size=64,
        num_epochs=10,
        # Agent params
        importance_ratio_clipping=0.2,
        lambda_value=0.95,
        discount_factor=0.99,
        entropy_regularization=0.,
        value_pred_loss_coef=0.5,
        use_gae=True,
        use_td_lambda_return=True,
        gradient_clipping=None,
        value_clipping=None,
        # Replay params
        reverb_port=None,
        replay_capacity=10000,
        # Others
        policy_save_interval=5000,
        summary_interval=1000,
        eval_interval=10000,
        eval_episodes=30,
        debug_summaries=False,
        summarize_grads_and_vars=False):
    """Trains and evaluates PPO (Importance Ratio Clipping).

  Args:
    root_dir: Main directory path where checkpoints, saved_models, and summaries
      will be written to.
    env_name: Name for the Mujoco environment to load.
    num_iterations: The number of iterations to perform collection and training.
    actor_fc_layers: List of fully_connected parameters for the actor network,
      where each item is the number of units in the layer.
    value_fc_layers: List of fully_connected parameters for the value network,
      where each item is the number of units in the layer.
    learning_rate: Learning rate used on the Adam optimizer.
    collect_sequence_length: Number of steps to take in each collect run.
    minibatch_size: Number of elements in each mini batch. If `None`, the entire
      collected sequence will be treated as one batch.
    num_epochs: Number of iterations to repeat over all collected data per data
      collection step. (Schulman,2017) sets this to 10 for Mujoco, 15 for
      Roboschool and 3 for Atari.
    importance_ratio_clipping: Epsilon in clipped, surrogate PPO objective. For
      more detail, see explanation at the top of the doc.
    lambda_value: Lambda parameter for TD-lambda computation.
    discount_factor: Discount factor for return computation. Default to `0.99`
      which is the value used for all environments from (Schulman, 2017).
    entropy_regularization: Coefficient for entropy regularization loss term.
      Default to `0.0` because no entropy bonus was used in (Schulman, 2017).
    value_pred_loss_coef: Multiplier for value prediction loss to balance with
      policy gradient loss. Default to `0.5`, which was used for all
      environments in the OpenAI baseline implementation. This parameters is
      irrelevant unless you are sharing part of actor_net and value_net. In that
      case, you would want to tune this coeeficient, whose value depends on the
      network architecture of your choice.
    use_gae: If True (default False), uses generalized advantage estimation for
      computing per-timestep advantage. Else, just subtracts value predictions
      from empirical return.
    use_td_lambda_return: If True (default False), uses td_lambda_return for
      training value function; here: `td_lambda_return = gae_advantage +
        value_predictions`. `use_gae` must be set to `True` as well to enable
        TD-lambda returns. If `use_td_lambda_return` is set to True while
        `use_gae` is False, the empirical return will be used and a warning will
        be logged.
    gradient_clipping: Norm length to clip gradients.
    value_clipping: Difference between new and old value predictions is clipped
      to this threshold. Value clipping could be helpful when training
      very deep networks. Default: no clipping.
    reverb_port: Port for reverb server, if None, use a randomly chosen unused
      port.
    replay_capacity: The maximum number of elements for the replay buffer. Items
      will be wasted if this is smaller than collect_sequence_length.
    policy_save_interval: How often, in train_steps, the policy will be saved.
    summary_interval: How often to write data into Tensorboard.
    eval_interval: How often to run evaluation, in train_steps.
    eval_episodes: Number of episodes to evaluate over.
    debug_summaries: Boolean for whether to gather debug summaries.
    summarize_grads_and_vars: If true, gradient summaries will be written.
  """
    collect_env = suite_mujoco.load(env_name)
    eval_env = suite_mujoco.load(env_name)
    num_environments = 1

    observation_tensor_spec, action_tensor_spec, time_step_tensor_spec = (
        spec_utils.get_tensor_specs(collect_env))

    train_step = train_utils.create_train_step()

    actor_net = actor_distribution_network.ActorDistributionNetwork(
        observation_tensor_spec,
        action_tensor_spec,
        fc_layer_params=actor_fc_layers,
        activation_fn=tf.nn.tanh,
        kernel_initializer=tf.keras.initializers.Orthogonal())
    value_net = value_network.ValueNetwork(
        observation_tensor_spec,
        fc_layer_params=value_fc_layers,
        kernel_initializer=tf.keras.initializers.Orthogonal())

    current_iteration = tf.Variable(0, dtype=tf.int64)

    def learning_rate_fn():
        # Linearly decay the learning rate.
        return learning_rate * (1 - current_iteration / num_iterations)

    agent = ppo_clip_agent.PPOClipAgent(
        time_step_tensor_spec,
        action_tensor_spec,
        optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate_fn, epsilon=1e-5),
        actor_net=actor_net,
        value_net=value_net,
        importance_ratio_clipping=importance_ratio_clipping,
        lambda_value=lambda_value,
        discount_factor=discount_factor,
        entropy_regularization=entropy_regularization,
        value_pred_loss_coef=value_pred_loss_coef,
        # This is a legacy argument for the number of times we repeat the data
        # inside of the train function, incompatible with mini batch learning.
        # We set the epoch number from the replay buffer and tf.Data instead.
        num_epochs=1,
        use_gae=use_gae,
        use_td_lambda_return=use_td_lambda_return,
        gradient_clipping=gradient_clipping,
        value_clipping=value_clipping,
        # TODO(b/150244758): Default compute_value_and_advantage_in_train to False
        # after Reverb open source.
        compute_value_and_advantage_in_train=False,
        # Skips updating normalizers in the agent, as it's handled in the learner.
        update_normalizers_in_train=False,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step)
    agent.initialize()

    table_name = 'uniform_table'
    table = reverb.Table(table_name,
                         max_size=replay_capacity,
                         sampler=reverb.selectors.Uniform(),
                         remover=reverb.selectors.Fifo(),
                         rate_limiter=reverb.rate_limiters.MinSize(1),
                         max_times_sampled=1)

    reverb_server = reverb.Server([table], port=reverb_port)
    reverb_replay = reverb_replay_buffer.ReverbReplayBuffer(
        agent.collect_data_spec,
        sequence_length=collect_sequence_length,
        table_name=table_name,
        server_address='localhost:{}'.format(reverb_server.port),
        # The only collected sequence is used to populate the batches.
        max_cycle_length=1,
        rate_limiter_timeout_ms=1000)
    rb_observer = reverb_utils.ReverbTrajectorySequenceObserver(
        reverb_replay.py_client,
        table_name,
        sequence_length=collect_sequence_length,
        stride_length=collect_sequence_length)

    saved_model_dir = os.path.join(root_dir, learner.POLICY_SAVED_MODEL_DIR)
    collect_env_step_metric = py_metrics.EnvironmentSteps()
    learning_triggers = [
        triggers.PolicySavedModelTrigger(saved_model_dir,
                                         agent,
                                         train_step,
                                         interval=policy_save_interval,
                                         metadata_metrics={
                                             triggers.ENV_STEP_METADATA_KEY:
                                             collect_env_step_metric
                                         }),
        triggers.StepPerSecondLogTrigger(train_step,
                                         interval=summary_interval),
    ]

    agent_learner = ppo_learner.PPOLearner(
        root_dir,
        train_step,
        agent,
        minibatch_size=minibatch_size,
        shuffle_buffer_size=collect_sequence_length,
        triggers=learning_triggers)

    tf_collect_policy = agent.collect_policy
    collect_policy = py_tf_eager_policy.PyTFEagerPolicy(tf_collect_policy,
                                                        use_tf_function=True)

    collect_actor = actor.Actor(collect_env,
                                collect_policy,
                                train_step,
                                steps_per_run=collect_sequence_length,
                                observers=[rb_observer],
                                metrics=actor.collect_metrics(buffer_size=10) +
                                [collect_env_step_metric],
                                reference_metrics=[collect_env_step_metric],
                                summary_dir=os.path.join(
                                    root_dir, learner.TRAIN_DIR),
                                summary_interval=summary_interval)

    tf_greedy_policy = agent.policy
    greedy_policy = py_tf_eager_policy.PyTFEagerPolicy(tf_greedy_policy,
                                                       use_tf_function=True)

    if eval_interval:
        logging.info('Initial evaluation.')
        eval_actor = actor.Actor(eval_env,
                                 greedy_policy,
                                 train_step,
                                 metrics=actor.eval_metrics(eval_episodes),
                                 summary_dir=os.path.join(root_dir, 'eval'),
                                 episodes_per_run=eval_episodes)

        eval_actor.run_and_log()

    logging.info('Training.')
    dataset = reverb_replay.as_dataset(
        sample_batch_size=num_environments,
        sequence_preprocess_fn=agent.preprocess_sequence)
    for _ in range(num_iterations):
        collect_actor.run()
        # TODO(b/159490625): Get rid of the reset call once the
        # multi_episode_sequences flag is gone.
        # TODO(b/159615593): Update to use observer.flush.
        # Reset the reverb observer to make sure the data collected is flushed and
        # written to the RB.
        rb_observer.reset()
        agent_learner.run(iterations=num_epochs, dataset=dataset)
        reverb_replay.clear()
        current_iteration.assign_add(1)

        if eval_interval and agent_learner.train_step_numpy % eval_interval == 0:
            logging.info('Evaluating.')
            eval_actor.run_and_log()

    rb_observer.close()
    reverb_server.stop()
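The TODOs in the loop above mark the reset() call as a stopgap for flushing cached items. A sketch of how the loop body could read once the observer's flush API covers this use; this assumes ReverbTrajectorySequenceObserver.flush() writes all cached items while keeping the observer usable, which the snippet itself does not confirm:

for _ in range(num_iterations):
    collect_actor.run()
    rb_observer.flush()  # assumed stand-in for reset(), per the TODO above
    agent_learner.run(iterations=num_epochs, dataset=dataset)
    reverb_replay.clear()
    current_iteration.assign_add(1)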
Example #22
 def test_push_raises_error_if_variable_struct_not_match(self) -> None:
     variable_container = reverb_variable_container.ReverbVariableContainer(
         self._server_address)
     with self.assertRaises(tf.errors.InvalidArgumentError):
         variable_container.push(tf.Variable(1))
Example #23
 def testAssertionsProbs(self):
   x = tf.Variable([0.1, 0.7, 0.0])
   with self.assertRaisesOpError('Argument `probs` must sum to 1.'):
     d = tfd.OneHotCategorical(probs=x, validate_args=True)
     self.evaluate([v.initializer for v in d.variables])
     self.evaluate(d.entropy())
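For contrast, a minimal sketch of the passing case, assuming tfd is tfp.distributions as in the surrounding snippets: when probs sums exactly to 1, the same validate_args=True construction raises nothing.

x = tf.Variable([0.25, 0.5, 0.25])  # sums exactly to 1, so the assertion passes
d = tfd.OneHotCategorical(probs=x, validate_args=True)
# d.entropy() now evaluates without raising an op error.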
Example #24
 def test_update_raises_key_error_on_unknown_table(self) -> None:
     variable_container = reverb_variable_container.ReverbVariableContainer(
         self._server_address)
     with self.assertRaises(KeyError):
         variable_container.update(tf.Variable(1), 'unknown_table')
Example #25
  def __init__(
      self,
      dataset_spec,
      alpha_optimizer,
      gamma: Union[float, tf.Tensor],
      divergence_limit: Union[float, np.ndarray, tf.Tensor],
      reward_fn: Optional[Callable] = None,
      solve_for_state_action_ratio: bool = True,
      divergence_type: Text = 'rkl',
      algae_alpha: Union[float, tf.Tensor] = 1.0,
      weight_by_gamma: bool = True,
      limit_episodes: Optional[int] = None,
      num_samples: Optional[int] = None):
    """Initializes the solver.

    Args:
      dataset_spec: The spec of the dataset that will be given.
      alpha_optimizer: The optimizer to use for the Lagrange multipliers on
        the data weights.
      gamma: The discount factor to use.
      divergence_limit: The limit on the f-divergence between the weights and
        the empirical distribution. Internally the limit is mirrored into a
        two-sided constraint (concatenated with itself).
      reward_fn: A function that takes in an EnvStep and returns the reward
        for that step. If not specified, defaults to just EnvStep.reward.
      solve_for_state_action_ratio: Whether to solve for the state-action
        density ratio. Defaults to True; if False, the state density ratio is
        estimated instead. Although the estimated policy value should be the
        same, approximating using the state density ratio is much faster
        (especially in large environments) and more accurate (especially in
        low-data regimes).
      divergence_type: The type of f-divergence to use; one of 'kl', 'rkl',
        or 'chi2'.
      algae_alpha: Regularizer coefficient on Df(dpi || dD).
      weight_by_gamma: Whether to weight the nu and zeta losses by
        gamma ** step_num.
      limit_episodes: How many episodes to take from the dataset. Defaults to
        None (take the whole dataset).
      num_samples: Number of action samples to draw when the action spec is
        not categorical. Defaults to None, in which case a single sample is
        used.
    """
    self._dataset_spec = dataset_spec
    self._gamma = gamma
    if reward_fn is None:
      reward_fn = lambda env_step: env_step.reward
    self._reward_fn = reward_fn

    self._solve_for_state_action_ratio = solve_for_state_action_ratio
    if (not self._solve_for_state_action_ratio and
        not self._dataset_spec.has_log_probability()):
      raise ValueError('Dataset must contain log-probability when '
                       'solve_for_state_action_ratio is False.')

    # Get number of states/actions.
    observation_spec = self._dataset_spec.observation
    action_spec = self._dataset_spec.action
    if not tabular_dual_dice._is_categorical_spec(observation_spec):
      raise ValueError('Observation spec must be discrete and bounded.')
    self._num_states = observation_spec.maximum + 1

    if not tabular_dual_dice._is_categorical_spec(action_spec):
      raise ValueError('Action spec must be discrete and bounded.')
    self._num_actions = action_spec.maximum + 1
    self._dimension = 1 + (
        self._num_states * self._num_actions
        if self._solve_for_state_action_ratio else self._num_states)

    # For learning data weight
    self._divergence_limit = tf.convert_to_tensor(
        divergence_limit, dtype=tf.float32)
    if tf.rank(self._divergence_limit) < 1:
      self._divergence_limit = tf.expand_dims(self._divergence_limit, -1)
    self._two_sided_limit = tf.concat(
        [self._divergence_limit, self._divergence_limit], -1)
    self._num_limits = int(self._two_sided_limit.shape[0])
    # The lagrange multiplier w.r.t. data weight constraint
    self._alpha = tf.Variable(
        np.zeros(self._two_sided_limit.shape), dtype=tf.float32)
    self._alpha_optimizer = alpha_optimizer

    self._algae_alpha = tf.convert_to_tensor(algae_alpha, dtype=tf.float32)
    if tf.rank(self._algae_alpha) < 1:
      self._algae_alpha = tf.expand_dims(self._algae_alpha, -1)
    if self._algae_alpha.shape[-1] != self._two_sided_limit.shape[-1]:
      self._algae_alpha *= tf.ones_like(self._two_sided_limit)
    self._algae_alpha_sign = 2 * (
        tf.cast(self._algae_alpha >= 0, tf.float32) - 0.5)

    self._num_samples = num_samples
    self._categorical_action = common_lib.is_categorical_spec(
        self._dataset_spec.action)
    if not self._categorical_action and self._num_samples is None:
      self._num_samples = 1

    self._divergence_type = divergence_type
    if self._divergence_type not in ['kl', 'rkl', 'chi2']:
      raise ValueError('Unsupported divergence type %s.' %
                       self._divergence_type)

    self._nu = tf.zeros([self._dimension, self._num_limits])
    self._nu2 = tf.zeros([self._dimension, self._num_limits])
    self._zeta = tf.zeros([self._dimension, self._num_limits])
    self._zeta2 = tf.zeros([self._dimension, self._num_limits])
    self._weight_by_gamma = weight_by_gamma
    self._limit_episodes = limit_episodes
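A minimal instantiation sketch for the constructor above. The class name TabularRobustDice is hypothetical (the snippet omits the class statement), and dataset_spec is assumed to carry discrete, bounded observation and action specs, as the checks above require.

solver = TabularRobustDice(  # hypothetical class name
    dataset_spec=dataset_spec,
    alpha_optimizer=tf.keras.optimizers.Adam(1e-3),
    gamma=0.99,
    divergence_limit=0.1,
    divergence_type='rkl')  # one of 'kl', 'rkl', 'chi2'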
Example #26
def _create_nested_variable() -> types.NestedVariable:
    return (tf.Variable(0, dtype=tf.int64, shape=()), {
        'var1': (tf.Variable([1, 1], dtype=tf.float64, shape=(2, )), ),
        'var2': tf.Variable([[2], [3]], dtype=tf.int32, shape=(2, 1))
    })
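Read together with Examples #22 and #24: a push succeeds when the nested variable structure matches the signature the Reverb server was configured with. A sketch under that assumption, with server_address standing in for the test fixture's self._server_address:

variable_container = reverb_variable_container.ReverbVariableContainer(
    server_address)
variables = _create_nested_variable()
variable_container.push(variables)  # structure matches, so no InvalidArgumentError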
Example #27
    def test_inhomogeneous_poisson_process_example(self):
        # Toy 1D data.
        index_points = np.array([-10., -7.2, -4., -0.1, 0.1, 4., 6.2,
                                 9.]).reshape([-1, 1]).astype(np.float32)
        observed_counts = np.array([100, 90, 60, 13, 18, 37, 55,
                                    42]).astype(np.float32)

        # Trainable GP hyperparameters.
        kernel_log_amplitude = tf.Variable(0., name='kernel_log_amplitude')
        kernel_log_lengthscale = tf.Variable(0., name='kernel_log_lengthscale')
        observation_noise_log_scale = tf.Variable(
            0., name='observation_noise_log_scale')

        # Generative model.
        def model_fn():
            kernel = tfp.math.psd_kernels.ExponentiatedQuadratic(
                amplitude=tf.exp(kernel_log_amplitude),
                length_scale=tf.exp(kernel_log_lengthscale))
            latent_log_rates = yield tfd.JointDistributionCoroutine.Root(
                tfd.GaussianProcess(kernel,
                                    index_points=index_points,
                                    observation_noise_variance=tf.exp(
                                        observation_noise_log_scale),
                                    name='latent_log_rates'))
            yield tfd.Independent(tfd.Poisson(log_rate=latent_log_rates),
                                  reinterpreted_batch_ndims=1,
                                  name='y')

        model = tfd.JointDistributionCoroutine(model_fn, name='model')

        # Variational model.
        logit_locs = tf.Variable(tf.zeros(observed_counts.shape))
        logit_softplus_scales = tf.Variable(
            tf.ones(observed_counts.shape) * -1)

        def variational_model_fn():
            _ = yield tfd.JointDistributionCoroutine.Root(
                tfd.Independent(tfd.Normal(
                    loc=logit_locs,
                    scale=tf.nn.softplus(logit_softplus_scales)),
                                reinterpreted_batch_ndims=1))
            _ = yield tfd.VectorDeterministic(observed_counts)

        q = tfd.JointDistributionCoroutine(variational_model_fn,
                                           name='variational_model')

        losses, sample_path = tfp.vi.fit_surrogate_posterior(
            target_log_prob_fn=lambda *args: model.log_prob(args),
            surrogate_posterior=q,
            optimizer=tf.optimizers.Adam(learning_rate=0.1),
            num_steps=100,
            seed=test_util.test_seed(),
            sample_size=1,
            trace_fn=lambda t: (t.loss, q.sample(seed=42)[0]))

        self.evaluate(tf1.global_variables_initializer())
        losses_, sample_path_ = self.evaluate((losses, sample_path))
        self.assertLess(losses_[-1], 80.)  # Optimal loss is roughly 40.
        # Optimal latent logits are approximately the log observed counts.
        self.assertAllClose(sample_path_[-1],
                            np.log(observed_counts),
                            atol=1.0)
Example #28
 def testAssertionsProbs(self):
     x = tf.Variable([0.1, 0.7, 0.0])
     with self.assertRaisesOpError('Argument `probs` must sum to 1.'):
         d = tfd.Multinomial(total_count=2., probs=x, validate_args=True)
         self.evaluate([v.initializer for v in d.variables])
         self.evaluate(d.mean())
Example #29
def main(argv):
  del argv  # unused arg
  tf.io.gfile.makedirs(FLAGS.output_dir)
  logging.info('Saving checkpoints at %s', FLAGS.output_dir)
  tf.random.set_seed(FLAGS.seed)

  data_dir = FLAGS.data_dir
  if FLAGS.use_gpu:
    logging.info('Use GPU')
    strategy = tf.distribute.MirroredStrategy()
  else:
    logging.info('Use TPU at %s',
                 FLAGS.tpu if FLAGS.tpu is not None else 'local')
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)

  per_core_batch_size = FLAGS.per_core_batch_size // FLAGS.ensemble_size
  batch_size = per_core_batch_size * FLAGS.num_cores
  assert 0 < FLAGS.train_proportion <= 1, (
      'train_proportion must satisfy 0 < train_proportion <= 1.')

  drop_remainder_validation = True
  if not FLAGS.use_gpu:
    # This has to be True for TPU training; otherwise the batch size of images
    # in the validation set can't be determined at TPU compile time.
    assert drop_remainder_validation, 'drop_remainder must be True in TPU mode.'

  validation_percent = 1 - FLAGS.train_proportion
  train_dataset = ub.datasets.get(
      FLAGS.dataset,
      data_dir=data_dir,
      download_data=FLAGS.download_data,
      split=tfds.Split.TRAIN,
      validation_percent=validation_percent).load(batch_size=batch_size)
  validation_dataset = ub.datasets.get(
      FLAGS.dataset,
      data_dir=data_dir,
      download_data=FLAGS.download_data,
      split=tfds.Split.VALIDATION,
      validation_percent=validation_percent,
      drop_remainder=drop_remainder_validation).load(batch_size=batch_size)
  validation_dataset = validation_dataset.repeat()
  clean_test_dataset = ub.datasets.get(
      FLAGS.dataset,
      data_dir=data_dir,
      download_data=FLAGS.download_data,
      split=tfds.Split.TEST).load(batch_size=batch_size)
  train_dataset = strategy.experimental_distribute_dataset(train_dataset)
  validation_dataset = strategy.experimental_distribute_dataset(
      validation_dataset)
  test_datasets = {
      'clean': strategy.experimental_distribute_dataset(clean_test_dataset),
  }
  if FLAGS.corruptions_interval > 0:
    if FLAGS.dataset == 'cifar100':
      data_dir = FLAGS.cifar100_c_path
    corruption_types, _ = utils.load_corrupted_test_info(FLAGS.dataset)
    for corruption_type in corruption_types:
      for severity in range(1, 6):
        dataset = ub.datasets.get(
            f'{FLAGS.dataset}_corrupted',
            corruption_type=corruption_type,
            data_dir=data_dir,
            severity=severity,
            split=tfds.Split.TEST).load(batch_size=batch_size)
        test_datasets[f'{corruption_type}_{severity}'] = (
            strategy.experimental_distribute_dataset(dataset))

  ds_info = tfds.builder(FLAGS.dataset).info
  train_sample_size = ds_info.splits[
      'train'].num_examples * FLAGS.train_proportion
  steps_per_epoch = int(train_sample_size / batch_size)
  train_sample_size = int(train_sample_size)

  steps_per_eval = ds_info.splits['test'].num_examples // batch_size
  num_classes = ds_info.features['label'].num_classes

  summary_writer = tf.summary.create_file_writer(
      os.path.join(FLAGS.output_dir, 'summaries'))

  logging.info('Building Keras model.')
  depth = 28
  width = 10

  dict_ranges = {'min': FLAGS.min_l2_range, 'max': FLAGS.max_l2_range}
  ranges = [dict_ranges for _ in range(6)]  # 6 independent l2 parameters
  model_config = {
      'key_to_index': {
          'input_conv_l2_kernel': 0,
          'group_l2_kernel': 1,
          'group_1_l2_kernel': 2,
          'group_2_l2_kernel': 3,
          'dense_l2_kernel': 4,
          'dense_l2_bias': 5,
      },
      'ranges': ranges,
      'test': None
  }
  lambdas_config = LambdaConfig(model_config['ranges'],
                                model_config['key_to_index'])

  if FLAGS.e_body_hidden_units > 0:
    e_body_arch = '({},)'.format(FLAGS.e_body_hidden_units)
  else:
    e_body_arch = '()'
  e_shared_arch = '()'
  e_activation = 'tanh'
  filters_resnet = [16]
  for i in range(0, 3):  # 3 groups of blocks
    filters_resnet.extend([16 * width * 2**i] * 9)  # 9 layers in each block
  # The e_head dimension for each conv2d layer is just its number of filters
  # (kernel only); for the final dense layer it is twice the number of classes
  # (kernel + bias).
  e_head_dims = list(filters_resnet) + [2 * num_classes]

  with strategy.scope():
    e_models = e_factory(
        lambdas_config.input_shape,
        e_head_dims=e_head_dims,
        e_body_arch=eval(e_body_arch),  # pylint: disable=eval-used
        e_shared_arch=eval(e_shared_arch),  # pylint: disable=eval-used
        activation=e_activation,
        use_bias=FLAGS.e_model_use_bias,
        e_head_init=FLAGS.init_emodels_stddev)

    model = wide_resnet_hyperbatchensemble(
        input_shape=ds_info.features['image'].shape,
        depth=depth,
        width_multiplier=width,
        num_classes=num_classes,
        ensemble_size=FLAGS.ensemble_size,
        random_sign_init=FLAGS.random_sign_init,
        config=lambdas_config,
        e_models=e_models,
        l2_batchnorm_layer=FLAGS.l2_batchnorm,
        regularize_fast_weights=FLAGS.regularize_fast_weights,
        fast_weights_eq_contraint=FLAGS.fast_weights_eq_contraint,
        version=2)

    logging.info('Model input shape: %s', model.input_shape)
    logging.info('Model output shape: %s', model.output_shape)
    logging.info('Model number of weights: %s', model.count_params())
    # build hyper-batchensemble complete -------------------------

    # Initialize Lambda distributions for tuning
    lambdas_mean = tf.reduce_mean(
        log_uniform_mean(
            [lambdas_config.log_min, lambdas_config.log_max]))
    lambdas0 = tf.random.normal((FLAGS.ensemble_size, lambdas_config.dim),
                                lambdas_mean,
                                0.1 * FLAGS.ens_init_delta_bounds)
    lower0 = lambdas0 - tf.constant(FLAGS.ens_init_delta_bounds)
    lower0 = tf.maximum(lower0, 1e-8)
    upper0 = lambdas0 + tf.constant(FLAGS.ens_init_delta_bounds)

    log_lower = tf.Variable(tf.math.log(lower0))
    log_upper = tf.Variable(tf.math.log(upper0))
    lambda_parameters = [log_lower, log_upper]  # these variables are tuned
    clip_lambda_parameters(lambda_parameters, lambdas_config)

    # Optimizer settings to train model weights.
    # Linearly scale the learning rate and the decay epochs from the vanilla
    # settings. Note: here we don't divide the epochs by 200 as is done for
    # the other uncertainty baselines.
    base_lr = FLAGS.base_learning_rate * batch_size / 128
    lr_decay_epochs = [int(l) for l in FLAGS.lr_decay_epochs]

    lr_schedule = ub.schedules.WarmUpPiecewiseConstantSchedule(
        steps_per_epoch,
        base_lr,
        decay_ratio=FLAGS.lr_decay_ratio,
        decay_epochs=lr_decay_epochs,
        warmup_epochs=FLAGS.lr_warmup_epochs)
    optimizer = tf.keras.optimizers.SGD(lr_schedule,
                                        momentum=1.0 - FLAGS.one_minus_momentum,
                                        nesterov=True)

    # tuner used for optimizing lambda_parameters
    tuner = tf.keras.optimizers.Adam(FLAGS.lr_tuning)

    metrics = {
        'train/negative_log_likelihood': tf.keras.metrics.Mean(),
        'train/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(),
        'train/loss': tf.keras.metrics.Mean(),
        'train/ece': rm.metrics.ExpectedCalibrationError(
            num_bins=FLAGS.num_bins),
        'train/diversity': rm.metrics.AveragePairwiseDiversity(),
        'test/negative_log_likelihood': tf.keras.metrics.Mean(),
        'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(),
        'test/ece': rm.metrics.ExpectedCalibrationError(
            num_bins=FLAGS.num_bins),
        'test/gibbs_nll': tf.keras.metrics.Mean(),
        'test/gibbs_accuracy': tf.keras.metrics.SparseCategoricalAccuracy(),
        'test/diversity': rm.metrics.AveragePairwiseDiversity(),
        'validation/loss': tf.keras.metrics.Mean(),
        'validation/loss_entropy': tf.keras.metrics.Mean(),
        'validation/loss_ce': tf.keras.metrics.Mean()
    }
    corrupt_metrics = {}

    for i in range(FLAGS.ensemble_size):
      metrics['test/nll_member_{}'.format(i)] = tf.keras.metrics.Mean()
      metrics['test/accuracy_member_{}'.format(i)] = (
          tf.keras.metrics.SparseCategoricalAccuracy())
    if FLAGS.corruptions_interval > 0:
      for intensity in range(1, 6):
        for corruption in corruption_types:
          dataset_name = '{0}_{1}'.format(corruption, intensity)
          corrupt_metrics['test/nll_{}'.format(dataset_name)] = (
              tf.keras.metrics.Mean())
          corrupt_metrics['test/accuracy_{}'.format(dataset_name)] = (
              tf.keras.metrics.SparseCategoricalAccuracy())
          corrupt_metrics['test/ece_{}'.format(dataset_name)] = (
              rm.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins))

    checkpoint = tf.train.Checkpoint(
        model=model, lambda_parameters=lambda_parameters, optimizer=optimizer)

    latest_checkpoint = tf.train.latest_checkpoint(FLAGS.output_dir)
    initial_epoch = 0
    if latest_checkpoint and FLAGS.restore_checkpoint:
      # checkpoint.restore must be within a strategy.scope() so that optimizer
      # slot variables are mirrored.
      checkpoint.restore(latest_checkpoint)
      logging.info('Loaded checkpoint %s', latest_checkpoint)
      initial_epoch = optimizer.iterations.numpy() // steps_per_epoch

  @tf.function
  def train_step(iterator):
    """Training StepFn."""
    def step_fn(inputs):
      """Per-Replica StepFn."""
      images = inputs['features']
      labels = inputs['labels']
      images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1])

      # generate lambdas
      lambdas = log_uniform_sample(
          per_core_batch_size, lambda_parameters)
      lambdas = tf.reshape(
          lambdas,
          (FLAGS.ensemble_size * per_core_batch_size, lambdas_config.dim))

      with tf.GradientTape() as tape:
        logits = model([images, lambdas], training=True)

        if FLAGS.use_gibbs_ce:
          # Average of single model CEs
          # tiling of labels should be only done for Gibbs CE loss
          labels = tf.tile(labels, [FLAGS.ensemble_size])
          negative_log_likelihood = tf.reduce_mean(
              tf.keras.losses.sparse_categorical_crossentropy(labels,
                                                              logits,
                                                              from_logits=True))
        else:
          # Ensemble CE uses no tiling of the labels
          negative_log_likelihood = ensemble_crossentropy(
              labels, logits, FLAGS.ensemble_size)
        # Note: we divide l2_loss by the train sample size (this differs from
        # the uncertainty_baselines implementation).
        l2_loss = sum(model.losses) / train_sample_size
        loss = negative_log_likelihood + l2_loss
        # Scale the loss given the TPUStrategy will reduce sum all gradients.
        scaled_loss = loss / strategy.num_replicas_in_sync

      grads = tape.gradient(scaled_loss, model.trainable_variables)

      # Separate learning rate for fast weights.
      grads_and_vars = []
      for grad, var in zip(grads, model.trainable_variables):
        if (('alpha' in var.name or 'gamma' in var.name) and
            'batch_norm' not in var.name):
          grads_and_vars.append((grad * FLAGS.fast_weight_lr_multiplier, var))
        else:
          grads_and_vars.append((grad, var))
      optimizer.apply_gradients(grads_and_vars)

      probs = tf.nn.softmax(logits)
      per_probs = tf.split(
          probs, num_or_size_splits=FLAGS.ensemble_size, axis=0)
      per_probs_stacked = tf.stack(per_probs, axis=0)
      metrics['train/ece'].add_batch(probs, label=labels)
      metrics['train/loss'].update_state(loss)
      metrics['train/negative_log_likelihood'].update_state(
          negative_log_likelihood)
      metrics['train/accuracy'].update_state(labels, logits)
      metrics['train/diversity'].add_batch(per_probs_stacked)

    strategy.run(step_fn, args=(next(iterator),))

  @tf.function
  def tuning_step(iterator):
    """Tuning StepFn."""
    def step_fn(inputs):
      """Per-Replica StepFn."""
      images = inputs['features']
      labels = inputs['labels']
      images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1])

      with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(lambda_parameters)

        # sample lambdas
        if FLAGS.sample_and_tune:
          lambdas = log_uniform_sample(
              per_core_batch_size, lambda_parameters)
        else:
          lambdas = log_uniform_mean(lambda_parameters)
          lambdas = tf.repeat(lambdas, per_core_batch_size, axis=0)
        lambdas = tf.reshape(lambdas,
                             (FLAGS.ensemble_size * per_core_batch_size,
                              lambdas_config.dim))
        # ensemble CE
        logits = model([images, lambdas], training=False)
        ce = ensemble_crossentropy(labels, logits, FLAGS.ensemble_size)
        # entropy penalty for lambda distribution
        entropy = FLAGS.tau * log_uniform_entropy(
            lambda_parameters)
        loss = ce - entropy
        scaled_loss = loss / strategy.num_replicas_in_sync

      gradients = tape.gradient(loss, lambda_parameters)
      tuner.apply_gradients(zip(gradients, lambda_parameters))

      metrics['validation/loss_ce'].update_state(ce /
                                                 strategy.num_replicas_in_sync)
      metrics['validation/loss_entropy'].update_state(
          entropy / strategy.num_replicas_in_sync)
      metrics['validation/loss'].update_state(scaled_loss)

    strategy.run(step_fn, args=(next(iterator),))

  @tf.function
  def test_step(iterator, dataset_name, num_eval_samples=0):
    """Evaluation StepFn."""

    n_samples = num_eval_samples if num_eval_samples >= 0 else -num_eval_samples
    if num_eval_samples >= 0:
      # the +1 accounts for the fact that we add the mean of lambdas
      ensemble_size = FLAGS.ensemble_size * (1 + n_samples)
    else:
      ensemble_size = FLAGS.ensemble_size * n_samples

    def step_fn(inputs):
      """Per-Replica StepFn."""
      # Note that we don't use tf.tile for labels here
      images = inputs['features']
      labels = inputs['labels']
      images = tf.tile(images, [ensemble_size, 1, 1, 1])

      # get lambdas
      samples = log_uniform_sample(n_samples, lambda_parameters)
      if num_eval_samples >= 0:
        lambdas = log_uniform_mean(lambda_parameters)
        lambdas = tf.expand_dims(lambdas, 1)
        lambdas = tf.concat((lambdas, samples), 1)
      else:
        lambdas = samples

      # lambdas with shape (ens size, samples, dim of lambdas)
      rep_lambdas = tf.repeat(lambdas, per_core_batch_size, axis=1)
      rep_lambdas = tf.reshape(rep_lambdas,
                               (ensemble_size * per_core_batch_size, -1))

      # eval on testsets
      logits = model([images, rep_lambdas], training=False)
      probs = tf.nn.softmax(logits)
      per_probs = tf.split(probs,
                           num_or_size_splits=ensemble_size,
                           axis=0)

      # per member performance and gibbs performance (average per member perf)
      if dataset_name == 'clean':
        for i in range(FLAGS.ensemble_size):
          # we record the first sample of lambdas per batch-ens member
          first_member_index = i * (ensemble_size // FLAGS.ensemble_size)
          member_probs = per_probs[first_member_index]
          member_loss = tf.keras.losses.sparse_categorical_crossentropy(
              labels, member_probs)
          metrics['test/nll_member_{}'.format(i)].update_state(member_loss)
          metrics['test/accuracy_member_{}'.format(i)].update_state(
              labels, member_probs)

        labels_tile = tf.tile(labels, [ensemble_size])
        metrics['test/gibbs_nll'].update_state(tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(labels_tile,
                                                            logits,
                                                            from_logits=True)))
        metrics['test/gibbs_accuracy'].update_state(labels_tile, probs)

      # ensemble performance
      negative_log_likelihood = ensemble_crossentropy(labels, logits,
                                                      ensemble_size)
      probs = tf.reduce_mean(per_probs, axis=0)
      if dataset_name == 'clean':
        metrics['test/negative_log_likelihood'].update_state(
            negative_log_likelihood)
        metrics['test/accuracy'].update_state(labels, probs)
        metrics['test/ece'].add_batch(probs, label=labels)
      else:
        corrupt_metrics['test/nll_{}'.format(dataset_name)].update_state(
            negative_log_likelihood)
        corrupt_metrics['test/accuracy_{}'.format(dataset_name)].update_state(
            labels, probs)
        corrupt_metrics['test/ece_{}'.format(dataset_name)].add_batch(
            probs, label=labels)

      if dataset_name == 'clean':
        per_probs_stacked = tf.stack(per_probs, axis=0)
        metrics['test/diversity'].add_batch(per_probs_stacked)

    strategy.run(step_fn, args=(next(iterator),))

  logging.info(
      '--- Starting training using %d examples. ---', train_sample_size)
  train_iterator = iter(train_dataset)
  validation_iterator = iter(validation_dataset)
  start_time = time.time()
  for epoch in range(initial_epoch, FLAGS.train_epochs):
    logging.info('Starting to run epoch: %s', epoch)
    for step in range(steps_per_epoch):
      train_step(train_iterator)
      do_tuning = (epoch >= FLAGS.tuning_warmup_epochs)
      if do_tuning and ((step + 1) % FLAGS.tuning_every_x_step == 0):
        tuning_step(validation_iterator)
        # clip lambda parameters if outside of range
        clip_lambda_parameters(lambda_parameters, lambdas_config)

      current_step = epoch * steps_per_epoch + (step + 1)
      max_steps = steps_per_epoch * FLAGS.train_epochs
      time_elapsed = time.time() - start_time
      steps_per_sec = float(current_step) / time_elapsed
      eta_seconds = (max_steps - current_step) / steps_per_sec
      message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
                 'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                     current_step / max_steps,
                     epoch + 1,
                     FLAGS.train_epochs,
                     steps_per_sec,
                     eta_seconds / 60,
                     time_elapsed / 60))
      if step % 20 == 0:
        logging.info(message)

    # evaluate on test data
    datasets_to_evaluate = {'clean': test_datasets['clean']}
    if (FLAGS.corruptions_interval > 0 and
        (epoch + 1) % FLAGS.corruptions_interval == 0):
      datasets_to_evaluate = test_datasets
    for dataset_name, test_dataset in datasets_to_evaluate.items():
      test_iterator = iter(test_dataset)
      logging.info('Testing on dataset %s', dataset_name)
      for step in range(steps_per_eval):
        if step % 20 == 0:
          logging.info('Starting to run eval step %s of epoch: %s', step,
                       epoch)
        test_step(test_iterator, dataset_name, FLAGS.num_eval_samples)
      logging.info('Done with testing on %s', dataset_name)

    corrupt_results = {}
    if (FLAGS.corruptions_interval > 0 and
        (epoch + 1) % FLAGS.corruptions_interval == 0):
      corrupt_results = utils.aggregate_corrupt_metrics(corrupt_metrics,
                                                        corruption_types)
    logging.info('Train Loss: %.4f, Accuracy: %.2f%%',
                 metrics['train/loss'].result(),
                 metrics['train/accuracy'].result() * 100)
    logging.info('Validation Loss: %.4f, CE: %.4f, Entropy: %.4f',
                 metrics['validation/loss'].result(),
                 metrics['validation/loss_ce'].result(),
                 metrics['validation/loss_entropy'].result())
    logging.info('Test NLL: %.4f, Accuracy: %.2f%%',
                 metrics['test/negative_log_likelihood'].result(),
                 metrics['test/accuracy'].result() * 100)
    for i in range(FLAGS.ensemble_size):
      logging.info('Member %d Test Loss: %.4f, Accuracy: %.2f%%',
                   i, metrics['test/nll_member_{}'.format(i)].result(),
                   metrics['test/accuracy_member_{}'.format(i)].result() * 100)

    total_results = {name: metric.result() for name, metric in metrics.items()}
    total_results.update(
        {name: metric.result() for name, metric in corrupt_metrics.items()})
    total_results.update(corrupt_results)
    # Results from Robustness Metrics themselves return a dict, so flatten them.
    total_results = utils.flatten_dictionary(total_results)
    with summary_writer.as_default():
      for name, result in total_results.items():
        tf.summary.scalar(name, result, step=epoch + 1)

    for metric in metrics.values():
      metric.reset_states()

    # save checkpoint and lambdas config
    if (FLAGS.checkpoint_interval > 0 and
        (epoch + 1) % FLAGS.checkpoint_interval == 0):
      checkpoint_name = checkpoint.save(
          os.path.join(FLAGS.output_dir, 'checkpoint'))
      lambdas_cf = lambdas_config.get_config()
      filepath = os.path.join(FLAGS.output_dir, 'lambdas_config.p')
      with tf.io.gfile.GFile(filepath, 'wb') as fp:
        pickle.dump(lambdas_cf, fp, protocol=pickle.HIGHEST_PROTOCOL)
      logging.info('Saved checkpoint to %s', checkpoint_name)
  with summary_writer.as_default():
    hp.hparams({
        'base_learning_rate': FLAGS.base_learning_rate,
        'one_minus_momentum': FLAGS.one_minus_momentum,
        'l2': FLAGS.l2,
        'random_sign_init': FLAGS.random_sign_init,
        'fast_weight_lr_multiplier': FLAGS.fast_weight_lr_multiplier,
    })
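main follows the absl convention (argv is required but unused). In a full script it would typically be launched as below, assuming from absl import app next to the flags/logging imports these snippets already rely on:

if __name__ == '__main__':
  app.run(main)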
Example #30
  def __init__(self,
               num_channels,
               float_dtype,
               alpha_lo=0.001,
               alpha_hi=1.999,
               alpha_init=None,
               scale_lo=1e-5,
               scale_init=1.0,
               name=None):
    """Constructs the loss function.

    Args:
      num_channels: the number of different "channels" for the adaptive loss
        function, where each channel will be assigned its own shape (alpha) and
        scale parameters that are constructed as variables and can be optimized
        over.
      float_dtype: The expected numerical precision of the input, which will
        also determine the precision of the latent variables used to model scale
        and alpha internally.
      alpha_lo: The lowest possible value for loss's alpha parameters, must be
        >=0 and a scalar. Should probably be in (0, 2).
      alpha_hi: The highest possible value for loss's alpha parameters, must be
        >=alpha_lo and a scalar. Should probably be in (0, 2).
      alpha_init: The value that the loss's alpha parameters will be initialized
        to, must be in (`alpha_lo`, `alpha_hi`), unless `alpha_lo == alpha_hi`,
        in which case this is ignored. Defaults to `(alpha_lo + alpha_hi) / 2`.
      scale_lo: The lowest possible value for the loss's scale parameters. Must
        be > 0 and a scalar. This value may have more of an effect than you
        think, as the loss is unbounded as scale approaches zero.
      scale_init: The initial value used for the loss's scale parameters. This
        also defines the zero-point of the latent representation of scales, so
        SGD may cause optimization to gravitate towards producing scales near
        this value.
      name: The name of the module.

    Raises:
      ValueError: If any of the arguments are invalid.
    """
    super(AdaptiveLossFunction, self).__init__(name=name)
    _check_scale(scale_lo, scale_init)
    if not np.isscalar(alpha_lo):
      raise ValueError('`alpha_lo` must be a scalar, but is of type {}'.format(
          type(alpha_lo)))
    if not np.isscalar(alpha_hi):
      raise ValueError('`alpha_hi` must be a scalar, but is of type {}'.format(
          type(alpha_hi)))
    if alpha_init is not None and not np.isscalar(alpha_init):
      raise ValueError(
          '`alpha_init` must be None or a scalar, but is of type {}'.format(
              type(alpha_init)))
    if not alpha_lo >= 0:
      raise ValueError('`alpha_lo` must be >= 0, but is {}'.format(alpha_lo))
    if not alpha_hi >= alpha_lo:
      raise ValueError('`alpha_hi` = {} must be >= `alpha_lo` = {}'.format(
          alpha_hi, alpha_lo))
    if alpha_init is not None and alpha_lo != alpha_hi:
      if not (alpha_init > alpha_lo and alpha_init < alpha_hi):
        raise ValueError(
            '`alpha_init` = {} must be in (`alpha_lo`, `alpha_hi`) = ({}, {})'
            .format(alpha_init, alpha_lo, alpha_hi))

    if alpha_lo != alpha_hi:
      # If alpha isn't constant, construct a "latent" alpha variable.
      if alpha_init is None:
        alpha_init = (alpha_lo + alpha_hi) / 2.
      latent_alpha_init = (
          util.inv_affine_sigmoid(alpha_init, lo=alpha_lo, hi=alpha_hi))
      self._latent_alpha = tf.Variable(
          tf.fill((1, num_channels),
                  tf.cast(latent_alpha_init, dtype=float_dtype)),
          name='LatentAlpha')

    if scale_lo != scale_init:
      # If scale isn't constant, construct a "latent" scale variable.
      self._latent_scale = tf.Variable(
          tf.zeros((1, num_channels), float_dtype), name='LatentScale')

    self._num_channels = num_channels
    self._float_dtype = tf.dtypes.as_dtype(float_dtype)
    self._alpha_lo = alpha_lo
    self._alpha_hi = alpha_hi
    self._scale_lo = scale_lo
    self._scale_init = scale_init
    self._distribution = distribution.Distribution()
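A minimal usage sketch for the constructor above, assuming the module's call accepts a (batch, num_channels) tensor of residuals, as in the robust-loss adaptive variant this appears to come from:

loss_fn = AdaptiveLossFunction(num_channels=3, float_dtype=tf.float32)
residuals = tf.random.normal([8, 3])
loss = loss_fn(residuals)  # per-element losses; alpha and scale are trainable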