示例#1
0
def main(argv):
    """Train and periodically evaluate a rank-1 ResNet-32 under a tf.distribute strategy.

    The function builds the distribution strategy (GPU mirrored or TPU), the
    clean and optionally corrupted test input pipelines, the model, optimizer
    and metrics, restores the latest checkpoint found in `FLAGS.output_dir`
    if there is one, and then runs the epoch loop: train, evaluate, write
    summaries/measurements, reset metrics and optionally save a checkpoint.

    Args:
        argv: Positional command-line arguments; unused.
    """
    del argv  # Unused arg.

    tf.enable_v2_behavior()
    tf.random.set_seed(FLAGS.seed)

    # Images are tiled inside the step functions (by the number of samples and,
    # in version 2, by the ensemble size), so the batch read from the dataset
    # is shrunk accordingly.
    if FLAGS.version2:
        per_core_bs_train = FLAGS.per_core_batch_size // (
            FLAGS.ensemble_size * FLAGS.num_train_samples)
        per_core_bs_eval = FLAGS.per_core_batch_size // (
            FLAGS.ensemble_size * FLAGS.num_eval_samples)
    else:
        per_core_bs_train = FLAGS.per_core_batch_size // FLAGS.num_train_samples
        per_core_bs_eval = FLAGS.per_core_batch_size // FLAGS.num_eval_samples
    batch_size_train = per_core_bs_train * FLAGS.num_cores
    batch_size_eval = per_core_bs_eval * FLAGS.num_cores

    logging.info('Saving checkpoints at %s', FLAGS.output_dir)

    if FLAGS.use_gpu:
        logging.info('Use GPU')
        strategy = tf.distribute.MirroredStrategy()
    else:
        logging.info('Use TPU at %s',
                     FLAGS.tpu if FLAGS.tpu is not None else 'local')
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu=FLAGS.tpu)
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)

    train_input_fn = utils.load_input_fn(split=tfds.Split.TRAIN,
                                         name=FLAGS.dataset,
                                         batch_size=per_core_bs_train,
                                         use_bfloat16=FLAGS.use_bfloat16,
                                         normalize=False)
    clean_test_input_fn = utils.load_input_fn(split=tfds.Split.TEST,
                                              name=FLAGS.dataset,
                                              batch_size=per_core_bs_eval,
                                              use_bfloat16=FLAGS.use_bfloat16,
                                              normalize=False)
    train_dataset = strategy.experimental_distribute_datasets_from_function(
        train_input_fn)
    test_datasets = {
        'clean':
        strategy.experimental_distribute_datasets_from_function(
            clean_test_input_fn),
    }
    # One extra test dataset per (corruption type, intensity) pair, keyed as
    # '<corruption>_<intensity>'.
    if FLAGS.corruptions_interval > 0:
        if FLAGS.dataset == 'cifar10':
            load_c_input_fn = utils.load_cifar10_c_input_fn
        else:
            load_c_input_fn = functools.partial(utils.load_cifar100_c_input_fn,
                                                path=FLAGS.cifar100_c_path)
        corruption_types, max_intensity = utils.load_corrupted_test_info(
            FLAGS.dataset)
        for corruption in corruption_types:
            for intensity in range(1, max_intensity + 1):
                input_fn = load_c_input_fn(corruption_name=corruption,
                                           corruption_intensity=intensity,
                                           batch_size=per_core_bs_eval,
                                           use_bfloat16=FLAGS.use_bfloat16,
                                           normalize=False)
                test_datasets['{0}_{1}'.format(corruption, intensity)] = (
                    strategy.experimental_distribute_datasets_from_function(
                        input_fn))

    ds_info = tfds.builder(FLAGS.dataset).info
    train_dataset_size = ds_info.splits['train'].num_examples
    test_dataset_size = ds_info.splits['test'].num_examples
    num_classes = ds_info.features['label'].num_classes

    steps_per_epoch = train_dataset_size // batch_size_train
    steps_per_eval = test_dataset_size // batch_size_eval

    if FLAGS.use_bfloat16:
        policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
        tf.keras.mixed_precision.experimental.set_policy(policy)

    summary_writer = tf.summary.create_file_writer(
        os.path.join(FLAGS.output_dir, 'summaries'))

    with strategy.scope():
        logging.info('Building Keras ResNet-32 model')
        model = resnet_cifar_model.rank1_resnet_v1(
            input_shape=ds_info.features['image'].shape,
            depth=32,
            num_classes=num_classes,
            width_multiplier=4,
            alpha_initializer=FLAGS.alpha_initializer,
            gamma_initializer=FLAGS.gamma_initializer,
            alpha_regularizer=FLAGS.alpha_regularizer,
            gamma_regularizer=FLAGS.gamma_regularizer,
            use_additive_perturbation=FLAGS.use_additive_perturbation,
            ensemble_size=FLAGS.ensemble_size,
            random_sign_init=FLAGS.random_sign_init,
            dropout_rate=FLAGS.dropout_rate)
        # `summary()` prints and returns None, so route its lines to the logger
        # instead of logging its return value.
        model.summary(print_fn=logging.info)
        # Scale the learning rate linearly with the batch size (reference: 128)
        # and rescale the decay epochs from a 200-epoch reference schedule.
        base_lr = FLAGS.base_learning_rate * batch_size_train / 128
        lr_decay_epochs = [(int(start_epoch_str) * FLAGS.train_epochs) // 200
                           for start_epoch_str in FLAGS.lr_decay_epochs]
        lr_schedule = utils.LearningRateSchedule(
            steps_per_epoch,
            base_lr,
            decay_ratio=FLAGS.lr_decay_ratio,
            decay_epochs=lr_decay_epochs,
            warmup_epochs=FLAGS.lr_warmup_epochs)
        optimizer = tf.keras.optimizers.SGD(lr_schedule,
                                            momentum=0.9,
                                            nesterov=True)
        metrics = {
            'train/negative_log_likelihood':
            tf.keras.metrics.Mean(),
            'train/accuracy':
            tf.keras.metrics.SparseCategoricalAccuracy(),
            'train/loss':
            tf.keras.metrics.Mean(),
            'train/ece':
            ed.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
            'test/negative_log_likelihood':
            tf.keras.metrics.Mean(),
            'test/accuracy':
            tf.keras.metrics.SparseCategoricalAccuracy(),
            'test/ece':
            ed.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
            'test/loss':
            tf.keras.metrics.Mean(),
        }
        if FLAGS.corruptions_interval > 0:
            corrupt_metrics = {}
            for intensity in range(1, max_intensity + 1):
                for corruption in corruption_types:
                    dataset_name = '{0}_{1}'.format(corruption, intensity)
                    corrupt_metrics['test/nll_{}'.format(dataset_name)] = (
                        tf.keras.metrics.Mean())
                    corrupt_metrics['test/accuracy_{}'.format(
                        dataset_name)] = (
                            tf.keras.metrics.SparseCategoricalAccuracy())
                    corrupt_metrics['test/ece_{}'.format(dataset_name)] = (
                        ed.metrics.ExpectedCalibrationError(
                            num_bins=FLAGS.num_bins))

        # Per-member and diversity metrics only exist for real ensembles.
        test_diversity = {}
        training_diversity = {}
        if FLAGS.ensemble_size > 1:
            for i in range(FLAGS.ensemble_size):
                metrics['test/nll_member_{}'.format(
                    i)] = tf.keras.metrics.Mean()
                metrics['test/accuracy_member_{}'.format(i)] = (
                    tf.keras.metrics.SparseCategoricalAccuracy())
            test_diversity = {
                'test/disagreement': tf.keras.metrics.Mean(),
                'test/average_kl': tf.keras.metrics.Mean(),
                'test/cosine_similarity': tf.keras.metrics.Mean(),
            }
            training_diversity = {
                'train/disagreement': tf.keras.metrics.Mean(),
                'train/average_kl': tf.keras.metrics.Mean(),
                'train/cosine_similarity': tf.keras.metrics.Mean(),
            }

        checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
        latest_checkpoint = tf.train.latest_checkpoint(FLAGS.output_dir)
        initial_epoch = 0
        if latest_checkpoint:
            # checkpoint.restore must be within a strategy.scope() so that optimizer
            # slot variables are mirrored.
            checkpoint.restore(latest_checkpoint)
            logging.info('Loaded checkpoint %s', latest_checkpoint)
            initial_epoch = optimizer.iterations.numpy() // steps_per_epoch

    @tf.function
    def train_step(iterator):
        """Training StepFn: runs one optimizer step on the next batch."""
        def step_fn(inputs):
            """Per-Replica StepFn."""
            images, labels = inputs
            if FLAGS.version2 and FLAGS.ensemble_size > 1:
                images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1])
                # With member sampling or expected probs, the predictions are
                # collapsed back to one per example below, so the labels are
                # left untiled.
                if not (FLAGS.member_sampling or FLAGS.expected_probs):
                    labels = tf.tile(labels, [FLAGS.ensemble_size])

            if FLAGS.num_train_samples > 1:
                images = tf.tile(images, [FLAGS.num_train_samples, 1, 1, 1])

            with tf.GradientTape() as tape:
                logits = model(images, training=True)
                probs = tf.nn.softmax(logits)
                # Diversity evaluation.
                if FLAGS.version2 and FLAGS.ensemble_size > 1:
                    per_probs = tf.reshape(
                        probs,
                        tf.concat([[FLAGS.ensemble_size, -1], probs.shape[1:]],
                                  0))

                    diversity_results = ed.metrics.average_pairwise_diversity(
                        per_probs, FLAGS.ensemble_size)

                # Average the predictions over the tiled training samples.
                if FLAGS.num_train_samples > 1:
                    probs = tf.reshape(
                        probs,
                        tf.concat(
                            [[FLAGS.num_train_samples, -1], probs.shape[1:]],
                            0))
                    probs = tf.reduce_mean(probs, 0)

                if FLAGS.member_sampling and FLAGS.version2 and FLAGS.ensemble_size > 1:
                    # Pick one ensemble member uniformly at random by
                    # multiplying a one-hot row with the per-member rows.
                    idx = tf.random.uniform([],
                                            maxval=FLAGS.ensemble_size,
                                            dtype=tf.int64)
                    idx_one_hot = tf.expand_dims(
                        tf.one_hot(idx, FLAGS.ensemble_size,
                                   dtype=probs.dtype), 0)
                    probs_shape = probs.shape
                    probs = tf.reshape(probs, [FLAGS.ensemble_size, -1])
                    probs = tf.matmul(idx_one_hot, probs)
                    probs = tf.reshape(probs,
                                       tf.concat([[-1], probs_shape[1:]], 0))

                elif FLAGS.expected_probs and FLAGS.version2 and FLAGS.ensemble_size > 1:
                    # Average the predictions over the ensemble members.
                    probs = tf.reshape(
                        probs,
                        tf.concat([[FLAGS.ensemble_size, -1], probs.shape[1:]],
                                  0))
                    probs = tf.reduce_mean(probs, 0)

                negative_log_likelihood = tf.reduce_mean(
                    tf.keras.losses.sparse_categorical_crossentropy(
                        labels, probs))

                filtered_variables = []
                for var in model.trainable_variables:
                    # Apply l2 on the slow weights and bias terms. This excludes BN
                    # parameters and fast weight approximate posterior/prior parameters,
                    # but pay caution to their naming scheme.
                    if 'kernel' in var.name or 'bias' in var.name:
                        filtered_variables.append(tf.reshape(var, (-1, )))

                l2_loss = FLAGS.l2 * 2 * tf.nn.l2_loss(
                    tf.concat(filtered_variables, axis=0))
                kl = sum(model.losses) / train_dataset_size
                # Linearly anneal the KL weight from ~0 to 1 over
                # FLAGS.kl_annealing_steps optimizer steps.
                kl_scale = tf.cast(optimizer.iterations + 1, kl.dtype)
                kl_scale /= FLAGS.kl_annealing_steps
                kl_scale = tf.minimum(1., kl_scale)
                kl_loss = kl_scale * kl

                # Scale the loss given the TPUStrategy will reduce sum all gradients.
                loss = negative_log_likelihood + l2_loss + kl_loss
                scaled_loss = loss / strategy.num_replicas_in_sync

            grads = tape.gradient(scaled_loss, model.trainable_variables)

            # Separate learning rate implementation.
            grad_list = []
            if FLAGS.fast_weight_lr_multiplier != 1.0:
                grads_and_vars = list(zip(grads, model.trainable_variables))
                for vec, var in grads_and_vars:
                    # Apply different learning rate on the fast weight approximate
                    # posterior/prior parameters. This excludes BN and slow weights,
                    # but pay caution to the naming scheme.
                    # NOTE(review): variables named 'bias' also receive the
                    # multiplier here - confirm this is intended.
                    if ('batch_norm' not in var.name
                            and 'kernel' not in var.name):
                        grad_list.append(
                            (vec * FLAGS.fast_weight_lr_multiplier, var))
                    else:
                        grad_list.append((vec, var))
                optimizer.apply_gradients(grad_list)
            else:
                optimizer.apply_gradients(zip(grads,
                                              model.trainable_variables))

            metrics['train/ece'].update_state(labels, probs)
            metrics['train/loss'].update_state(loss)
            metrics['train/negative_log_likelihood'].update_state(
                negative_log_likelihood)
            metrics['train/accuracy'].update_state(labels, probs)
            if FLAGS.version2 and FLAGS.ensemble_size > 1:
                for k, v in diversity_results.items():
                    training_diversity['train/' + k].update_state(v)

        strategy.run(step_fn, args=(next(iterator), ))

    @tf.function
    def test_step(iterator, dataset_name):
        """Evaluation StepFn: updates the metrics for `dataset_name`."""
        def step_fn(inputs):
            """Per-Replica StepFn."""
            images, labels = inputs
            if FLAGS.ensemble_size > 1:
                images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1])
            if FLAGS.num_eval_samples > 1:
                images = tf.tile(images, [FLAGS.num_eval_samples, 1, 1, 1])
            logits = model(images, training=False)
            probs = tf.nn.softmax(logits)

            # Average the predictions over the tiled evaluation samples.
            if FLAGS.num_eval_samples > 1:
                probs = tf.reshape(
                    probs,
                    tf.concat([[FLAGS.num_eval_samples, -1], probs.shape[1:]],
                              0))
                probs = tf.reduce_mean(probs, 0)

            if FLAGS.ensemble_size > 1:
                per_probs = tf.split(probs,
                                     num_or_size_splits=FLAGS.ensemble_size,
                                     axis=0)
                # Diversity and per-member metrics are tracked on clean data only.
                if dataset_name == 'clean':
                    per_probs_tensor = tf.reshape(
                        probs,
                        tf.concat([[FLAGS.ensemble_size, -1], probs.shape[1:]],
                                  0))
                    diversity_results = ed.metrics.average_pairwise_diversity(
                        per_probs_tensor, FLAGS.ensemble_size)

                    for k, v in diversity_results.items():
                        test_diversity['test/' + k].update_state(v)

                    for i in range(FLAGS.ensemble_size):
                        member_probs = per_probs[i]
                        member_nll = tf.keras.losses.sparse_categorical_crossentropy(
                            labels, member_probs)
                        metrics['test/nll_member_{}'.format(i)].update_state(
                            member_nll)
                        metrics['test/accuracy_member_{}'.format(
                            i)].update_state(labels, member_probs)

                # Ensemble prediction: mean of the member probabilities.
                probs = tf.reduce_mean(per_probs, axis=0)

            negative_log_likelihood = tf.reduce_mean(
                tf.keras.losses.sparse_categorical_crossentropy(labels, probs))
            filtered_variables = []
            for var in model.trainable_variables:
                if 'kernel' in var.name or 'bias' in var.name:
                    filtered_variables.append(tf.reshape(var, (-1, )))

            # Note: despite its name, `l2_loss` here also contains the KL term.
            kl = sum(model.losses) / test_dataset_size
            l2_loss = kl + FLAGS.l2 * 2 * tf.nn.l2_loss(
                tf.concat(filtered_variables, axis=0))
            loss = negative_log_likelihood + l2_loss
            if dataset_name == 'clean':
                metrics['test/negative_log_likelihood'].update_state(
                    negative_log_likelihood)
                metrics['test/accuracy'].update_state(labels, probs)
                metrics['test/ece'].update_state(labels, probs)
                metrics['test/loss'].update_state(loss)
            else:
                corrupt_metrics['test/nll_{}'.format(
                    dataset_name)].update_state(negative_log_likelihood)
                corrupt_metrics['test/accuracy_{}'.format(
                    dataset_name)].update_state(labels, probs)
                corrupt_metrics['test/ece_{}'.format(
                    dataset_name)].update_state(labels, probs)

        strategy.run(step_fn, args=(next(iterator), ))

    train_iterator = iter(train_dataset)
    start_time = time.time()
    max_steps = steps_per_epoch * FLAGS.train_epochs
    # Steps already completed by a previous run (non-zero only when resuming
    # from a checkpoint). They must not count towards this run's throughput,
    # because `start_time` only covers the current process.
    initial_step = initial_epoch * steps_per_epoch
    for epoch in range(initial_epoch, FLAGS.train_epochs):
        logging.info('Starting to run epoch: %s', epoch)
        for step in range(steps_per_epoch):
            train_step(train_iterator)

            current_step = epoch * steps_per_epoch + (step + 1)
            time_elapsed = time.time() - start_time
            steps_per_sec = float(current_step - initial_step) / time_elapsed
            eta_seconds = (max_steps - current_step) / steps_per_sec
            message = ('{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
                       'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                           current_step / max_steps, epoch + 1,
                           FLAGS.train_epochs, steps_per_sec, eta_seconds / 60,
                           time_elapsed / 60))
            work_unit.set_notes(message)
            if step % 20 == 0:
                logging.info(message)

        # Corrupted datasets are only evaluated every
        # FLAGS.corruptions_interval epochs; the clean one every epoch.
        evaluate_corruptions = (
            FLAGS.corruptions_interval > 0
            and (epoch + 1) % FLAGS.corruptions_interval == 0)
        datasets_to_evaluate = {'clean': test_datasets['clean']}
        if evaluate_corruptions:
            datasets_to_evaluate = test_datasets
        for dataset_name, test_dataset in datasets_to_evaluate.items():
            test_iterator = iter(test_dataset)
            logging.info('Testing on dataset %s', dataset_name)
            for step in range(steps_per_eval):
                if step % 20 == 0:
                    logging.info('Starting to run eval step %s of epoch: %s',
                                 step, epoch)
                test_step(test_iterator, dataset_name)
            logging.info('Done with testing on %s', dataset_name)

        corrupt_results = {}
        if evaluate_corruptions:
            corrupt_results = utils.aggregate_corrupt_metrics(
                corrupt_metrics, corruption_types, max_intensity)

        logging.info('Train Loss: %.4f, Accuracy: %.2f%%',
                     metrics['train/loss'].result(),
                     metrics['train/accuracy'].result() * 100)
        logging.info('Test NLL: %.4f, Accuracy: %.2f%%',
                     metrics['test/negative_log_likelihood'].result(),
                     metrics['test/accuracy'].result() * 100)
        # Per-member metrics are only registered when ensemble_size > 1.
        if FLAGS.ensemble_size > 1:
            for i in range(FLAGS.ensemble_size):
                logging.info(
                    'Member %d Test Loss: %.4f, Accuracy: %.2f%%', i,
                    metrics['test/nll_member_{}'.format(i)].result(),
                    metrics['test/accuracy_member_{}'.format(i)].result() *
                    100)
        # Materialize the chain: it is iterated twice (once to collect results,
        # once to reset), and a bare iterator would be exhausted after the
        # first pass, silently skipping every reset.
        total_metrics = list(
            itertools.chain(metrics.items(), training_diversity.items(),
                            test_diversity.items()))
        total_results = {
            name: metric.result()
            for name, metric in total_metrics
        }
        total_results.update(corrupt_results)
        with summary_writer.as_default():
            for name, result in total_results.items():
                tf.summary.scalar(name, result, step=epoch + 1)

        for name, result in total_results.items():
            name = name.replace('/', '_')
            if 'negative_log_likelihood' in name:
                # Plots sort WIDs from high-to-low so look at maximization objectives.
                name = name.replace('negative_log_likelihood',
                                    'log_likelihood')
                result = -result
            objective = work_unit.get_measurement_series(name)
            objective.create_measurement(result, epoch + 1)

        for _, metric in total_metrics:
            metric.reset_states()
        if evaluate_corruptions:
            # Keep each corruption evaluation independent of earlier ones.
            for metric in corrupt_metrics.values():
                metric.reset_states()
        summary_writer.flush()

        if (FLAGS.checkpoint_interval > 0
                and (epoch + 1) % FLAGS.checkpoint_interval == 0):
            checkpoint_name = checkpoint.save(
                os.path.join(FLAGS.output_dir, 'checkpoint'))
            logging.info('Saved checkpoint to %s', checkpoint_name)
示例#2
0
def main(argv):
    del argv  # unused arg
    tf.enable_v2_behavior()
    tf.io.gfile.makedirs(FLAGS.output_dir)
    logging.info('Saving checkpoints at %s', FLAGS.output_dir)
    tf.random.set_seed(FLAGS.seed)

    if FLAGS.use_gpu:
        logging.info('Use GPU')
        strategy = tf.distribute.MirroredStrategy()
    else:
        logging.info('Use TPU at %s',
                     FLAGS.tpu if FLAGS.tpu is not None else 'local')
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu=FLAGS.tpu)
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)

    train_input_fn = utils.load_input_fn(
        split=tfds.Split.TRAIN,
        name=FLAGS.dataset,
        batch_size=FLAGS.per_core_batch_size // FLAGS.ensemble_size,
        use_bfloat16=FLAGS.use_bfloat16)
    clean_test_input_fn = utils.load_input_fn(
        split=tfds.Split.TEST,
        name=FLAGS.dataset,
        batch_size=FLAGS.per_core_batch_size // FLAGS.ensemble_size,
        use_bfloat16=FLAGS.use_bfloat16)
    train_dataset = strategy.experimental_distribute_datasets_from_function(
        train_input_fn)
    test_datasets = {
        'clean':
        strategy.experimental_distribute_datasets_from_function(
            clean_test_input_fn),
    }
    if FLAGS.corruptions_interval > 0:
        if FLAGS.dataset == 'cifar10':
            load_c_input_fn = utils.load_cifar10_c_input_fn
        else:
            load_c_input_fn = functools.partial(utils.load_cifar100_c_input_fn,
                                                path=FLAGS.cifar100_c_path)
        corruption_types, max_intensity = utils.load_corrupted_test_info(
            FLAGS.dataset)
        for corruption in corruption_types:
            for intensity in range(1, max_intensity + 1):
                input_fn = load_c_input_fn(
                    corruption_name=corruption,
                    corruption_intensity=intensity,
                    batch_size=FLAGS.per_core_batch_size //
                    FLAGS.ensemble_size,
                    use_bfloat16=FLAGS.use_bfloat16)
                test_datasets['{0}_{1}'.format(corruption, intensity)] = (
                    strategy.experimental_distribute_datasets_from_function(
                        input_fn))

    ds_info = tfds.builder(FLAGS.dataset).info
    batch_size = ((FLAGS.per_core_batch_size // FLAGS.ensemble_size) *
                  FLAGS.num_cores)
    train_dataset_size = ds_info.splits['train'].num_examples
    steps_per_epoch = train_dataset_size // batch_size
    steps_per_eval = ds_info.splits['test'].num_examples // batch_size
    num_classes = ds_info.features['label'].num_classes

    if FLAGS.use_bfloat16:
        policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
        tf.keras.mixed_precision.experimental.set_policy(policy)

    summary_writer = tf.summary.create_file_writer(
        os.path.join(FLAGS.output_dir, 'summaries'))

    with strategy.scope():
        logging.info('Building Keras model')
        model = cifar_model.wide_resnet(
            input_shape=ds_info.features['image'].shape,
            depth=28,
            width_multiplier=10,
            num_classes=num_classes,
            alpha_initializer=FLAGS.alpha_initializer,
            gamma_initializer=FLAGS.gamma_initializer,
            alpha_regularizer=FLAGS.alpha_regularizer,
            gamma_regularizer=FLAGS.gamma_regularizer,
            use_additive_perturbation=FLAGS.use_additive_perturbation,
            ensemble_size=FLAGS.ensemble_size,
            random_sign_init=FLAGS.random_sign_init,
            dropout_rate=FLAGS.dropout_rate,
            prior_mean=FLAGS.prior_mean,
            prior_stddev=FLAGS.prior_stddev)
        logging.info('Model input shape: %s', model.input_shape)
        logging.info('Model output shape: %s', model.output_shape)
        logging.info('Model number of weights: %s', model.count_params())
        # Linearly scale learning rate and the decay epochs by vanilla settings.
        base_lr = FLAGS.base_learning_rate * batch_size / 128
        lr_decay_epochs = [(start_epoch * FLAGS.train_epochs) // 200
                           for start_epoch in FLAGS.lr_decay_epochs]
        lr_schedule = utils.LearningRateSchedule(
            steps_per_epoch,
            base_lr,
            decay_ratio=FLAGS.lr_decay_ratio,
            decay_epochs=lr_decay_epochs,
            warmup_epochs=FLAGS.lr_warmup_epochs)
        optimizer = tf.keras.optimizers.SGD(lr_schedule,
                                            momentum=0.9,
                                            nesterov=True)
        metrics = {
            'train/negative_log_likelihood':
            tf.keras.metrics.Mean(),
            'train/accuracy':
            tf.keras.metrics.SparseCategoricalAccuracy(),
            'train/loss':
            tf.keras.metrics.Mean(),
            'train/ece':
            ed.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
            'train/kl':
            tf.keras.metrics.Mean(),
            'train/kl_scale':
            tf.keras.metrics.Mean(),
            'test/negative_log_likelihood':
            tf.keras.metrics.Mean(),
            'test/accuracy':
            tf.keras.metrics.SparseCategoricalAccuracy(),
            'test/ece':
            ed.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins),
        }
        if FLAGS.ensemble_size > 1:
            for i in range(FLAGS.ensemble_size):
                metrics['test/nll_member_{}'.format(
                    i)] = tf.keras.metrics.Mean()
                metrics['test/accuracy_member_{}'.format(i)] = (
                    tf.keras.metrics.SparseCategoricalAccuracy())
        if FLAGS.corruptions_interval > 0:
            corrupt_metrics = {}
            for intensity in range(1, max_intensity + 1):
                for corruption in corruption_types:
                    dataset_name = '{0}_{1}'.format(corruption, intensity)
                    corrupt_metrics['test/nll_{}'.format(dataset_name)] = (
                        tf.keras.metrics.Mean())
                    corrupt_metrics['test/accuracy_{}'.format(
                        dataset_name)] = (
                            tf.keras.metrics.SparseCategoricalAccuracy())
                    corrupt_metrics['test/ece_{}'.format(dataset_name)] = (
                        ed.metrics.ExpectedCalibrationError(
                            num_bins=FLAGS.num_bins))

        global_step = tf.Variable(
            0,
            trainable=False,
            name='global_step',
            dtype=tf.int64,
            aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)
        checkpoint = tf.train.Checkpoint(model=model,
                                         optimizer=optimizer,
                                         global_step=global_step)
        latest_checkpoint = tf.train.latest_checkpoint(FLAGS.output_dir)
        initial_epoch = 0
        if latest_checkpoint:
            # checkpoint.restore must be within a strategy.scope() so that optimizer
            # slot variables are mirrored.
            checkpoint.restore(latest_checkpoint)
            logging.info('Loaded checkpoint %s', latest_checkpoint)
            initial_epoch = optimizer.iterations.numpy() // steps_per_epoch

    @tf.function
    def train_step(iterator):
        """Training StepFn."""
        def step_fn(inputs):
            """Per-Replica StepFn."""
            images, labels = inputs
            if FLAGS.ensemble_size > 1:
                images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1])
                labels = tf.tile(labels, [FLAGS.ensemble_size])

            with tf.GradientTape() as tape:
                logits = model(images, training=True)
                if FLAGS.use_bfloat16:
                    logits = tf.cast(logits, tf.float32)
                negative_log_likelihood = tf.reduce_mean(
                    tf.keras.losses.sparse_categorical_crossentropy(
                        labels, logits, from_logits=True))
                filtered_variables = []
                for var in model.trainable_variables:
                    # Apply l2 on the BN parameters and bias terms. This
                    # excludes only fast weight approximate posterior/prior parameters,
                    # but pay caution to their naming scheme.
                    if ('kernel' in var.name or 'batch_norm' in var.name
                            or 'bias' in var.name):
                        filtered_variables.append(tf.reshape(var, (-1, )))

                l2_loss = FLAGS.l2 * 2 * tf.nn.l2_loss(
                    tf.concat(filtered_variables, axis=0))
                kl = sum(model.losses) / train_dataset_size
                kl_scale = tf.cast(global_step + 1, tf.float32)
                kl_scale /= steps_per_epoch * FLAGS.kl_annealing_epochs
                kl_scale = tf.minimum(1., kl_scale)
                kl_loss = kl_scale * kl

                # Scale the loss given the TPUStrategy will reduce sum all gradients.
                loss = negative_log_likelihood + l2_loss + kl_loss
                scaled_loss = loss / strategy.num_replicas_in_sync

            grads = tape.gradient(scaled_loss, model.trainable_variables)

            # Separate learning rate implementation.
            if FLAGS.fast_weight_lr_multiplier != 1.0:
                grads_and_vars = []
                for grad, var in zip(grads, model.trainable_variables):
                    # Apply different learning rate on the fast weight approximate
                    # posterior/prior parameters. This is excludes BN and slow weights,
                    # but pay caution to the naming scheme.
                    if ('kernel' not in var.name
                            and 'batch_norm' not in var.name
                            and 'bias' not in var.name):
                        grads_and_vars.append(
                            (grad * FLAGS.fast_weight_lr_multiplier, var))
                    else:
                        grads_and_vars.append((grad, var))
                optimizer.apply_gradients(grads_and_vars)
            else:
                optimizer.apply_gradients(zip(grads,
                                              model.trainable_variables))

            probs = tf.nn.softmax(logits)
            metrics['train/ece'].update_state(labels, probs)
            metrics['train/loss'].update_state(loss)
            metrics['train/negative_log_likelihood'].update_state(
                negative_log_likelihood)
            metrics['train/kl'].update_state(kl)
            metrics['train/kl_scale'].update_state(kl_scale)
            metrics['train/accuracy'].update_state(labels, logits)

            global_step.assign_add(1)

        strategy.run(step_fn, args=(next(iterator), ))

    @tf.function
    def test_step(iterator, dataset_name):
        """Evaluation StepFn.

        Pulls the next batch from `iterator`, runs `step_fn` on it through
        `strategy.run`, and updates the evaluation metrics.

        Args:
          iterator: Iterator yielding `(images, labels)` batches.
          dataset_name: Name of the dataset being evaluated. 'clean' updates
            the 'test/...' entries of `metrics`; any other name updates the
            '..._{dataset_name}' entries of `corrupt_metrics`.
        """
        def step_fn(inputs):
            """Per-Replica StepFn."""
            images, labels = inputs
            is_clean = dataset_name == 'clean'
            if FLAGS.ensemble_size > 1:
                # Repeat the batch `ensemble_size` times along axis 0.
                images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1])
            # One forward pass per evaluation sample, reshaped to
            # [num_eval_samples, ensemble_size, batch, num_classes].
            logits = tf.reshape([
                model(images, training=False)
                for _ in range(FLAGS.num_eval_samples)
            ], [FLAGS.num_eval_samples, FLAGS.ensemble_size, -1, num_classes])
            if FLAGS.use_bfloat16:
                logits = tf.cast(logits, tf.float32)
            probs = tf.nn.softmax(logits)

            # The per-member metrics are reported next to the clean-only
            # ensemble metrics, so only update them on the clean dataset.
            # Updating them for every dataset would average corrupted batches
            # into the reported member NLL/accuracy.
            if is_clean and FLAGS.ensemble_size > 1:
                per_probs = tf.reduce_mean(probs,
                                           axis=0)  # marginalize samples
                for i in range(FLAGS.ensemble_size):
                    member_probs = per_probs[i]
                    member_loss = tf.keras.losses.sparse_categorical_crossentropy(
                        labels, member_probs)
                    metrics['test/nll_member_{}'.format(i)].update_state(
                        member_loss)
                    metrics['test/accuracy_member_{}'.format(i)].update_state(
                        labels, member_probs)

            # Negative log marginal likelihood computed in a numerically-stable way.
            labels_broadcasted = tf.broadcast_to(
                labels,
                [FLAGS.num_eval_samples, FLAGS.ensemble_size, labels.shape[0]])
            log_likelihoods = -tf.keras.losses.sparse_categorical_crossentropy(
                labels_broadcasted, logits, from_logits=True)
            # logsumexp over the sample and ensemble axes minus log(count) is
            # the log of the mean likelihood over those axes.
            negative_log_likelihood = tf.reduce_mean(
                -tf.reduce_logsumexp(log_likelihoods, axis=[0, 1]) +
                tf.math.log(float(FLAGS.num_eval_samples *
                                  FLAGS.ensemble_size)))
            probs = tf.math.reduce_mean(probs, axis=[0, 1])  # marginalize

            if is_clean:
                metrics['test/negative_log_likelihood'].update_state(
                    negative_log_likelihood)
                metrics['test/accuracy'].update_state(labels, probs)
                metrics['test/ece'].update_state(labels, probs)
            else:
                corrupt_metrics['test/nll_{}'.format(
                    dataset_name)].update_state(negative_log_likelihood)
                corrupt_metrics['test/accuracy_{}'.format(
                    dataset_name)].update_state(labels, probs)
                corrupt_metrics['test/ece_{}'.format(
                    dataset_name)].update_state(labels, probs)

        strategy.run(step_fn, args=(next(iterator), ))

    # Main loop: train for one epoch, evaluate, write summaries, reset the
    # metrics and optionally checkpoint; a final checkpoint is always saved.
    train_iterator = iter(train_dataset)
    start_time = time.time()
    # Loop invariants: the total step budget, and the number of steps that
    # belong to epochs before `initial_epoch` (non-zero when the loop does not
    # start at epoch 0).
    max_steps = steps_per_epoch * FLAGS.train_epochs
    steps_before_run = initial_epoch * steps_per_epoch
    for epoch in range(initial_epoch, FLAGS.train_epochs):
        logging.info('Starting to run epoch: %s', epoch)
        for step in range(steps_per_epoch):
            train_step(train_iterator)

            # Only build the progress message on the steps that log it.
            if step % 20 == 0:
                current_step = epoch * steps_per_epoch + (step + 1)
                time_elapsed = time.time() - start_time
                # `start_time` is taken when this loop starts, so throughput
                # must be measured over the steps executed since then.
                # Counting steps of skipped epochs would overstate steps/s and
                # understate the ETA.
                steps_per_sec = (float(current_step - steps_before_run) /
                                 time_elapsed)
                eta_seconds = (max_steps - current_step) / steps_per_sec
                message = (
                    '{:.1%} completion: epoch {:d}/{:d}. {:.1f} steps/s. '
                    'ETA: {:.0f} min. Time elapsed: {:.0f} min'.format(
                        current_step / max_steps, epoch + 1,
                        FLAGS.train_epochs, steps_per_sec, eta_seconds / 60,
                        time_elapsed / 60))
                logging.info(message)

        # Corrupted datasets are only evaluated every `corruptions_interval`
        # epochs; the clean test set is evaluated every epoch.
        evaluate_corruptions = (
            FLAGS.corruptions_interval > 0
            and (epoch + 1) % FLAGS.corruptions_interval == 0)
        if evaluate_corruptions:
            datasets_to_evaluate = test_datasets
        else:
            datasets_to_evaluate = {'clean': test_datasets['clean']}
        for dataset_name, test_dataset in datasets_to_evaluate.items():
            test_iterator = iter(test_dataset)
            logging.info('Testing on dataset %s', dataset_name)
            for step in range(steps_per_eval):
                if step % 20 == 0:
                    logging.info('Starting to run eval step %s of epoch: %s',
                                 step, epoch)
                test_step(test_iterator, dataset_name)
            logging.info('Done with testing on %s', dataset_name)

        corrupt_results = {}
        if evaluate_corruptions:
            corrupt_results = utils.aggregate_corrupt_metrics(
                corrupt_metrics, corruption_types, max_intensity)

        logging.info('Train Loss: %.4f, Accuracy: %.2f%%',
                     metrics['train/loss'].result(),
                     metrics['train/accuracy'].result() * 100)
        logging.info('Test NLL: %.4f, Accuracy: %.2f%%',
                     metrics['test/negative_log_likelihood'].result(),
                     metrics['test/accuracy'].result() * 100)
        if FLAGS.ensemble_size > 1:
            for i in range(FLAGS.ensemble_size):
                logging.info(
                    'Member %d Test Loss: %.4f, Accuracy: %.2f%%', i,
                    metrics['test/nll_member_{}'.format(i)].result(),
                    metrics['test/accuracy_member_{}'.format(i)].result() *
                    100)
        total_results = {
            name: metric.result()
            for name, metric in metrics.items()
        }
        total_results.update(corrupt_results)
        with summary_writer.as_default():
            for name, result in total_results.items():
                tf.summary.scalar(name, result, step=epoch + 1)

        # Start the next epoch from empty accumulators.
        for metric in metrics.values():
            metric.reset_states()
        if evaluate_corruptions:
            # `corrupt_metrics` is not reset anywhere else in this loop; clear
            # it once it has been reported so the next corruption evaluation
            # does not accumulate on top of this one.
            for metric in corrupt_metrics.values():
                metric.reset_states()

        if (FLAGS.checkpoint_interval > 0
                and (epoch + 1) % FLAGS.checkpoint_interval == 0):
            checkpoint_name = checkpoint.save(
                os.path.join(FLAGS.output_dir, 'checkpoint'))
            logging.info('Saved checkpoint to %s', checkpoint_name)

    final_checkpoint_name = checkpoint.save(
        os.path.join(FLAGS.output_dir, 'checkpoint'))
    logging.info('Saved last checkpoint to %s', final_checkpoint_name)