Example #1
def model_fn(features, labels, mode, params):
    im_mode = MODEKEY_TO_MODE[mode]
    model_config = configuration.ModelConfig()
    training_config = configuration.TrainingConfig()
    model = show_and_tell_model.ShowAndTellModel(
        model_config, mode=im_mode, train_inception=FLAGS.train_inception)
    model.build_model_for_tpu(images=features["images"],
                              input_seqs=features["input_seqs"],
                              target_seqs=features["target_seqs"],
                              input_mask=features["input_mask"])

    optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=training_config.initial_learning_rate)
    optimizer = contrib_estimator.clip_gradients_by_norm(
        optimizer, training_config.clip_gradients)
    if FLAGS.use_tpu:
        optimizer = contrib_tpu.CrossShardOptimizer(optimizer)
    train_op = optimizer.minimize(
        model.total_loss, global_step=tf.train.get_or_create_global_step())

    def scaffold_fn():
        """Load pretrained Inception checkpoint at initialization time."""
        return tf.train.Scaffold(init_fn=model.init_fn)

    return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                        loss=model.total_loss,
                                        train_op=train_op,
                                        scaffold_fn=scaffold_fn)
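
A minimal sketch (not part of the original example) of how a model_fn like the one above is typically handed to a TPUEstimator; the master address, model directory, batch size, and train_input_fn below are placeholder assumptions.

run_config = contrib_tpu.RunConfig(
    master="grpc://10.240.1.2:8470",   # assumed TPU worker address
    model_dir="/tmp/show_and_tell",    # assumed checkpoint directory
    tpu_config=contrib_tpu.TPUConfig(iterations_per_loop=100))

estimator = contrib_tpu.TPUEstimator(
    model_fn=model_fn,
    use_tpu=FLAGS.use_tpu,
    train_batch_size=1024,             # assumed global batch size
    config=run_config)

# train_input_fn is assumed to return the features dict consumed by model_fn.
estimator.train(input_fn=train_input_fn, max_steps=100000)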
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps,
                     use_tpu):
    """Creates an optimizer training op."""
    global_step = tf.train.get_or_create_global_step()

    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

    # Implements linear decay of the learning rate.
    learning_rate = tf.train.polynomial_decay(learning_rate,
                                              global_step,
                                              num_train_steps,
                                              end_learning_rate=0.0,
                                              power=1.0,
                                              cycle=False)

    # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
    # learning rate will be `global_step/num_warmup_steps * init_lr`.
    if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = init_lr * warmup_percent_done

        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        learning_rate = ((1.0 - is_warmup) * learning_rate +
                         is_warmup * warmup_learning_rate)

    # It is recommended that you use this optimizer for fine tuning, since this
    # is how the model was trained (note that the Adam m/v variables are NOT
    # loaded from init_checkpoint.)
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    if use_tpu:
        optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

    tvars = tf.trainable_variables()
    grads = tf.gradients(loss, tvars)

    # This is how the model was pre-trained.
    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step)

    # Normally the global step update is done inside of `apply_gradients`.
    # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
    # a different optimizer, you should probably take this line out.
    new_global_step = global_step + 1
    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    return train_op
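
A hedged usage sketch for create_optimizer above; the hyperparameter values are illustrative only, and total_loss is assumed to be a scalar loss tensor built elsewhere. With init_lr=5e-5 and num_warmup_steps=1000, the learning rate at step 250 is 250/1000 * 5e-5 = 1.25e-5; after warmup it decays linearly toward 0 over num_train_steps.

train_op = create_optimizer(
    loss=total_loss,        # assumed scalar loss tensor
    init_lr=5e-5,
    num_train_steps=10000,
    num_warmup_steps=1000,
    use_tpu=True)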
Example #3
File: model.py Project: wen8411/tpu
def optimizer(lr):
    opt = tf.train.AdamOptimizer(learning_rate=lr,
                                 beta1=opt_cfg.beta1,
                                 beta2=opt_cfg.beta2,
                                 epsilon=opt_cfg.epsilon)
    if is_tpu:
        opt = contrib_tpu.CrossShardOptimizer(opt)
    return opt
def model_fn(features, labels, mode, params):
    """TPUEstimatorSpec for the Squeezenet model."""
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    logits = squeezenet(features,
                        is_training=is_training,
                        num_classes=params["num_classes"])

    loss = tf.reduce_mean(
        tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels))

    global_batch_size = (params["train"]["num_cores_per_replica"] *
                         params["train"]["train_batch_size"])
    decay_steps = (params["train"]["num_examples_per_epoch"] *
                   params["train"]["num_epochs"]) // global_batch_size
    learning_rate = tf.train.polynomial_decay(
        params["train"]["learning_rate"]["init_learning_rate"],
        global_step=tf.train.get_or_create_global_step(),
        end_learning_rate=params["train"]["learning_rate"]
        ["end_learning_rate"],
        decay_steps=decay_steps,
        power=1.0,
        cycle=False)

    # TODO(power): Hack copied from resnet: remove when summaries are working.
    lr_repeat = tf.reshape(
        tf.tile(tf.expand_dims(learning_rate, 0), [
            params["train"]["train_batch_size"],
        ]), [params["train"]["train_batch_size"], 1])

    if params["train"]["optimizer"]["type"] == "adam":
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    elif params["train"]["optimizer"]["type"] == "rmsprop":
        optimizer = tf.train.RMSPropOptimizer(
            learning_rate=learning_rate,
            momentum=params["train"]["optimizer"]["momentum"],
            epsilon=1.0)
    else:
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=learning_rate,
            momentum=params["train"]["optimizer"]["momentum"],
            use_nesterov=True)

    if params["use_tpu"]:
        optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

    train_op = optimizer.minimize(loss, tf.train.get_global_step())

    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        eval_metrics=(metric_fn, [labels, logits, lr_repeat]),
        predictions={
            "classes": tf.argmax(input=logits, axis=1),
            "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
        },
    )
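
The eval_metrics tuple above expects a metric_fn whose arguments match the tensor list that follows it; a hypothetical sketch (not taken from the original project) could look like this.

def metric_fn(labels, logits, lr_repeat):
    """Hypothetical metric_fn matching (metric_fn, [labels, logits, lr_repeat])."""
    predictions = tf.argmax(logits, axis=1)
    return {
        "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions),
        # Averaging the tiled learning rate recovers the scalar for TensorBoard.
        "learning_rate": tf.metrics.mean(lr_repeat),
    }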
Example #5
def my_model(features, labels, mode, params):
    """Deep Neural Network(DNN) model.

  This is a DNN Model with 3 hidden layers. First 2 hidden layers are having
  10 neurons in each. And number of neurons in the last layer is equal to the
  number of output classes. This is a densely connected network where each
  neuron of previous layer is connected to each neuron of next layer.

  Args:
    features: Feature values for input samples.
    labels: label/class assigned to the corresponding input sample.
    mode: "TRAIN"/"EVAL"/"PREDICT"
    params: Dictionary used to pass extra parameters to model function from
      the main function.

  Returns:
    TPUEstimatorSpec object.

  """

    # Create three fully connected layers.
    net = tf.feature_column.input_layer(features, params["feature_columns"])
    for units in params["hidden_units"]:
        net = tf.layers.dense(net, units=units, activation=tf.nn.relu)

    # Compute logits (1 per class).
    logits = tf.layers.dense(net, params["n_classes"], activation=None)

    # Compute predictions.
    predicted_classes = tf.argmax(logits, 1)
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            "class_ids": predicted_classes[:, tf.newaxis],
            "probabilities": tf.nn.softmax(logits),
            "logits": logits,
        }
        return contrib_tpu.TPUEstimatorSpec(mode, predictions=predictions)

    # Compute loss.
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    if mode == tf.estimator.ModeKeys.EVAL:
        return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                            loss=loss,
                                            eval_metrics=(metric_fn,
                                                          [labels, logits]))

    # Create training op.
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
        if FLAGS.use_tpu:
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)
        train_op = optimizer.minimize(loss,
                                      global_step=tf.train.get_global_step())
        return contrib_tpu.TPUEstimatorSpec(mode, loss=loss, train_op=train_op)
def model_fn(features, labels, mode, params):
    """Inception v3 model using Estimator API."""
    del params

    if mode != tf.estimator.ModeKeys.TRAIN:
        raise RuntimeError('mode {} is not supported yet'.format(mode))

    num_labels = FLAGS.num_labels

    with slim.arg_scope(inception_v3_arg_scope(is_training=True)):
        logits, end_points = inception.inception_v3(
            features,
            num_labels,
            is_training=True,
            depth_multiplier=FLAGS.depth_multiplier)

    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32),
                               depth=num_labels)

    if 'AuxLogits' in end_points:
        tf.losses.softmax_cross_entropy(end_points['AuxLogits'],
                                        onehot_labels,
                                        label_smoothing=0.1,
                                        weights=0.4,
                                        scope='aux_loss')
    tf.losses.softmax_cross_entropy(logits,
                                    onehot_labels,
                                    label_smoothing=0.1,
                                    weights=1.0)
    loss = tf.losses.get_total_loss()

    if FLAGS.optimizer == 'sgd':
        tf.logging.info('Using SGD optimizer')
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=FLAGS.learning_rate)
    elif FLAGS.optimizer == 'momentum':
        tf.logging.info('Using Momentum optimizer')
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=FLAGS.learning_rate, momentum=0.9)
    else:
        tf.logging.fatal('Unknown optimizer:', FLAGS.optimizer)

    if FLAGS.use_tpu:
        optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

    train_op = optimizer.minimize(
        loss, global_step=tf.train.get_or_create_global_step())

    return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                        loss=loss,
                                        train_op=train_op)
Example #7
def model_fn(features, labels, mode, params):
    """Define a CIFAR model in Keras."""
    del params  # unused
    layers = contrib_keras.layers

    # Pass our input tensor to initialize the Keras input layer.
    v = layers.Input(tensor=features)
    v = layers.Conv2D(filters=32,
                      kernel_size=5,
                      activation="relu",
                      padding="same")(v)
    v = layers.MaxPool2D(pool_size=2)(v)
    v = layers.Conv2D(filters=64,
                      kernel_size=5,
                      activation="relu",
                      padding="same")(v)
    v = layers.MaxPool2D(pool_size=2)(v)
    v = layers.Flatten()(v)
    fc1 = layers.Dense(units=512, activation="relu")(v)
    logits = layers.Dense(units=10)(fc1)

    # Instead of constructing a Keras model for training, build our loss function
    # and optimizer in Tensorflow.
    #
    # N.B.  This construction omits some features that are important for more
    # complex models (e.g. regularization, batch-norm).  Once
    # `model_to_estimator` support is added for TPUs, it should be used instead.
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                       labels=labels))
    optimizer = tf.train.AdamOptimizer()
    if FLAGS.use_tpu:
        optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

    return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                        loss=loss,
                                        train_op=train_op,
                                        predictions={
                                            "classes":
                                            tf.argmax(input=logits, axis=1),
                                            "probabilities":
                                            tf.nn.softmax(
                                                logits, name="softmax_tensor")
                                        })
Example #8
def create_train_op(loss, learning_rate, var_list, global_step, use_tpu=False):
    exp_learning_rate = tf.train.exponential_decay(learning_rate,
                                                   global_step,
                                                   decay_steps=10000,
                                                   decay_rate=0.96)

    optimizer = tf.train.AdamOptimizer(learning_rate=exp_learning_rate,
                                       beta1=0.5,
                                       beta2=0.999)

    if use_tpu:
        optimizer = tpu.CrossShardOptimizer(optimizer)

    return optimizer.minimize(loss,
                              var_list=var_list,
                              global_step=global_step,
                              colocate_gradients_with_ops=True)
Example #9
def get_train_op_and_metrics(loss, params):
    """Generate training op and metrics to save in TensorBoard."""
    with tf.variable_scope("get_train_op"):
        learning_rate = get_learning_rate(
            learning_rate=params["learning_rate"],
            hidden_size=params["hidden_size"],
            learning_rate_warmup_steps=params["learning_rate_warmup_steps"])

        # Create optimizer. Use LazyAdamOptimizer from TF contrib, which is faster
        # than the TF core Adam optimizer.
        optimizer = contrib_opt.LazyAdamOptimizer(
            learning_rate,
            beta1=params["optimizer_adam_beta1"],
            beta2=params["optimizer_adam_beta2"],
            epsilon=params["optimizer_adam_epsilon"])

        if params["use_tpu"] and params["tpu"] != tpu_util.LOCAL:
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

        # Uses automatic mixed precision FP16 training if on GPU.
        if params["dtype"] == "fp16":
            optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
                optimizer)

        # Calculate and apply gradients using LazyAdamOptimizer.
        global_step = tf.train.get_global_step()
        tvars = tf.trainable_variables()
        gradients = optimizer.compute_gradients(
            loss, tvars, colocate_gradients_with_ops=True)
        minimize_op = optimizer.apply_gradients(gradients,
                                                global_step=global_step,
                                                name="train")
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_op = tf.group(minimize_op, update_ops)

        train_metrics = {"learning_rate": learning_rate}

        if not params["use_tpu"]:
            # gradient norm is not included as a summary when running on TPU, as
            # it can cause instability between the TPU and the host controller.
            gradient_norm = tf.global_norm(list(zip(*gradients))[0])
            train_metrics["global_norm/gradient_norm"] = gradient_norm

        return train_op, train_metrics
Example #10
def model_fn(features, labels, mode, params):
    """model_fn constructs the ML model used to predict handwritten digits."""

    del params
    image = features
    if isinstance(image, dict):
        image = features["image"]

    model = mnist.create_model("channels_last")

    if mode == tf.estimator.ModeKeys.PREDICT:
        logits = model(image, training=False)
        predictions = {
            'class_ids': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits),
        }
        return contrib_tpu.TPUEstimatorSpec(mode, predictions=predictions)

    logits = model(image, training=(mode == tf.estimator.ModeKeys.TRAIN))
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    if mode == tf.estimator.ModeKeys.TRAIN:
        learning_rate = tf.train.exponential_decay(FLAGS.learning_rate,
                                                   tf.train.get_global_step(),
                                                   decay_steps=100000,
                                                   decay_rate=0.96)
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=learning_rate)
        if FLAGS.use_tpu:
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)
        return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                            loss=loss,
                                            train_op=optimizer.minimize(
                                                loss,
                                                tf.train.get_global_step()))

    if mode == tf.estimator.ModeKeys.EVAL:
        return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                            loss=loss,
                                            eval_metrics=(metric_fn,
                                                          [labels, logits]))
Example #11
def model_fn(features, labels, mode, params):

    # Build graph
    logits = tf.layers.dense(features, 10)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits)
    optim = tf.train.GradientDescentOptimizer(learning_rate=1e-2)

    # NOTE:
    # When using TPUs, you have to use CrossShardOptimizer, which aggregates
    # gradients across shards with an all-reduce.
    if params["use_tpu"]:
        optim = tpu.CrossShardOptimizer(optim)

    train_op = optim.minimize(loss=loss,
                              global_step=tf.train.get_or_create_global_step())
    # Create EstimatorSpec
    estimator_spec = tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
    )
    return estimator_spec
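
Because this model_fn reads params["use_tpu"], that key has to be supplied through the params dict given to the estimator; a minimal sketch with placeholder values (the master address, model directory, and batch size are assumptions).

estimator = tpu.TPUEstimator(
    model_fn=model_fn,
    use_tpu=True,
    train_batch_size=128,                  # assumed global batch size
    params={"use_tpu": True},              # read by model_fn above
    config=tpu.RunConfig(
        master="grpc://10.240.1.2:8470",   # assumed TPU worker address
        model_dir="/tmp/minimal_tpu_model",
        tpu_config=tpu.TPUConfig(iterations_per_loop=100)))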
Example #12
    def _build_optimizer(self,
                         optimizer_to_use=tf.train.AdamOptimizer,
                         tpu_support=False):
        """Buids the optimizer(s) to minimize the loss(es) of the model.

        Args:
            optimizer_to_use (tf optimizer, optional): Defaults to tf.train.AdamOptimizer. Which
                optimizer to use.
            tpu_support (bool, optional): Defaults to False. Whether the optimizer must be wrapped
                in a cross-shard optimizer, as required for TPU usage.
        """
        self.optimize_ops = []
        for loss in self.losses[
                'train']:  # TODO Create appropriate external training scheme
            optimize_op = optimizer_to_use(learning_rate=self.learning_rate)
            if tpu_support:
                optimize_op = tpu.CrossShardOptimizer(optimize_op)
            optimize_op = optimize_op.minimize(
                loss=loss, global_step=tf.train.get_global_step())
            self.optimize_ops.append(optimize_op)
        logging.info('Optimizers built')
Example #13
    def _build_optimizer(self, learning_rate):
        """Build optimizer."""
        if self.hparams.optimizer == 'sgd':
            tf.logging.info('Using SGD optimizer')
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        elif self.hparams.optimizer == 'momentum':
            tf.logging.info('Using Momentum optimizer')
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=learning_rate,
                momentum=self.hparams.momentum_rate)
        elif self.hparams.optimizer == 'rmsprop':
            tf.logging.info('Using RMSProp optimizer')
            optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                                  RMSPROP_DECAY,
                                                  momentum=RMSPROP_MOMENTUM,
                                                  epsilon=RMSPROP_EPSILON)
        else:
            tf.logging.fatal('Unknown optimizer:', self.hparams.optimizer)

        if self.hparams.use_tpu:
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)
        return optimizer
Example #14
def train_function(training_method, loss, cross_loss, reg_loss, output_dir,
                   use_tpu):
    """Training script for resnet model.

  Args:
   training_method: string indicating pruning method used to compress model.
   loss: tensor float32 of the cross entropy + regularization losses.
   cross_loss: tensor, only cross entropy loss, passed for logging.
   reg_loss: tensor, only regularization loss, passed for logging.
   output_dir: string tensor indicating the directory to save summaries.
   use_tpu: boolean indicating whether to run script on a tpu.

  Returns:
    host_call: summary tensors to be computed at each training step.
    train_op: the optimization term.
  """

    global_step = tf.train.get_global_step()

    steps_per_epoch = FLAGS.num_train_images / FLAGS.train_batch_size
    current_epoch = (tf.cast(global_step, tf.float32) / steps_per_epoch)
    learning_rate = lr_schedule(current_epoch)
    if FLAGS.use_adam:
        # We don't use step decrease for the learning rate.
        learning_rate = FLAGS.base_learning_rate * (FLAGS.train_batch_size /
                                                    256.0)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    else:
        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                               momentum=FLAGS.momentum,
                                               use_nesterov=True)

    if use_tpu:
        # use CrossShardOptimizer when using TPU.
        optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

    if training_method == 'set':
        # We override the train op to also update the mask.
        optimizer = sparse_optimizers.SparseSETOptimizer(
            optimizer,
            begin_step=FLAGS.maskupdate_begin_step,
            end_step=FLAGS.maskupdate_end_step,
            grow_init=FLAGS.grow_init,
            frequency=FLAGS.maskupdate_frequency,
            drop_fraction=FLAGS.drop_fraction,
            drop_fraction_anneal=FLAGS.drop_fraction_anneal,
            stateless_seed_offset=FLAGS.seed)
    elif training_method == 'static':
        # We override the train op to also update the mask.
        optimizer = sparse_optimizers.SparseStaticOptimizer(
            optimizer,
            begin_step=FLAGS.maskupdate_begin_step,
            end_step=FLAGS.maskupdate_end_step,
            grow_init=FLAGS.grow_init,
            frequency=FLAGS.maskupdate_frequency,
            drop_fraction=FLAGS.drop_fraction,
            drop_fraction_anneal=FLAGS.drop_fraction_anneal,
            stateless_seed_offset=FLAGS.seed)
    elif training_method == 'momentum':
        # We override the train op to also update the mask.
        optimizer = sparse_optimizers.SparseMomentumOptimizer(
            optimizer,
            begin_step=FLAGS.maskupdate_begin_step,
            end_step=FLAGS.maskupdate_end_step,
            momentum=FLAGS.s_momentum,
            frequency=FLAGS.maskupdate_frequency,
            drop_fraction=FLAGS.drop_fraction,
            grow_init=FLAGS.grow_init,
            stateless_seed_offset=FLAGS.seed,
            drop_fraction_anneal=FLAGS.drop_fraction_anneal,
            use_tpu=use_tpu)
    elif training_method == 'rigl':
        # We override the train op to also update the mask.
        optimizer = sparse_optimizers.SparseRigLOptimizer(
            optimizer,
            begin_step=FLAGS.maskupdate_begin_step,
            end_step=FLAGS.maskupdate_end_step,
            grow_init=FLAGS.grow_init,
            frequency=FLAGS.maskupdate_frequency,
            drop_fraction=FLAGS.drop_fraction,
            stateless_seed_offset=FLAGS.seed,
            drop_fraction_anneal=FLAGS.drop_fraction_anneal,
            initial_acc_scale=FLAGS.rigl_acc_scale,
            use_tpu=use_tpu)
    elif training_method == 'snip':
        optimizer = sparse_optimizers.SparseSnipOptimizer(
            optimizer,
            mask_init_method=FLAGS.mask_init_method,
            custom_sparsity_map=CUSTOM_SPARSITY_MAP,
            default_sparsity=FLAGS.end_sparsity,
            use_tpu=use_tpu)
    elif training_method in ('scratch', 'baseline'):
        pass
    else:
        raise ValueError('Unsupported pruning method: %s' %
                         FLAGS.training_method)
    # UPDATE_OPS needs to be added as a dependency due to batch norm
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops), tf.name_scope('train'):
        train_op = optimizer.minimize(loss, global_step)

    metrics = {
        'global_step': tf.train.get_or_create_global_step(),
        'loss': loss,
        'cross_loss': cross_loss,
        'reg_loss': reg_loss,
        'learning_rate': learning_rate,
        'current_epoch': current_epoch,
    }

    # Logging drop_fraction if dynamic sparse training.
    if training_method in ('set', 'momentum', 'rigl', 'static'):
        metrics['drop_fraction'] = optimizer.drop_fraction

    # Let's log some statistics from a single parameter-mask couple.
    # This is useful for debugging.
    test_var = pruning.get_weights()[0]
    test_var_mask = pruning.get_masks()[0]
    metrics.update({
        'fw_nz_weight': tf.count_nonzero(test_var),
        'fw_nz_mask': tf.count_nonzero(test_var_mask),
        'fw_l1_weight': tf.reduce_sum(tf.abs(test_var))
    })

    masks = pruning.get_masks()
    global_sparsity = sparse_utils.calculate_sparsity(masks)
    metrics['global_sparsity'] = global_sparsity
    metrics.update(
        utils.mask_summaries(masks[:4] + masks[-1:],
                             with_img=FLAGS.log_mask_imgs_each_iteration))

    host_call = (functools.partial(utils.host_call_fn,
                                   output_dir), utils.format_tensors(metrics))

    return host_call, train_op
Example #15
def neumf_model_fn(features, labels, mode, params):
    """Model Function for NeuMF estimator."""
    if params.get("use_seed"):
        tf.set_random_seed(stat_utils.random_int32())

    users = features[movielens.USER_COLUMN]
    items = features[movielens.ITEM_COLUMN]

    user_input = tf.keras.layers.Input(tensor=users)
    item_input = tf.keras.layers.Input(tensor=items)
    logits = construct_model(user_input, item_input, params).output

    # Softmax with the first column of zeros is equivalent to sigmoid.
    softmax_logits = ncf_common.convert_to_softmax_logits(logits)

    if mode == tf.estimator.ModeKeys.EVAL:
        duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
        return _get_estimator_spec_with_metrics(
            logits,
            softmax_logits,
            duplicate_mask,
            params["num_neg"],
            params["match_mlperf"],
            use_tpu_spec=params["use_xla_for_gpu"])

    elif mode == tf.estimator.ModeKeys.TRAIN:
        labels = tf.cast(labels, tf.int32)
        valid_pt_mask = features[rconst.VALID_POINT_MASK]

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_NAME, value="adam")
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_LR,
                                value=params["learning_rate"])
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA1,
                                value=params["beta1"])
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_BETA2,
                                value=params["beta2"])
        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_HP_ADAM_EPSILON,
                                value=params["epsilon"])

        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=params["learning_rate"],
            beta1=params["beta1"],
            beta2=params["beta2"],
            epsilon=params["epsilon"])
        if params["use_tpu"]:
            # TODO(seemuch): remove this contrib import
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

        mlperf_helper.ncf_print(key=mlperf_helper.TAGS.MODEL_HP_LOSS_FN,
                                value=mlperf_helper.TAGS.BCE)

        loss = tf.compat.v1.losses.sparse_softmax_cross_entropy(
            labels=labels,
            logits=softmax_logits,
            weights=tf.cast(valid_pt_mask, tf.float32))

        # This tensor is used by logging hooks.
        tf.identity(loss, name="cross_entropy")

        global_step = tf.compat.v1.train.get_global_step()
        tvars = tf.compat.v1.trainable_variables()
        gradients = optimizer.compute_gradients(
            loss, tvars, colocate_gradients_with_ops=True)
        gradients = sparse_to_dense_grads(gradients)
        minimize_op = optimizer.apply_gradients(gradients,
                                                global_step=global_step,
                                                name="train")
        update_ops = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS)
        train_op = tf.group(minimize_op, update_ops)

        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)

    else:
        raise NotImplementedError
Example #16
def train_function(pruning_method, loss, output_dir, use_tpu):
    """Training script for resnet model.

  Args:
   pruning_method: string indicating pruning method used to compress model.
   loss: tensor float32 of the cross entropy + regularization losses.
   output_dir: string tensor indicating the directory to save summaries.
   use_tpu: boolean indicating whether to run script on a tpu.

  Returns:
    host_call: summary tensors to be computed at each training step.
    train_op: the optimization term.
  """

    global_step = tf.train.get_global_step()

    steps_per_epoch = FLAGS.num_train_images / FLAGS.train_batch_size
    current_epoch = (tf.cast(global_step, tf.float32) / steps_per_epoch)
    learning_rate = lr_schedule(current_epoch)
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                           momentum=FLAGS.momentum,
                                           use_nesterov=True)

    if use_tpu:
        # use CrossShardOptimizer when using TPU.
        optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

    # UPDATE_OPS needs to be added as a dependency due to batch norm
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops), tf.name_scope('train'):
        train_op = optimizer.minimize(loss, global_step)

    if not use_tpu:
        if FLAGS.num_workers > 0:
            optimizer = tf.train.SyncReplicasOptimizer(
                optimizer,
                replicas_to_aggregate=FLAGS.num_workers,
                total_num_replicas=FLAGS.num_workers)
            optimizer.make_session_run_hook(True)

    metrics = {
        'global_step': tf.train.get_or_create_global_step(),
        'loss': loss,
        'learning_rate': learning_rate,
        'current_epoch': current_epoch
    }

    if pruning_method == 'threshold':
        # construct the necessary hparams string from the FLAGS
        hparams_string = ('begin_pruning_step={0},'
                          'sparsity_function_begin_step={0},'
                          'end_pruning_step={1},'
                          'sparsity_function_end_step={1},'
                          'target_sparsity={2},'
                          'pruning_frequency={3},'
                          'threshold_decay=0,'
                          'use_tpu={4}'.format(
                              FLAGS.sparsity_begin_step,
                              FLAGS.sparsity_end_step,
                              FLAGS.end_sparsity,
                              FLAGS.pruning_frequency,
                              FLAGS.use_tpu,
                          ))

        # Parse pruning hyperparameters
        pruning_hparams = pruning.get_pruning_hparams().parse(hparams_string)

        # The first layer has so few parameters that we don't need to prune it, and
        # pruning it at higher sparsity levels has very negative effects.
        if FLAGS.prune_first_layer and FLAGS.first_layer_sparsity >= 0.:
            pruning_hparams.set_hparam(
                'weight_sparsity_map',
                ['resnet_model/initial_conv:%f' % FLAGS.first_layer_sparsity])
        if FLAGS.prune_last_layer and FLAGS.last_layer_sparsity >= 0:
            pruning_hparams.set_hparam(
                'weight_sparsity_map',
                ['resnet_model/final_dense:%f' % FLAGS.last_layer_sparsity])

        # Create a pruning object using the pruning hyperparameters
        pruning_obj = pruning.Pruning(pruning_hparams, global_step=global_step)

        # We override the train op to also update the mask.
        with tf.control_dependencies([train_op]):
            train_op = pruning_obj.conditional_mask_update_op()

        masks = pruning.get_masks()
        metrics.update(utils.mask_summaries(masks))
    elif pruning_method == 'scratch':
        masks = pruning.get_masks()
        # make sure the masks have the sparsity we expect and that it doesn't change
        metrics.update(utils.mask_summaries(masks))
    elif pruning_method == 'variational_dropout':
        masks = utils.add_vd_pruning_summaries(
            threshold=FLAGS.log_alpha_threshold)
        metrics.update(masks)
    elif pruning_method == 'l0_regularization':
        summaries = utils.add_l0_summaries()
        metrics.update(summaries)
    elif pruning_method == 'baseline':
        pass
    else:
        raise ValueError('Unsupported pruning method', FLAGS.pruning_method)

    host_call = (functools.partial(utils.host_call_fn,
                                   output_dir), utils.format_tensors(metrics))

    return host_call, train_op
Example #17
def model_fn(features, labels, mode, params):
    """TPUEstimator compatible model function."""
    loss = loss_fn(features, labels, mode, params)

    host_call = None
    train_op = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        num_batches_per_epoch = params['num_batches_per_epoch']
        global_step = tf.train.get_global_step()
        current_epoch = tf.cast(global_step,
                                tf.float32) / num_batches_per_epoch

        learning_rate = _get_learning_rate(params, global_step,
                                           num_batches_per_epoch)
        optimizer = _get_optimizer(params, learning_rate)
        if params['use_tpu']:
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, tf.train.get_global_step())

        if params['use_host_call']:

            def host_call_fn(global_step, loss, learning_rate, current_epoch):
                """Training host call. Creates scalar summaries for training metrics.

        This function is executed on the CPU and should not directly reference
        any Tensors in the rest of the `model_fn`. To pass Tensors from the
        model to the `metric_fn`, provide as part of the `host_call`. See
        https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
        for more information.

        Arguments should match the list of `Tensor` objects passed as the second
        element in the tuple passed to `host_call`.

        Args:
          global_step: `Tensor` with shape `[batch, ]` for the global_step.
          loss: `Tensor` with shape `[batch, ]` for the training loss.
          learning_rate: `Tensor` with shape `[batch, ]` for the learning_rate.
          current_epoch: `Tensor` with shape `[batch, ]` for the current_epoch.

        Returns:
          List of summary ops to run on the CPU host.
        """
                # Outfeed supports int32 but global_step is expected to be int64.
                global_step = tf.reduce_mean(global_step)
                with (contrib_summary.create_file_writer(
                        params['model_dir']).as_default()):
                    with contrib_summary.always_record_summaries():
                        contrib_summary.scalar('loss',
                                               tf.reduce_mean(loss),
                                               step=global_step)
                        contrib_summary.scalar('learning_rate',
                                               tf.reduce_mean(learning_rate),
                                               step=global_step)
                        contrib_summary.scalar('current_epoch',
                                               tf.reduce_mean(current_epoch),
                                               step=global_step)

                        return contrib_summary.all_summary_ops()

            # To log the loss, current learning rate, and epoch for Tensorboard, the
            # summary op needs to be run on the host CPU via host_call. host_call
            # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
            # dimension. These Tensors are implicitly concatenated to
            # [params['batch_size']].
            global_step_t = tf.reshape(global_step, [1])
            loss_t = tf.reshape(loss, [1])
            learning_rate_t = tf.reshape(learning_rate, [1])
            current_epoch_t = tf.reshape(current_epoch, [1])

            host_call = (host_call_fn, [
                global_step_t, loss_t, learning_rate_t, current_epoch_t
            ])

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics = _create_eval_metric(features, labels, params)

    # Restore from checkpoint if available.
    if params['init_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN:
        tf.logging.info('Found an init checkpoint.')
        model_variant = params['model_options'].model_variant
        var_scope = '{}/'.format(feature_extractor.name_scope[model_variant])

        def scaffold_fn():
            """Create Scaffold for initialization, etc."""
            tf.train.init_from_checkpoint(params['init_checkpoint'], {
                var_scope: var_scope,
            })
            return tf.train.Scaffold()
    else:
        tf.logging.info('No init checkpoint found. Training from scratch.')
        scaffold_fn = None
    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        scaffold_fn=scaffold_fn,
        host_call=host_call,
        eval_metrics=eval_metrics,
    )
Example #18
def model_fn(features, labels, mode, params):
    """`model_fn` for training mode for `TPUEstimator`."""
    labels = labels
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    x = tf.transpose(features['x'], [1, 0])
    y = tf.transpose(features['y'], [1, 0])
    init_states, model_params = _build_params(params)

    (update_average_ops, moving_average_mu,
     use_moving_average_ops) = _create_average_ops(params)

    if params.moving_average:
        tf.logging.info('swap in moving average')
        with tf.control_dependencies(use_moving_average_ops):
            total_loss, cross_entropy_loss = _forward(params,
                                                      x,
                                                      y,
                                                      model_params,
                                                      init_states,
                                                      is_training=is_training)
    else:
        if not is_training:
            tf.logging.info('not swap in moving average')
        total_loss, cross_entropy_loss = _forward(params,
                                                  x,
                                                  y,
                                                  model_params,
                                                  init_states,
                                                  is_training=is_training)

    if is_training:
        tf_vars = tf.trainable_variables()
        global_step = tf.train.get_or_create_global_step()
        lr_scale = (tf.cast(tf.shape(y)[-1], dtype=tf.float32) /
                    tf.cast(params.bptt_steps, dtype=tf.float32))
        learning_rate = utils.get_lr(global_step, params) * lr_scale
        grads = tf.gradients(total_loss, tf_vars)
        clipped_grads, grad_norm = tf.clip_by_global_norm(
            grads, params.grad_bound)

        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        if params.use_tpu:
            optimizer = contrib_tpu.CrossShardOptimizer(
                opt=optimizer, reduction=tf.losses.Reduction.MEAN)
        with tf.control_dependencies(update_average_ops):
            train_op = optimizer.apply_gradients(zip(clipped_grads, tf_vars),
                                                 global_step=global_step)

        names_and_tensors = [
            ('learning_rate', learning_rate),
            ('per_example_cross_entropy', cross_entropy_loss),
            ('train_ppl', tf.exp(cross_entropy_loss)),
            ('grad_norm', grad_norm),
            ('moving_average_mu', moving_average_mu),
        ]
        host_call = utils.build_host_call_fn(params, names_and_tensors)
        return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                            loss=total_loss,
                                            train_op=train_op,
                                            host_call=host_call)
    else:

        def _metric_fn(cross_entropy_loss):
            """Computes metrics for EstimatorSpec."""
            metrics = {
                'log_ppl/{0}'.format(params.task_mode):
                tf.metrics.mean(values=cross_entropy_loss),
            }
            return metrics

        return contrib_tpu.TPUEstimatorSpec(mode=tf.estimator.ModeKeys.EVAL,
                                            loss=total_loss,
                                            eval_metrics=(_metric_fn,
                                                          [cross_entropy_loss
                                                           ]))
Example #19
def get_cross_shard_optimizer(optimizer, disable_for_cpu_debugging=False):
    if disable_for_cpu_debugging:
        return optimizer
    return contrib_tpu.CrossShardOptimizer(optimizer)
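
A hypothetical call site for the helper above; FLAGS.use_tpu is an assumed flag.

base_optimizer = tf.train.MomentumOptimizer(learning_rate=0.1, momentum=0.9)
# On TPU the optimizer is wrapped so gradients are aggregated across shards;
# when debugging on CPU the wrapper is skipped.
optimizer = get_cross_shard_optimizer(
    base_optimizer, disable_for_cpu_debugging=not FLAGS.use_tpu)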
Example #20
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu,
                     optimizer="adamw", poly_power=1.0, start_warmup_step=0,
                     colocate_gradients_with_ops=False, hvd=None, use_fp16=False, manual_fp16=False):
  """Creates an optimizer training op."""
  global_step = tf.train.get_or_create_global_step()

  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

  # Implements linear decay of the learning rate.
  learning_rate = tf.train.polynomial_decay(
      learning_rate,
      global_step,
      num_train_steps,
      end_learning_rate=0.0,
      power=poly_power,
      cycle=False)

  # Implements linear warmup. I.e., if global_step - start_warmup_step <
  # num_warmup_steps, the learning rate will be
  # `(global_step - start_warmup_step)/num_warmup_steps * init_lr`.
  if num_warmup_steps:
    tf.logging.info("++++++ warmup starts at step " + str(start_warmup_step)
                    + ", for " + str(num_warmup_steps) + " steps ++++++")
    global_steps_int = tf.cast(global_step, tf.int32)
    start_warm_int = tf.constant(start_warmup_step, dtype=tf.int32)
    global_steps_int = global_steps_int - start_warm_int
    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = init_lr * warmup_percent_done

    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    learning_rate = (
        (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)

  # It is OK to use this optimizer for fine-tuning, since this
  # is how the model was trained (note that the Adam m/v variables are NOT
  # loaded from init_checkpoint).
  # It is also OK to use AdamW for fine-tuning even if the model was trained with LAMB.
  # As reported in the public BERT GitHub repo, the learning rate for SQuAD 1.1
  # fine-tuning is 3e-5, 4e-5, or 5e-5. For LAMB, users can use 3e-4, 4e-4, or
  # 5e-4 for a batch size of 64 during fine-tuning.
  if optimizer == "adamw":
    tf.logging.info("using adamw")
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
  elif optimizer == "lamb":
    tf.logging.info("using lamb")
    optimizer = lamb_optimizer.LAMBOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
  else:
    raise ValueError("Not supported optimizer: ", optimizer)

  if use_tpu:
    optimizer = contrib_tpu.CrossShardOptimizer(optimizer)
    
  # Change 9: add Horovod optimizer
  if hvd is not None:
    optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none)

  tvars = tf.trainable_variables()
  # grads = tf.gradients(
  #    loss, tvars, colocate_gradients_with_ops=colocate_gradients_with_ops)
  # Change 10: calculate gradients with Horovod
Example #21
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Model defination for the RetinaNet model based on ResNet-50.

  Args:
    features: The input images tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: The input labels in a tensor with the same shape as input images.
    mode: The mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
    params: The dictionary that defines the hyperparameters of the model. The
      default settings are in the default_hparams function in this file.
    model: The FPN segmentation model that outputs class logits.
    variable_filter_fn: the filter function that takes trainable_variables and
      returns the variable list after applying the filter rule.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.
  """
    def _model_outputs():
        return model(features,
                     min_level=params['min_level'],
                     max_level=params['max_level'],
                     num_classes=params['num_classes'],
                     resnet_depth=params['resnet_depth'],
                     is_training_bn=params['is_training_bn'])

    if params['use_bfloat16']:
        with contrib_tpu.bfloat16_scope():
            cls_outputs = _model_outputs()
            cls_outputs = tf.cast(cls_outputs, tf.float32)
    else:
        cls_outputs = _model_outputs()

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {'image': features, 'cls_outputs': cls_outputs}
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Load pretrained model from checkpoint.
    if params['resnet_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN:

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            tf.train.init_from_checkpoint(
                params['resnet_checkpoint'], {
                    '/': 'resnet%s/' % params['resnet_depth'],
                })
            return tf.train.Scaffold()
    else:
        scaffold_fn = None

    # Set up training loss and learning rate.
    retinanet_model.update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_global_step()
    learning_rate = retinanet_model.learning_rate_schedule(
        params['adjusted_learning_rate'], params['lr_warmup_init'],
        params['lr_warmup_step'], params['first_lr_drop_step'],
        params['second_lr_drop_step'], global_step)

    cls_loss = _segmentation_loss(cls_outputs, labels, params)
    weight_decay_loss = params['weight_decay'] * tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])
    # Add L2 regularization loss
    total_loss = cls_loss + weight_decay_loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.MomentumOptimizer(learning_rate,
                                               momentum=params['momentum'])
        if params['use_tpu']:
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = variable_filter_fn(
            tf.trainable_variables(),
            params['resnet_depth']) if variable_filter_fn else None

        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(total_loss,
                                          global_step,
                                          var_list=var_list)
    else:
        train_op = None

    # Evaluation only works on GPU/CPU host and batch_size=1
    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:
        batch_size = params['batch_size']

        def metric_fn(**kwargs):
            """Creates metric_fn for TPUEstimatorSpec."""
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            total_loss = tf.metrics.mean(kwargs['total_loss_repeat'])
            logits = tf.image.resize_bilinear(kwargs['prediction'],
                                              tf.shape(kwargs['labels'])[1:3],
                                              align_corners=True)
            predictions_with_shape = tf.argmax(logits, 3, output_type=tf.int32)
            predictions = tf.reshape(predictions_with_shape, shape=[-1])

            labels = tf.reshape(kwargs['labels'], shape=[-1])
            # Background class is considered as a class. Not ignored.
            weights = tf.to_float(tf.not_equal(labels, params['ignore_label']))

            # Set ignore_label regions to label 0, because metrics.mean_iou requires
            # range of labels = [0, dataset.num_classes).
            # Note the ignore_label regions are not evaluated since the corresponding
            # regions contain weights = 0.
            labels = tf.where(tf.equal(labels, params['ignore_label']),
                              tf.zeros_like(labels), labels)

            return {
                'total_loss':
                total_loss,
                'cls_loss':
                cls_loss,
                'miou':
                tf.metrics.mean_iou(predictions,
                                    labels,
                                    params['num_classes'],
                                    weights=weights),
            }

        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                batch_size,
            ]), [batch_size, 1])

        total_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(total_loss, 0), [
                batch_size,
            ]), [batch_size, 1])

        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'total_loss_repeat': total_loss_repeat,
            'prediction': cls_outputs,
            'labels': labels,
        }

        eval_metrics = (metric_fn, metric_fn_inputs)

    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        eval_metrics=eval_metrics,
        scaffold_fn=scaffold_fn,
    )
Example #22
def inception_model_fn(features, labels, mode, params):
    """Inception v2 model using Estimator API."""
    num_classes = FLAGS.num_classes
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    is_eval = (mode == tf.estimator.ModeKeys.EVAL)

    if isinstance(features, dict):
        features = features['feature']

    features = tensor_transform_fn(features, params['input_perm'])

    if FLAGS.clear_update_collections:
        # updates_collections must be set to None in order to use fused batchnorm
        with arg_scope(
                inception.inception_v2_arg_scope(
                    batch_norm_decay=BATCH_NORM_DECAY,
                    batch_norm_epsilon=BATCH_NORM_EPSILON,
                    updates_collections=None)):
            logits, end_points = inception.inception_v2(
                features,
                num_classes,
                is_training=is_training,
                replace_separable_convolution=True)
    else:
        with arg_scope(
                inception.inception_v2_arg_scope(
                    batch_norm_decay=BATCH_NORM_DECAY,
                    batch_norm_epsilon=BATCH_NORM_EPSILON)):
            logits, end_points = inception.inception_v2(
                features,
                num_classes,
                is_training=is_training,
                replace_separable_convolution=True)

    predictions = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and (
            not FLAGS.use_tpu):
        with tf.control_dependencies([
                tf.Print(predictions['classes'], [predictions['classes']],
                         summarize=FLAGS.eval_batch_size,
                         message='prediction: ')
        ]):
            labels = tf.Print(labels, [labels],
                              summarize=FLAGS.eval_batch_size,
                              message='label: ')

    one_hot_labels = tf.one_hot(labels, FLAGS.num_classes, dtype=tf.int32)

    tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                    logits=logits,
                                    weights=1.0,
                                    label_smoothing=0.1)
    loss = tf.losses.get_total_loss(add_regularization_losses=True)

    initial_learning_rate = FLAGS.learning_rate * FLAGS.train_batch_size / 256
    if FLAGS.use_learning_rate_warmup:
        # Adjust initial learning rate to match final warmup rate
        warmup_decay = FLAGS.learning_rate_decay**(
            (FLAGS.warmup_epochs + FLAGS.cold_epochs) /
            FLAGS.learning_rate_decay_epochs)
        adj_initial_learning_rate = initial_learning_rate * warmup_decay

    final_learning_rate = 0.0001 * initial_learning_rate

    host_call = None
    train_op = None
    if is_training:
        batches_per_epoch = _NUM_TRAIN_IMAGES / FLAGS.train_batch_size
        global_step = tf.train.get_or_create_global_step()
        current_epoch = tf.cast(
            (tf.cast(global_step, tf.float32) / batches_per_epoch), tf.int32)

        learning_rate = tf.train.exponential_decay(
            learning_rate=initial_learning_rate,
            global_step=global_step,
            decay_steps=int(FLAGS.learning_rate_decay_epochs *
                            batches_per_epoch),
            decay_rate=FLAGS.learning_rate_decay,
            staircase=True)

        if FLAGS.use_learning_rate_warmup:
            wlr = 0.1 * adj_initial_learning_rate
            wlr_height = tf.cast(
                0.9 * adj_initial_learning_rate /
                (FLAGS.warmup_epochs + FLAGS.learning_rate_decay_epochs - 1),
                tf.float32)
            epoch_offset = tf.cast(FLAGS.cold_epochs - 1, tf.int32)
            exp_decay_start = (FLAGS.warmup_epochs + FLAGS.cold_epochs +
                               FLAGS.learning_rate_decay_epochs)
            lin_inc_lr = tf.add(
                wlr,
                tf.multiply(
                    tf.cast(tf.subtract(current_epoch, epoch_offset),
                            tf.float32), wlr_height))
            learning_rate = tf.where(
                tf.greater_equal(current_epoch, FLAGS.cold_epochs),
                (tf.where(tf.greater_equal(current_epoch, exp_decay_start),
                          learning_rate, lin_inc_lr)), wlr)

        # Set a minimum boundary for the learning rate.
        learning_rate = tf.maximum(learning_rate,
                                   final_learning_rate,
                                   name='learning_rate')

        if FLAGS.optimizer == 'sgd':
            tf.logging.info('Using SGD optimizer')
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        elif FLAGS.optimizer == 'momentum':
            tf.logging.info('Using Momentum optimizer')
            optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                                   momentum=0.9)
        elif FLAGS.optimizer == 'RMS':
            tf.logging.info('Using RMS optimizer')
            optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                                  RMSPROP_DECAY,
                                                  momentum=RMSPROP_MOMENTUM,
                                                  epsilon=RMSPROP_EPSILON)
        else:
            tf.logging.fatal('Unknown optimizer: %s', FLAGS.optimizer)

        if FLAGS.use_tpu:
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=global_step)
        if FLAGS.moving_average:
            ema = tf.train.ExponentialMovingAverage(decay=MOVING_AVERAGE_DECAY,
                                                    num_updates=global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            with tf.control_dependencies([train_op
                                          ]), tf.name_scope('moving_average'):
                train_op = ema.apply(variables_to_average)

        # To log the loss, current learning rate, and epoch for TensorBoard, the
        # summary op needs to be run on the host CPU via host_call. host_call
        # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
        # dimension. These Tensors are implicitly concatenated to
        # [params['batch_size']].
        gs_t = tf.reshape(global_step, [1])
        loss_t = tf.reshape(loss, [1])
        lr_t = tf.reshape(learning_rate, [1])
        ce_t = tf.reshape(current_epoch, [1])

        def host_call_fn(gs, loss, lr, ce):
            """Training host call. Creates scalar summaries for training metrics.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the model
      to this function, provide them as part of the `host_call` tuple. See
      https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the second
      element in the tuple passed to `host_call`.

      Args:
        gs: `Tensor` with shape `[batch]` for the global_step.
        loss: `Tensor` with shape `[batch]` for the training loss.
        lr: `Tensor` with shape `[batch]` for the learning_rate.
        ce: `Tensor` with shape `[batch]` for the current_epoch.

      Returns:
        List of summary ops to run on the CPU host.
      """
            gs = gs[0]
            with summary.create_file_writer(FLAGS.model_dir).as_default():
                with summary.always_record_summaries():
                    summary.scalar('loss', tf.reduce_mean(loss), step=gs)
                    summary.scalar('learning_rate',
                                   tf.reduce_mean(lr),
                                   step=gs)
                    summary.scalar('current_epoch',
                                   tf.reduce_mean(ce),
                                   step=gs)

                    return summary.all_summary_ops()

        host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

    eval_metrics = None
    if is_eval:

        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates accuracy.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the model
      to the `metric_fn`, provide them as part of the `eval_metrics`. See
      https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the second
      element in the tuple passed to `eval_metrics`.

      Args:
        labels: `Tensor` with shape `[batch, ]`.
        logits: `Tensor` with shape `[batch, num_classes]`.

      Returns:
        A dict of the metrics to return from evaluation.
      """
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                'accuracy': top_1_accuracy,
                'accuracy@5': top_5_accuracy,
            }

        eval_metrics = (metric_fn, [labels, logits])

    return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                        loss=loss,
                                        train_op=train_op,
                                        host_call=host_call,
                                        eval_metrics=eval_metrics)
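A model_fn like the one above only becomes useful once it is handed to a TPUEstimator. The snippet below is a minimal wiring sketch added here for context, not part of the original example; flag names such as FLAGS.tpu, FLAGS.iterations_per_loop, and FLAGS.train_steps are assumptions, and input_fn stands for whatever input pipeline the surrounding code defines.

from tensorflow.contrib import cluster_resolver as contrib_cluster_resolver
from tensorflow.contrib import tpu as contrib_tpu

def run_training(model_fn, input_fn):
    """Hypothetical glue code: builds a TPUEstimator around the model_fn above."""
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop))
    estimator = contrib_tpu.TPUEstimator(
        model_fn=model_fn,
        config=run_config,
        use_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)
    estimator.train(input_fn=input_fn, max_steps=FLAGS.train_steps)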
Example #23
def _model_fn(features,
              labels,
              mode,
              params,
              model,
              use_tpu_estimator_spec,
              variable_filter_fn=None):
    """Model defination for the RetinaNet model based on ResNet.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets which are dense label maps. The labels are generated from
      get_input_fn function in dataloader.py
    mode: the mode of TPUEstimator/Estimator including TRAIN, EVAL, and PREDICT.
    params: the dictionary defines hyperparameters of model. The default
      settings are in default_hparams function in this file.
    model: the RetinaNet model outputs class logits and box regression outputs.
    use_tpu_estimator_spec: Whether to use TPUEstimatorSpec or EstimatorSpec.
    variable_filter_fn: the filter function that takes trainable_variables and
      returns the variable list after applying the filter rule.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.
  """

    # In predict mode, features is a dict whose 'inputs' key holds the input image.
    image_info = None
    if (mode == tf.estimator.ModeKeys.PREDICT and isinstance(features, dict)
            and 'inputs' in features):
        image_info = features['image_info']
        labels = None
        if 'labels' in features:
            labels = features['labels']
        features = features['inputs']

    def _model_outputs():
        return model(features,
                     min_level=params['min_level'],
                     max_level=params['max_level'],
                     num_classes=params['num_classes'],
                     num_anchors=len(params['aspect_ratios'] *
                                     params['num_scales']),
                     resnet_depth=params['resnet_depth'],
                     is_training_bn=params['is_training_bn'])

    if params['use_bfloat16']:
        with contrib_tpu.bfloat16_scope():
            cls_outputs, box_outputs = _model_outputs()
            levels = cls_outputs.keys()
            for level in levels:
                cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
                box_outputs[level] = tf.cast(box_outputs[level], tf.float32)
    else:
        cls_outputs, box_outputs = _model_outputs()
        levels = cls_outputs.keys()

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        # Postprocess on host; memory layout for NMS on TPU is very inefficient.
        def _predict_postprocess_wrapper(args):
            return _predict_postprocess(*args)

        predictions = contrib_tpu.outside_compilation(
            _predict_postprocess_wrapper,
            (cls_outputs, box_outputs, labels, params))

        # Include resizing information on prediction output to help bbox drawing.
        if image_info is not None:
            predictions.update({
                'image_info':
                tf.identity(image_info, 'ImageInfo'),
            })

        return contrib_tpu.TPUEstimatorSpec(mode=tf.estimator.ModeKeys.PREDICT,
                                            predictions=predictions)

    # Load pretrained model from checkpoint.
    if params['resnet_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN:

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            tf.train.init_from_checkpoint(
                params['resnet_checkpoint'], {
                    '/': 'resnet%s/' % params['resnet_depth'],
                })
            return tf.train.Scaffold()
    else:
        scaffold_fn = None

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_global_step()
    learning_rate = learning_rate_schedule(params['adjusted_learning_rate'],
                                           params['lr_warmup_init'],
                                           params['lr_warmup_step'],
                                           params['first_lr_drop_step'],
                                           params['second_lr_drop_step'],
                                           global_step)
    # cls_loss and box_loss are for logging. Only total_loss is optimized.
    total_loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs,
                                                    labels, params)
    total_loss += _WEIGHT_DECAY * tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.MomentumOptimizer(learning_rate,
                                               momentum=params['momentum'])
        if params['use_tpu']:
            optimizer = contrib_tpu.CrossShardOptimizer(optimizer)
        else:
            if params['auto_mixed_precision']:
                optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
                    optimizer)

        # Batch norm requires `update_ops` to be executed alongside `train_op`.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = variable_filter_fn(
            tf.trainable_variables(),
            params['resnet_depth']) if variable_filter_fn else None

        minimize_op = optimizer.minimize(total_loss,
                                         global_step,
                                         var_list=var_list)
        train_op = tf.group(minimize_op, update_ops)

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            batch_size = params['batch_size']
            eval_anchors = anchors.Anchors(params['min_level'],
                                           params['max_level'],
                                           params['num_scales'],
                                           params['aspect_ratios'],
                                           params['anchor_scale'],
                                           params['image_size'])
            anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                                   params['num_classes'])
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
            coco_metrics = coco_metric_fn(batch_size, anchor_labeler,
                                          params['val_json_file'], **kwargs)

            # Add metrics to output.
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'source_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
        }
        add_metric_fn_inputs(params, cls_outputs, box_outputs,
                             metric_fn_inputs)
        eval_metrics = (metric_fn, metric_fn_inputs)

    if use_tpu_estimator_spec:
        return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                            loss=total_loss,
                                            train_op=train_op,
                                            eval_metrics=eval_metrics,
                                            scaffold_fn=scaffold_fn)
    else:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=total_loss,
            # TODO(rostam): Fix bug to get scaffold working.
            # scaffold=scaffold_fn(),
            train_op=train_op)
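Because `_model_fn` takes more arguments than the standard `(features, labels, mode, params)` signature, it is usually wrapped before being handed to an Estimator. Below is a hedged sketch of such a wrapper; `retinanet_architecture.retinanet` and `remove_variables` are assumed names for the network and variable-filter callables defined elsewhere in the module.

def retinanet_model_fn(features, labels, mode, params):
    """Hypothetical wrapper that binds the extra _model_fn arguments."""
    return _model_fn(features,
                     labels,
                     mode,
                     params,
                     model=retinanet_architecture.retinanet,
                     use_tpu_estimator_spec=params['use_tpu'],
                     variable_filter_fn=retinanet_architecture.remove_variables)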
def model_fn(features, labels, mode, params):
    """Our model_fn for Densenet to be used with our Estimator."""
    tf.logging.info("model_fn")

    if FLAGS.network_depth == 169:
        logits = densenet_model.densenet_imagenet_169(
            features, is_training=(mode == tf.estimator.ModeKeys.TRAIN))
    elif FLAGS.network_depth == 201:
        logits = densenet_model.densenet_imagenet_201(
            features, is_training=(mode == tf.estimator.ModeKeys.TRAIN))
    elif FLAGS.network_depth == 121:
        logits = densenet_model.densenet_imagenet_121(
            features, is_training=(mode == tf.estimator.ModeKeys.TRAIN))
    else:
        tf.logging.info("Number of layers not supported, revert to 121")
        logits = densenet_model.densenet_imagenet_121(
            features, is_training=(mode == tf.estimator.ModeKeys.TRAIN))

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    cross_entropy = tf.losses.softmax_cross_entropy(logits=logits,
                                                    onehot_labels=labels)

    # Add weight decay to the loss. We exclude weight decay on the batch
    # normalization variables because it slightly improves accuracy.
    loss = cross_entropy + _WEIGHT_DECAY * tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if "batch_normalization" not in v.name
    ])

    global_step = tf.train.get_global_step()
    current_epoch = (tf.cast(global_step, tf.float32) /
                     params["batches_per_epoch"])
    learning_rate = learning_rate_schedule(current_epoch)

    # TODO(chrisying): this is a hack to get the LR and epoch for Tensorboard.
    # Reimplement this when TPU training summaries are supported.
    lr_repeat = tf.reshape(
        tf.tile(tf.expand_dims(learning_rate, 0), [
            params["batch_size"],
        ]), [params["batch_size"], 1])
    ce_repeat = tf.reshape(
        tf.tile(tf.expand_dims(current_epoch, 0), [
            params["batch_size"],
        ]), [params["batch_size"], 1])

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                               momentum=_MOMENTUM)
        optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)
    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits, lr_repeat, ce_repeat):
            """Evaluation metric fn. Performed on CPU, do not reference TPU ops."""
            predictions = tf.argmax(logits, axis=1)
            accuracy = tf.metrics.accuracy(tf.argmax(labels, axis=1),
                                           predictions)
            lr = tf.metrics.mean(lr_repeat)
            ce = tf.metrics.mean(ce_repeat)
            return {
                "accuracy": accuracy,
                "learning_rate": lr,
                "current_epoch": ce
            }

        eval_metrics = (metric_fn, [labels, logits, lr_repeat, ce_repeat])

    return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                        loss=loss,
                                        train_op=train_op,
                                        eval_metrics=eval_metrics)
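The `lr_repeat`/`ce_repeat` tiling above exists only because `eval_metrics` hands per-shard, batch-major tensors to `metric_fn`; broadcasting the scalar into a `[batch_size, 1]` tensor lets `tf.metrics.mean` recover it on the CPU host. A tiny standalone check of that reshape trick (the batch size is chosen arbitrarily for illustration):

import tensorflow as tf

def broadcast_scalar_to_batch(scalar, batch_size):
    """Mirrors the lr_repeat/ce_repeat trick: tile a scalar to [batch_size, 1]."""
    return tf.reshape(
        tf.tile(tf.expand_dims(scalar, 0), [batch_size]), [batch_size, 1])

with tf.Graph().as_default(), tf.Session() as sess:
    tiled = broadcast_scalar_to_batch(tf.constant(0.1), batch_size=4)
    print(sess.run(tiled))  # prints a [4, 1] array filled with 0.1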
Example #25
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" %
                            (name, features[name].shape))

        unique_ids = features["unique_ids"]
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        # Obtaining the membership variables is important since only those weights
        # are modified during the optimization process.
        membership_logits, membership_vars = create_model(
            bert_config=bert_config,
            input_ids=input_ids,
            input_mask=input_mask,
            segment_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
            membership_features_str=membership_features_str)

        membership_probs = tf.nn.softmax(membership_logits, axis=-1)
        membership_log_probs = tf.nn.log_softmax(membership_logits, axis=-1)

        tvars = tf.trainable_variables()

        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(
                 tvars, init_checkpoint)
            if use_tpu:

                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                            init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:

            one_hot_positions = tf.one_hot(label_ids,
                                           depth=2,
                                           dtype=tf.float32)
            loss = -tf.reduce_mean(
                tf.reduce_sum(one_hot_positions * membership_log_probs,
                              axis=-1))

            global_step = tf.train.get_or_create_global_step()

            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            if use_tpu:
                optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

            train_op = optimizer.minimize(loss=loss,
                                          global_step=global_step,
                                          var_list=membership_vars)

            output_spec = contrib_tpu.TPUEstimatorSpec(mode=mode,
                                                       loss=loss,
                                                       train_op=train_op,
                                                       scaffold_fn=scaffold_fn)

        elif mode == tf.estimator.ModeKeys.EVAL:

            one_hot_positions = tf.one_hot(label_ids,
                                           depth=2,
                                           dtype=tf.float32)
            per_example_loss = -1 * tf.reduce_sum(
                one_hot_positions * membership_log_probs, axis=-1)
            total_loss = tf.reduce_mean(per_example_loss)

            def metric_fn(per_example_loss, label_ids, membership_logits):
                predictions = tf.argmax(membership_logits,
                                        axis=-1,
                                        output_type=tf.int32)
                loss = tf.metrics.mean(values=per_example_loss)
                accuracy = tf.metrics.accuracy(labels=label_ids,
                                               predictions=predictions)
                return {"eval_accuracy": accuracy, "eval_loss": loss}

            eval_metrics = (metric_fn,
                            [per_example_loss, label_ids, membership_logits])

            output_spec = contrib_tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=eval_metrics,
                scaffold_fn=scaffold_fn)

        elif mode == tf.estimator.ModeKeys.PREDICT:
            predictions = {
                "unique_ids": unique_ids,
                "membership_probs": membership_probs
            }
            output_spec = contrib_tpu.TPUEstimatorSpec(mode=mode,
                                                       predictions=predictions,
                                                       scaffold_fn=scaffold_fn)

        else:
            raise ValueError("Only TRAIN and PREDICT modes are supported: %s" %
                             (mode))

        return output_spec
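Note that this `model_fn` is a closure: `bert_config`, `init_checkpoint`, `learning_rate`, `use_tpu`, and related values are captured from an enclosing scope rather than passed in. In BERT-style codebases this is normally done with a builder function along these lines (a sketch with assumed parameter names, not the original code):

def model_fn_builder(bert_config, init_checkpoint, learning_rate, use_tpu,
                     use_one_hot_embeddings, membership_features_str):
    """Returns the closure-style `model_fn` expected by TPUEstimator."""

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        # Body exactly as in the example above; it can reference bert_config,
        # init_checkpoint, etc. from this enclosing scope.
        ...

    return model_fn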
Example #26
def create_optimizer(loss,
                     init_lr,
                     num_train_steps,
                     num_warmup_steps,
                     use_tpu,
                     optimizer="adamw",
                     poly_power=1.0,
                     start_warmup_step=0):
    """Creates an optimizer training op."""
    global_step = tf.train.get_or_create_global_step()

    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

    # Implements linear decay of the learning rate.
    learning_rate = tf.train.polynomial_decay(learning_rate,
                                              global_step,
                                              num_train_steps,
                                              end_learning_rate=0.0,
                                              power=poly_power,
                                              cycle=False)

    # Implements linear warmup. I.e., if global_step - start_warmup_step <
    # num_warmup_steps, the learning rate will be
    # `(global_step - start_warmup_step)/num_warmup_steps * init_lr`.
    if num_warmup_steps:
        tf.logging.info("++++++ warmup starts at step " +
                        str(start_warmup_step) + ", for " +
                        str(num_warmup_steps) + " steps ++++++")
        global_steps_int = tf.cast(global_step, tf.int32)
        start_warm_int = tf.constant(start_warmup_step, dtype=tf.int32)
        global_steps_int = global_steps_int - start_warm_int
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = init_lr * warmup_percent_done

        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        learning_rate = ((1.0 - is_warmup) * learning_rate +
                         is_warmup * warmup_learning_rate)

    # It is fine to use this optimizer for fine-tuning, since this is how the
    # model was trained (note that the Adam m/v variables are NOT loaded from
    # init_checkpoint).
    # It is also fine to use AdamW for fine-tuning even if the model was
    # pre-trained with LAMB. As reported in the BERT public GitHub repo, the
    # fine-tuning learning rate for SQuAD 1.1 is 3e-5, 4e-5 or 5e-5. For LAMB,
    # users can use 3e-4, 4e-4 or 5e-4 with a batch size of 64.
    if optimizer == "adamw":
        tf.logging.info("using adamw")
        optimizer = AdamWeightDecayOptimizer(
            learning_rate=learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    elif optimizer == "lamb":
        tf.logging.info("using lamb")
        optimizer = lamb_optimizer.LAMBOptimizer(
            learning_rate=learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    else:
        raise ValueError("Not supported optimizer: ", optimizer)

    if use_tpu:
        optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

    tvars = tf.trainable_variables()
    grads = tf.gradients(loss, tvars)

    # This is how the model was pre-trained.
    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

    train_op = optimizer.apply_gradients(list(zip(grads, tvars)),
                                         global_step=global_step)

    # Normally the global step update is done inside of `apply_gradients`.
    # However, neither `AdamWeightDecayOptimizer` nor `LAMBOptimizer` does this,
    # so it is done here. If you use a different optimizer, you should probably
    # take this line out.
    new_global_step = global_step + 1
    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    return train_op
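To make the warmup-then-decay behaviour concrete, here is a small pure-Python mirror of the schedule built above; the numeric values are illustrative only and are not taken from the example.

def bert_learning_rate(step, init_lr, num_train_steps, num_warmup_steps,
                       start_warmup_step=0, poly_power=1.0):
    """Pure-Python mirror of the polynomial decay plus linear warmup above."""
    # Polynomial decay towards 0.0, with the step clamped at num_train_steps.
    progress = min(step, num_train_steps) / float(num_train_steps)
    decayed = init_lr * (1.0 - progress) ** poly_power
    # Linear warmup overrides the decayed rate for the first num_warmup_steps.
    warm_step = step - start_warmup_step
    if num_warmup_steps and warm_step < num_warmup_steps:
        return init_lr * warm_step / float(num_warmup_steps)
    return decayed

# With init_lr=3e-5 and 1,000 warmup steps, step 250 gives 3e-5 * 0.25 = 7.5e-6;
# after warmup the rate decays linearly towards 0 at num_train_steps.
print(bert_learning_rate(250, 3e-5, num_train_steps=100000, num_warmup_steps=1000))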
Example #27
    def model_fn(features, labels, mode, params=None):
        """Constructs the object detection model.

    Args:
      features: Dictionary of feature tensors, returned from `input_fn`.
      labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
        otherwise None.
      mode: Mode key from tf.estimator.ModeKeys.
      params: Parameter dictionary passed from the estimator.

    Returns:
      An `EstimatorSpec` that encapsulates the model and its serving
        configurations.
    """
        params = params or {}
        total_loss, train_op, detections, export_outputs = None, None, None, None
        is_training = mode == tf.estimator.ModeKeys.TRAIN

        # Make sure to set the Keras learning phase. True during training,
        # False for inference.
        tf.keras.backend.set_learning_phase(is_training)
        # Set policy for mixed-precision training with Keras-based models.
        if use_tpu and train_config.use_bfloat16:
            from tensorflow.python.keras.engine import base_layer_utils  # pylint: disable=g-import-not-at-top
            # Enable v2 behavior, as `mixed_bfloat16` is only supported in TF 2.0.
            base_layer_utils.enable_v2_dtype_behavior()
            tf.compat.v2.keras.mixed_precision.experimental.set_policy(
                'mixed_bfloat16')
        detection_model = detection_model_fn(is_training=is_training,
                                             add_summaries=(not use_tpu))
        scaffold_fn = None

        if mode == tf.estimator.ModeKeys.TRAIN:
            labels = unstack_batch(labels,
                                   unpad_groundtruth_tensors=train_config.
                                   unpad_groundtruth_tensors)
        elif mode == tf.estimator.ModeKeys.EVAL:
            # When evaluating on training data, it is necessary to check whether
            # the groundtruth must be unpadded.
            boxes_shape = (labels[fields.InputDataFields.groundtruth_boxes].
                           get_shape().as_list())
            unpad_groundtruth_tensors = boxes_shape[
                1] is not None and not use_tpu
            labels = unstack_batch(
                labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            provide_groundtruth(detection_model, labels)

        preprocessed_images = features[fields.InputDataFields.image]

        side_inputs = detection_model.get_side_inputs(features)

        if use_tpu and train_config.use_bfloat16:
            with contrib_tpu.bfloat16_scope():
                prediction_dict = detection_model.predict(
                    preprocessed_images,
                    features[fields.InputDataFields.true_image_shape],
                    **side_inputs)
                prediction_dict = ops.bfloat16_to_float32_nested(
                    prediction_dict)
        else:
            prediction_dict = detection_model.predict(
                preprocessed_images,
                features[fields.InputDataFields.true_image_shape],
                **side_inputs)

        def postprocess_wrapper(args):
            return detection_model.postprocess(args[0], args[1])

        if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT):
            if use_tpu and postprocess_on_cpu:
                detections = contrib_tpu.outside_compilation(
                    postprocess_wrapper,
                    (prediction_dict,
                     features[fields.InputDataFields.true_image_shape]))
            else:
                detections = postprocess_wrapper(
                    (prediction_dict,
                     features[fields.InputDataFields.true_image_shape]))

        if mode == tf.estimator.ModeKeys.TRAIN:
            load_pretrained = hparams.load_pretrained if hparams else False
            if train_config.fine_tune_checkpoint and load_pretrained:
                if not train_config.fine_tune_checkpoint_type:
                    # train_config.from_detection_checkpoint field is deprecated. For
                    # backward compatibility, set train_config.fine_tune_checkpoint_type
                    # based on train_config.from_detection_checkpoint.
                    if train_config.from_detection_checkpoint:
                        train_config.fine_tune_checkpoint_type = 'detection'
                    else:
                        train_config.fine_tune_checkpoint_type = 'classification'
                asg_map = detection_model.restore_map(
                    fine_tune_checkpoint_type=train_config.
                    fine_tune_checkpoint_type,
                    load_all_detection_checkpoint_vars=(
                        train_config.load_all_detection_checkpoint_vars))
                available_var_map = (
                    variables_helper.get_variables_available_in_checkpoint(
                        asg_map,
                        train_config.fine_tune_checkpoint,
                        include_global_step=False))
                if use_tpu:

                    def tpu_scaffold():
                        tf.train.init_from_checkpoint(
                            train_config.fine_tune_checkpoint,
                            available_var_map)
                        return tf.train.Scaffold()

                    scaffold_fn = tpu_scaffold
                else:
                    tf.train.init_from_checkpoint(
                        train_config.fine_tune_checkpoint, available_var_map)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            if (mode == tf.estimator.ModeKeys.EVAL
                    and eval_config.use_dummy_loss_in_eval):
                total_loss = tf.constant(1.0)
                losses_dict = {'Loss/total_loss': total_loss}
            else:
                losses_dict = detection_model.loss(
                    prediction_dict,
                    features[fields.InputDataFields.true_image_shape])
                losses = [loss_tensor for loss_tensor in losses_dict.values()]
                if train_config.add_regularization_loss:
                    regularization_losses = detection_model.regularization_losses(
                    )
                    if use_tpu and train_config.use_bfloat16:
                        regularization_losses = ops.bfloat16_to_float32_nested(
                            regularization_losses)
                    if regularization_losses:
                        regularization_loss = tf.add_n(
                            regularization_losses, name='regularization_loss')
                        losses.append(regularization_loss)
                        losses_dict[
                            'Loss/regularization_loss'] = regularization_loss
                total_loss = tf.add_n(losses, name='total_loss')
                losses_dict['Loss/total_loss'] = total_loss

            if 'graph_rewriter_config' in configs:
                graph_rewriter_fn = graph_rewriter_builder.build(
                    configs['graph_rewriter_config'], is_training=is_training)
                graph_rewriter_fn()

            # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we
            # can write learning rate summaries on TPU without host calls.
            global_step = tf.train.get_or_create_global_step()
            training_optimizer, optimizer_summary_vars = optimizer_builder.build(
                train_config.optimizer)

        if mode == tf.estimator.ModeKeys.TRAIN:
            if use_tpu:
                training_optimizer = contrib_tpu.CrossShardOptimizer(
                    training_optimizer)

            # Optionally freeze some layers by setting their gradients to be zero.
            trainable_variables = None
            include_variables = (train_config.update_trainable_variables
                                 if train_config.update_trainable_variables
                                 else None)
            exclude_variables = (train_config.freeze_variables
                                 if train_config.freeze_variables else None)
            trainable_variables = contrib_framework.filter_variables(
                tf.trainable_variables(),
                include_patterns=include_variables,
                exclude_patterns=exclude_variables)

            clip_gradients_value = None
            if train_config.gradient_clipping_by_norm > 0:
                clip_gradients_value = train_config.gradient_clipping_by_norm

            if not use_tpu:
                for var in optimizer_summary_vars:
                    tf.summary.scalar(var.op.name, var)
            summaries = [] if use_tpu else None
            if train_config.summarize_gradients:
                summaries = [
                    'gradients', 'gradient_norm', 'global_gradient_norm'
                ]
            train_op = contrib_layers.optimize_loss(
                loss=total_loss,
                global_step=global_step,
                learning_rate=None,
                clip_gradients=clip_gradients_value,
                optimizer=training_optimizer,
                update_ops=detection_model.updates(),
                variables=trainable_variables,
                summaries=summaries,
                name='')  # Preventing scope prefix on all variables.

        if mode == tf.estimator.ModeKeys.PREDICT:
            exported_output = exporter_lib.add_output_tensor_nodes(detections)
            export_outputs = {
                tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
                tf.estimator.export.PredictOutput(exported_output)
            }

        eval_metric_ops = None
        scaffold = None
        if mode == tf.estimator.ModeKeys.EVAL:
            class_agnostic = (fields.DetectionResultFields.detection_classes
                              not in detections)
            groundtruth = _prepare_groundtruth_for_eval(
                detection_model, class_agnostic,
                eval_input_config.max_number_of_boxes)
            use_original_images = fields.InputDataFields.original_image in features
            if use_original_images:
                eval_images = features[fields.InputDataFields.original_image]
                true_image_shapes = tf.slice(
                    features[fields.InputDataFields.true_image_shape], [0, 0],
                    [-1, 3])
                original_image_spatial_shapes = features[
                    fields.InputDataFields.original_image_spatial_shape]
            else:
                eval_images = features[fields.InputDataFields.image]
                true_image_shapes = None
                original_image_spatial_shapes = None

            eval_dict = eval_util.result_dict_for_batched_example(
                eval_images,
                features[inputs.HASH_KEY],
                detections,
                groundtruth,
                class_agnostic=class_agnostic,
                scale_to_absolute=True,
                original_image_spatial_shapes=original_image_spatial_shapes,
                true_image_shapes=true_image_shapes)

            if fields.InputDataFields.image_additional_channels in features:
                eval_dict[fields.InputDataFields.
                          image_additional_channels] = features[
                              fields.InputDataFields.image_additional_channels]

            if class_agnostic:
                category_index = label_map_util.create_class_agnostic_category_index(
                )
            else:
                category_index = label_map_util.create_category_index_from_labelmap(
                    eval_input_config.label_map_path)
            vis_metric_ops = None
            if not use_tpu and use_original_images:
                eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections(
                    category_index,
                    max_examples_to_draw=eval_config.num_visualizations,
                    max_boxes_to_draw=eval_config.max_num_boxes_to_visualize,
                    min_score_thresh=eval_config.min_score_threshold,
                    use_normalized_coordinates=False)
                vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops(
                    eval_dict)

            # Eval metrics on a single example.
            eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
                eval_config, list(category_index.values()), eval_dict)
            for loss_key, loss_tensor in iter(losses_dict.items()):
                eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
            for var in optimizer_summary_vars:
                eval_metric_ops[var.op.name] = (var, tf.no_op())
            if vis_metric_ops is not None:
                eval_metric_ops.update(vis_metric_ops)
            eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()}

            if eval_config.use_moving_averages:
                variable_averages = tf.train.ExponentialMovingAverage(0.0)
                variables_to_restore = variable_averages.variables_to_restore()
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    variables_to_restore,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours
                )
                scaffold = tf.train.Scaffold(saver=saver)

        # EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
        if use_tpu and mode != tf.estimator.ModeKeys.EVAL:
            return contrib_tpu.TPUEstimatorSpec(mode=mode,
                                                scaffold_fn=scaffold_fn,
                                                predictions=detections,
                                                loss=total_loss,
                                                train_op=train_op,
                                                eval_metrics=eval_metric_ops,
                                                export_outputs=export_outputs)
        else:
            if scaffold is None:
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    sharded=True,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
                    save_relative_paths=True)
                tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
                scaffold = tf.train.Scaffold(saver=saver)
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=detections,
                                              loss=total_loss,
                                              train_op=train_op,
                                              eval_metric_ops=eval_metric_ops,
                                              export_outputs=export_outputs,
                                              scaffold=scaffold)
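The layer-freezing step above relies on `contrib_framework.filter_variables` with the `update_trainable_variables`/`freeze_variables` patterns from the train config. Below is a small self-contained illustration of that call; the variable names are made up for the demo and are not from the original code.

import tensorflow as tf
from tensorflow.contrib import framework as contrib_framework

with tf.Graph().as_default():
    tf.get_variable('FeatureExtractor/conv/kernel', shape=[3, 3, 3, 8])
    tf.get_variable('BoxPredictor/dense/kernel', shape=[8, 4])
    kept = contrib_framework.filter_variables(
        tf.trainable_variables(),
        include_patterns=['BoxPredictor'],
        exclude_patterns=['FeatureExtractor'])
    # Only the box-predictor variable survives the filter.
    print([v.op.name for v in kept])  # ['BoxPredictor/dense/kernel']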
Example #28
def get_estimator_spec(hparams, mode, features, labels, frame_logits,
                       onset_logits, offset_logits, velocity_values,
                       offset_network=True):
  """Create TPUEstimatorSpec."""
  loss_metrics = {}
  loss = None
  if (mode == tf.estimator.ModeKeys.TRAIN or
      mode == tf.estimator.ModeKeys.EVAL):
    onset_losses = tf.losses.sigmoid_cross_entropy(
        labels.onsets[:, :, :constants.MIDI_PITCHES],
        onset_logits[:, :, :constants.MIDI_PITCHES],
        weights=tf.expand_dims(
            tf.sequence_mask(
                features.length, maxlen=tf.shape(labels.onsets)[1]),
            axis=2))
    loss_metrics['onset'] = onset_losses

    if offset_network and not hparams.drums_only:
      offset_losses = tf.losses.sigmoid_cross_entropy(
          labels.offsets[:, :, :constants.MIDI_PITCHES],
          offset_logits[:, :, :constants.MIDI_PITCHES],
          weights=tf.expand_dims(
              tf.sequence_mask(
                  features.length, maxlen=tf.shape(labels.offsets)[1]),
              axis=2))
      loss_metrics['offset'] = offset_losses

    velocity_losses = tf.losses.mean_squared_error(
        labels.velocities, velocity_values,
        weights=labels.onsets * hparams.velocity_loss_weight)
    loss_metrics['velocity'] = velocity_losses

    if not hparams.drums_only:
      frame_losses = tf.losses.sigmoid_cross_entropy(
          labels.labels[:, :, :constants.MIDI_PITCHES],
          frame_logits[:, :, :constants.MIDI_PITCHES],
          weights=tf.expand_dims(
              tf.sequence_mask(
                  features.length, maxlen=tf.shape(labels.labels)[1]),
              axis=2))
      loss_metrics['frame'] = frame_losses

    loss = tf.losses.get_total_loss()

  if (mode == tf.estimator.ModeKeys.EVAL or
      mode == tf.estimator.ModeKeys.PREDICT):
    frame_probs = tf.sigmoid(frame_logits)
    onset_probs = tf.sigmoid(onset_logits)
    if offset_network:
      offset_probs = tf.sigmoid(offset_logits)
    else:
      offset_probs = tf.zeros_like(onset_probs)
    frame_predictions = frame_probs > hparams.predict_frame_threshold
    onset_predictions = onset_probs > hparams.predict_onset_threshold
    offset_predictions = offset_probs > hparams.predict_offset_threshold

    if hparams.drum_prediction_map:
      map_predictions = functools.partial(
          drum_mappings.map_pianoroll,
          mapping_name=hparams.drum_prediction_map,
          reduce_mode='any',
          min_pitch=constants.MIN_MIDI_PITCH)
      frame_predictions = tf.map_fn(map_predictions, frame_predictions)
      onset_predictions = tf.map_fn(map_predictions, onset_predictions)
      offset_predictions = tf.map_fn(map_predictions, offset_predictions)
      map_values = functools.partial(
          drum_mappings.map_pianoroll,
          mapping_name=hparams.drum_prediction_map,
          reduce_mode='max',
          min_pitch=constants.MIN_MIDI_PITCH)
      velocity_values = tf.map_fn(map_values, velocity_values)

    metrics_values = get_metrics(features, labels, frame_probs, onset_probs,
                                 frame_predictions, onset_predictions,
                                 offset_predictions, velocity_values, hparams)

    for label, loss_collection in loss_metrics.items():
      loss_label = 'losses/' + label
      metrics_values[loss_label] = loss_collection

  if mode == tf.estimator.ModeKeys.TRAIN:
    train_op = contrib_layers.optimize_loss(
        name='training',
        loss=loss,
        global_step=tf.train.get_or_create_global_step(),
        learning_rate=hparams.learning_rate,
        learning_rate_decay_fn=functools.partial(
            tf.train.exponential_decay,
            decay_steps=hparams.decay_steps,
            decay_rate=hparams.decay_rate,
            staircase=True),
        clip_gradients=hparams.clip_norm,
        summaries=[],
        optimizer=lambda lr: contrib_tpu.CrossShardOptimizer(  # pylint:disable=g-long-lambda
            tf.train.AdamOptimizer(lr)))

    return contrib_tpu.TPUEstimatorSpec(mode=mode, loss=loss, train_op=train_op)
  elif mode == tf.estimator.ModeKeys.EVAL:
    metric_ops = {k: tf.metrics.mean(v) for k, v in metrics_values.items()}
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, eval_metric_ops=metric_ops)
  elif mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        'frame_probs':
            frame_probs,
        'onset_probs':
            onset_probs,
        'frame_predictions':
            frame_predictions,
        'onset_predictions':
            onset_predictions,
        'offset_predictions':
            offset_predictions,
        'velocity_values':
            velocity_values,
        'sequence_predictions':
            _predict_sequences(
                frame_probs=frame_probs,
                onset_probs=onset_probs,
                frame_predictions=frame_predictions,
                onset_predictions=onset_predictions,
                offset_predictions=offset_predictions,
                velocity_values=velocity_values,
                hparams=hparams),
        # Include some features and labels in output because Estimator 'predict'
        # API does not give access to them.
        'sequence_ids':
            features.sequence_id,
        'sequence_labels':
            labels.note_sequence,
        'frame_labels':
            labels.labels,
        'onset_labels':
            labels.onsets,
    }
    for k, v in metrics_values.items():
      predictions[k] = tf.stack(v)

    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
  else:
    raise ValueError('Unsupported mode: %s' % mode)
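`get_estimator_spec` does not build the network itself; a thin `model_fn` typically computes the logits first and then delegates to it, roughly as sketched here. `build_acoustic_model` is an assumed stand-in for whatever network-building function the surrounding module provides.

def transcription_model_fn(features, labels, mode, params):
    """Hypothetical model_fn that delegates to get_estimator_spec above."""
    hparams = params['hparams']
    frame_logits, onset_logits, offset_logits, velocity_values = (
        build_acoustic_model(features, hparams,
                             is_training=(mode == tf.estimator.ModeKeys.TRAIN)))
    return get_estimator_spec(hparams, mode, features, labels, frame_logits,
                              onset_logits, offset_logits, velocity_values)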
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" %
                            (name, features[name].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]
        is_real_example = None
        if "is_real_example" in features:
            is_real_example = tf.cast(features["is_real_example"],
                                      dtype=tf.float32)
        else:
            is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

        is_training = (mode == tf_estimator.ModeKeys.TRAIN)

        membership_logits, membership_vars = create_model(
            bert_config, is_training, input_ids, input_mask, segment_ids,
            num_labels, use_one_hot_embeddings, membership_features_str)

        membership_probs = tf.nn.softmax(membership_logits, axis=-1)
        membership_log_probs = tf.nn.log_softmax(membership_logits, axis=-1)

        tvars = tf.trainable_variables()
        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(
                 tvars, init_checkpoint)
            if use_tpu:

                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                            init_string)

        output_spec = None
        if mode == tf_estimator.ModeKeys.TRAIN:

            one_hot_positions = tf.one_hot(label_ids,
                                           depth=2,
                                           dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(
                one_hot_positions * membership_log_probs, axis=-1)
            total_loss = tf.reduce_mean(per_example_loss)

            global_step = tf.train.get_or_create_global_step()

            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            if use_tpu:
                optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

            # Only optimize the membership classifier variables, since we want to
            # freeze the rest of the model.
            train_op = optimizer.minimize(loss=total_loss,
                                          global_step=global_step,
                                          var_list=membership_vars)

            output_spec = contrib_tpu.TPUEstimatorSpec(mode=mode,
                                                       loss=total_loss,
                                                       train_op=train_op,
                                                       scaffold_fn=scaffold_fn)
        elif mode == tf_estimator.ModeKeys.EVAL:

            one_hot_positions = tf.one_hot(label_ids,
                                           depth=2,
                                           dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(
                one_hot_positions * membership_log_probs, axis=-1)
            total_loss = tf.reduce_mean(per_example_loss)

            def metric_fn(per_example_loss, label_ids, logits,
                          is_real_example):
                predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
                accuracy = tf.metrics.accuracy(labels=label_ids,
                                               predictions=predictions,
                                               weights=is_real_example)
                loss = tf.metrics.mean(values=per_example_loss,
                                       weights=is_real_example)
                return {
                    "eval_accuracy": accuracy,
                    "eval_loss": loss,
                }

            eval_metrics = (metric_fn, [
                per_example_loss, label_ids, membership_logits, is_real_example
            ])
            output_spec = contrib_tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=eval_metrics,
                scaffold_fn=scaffold_fn)
        else:
            output_spec = contrib_tpu.TPUEstimatorSpec(
                mode=mode,
                predictions={"probabilities": membership_probs},
                scaffold_fn=scaffold_fn)
        return output_spec