Example #1
def apply_pruning(
        pruning_obj,  # pylint:disable=invalid-name
        pruning_hparams,
        weight_params_fn,
        weight_init_obj,
        layerobj,
        wm_pc,
        dtype):
    """Apply pruning to an lingvo layer.

  Args:
    pruning_obj: a Pruning object;
    pruning_hparams: a Pruning hparams object;
    weight_params_fn: functional handle to create model parameters;
    weight_init_obj: a weight initialization object;
    layerobj: a layer object in the lingvo package;
    wm_pc: weight matrix;
    dtype: data type of the weight matrix.

  Returns:
    pruning_obj as passed in or a compression_obj.
  """
    # Pruning options that correspond to the pruning operations in model_pruning.
    if pruning_hparams.prune_option in [
            'weight', 'first_order_gradient', 'second_order_gradient'
    ]:
        mask_pc = weight_params_fn(wm_pc.shape, weight_init_obj.Constant(1.0),
                                   dtype)
        threshold_pc = weight_params_fn([], weight_init_obj.Constant(0.0),
                                        tf.float32)
        layerobj.CreateVariable('mask',
                                mask_pc,
                                theta_fn=None,
                                trainable=False)
        layerobj.CreateVariable('threshold',
                                threshold_pc,
                                theta_fn=None,
                                trainable=False)
        if layerobj.vars.mask not in tf.get_collection(
                pruning.MASK_COLLECTION):
            tf.add_to_collection(pruning.WEIGHT_COLLECTION, layerobj.vars.wm)
            tf.add_to_collection(pruning.MASK_COLLECTION, layerobj.vars.mask)
            tf.add_to_collection(pruning.THRESHOLD_COLLECTION,
                                 layerobj.vars.threshold)
        return pruning_obj
    else:  # TODO(wanxin): add model_compression options.
        return pruning_obj
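# A minimal standalone sketch of the same collection bookkeeping, assuming
# plain TF1-style variables in place of lingvo's weight_params_fn /
# CreateVariable machinery; the collection names are stand-ins for the
# constants in model_pruning.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

MASK_COLLECTION = 'masks'
THRESHOLD_COLLECTION = 'thresholds'

wm = tf.get_variable('wm', [4, 4])
mask = tf.get_variable('mask', [4, 4],
                       initializer=tf.constant_initializer(1.0),
                       trainable=False)
threshold = tf.get_variable('threshold', [],
                            initializer=tf.constant_initializer(0.0),
                            trainable=False)

# Guard against double registration, e.g. if the layer is built twice.
if mask not in tf.get_collection(MASK_COLLECTION):
    tf.add_to_collection(MASK_COLLECTION, mask)
    tf.add_to_collection(THRESHOLD_COLLECTION, threshold)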
Example #2
    def recurrent(self, node, current_level, postfix, is_training):
        tf.add_to_collection('checkpoints', node)
        num_features = self.num_filters(current_level)
        batch_size, _, image_size = get_batch_channel_image_size(
            node, data_format=self.data_format)
        cell = self.recurrent_cell(image_size, num_features, postfix,
                                   is_training)
        if self.use_lstm_input_state:
            lstm_input_state = self.lstm_input_states[current_level]
        else:
            lstm_input_state = cell.zero_state(batch_size, tf.float32)
            self.lstm_input_states[current_level] = lstm_input_state
        node, lstm_output_state = cell(node, lstm_input_state)
        tf.add_to_collection('checkpoints', node)
        tf.add_to_collection('checkpoints', lstm_output_state)
        self.lstm_output_states[current_level] = lstm_output_state
        return node
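# A standalone sketch of the zero-state/reuse pattern above, assuming a stock
# TF1 LSTMCell in place of the model's recurrent_cell().
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

batch_size, num_features = 4, 16
cell = tf.nn.rnn_cell.LSTMCell(num_features)
inputs = tf.zeros([batch_size, num_features])
state = cell.zero_state(batch_size, tf.float32)  # fresh state on the first call
output, new_state = cell(inputs, state)          # one recurrent step
tf.add_to_collection('checkpoints', output)      # mark for gradient checkpointing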
Example #3
  def __init__(self, hps, net, output_layer, experiment_proto, input_paths):
    inputs, outputs = data.input_pipeline(
        input_paths, experiment_proto, hps.mbsz, hps=hps, num_threads=8)
    with tf.name_scope('neural_net'):
      logits = net.fprop(inputs, mode='train')
    with tf.name_scope('output_layer'):
      loss_per_target = output_layer.average_loss_per_target(
          logits, outputs, include_array=hps.train_on_array)
      loss = utils.reduce_nanmean(loss_per_target)

    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    if hps.optimizer == 'momentum':
      optimizer = tf.train.MomentumOptimizer(hps.learn_rate, hps.momentum)
    elif hps.optimizer == 'adam':
      optimizer = tf.train.AdamOptimizer(hps.learn_rate)
    else:
      raise ValueError('invalid optimizer: %s' % hps.optimizer)
    grads = optimizer.compute_gradients(loss, net.params + output_layer.params)
    opt_op = optimizer.apply_gradients(grads, global_step=self.global_step)
    self.train_op = tf.with_dependencies([opt_op], loss)

    contrib_deprecated.scalar_summary('loss/mean', loss)
    for target in loss_per_target.axes['target'].labels:
      contrib_deprecated.scalar_summary(
          'loss/' + six.ensure_str(target),
          lt.select(loss_per_target, {'target': target}))
    with tf.name_scope('summarize_grads'):
      slim.learning.add_gradients_summaries(grads)

    tf.add_to_collection(tf.GraphKeys.GLOBAL_STEP, self.global_step)
    tf.add_to_collection('train_op', self.train_op)
    tf.add_to_collection('loss', loss)

    self.mbsz = hps.mbsz
    # The log Poisson loss implemented in TensorFlow may sometimes be negative.
    if (hps.loss_name == output_layers.LOSS_POISSON_LOSS or
        hps.loss_name == output_layers.LOSS_ZERO_TRUNCATED_POISSON_LOSS):
      self.min_cost = -float('inf')
      self.min_is_inclusive = False
    else:
      self.min_cost = 0
      self.min_is_inclusive = True
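# A standalone sketch of the optimizer-selection and gradient-application
# pattern above, with a toy quadratic loss standing in for the network.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

w = tf.get_variable('w', [], initializer=tf.constant_initializer(3.0))
loss = tf.square(w)
global_step = tf.train.get_or_create_global_step()

optimizer_name = 'momentum'  # hypothetical hparam value
if optimizer_name == 'momentum':
    optimizer = tf.train.MomentumOptimizer(0.1, 0.9)
elif optimizer_name == 'adam':
    optimizer = tf.train.AdamOptimizer(0.1)
else:
    raise ValueError('invalid optimizer: %s' % optimizer_name)

grads = optimizer.compute_gradients(loss, [w])
opt_op = optimizer.apply_gradients(grads, global_step=global_step)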
Example #4
    def _add_loss_graph(self):
        """Define the loss operation."""
        mc = self.mc

        with tf.variable_scope('class_regression') as scope:
            # cross-entropy: q * -log(p) + (1-q) * -log(1-p)
            # add a small value into log to prevent blowing up
            self.class_loss = tf.truediv(tf.reduce_sum(
                (self.labels * (-tf.log(self.pred_class_probs + mc.EPSILON)) +
                 (1 - self.labels) *
                 (-tf.log(1 - self.pred_class_probs + mc.EPSILON))) *
                self.input_mask * mc.LOSS_COEF_CLASS),
                                         self.num_objects,
                                         name='class_loss')
            tf.add_to_collection('losses', self.class_loss)

        with tf.variable_scope('confidence_score_regression') as scope:
            input_mask = tf.reshape(self.input_mask,
                                    [mc.BATCH_SIZE, mc.ANCHORS])
            self.conf_loss = tf.reduce_mean(tf.abs(
                tf.reduce_sum(
                    tf.square((self.ious - self.pred_conf)) *
                    (input_mask * mc.LOSS_COEF_CONF_POS / self.num_objects +
                     (1 - input_mask) * mc.LOSS_COEF_CONF_NEG /
                     (mc.ANCHORS - self.num_objects)),
                    reduction_indices=[1])),
                                            name='confidence_loss')
            tf.add_to_collection('losses', self.conf_loss)
            tf.summary.scalar('mean iou',
                              tf.reduce_sum(self.ious) / self.num_objects)

        with tf.variable_scope('bounding_box_regression') as scope:
            self.bbox_loss = tf.truediv(tf.reduce_sum(
                mc.LOSS_COEF_BBOX *
                tf.square(self.input_mask *
                          (self.pred_box_delta - self.box_delta_input))),
                                        self.num_objects,
                                        name='bbox_loss')
            tf.add_to_collection('losses', self.bbox_loss)

        # add above losses as well as weight decay losses to form the total loss
        self.loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
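# The final tf.add_n works because each partial loss was registered in the
# shared 'losses' collection; a tiny self-contained sketch with constant
# stand-ins for the real losses.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

tf.add_to_collection('losses', tf.constant(1.0, name='class_loss'))
tf.add_to_collection('losses', tf.constant(0.5, name='bbox_loss'))
total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss')

with tf.Session() as sess:
    print(sess.run(total_loss))  # 1.5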
Example #5
    def build(self):
        '''Latent vectors are used to train the generator G.'''
        self._z_placeholder = tf.placeholder(
            tf.float32, (self._batch_size, self._z_dim))  # each row vector generates one image
        '''Real images are used to train the discriminator D.'''
        self._img_placeholder = tf.placeholder(
            tf.float32, (self._batch_size, self._img_size, self._img_size,
                         1))  # [batch index, height, width, channels]
        generated_imgs = self._generator(self._z_placeholder,
                                         training=True)  # images produced by G (fakes)
        fake_img_logits = self._discriminator(generated_imgs,
                                              training=True)  # logits for the fake images
        real_img_logits = self._discriminator(self._img_placeholder,
                                              training=True)  # logits for the real images
        '''Define two loss functions and train them separately:
            the discriminator should judge real vs. fake correctly;
            the generator tries to evade the discriminator, making D judge fakes as real.
        '''
        # Generator loss
        loss_on_fake_to_real = tf.reduce_mean(  # "real" is labeled 1; take the mean
            tf.nn.sparse_softmax_cross_entropy_with_logits(  # fake images judged as real
                labels=tf.ones([self._batch_size], dtype=tf.int64),
                logits=fake_img_logits))
        # Discriminator loss
        loss_on_fake_to_fake = tf.reduce_mean(  # fakes judged as fake
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=tf.zeros([self._batch_size], dtype=tf.int64),
                logits=fake_img_logits))
        loss_on_real_to_real = tf.reduce_mean(  # reals judged as real
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=tf.ones([self._batch_size], dtype=tf.int64),
                logits=real_img_logits))
        # Total losses: a collection works like a dict of key -> values; after
        # storing the entries we sum them up, which also makes them easy to look up.
        tf.add_to_collection('g_losses', loss_on_fake_to_real)
        tf.add_to_collection('d_losses', loss_on_fake_to_fake)
        tf.add_to_collection('d_losses', loss_on_real_to_real)

        loss = {
            'g': tf.add_n(tf.get_collection('g_losses'), name='total_g_loss'),
            'd': tf.add_n(tf.get_collection('d_losses'), name='total_d_loss')
        }
        return self._z_placeholder, self._img_placeholder, generated_imgs, loss
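# One plausible way to consume the two loss collections is to build separate
# train ops for the generator and discriminator variables. A hedged sketch,
# assuming the variables live under hypothetical 'generator'/'discriminator'
# variable scopes.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

g_vars = tf.trainable_variables(scope='generator')
d_vars = tf.trainable_variables(scope='discriminator')
g_loss = tf.add_n(tf.get_collection('g_losses'), name='total_g_loss')
d_loss = tf.add_n(tf.get_collection('d_losses'), name='total_d_loss')
g_train_op = tf.train.AdamOptimizer(2e-4).minimize(g_loss, var_list=g_vars)
d_train_op = tf.train.AdamOptimizer(2e-4).minimize(d_loss, var_list=d_vars)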
Example #6
def init_training_mode():
    """  init_training_mode.

    Creates `is_training` variable and its ops if they haven't be created
    yet. This op is required if you are using layers such as dropout or
    batch normalization independently of TFLearn models (DNN or Trainer class).

    """
    # 'is_training' collection stores the training mode variable
    coll = tf.get_collection('is_training')
    if len(coll) == 0:
        tr_var = variable(
            "is_training", dtype=tf.bool, shape=[],
            initializer=tf.constant_initializer(False),
            trainable=False)
        tf.add_to_collection('is_training', tr_var)
        # 'is_training_ops' stores the ops to update training mode variable
        a = tf.assign(tr_var, True)
        b = tf.assign(tr_var, False)
        tf.add_to_collection('is_training_ops', a)
        tf.add_to_collection('is_training_ops', b)
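# Downstream code can flip the training flag by running one of the registered
# ops; a minimal usage sketch, assuming init_training_mode() has already been
# called in this graph.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

is_training = tf.get_collection('is_training')[0]
set_training, set_inference = tf.get_collection('is_training_ops')

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(set_training)        # switch to training mode
    print(sess.run(is_training))  # True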
Example #7
def apply_mask(x, scope='', prune_option='weight'):
    """Apply mask to a given weight tensor.

  Args:
    x: Input weight tensor
    scope: The current variable scope. Defaults to "".
    prune_option: pruning option. Defaults to 'weight'. option =
      'first_order_gradient' means using |weight| * |first order gradient| for
      pruning. option = 'second_order_gradient' means using |weight| * |second
      order gradient| for pruning.

  Returns:
    Tensor representing masked_weights
  """

    mask = pruning_utils.weight_mask_variable(x, scope)
    threshold = pruning_utils.weight_threshold_variable(x, scope)
    # Add masked_weights in the weights namescope so as to make it easier
    # for the quantization library to add quant ops.
    masked_weights = tf.multiply(mask, x, MASKED_WEIGHT_NAME)

    if prune_option in ('first_order_gradient', 'second_order_gradient'):
        # absolute value of gradients for gradient based pruning
        gradient = pruning_utils.weight_gradient_variable(x, scope)
        old_weight = pruning_utils.old_weight_variable(x, scope)
        old_old_weight = pruning_utils.old_old_weight_variable(x, scope)

    # Make sure the mask for a given variable is not added multiple times to
    # the collection. This is particularly important when applying a mask to
    # an RNN's weight variables.
    if mask not in tf.get_collection_ref(MASK_COLLECTION):
        tf.add_to_collection(THRESHOLD_COLLECTION, threshold)
        tf.add_to_collection(MASK_COLLECTION, mask)
        tf.add_to_collection(MASKED_WEIGHT_COLLECTION, masked_weights)
        tf.add_to_collection(WEIGHT_COLLECTION, x)
        if prune_option in ('first_order_gradient', 'second_order_gradient'):
            tf.add_to_collection(WEIGHT_GRADIENT_COLLECTION, gradient)
            tf.add_to_collection(OLD_WEIGHT_COLLECTION, old_weight)
            tf.add_to_collection(OLD_OLD_WEIGHT_COLLECTION, old_old_weight)
    return masked_weights
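# The gradient-based options score each weight as |weight| * |gradient|;
# a standalone sketch of that saliency computation with a toy loss.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

w = tf.Variable([[1.0, -2.0], [0.5, 3.0]])
loss = tf.reduce_sum(tf.square(w))
(grad,) = tf.gradients(loss, [w])
saliency = tf.abs(w) * tf.abs(grad)  # |weight| * |first-order gradient|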
Example #8
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" %
                            (name, features[name].shape))

        # MTF setup.
        graph = mtf.Graph()
        mesh_shape = mtf.convert_to_shape(FLAGS.mesh_shape)
        layout_rules = mtf.convert_to_layout_rules(FLAGS.layout)

        ctx = params["context"]
        num_hosts = ctx.num_hosts
        host_placement_fn = ctx.tpu_host_placement_function
        device_list = [host_placement_fn(host_id=t) for t in range(num_hosts)]
        tf.logging.info("device_list = %s" % device_list, )
        replica_cache_size = 300 * 1000000  # 300M per replica
        # Worker 0 caches all the TPU binaries.
        worker0_mem = replica_cache_size * ctx.num_replicas
        devices_memory_usage = [worker0_mem] + [0] * (num_hosts - 1)
        var_placer = mtf.utils.BalancedVariablePlacer(device_list,
                                                      devices_memory_usage)
        mesh_devices = [""] * mesh_shape.size
        physical_shape = list(ctx.device_assignment.topology.mesh_shape)
        logical_to_physical = mtf.simd_mesh_impl.auto_logical_to_physical_tpu(
            mesh_shape.to_integer_list, physical_shape)
        mesh_impl = mtf.simd_mesh_impl.SimdMeshImpl(
            mesh_shape,
            layout_rules,
            mesh_devices,
            ctx.device_assignment,
            logical_to_physical=logical_to_physical)
        mesh = mtf.Mesh(graph, "bert_mesh", var_placer)

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        masked_lm_positions = features["masked_lm_positions"]
        masked_lm_ids = features["masked_lm_ids"]
        masked_lm_weights = features["masked_lm_weights"]
        next_sentence_labels = tf.squeeze(features["next_sentence_labels"], 1)

        batch_size = input_ids.get_shape()[0].value
        batch_dim = mtf.Dimension("batch", batch_size)

        seq_length = input_ids.get_shape()[1].value
        seq_dim = mtf.Dimension("seq", seq_length)
        max_predictions_per_seq = masked_lm_positions.get_shape()[1].value
        max_predictions_per_seq_dim = mtf.Dimension("max_pred_seq",
                                                    max_predictions_per_seq)

        mtf_input_ids = mtf.import_tf_tensor(mesh, input_ids,
                                             [batch_dim, seq_dim])
        mtf_input_mask = mtf.import_tf_tensor(mesh, input_mask,
                                              [batch_dim, seq_dim])
        mtf_segment_ids = mtf.import_tf_tensor(mesh, segment_ids,
                                               [batch_dim, seq_dim])
        mtf_masked_lm_positions = mtf.import_tf_tensor(
            mesh, masked_lm_positions,
            [batch_dim, max_predictions_per_seq_dim])
        mtf_masked_lm_ids = mtf.import_tf_tensor(
            mesh, masked_lm_ids, [batch_dim, max_predictions_per_seq_dim])

        mtf_masked_lm_weights = mtf.import_tf_tensor(
            mesh, masked_lm_weights, [batch_dim, max_predictions_per_seq_dim])
        mtf_next_sentence_labels = mtf.import_tf_tensor(
            mesh, next_sentence_labels, [batch_dim])

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        model = bert_lib.BertModel(config=bert_config,
                                   is_training=is_training,
                                   input_ids=mtf_input_ids,
                                   input_mask=mtf_input_mask,
                                   token_type_ids=mtf_segment_ids,
                                   layout=layout_rules,
                                   mesh_shape=mesh_shape)

        (masked_lm_loss, masked_lm_example_loss,
         masked_lm_logits) = model.get_masked_lm_output(
             mtf_masked_lm_positions, mtf_masked_lm_ids, mtf_masked_lm_weights)

        (next_sentence_loss, next_sentence_example_loss, next_sentence_logits
         ) = model.get_next_sentence_output(mtf_next_sentence_labels)

        extra_loss = model.get_extra_loss()

        total_loss = masked_lm_loss + next_sentence_loss
        total_loss = mtf.anonymize(total_loss)
        masked_lm_example_loss = mtf.anonymize(masked_lm_example_loss)
        masked_lm_logits = mtf.anonymize(masked_lm_logits)
        next_sentence_example_loss = mtf.anonymize(next_sentence_example_loss)
        next_sentence_logits = mtf.anonymize(next_sentence_logits)

        # TRAIN mode
        if mode == tf.estimator.ModeKeys.TRAIN:
            _, update_ops = optimization_lib.create_optimizer(
                total_loss + extra_loss,
                learning_rate,
                num_train_steps,
                num_warmup_steps,
                optimizer=FLAGS.optimizer,
                clip_gradients=FLAGS.clip_gradients)

        lowering = mtf.Lowering(graph, {mesh: mesh_impl})

        tf_loss = tf.to_float(lowering.export_to_tf_tensor(total_loss))

        if mode == tf.estimator.ModeKeys.TRAIN:
            global_step = tf.train.get_global_step()
            tf_update_ops = [
                lowering.lowered_operation(op) for op in update_ops
            ]
            tf_update_ops.append(tf.assign_add(global_step, 1))
            tf.logging.info("tf_update_ops: {}".format(tf_update_ops))
            train_op = tf.group(tf_update_ops)
        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(masked_lm_example_loss, masked_lm_logits,
                          masked_lm_ids, masked_lm_weights,
                          next_sentence_example_loss, next_sentence_logits,
                          next_sentence_labels):
                """Computes the loss and accuracy of the model."""
                masked_lm_logits = tf.reshape(masked_lm_logits,
                                              [-1, masked_lm_logits.shape[-1]])
                masked_lm_predictions = tf.argmax(masked_lm_logits,
                                                  axis=-1,
                                                  output_type=tf.int32)
                masked_lm_example_loss = tf.reshape(masked_lm_example_loss,
                                                    [-1])
                masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
                masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
                masked_lm_accuracy = tf.metrics.accuracy(
                    labels=masked_lm_ids,
                    predictions=masked_lm_predictions,
                    weights=masked_lm_weights)
                masked_lm_mean_loss = tf.metrics.mean(
                    values=masked_lm_example_loss, weights=masked_lm_weights)

                next_sentence_logits = tf.reshape(
                    next_sentence_logits, [-1, next_sentence_logits.shape[-1]])
                next_sentence_predictions = tf.argmax(next_sentence_logits,
                                                      axis=-1,
                                                      output_type=tf.int32)
                next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
                next_sentence_accuracy = tf.metrics.accuracy(
                    labels=next_sentence_labels,
                    predictions=next_sentence_predictions)
                next_sentence_mean_loss = tf.metrics.mean(
                    values=next_sentence_example_loss)

                return {
                    "masked_lm_accuracy": masked_lm_accuracy,
                    "masked_lm_loss": masked_lm_mean_loss,
                    "next_sentence_accuracy": next_sentence_accuracy,
                    "next_sentence_loss": next_sentence_mean_loss,
                }

            eval_metrics = (metric_fn, [
                lowering.export_to_tf_tensor(masked_lm_example_loss),
                lowering.export_to_tf_tensor(masked_lm_logits), masked_lm_ids,
                masked_lm_weights,
                lowering.export_to_tf_tensor(next_sentence_example_loss),
                lowering.export_to_tf_tensor(next_sentence_logits),
                next_sentence_labels
            ])

        with mtf.utils.outside_all_rewrites():
            # Copy master variables to slices. Must be called first.
            restore_hook = mtf.MtfRestoreHook(lowering)
            if mode == tf.estimator.ModeKeys.TRAIN:
                saver = tf.train.Saver(tf.global_variables(),
                                       sharded=True,
                                       max_to_keep=10,
                                       keep_checkpoint_every_n_hours=2,
                                       defer_build=False,
                                       save_relative_paths=True)
                tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
                saver_listener = mtf.MtfCheckpointSaverListener(lowering)
                saver_hook = tf.train.CheckpointSaverHook(
                    FLAGS.output_dir,
                    save_steps=1000,
                    saver=saver,
                    listeners=[saver_listener])

                return tf.estimator.tpu.TPUEstimatorSpec(
                    tf.estimator.ModeKeys.TRAIN,
                    loss=tf_loss,
                    train_op=train_op,
                    training_hooks=[restore_hook, saver_hook])
            elif mode == tf.estimator.ModeKeys.EVAL:
                return tf.estimator.tpu.TPUEstimatorSpec(
                    tf.estimator.ModeKeys.EVAL,
                    evaluation_hooks=[restore_hook],
                    loss=tf_loss,
                    eval_metrics=eval_metrics)
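# For readers new to Mesh TensorFlow: mtf.Dimension and import_tf_tensor bind
# a named shape to an ordinary TF tensor. A minimal sketch, assuming the
# mesh_tensorflow package is installed.
import mesh_tensorflow as mtf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

graph = mtf.Graph()
mesh = mtf.Mesh(graph, 'toy_mesh')
batch_dim = mtf.Dimension('batch', 2)
seq_dim = mtf.Dimension('seq', 8)

input_ids = tf.zeros([2, 8], tf.int32)
mtf_input_ids = mtf.import_tf_tensor(mesh, input_ids, [batch_dim, seq_dim])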
Example #9
                                                          labels=y_,
                                                          name="xentropy")
    loss = tf.reduce_mean(xentropy, name='loss')
    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.minimize(loss, name="train_op")

with tf.name_scope("eval"):
    correct = tf.equal(tf.argmax(logits, axis=1), tf.argmax(y_, axis=1))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

with tf.name_scope("init_and_save"):
    init_op = tf.global_variables_initializer()
    saver = tf.train.Saver()  # We need to add a saver Op

# Now we add everything we'll need in the future to a collection
tf.add_to_collection('train_var', train_op)
tf.add_to_collection('train_var', accuracy)
tf.add_to_collection('train_var', x)
tf.add_to_collection('train_var', y_)

n_epoch = 100

with tf.Session() as sess:
    sess.run(init_op)

    graph = tf.get_default_graph()
    print2(graph.get_name_scope())

    for epoch in range(n_epoch):

        # One step of the training
Example #10
def image(name, tensor, is_tpu=True):
  logging.info('Adding image summary {}'.format(Pair(name, tensor)))
  if is_tpu:
    tf.add_to_collection('image_summaries', Pair(name, tensor))
  else:
    tf.summary.image(name, tensor)
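# On TPU the summaries are only queued in a collection, so host-side code has
# to drain them later. A hedged sketch, assuming Pair is a namedtuple with
# 'name' and 'tensor' fields, as the collection entries suggest.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

for pair in tf.get_collection('image_summaries'):
    tf.summary.image(pair.name, pair.tensor)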
Example #11
def add_output_tensor_nodes(postprocessed_tensors,
                            output_collection_name='inference_op'):
    """Adds output nodes for detection boxes and scores.

  Adds the following nodes for output tensors -
    * num_detections: float32 tensor of shape [batch_size].
    * detection_boxes: float32 tensor of shape [batch_size, num_boxes, 4]
      containing detected boxes.
    * detection_scores: float32 tensor of shape [batch_size, num_boxes]
      containing scores for the detected boxes.
    * detection_multiclass_scores: (Optional) float32 tensor of shape
      [batch_size, num_boxes, num_classes_with_background] containing the
      class score distribution for detected boxes, including background if any.
    * detection_features: (Optional) float32 tensor of shape
      [batch, num_boxes, roi_height, roi_width, depth]
      containing classifier features
      for each detected box
    * detection_classes: float32 tensor of shape [batch_size, num_boxes]
      containing class predictions for the detected boxes.
    * detection_keypoints: (Optional) float32 tensor of shape
      [batch_size, num_boxes, num_keypoints, 2] containing keypoints for each
      detection box.
    * detection_masks: (Optional) float32 tensor of shape
      [batch_size, num_boxes, mask_height, mask_width] containing masks for each
      detection box.

  Args:
    postprocessed_tensors: a dictionary containing the following fields
      'detection_boxes': [batch, max_detections, 4]
      'detection_scores': [batch, max_detections]
      'detection_multiclass_scores': [batch, max_detections,
        num_classes_with_background]
      'detection_features': [batch, num_boxes, roi_height, roi_width, depth]
      'detection_classes': [batch, max_detections]
      'detection_masks': [batch, max_detections, mask_height, mask_width]
        (optional).
      'detection_keypoints': [batch, max_detections, num_keypoints, 2]
        (optional).
      'num_detections': [batch]
    output_collection_name: Name of collection to add output tensors to.

  Returns:
    A tensor dict containing the added output tensor nodes.
  """
    detection_fields = fields.DetectionResultFields
    label_id_offset = 1
    boxes = postprocessed_tensors.get(detection_fields.detection_boxes)
    scores = postprocessed_tensors.get(detection_fields.detection_scores)
    multiclass_scores = postprocessed_tensors.get(
        detection_fields.detection_multiclass_scores)
    box_classifier_features = postprocessed_tensors.get(
        detection_fields.detection_features)
    raw_boxes = postprocessed_tensors.get(detection_fields.raw_detection_boxes)
    raw_scores = postprocessed_tensors.get(
        detection_fields.raw_detection_scores)
    classes = postprocessed_tensors.get(
        detection_fields.detection_classes) + label_id_offset
    keypoints = postprocessed_tensors.get(detection_fields.detection_keypoints)
    masks = postprocessed_tensors.get(detection_fields.detection_masks)
    num_detections = postprocessed_tensors.get(detection_fields.num_detections)
    outputs = {}
    outputs[detection_fields.detection_boxes] = tf.identity(
        boxes, name=detection_fields.detection_boxes)
    outputs[detection_fields.detection_scores] = tf.identity(
        scores, name=detection_fields.detection_scores)
    if multiclass_scores is not None:
        outputs[detection_fields.detection_multiclass_scores] = tf.identity(
            multiclass_scores,
            name=detection_fields.detection_multiclass_scores)
    if box_classifier_features is not None:
        outputs[detection_fields.detection_features] = tf.identity(
            box_classifier_features, name=detection_fields.detection_features)
    outputs[detection_fields.detection_classes] = tf.identity(
        classes, name=detection_fields.detection_classes)
    outputs[detection_fields.num_detections] = tf.identity(
        num_detections, name=detection_fields.num_detections)
    if raw_boxes is not None:
        outputs[detection_fields.raw_detection_boxes] = tf.identity(
            raw_boxes, name=detection_fields.raw_detection_boxes)
    if raw_scores is not None:
        outputs[detection_fields.raw_detection_scores] = tf.identity(
            raw_scores, name=detection_fields.raw_detection_scores)
    if keypoints is not None:
        outputs[detection_fields.detection_keypoints] = tf.identity(
            keypoints, name=detection_fields.detection_keypoints)
    if masks is not None:
        outputs[detection_fields.detection_masks] = tf.identity(
            masks, name=detection_fields.detection_masks)
    for output_key in outputs:
        tf.add_to_collection(output_collection_name, outputs[output_key])

    return outputs
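# tf.identity(..., name=...) pins a stable name on each output so an exported
# graph can fetch it by name; a tiny standalone sketch of the pattern.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

raw = tf.random_uniform([1, 10, 4])
boxes = tf.identity(raw, name='detection_boxes')
tf.add_to_collection('inference_op', boxes)

fetched = tf.get_default_graph().get_tensor_by_name('detection_boxes:0')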
Example #12
  def _model_fn(input_fea, input_lab):
    """Creates a model, add summary, modes (train or eval), and hooks."""

    # input_fea and input_lab should be a list (laid_out_tensors).
    if not isinstance(input_fea, list):
      input_fea = [input_fea]
    if not isinstance(input_lab, list):
      input_lab = [input_lab]

    def _add_summary(lowering, train_or_eval, tf_loss, scalars, global_step):
      """Add all summaries."""
      for k in scalars.keys():
        if not isinstance(scalars[k], tf.Tensor):
          scalars[k] = tf.cast(
              lowering.export_to_tf_tensor(scalars[k]), tf.float32)

      def _host_loss_summary(global_step, tf_loss, **scalars):
        """Add summary.scalar in host side."""
        gs = tf.cast(global_step, tf.int64)
        sum_loss = contrib_summary.scalar(
            '{}_loss'.format(train_or_eval), tf_loss, step=gs)
        sum_ops = [sum_loss.op]
        for description, tf_metric in scalars.items():
          sum_metric = contrib_summary.scalar(
              '{}_{}'.format(train_or_eval, description), tf_metric, step=gs)
          sum_ops.append(sum_metric)
        with tf.control_dependencies(sum_ops):
          return tf.identity(tf_loss)

      if FLAGS.use_tpu:
        # Cast the global step to tf.int32, since
        # outside_compilation does not support tf.int64.
        tf_loss = tpu.outside_compilation(
            _host_loss_summary,
            tf.cast(global_step, tf.int32),
            tf_loss,
            **scalars)
      else:
        tf_loss = _host_loss_summary(
            tf.cast(global_step, tf.int32),
            tf_loss,
            **scalars)

      return tf_loss

    global_step = tf.train.get_or_create_global_step()
    graph, mesh, mesh_impl = mesh_context.create_graph_mesh_and_mesh_impl()

    with mtf.utils.outside_all_rewrites():
      # Do not tpu_rewrite this part. Inside this unet, using plain TensorFlow
      # ops instead of Mesh-TensorFlow ops would cause host-to-TPU send/recv.
      preds, loss, scalars, bn_update_ops = (
          unet.unet_with_spatial_partition(
              mesh, mesh_impl, train_or_eval, input_fea, input_lab))

    if train_or_eval == 'train':
      var_grads = mtf.gradients(
          [loss], [v.outputs[0] for v in graph.trainable_variables])

      lr = FLAGS.lr * tf.pow(
          FLAGS.lr_drop_rate,
          tf.floor(tf.cast(global_step, tf.float32) / FLAGS.lr_drop_steps))
      scalars['learning_rate'] = lr

      optimizer = mtf.optimize.AdafactorOptimizer(learning_rate=lr)
      update_ops = optimizer.apply_grads(var_grads, graph.trainable_variables)

      # This is where the actual tf graph got built.
      lowering = mtf.Lowering(graph, {mesh: mesh_impl})

      tf_update_ops = [lowering.lowered_operation(op) for op in update_ops]
      tf_update_ops.append(tf.assign_add(global_step, 1))
      tf_update_ops.extend(
          [lowering.lowered_operation(op) for op in bn_update_ops])

    else:  # train_or_eval == 'eval':
      preds = [mtf.anonymize(pred) for pred in preds]

      # This is where the actual tf graph got built.
      lowering = mtf.Lowering(graph, {mesh: mesh_impl})

      tf_preds = [tf.cast(
          lowering.export_to_tf_tensor(pred), tf.float32) for pred in preds]

    tf_loss = tf.cast(lowering.export_to_tf_tensor(loss), tf.float32)
    if FLAGS.write_summary:
      tf_loss = _add_summary(
          lowering, train_or_eval, tf_loss, scalars, global_step)
    master_to_slice_hook = mtf.MtfRestoreHook(lowering)

    if train_or_eval == 'train':
      with mtf.utils.outside_all_rewrites():
        saver = tf.train.Saver(tf.global_variables(),
                               save_relative_paths=True)
        tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
        saver_listener = mtf.MtfCheckpointSaverListener(lowering)
        slice_to_master_hook = tf.train.CheckpointSaverHook(
            FLAGS.checkpoint_dir,
            save_steps=FLAGS.save_checkpoints_steps,
            saver=saver, listeners=[saver_listener])
        captured_hooks.capture([master_to_slice_hook, slice_to_master_hook])
        return tf.group([tf_loss] + tf_update_ops)

    else:  # train_or_eval == 'eval':
      if FLAGS.use_tpu:
        tf_preds.extend([tf_loss, global_step])
        tf_preds_dtypes = [tf_pred.dtype for tf_pred in tf_preds]
        tf_preds_shapes = [tf_pred.shape for tf_pred in tf_preds]
        captured_hooks.capture([master_to_slice_hook, None])
        captured_output_dtypes_shapes.capture(
            [tf_preds_dtypes, tf_preds_shapes])
        return tpu_ops.outfeed_enqueue_tuple(tf_preds)

      else:
        tf_preds.extend([tf_loss, global_step])
        captured_hooks.capture([master_to_slice_hook, None])
        return tf_preds
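# The learning-rate schedule above is a staircase decay: the rate drops by
# lr_drop_rate every lr_drop_steps steps. A standalone sketch with
# hypothetical numbers in place of the FLAGS.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

global_step = tf.train.get_or_create_global_step()
base_lr, drop_rate, drop_steps = 0.1, 0.5, 1000.0  # hypothetical values
lr = base_lr * tf.pow(
    drop_rate, tf.floor(tf.cast(global_step, tf.float32) / drop_steps))
# steps 0..999 -> 0.1, steps 1000..1999 -> 0.05, and so on.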
Example #13
def eval_op(batch, hparams, config_name):
    """Define a evaluation op.

    Args:
      batch: Batch produced by NSynthReader.
      hparams: Hyperparameters.
      config_name: Name of config module.

    Returns:
      eval_op: A complete evaluation op with summaries.
    """
    phase = not (hparams.mag_only or hparams.raw_audio)

    config = utils.get_module("baseline.models.ae_configs.%s" % config_name)
    if hparams.raw_audio:
        x = batch["audio"]
        # Add height and channel dims
        x = tf.expand_dims(tf.expand_dims(x, 1), -1)
    else:
        x = batch["spectrogram"]

    # Define the model
    with tf.name_scope("Model"):
        z = config.encode(x, hparams, is_training=False)
        xhat = config.decode(z, batch, hparams, is_training=False)

    # For interpolation
    tf.add_to_collection("x", x)
    tf.add_to_collection("pitch", batch["pitch"])
    tf.add_to_collection("z", z)
    tf.add_to_collection("xhat", xhat)

    total_loss = compute_mse_loss(x, xhat, hparams)

    # Define the metrics:
    names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
        "Loss":
        slim.metrics.mean(total_loss),
    })

    # Define the summaries
    for name, value in names_to_values.items():
        slim.summaries.add_scalar_summary(value, name, print_summary=True)

    # Interpolate
    with tf.name_scope("Interpolation"):
        xhat = config.decode(z, batch, hparams, reuse=True, is_training=False)

        # Linear interpolation
        z_shift_one_example = tf.concat([z[1:], z[:1]], 0)
        z_linear_half = (z + z_shift_one_example) / 2.0
        xhat_linear_half = config.decode(z_linear_half,
                                         batch,
                                         hparams,
                                         reuse=True,
                                         is_training=False)

        # Pitch shift

        pitch_plus_2 = tf.clip_by_value(batch["pitch"] + 2, 0, 127)
        pitch_minus_2 = tf.clip_by_value(batch["pitch"] - 2, 0, 127)

        batch["pitch"] = pitch_minus_2
        xhat_pitch_minus_2 = config.decode(z,
                                           batch,
                                           hparams,
                                           reuse=True,
                                           is_training=False)
        batch["pitch"] = pitch_plus_2
        xhat_pitch_plus_2 = config.decode(z,
                                          batch,
                                          hparams,
                                          reuse=True,
                                          is_training=False)

    utils.specgram_summaries(x, "Training Examples", hparams, phase=phase)
    utils.specgram_summaries(xhat, "Reconstructions", hparams, phase=phase)
    utils.specgram_summaries(x - xhat,
                             "Difference",
                             hparams,
                             audio=False,
                             phase=phase)
    utils.specgram_summaries(xhat_linear_half,
                             "Linear Interp. 0.5",
                             hparams,
                             phase=phase)
    utils.specgram_summaries(xhat_pitch_plus_2,
                             "Pitch +2",
                             hparams,
                             phase=phase)
    utils.specgram_summaries(xhat_pitch_minus_2,
                             "Pitch -2",
                             hparams,
                             phase=phase)

    return list(names_to_updates.values())
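# The linear interpolation pairs each latent code with its batch neighbor by
# rotating the batch one position; a tiny numeric sketch of that trick.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

z = tf.constant([[1.0], [2.0], [3.0]])
z_shift = tf.concat([z[1:], z[:1]], 0)  # rotate the batch by one example
z_half = (z + z_shift) / 2.0            # midpoint between neighboring codes

with tf.Session() as sess:
    print(sess.run(z_half))  # [[1.5], [2.5], [2.0]]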
Example #14
weight1 = tf.Variable(tf.truncated_normal([9, 50], stddev=0.1))
bias1 = tf.Variable(tf.constant(0.1, shape=[50]))

weight2 = tf.Variable(tf.truncated_normal([50, 50], stddev=0.1))
bias2 = tf.Variable(tf.constant(0.1, shape=[50]))

weight3 = tf.Variable(tf.truncated_normal([50, 1], stddev=0.1))
bias3 = tf.Variable(tf.constant(0.1, shape=[1]))

sample_size = len(data)
# output y
y = hidden_layer(x, weight1, bias1, weight2, bias2, weight3, bias3)

# loss function
error_loss = tf.reduce_sum(tf.pow(y_ - y, 2)) / sample_size
tf.add_to_collection("losses", error_loss)

# add L2 regularization
#regularizer = tf.contrib.layers.l2_regularizer(0.01)
regularizer = tf.keras.regularizers.l2(0.001)
regularization = regularizer(weight1) + regularizer(weight2) + regularizer(
    weight3)
tf.add_to_collection("losses", regularization)

loss = tf.add_n(tf.get_collection("losses"))

# define the optimizer
train_op = tf.train.AdamOptimizer(0.05).minimize(loss)
#train_op = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

# define accuracy
Example #15
def merge(tensors_list, mode, axis=1, name="Merge"):
    """ Merge.

    Merge a list of `Tensor` into a single one. A merging 'mode' must be
    specified, check below for the different options.

    Input:
        List of Tensors.

    Output:
        Merged Tensors.

    Arguments:
        tensors_list: A list of `Tensor` objects to merge.
        mode: `str`. Merging mode, it supports:
            ```
            'concat': concatenate outputs along specified axis
            'elemwise_sum': outputs element-wise sum
            'elemwise_mul': outputs element-wise mul
            'sum': outputs element-wise sum along specified axis
            'mean': outputs element-wise average along specified axis
            'prod': outputs element-wise multiplication along specified axis
            'max': outputs max elements along specified axis
            'min': outputs min elements along specified axis
            'and': `logical and` btw outputs elements along specified axis
            'or': `logical or` btw outputs elements along specified axis
            ```
        axis: `int`. Represents the axis to use for merging mode.
            In most cases: 0 for concat and 1 for other modes.
        name: A name for this layer (optional). Default: 'Merge'.

    """

    assert len(tensors_list) > 1, "Merge requires 2 or more tensors."

    with tf.name_scope(name) as scope:
        tensors = [l for l in tensors_list]
        if mode == 'concat':
            inference = tf.concat(tensors, axis)
        elif mode == 'elemwise_sum':
            inference = tensors[0]
            for i in range(1, len(tensors)):
                inference = tf.add(inference, tensors[i])
        elif mode == 'elemwise_mul':
            inference = tensors[0]
            for i in range(1, len(tensors)):
                inference = tf.multiply(inference, tensors[i])
        elif mode == 'sum':
            inference = tf.reduce_sum(tf.concat(tensors, axis),
                                      reduction_indices=axis)
        elif mode == 'mean':
            inference = tf.reduce_mean(tf.concat(tensors, axis),
                                       reduction_indices=axis)
        elif mode == 'prod':
            inference = tf.reduce_prod(tf.concat(tensors, axis),
                                       reduction_indices=axis)
        elif mode == 'max':
            inference = tf.reduce_max(tf.concat(tensors, axis),
                                      reduction_indices=axis)
        elif mode == 'min':
            inference = tf.reduce_min(tf.concat(tensors, axis),
                                      reduction_indices=axis)
        elif mode == 'and':
            inference = tf.reduce_all(tf.concat(tensors, axis),
                                      reduction_indices=axis)
        elif mode == 'or':
            inference = tf.reduce_any(tf.concat(tensors, axis),
                                      reduction_indices=axis)
        else:
            raise Exception("Unknown merge mode", str(mode))

    # Track output tensor.
    tf.add_to_collection(tf.GraphKeys.LAYER_TENSOR + '/' + name, inference)

    return inference
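# A short usage sketch for merge(); note the final collection key
# tf.GraphKeys.LAYER_TENSOR is added by TFLearn, not stock TensorFlow, so
# this assumes TFLearn has been imported.
a = tf.ones([2, 3])
b = tf.ones([2, 3]) * 2.0

summed = merge([a, b], mode='elemwise_sum')    # shape [2, 3], every entry 3.0
joined = merge([a, b], mode='concat', axis=1)  # shape [2, 6]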
Example #16
  def model_fn(self,
               features,
               labels,
               mode,
               config = None,
               params = None):
    """Estimator model_fn.

    Args:
      features: This is the first item returned from the input_fn and parsed by
        tensorspec_utils.validate_and_pack. A spec_structure which fulfills the
        requirements of the self.get_feature_specification.
      labels: This is the second item returned from the input_fn and parsed by
        tensorspec_utils.validate_and_pack. A spec_structure which fulfills the
        requirements of the self.get_feature_specification.
      mode: (ModeKeys) Specifies if this is training, evaluation or prediction.
      config: (Optional tf.estimator.RunConfig or contrib_tpu.RunConfig) Will
        receive what is passed to Estimator in config parameter, or the default
        config (tf.estimator.RunConfig). Allows updating things in your model_fn
        based on configuration such as num_ps_replicas or model_dir.
      params: An optional dict of hyper parameters that will be passed into
        input_fn and model_fn. Keys are names of parameters, values are basic
        python types. There are reserved keys for TPUEstimator, including
        'batch_size'.

    Raises:
      ValueError: If the mode key is not supported, not in [PREDICT, TRAIN,
        EVAL].

    Returns:
      An EstimatorSpec.
    """
    features = tensorspec_utils.validate_and_pack(
        expected_spec=self.get_feature_specification(mode),
        actual_tensors_or_spec=features,
        ignore_batch=True)
    if labels:
      labels = tensorspec_utils.validate_and_pack(
          expected_spec=self.get_label_specification(mode),
          actual_tensors_or_spec=labels,
          ignore_batch=True)
    inference_outputs = self.inference_network_fn(features, labels, mode,
                                                  config, params)
    update_ops = None
    if isinstance(inference_outputs, tuple):
      if len(inference_outputs) != 2:
        raise ValueError('Unknown output of inference_network_fn: '
                         'tuple of length %d' % len(inference_outputs))
      outputs = inference_outputs[0]
      update_ops = inference_outputs[1]
      inference_outputs = outputs

    if mode == tf.estimator.ModeKeys.PREDICT:
      model_fn_results = self.create_export_outputs_fn(features,
                                                       inference_outputs, mode,
                                                       config, params)
      export_outputs = None
      if isinstance(model_fn_results, tuple):
        predictions = model_fn_results[0]
        export_outputs = model_fn_results[1]
      elif isinstance(model_fn_results, dict):
        export_outputs = {}
        if len(model_fn_results) == 1:
          name, output = list(model_fn_results.items())[0]
          export_outputs[name] = tf.estimator.export.RegressionOutput(output)
        export_outputs[tf.saved_model.signature_constants
                       .DEFAULT_SERVING_SIGNATURE_DEF_KEY] = (
                           tf.estimator.export.PredictOutput(model_fn_results))
        predictions = model_fn_results
      else:
        raise ValueError('The create_export_outputs_fn should return a '
                         'tuple(predictions, export_outputs) or predictions.')

      return tf.estimator.EstimatorSpec(
          mode=mode, predictions=predictions, export_outputs=export_outputs)

    train_fn_result = self.model_train_fn(features, labels, inference_outputs,
                                          mode, config, params)
    if isinstance(train_fn_result, tf.Tensor):
      train_loss = train_fn_result
      train_outputs = {}
    elif isinstance(train_fn_result, tuple):
      train_loss = train_fn_result[0]
      train_outputs = train_fn_result[1]
    else:
      raise ValueError('The model_train_fn should return a '
                       'tuple(loss, train_outputs) or loss.')

    if mode == tf.estimator.ModeKeys.TRAIN:
      # Create the tf.train.Optimizer.
      optimizer = self.create_optimizer()

      train_op = self.create_train_op(train_loss, optimizer, update_ops,
                                      train_outputs)

      self.add_summaries(features, labels, inference_outputs, train_loss,
                         train_outputs, mode, config, params)

      # Now the optimizer has been created, therefore, the checkpoint could be
      # initialized.
      # No new variables are allowed to be added, otherwise
      # we would not initialize these variables.
      # Note, this feature is only available for train to bootstrap a model
      # (partially) from a different model. As soon as this checkpoint is
      # written all other modes will use the local checkpoint within model_dir.
      self.maybe_init_from_checkpoint()
      training_hooks = []

      # EstimatorSpec has training_chief_hooks, but TPUEstimatorSpec does not,
      # so we have to use training_hooks here and check is_chief.
      if config and config.is_chief:  # pytype: disable=attribute-error
        training_hooks.append(
            gin_utils.GinConfigSaverHook(
                config.model_dir, summarize_config=True))
        if hasattr(self, 'writer_init_ops'):
          training_hooks.append(V2SummaryInitHook(self.writer_init_ops[mode]))

      # `SyncReplicasOptimizer` needs to attach a training hook.
      if self._sync_replicas_optimizer:
        training_hooks.append(
            self._sync_replicas_optimizer.make_session_run_hook(
                config.is_chief))  # pytype: disable=attribute-error

      # Return the value of the property first since it might be changed.
      scaffold_fn = self.scaffold_fn
      scaffold = scaffold_fn()

      # In order to export asynchronously the saver has to be registered
      # in the graph collection. The scaffold function might already register
      # a saver, which is why we check here and only add one if none has been
      # registered yet.
      if not tf.get_collection(tf.GraphKeys.SAVERS):
        # TODO(T2R_CONTRIBUTORS): Switch to using gin config for all saver params.
        keep_checkpoint_every_n_hours = None
        max_to_keep = None
        if config is not None:
          keep_checkpoint_every_n_hours = config.keep_checkpoint_every_n_hours
          max_to_keep = config.keep_checkpoint_max
        saver = gin_configurable_saver(
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
            max_to_keep=max_to_keep,
        )
        tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
      return tf.estimator.EstimatorSpec(
          mode=mode,
          loss=train_loss,
          train_op=train_op,
          training_hooks=training_hooks,
          scaffold=scaffold)

    if mode == tf.estimator.ModeKeys.EVAL:
      self.add_summaries(features, labels, inference_outputs, train_loss,
                         train_outputs, mode, config, params)

      eval_metrics = self.model_eval_fn(features, labels, inference_outputs,
                                        train_loss, train_outputs, mode, config,
                                        params)
      evaluation_hooks = self.get_eval_hooks(config, params)
      if config and config.is_chief:  # pytype: disable=attribute-error
        eval_name = params.get('eval_name', 'eval')  # pytype: disable=attribute-error
        evaluation_hooks.append(
            gin_utils.GinConfigSaverHook(
                os.path.join(config.model_dir, eval_name),
                summarize_config=True))
        if hasattr(self, 'writer_init_ops'):
          evaluation_hooks.append(V2SummaryInitHook(self.writer_init_ops[mode]))
      return tf.estimator.EstimatorSpec(
          mode=mode,
          loss=train_loss,
          eval_metric_ops=eval_metrics,
          evaluation_hooks=evaluation_hooks)

    raise ValueError('The mode {} is not supported yet.'.format(mode))
Example #17
  def create_swapping_saver_scaffold(saver=None):
    saver = optimizers.create_swapping_saver(optimizer)
    tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
    return tf.train.Scaffold(saver=saver)
Example #18
def fully_connected(inputs,
                    num_outputs,
                    activation_fn=tf.nn.relu,
                    scope=None,
                    collection=None,
                    distribution=NoiseDistribution.INDEPENDENT,
                    summary_writer=None):
    """Creates a fully connected layer with noise."""
    num_inputs = int(inputs.get_shape()[-1])
    weight_shape = (num_inputs, num_outputs)
    biases_shape = [num_outputs]

    # Parameters for each noise distribution, see Section 3.2 in original paper.
    if distribution == NoiseDistribution.INDEPENDENT:
        stddev = np.sqrt(3. / num_inputs)
        constant = 0.017
        epsilon_w = tf.truncated_normal(weight_shape)
        epsilon_b = tf.truncated_normal(biases_shape)
    elif distribution == NoiseDistribution.FACTORISED:
        stddev = np.sqrt(1. / num_inputs)
        constant = 0.5 * np.sqrt(1 / num_inputs)
        noise_input = tf.truncated_normal(weight_shape)
        noise_output = tf.truncated_normal(biases_shape)
        epsilon_w = tf.matmul(
            signed_sqrt(noise_output)[:, None],
            signed_sqrt(noise_input)[None, :])
        epsilon_b = signed_sqrt(noise_output)
    else:
        raise ValueError('Unknown noise distribution')

    mu_initializer = tf.initializers.random_uniform(minval=-stddev,
                                                    maxval=stddev)
    sigma_initializer = tf.constant_initializer(value=constant)

    with tf.variable_scope(scope):
        mu_w = tf.get_variable('mu_w',
                               weight_shape,
                               trainable=True,
                               initializer=mu_initializer)
        sigma_w = tf.get_variable('sigma_w',
                                  weight_shape,
                                  trainable=True,
                                  initializer=sigma_initializer)
        mu_b = tf.get_variable('mu_b',
                               biases_shape,
                               trainable=True,
                               initializer=mu_initializer)
        sigma_b = tf.get_variable('sigma_b',
                                  biases_shape,
                                  trainable=True,
                                  initializer=sigma_initializer)
        if collection is not None:
            tf.add_to_collection(collection, mu_w)
            tf.add_to_collection(collection, mu_b)
            tf.add_to_collection(collection, sigma_w)
            tf.add_to_collection(collection, sigma_b)

        w = mu_w + sigma_w * epsilon_w
        b = mu_b + sigma_b * epsilon_b
        layer = tf.matmul(inputs, w)
        layer_bias = tf.nn.bias_add(layer, b)

        if summary_writer is not None:
            with tf.variable_scope('Noisy'):
                tf.summary.scalar('Sigma', tf.reduce_mean(sigma_w))

        if activation_fn is not None:
            layer_bias = activation_fn(layer_bias)
    return layer_bias
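# A condensed sketch of the NoisyNet parameterization above,
# w = mu_w + sigma_w * epsilon_w, using the independent-noise initialization
# constants (stddev = sqrt(3 / fan_in), sigma_0 = 0.017).
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

num_inputs, num_outputs = 4, 3
stddev = np.sqrt(3. / num_inputs)
mu_w = tf.get_variable(
    'mu_w', (num_inputs, num_outputs),
    initializer=tf.initializers.random_uniform(-stddev, stddev))
sigma_w = tf.get_variable(
    'sigma_w', (num_inputs, num_outputs),
    initializer=tf.constant_initializer(0.017))
epsilon_w = tf.truncated_normal((num_inputs, num_outputs))
w = mu_w + sigma_w * epsilon_w  # noise is resampled on every forward pass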
Example #19
  def get_customized_apply_compression_op(self,
                                          a_matrix_tfvar,
                                          matrix_compressor,
                                          layer_obj,
                                          weight_params_fn,
                                          weight_init_obj,
                                          scope='default_scope'):
    """Returns pruning + kmeans compressed operator for a customized layer.

    Args:
      a_matrix_tfvar: TF variable representing a tensor variable in a model.
      matrix_compressor: MatrixCompressorInferface object to specify the
        compression algorithm. Must return two matrices b_matrix,c_matrix in its
        compression.
      layer_obj: a customized layer object that handles variable creation.
      weight_params_fn: functional handle to create model parameters.
      weight_init_obj: a weight initialization object.
      scope: TF scope used for creating new TF variables.

    Returns:
      A TF node that has the compressed version of a_matrix_tfvar.
    """
    self.matrix_compressor = matrix_compressor
    a_matrix = np.zeros(shape=a_matrix_tfvar.shape)
    if getattr(self._spec, 'do_transpose', False):
      a_matrix = np.transpose(a_matrix)
    [b_matrix, c_matrix] = matrix_compressor.static_matrix_compressor(a_matrix)

    self.uncompressed_size = matrix_compressor.uncompressed_size
    self.compressed_size = matrix_compressor.compressed_size

    p = layer_obj.params
    with tf.variable_scope(scope) as scope:
      # Create pruning relevant variables.
      mask_pc = weight_params_fn(a_matrix.shape, weight_init_obj.Constant(1.0),
                                 p.dtype)
      threshold_pc = weight_params_fn([], weight_init_obj.Constant(0.0),
                                      tf.float32)
      self._create_layer_variable(layer_obj, 'mask', mask_pc, None, False)
      self._create_layer_variable(layer_obj, 'threshold', threshold_pc, None,
                                  False)
      if layer_obj.vars.mask not in tf.get_collection(pruning.MASK_COLLECTION):
        tf.add_to_collection(pruning.WEIGHT_COLLECTION, layer_obj.vars.wm)
        tf.add_to_collection(pruning.MASK_COLLECTION, layer_obj.vars.mask)
        tf.add_to_collection(pruning.THRESHOLD_COLLECTION,
                             layer_obj.vars.threshold)
      if self.pruning_obj.get_spec().prune_option in [
          'first_order_gradient', 'second_order_gradient'
      ]:
        grad_pc = weight_params_fn(a_matrix.shape,
                                   weight_init_obj.Constant(0.0), p.dtype)
        self._create_layer_variable(layer_obj, 'gradient', grad_pc, None, False)
        self._create_layer_variable(layer_obj, 'old_weight', grad_pc, None,
                                    False)
        self._create_layer_variable(layer_obj, 'old_old_weight', grad_pc, None,
                                    False)
        tf.add_to_collection(pruning.WEIGHT_GRADIENT_COLLECTION,
                             layer_obj.vars.gradient)
        tf.add_to_collection(pruning.OLD_WEIGHT_COLLECTION,
                             layer_obj.vars.old_weight)
        tf.add_to_collection(pruning.OLD_OLD_WEIGHT_COLLECTION,
                             layer_obj.vars.old_old_weight)

      b_matrix_pc = weight_params_fn(b_matrix.shape,
                                     weight_init_obj.Constant(1.0), p.dtype)
      c_matrix_pc = weight_params_fn(c_matrix.shape,
                                     weight_init_obj.Constant(1), tf.int32)
      alpha_pc = weight_params_fn([], weight_init_obj.Constant(1.0), tf.float32)

      self._create_layer_variable(layer_obj, 'alpha', alpha_pc, None, False)
      self._create_layer_variable(
          layer_obj,
          'b_matrix_tfvar',
          b_matrix_pc,
          None,
          trainable=self.matrix_compressor.get_spec().is_b_matrix_trainable)
      self._create_layer_variable(
          layer_obj,
          'c_matrix_tfvar',
          c_matrix_pc,
          None,
          trainable=self.matrix_compressor.get_spec().is_c_matrix_trainable)

      self.b_matrix_tfvar = layer_obj.vars.b_matrix_tfvar
      self.c_matrix_tfvar = layer_obj.vars.c_matrix_tfvar
      self.alpha = layer_obj.vars.alpha
      self.a_matrix_tfvar = a_matrix_tfvar
      self.mask = layer_obj.vars.mask
      self.threshold = layer_obj.vars.threshold

      self.pruned_a_matrix_tfvar = tf.multiply(layer_obj.vars.wm,
                                               layer_obj.vars.mask,
                                               'masked_weight')

    def maybe_apply_compression():
      """Decide whether global step is within compression range.

      Returns:
        is_step_within_compression_range: bool.
      """
      with tf.compat.v1.name_scope(self._spec.name):
        # Compress if current step is more than begin_compression_step and
        # less than end_compression_step (unless it's negative)
        global_step = tf.train.get_global_step()
        def real_global_step_fn():
          return tf.cast(tf.train.get_global_step(), tf.int32)
        def mock_global_step_fn():
          return self._spec.begin_compression_step
        def is_global_step_none(global_step):
          return tf.constant(global_step is None, dtype=tf.bool)
        global_step = tf.cond(is_global_step_none(global_step),
                              mock_global_step_fn,
                              real_global_step_fn)
        is_step_within_compression_range = tf.logical_and(
            tf.greater_equal(
                tf.cast(global_step, tf.int32),
                self._spec.begin_compression_step),
            tf.logical_or(
                tf.less_equal(
                    tf.cast(global_step, tf.int32),
                    self._spec.end_compression_step),
                tf.less(self._spec.end_compression_step, 0)))
        return is_step_within_compression_range

    if getattr(self._spec, 'do_transpose', False):
      self.pruning_and_compression_op = (
          self.alpha * self.pruned_a_matrix_tfvar +
          (1 - self.alpha) * tf.math.multiply(
              tf.transpose(
                  tf.reshape(
                      tf.nn.embedding_lookup(self.b_matrix_tfvar,
                                             self.c_matrix_tfvar),
                      tf.transpose(a_matrix_tfvar).shape)),
              self.mask,
              name='pruned_compressed_weight'))
    else:
      self.pruning_and_compression_op = (
          self.alpha * self.pruned_a_matrix_tfvar +
          (1 - self.alpha) * tf.math.multiply(
              tf.reshape(
                  tf.nn.embedding_lookup(self.b_matrix_tfvar,
                                         self.c_matrix_tfvar),
                  a_matrix_tfvar.shape),
              self.mask,
              name='pruned_compressed_weight'))

    def pruned_a_matrix_fn():
      return self.pruned_a_matrix_tfvar

    def quantized_pruned_a_matrix_fn():
      return self.pruning_and_compression_op

    self.final_op = tf.cond(maybe_apply_compression(),
                            quantized_pruned_a_matrix_fn, pruned_a_matrix_fn)

    self.add_compression_summaries()
    self.pruning_obj.add_pruning_summaries()
    self.update_op = tf.no_op()
    return [self.final_op, self.update_op]
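The final_op above interpolates between the pruned weight and a masked, codebook-reconstructed approximation, gated by the scalar alpha. A minimal numpy sketch of that arithmetic, with made-up shapes and names (none of them come from the snippet):

import numpy as np

rng = np.random.default_rng(0)
weight = rng.normal(size=(4, 4))                  # plays the role of wm / a_matrix
mask = (np.abs(weight) > 0.5).astype(np.float64)  # pruning mask
b_matrix = rng.normal(size=(16, 4))               # codebook rows (b_matrix_tfvar)
c_matrix = rng.integers(0, 16, size=4)            # one code per row (c_matrix_tfvar)
alpha = 0.7                                       # interpolation weight

pruned = weight * mask
# embedding_lookup(b_matrix, c_matrix), then reshape back to weight.shape:
compressed = b_matrix[c_matrix].reshape(weight.shape)
final = alpha * pruned + (1 - alpha) * compressed * mask
print(final.shape)  # (4, 4)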
Example No. 20
    def __call__(self, x, training, distname='batch_normalization'):
        shape = [x.shape[-1]]
        with tf.variable_scope(distname):
            ones = tf.initializers.ones()
            zeros = tf.initializers.zeros()
            gamma = tf.get_variable('gamma',
                                    shape,
                                    initializer=ones,
                                    trainable=True,
                                    use_resource=True)
            beta = tf.get_variable('beta',
                                   shape,
                                   initializer=zeros,
                                   trainable=True,
                                   use_resource=True)
            moving_mean = tf.get_variable('moving_mean',
                                          shape,
                                          initializer=zeros,
                                          trainable=False,
                                          use_resource=True)
            moving_variance = tf.get_variable('moving_variance',
                                              shape,
                                              initializer=ones,
                                              trainable=False,
                                              use_resource=True)
        num_replicas = FLAGS.num_replicas

        x = tf.cast(x, tf.float32)
        if training:
            if num_replicas <= 8:
                group_assign = None
                group_shards = tf.cast(num_replicas, tf.float32)
            else:

                group_shards = max(
                    1,
                    int(FLAGS.batch_norm_batch_size /
                        (FLAGS.train_batch_size / num_replicas)))
                group_assign = np.arange(num_replicas, dtype=np.int32)
                group_assign = group_assign.reshape([-1, group_shards])
                group_assign = group_assign.tolist()
                group_shards = tf.cast(group_shards, tf.float32)

            mean = tf.reduce_mean(x, [0, 1, 2])
            mean = tf.tpu.cross_replica_sum(mean, group_assign) / group_shards

            # Var[x] = E[x^2] - E[x]^2
            mean_sq = tf.reduce_mean(tf.math.square(x), [0, 1, 2])
            mean_sq = tf.tpu.cross_replica_sum(mean_sq,
                                               group_assign) / group_shards
            variance = mean_sq - tf.math.square(mean)

            decay = tf.cast(1. - self.momentum, tf.float32)

            def u(moving, normal, name):
                num_replicas_fp = tf.cast(num_replicas, tf.float32)
                normal = tf.tpu.cross_replica_sum(normal) / num_replicas_fp
                diff = decay * (moving - normal)
                return tf.assign_sub(moving, diff, use_locking=True, name=name)

            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS,
                                 u(moving_mean, mean, name='moving_mean'))
            tf.add_to_collection(
                tf.GraphKeys.UPDATE_OPS,
                u(moving_variance, variance, name='moving_variance'))

            x = tf.nn.batch_normalization(x,
                                          mean=mean,
                                          variance=variance,
                                          offset=beta,
                                          scale=gamma,
                                          variance_epsilon=self.epsilon)
        else:

            x, _, _ = tf.nn.fused_batch_norm(x,
                                             scale=gamma,
                                             offset=beta,
                                             mean=moving_mean,
                                             variance=moving_variance,
                                             epsilon=self.epsilon,
                                             is_training=False)

        return x
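The grouping branch above shards cross-replica statistics so that each group averages over roughly FLAGS.batch_norm_batch_size samples. A standalone sketch of that assignment arithmetic, with assumed flag values:

import numpy as np

num_replicas = 32            # hypothetical FLAGS.num_replicas
train_batch_size = 1024      # hypothetical FLAGS.train_batch_size
batch_norm_batch_size = 256  # hypothetical FLAGS.batch_norm_batch_size

per_replica_batch = train_batch_size / num_replicas                    # 32 samples
group_shards = max(1, int(batch_norm_batch_size / per_replica_batch))  # 8
group_assign = np.arange(num_replicas, dtype=np.int32)
group_assign = group_assign.reshape([-1, group_shards]).tolist()
print(group_assign)  # 4 groups of 8 replica ids each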
Example No. 21
  def call(self, inputs, training=None):
    outputs = super().call(inputs, training)
    # A temporary hack for tf1 compatibility with keras batch norm.
    for u in self.updates:
      tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, u)
    return outputs
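This hack matters because tf1-style training loops only run moving-average updates that were registered in UPDATE_OPS. A self-contained sketch of the consuming pattern, with illustrative layers (not the class above):

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

x = tf.placeholder(tf.float32, [None, 8])
h = tf.layers.dense(x, 4)
h = tf.layers.batch_normalization(h, training=True)
loss = tf.reduce_mean(tf.square(h))

# The moving-mean/variance assign ops land in UPDATE_OPS; running them
# alongside the gradient step is the standard tf1 pattern.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)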
Example No. 22
  def build():
    """Builds the Tensorflow graph."""
    inputs, labels, lengths = None, None, None

    if mode in ('train', 'eval'):
      if isinstance(no_event_label, numbers.Number):
        label_shape = []
      else:
        label_shape = [len(no_event_label)]
      inputs, labels, lengths = magenta.common.get_padded_batch(
          sequence_example_file_paths, hparams.batch_size, input_size,
          label_shape=label_shape, shuffle=mode == 'train')

    elif mode == 'generate':
      inputs = tf.placeholder(tf.float32, [hparams.batch_size, None,
                                           input_size])

    if isinstance(encoder_decoder,
                  magenta.music.OneHotIndexEventSequenceEncoderDecoder):
      expanded_inputs = tf.one_hot(
          tf.cast(tf.squeeze(inputs, axis=-1), tf.int64),
          encoder_decoder.input_depth)
    else:
      expanded_inputs = inputs

    dropout_keep_prob = 1.0 if mode == 'generate' else hparams.dropout_keep_prob

    if hparams.use_cudnn:
      outputs, initial_state, final_state = make_cudnn(
          expanded_inputs, hparams.rnn_layer_sizes, hparams.batch_size, mode,
          dropout_keep_prob=dropout_keep_prob,
          residual_connections=hparams.residual_connections)

    else:
      cell = make_rnn_cell(
          hparams.rnn_layer_sizes,
          dropout_keep_prob=dropout_keep_prob,
          attn_length=hparams.attn_length,
          residual_connections=hparams.residual_connections)

      initial_state = cell.zero_state(hparams.batch_size, tf.float32)

      outputs, final_state = tf.nn.dynamic_rnn(
          cell, inputs, sequence_length=lengths, initial_state=initial_state,
          swap_memory=True)

    outputs_flat = magenta.common.flatten_maybe_padded_sequences(
        outputs, lengths)
    if isinstance(num_classes, numbers.Number):
      num_logits = num_classes
    else:
      num_logits = sum(num_classes)
    logits_flat = contrib_layers.linear(outputs_flat, num_logits)

    if mode in ('train', 'eval'):
      labels_flat = magenta.common.flatten_maybe_padded_sequences(
          labels, lengths)

      if isinstance(num_classes, numbers.Number):
        softmax_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels_flat, logits=logits_flat)
        predictions_flat = tf.argmax(logits_flat, axis=1)
      else:
        logits_offsets = np.cumsum([0] + num_classes)
        softmax_cross_entropy = []
        predictions = []
        for i in range(len(num_classes)):
          softmax_cross_entropy.append(
              tf.nn.sparse_softmax_cross_entropy_with_logits(
                  labels=labels_flat[:, i],
                  logits=logits_flat[
                      :, logits_offsets[i]:logits_offsets[i + 1]]))
          predictions.append(
              tf.argmax(logits_flat[
                  :, logits_offsets[i]:logits_offsets[i + 1]], axis=1))
        predictions_flat = tf.stack(predictions, 1)

      correct_predictions = tf.to_float(
          tf.equal(labels_flat, predictions_flat))
      event_positions = tf.to_float(tf.not_equal(labels_flat, no_event_label))
      no_event_positions = tf.to_float(tf.equal(labels_flat, no_event_label))

      # Compute the total number of time steps across all sequences in the
      # batch. For some models this will be different from the number of RNN
      # steps.
      def batch_labels_to_num_steps(batch_labels, lengths):
        num_steps = 0
        for labels, length in zip(batch_labels, lengths):
          num_steps += encoder_decoder.labels_to_num_steps(labels[:length])
        return np.float32(num_steps)
      num_steps = tf.py_func(
          batch_labels_to_num_steps, [labels, lengths], tf.float32)

      if mode == 'train':
        loss = tf.reduce_mean(softmax_cross_entropy)
        perplexity = tf.exp(loss)
        accuracy = tf.reduce_mean(correct_predictions)
        event_accuracy = (
            tf.reduce_sum(correct_predictions * event_positions) /
            tf.reduce_sum(event_positions))
        no_event_accuracy = (
            tf.reduce_sum(correct_predictions * no_event_positions) /
            tf.reduce_sum(no_event_positions))

        loss_per_step = tf.reduce_sum(softmax_cross_entropy) / num_steps
        perplexity_per_step = tf.exp(loss_per_step)

        optimizer = tf.train.AdamOptimizer(learning_rate=hparams.learning_rate)

        train_op = contrib_slim.learning.create_train_op(
            loss, optimizer, clip_gradient_norm=hparams.clip_norm)
        tf.add_to_collection('train_op', train_op)

        vars_to_summarize = {
            'loss': loss,
            'metrics/perplexity': perplexity,
            'metrics/accuracy': accuracy,
            'metrics/event_accuracy': event_accuracy,
            'metrics/no_event_accuracy': no_event_accuracy,
            'metrics/loss_per_step': loss_per_step,
            'metrics/perplexity_per_step': perplexity_per_step,
        }
      elif mode == 'eval':
        vars_to_summarize, update_ops = contrib_metrics.aggregate_metric_map({
            'loss':
                tf.metrics.mean(softmax_cross_entropy),
            'metrics/accuracy':
                tf.metrics.accuracy(labels_flat, predictions_flat),
            'metrics/per_class_accuracy':
                tf.metrics.mean_per_class_accuracy(labels_flat,
                                                   predictions_flat,
                                                   num_classes),
            'metrics/event_accuracy':
                tf.metrics.recall(event_positions, correct_predictions),
            'metrics/no_event_accuracy':
                tf.metrics.recall(no_event_positions, correct_predictions),
            'metrics/loss_per_step':
                tf.metrics.mean(
                    tf.reduce_sum(softmax_cross_entropy) / num_steps,
                    weights=num_steps),
        })
        for updates_op in update_ops.values():
          tf.add_to_collection('eval_ops', updates_op)

        # Perplexity is just exp(loss) and doesn't need its own update op.
        vars_to_summarize['metrics/perplexity'] = tf.exp(
            vars_to_summarize['loss'])
        vars_to_summarize['metrics/perplexity_per_step'] = tf.exp(
            vars_to_summarize['metrics/loss_per_step'])

      for var_name, var_value in six.iteritems(vars_to_summarize):
        tf.summary.scalar(var_name, var_value)
        tf.add_to_collection(var_name, var_value)

    elif mode == 'generate':
      temperature = tf.placeholder(tf.float32, [])
      if isinstance(num_classes, numbers.Number):
        softmax_flat = tf.nn.softmax(
            tf.div(logits_flat, tf.fill([num_classes], temperature)))
        softmax = tf.reshape(
            softmax_flat, [hparams.batch_size, -1, num_classes])
      else:
        logits_offsets = np.cumsum([0] + num_classes)
        softmax = []
        for i in range(len(num_classes)):
          sm = tf.nn.softmax(
              tf.div(
                  logits_flat[:, logits_offsets[i]:logits_offsets[i + 1]],
                  tf.fill([num_classes[i]], temperature)))
          sm = tf.reshape(sm, [hparams.batch_size, -1, num_classes[i]])
          softmax.append(sm)

      tf.add_to_collection('inputs', inputs)
      tf.add_to_collection('temperature', temperature)
      tf.add_to_collection('softmax', softmax)
      # Flatten state tuples for metagraph compatibility.
      for state in tf_nest.flatten(initial_state):
        tf.add_to_collection('initial_state', state)
      for state in tf_nest.flatten(final_state):
        tf.add_to_collection('final_state', state)
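The generate branch stores its tensors in named collections so that a separate process can rebuild the sampling graph from a metagraph. A hedged sketch of that consumer side; the checkpoint path is a placeholder:

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

graph = tf.Graph()
with graph.as_default():
    # 'model.ckpt.meta' is a placeholder path, not from the snippet.
    saver = tf.train.import_meta_graph('model.ckpt.meta')
    inputs = tf.get_collection('inputs')[0]
    temperature = tf.get_collection('temperature')[0]
    softmax = tf.get_collection('softmax')[0]
    initial_state = tf.get_collection('initial_state')  # flattened state tuples
    final_state = tf.get_collection('final_state')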
Example No. 23
def module_fn(is_training):
  """Module function."""
  input_ids = tf.placeholder(tf.int32, [None, None], "input_ids")
  input_mask = tf.placeholder(tf.int32, [None, None], "input_mask")
  segment_ids = tf.placeholder(tf.int32, [None, None], "segment_ids")
  mlm_positions = tf.placeholder(tf.int32, [None, None], "mlm_positions")

  albert_config_path = os.path.join(
      FLAGS.albert_directory, "albert_config.json")
  albert_config = modeling.AlbertConfig.from_json_file(albert_config_path)
  model = modeling.AlbertModel(
      config=albert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=False,
      use_einsum=FLAGS.use_einsum)

  mlm_logits = get_mlm_logits(model, albert_config, mlm_positions)

  vocab_model_path = os.path.join(FLAGS.albert_directory, "30k-clean.model")
  vocab_file_path = os.path.join(FLAGS.albert_directory, "30k-clean.vocab")

  config_file = tf.constant(
      value=albert_config_path, dtype=tf.string, name="config_file")
  vocab_model = tf.constant(
      value=vocab_model_path, dtype=tf.string, name="vocab_model")
  # This is only for visualization purposes.
  vocab_file = tf.constant(
      value=vocab_file_path, dtype=tf.string, name="vocab_file")

  # By adding `config_file`, `vocab_model` and `vocab_file`
  # to the ASSET_FILEPATHS collection, TF-Hub will
  # rewrite these tensors so that the assets are portable.
  tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, config_file)
  tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_model)
  tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_file)

  hub.add_signature(
      name="tokens",
      inputs=dict(
          input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids),
      outputs=dict(
          sequence_output=model.get_sequence_output(),
          pooled_output=model.get_pooled_output()))

  hub.add_signature(
      name="mlm",
      inputs=dict(
          input_ids=input_ids,
          input_mask=input_mask,
          segment_ids=segment_ids,
          mlm_positions=mlm_positions),
      outputs=dict(
          sequence_output=model.get_sequence_output(),
          pooled_output=model.get_pooled_output(),
          mlm_logits=mlm_logits))

  hub.add_signature(
      name="tokenization_info",
      inputs={},
      outputs=dict(
          vocab_file=vocab_model,
          do_lower_case=tf.constant(FLAGS.do_lower_case)))
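A module_fn like this is normally passed to hub.create_module_spec with one tag set per training mode and then exported; a sketch under that assumption, with placeholder paths:

import tensorflow_hub as hub

spec = hub.create_module_spec(
    module_fn,
    tags_and_args=[({'train'}, dict(is_training=True)),
                   (set(), dict(is_training=False))])
# Placeholder paths; export attaches the asset files registered above.
spec.export('/tmp/albert_module', checkpoint_path='/tmp/model.ckpt-best')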
Example No. 24
def model_fn(features, labels, mode, params):
    # Get global step
    global_step = tf.train.get_global_step()

    # Construct mtf graph + mesh from params
    graph = mtf.Graph()
    mesh_shape = mtf.convert_to_shape(params["mesh_shape"])
    layout_rules = mtf.convert_to_layout_rules(params["layout"])

    # Mesh setup
    if params["use_tpu"]:
        var_placer, mesh_impl = simd_mesh_setup(params, mesh_shape,
                                                layout_rules)
    else:
        var_placer = None
        gpu_ids = params["gpu_ids"]
        mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
            mesh_shape, layout_rules, gpu_ids)

    # Trainable variable precision
    # Store to checkpoints in master type, train in slice type, compute in activation type
    if params["precision"] == "bfloat16":
        variable_dtype = mtf.VariableDType(master_dtype=tf.bfloat16,
                                           slice_dtype=tf.float32,
                                           activation_dtype=tf.bfloat16)
    else:
        variable_dtype = mtf.VariableDType(master_dtype=tf.float32,
                                           slice_dtype=tf.float32,
                                           activation_dtype=tf.float32)

    # Build mtf mesh object
    mesh = mtf.Mesh(graph, "my_mesh", var_placer)

    # Build mtf_features & seq length dict for getting number of microbatches
    # We need to pack inputs into a dict to pass into serialize_training_step
    features_dict = {"inputs": features, "labels": labels}
    sequence_length_dict = {
        "inputs": params["n_ctx"],
        "labels": params["n_ctx"]
    }

    params = add_mode_to_params(params, mode)
    batch_size = get_batch_size(params)

    batch_dim = mtf.Dimension("batch", batch_size)
    batch_dims = [batch_dim]
    feature_length = sequence_length_dict["inputs"]
    length_dim = mtf.Dimension("sequence", feature_length)

    mtf_features = {}
    for key, x in features_dict.items():
        if x is not None:
            feature_shape = mtf.Shape(batch_dims + [length_dim])
            if isinstance(features_dict[key], dict):
                features_dict[key] = features_dict[key]["feature"]
            x = tf.cast(features_dict[key], tf.int32)
            x = tf.reshape(x, feature_shape.to_integer_list)
            mtf_features[key] = mtf.import_fully_replicated(mesh,
                                                            x,
                                                            feature_shape,
                                                            name=key)

    # Instantiate dict for dimensions, bias, etc that can be calculated here once then passed into model
    other_features = {}
    memory_length_dim = mtf.Dimension("memory_length", length_dim.size)

    attn_bias = biasmask_attn_weights(
        mesh, length_dim, memory_length_dim,
        variable_dtype) if params["causal"] else None

    # Add attn_bias into mtf_features
    other_features["attn_bias"] = attn_bias

    # Define other Dimensions that we'll need inside the model
    embd_dim = mtf.Dimension("embd", params["n_embd"])
    vocab_dim = mtf.Dimension("vocab", params["n_vocab"])
    # We need this because gathering when both the args have the same dimension in them breaks things
    # This dim is specifically for the weights
    # This prevents the "Einsum has lhs dimension without corresponding rhs or output dimension." error
    embed_sequence_dim = mtf.Dimension("embed_sequence", params["n_ctx"])

    other_features["embd_dim"] = embd_dim
    other_features["vocab_dim"] = vocab_dim
    other_features["embed_sequence_dim"] = embed_sequence_dim
    other_features["memory_length_dim"] = memory_length_dim

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Set up the model for prediction
        inputs = mtf_features["inputs"]
        if params["remove_partial_sequences"] is None:
            params["remove_partial_sequences"] = False

        export = params.get("export", False)

        if not export:
            mtf_samples = sample_autoregressive(
                inputs,
                other_features=other_features,
                params=params,
                variable_dtype=variable_dtype,
                remove_partial_sequences=params["remove_partial_sequences"],
                stop_at_token=params["eos_id"],
                sampling_use_entmax=params['sampling_use_entmax'])

        else:
            with mtf.utils.outside_all_rewrites():
                with tf.variable_scope('gpt2'):
                    mtf_samples, loss, loss_batch = gpt2.model(
                        mtf_features,
                        other_features,
                        params,
                        mesh,
                        variable_dtype=variable_dtype,
                        context=None)

        mtf_samples = mtf.anonymize(mtf_samples)
        inputs = mtf.anonymize(inputs)
        lowering = mtf.Lowering(graph, {mesh: mesh_impl}, autostack=True)
        inputs = lowering.export_to_tf_tensor(inputs)
        outputs = lowering.export_to_tf_tensor(mtf_samples)
        predictions = {"inputs": inputs, "outputs": outputs}

        def scaffold_fn():
            return tf.train.Scaffold(
                local_init_op=tf.group(
                    tf.train.Scaffold.default_local_init_op(),
                    lowering.copy_masters_to_slices(),
                    name="mtf_local_init_op"),
                ready_op=tf.concat([
                    tf.report_uninitialized_variables(),
                    resources.report_uninitialized_resources()
                ],
                                   axis=0,
                                   name="mtf_ready_op"))

        return tpu_estimator.TPUEstimatorSpec(
            mode=tf.estimator.ModeKeys.PREDICT,
            predictions=predictions,
            scaffold_fn=scaffold_fn,
            prediction_hooks=[mtf.MtfRestoreHook(lowering)])

    # We're not predicting, so we better be training or evaluating
    assert (mode == tf.estimator.ModeKeys.TRAIN
            or mode == tf.estimator.ModeKeys.EVAL)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # Gets number of microbatches per batch for serialized training
        # if param tokens_per_mb_per_replica = None, this defaults to 1 and no microbatching is performed
        num_microbatches = int(
            mtf_transformer.utils.serialize_num_microbatches(
                batch_dim=batch_dim,
                sequence_length=sequence_length_dict,
                mesh_shape=mesh_shape,
                layout_rules=layout_rules,
                tokens_per_microbatch_per_replica=params[
                    "tokens_per_mb_per_replica"]))
    else:
        num_microbatches = 1

    params["num_microbatches"] = num_microbatches  # Add num_microbatches to params.

    if num_microbatches > 1:

        # For serialize_training_step we need to modify the model to output results in a dict
        def serialized_fn(mtf_features):
            if params["model"] == "GPT":
                with tf.variable_scope('gpt2'):
                    logits, loss, loss_batch = gpt2.model(
                        mtf_features,
                        other_features,
                        params,
                        mesh,
                        variable_dtype=variable_dtype)
                return {
                    "logits": logits,
                    "loss": loss,
                    "loss_batch": loss_batch
                }
            else:
                raise Exception(
                    f"'{params['model']}' is not a valid model - please select from [GPT]"
                )

        # Serialize the training step - Gradients are accumulated locally and reduced once.
        var_grads, output_dict = mtf.serialize_training_step(
            mtf_features, serialized_fn, batch_dim, num_microbatches)
        loss = output_dict["loss"]
        loss_batch = output_dict["loss_batch"]
        logits = output_dict["logits"]
    else:
        # If we're not splitting into microbatches, return logits & loss as is
        if params["model"] == "GPT":
            with mtf.utils.outside_all_rewrites():
                with tf.variable_scope('gpt2'):
                    logits, loss, loss_batch = gpt2.model(
                        mtf_features,
                        other_features,
                        params,
                        mesh,
                        variable_dtype=variable_dtype,
                        context=None)
        else:
            raise Exception(
                f"'{params['model']}' is not a valid model - please select from [GPT]"
            )

    # Auto layout generation
    if params["auto_layout"]:
        auto_layout(graph, mesh_shape, logits, loss)
    if params["auto_layout_and_mesh_shape"]:
        auto_layout_and_mesh_shape(graph, params["num_cores"], logits, loss)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # In TRAIN mode, get optimizer
        if params["num_microbatches"] > 1:
            # If we are splitting the batch into microbatches, var grads are created in the serialize_training_step fn
            # So we pass them in here
            _, update_ops, var_grads = get_optimizer(
                mesh,
                loss,
                params,
                variable_dtype=variable_dtype,
                inp_var_grads=var_grads)
        else:
            # Otherwise, they are created in the get_optimizer fn, so we leave inp_var_grads blank
            _, update_ops, var_grads = get_optimizer(
                mesh, loss, params, variable_dtype=variable_dtype)
        # Log summaries to tensorboard
        mtf.scalar_summary("loss", loss)
        # Log gradients if in params
        if params["log_grads"] not in [None, False]:
            for g in var_grads:
                grad_norm = mtf.sqrt(mtf.reduce_sum(mtf.square(g)))
                mtf.scalar_summary("grads/norm" + g.name[:-2], grad_norm)
    else:
        # For now, we can only export fully-replicated tensors.
        # This has to be done before lowering or they will not be included in the graph
        mean_logits = mtf.reduce_mean(logits, reduced_dim=vocab_dim)
        max_logits = mtf.argmax(logits, vocab_dim)
        del logits
        fully_replicated_mean_logits = mtf.anonymize(mean_logits)
        fully_replicated_max_logits = mtf.anonymize(max_logits)
        fully_replicated_loss_batch = mtf.anonymize(loss_batch)

    # Gets & prints info about no. trainable vars in the model & dimension names
    get_graph_info(graph)

    # 'lowers' mtf tensors into a tf graph - this enables us to export results as tf tensors
    lowering = mtf.Lowering(graph, {mesh: mesh_impl}, autostack=True)
    tf_loss = lowering.export_to_tf_tensor(loss)
    tf_loss = tf.cast(tf_loss, tf.float32)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # Use our patched version until mtf updates theirs
        host_call = create_host_call(params['model_path'])
        mtf.utils.remove_summaries()

        # Creates train_op
        tf_update_ops = [lowering.lowered_operation(op) for op in update_ops]
        tf_update_ops.append(tf.assign_add(
            global_step, 1))  # Need to manually increment global_step
        tf.logging.info(f"tf_update_ops: {tf_update_ops}")
        train_op = tf.group(tf_update_ops)
    else:
        tf_mean_logits = lowering.export_to_tf_tensor(
            fully_replicated_mean_logits)
        tf_max_logits = lowering.export_to_tf_tensor(
            fully_replicated_max_logits)
        tf_loss_batch = tf.to_float(
            lowering.export_to_tf_tensor(fully_replicated_loss_batch))

    with mtf.utils.outside_all_rewrites():
        # Copy master variables to slices. Must be called first.
        restore_hook = mtf.MtfRestoreHook(lowering)
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Set up the checkpoint server and return the TPUEstimatorSpec
            saver = tf.train.Saver(tf.global_variables(),
                                   sharded=True,
                                   max_to_keep=10,
                                   keep_checkpoint_every_n_hours=2,
                                   defer_build=False,
                                   save_relative_paths=True)
            tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
            saver_listener = mtf.MtfCheckpointSaverListener(lowering)
            saver_hook = tf.train.CheckpointSaverHook(
                params["model_path"],
                save_steps=params["steps_per_checkpoint"],
                saver=saver,
                listeners=[saver_listener])

            return tpu_estimator.TPUEstimatorSpec(
                tf.estimator.ModeKeys.TRAIN,
                loss=tf_loss,
                host_call=host_call,
                train_op=train_op,
                training_hooks=[restore_hook, saver_hook])

        elif mode == tf.estimator.ModeKeys.EVAL:
            # Evaluation metrics
            def _perplexity(loss):
                perplexity = tf.exp(loss)
                return tf.metrics.mean(perplexity)

            def _bits_per_byte(loss):
                bpb = loss * (0.29335 / math.log(2))
                return tf.metrics.mean(bpb)

            def _metric_fn(tf_mean_logits, tf_loss_batch):
                mean_logits = tf.metrics.mean(tf_mean_logits)
                loss = tf.reduce_mean(tf_loss_batch)
                perp = _perplexity(loss)
                bpb = _bits_per_byte(loss)
                return {
                    "mean_logits": mean_logits,
                    "perplexity": perp,
                    "bits per byte": bpb
                }

            def _lambada_metric_fn(labels, tf_max_logits, tf_loss_batch):
                eos_token = params["eos_id"]
                answer_positions = tf.where(
                    tf.math.not_equal(labels, eos_token))

                correct_answers = tf.gather_nd(
                    tf.math.equal(tf_max_logits, labels), answer_positions)
                accuracy = tf.metrics.mean(tf.cast(correct_answers,
                                                   tf.float32))

                # tf_loss_batch may include z_loss and other auxiliary terms,
                # so this may need to be calculated separately in the future.
                answer_loss = tf.gather_nd(tf_loss_batch, answer_positions)
                log_perplexity = tf.metrics.mean(answer_loss)

                return {
                    "lambada_acc": accuracy,
                    "lambada_log_ppl": log_perplexity
                }

            eval_task = params["eval_task"]
            if eval_task == "lambada":
                eval_metrics = (_lambada_metric_fn,
                                [labels, tf_max_logits, tf_loss_batch])
            else:
                eval_metrics = (_metric_fn, [tf_mean_logits, tf_loss_batch])

            return tpu_estimator.TPUEstimatorSpec(
                tf.estimator.ModeKeys.EVAL,
                evaluation_hooks=[restore_hook],
                loss=tf_loss,
                eval_metrics=eval_metrics)
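The eval metrics convert the mean per-token loss into perplexity and bits per byte; the 0.29335 factor is a corpus-specific tokens-per-byte ratio. A plain-Python sketch of the conversion with a made-up loss value:

import math

mean_token_loss = 2.8      # nats per token (illustrative value)
tokens_per_byte = 0.29335  # corpus-specific ratio assumed by the snippet

perplexity = math.exp(mean_token_loss)
bits_per_byte = mean_token_loss * tokens_per_byte / math.log(2)
print(f"ppl={perplexity:.2f} bpb={bits_per_byte:.3f}")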
Example No. 25
def input_fn(dataset,
             filepattern,
             skip_random_fraction_when_training,
             batch_size_means_tokens_param,
             batch_size_multiplier,
             max_length,
             mode,
             hparams,
             data_dir=None,
             params=None,
             config=None,
             force_repeat=False,
             prevent_repeat=False):
    """Builds input pipeline for problem.

  Args:
    dataset: the dataset to make input function from.
    filepattern: the pattern of files to read from.
    skip_random_fraction_when_training: whether to skip a random fraction of
      records at the start of training.
    batch_size_means_tokens_param: whether batch size should mean tokens.
    batch_size_multiplier: how to multiply batch size when bucketing.
    max_length: maximum sequence length.
    mode: tf.estimator.ModeKeys.
    hparams: HParams, model hparams
    data_dir: str, data directory; if None, will use hparams.data_dir
    params: dict, may include "batch_size"
    config: RunConfig; should have the data_parallelism attribute if not using
      TPU
    force_repeat: bool, whether to repeat the data even if not training
    prevent_repeat: bool, whether to not repeat when in training mode.
      Overrides force_repeat.

  Returns:
    (features_dict<str name, Tensor feature>, Tensor targets)
  """
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    if config and config.use_tpu:
        num_threads = 64
    else:
        num_threads = cpu_count() if is_training else 1

    if config and hasattr(config,
                          "data_parallelism") and config.data_parallelism:
        num_shards = config.data_parallelism.n
    else:
        num_shards = 1

    if hasattr(hparams,
               'deterministic_dataset') and hparams.deterministic_dataset:
        num_threads = 1
        skip_random_fraction_when_training = False
        hparams.batch_shuffle_size = 0

    def tpu_valid_size(example):
        return example_valid_size(example, hparams.min_length, max_length)

    def gpu_valid_size(example):
        drop_long_sequences = is_training
        max_validate_length = max_length if drop_long_sequences else 10**9
        return example_valid_size(example, hparams.min_length,
                                  max_validate_length)

    def define_shapes(example):
        batch_size = config and config.use_tpu and params["batch_size"]
        return standardize_shapes(example, batch_size=batch_size)

    # Read and preprocess
    data_dir = data_dir or (hasattr(hparams, "data_dir") and hparams.data_dir)

    if is_training and hparams.use_horovod:
        dataset = dataset.shard(num_shards=hparams.hvd_size,
                                index=hparams.hvd_worker_id)

    if (force_repeat or is_training) and not prevent_repeat:
        # Repeat and skip a random number of records
        dataset = dataset.repeat()

    if is_training and skip_random_fraction_when_training:
        data_files = contrib.slim().parallel_reader.get_data_files(filepattern)
        #  In continuous_train_and_eval when switching between train and
        #  eval, this input_fn method gets called multiple times and it
        #  would give you the exact same samples from the last call
        #  (because the Graph seed is set). So this skip gives you some
        #  shuffling.
        dataset = skip_random_fraction(dataset, data_files[0])

    dataset = dataset.map(cast_ints_to_int32, num_parallel_calls=num_threads)

    if batch_size_means_tokens_param:
        batch_size_means_tokens = True
    else:
        if _are_shapes_fully_defined(dataset.output_shapes):
            batch_size_means_tokens = False
        else:
            tf.logging.warning(
                "Shapes are not fully defined. Assuming batch_size means tokens."
            )
            batch_size_means_tokens = True

    # Batching
    if not batch_size_means_tokens:
        # Batch size means examples per datashard.
        if config and config.use_tpu:
            # on TPU, we use params["batch_size"], which specifies the number of
            # examples across all datashards
            batch_size = params["batch_size"]
            dataset = dataset.batch(batch_size, drop_remainder=True)
        else:
            batch_size = hparams.batch_size * num_shards
            dataset = dataset.batch(batch_size)
    else:
        # batch_size means tokens per datashard
        if config and config.use_tpu:
            dataset = dataset.filter(tpu_valid_size)
            padded_shapes = pad_for_tpu(dataset.output_shapes, hparams,
                                        max_length)
            # on TPU, we use params["batch_size"], which specifies the number of
            # examples across all datashards
            batch_size = params["batch_size"]
            if hparams.pad_batch:
                tf.logging.warn(
                    "Padding the batch to ensure that remainder eval batches are "
                    "processed. This may lead to incorrect metrics for "
                    "non-zero-padded features, e.g. images. Use a smaller batch "
                    "size that has no remainder in that case.")
                dataset = dataset.padded_batch(batch_size,
                                               padded_shapes,
                                               drop_remainder=False)
                dataset = dataset.map(functools.partial(
                    pad_batch, batch_multiple=batch_size),
                                      num_parallel_calls=num_threads)
            else:
                dataset = dataset.padded_batch(batch_size,
                                               padded_shapes,
                                               drop_remainder=True)
        else:
            # On GPU, bucket by length
            dataset = dataset.filter(gpu_valid_size)
            cur_batching_scheme = hparams_to_batching_scheme(
                hparams,
                shard_multiplier=num_shards,
                length_multiplier=batch_size_multiplier)
            if hparams.use_fixed_batch_size:
                # Here batch_size really means examples per datashard.
                cur_batching_scheme["batch_sizes"] = [hparams.batch_size]
                cur_batching_scheme["boundaries"] = []

            force_fixed_batch_size = hparams.use_static_shapes
            fixed_batch_size = hparams.batch_size // hparams.max_length
            if force_fixed_batch_size:
                cur_batching_scheme["batch_sizes"] = [
                    fixed_batch_size, fixed_batch_size
                ]
                cur_batching_scheme["boundaries"] = [hparams.max_length + 1]

            dataset = dataset.apply(
                tf.data.experimental.bucket_by_sequence_length(
                    example_length,
                    cur_batching_scheme["boundaries"],
                    cur_batching_scheme["batch_sizes"],
                    pad_to_bucket_boundary=force_fixed_batch_size))

            if force_fixed_batch_size and is_training:

                def _force_shape(example):
                    for _, t in six.iteritems(example):
                        shape = t.get_shape().as_list()
                        shape[0] = fixed_batch_size
                        shape[1] = shape[1] or hparams.max_length
                        t.set_shape(t.get_shape().merge_with(shape))
                        # Assert shapes are fully known
                        t.get_shape().assert_is_fully_defined()
                    return example

                dataset = dataset.map(_force_shape,
                                      num_parallel_calls=num_threads)

            if not is_training:
                batch_multiple = num_shards
                if hparams.use_fixed_batch_size:
                    # Make sure the last batch has the same fixed size as the rest.
                    batch_multiple *= hparams.batch_size
                if batch_multiple > 1:
                    tf.logging.warn(
                        "Padding the batch to ensure that remainder eval batches have "
                        "a batch size divisible by the number of data shards. This may "
                        "lead to incorrect metrics for non-zero-padded features, e.g. "
                        "images. Use a single datashard (i.e. 1 GPU) in that case."
                    )
                    dataset = dataset.map(functools.partial(
                        pad_batch, batch_multiple=batch_multiple),
                                          num_parallel_calls=num_threads)

    dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)

    # Add shuffling for training batches. This is necessary along with record
    # level shuffling in the dataset generation. Record shuffling will shuffle
    # the examples. However, in some cases, it's possible that the shuffle
    # buffer size for record shuffling is smaller than the batch size. In such
    # cases, adding batch shuffling ensures that the data is in random order
    # during training
    if (is_training and hasattr(hparams, "batch_shuffle_size")
            and hparams.batch_shuffle_size):
        dataset = dataset.shuffle(hparams.batch_shuffle_size)

    # Split batches into chunks if targets are too long.
    # The new "chunk_number" feature is 0 for the first chunk and goes up then.
    # Chunks are reversed so the 0th chunk comes first, then the 1st and so on,
    # so models can attend to them in the order they arrive. The last chunk is
    # usually the one containing the end of the target sentence (EOS).
    chunk_length = hparams.get("split_targets_chunk_length", 0)
    max_chunks = hparams.get("split_targets_max_chunks", 100)
    if chunk_length > 0:

        def is_nonzero_chunk(example):
            """A chunk is zero if all targets are 0s."""
            return tf.less(0, tf.reduce_sum(tf.abs(example["targets"])))

        def split_on_length(example):
            """Split a batch of ditcs on length."""
            x = example["targets"]
            # TODO(kitaev): This code breaks if chunk_length * max_chunks < batch_size
            length_diff = chunk_length * max_chunks - tf.shape(x)[1]
            padded_x = tf.pad(x, [(0, 0), (0, length_diff), (0, 0), (0, 0)])
            chunks = [
                padded_x[:, i * chunk_length:(i + 1) * chunk_length, :, :]
                for i in range(max_chunks - 1)
            ]
            chunks.append(padded_x[:, (max_chunks - 1) * chunk_length:, :, :])
            new_example = {}
            # Setting chunk_number to be tf.range(max_chunks) is incompatible with TPU
            new_example["chunk_number"] = tf.concat([
                tf.expand_dims(tf.ones_like(c) * n, axis=0)
                for n, c in enumerate(chunks)
            ],
                                                    axis=0)
            new_example["targets"] = tf.concat(
                [tf.expand_dims(c, axis=0) for c in chunks], axis=0)
            for k in example:
                if k != "targets":
                    assert k != "chunk_number", (
                        "Chunking code expects the chunk_number feature name to be "
                        "available")
                    new_example[k] = tf.concat([
                        tf.expand_dims(example[k], axis=0)
                        for _ in range(max_chunks)
                    ],
                                               axis=0)
            return tf.data.Dataset.from_tensor_slices(new_example)

        dataset = dataset.flat_map(split_on_length)
        dataset = dataset.filter(is_nonzero_chunk)

        # The chunking data pipeline thus far creates batches of examples where all
        # of the examples have the same chunk number. This can lead to periodic
        # fluctuations in the loss; for example, when all examples in the batch have
        # chunk number 0 the loss may be higher than midway through a sequence.
        # Enabling split_targets_strided_training adjusts the data so that each
        # batch includes examples at various points within a sequence.
        if is_training and hparams.split_targets_strided_training:
            # TODO(kitaev): make sure that shape inference works on GPU, not just TPU.
            inferred_batch_size = dataset.output_shapes["targets"].as_list()[0]
            if inferred_batch_size is None:
                raise ValueError(
                    "Strided training is only implemented when the batch size can be "
                    "inferred statically, for example when training on TPU.")
            chunk_stride = inferred_batch_size * max(
                1, max_chunks // inferred_batch_size) + 1

            def collapse_nested_datasets(example):
                """Converts a dataset of datasets to a dataset of tensor features."""
                new_example = {}
                for k, v in example.items():
                    v = tf.data.experimental.get_single_element(
                        v.batch(inferred_batch_size, drop_remainder=True))
                    new_example[k] = v
                return tf.data.Dataset.from_tensor_slices(new_example)

            dataset = dataset.unbatch()
            dataset = dataset.window(inferred_batch_size, inferred_batch_size,
                                     chunk_stride)
            dataset = dataset.flat_map(collapse_nested_datasets)
            dataset = dataset.batch(inferred_batch_size, drop_remainder=True)

    def prepare_for_output(example):
        if mode == tf.estimator.ModeKeys.PREDICT:
            example["infer_targets"] = example.pop("targets")
            return example
        else:
            return example, example[hparams.get(key="labels_feature_name",
                                                default="targets")]

    dataset = dataset.map(prepare_for_output, num_parallel_calls=num_threads)
    dataset = dataset.prefetch(2)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # This is because of a bug in the Estimator that short-circuits prediction
        # if it doesn't see a QueueRunner. DummyQueueRunner implements the
        # minimal expected interface but does nothing.
        tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS, DummyQueueRunner())

    return dataset
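The split_on_length transform pads targets to chunk_length * max_chunks and slices them into equal chunks indexed by chunk_number. A numpy sketch of the same slicing, with illustrative shapes:

import numpy as np

chunk_length, max_chunks = 4, 3
x = np.arange(18).reshape(2, 9)              # [batch, length] targets
pad = chunk_length * max_chunks - x.shape[1]
padded = np.pad(x, [(0, 0), (0, pad)])       # pad length 9 up to 12
chunks = [padded[:, i * chunk_length:(i + 1) * chunk_length]
          for i in range(max_chunks)]
for chunk_number, c in enumerate(chunks):
    print(chunk_number, c.shape)             # 0 (2, 4) / 1 (2, 4) / 2 (2, 4)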
Example No. 26
def get_variable(name,
                 dtype=None,
                 initializer=None,
                 regularizer=None,
                 trainable=True,
                 collections=None,
                 caching_device=None,
                 validate_shape=True):
    """Returns TensorTrain object with tf.Variables as the TT-cores.

  Args:
    name: The name of the new or existing TensorTrain variable.
      Used to name the TT-cores.
    dtype: Type of the new or existing TensorTrain variable TT-cores (defaults
      to DT_FLOAT).
    initializer: TensorTrain or TensorTrainBatch, initializer for the variable
      if one is created.
    regularizer: A (TensorTrain -> Tensor or None) function; the result of
      applying it on a newly created variable will be added to the collection
      GraphKeys.REGULARIZATION_LOSSES and can be used for regularization.
    trainable: If True also add the variable to the graph collection
      GraphKeys.TRAINABLE_VARIABLES (see tf.Variable).
    collections:  List of graph collections keys to add the Variables
      (underlying TT-cores). Defaults to [GraphKeys.GLOBAL_VARIABLES]
      (see tf.Variable).
    caching_device: Optional device string or function describing where
      the Variable should be cached for reading. Defaults to the Variable's
      device. If not None, caches on another device. Typical use is to cache
      on the device where the Ops using the Variable reside, to deduplicate
      copying through Switch and other conditional statements.
    validate_shape: If False, allows the variable to be initialized with a value
      of unknown shape. If True, the default, the shape of initial_value must be
      known.

  Returns:
    The created or existing `TensorTrain` object with tf.Variables TT-cores.

  Raises:
    `ValueError`: when creating a new variable and shape is not declared, when
      violating reuse during variable creation, or when initializer dtype and
      dtype don't match. Reuse is set inside variable_scope.
  """
    # TODO: support validate shape: check that the tensor dimensions are correct,
    # but ignore the ranks.
    # TODO: add validate ranks flag.

    reuse = tf.get_variable_scope().reuse
    if not reuse and initializer is None:
        raise ValueError(
            'Scope reuse is False and initializer is not provided.')

    variable_cores = []

    if reuse and not utils.in_eager_mode():
        # Find an existing variable in the collection.
        path = tf.get_variable_scope().name
        if path != '' and path[-1] != '/':
            path += '/'
        path += name

        found_v = None
        for v in tf.get_collection('TensorTrainVariables'):
            if v.name == path:
                found_v = v
                break
        if found_v is None:
            raise ValueError(
                'ValueError: Variable %s does not exist, or was not '
                'created with t3f.get_tt_variable(). Did you mean to '
                'set reuse=None in VarScope?' % name)
        with tf.variable_scope(name):
            # Try to get the first core through tf.get_variable to check that we don't
            # violate reuse: it will raise a ValueError otherwise.
            tf.get_variable('core_0', dtype=dtype)
        return found_v
    else:
        # Create new variable.
        with tf.variable_scope(name):
            num_dims = initializer.ndims()
            for i in range(num_dims):
                curr_core_var = tf.get_variable(
                    'core_%d' % i,
                    initializer=initializer.tt_cores[i],
                    dtype=dtype,
                    trainable=trainable,
                    collections=collections,
                    caching_device=caching_device)
                variable_cores.append(curr_core_var)
        if isinstance(initializer, TensorTrain):
            v = TensorTrain(variable_cores,
                            initializer.get_raw_shape(),
                            initializer.get_tt_ranks(),
                            convert_to_tensors=False)
        else:
            v = TensorTrainBatch(variable_cores,
                                 initializer.get_raw_shape(),
                                 initializer.get_tt_ranks(),
                                 initializer.batch_size,
                                 convert_to_tensors=False)

        # Add the created TensorTrain object into a collection so that we can
        # retrieve it in the future by get_tt_variable('name').
        tf.add_to_collection('TensorTrainVariables', v)

        # Run the regularizer if requested and save the resulting loss.
        if regularizer:
            with tf.name_scope(name + "/Regularizer/"):
                loss = regularizer(v)
            if loss is not None:
                tf.logging.vlog(
                    1, "Applied regularizer to %s and added the result %s "
                    "to REGULARIZATION_LOSSES.", v.name, loss.name)
                tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, loss)
        return v
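Typical usage pairs this helper with an initializer from the same library and variable-scope reuse; a sketch assuming the t3f package this snippet appears to come from:

import tensorflow.compat.v1 as tf
import t3f  # assumed library providing random_matrix and get_variable
tf.disable_eager_execution()

# Create a TT-matrix variable from a random initializer.
init = t3f.random_matrix([[4, 4], [5, 5]], tt_rank=2)
W = t3f.get_variable('W', initializer=init)

# Later, retrieve the same TensorTrain object under scope reuse.
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    W_again = t3f.get_variable('W')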
Example No. 27
    def model_fn(features, labels, mode, params=None):
        """Constructs the object detection model.

    Args:
      features: Dictionary of feature tensors, returned from `input_fn`.
      labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
        otherwise None.
      mode: Mode key from tf.estimator.ModeKeys.
      params: Parameter dictionary passed from the estimator.

    Returns:
      An `EstimatorSpec` that encapsulates the model and its serving
        configurations.
    """
        params = params or {}
        total_loss, train_op, detections, export_outputs = None, None, None, None
        is_training = mode == tf.estimator.ModeKeys.TRAIN

        # Make sure to set the Keras learning phase. True during training,
        # False for inference.
        tf.keras.backend.set_learning_phase(is_training)
        # Set policy for mixed-precision training with Keras-based models.
        if use_tpu and train_config.use_bfloat16:
            from tensorflow.python.keras.engine import base_layer_utils  # pylint: disable=g-import-not-at-top
            # Enable v2 behavior, as `mixed_bfloat16` is only supported in TF 2.0.
            base_layer_utils.enable_v2_dtype_behavior()
            tf2.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')
        detection_model = detection_model_fn(is_training=is_training,
                                             add_summaries=(not use_tpu))
        scaffold_fn = None

        if mode == tf.estimator.ModeKeys.TRAIN:
            labels = unstack_batch(labels,
                                   unpad_groundtruth_tensors=train_config.
                                   unpad_groundtruth_tensors)
        elif mode == tf.estimator.ModeKeys.EVAL:
            # For evaluating on train data, it is necessary to check whether
            # groundtruth must be unpadded.
            boxes_shape = (labels[fields.InputDataFields.groundtruth_boxes].
                           get_shape().as_list())
            unpad_groundtruth_tensors = boxes_shape[
                1] is not None and not use_tpu
            labels = unstack_batch(
                labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            provide_groundtruth(detection_model, labels)

        preprocessed_images = features[fields.InputDataFields.image]

        side_inputs = detection_model.get_side_inputs(features)

        if use_tpu and train_config.use_bfloat16:
            with tf.tpu.bfloat16_scope():
                prediction_dict = detection_model.predict(
                    preprocessed_images,
                    features[fields.InputDataFields.true_image_shape],
                    **side_inputs)
                prediction_dict = ops.bfloat16_to_float32_nested(
                    prediction_dict)
        else:
            prediction_dict = detection_model.predict(
                preprocessed_images,
                features[fields.InputDataFields.true_image_shape],
                **side_inputs)

        def postprocess_wrapper(args):
            return detection_model.postprocess(args[0], args[1])

        if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT):
            if use_tpu and postprocess_on_cpu:
                detections = tf.tpu.outside_compilation(
                    postprocess_wrapper,
                    (prediction_dict,
                     features[fields.InputDataFields.true_image_shape]))
            else:
                detections = postprocess_wrapper(
                    (prediction_dict,
                     features[fields.InputDataFields.true_image_shape]))

        if mode == tf.estimator.ModeKeys.TRAIN:
            load_pretrained = hparams.load_pretrained if hparams else False
            if train_config.fine_tune_checkpoint and load_pretrained:
                if not train_config.fine_tune_checkpoint_type:
                    # train_config.from_detection_checkpoint field is deprecated. For
                    # backward compatibility, set train_config.fine_tune_checkpoint_type
                    # based on train_config.from_detection_checkpoint.
                    if train_config.from_detection_checkpoint:
                        train_config.fine_tune_checkpoint_type = 'detection'
                    else:
                        train_config.fine_tune_checkpoint_type = 'classification'
                asg_map = detection_model.restore_map(
                    fine_tune_checkpoint_type=train_config.
                    fine_tune_checkpoint_type,
                    load_all_detection_checkpoint_vars=(
                        train_config.load_all_detection_checkpoint_vars))
                available_var_map = (
                    variables_helper.get_variables_available_in_checkpoint(
                        asg_map,
                        train_config.fine_tune_checkpoint,
                        include_global_step=False))
                if use_tpu:

                    def tpu_scaffold():
                        tf.train.init_from_checkpoint(
                            train_config.fine_tune_checkpoint,
                            available_var_map)
                        return tf.train.Scaffold()

                    scaffold_fn = tpu_scaffold
                else:
                    tf.train.init_from_checkpoint(
                        train_config.fine_tune_checkpoint, available_var_map)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            if (mode == tf.estimator.ModeKeys.EVAL
                    and eval_config.use_dummy_loss_in_eval):
                total_loss = tf.constant(1.0)
                losses_dict = {'Loss/total_loss': total_loss}
            else:
                losses_dict = detection_model.loss(
                    prediction_dict,
                    features[fields.InputDataFields.true_image_shape])
                losses = list(losses_dict.values())
                if train_config.add_regularization_loss:
                    regularization_losses = (
                        detection_model.regularization_losses())
                    if use_tpu and train_config.use_bfloat16:
                        regularization_losses = ops.bfloat16_to_float32_nested(
                            regularization_losses)
                    if regularization_losses:
                        regularization_loss = tf.add_n(
                            regularization_losses, name='regularization_loss')
                        losses.append(regularization_loss)
                        losses_dict[
                            'Loss/regularization_loss'] = regularization_loss
                total_loss = tf.add_n(losses, name='total_loss')
                losses_dict['Loss/total_loss'] = total_loss

            if 'graph_rewriter_config' in configs:
                graph_rewriter_fn = graph_rewriter_builder.build(
                    configs['graph_rewriter_config'], is_training=is_training)
                graph_rewriter_fn()

            # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we
            # can write learning rate summaries on TPU without host calls.
            global_step = tf.train.get_or_create_global_step()
            training_optimizer, optimizer_summary_vars = optimizer_builder.build(
                train_config.optimizer)

        if mode == tf.estimator.ModeKeys.TRAIN:
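            # CrossShardOptimizer averages gradients across TPU cores before
            # applying updates, keeping the replicas' weights in sync.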
            if use_tpu:
                training_optimizer = tf.tpu.CrossShardOptimizer(
                    training_optimizer)

            # Optionally freeze some layers by setting their gradients to be zero.
            trainable_variables = None
            include_variables = (train_config.update_trainable_variables
                                 if train_config.update_trainable_variables
                                 else None)
            exclude_variables = (train_config.freeze_variables
                                 if train_config.freeze_variables else None)
            trainable_variables = slim.filter_variables(
                tf.trainable_variables(),
                include_patterns=include_variables,
                exclude_patterns=exclude_variables)

            clip_gradients_value = None
            if train_config.gradient_clipping_by_norm > 0:
                clip_gradients_value = train_config.gradient_clipping_by_norm

            if not use_tpu:
                for var in optimizer_summary_vars:
                    tf.summary.scalar(var.op.name, var)
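            # On TPU, pass an empty summary list to disable summary ops (they
            # are unsupported there); None lets optimize_loss use its defaults.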
            summaries = [] if use_tpu else None
            if train_config.summarize_gradients:
                summaries = [
                    'gradients', 'gradient_norm', 'global_gradient_norm'
                ]
            train_op = slim.optimizers.optimize_loss(
                loss=total_loss,
                global_step=global_step,
                learning_rate=None,
                clip_gradients=clip_gradients_value,
                optimizer=training_optimizer,
                update_ops=detection_model.updates(),
                variables=trainable_variables,
                summaries=summaries,
                name='')  # Preventing scope prefix on all variables.

        if mode == tf.estimator.ModeKeys.PREDICT:
            exported_output = exporter_lib.add_output_tensor_nodes(detections)
            export_outputs = {
                tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
                tf.estimator.export.PredictOutput(exported_output)
            }

        eval_metric_ops = None
        scaffold = None
        if mode == tf.estimator.ModeKeys.EVAL:
            class_agnostic = (fields.DetectionResultFields.detection_classes
                              not in detections)
            groundtruth = _prepare_groundtruth_for_eval(
                detection_model, class_agnostic,
                eval_input_config.max_number_of_boxes)
            use_original_images = fields.InputDataFields.original_image in features
            if use_original_images:
                eval_images = features[fields.InputDataFields.original_image]
                true_image_shapes = tf.slice(
                    features[fields.InputDataFields.true_image_shape], [0, 0],
                    [-1, 3])
                original_image_spatial_shapes = features[
                    fields.InputDataFields.original_image_spatial_shape]
            else:
                eval_images = features[fields.InputDataFields.image]
                true_image_shapes = None
                original_image_spatial_shapes = None

            eval_dict = eval_util.result_dict_for_batched_example(
                eval_images,
                features[inputs.HASH_KEY],
                detections,
                groundtruth,
                class_agnostic=class_agnostic,
                scale_to_absolute=True,
                original_image_spatial_shapes=original_image_spatial_shapes,
                true_image_shapes=true_image_shapes)

            if fields.InputDataFields.image_additional_channels in features:
                channels_key = fields.InputDataFields.image_additional_channels
                eval_dict[channels_key] = features[channels_key]

            if class_agnostic:
                category_index = (
                    label_map_util.create_class_agnostic_category_index())
            else:
                category_index = label_map_util.create_category_index_from_labelmap(
                    eval_input_config.label_map_path)
            vis_metric_ops = None
            if not use_tpu and use_original_images:
                keypoint_edges = [(kp.start, kp.end)
                                  for kp in eval_config.keypoint_edge]

                eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections(
                    category_index,
                    max_examples_to_draw=eval_config.num_visualizations,
                    max_boxes_to_draw=eval_config.max_num_boxes_to_visualize,
                    min_score_thresh=eval_config.min_score_threshold,
                    use_normalized_coordinates=False,
                    keypoint_edges=keypoint_edges or None)
                vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops(
                    eval_dict)

            # Eval metrics on a single example.
            eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
                eval_config, list(category_index.values()), eval_dict)
            for loss_key, loss_tensor in losses_dict.items():
                eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
            for var in optimizer_summary_vars:
                eval_metric_ops[var.op.name] = (var, tf.no_op())
            if vis_metric_ops is not None:
                eval_metric_ops.update(vis_metric_ops)
            eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()}

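            # Evaluate against the exponential-moving-average (shadow) weights
            # instead of the raw training variables when EMA is enabled.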
            if eval_config.use_moving_averages:
                variable_averages = tf.train.ExponentialMovingAverage(0.0)
                variables_to_restore = variable_averages.variables_to_restore()
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    variables_to_restore,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours
                )
                scaffold = tf.train.Scaffold(saver=saver)

        # EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
        if use_tpu and mode != tf.estimator.ModeKeys.EVAL:
            return tf.estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                scaffold_fn=scaffold_fn,
                predictions=detections,
                loss=total_loss,
                train_op=train_op,
                eval_metrics=eval_metric_ops,
                export_outputs=export_outputs)
        else:
            if scaffold is None:
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    sharded=True,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
                    save_relative_paths=True)
                tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
                scaffold = tf.train.Scaffold(saver=saver)
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=detections,
                                              loss=total_loss,
                                              train_op=train_op,
                                              eval_metric_ops=eval_metric_ops,
                                              export_outputs=export_outputs,
                                              scaffold=scaffold)
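
A minimal sketch of how a model_fn like the one above is typically handed to an Estimator. The helper name run_training and the config values below are illustrative assumptions, not part of the snippet.

import tensorflow.compat.v1 as tf

def run_training(model_fn, model_dir, train_input_fn, num_steps):
    # Hypothetical wiring; assumes model_fn closes over its configs
    # the way the example above does.
    run_config = tf.estimator.RunConfig(
        model_dir=model_dir,
        save_checkpoints_steps=1000,  # assumed checkpoint cadence
        keep_checkpoint_max=5)
    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)
    estimator.train(input_fn=train_input_fn, max_steps=num_steps)
    return estimator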
Exemplo n.º 28
0
def add_to_collection(trainable_variables, prefix):
  """Put variables into graph collection."""
  for after_block, variables in trainable_variables.items():
    collection = prefix + str(after_block)
    for var in variables:
      tf.add_to_collection(collection, var)
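
A hedged usage sketch for the helper above; the variables and the 'block_' prefix are made up for illustration.

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

# Hypothetical variables grouped by the block they come after.
trainable_variables = {
    1: [tf.Variable(tf.zeros([3]), name='w1')],
    2: [tf.Variable(tf.zeros([3]), name='w2')],
}
add_to_collection(trainable_variables, prefix='block_')
# The grouped variables can be fetched back per block later:
block1_vars = tf.get_collection('block_1')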
Exemplo n.º 29
0
def get_loss(mask_label, center_label, \
             heading_class_label, heading_residual_label, \
             size_class_label, size_residual_label, \
             end_points, \
             corner_loss_weight=10.0, \
             box_loss_weight=1.0):
    ''' Loss functions for 3D object detection.
    Input:
        mask_label: TF int32 tensor in shape (B,N)
        center_label: TF tensor in shape (B,3)
        heading_class_label: TF int32 tensor in shape (B,)
        heading_residual_label: TF tensor in shape (B,)
        size_class_label: TF int32 tensor in shape (B,)
        size_residual_label: TF tensor in shape (B,3)
        end_points: dict, outputs from our model
        corner_loss_weight: float scalar
        box_loss_weight: float scalar
    Output:
        total_loss: TF scalar tensor
            the total_loss is also added to the losses collection
    '''
    # 3D Segmentation loss
    mask_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(\
        logits=end_points['mask_logits'], labels=mask_label))
    tf.summary.scalar('3d mask loss', mask_loss)

    # Center regression losses
    center_dist = tf.norm(center_label - end_points['center'], axis=-1)
    center_loss = huber_loss(center_dist, delta=2.0)
    tf.summary.scalar('center loss', center_loss)
    stage1_center_dist = tf.norm(center_label - \
        end_points['stage1_center'], axis=-1)
    stage1_center_loss = huber_loss(stage1_center_dist, delta=1.0)
    tf.summary.scalar('stage1 center loss', stage1_center_loss)

    # Heading loss
    heading_class_loss = tf.reduce_mean( \
        tf.nn.sparse_softmax_cross_entropy_with_logits( \
        logits=end_points['heading_scores'], labels=heading_class_label))
    tf.summary.scalar('heading class loss', heading_class_loss)

    hcls_onehot = tf.one_hot(heading_class_label,
                             depth=NUM_HEADING_BIN,
                             on_value=1,
                             off_value=0,
                             axis=-1)  # BxNUM_HEADING_BIN
    heading_residual_normalized_label = \
        heading_residual_label / (np.pi/NUM_HEADING_BIN)
    heading_residual_normalized_loss = huber_loss(tf.reduce_sum( \
        end_points['heading_residuals_normalized']*tf.to_float(hcls_onehot), axis=1) - \
        heading_residual_normalized_label, delta=1.0)
    tf.summary.scalar('heading residual normalized loss',
                      heading_residual_normalized_loss)

    # Size loss
    size_class_loss = tf.reduce_mean( \
        tf.nn.sparse_softmax_cross_entropy_with_logits( \
        logits=end_points['size_scores'], labels=size_class_label))
    tf.summary.scalar('size class loss', size_class_loss)

    scls_onehot = tf.one_hot(size_class_label,
                             depth=NUM_SIZE_CLUSTER,
                             on_value=1,
                             off_value=0,
                             axis=-1)  # BxNUM_SIZE_CLUSTER
    scls_onehot_tiled = tf.tile(tf.expand_dims( \
        tf.to_float(scls_onehot), -1), [1,1,3]) # BxNUM_SIZE_CLUSTERx3
    predicted_size_residual_normalized = tf.reduce_sum( \
        end_points['size_residuals_normalized']*scls_onehot_tiled, axis=[1]) # Bx3

    mean_size_arr_expand = tf.expand_dims( \
        tf.constant(g_mean_size_arr, dtype=tf.float32),0) # 1xNUM_SIZE_CLUSTERx3
    mean_size_label = tf.reduce_sum( \
        scls_onehot_tiled * mean_size_arr_expand, axis=[1]) # Bx3
    size_residual_label_normalized = size_residual_label / mean_size_label
    size_normalized_dist = tf.norm( \
        size_residual_label_normalized - predicted_size_residual_normalized,
        axis=-1)
    size_residual_normalized_loss = huber_loss(size_normalized_dist, delta=1.0)
    tf.summary.scalar('size residual normalized loss',
                      size_residual_normalized_loss)

    # Corner loss
    # We select the predicted corners corresponding to the
    # GT heading bin and size cluster.
    corners_3d = get_box3d_corners(
        end_points['center'], end_points['heading_residuals'],
        end_points['size_residuals'])  # (B,NH,NS,8,3)
    gt_mask = tf.tile(tf.expand_dims(hcls_onehot, 2), [1,1,NUM_SIZE_CLUSTER]) * \
        tf.tile(tf.expand_dims(scls_onehot,1), [1,NUM_HEADING_BIN,1]) # (B,NH,NS)
    corners_3d_pred = tf.reduce_sum( \
        tf.to_float(tf.expand_dims(tf.expand_dims(gt_mask,-1),-1)) * corners_3d,
        axis=[1,2]) # (B,8,3)

    heading_bin_centers = tf.constant( \
        np.arange(0,2*np.pi,2*np.pi/NUM_HEADING_BIN), dtype=tf.float32) # (NH,)
    heading_label = tf.expand_dims(heading_residual_label,1) + \
        tf.expand_dims(heading_bin_centers, 0) # (B,NH)
    heading_label = tf.reduce_sum(tf.to_float(hcls_onehot) * heading_label, 1)
    mean_sizes = tf.expand_dims( \
        tf.constant(g_mean_size_arr, dtype=tf.float32), 0) # (1,NS,3)
    size_label = mean_sizes + \
        tf.expand_dims(size_residual_label, 1) # (1,NS,3) + (B,1,3) = (B,NS,3)
    size_label = tf.reduce_sum( \
        tf.expand_dims(tf.to_float(scls_onehot),-1)*size_label, axis=[1]) # (B,3)
    corners_3d_gt = get_box3d_corners_helper( \
        center_label, heading_label, size_label) # (B,8,3)
    corners_3d_gt_flip = get_box3d_corners_helper( \
        center_label, heading_label+np.pi, size_label) # (B,8,3)

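    # Compare the predicted corners with both the GT box and its
    # 180-degree-flipped copy, so a heading estimate off by pi is not
    # penalized by the corner loss.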
    corners_dist = tf.minimum(
        tf.norm(corners_3d_pred - corners_3d_gt, axis=-1),
        tf.norm(corners_3d_pred - corners_3d_gt_flip, axis=-1))
    corners_loss = huber_loss(corners_dist, delta=1.0)
    tf.summary.scalar('corners loss', corners_loss)

    # Weighted sum of all losses
    total_loss = mask_loss + box_loss_weight * (center_loss + \
        heading_class_loss + size_class_loss + \
        heading_residual_normalized_loss*20 + \
        size_residual_normalized_loss*20 + \
        stage1_center_loss + \
        corner_loss_weight*corners_loss)
    tf.add_to_collection('losses', total_loss)

    return total_loss
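
huber_loss is referenced above but not defined in this snippet. Below is a minimal sketch consistent with how it is called here (a scalar mean over a tensor of distances, with a delta knee); treat it as an assumption, not the original helper.

import tensorflow.compat.v1 as tf

def huber_loss(error, delta=1.0):
    # Quadratic within +/-delta, linear outside; reduced to a scalar mean.
    abs_error = tf.abs(error)
    quadratic = tf.minimum(abs_error, delta)
    linear = abs_error - quadratic
    return tf.reduce_mean(0.5 * quadratic**2 + delta * linear)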
Exemplo n.º 30
0
    def train(self,
              input_fn,
              checkpoint_path=None,
              save_checkpoint_steps=None,
              save_checkpoint_secs=None):
        if self._cluster_spec is not None:
            device_fn = tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % self._worker_rank,
                merge_devices=True,
                cluster=self._cluster_spec)
            cluster_def = self._cluster_spec.as_cluster_def()
            local_address = self._cluster_spec.job_tasks('worker')[
                self._worker_rank]
            server = tf.train.Server(
                tf.train.ClusterSpec({'local': {0: local_address}}),
                job_name='local',
                task_index=0)
            target = 'grpc://' + local_address
        else:
            device_fn = None
            cluster_def = None
            target = None

        config = tf.ConfigProto(cluster_def=cluster_def)
        config.inter_op_parallelism_threads = 4
        config.intra_op_parallelism_threads = 4
        config.experimental.share_session_state_in_clusterspec_propagation = True
        tf.config.set_soft_device_placement(False)

        with tf.Graph().as_default() as g:
            with tf.device(device_fn):
                features, labels = self._get_features_and_labels_from_input_fn(
                    input_fn, ModeKeys.TRAIN)
                spec, _ = self._get_model_spec(features, labels, ModeKeys.TRAIN)

            # Explicitly add a Saver
            if not tf.get_collection(tf.GraphKeys.SAVERS):
                saver = tf.train.Saver(
                    sharded=True,
                    defer_build=True,
                    save_relative_paths=True)  # Must set for portability
                tf.add_to_collection(tf.GraphKeys.SAVERS, saver)

            listener = DataCheckpointSaverListener(self._trainer_master,
                                                   self._application_id)
            saver_hook = tf.estimator.CheckpointSaverHook(
                checkpoint_path, save_secs=save_checkpoint_secs,
                save_steps=save_checkpoint_steps, listeners=[listener])
            self._bridge.connect()

            try:
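                # Checkpointing is driven by saver_hook above, so the session's
                # built-in checkpoint saving is disabled below.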
                with tf.train.MonitoredTrainingSession(
                    master=target,
                    config=config,
                    is_chief=(self._worker_rank == 0),
                    chief_only_hooks=[saver_hook],
                    checkpoint_dir=checkpoint_path,
                    save_checkpoint_steps=None,
                    save_checkpoint_secs=None,
                    hooks=spec.training_hooks) as sess:
                    iter_id = 0

                    data_checkpoint_value = None
                    if hasattr(saver_hook, "data_checkpoint"):
                        data_checkpoint_value = saver_hook.data_checkpoint
                    if not self._restore_datablock(data_checkpoint_value):
                        raise ValueError(
                            "Failed to restore from data checkpoint")

                    while not sess.should_stop():
                        self._bridge.start(iter_id)
                        logging.debug('after bridge start.')
                        start_time = time.time()
                        sess.run(spec.train_op, feed_dict={})
                        end_time = time.time()
                        metrics.emit_timer(
                            name="iter_timer",
                            value=end_time-start_time,
                            tags={})
                        logging.debug('after session run.')
                        self._bridge.commit()
                        logging.debug('after bridge commit.')
                        iter_id += 1
            finally:
                self._bridge.terminate()

        return self
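
A hedged sketch of how such a trainer might be driven; make_input_fn, the checkpoint path, and the trainer instance are assumptions for illustration.

import tensorflow.compat.v1 as tf

def make_input_fn(batch_size):
    # Hypothetical input_fn; a real one would read the federated data blocks.
    def input_fn():
        dataset = tf.data.Dataset.from_tensor_slices(
            ({'x': tf.zeros([128, 4])}, tf.zeros([128], dtype=tf.int64)))
        return dataset.batch(batch_size).repeat()
    return input_fn

# Assuming trainer is an instance of the class defined above:
# trainer.train(make_input_fn(32),
#               checkpoint_path='/tmp/ckpt',  # hypothetical path
#               save_checkpoint_steps=1000)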