def _get_squad_model():
     """Get Squad model and optimizer."""
     squad_model, core_model = bert_models.squad_model(
         bert_config,
         max_seq_length,
         float_type=tf.float16 if use_float16 else tf.float32,
         hub_module_url=FLAGS.hub_module_url)
     squad_model.optimizer = optimization.create_optimizer(
         FLAGS.learning_rate, steps_per_epoch * epochs, warmup_steps)
     if use_float16:
         # Wraps optimizer with a LossScaleOptimizer. This is done automatically
         # in compile() with the "mixed_float16" policy, but since we do not call
         # compile(), we must wrap the optimizer manually.
         squad_model.optimizer = (
             tf.keras.mixed_precision.experimental.LossScaleOptimizer(
                 squad_model.optimizer,
                 loss_scale=common_flags.get_loss_scale()))
     if FLAGS.fp16_implementation == 'graph_rewrite':
         # Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as
         # determined by flags_core.get_tf_dtype(flags_obj) would be 'float32'
         # which will ensure tf.compat.v2.keras.mixed_precision and
         # tf.train.experimental.enable_mixed_precision_graph_rewrite do not double
         # up.
         squad_model.optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
             squad_model.optimizer)
     return squad_model, core_model
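
The comment in the fp16 branch above refers to the compile()-based path where Keras wraps the optimizer automatically. A minimal sketch of that alternative, using a hypothetical toy model rather than the SQuAD model and the same experimental mixed-precision API as this example:

import tensorflow as tf

# Under the "mixed_float16" policy, Model.compile() wraps the optimizer in a
# LossScaleOptimizer automatically, so no manual wrapping is needed.
tf.keras.mixed_precision.experimental.set_policy('mixed_float16')

toy_model = tf.keras.Sequential([tf.keras.layers.Dense(2)])
toy_model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss='mse')
# toy_model.optimizer is now a LossScaleOptimizer wrapping the Adam instance.
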
Example No. 2
def predict_squad_customized(strategy, input_meta_data, bert_config,
                             predict_tfrecord_path, num_steps):
    """Make predictions using a Bert-based squad model."""
    primary_cpu_task = '/job:worker' if FLAGS.tpu else ''

    with tf.device(primary_cpu_task):
        predict_dataset = input_pipeline.create_squad_dataset(
            predict_tfrecord_path,
            input_meta_data['max_seq_length'],
            FLAGS.predict_batch_size,
            is_training=False)
        predict_iterator = iter(
            strategy.experimental_distribute_dataset(predict_dataset))

        with strategy.scope():
            # Prediction always uses float32, even if training uses mixed precision.
            tf.keras.mixed_precision.experimental.set_policy('float32')
            squad_model, _ = bert_models.squad_model(
                bert_config,
                input_meta_data['max_seq_length'],
                float_type=tf.float32)

        checkpoint_path = tf.train.latest_checkpoint(FLAGS.model_dir)
        logging.info('Restoring checkpoints from %s', checkpoint_path)
        checkpoint = tf.train.Checkpoint(model=squad_model)
        checkpoint.restore(checkpoint_path).expect_partial()

        @tf.function
        def predict_step(iterator):
            """Predicts on distributed devices."""
            def _replicated_step(inputs):
                """Replicated prediction calculation."""
                x, _ = inputs
                unique_ids, start_logits, end_logits = squad_model(
                    x, training=False)
                return dict(unique_ids=unique_ids,
                            start_logits=start_logits,
                            end_logits=end_logits)

            outputs = strategy.experimental_run_v2(_replicated_step,
                                                   args=(next(iterator), ))
            return tf.nest.map_structure(strategy.experimental_local_results,
                                         outputs)

        all_results = []
        for _ in range(num_steps):
            predictions = predict_step(predict_iterator)
            for result in get_raw_results(predictions):
                all_results.append(result)
            if len(all_results) % 100 == 0:
                logging.info('Made predictions for %d records.',
                             len(all_results))
        return all_results
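
Both prediction examples rely on a get_raw_results helper that is not shown here. A minimal sketch, assuming the usual RawResult layout (unique_id plus per-token start/end logits) used by the run_squad utilities, of what such a helper might look like:

import collections

RawResult = collections.namedtuple(
    'RawResult', ['unique_id', 'start_logits', 'end_logits'])


def get_raw_results(predictions):
    """Flattens per-replica prediction dicts into one RawResult per example."""
    for unique_ids, start_logits, end_logits in zip(predictions['unique_ids'],
                                                    predictions['start_logits'],
                                                    predictions['end_logits']):
        for values in zip(unique_ids.numpy(), start_logits.numpy(),
                          end_logits.numpy()):
            yield RawResult(unique_id=values[0],
                            start_logits=values[1].tolist(),
                            end_logits=values[2].tolist())
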
Example No. 3
def _get_squad_model():
    """Get Squad model and optimizer."""
    squad_model, core_model = bert_models.squad_model(
        bert_config,
        max_seq_length,
        float_type=tf.float16 if FLAGS.use_fp16 else tf.float32,
        hub_module_url=FLAGS.hub_module_url)
    learning_rate = (FLAGS.learning_rate * hvd.size()
                     if FLAGS.use_horovod else FLAGS.learning_rate)
    squad_model.optimizer = optimization.create_optimizer(
        learning_rate, steps_per_epoch * epochs, warmup_steps,
        FLAGS.optimizer_type)
    if FLAGS.use_fp16:
        squad_model.optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
            squad_model.optimizer, dynamic=True)
    return squad_model, core_model
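
The Horovod-scaled learning rate above assumes gradients are averaged across workers during training. A sketch, not part of the original example, of the typical Horovod TF2 training step that would use this optimizer; with FLAGS.use_fp16, the LossScaleOptimizer's get_scaled_loss/get_unscaled_gradients would additionally be needed:

import horovod.tensorflow as hvd
import tensorflow as tf


@tf.function
def train_step(model, optimizer, loss_fn, inputs, labels, first_batch):
    with tf.GradientTape() as tape:
        loss = loss_fn(labels, model(inputs, training=True))
    # Average gradients across all Horovod workers before applying them.
    tape = hvd.DistributedGradientTape(tape)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    if first_batch:
        # Ensure every worker starts from the same model and optimizer state.
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)
    return loss
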
def export_squad(model_export_path, input_meta_data):
  """Exports a trained model as a `SavedModel` for inference.

  Args:
    model_export_path: a string specifying the path to the SavedModel directory.
    input_meta_data: dictionary containing meta data about input and model.

  Raises:
    ValueError: Export path is not specified, got an empty string or None.
  """
  if not model_export_path:
    raise ValueError('Export path is not specified: %s' % model_export_path)
  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  squad_model, _ = bert_models.squad_model(
      bert_config, input_meta_data['max_seq_length'], float_type=tf.float32)
  model_saving_utils.export_bert_model(
      model_export_path, model=squad_model, checkpoint_dir=FLAGS.model_dir)
Example No. 5
def export_squad(model_export_path, input_meta_data):
    """Exports a trained model as a `SavedModel` for inference.

  Args:
    model_export_path: a string specifying the path to the SavedModel directory.
    input_meta_data: dictionary containing meta data about input and model.

  Raises:
    Export path is not specified, got an empty string or None.
  """
    if not model_export_path:
        raise ValueError('Export path is not specified: %s' %
                         model_export_path)
    bert_config = MODEL_CLASSES[FLAGS.model_type][0].from_json_file(
        FLAGS.bert_config_file)
    squad_model, _ = bert_models.squad_model(bert_config,
                                             input_meta_data['max_seq_length'],
                                             float_type=tf.float32)
    model_saving_utils.export_bert_model(model_export_path + '/savedmodel',
                                         model=squad_model,
                                         checkpoint_dir=FLAGS.model_dir)

    model_name = FLAGS.triton_model_name

    model_folder = model_export_path + "/triton_models/" + model_name
    version_folder = model_folder + "/" + str(FLAGS.triton_model_version)
    final_model_folder = version_folder + "/model.savedmodel"

    if not os.path.exists(version_folder):
        os.makedirs(version_folder)
    if not os.path.exists(final_model_folder):
        os.rename(model_export_path + '/savedmodel', final_model_folder)
        print("Model saved to dir", final_model_folder)
    else:
        if FLAGS.triton_model_overwrite:
            shutil.rmtree(final_model_folder)
            os.rename(model_export_path + '/savedmodel', final_model_folder)
            print("WARNING: Existing model was overwritten. Model dir: {}".
                  format(final_model_folder))
        else:
            print(
                "ERROR: Could not save Triton model. Folder already exists. Use '--triton_model_overwrite=True' if you would like to overwrite an existing model. Model dir: {}"
                .format(final_model_folder))
            return

    config_filename = os.path.join(model_folder, "config.pbtxt")
    if os.path.exists(config_filename) and not FLAGS.triton_model_overwrite:
        print(
            "ERROR: Could not save Triton model config. Config file already exists. Use '--triton_model_overwrite=True' if you would like to overwrite an existing model config. Model config: {}"
            .format(config_filename))
        return

    config_template = r"""
name: "{model_name}"
platform: "tensorflow_savedmodel"
max_batch_size: {max_batch_size}
input [
    {{
        name: "input_mask"
        data_type: TYPE_INT32
        dims: {seq_length}
    }},
    {{
        name: "input_type_ids"
        data_type: TYPE_INT32
        dims: {seq_length}
    }},
    {{
        name: "input_word_ids"
        data_type: TYPE_INT32
        dims: {seq_length}
    }}
]
output [
    {{
        name: "end_positions"
        data_type: TYPE_FP32
        dims: {seq_length}
    }},
    {{
        name: "start_positions"
        data_type: TYPE_FP32
        dims: {seq_length}
    }}
]
{dynamic_batching}
instance_group [
    {{
        count: {engine_count}
        kind: KIND_GPU
        gpus: [{gpu_list}]
    }}
]"""

    batching_str = ""
    max_batch_size = FLAGS.triton_max_batch_size

    if FLAGS.triton_dyn_batching_delay > 0:
        # Use only full and half full batches
        pref_batch_size = [int(max_batch_size / 2.0), max_batch_size]

        batching_str = r"""
dynamic_batching {{
    preferred_batch_size: [{0}]
    max_queue_delay_microseconds: {1}
}}""".format(", ".join([str(x) for x in pref_batch_size]),
             int(FLAGS.triton_dyn_batching_delay * 1000.0))

    config_values = {
        "model_name": model_name,
        "max_batch_size": max_batch_size,
        "seq_length": input_meta_data['max_seq_length'],
        "dynamic_batching": batching_str,
        "gpu_list": ", ".join([
            x.name.split(":")[-1]
            for x in tf.config.list_physical_devices('GPU')
        ]),
        "engine_count": FLAGS.triton_engine_count
    }

    with open(config_filename, "w") as file:
        final_config_str = config_template.format_map(config_values)
        file.write(final_config_str)
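
For illustration only, with hypothetical flag values (triton_model_name='bert_squad', triton_max_batch_size=8, max_seq_length=384, triton_dyn_batching_delay=1.0, triton_engine_count=1, and a single visible GPU), the generated config.pbtxt would render roughly as:

name: "bert_squad"
platform: "tensorflow_savedmodel"
max_batch_size: 8
input [
    {
        name: "input_mask"
        data_type: TYPE_INT32
        dims: 384
    },
    {
        name: "input_type_ids"
        data_type: TYPE_INT32
        dims: 384
    },
    {
        name: "input_word_ids"
        data_type: TYPE_INT32
        dims: 384
    }
]
output [
    {
        name: "end_positions"
        data_type: TYPE_FP32
        dims: 384
    },
    {
        name: "start_positions"
        data_type: TYPE_FP32
        dims: 384
    }
]

dynamic_batching {
    preferred_batch_size: [4, 8]
    max_queue_delay_microseconds: 1000
}
instance_group [
    {
        count: 1
        kind: KIND_GPU
        gpus: [0]
    }
]
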
Example No. 6
def predict_squad_customized(strategy, input_meta_data, bert_config,
                             predict_tfrecord_path, num_steps):
    """Make predictions using a Bert-based squad model."""
    predict_dataset_fn = get_dataset_fn(predict_tfrecord_path,
                                        input_meta_data['max_seq_length'],
                                        FLAGS.predict_batch_size,
                                        is_training=False,
                                        use_horovod=False)
    if strategy:
        predict_iterator = iter(
            strategy.experimental_distribute_datasets_from_function(
                predict_dataset_fn))
    else:
        predict_iterator = iter(predict_dataset_fn())

    with distribution_utils.get_strategy_scope(strategy):
        squad_model, _ = bert_models.squad_model(
            bert_config,
            input_meta_data['max_seq_length'],
            float_type=tf.float16 if FLAGS.use_fp16 else tf.float32)

    if FLAGS.init_checkpoint:
        checkpoint = tf.train.Checkpoint(model=squad_model)
        checkpoint.restore(FLAGS.init_checkpoint).expect_partial()

    checkpoint_path = tf.train.latest_checkpoint(FLAGS.model_dir)
    logging.info('Restoring checkpoints from %s', checkpoint_path)
    checkpoint = tf.train.Checkpoint(model=squad_model)
    checkpoint.restore(checkpoint_path).expect_partial()

    @tf.function
    def predict_step(iterator):
        """Predicts on distributed devices."""
        def _replicated_step(inputs):
            """Replicated prediction calculation."""
            x, _ = inputs
            unique_ids = x.pop('unique_ids')
            if FLAGS.benchmark:
                # In benchmark mode, repurpose the unique_ids field to carry the
                # step start timestamp so per-step latency can be measured on the host.
                t0 = tf.timestamp()
                unique_ids = t0
            start_logits, end_logits = squad_model(x, training=False)
            return dict(unique_ids=unique_ids,
                        start_logits=start_logits,
                        end_logits=end_logits)

        def tuple_fun(x):
            return (x, )

        if strategy:
            outputs = strategy.experimental_run_v2(_replicated_step,
                                                   args=(next(iterator), ))
            map_func = strategy.experimental_local_results
        else:
            outputs = _replicated_step(next(iterator))
            map_func = tuple_fun
        return tf.nest.map_structure(map_func, outputs)

    all_results = []
    time_list = []
    eval_start_time = time.time()
    elapsed_secs = 0

    for step_idx in range(num_steps):
        predictions = predict_step(predict_iterator)
        if FLAGS.benchmark:
            # Transfer a tensor to the CPU to force synchronization before timing.
            t0 = predictions['unique_ids'][0]
            start_logits = predictions['start_logits'][0]
            start_logits.numpy()
            elapsed_secs = time.time() - t0.numpy()
            # Skip the first 4 (warm-up) iterations in the performance statistics.
            if step_idx > 3:
                time_list.append(elapsed_secs)
            continue

        for result in get_raw_results(predictions):
            all_results.append(result)

        if len(all_results) % 100 == 0:
            logging.info('Made predictions for %d records.', len(all_results))

    eval_time_elapsed = time.time() - eval_start_time
    logging.info("-----------------------------")
    logging.info("Summary Inference Statistics")
    logging.info("Batch size = %d", FLAGS.predict_batch_size)
    logging.info("Sequence Length = %d", input_meta_data['max_seq_length'])
    logging.info("Precision = %s", "fp16" if FLAGS.use_fp16 else "fp32")
    logging.info("Total Inference Time = %0.2f for Sentences = %d",
                 eval_time_elapsed, num_steps * FLAGS.predict_batch_size)

    if FLAGS.benchmark:
        eval_time_wo_overhead = sum(time_list)
        time_list.sort()
        num_sentences = (num_steps - 4) * FLAGS.predict_batch_size

        avg = np.mean(time_list)
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

        logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            eval_time_wo_overhead, (num_steps - 4) * FLAGS.predict_batch_size)
        logging.info("Latency Confidence Level 50 (ms) = %0.2f", cf_50 * 1000)
        logging.info("Latency Confidence Level 90 (ms) = %0.2f", cf_90 * 1000)
        logging.info("Latency Confidence Level 95 (ms) = %0.2f", cf_95 * 1000)
        logging.info("Latency Confidence Level 99 (ms) = %0.2f", cf_99 * 1000)
        logging.info("Latency Confidence Level 100 (ms) = %0.2f",
                     cf_100 * 1000)
        logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        logging.info("Throughput Average (sentences/sec) = %0.2f",
                     ss_sentences_per_second)

        dllogging = input_meta_data['dllogging']
        dllogging.logger.log(step=(),
                             data={"throughput_val": ss_sentences_per_second},
                             verbosity=Verbosity.DEFAULT)

    logging.info("-----------------------------")

    return all_results