Example #1
import random
import timeit
from datetime import datetime

from transformers import AutoConfig, AutoTokenizer

# logger, Precision, load_pretrained_model and get_latency_result are assumed
# to be defined in the surrounding benchmark module.


def run_tensorflow(use_gpu, model_names, model_class, precision, num_threads,
                   batch_sizes, sequence_lengths, repeat_times, cache_dir,
                   verbose):
    results = []

    # Imported lazily so TensorFlow is only required when this engine runs.
    import tensorflow as tf
    tf.config.threading.set_intra_op_parallelism_threads(num_threads)

    if not use_gpu:
        tf.config.set_visible_devices([], 'GPU')

    if use_gpu and not tf.test.is_built_with_cuda():
        logger.error(
            "Please install TensorFlow built with CUDA, and use a machine with a GPU to test GPU performance."
        )
        return results

    if use_gpu:  # Restrict TensorFlow to only use the first GPU
        physical_devices = tf.config.list_physical_devices('GPU')
        try:
            tf.config.set_visible_devices(physical_devices[0], 'GPU')
        except RuntimeError as e:
            logger.exception(e)

    if precision in (Precision.FLOAT16, Precision.INT8):
        raise NotImplementedError(
            "FLOAT16 and INT8 precision are currently not supported for the TensorFlow benchmark.")

    for model_name in model_names:
        config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)

        model = load_pretrained_model(model_name,
                                      config=config,
                                      cache_dir=cache_dir,
                                      custom_model_class=model_class,
                                      is_tf_model=True)

        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  cache_dir=cache_dir)

        max_input_size = tokenizer.max_model_input_sizes.get(model_name, 1024)

        for batch_size in batch_sizes:
            if batch_size <= 0:
                continue

            for sequence_length in sequence_lengths:
                if max_input_size is not None and sequence_length > max_input_size:
                    continue

                logger.info("Run Tensorflow on {} with input shape {}".format(
                    model_name, [batch_size, sequence_length]))

                # Fill the batch with random token ids drawn from the model vocabulary.
                rng = random.Random()
                values = [
                    rng.randint(0, config.vocab_size - 1)
                    for _ in range(batch_size * sequence_length)
                ]
                input_ids = tf.constant(values,
                                        shape=(batch_size, sequence_length),
                                        dtype=tf.int32)

                try:

                    def encoder_forward():
                        return model(input_ids, training=False)

                    def encoder_decoder_forward():
                        return model(input_ids,
                                     decoder_input_ids=input_ids,
                                     training=False)

                    inference = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward

                    # Warm-up run so one-time initialization is not measured.
                    inference()

                    runtimes = timeit.repeat(inference,
                                             repeat=repeat_times,
                                             number=1)

                    result = {
                        "engine": "tensorflow",
                        "version": tf.__version__,
                        "device": "cuda" if use_gpu else "cpu",
                        "optimizer": "",
                        "precision": precision,
                        "io_binding": "",
                        "model_name": model_name,
                        "inputs": 1,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "datetime": str(datetime.now()),
                    }
                    result.update(get_latency_result(runtimes, batch_size))
                    logger.info(result)
                    results.append(result)
                except RuntimeError as e:
                    logger.exception(e)
                    # Reset the GPU to reclaim memory after a failure such as OOM.
                    from numba import cuda
                    device = cuda.get_current_device()
                    device.reset()

    return results
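
A minimal invocation sketch for the function above. The argument values, and the existence of a Precision.FLOAT32 member on the Precision enum, are illustrative assumptions, not values taken from the original script:

# Hypothetical call; Precision.FLOAT32 and all argument values are assumed.
results = run_tensorflow(use_gpu=False,
                         model_names=["bert-base-uncased"],
                         model_class=None,
                         precision=Precision.FLOAT32,
                         num_threads=4,
                         batch_sizes=[1, 8],
                         sequence_lengths=[32, 128],
                         repeat_times=10,
                         cache_dir="./cache_models",
                         verbose=False)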
Example #2
import timeit
from datetime import datetime

import torch
from transformers import AutoConfig, AutoTokenizer

# logger, Precision, load_pretrained_model, get_latency_result and
# QuantizeHelper are assumed to be defined in the surrounding benchmark module.


def run_pytorch(use_gpu, model_names, model_class, precision, num_threads,
                batch_sizes, sequence_lengths, repeat_times, torchscript,
                cache_dir, verbose):
    results = []
    if use_gpu and not torch.cuda.is_available():
        logger.error(
            "Please install PyTorch with CUDA support, and use a machine with a GPU to test GPU performance."
        )
        return results

    torch.set_grad_enabled(False)
    # Apply the thread count for CPU inference (assumed fix: num_threads is
    # reported in the result dict below, so it should also take effect here).
    torch.set_num_threads(num_threads)

    for model_name in model_names:
        config = AutoConfig.from_pretrained(model_name,
                                            torchscript=torchscript,
                                            cache_dir=cache_dir)
        model = load_pretrained_model(model_name,
                                      config=config,
                                      cache_dir=cache_dir,
                                      custom_model_class=model_class)
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  cache_dir=cache_dir)

        max_input_size = tokenizer.max_model_input_sizes.get(model_name, 1024)

        logger.debug(f"Model {model}")
        logger.debug(f"Number of parameters {model.num_parameters()}")

        if precision == Precision.FLOAT16:
            model.half()

        device = torch.device("cuda:0" if use_gpu else "cpu")
        model.to(device)

        if precision == Precision.INT8:
            model = QuantizeHelper.quantize_torch_model(model)

        for batch_size in batch_sizes:
            if batch_size <= 0:
                continue

            for sequence_length in sequence_lengths:
                if max_input_size is not None and sequence_length > max_input_size:
                    continue

                logger.info("Run PyTorch on {} with input shape {}".format(
                    model_name, [batch_size, sequence_length]))
                # torch.randint's upper bound is exclusive, so pass vocab_size
                # to sample from the full vocabulary.
                input_ids = torch.randint(low=0,
                                          high=config.vocab_size,
                                          size=(batch_size, sequence_length),
                                          dtype=torch.long,
                                          device=device)
                try:
                    # Optionally compile the model with TorchScript tracing,
                    # then run once to warm up before timing.
                    inference = torch.jit.trace(
                        model, input_ids) if torchscript else model
                    inference(input_ids)

                    runtimes = timeit.repeat(lambda: inference(input_ids),
                                             repeat=repeat_times,
                                             number=1)

                    result = {
                        "engine": "torchscript" if torchscript else "torch",
                        "version": torch.__version__,
                        "device": "cuda" if use_gpu else "cpu",
                        "optimizer": "",
                        "precision": precision,
                        "io_binding": "",
                        "model_name": model_name,
                        "inputs": 1,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "datetime": str(datetime.now()),
                    }
                    result.update(get_latency_result(runtimes, batch_size))
                    logger.info(result)
                    results.append(result)
                except RuntimeError as e:
                    logger.exception(e)
                    # Release cached GPU memory after a failure such as OOM.
                    torch.cuda.empty_cache()

    return results
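
The PyTorch runner can be invoked the same way; again the concrete values and Precision.FLOAT32 are illustrative assumptions:

# Hypothetical call; mirrors the TensorFlow example above.
results = run_pytorch(use_gpu=torch.cuda.is_available(),
                      model_names=["bert-base-uncased"],
                      model_class=None,
                      precision=Precision.FLOAT32,
                      num_threads=4,
                      batch_sizes=[1, 8],
                      sequence_lengths=[32, 128],
                      repeat_times=10,
                      torchscript=False,
                      cache_dir="./cache_models",
                      verbose=False)

Both runners delegate latency aggregation to get_latency_result(runtimes, batch_size), whose implementation is not shown here. The sketch below only illustrates the kind of summary such a helper plausibly produces; the field names are invented for illustration and may differ from the real helper:

import statistics

def get_latency_result(runtimes, batch_size):
    # Sketch only: convert raw timeit samples (seconds per run) into summary
    # latency and throughput fields merged into each result dict above.
    latency_ms = statistics.mean(runtimes) * 1000.0
    return {
        "average_latency_ms": "{:.2f}".format(latency_ms),
        "QPS": "{:.2f}".format(batch_size * 1000.0 / latency_ms),
    }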