Example No. 1
def test_torch_latency(
    device,
    model,
    model_name,
    batch_sizes,
    sequence_lengths,
    global_lengths,
    test_times,
    num_threads,
    verbose,
):
    if num_threads > 0:
        torch.set_num_threads(num_threads)

    results = []
    for batch_size in batch_sizes:
        for sequence_length in sequence_lengths:
            for global_length in global_lengths:
                print(
                    f"batch_size={batch_size} sequence_length={sequence_length} global_length={global_length}..."
                )
                inputs: LongformerInputs = LongformerHelper.get_dummy_inputs(
                    batch_size, sequence_length, global_length, device)
                input_list = inputs.to_list()

                # Warm up once before timing.
                _ = model(*input_list)
                runtimes = timeit.repeat(lambda: model(*input_list),
                                         repeat=test_times,
                                         number=1)
                result = {
                    "engine": "torch",  # TODO: test torchscript
                    "version": torch.__version__,
                    "device": "cuda",
                    "optimizer": "",
                    "precision": "fp32",
                    "io_binding": "",
                    "model_name": model_name,
                    "description": model_name + " [torch]",
                    "inputs": 3,
                    "threads": num_threads,
                    "batch_size": batch_size,
                    "sequence_length": sequence_length,
                    "global_length": global_length,
                    "datetime": str(datetime.now()),
                    "memory": "NA",
                    "diff_max": 0,
                    "diff_90_percentile": 0,
                    "diff_95_percentile": 0,
                    "diff_99_percentile": 0,
                    "use_compact_memory": "NA",
                }
                result.update(
                    benchmark_helper.get_latency_result(runtimes, batch_size))

                print(result)
                results.append(result)
    return results
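A minimal sketch of how test_torch_latency might be driven, assuming the surrounding script has already loaded a Longformer model with Hugging Face transformers (model name and parameter values below are illustrative, not taken from the original script):

# Illustrative invocation only; the real benchmark script builds these values from command-line arguments.
import torch
from transformers import LongformerModel

device = torch.device("cuda")
model = LongformerModel.from_pretrained("allenai/longformer-base-4096").eval().to(device)

results = test_torch_latency(
    device=device,
    model=model,
    model_name="longformer-base-4096",
    batch_sizes=[1],
    sequence_lengths=[512, 4096],
    global_lengths=[8],
    test_times=100,
    num_threads=-1,  # <= 0 keeps PyTorch's default thread count
    verbose=False,
)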
Example No. 2
def test_torch(device, model, model_name, batch_sizes, sequence_lengths,
               global_lengths, test_times, num_threads):
    # The following is commented out so that PyTorch also uses its default thread setting:
    # if num_threads <= 0:
    #     import psutil
    #     num_threads = psutil.cpu_count(logical=False)
    if num_threads > 0:
        torch.set_num_threads(num_threads)

    results = []
    for batch_size in batch_sizes:
        for sequence_length in sequence_lengths:  # Total length of <query, document>.
            for global_length in global_lengths:  # Length of <query>: a short query (8) for search keywords, a longer one (16) for question-like queries.
                print(
                    f"batch_size={batch_size} sequence_length={sequence_length} global_length={global_length}..."
                )
                input_ids, attention_mask, global_attention_mask = get_dummy_inputs(
                    sequence_length, global_length, device)

                # Run PyTorch
                _ = model(input_ids,
                          attention_mask=attention_mask,
                          global_attention_mask=global_attention_mask)
                runtimes = timeit.repeat(lambda: model(input_ids,
                                                       attention_mask=attention_mask,
                                                       global_attention_mask=global_attention_mask),
                                         repeat=test_times,
                                         number=1)
                result = {
                    "engine": "torch",  #TODO: test torchscript
                    "version": torch.__version__,
                    "device": "cuda",
                    "optimizer": "",
                    "precision": "fp32",
                    "io_binding": "",
                    "model_name": model_name,
                    "inputs": 3,
                    "threads": num_threads,
                    "batch_size": batch_size,
                    "sequence_length": sequence_length,
                    "global_length": global_length,
                    "datetime": str(datetime.now()),
                }
                result.update(
                    benchmark_helper.get_latency_result(runtimes, batch_size))

                print(result)
                results.append(result)
    return results
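This snippet relies on a module-level get_dummy_inputs helper that is not shown above. The original implementation may differ, but a minimal sketch that produces batch-size-1 Longformer inputs consistent with how the function is called here could look like this:

import torch

def get_dummy_inputs(sequence_length, global_length, device):
    # Hypothetical stand-in for the helper used above, not the original code.
    batch_size = 1
    # Arbitrary token ids are fine for latency measurement.
    input_ids = torch.randint(low=0, high=100, size=(batch_size, sequence_length),
                              dtype=torch.long, device=device)
    # Attend to every token.
    attention_mask = torch.ones(batch_size, sequence_length, dtype=torch.long, device=device)
    # Mark the first `global_length` tokens (the query) for global attention.
    global_attention_mask = torch.zeros(batch_size, sequence_length, dtype=torch.long, device=device)
    global_attention_mask[:, :global_length] = 1
    return input_ids, attention_mask, global_attention_mask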
def test_torch_performance(args, model, input_ids, attention_mask, eos_token_id, pad_token_id, bad_words_ids):
    if args.use_gpu and not torch.cuda.is_available():
        logger.error("Please install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.")
        return None

    if args.precision == Precision.FLOAT16:
        model.half()

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.to(device)

    torch.set_grad_enabled(False)
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    torch_latency = []
    for _ in range(args.total_runs):
        start = time.time()
        _ = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=args.max_length,
            min_length=args.min_length,
            num_beams=args.num_beams,
            early_stopping=args.early_stopping,
            no_repeat_ngram_size=args.no_repeat_ngram_size,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            num_return_sequences=args.num_return_sequences,
            temperature=args.temperature,
            length_penalty=args.length_penalty,
            repetition_penalty=args.repetition_penalty,
            bad_words_ids=bad_words_ids,
            return_dict_in_generate=True,
            output_scores=args.output_sequences_scores or args.output_token_scores,
        )
        torch_latency.append(time.time() - start)
    batch_size = input_ids.shape[0]
    from benchmark_helper import get_latency_result

    return get_latency_result(torch_latency, batch_size)
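All of the snippets on this page fold their raw timings into get_latency_result from benchmark_helper, which is not reproduced here. Its exact fields may differ, but the kind of summary it produces is roughly the following sketch (illustrative only, not the original implementation):

import statistics

def get_latency_result(runtimes, batch_size):
    # Rough illustration of a latency summary; not the original benchmark_helper code.
    latency_ms = sorted(t * 1000.0 for t in runtimes)
    average_ms = statistics.mean(latency_ms)
    p90_ms = latency_ms[min(len(latency_ms) - 1, int(len(latency_ms) * 0.9))]
    return {
        "test_times": len(latency_ms),
        "average_latency_ms": round(average_ms, 2),
        "latency_90_percentile_ms": round(p90_ms, 2),
        "QPS": round(1000.0 * batch_size / average_ms, 2),  # samples processed per second
    }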
Example No. 4
def run_tensorflow(use_gpu, model_names, model_class, precision, num_threads,
                   batch_sizes, sequence_lengths, repeat_times, cache_dir,
                   verbose):
    results = []

    import tensorflow as tf
    tf.config.threading.set_intra_op_parallelism_threads(num_threads)

    if not use_gpu:
        tf.config.set_visible_devices([], 'GPU')

    if use_gpu and not tf.test.is_built_with_cuda():
        logger.error(
            "Please install Tensorflow-gpu, and use a machine with GPU for testing gpu performance."
        )
        return results

    if use_gpu:  # Restrict TensorFlow to only use the first GPU
        physical_devices = tf.config.list_physical_devices('GPU')
        try:
            tf.config.set_visible_devices(physical_devices[0], 'GPU')
        except RuntimeError as e:
            logger.exception(e)

    if precision == Precision.FLOAT16 or precision == Precision.INT8:
        raise NotImplementedError(
            "Mixed precision is currently not supported.")

    for model_name in model_names:
        config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)

        model = load_pretrained_model(model_name,
                                      config=config,
                                      cache_dir=cache_dir,
                                      custom_model_class=model_class,
                                      is_tf_model=True)

        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  cache_dir=cache_dir)

        max_input_size = tokenizer.max_model_input_sizes.get(model_name, 1024)

        for batch_size in batch_sizes:
            if batch_size <= 0:
                continue

            for sequence_length in sequence_lengths:
                if max_input_size is not None and sequence_length > max_input_size:
                    continue

                logger.info("Run Tensorflow on {} with input shape {}".format(
                    model_name, [batch_size, sequence_length]))

                import random
                rng = random.Random()
                values = [
                    rng.randint(0, config.vocab_size - 1)
                    for i in range(batch_size * sequence_length)
                ]
                input_ids = tf.constant(values,
                                        shape=(batch_size, sequence_length),
                                        dtype=tf.int32)

                try:

                    def encoder_forward():
                        return model(input_ids, training=False)

                    def encoder_decoder_forward():
                        return model(input_ids,
                                     decoder_input_ids=input_ids,
                                     training=False)

                    inference = encoder_decoder_forward if config.is_encoder_decoder else encoder_forward

                    inference()

                    runtimes = timeit.repeat(lambda: inference(),
                                             repeat=repeat_times,
                                             number=1)

                    result = {
                        "engine": "tensorflow",
                        "version": tf.__version__,
                        "device": "cuda" if use_gpu else "cpu",
                        "optimizer": "",
                        "precision": precision,
                        "io_binding": "",
                        "model_name": model_name,
                        "inputs": 1,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "datetime": str(datetime.now()),
                    }
                    result.update(get_latency_result(runtimes, batch_size))
                    logger.info(result)
                    results.append(result)
                except RuntimeError as e:
                    logger.exception(e)
                    # Reset the GPU to release its memory after a failure (e.g. out of memory).
                    from numba import cuda
                    device = cuda.get_current_device()
                    device.reset()

    return results
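As with the PyTorch variants, run_tensorflow is meant to be called from a benchmark driver. A hedged sketch of a CPU-only invocation, assuming Precision comes from the same benchmark_helper module used above (all values are illustrative):

# Illustrative call; the real driver derives these values from command-line arguments.
results = run_tensorflow(
    use_gpu=False,
    model_names=["bert-base-uncased"],
    model_class=None,             # assumed default; check what the original script expects here
    precision=Precision.FLOAT32,  # FLOAT16/INT8 would raise NotImplementedError above
    num_threads=4,
    batch_sizes=[1, 4],
    sequence_lengths=[32, 128],
    repeat_times=100,
    cache_dir="./cache_models",
    verbose=False,
)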
Example No. 5
def run_pytorch(use_gpu, model_names, model_class, precision, num_threads,
                batch_sizes, sequence_lengths, repeat_times, torchscript,
                cache_dir, verbose):
    results = []
    if use_gpu and not torch.cuda.is_available():
        logger.error(
            "Please install PyTorch with Cuda, and use a machine with GPU for testing gpu performance."
        )
        return results

    torch.set_grad_enabled(False)

    for model_name in model_names:
        config = AutoConfig.from_pretrained(model_name,
                                            torchscript=torchscript,
                                            cache_dir=cache_dir)
        model = load_pretrained_model(model_name,
                                      config=config,
                                      cache_dir=cache_dir,
                                      custom_model_class=model_class)
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  cache_dir=cache_dir)

        max_input_size = tokenizer.max_model_input_sizes.get(model_name, 1024)

        logger.debug(f"Model {model}")
        logger.debug(f"Number of parameters {model.num_parameters()}")

        if precision == Precision.FLOAT16:
            model.half()

        device = torch.device("cuda:0" if use_gpu else "cpu")
        model.to(device)

        if precision == Precision.INT8:
            model = QuantizeHelper.quantize_torch_model(model)

        for batch_size in batch_sizes:
            if batch_size <= 0:
                continue

            for sequence_length in sequence_lengths:
                if max_input_size is not None and sequence_length > max_input_size:
                    continue

                logger.info("Run PyTorch on {} with input shape {}".format(
                    model_name, [batch_size, sequence_length]))
                input_ids = torch.randint(low=0,
                                          high=config.vocab_size - 1,
                                          size=(batch_size, sequence_length),
                                          dtype=torch.long,
                                          device=device)
                try:
                    inference = torch.jit.trace(
                        model, input_ids) if torchscript else model
                    inference(input_ids)

                    runtimes = timeit.repeat(lambda: inference(input_ids),
                                             repeat=repeat_times,
                                             number=1)

                    result = {
                        "engine": "torchscript" if torchscript else "torch",
                        "version": torch.__version__,
                        "device": "cuda" if use_gpu else "cpu",
                        "optimizer": "",
                        "precision": precision,
                        "io_binding": "",
                        "model_name": model_name,
                        "inputs": 1,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "datetime": str(datetime.now()),
                    }
                    result.update(get_latency_result(runtimes, batch_size))
                    logger.info(result)
                    results.append(result)
                except RuntimeError as e:
                    logger.exception(e)
                    # Release cached GPU memory after a failure (e.g. out of memory).
                    torch.cuda.empty_cache()

    return results
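Each run_* function returns a list of flat dictionaries sharing the same keys, so results from several configurations can be merged into one report. A minimal sketch of collecting eager and TorchScript results and writing them to a CSV file (file name and parameter values are illustrative):

import csv

all_results = []
for torchscript in (False, True):
    all_results += run_pytorch(
        use_gpu=False, model_names=["bert-base-uncased"], model_class=None,
        precision=Precision.FLOAT32, num_threads=4, batch_sizes=[1],
        sequence_lengths=[128], repeat_times=100, torchscript=torchscript,
        cache_dir="./cache_models", verbose=False,
    )

if all_results:
    with open("benchmark_result.csv", "w", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(all_results[0].keys()))
        writer.writeheader()
        writer.writerows(all_results)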
def test_model(args, use_vocab_mask: bool = False, sentences: List[str] = None):
    if args.model_type != "gpt2":
        print(
            f"Skipping parity test since the support for model type {args.model_type} is not implemented in OnnxRuntime"
        )
        return True

    if args.temperature != 1.0:
        # TODO: implement temperature in BeamSearch operator.
        print("Skipping parity test as temperature is not implemented in BeamSearch operator")
        return True

    if args.prefix_vocab_mask:
        print("Skipping parity test as prefix vocab mask is not implemented by Hugging Face")
        return True

    from transformers import GPT2LMHeadModel, GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    tokenizer.padding_side = "left"
    tokenizer.pad_token = tokenizer.eos_token

    model = GPT2LMHeadModel.from_pretrained(
        args.model_name_or_path,
        cache_dir=args.cache_dir,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Use different length sentences to test batching
    if sentences is None:
        sentences = [
            "The product is released",
            "I enjoy walking in the park",
            "Test best way to invest",
        ]

    inputs = tokenizer(sentences, return_tensors="pt", padding=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    bad_words = "walk in park"
    bad_words_ids = tokenizer.encode(bad_words, add_prefix_space=True)
    bad_words_ids = [[word_id] for word_id in bad_words_ids]  # Convert to a list of lists
    if use_vocab_mask:
        print("bad_words_ids", bad_words_ids)
    else:
        bad_words_ids = None

    global config
    config = model.config
    eos_token_id = config.eos_token_id
    pad_token_id = config.eos_token_id
    vocab_size = config.vocab_size

    torch_decoded_sequences = []
    if not args.disable_parity:
        print("-" * 50)
        print("Test PyTorch model and beam search with huggingface transformers...")
        beam_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=args.max_length,
            min_length=args.min_length,
            num_beams=args.num_beams,
            early_stopping=args.early_stopping,
            no_repeat_ngram_size=args.no_repeat_ngram_size,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            num_return_sequences=args.num_return_sequences,
            temperature=args.temperature,
            length_penalty=args.length_penalty,
            repetition_penalty=args.repetition_penalty,
            bad_words_ids=bad_words_ids,
            return_dict_in_generate=True,
            output_scores=args.output_sequences_scores or args.output_token_scores,
        )
        print("input_ids", input_ids)
        print("huggingface transformers outputs:")
        print("sequences", beam_outputs.sequences)
        if args.output_sequences_scores:
            print("sequences_scores", beam_outputs.sequences_scores)
        if args.output_token_scores:
            print("scores", beam_outputs.scores)
        for i, sequence in enumerate(beam_outputs.sequences):
            decoded_sequence = tokenizer.decode(sequence, skip_special_tokens=True)
            torch_decoded_sequences.append(decoded_sequence)
            print("{}: {}".format(i, decoded_sequence))

    print("-" * 50)
    print("Test ONNX model and bream search with onnxruntime...")

    ort_session = create_ort_session(args.output, args.use_gpu)

    vocab_mask = np.ones((vocab_size), dtype=np.int32)
    if use_vocab_mask:
        for bad_word_id in bad_words_ids:
            vocab_mask[bad_word_id] = 0

    inputs = {
        "input_ids": input_ids.cpu().numpy().astype(np.int32),
        "max_length": np.array([args.max_length], dtype=np.int32),
        "min_length": np.array([args.min_length], dtype=np.int32),
        "num_beams": np.array([args.num_beams], dtype=np.int32),
        "num_return_sequences": np.array([args.num_return_sequences], dtype=np.int32),
        "temperature": np.array([args.temperature], dtype=np.float32),
        "length_penalty": np.array([args.length_penalty], dtype=np.float32),
        "repetition_penalty": np.array([args.repetition_penalty], dtype=np.float32),
        "vocab_mask": vocab_mask,
    }

    test_data_dir = Path(args.output).parent.as_posix()
    print("test_data_dir", test_data_dir)
    from bert_test_data import output_test_data

    all_inputs = [inputs]
    for i, inputs in enumerate(all_inputs):
        data_dir = os.path.join(test_data_dir, "test_data_set_" + str(i))
        output_test_data(data_dir, inputs)

    print("inputs", inputs)

    # Test performance
    latency = []
    for _ in range(args.total_runs):
        start = time.time()
        result = ort_session.run(None, inputs)
        latency.append(time.time() - start)
    batch_size = input_ids.shape[0]
    from benchmark_helper import get_latency_result

    output = get_latency_result(latency, batch_size)

    print("ORT outputs:")
    sequences = result[0]
    print("sequences", sequences)
    if args.output_sequences_scores:
        print("sequences_scores", result[1])
    if args.output_token_scores:
        print("scores", result[2])

    (batch_size, num_sequences, max_length) = sequences.shape
    ort_decoded_sequences = []
    for i in range(batch_size):
        for j in range(num_sequences):
            decoded_sequence = tokenizer.decode(sequences[i][j], skip_special_tokens=True)
            ort_decoded_sequences.append(decoded_sequence)
            print(f"batch {i} sequence {j}: {decoded_sequence}")

    if not args.disable_parity:
        torch_sequences = beam_outputs.sequences.reshape(batch_size, args.num_return_sequences, -1)
        ort_sequences = torch.LongTensor(sequences)
        print("-" * 50)
        print("Torch Sequences:")
        print(torch_sequences)
        print(torch_decoded_sequences)
        print("-" * 50)
        print("ORT Sequences:")
        print(ort_sequences)
        print(ort_decoded_sequences)
        print("-" * 50)
        # Compare the generated text instead of word IDs, since ORT pads to the max sequence length but Torch does not.
        is_same = torch_decoded_sequences == ort_decoded_sequences
        print("Torch and ORT result is ", "same" if is_same else "different")
        output["parity"] = is_same

    if args.torch_performance:
        torch_latency_output = test_torch_performance(
            args,
            model,
            input_ids,
            attention_mask,
            eos_token_id,
            pad_token_id,
            bad_words_ids,
        )
        print("Torch Latency", torch_latency_output)

    print("ORT", output)
    return output
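test_model pulls every setting from an argparse-style args object. A minimal sketch of driving it directly with a SimpleNamespace that sets only the attributes the function actually reads (the real script builds args with argparse; the values and the ONNX path below are illustrative):

from types import SimpleNamespace

args = SimpleNamespace(
    model_type="gpt2",
    model_name_or_path="gpt2",
    cache_dir="./cache_models",
    output="./onnx_models/gpt2_beam_search.onnx",  # hypothetical path to the exported ONNX model
    use_gpu=False,
    total_runs=1,
    max_length=50,
    min_length=1,
    num_beams=4,
    num_return_sequences=1,
    early_stopping=False,
    no_repeat_ngram_size=3,
    temperature=1.0,
    length_penalty=1.0,
    repetition_penalty=1.0,
    prefix_vocab_mask=False,
    output_sequences_scores=False,
    output_token_scores=False,
    disable_parity=False,
    torch_performance=False,
)

output = test_model(args, use_vocab_mask=True)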