Example #1
def inference(model_path, dummy_inputs, outputs_path, use_gpu):
    environ_reset()
    environ_setting_nodes()
    environ_setting_paths(outputs_path)
    session = create_onnxruntime_session(model_path,
                                         use_gpu,
                                         enable_all_optimization=False)
    Gpt2Helper.onnxruntime_inference(session, dummy_inputs)
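# A minimal usage sketch (not part of the original snippet): the model and output paths
# below are hypothetical, and torch, Gpt2Helper, create_onnxruntime_session and the
# environ_* helpers are assumed to be imported/defined as in the surrounding module.
if __name__ == "__main__":
    # GPT-2 small: 12 heads, hidden size 768, 12 layers, vocab size 50257.
    dummy_inputs = Gpt2Helper.get_dummy_inputs(1, 8, 8, 12, 768, 12, 50257,
                                               device=torch.device("cpu"))
    inference("gpt2_fp32.onnx", dummy_inputs, "node_dumps", use_gpu=False)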
Example #2
def run_parity(task: ParityTask, args):
    onnx_model_paths = Gpt2Helper.get_onnx_paths(
        'onnx_models',
        args.model_name_or_path,
        new_folder=args.use_external_data_format,
        remove_existing=[])

    fp32_baseline, fp16_baseline = get_baselines(args)

    task.run(fp32_baseline, "FP32 baseline")

    # The following FP16 tests require a GPU
    if not args.use_gpu:
        logger.info("skip mixed precision since --use_gpu is not specified")
        return

    task.run(fp16_baseline, "FP16 baseline")

    last_matmul_node_name = get_last_matmul_node_name(onnx_model_paths["raw"])

    # Mixed precision baseline
    run_candidate(task, args, last_matmul_node_name, op_block_list=[])

    # Result from tuning step 1
    run_candidate(task, args, last_matmul_node_name, op_block_list=["Add"])

    if args.all:
        run_tuning_step0(task, fp16_baseline)
        mixed_precision_baseline = get_mixed_precision_parameters(
            args, last_matmul_node_name, op_block_list=[])
        run_tuning_step1(task, mixed_precision_baseline)
        run_tuning_step2(task, mixed_precision_baseline)
    else:
        run_candidate(task,
                      args,
                      last_matmul_node_name,
                      op_block_list=["LayerNormalization", "Add"])
        run_candidate(task,
                      args,
                      last_matmul_node_name,
                      op_block_list=["FastGelu", "Add"])

    # Run a few good candidates
    run_candidate(task,
                  args,
                  last_matmul_node_name,
                  op_block_list=["FastGelu", "LayerNormalization", "Add"])
    run_candidate(
        task,
        args,
        last_matmul_node_name,
        op_block_list=["FastGelu", "LayerNormalization", "Add", "Gather"])
    run_candidate(
        task,
        args,
        last_matmul_node_name,
        op_block_list=["FastGelu", "LayerNormalization", "Add", "Gather", "MatMul"])
Example #3
    def get_dummy_inputs(
            batch_size: int,
            past_sequence_length: int,
            sequence_length: int,
            num_attention_heads: int,
            hidden_size: int,
            num_layer: int,
            vocab_size: int,
            device: torch.device,
            float16: bool = False,
            has_position_ids: bool = True,
            has_attention_mask: bool = True) -> Gpt2BeamSearchInputs:
        """Create random inputs for GPT2 model.
        Returns torch tensors of input_ids, position_ids, attention_mask and a list of past state tensors.
        """
        gpt2_dummy_inputs = Gpt2Helper.get_dummy_inputs(
            batch_size, past_sequence_length, sequence_length,
            num_attention_heads, hidden_size, num_layer, vocab_size, device,
            float16, has_position_ids, has_attention_mask)
        float_type = torch.float16 if float16 else torch.float32

        beam_select_idx = torch.zeros([1, batch_size], device=device).long()
        input_log_probs = torch.zeros([batch_size, 1],
                                      dtype=float_type,
                                      device=device)
        input_unfinished_sents = torch.ones([batch_size, 1],
                                            dtype=torch.bool,
                                            device=device)
        if has_position_ids:
            prev_step_results = torch.randint(
                low=0,
                high=vocab_size - 1,
                size=(batch_size, sequence_length),
                dtype=torch.int64,
                device=device,
            )
        else:
            prev_step_results = None

        prev_step_scores = torch.zeros([batch_size, 1],
                                       dtype=float_type,
                                       device=device)

        return Gpt2BeamSearchInputs(
            gpt2_dummy_inputs.input_ids,
            gpt2_dummy_inputs.past,
            gpt2_dummy_inputs.position_ids,
            gpt2_dummy_inputs.attention_mask,
            beam_select_idx,
            input_log_probs,
            input_unfinished_sents,
            prev_step_results,
            prev_step_scores,
        )
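    # Hypothetical usage sketch (not part of the original snippet). The enclosing
    # class name is assumed to be Gpt2BeamSearchHelper; it is not shown in this snippet.
    #
    #   inputs = Gpt2BeamSearchHelper.get_dummy_inputs(
    #       batch_size=1, past_sequence_length=8, sequence_length=1,
    #       num_attention_heads=12, hidden_size=768, num_layer=12,
    #       vocab_size=50257, device=torch.device("cpu"))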
Example #4
def run_parity_disable_half2(task: ParityTask, args):
    onnx_model_paths = Gpt2Helper.get_onnx_paths(
        'onnx_models',
        args.model_name_or_path,
        new_folder=args.use_external_data_format,
        remove_existing=[])
    last_matmul_node_name = get_last_matmul_node_name(onnx_model_paths["raw"])
    run_candidate(task, args, last_matmul_node_name, op_block_list=[])
    run_candidate(task, args, last_matmul_node_name, op_block_list=["Add"])
    run_candidate(task,
                  args,
                  last_matmul_node_name,
                  op_block_list=["LayerNormalization", "Add"])
Example #5
    def prepare_io_binding(
        ort_session,
        input_ids,
        position_ids,
        attention_mask,
        past,
        output_buffers,
        output_shapes,
        beam_select_idx=None,
        input_log_probs=None,
        input_unfinished_sents=None,
        prev_step_results=None,
        prev_step_scores=None,
    ):
        """Returnas IO binding object for a session."""

        # Bind (input_ids, position_ids, attention_mask and past_*) and all outputs
        io_binding = Gpt2Helper.prepare_io_binding(
            ort_session,
            input_ids,
            position_ids,
            attention_mask,
            past=past,
            output_buffers=output_buffers,
            output_shapes=output_shapes,
        )

        # Bind the remaining inputs
        other_inputs = {
            "beam_select_idx": beam_select_idx,
            "input_log_probs": input_log_probs,
            "input_unfinished_sents": input_unfinished_sents,
            "prev_step_results": prev_step_results,
            "prev_step_scores": prev_step_scores,
        }
        name_to_np_type = TypeHelper.get_io_numpy_type_map(ort_session)
        for name, tensor in other_inputs.items():
            if tensor is not None:
                assert tensor.is_contiguous()
                io_binding.bind_input(
                    name,
                    tensor.device.type,
                    0,
                    name_to_np_type[name],
                    list(tensor.size()),
                    tensor.data_ptr(),
                )

        return io_binding
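    # Hypothetical usage sketch (not part of the original snippet): bind the beam
    # search inputs created by get_dummy_inputs above and run the session with the
    # binding. The enclosing class name (Gpt2BeamSearchHelper) and the attribute
    # names on Gpt2BeamSearchInputs are assumptions; output_buffers/output_shapes
    # come from Gpt2Helper.get_output_buffers/get_output_shapes.
    #
    #   io_binding = Gpt2BeamSearchHelper.prepare_io_binding(
    #       session, inputs.input_ids, inputs.position_ids, inputs.attention_mask,
    #       inputs.past, output_buffers, output_shapes,
    #       beam_select_idx=inputs.beam_select_idx,
    #       input_log_probs=inputs.input_log_probs,
    #       input_unfinished_sents=inputs.input_unfinished_sents)
    #   session.run_with_iobinding(io_binding)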
Example #6
def main(args):
    logger.info(f"Arguments:{args}")
    if args.precision == Precision.FLOAT16:
        assert args.optimize_onnx and args.use_gpu, "fp16 requires --optimize_onnx --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    torch.set_num_threads(
        psutil.cpu_count(
            logical=True) if args.thread_num <= 0 else args.thread_num)
    print(torch.__config__.parallel_info())

    cache_dir = args.cache_dir
    output_dir = args.onnx_dir
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    model_class = MODEL_CLASSES[args.model_class][0]

    config = AutoConfig.from_pretrained(args.model_name_or_path,
                                        torchscript=args.torchscript,
                                        cache_dir=cache_dir)
    model = model_class.from_pretrained(args.model_name_or_path,
                                        config=config,
                                        cache_dir=cache_dir)

    # This script does not support float16 for PyTorch.
    #if args.float16:
    #    model.half()

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.to(device)
    use_external_data_format = config.n_layer > 24  # TODO: find a way to check model size > 2GB
    onnx_model_paths = Gpt2Helper.get_onnx_paths(
        output_dir,
        args.model_name_or_path,
        args.model_class,
        has_past=True,
        new_folder=use_external_data_format)

    onnx_model_path = onnx_model_paths["raw"]
    use_padding = MODEL_CLASSES[args.model_class][2]
    Gpt2Helper.export_onnx(model,
                           device,
                           onnx_model_path,
                           args.verbose,
                           use_external_data_format,
                           has_position_ids=use_padding,
                           has_attention_mask=use_padding)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        onnx_model_path = onnx_model_paths[str(
            args.precision) if args.precision != Precision.INT8 else 'fp32']
        Gpt2Helper.optimize_onnx(onnx_model_paths["raw"], onnx_model_path,
                                 args.precision == Precision.FLOAT16,
                                 model.config.num_attention_heads,
                                 model.config.hidden_size,
                                 use_external_data_format)

        if args.precision == Precision.INT8:
            logger.info("quantizing model...")
            QuantizeHelper.quantize_onnx_model(onnx_model_path,
                                               onnx_model_paths["int8"],
                                               use_external_data_format)
            model = QuantizeHelper.quantize_torch_model(model)
            logger.info("finished quantizing model")
            onnx_model_path = onnx_model_paths["int8"]

    if args.torchscript:
        model = Gpt2Helper.torchscript(model,
                                       config,
                                       device,
                                       has_position_ids=use_padding,
                                       has_attention_mask=use_padding)

    session = create_onnxruntime_session(onnx_model_path,
                                         args.use_gpu,
                                         enable_all_optimization=False,
                                         num_threads=args.thread_num,
                                         verbose=args.verbose)
    if session is None:
        return

    # Allocate output buffers for IO Binding
    max_output_shapes = Gpt2Helper.get_output_shapes(
        max(args.batch_sizes), max(args.past_sequence_lengths),
        max(args.sequence_lengths), config, args.model_class)
    output_buffers = Gpt2Helper.get_output_buffers(
        max_output_shapes, device, args.precision == Precision.FLOAT16)

    csv_filename = args.result_csv or "benchmark_result_{}.csv".format(
        datetime.now().strftime("%Y%m%d-%H%M%S"))
    with open(csv_filename, mode="a", newline='') as csv_file:
        column_names = [
            "model_name", "model_class", "gpu", "precision", "optimizer",
            "torchscript", "batch_size", "sequence_length",
            "past_sequence_length", "torch_latency", "onnxruntime_latency",
            "onnxruntime_io_binding_latency"
        ]
        csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        csv_writer.writeheader()

        for batch_size in args.batch_sizes:
            for sequence_length in args.sequence_lengths:
                for past_sequence_length in args.past_sequence_lengths:
                    assert batch_size > 0 and sequence_length > 0 and past_sequence_length >= 0
                    logger.debug(
                        f"Running test for batch_size={batch_size} sequence_length={sequence_length} past_sequence_length={past_sequence_length}..."
                    )
                    dummy_inputs = Gpt2Helper.get_dummy_inputs(
                        batch_size,
                        past_sequence_length,
                        sequence_length,
                        config.num_attention_heads,
                        config.hidden_size,
                        config.n_layer,
                        config.vocab_size,
                        device,
                        float16=(args.precision == Precision.FLOAT16),
                        has_position_ids=use_padding,
                        has_attention_mask=use_padding)
                    output_shapes = Gpt2Helper.get_output_shapes(
                        batch_size, past_sequence_length, sequence_length,
                        config, args.model_class)

                    try:
                        outputs, torch_latency = Gpt2Helper.pytorch_inference(
                            model, dummy_inputs, args.test_times)
                        ort_outputs, ort_latency = Gpt2Helper.onnxruntime_inference(
                            session, dummy_inputs, args.test_times)
                        ort_io_outputs, ort_io_latency = Gpt2Helper.onnxruntime_inference_with_binded_io(
                            session,
                            dummy_inputs,
                            output_buffers,
                            output_shapes,
                            args.test_times,
                            return_numpy=False,
                            include_copy_output_latency=args.include_copy_output_latency)

                        if args.validate_onnx:
                            if Gpt2Helper.compare_outputs(
                                    outputs,
                                    ort_outputs,
                                    rtol=DEFAULT_TOLERANCE[args.precision],
                                    atol=DEFAULT_TOLERANCE[args.precision]):
                                logger.info(
                                    f'Pytorch and ONNX Runtime outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]}).'
                                )

                            # Results of IO binding might be in GPU. Copy outputs to CPU for comparison.
                            copy_outputs = []
                            for output in ort_io_outputs:
                                copy_outputs.append(output.cpu().numpy())

                            if Gpt2Helper.compare_outputs(
                                    outputs,
                                    copy_outputs,
                                    rtol=DEFAULT_TOLERANCE[args.precision],
                                    atol=DEFAULT_TOLERANCE[args.precision]):
                                logger.info(
                                    f'Pytorch and ONNX Runtime IO Binding outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]}).'
                                )

                        logger.info(
                            f"batch_size={batch_size}, sequence_length={sequence_length}, past_sequence_length={past_sequence_length}, torch_latency={torch_latency:.2f}, onnxruntime_latency={ort_latency:.2f}, onnxruntime_io_binding_latency={ort_io_latency:.2f}"
                        )

                        row = {
                            "model_name": args.model_name_or_path,
                            "model_class": args.model_class,
                            "gpu": args.use_gpu,
                            "precision": args.precision,
                            "optimizer": args.optimize_onnx,
                            "torchscript": args.torchscript,
                            "batch_size": batch_size,
                            "sequence_length": sequence_length,
                            "past_sequence_length": past_sequence_length,
                            "torch_latency": f"{torch_latency:.2f}",
                            "onnxruntime_latency": f"{ort_latency:.2f}",
                            "onnxruntime_io_binding_latency": f"{ort_io_latency:.2f}"
                        }
                        csv_writer.writerow(row)
                    except Exception:
                        logger.error("Exception", exc_info=True)

    logger.info(f"Results are saved to file {csv_filename}")
    return csv_filename
Example #7

if __name__ == '__main__':
    # The example below shows how to use this helper to investigate parity issues between the GPT-2 FP32 and FP16 ONNX models.
    # Please build ORT with --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=ON !!
    multiprocessing.set_start_method('spawn')

    # Generate Inputs
    sequence_length = 8
    past_sequence_length = 8
    batch_size = 5
    dummy_inputs_fp16 = Gpt2Helper.get_dummy_inputs(batch_size,
                                                    past_sequence_length,
                                                    sequence_length,
                                                    12,
                                                    768,
                                                    12,
                                                    50257,
                                                    device=torch.device("cpu"),
                                                    float16=True)
    dummy_inputs_fp32 = dummy_inputs_fp16.to_fp32()

    # Get GPT-2 model from huggingface using convert_to_onnx.py
    os.system(
        'python convert_to_onnx.py -m gpt2 --output gpt2_fp32.onnx -o -p fp32 --use_gpu'
    )
    os.system(
        'python convert_to_onnx.py -m gpt2 --output gpt2_fp16.onnx -o -p fp16 --use_gpu'
    )

    # Specify the directory to dump the node's I/O
Example #8
def main():
    args = parse_arguments()
    setup_logger(args.verbose)

    if args.tolerance == 0:
        args.tolerance = DEFAULT_TOLERANCE[args.precision]

    logger.info(f"Arguments:{args}")

    cache_dir = args.cache_dir
    output_dir = args.output if not args.output.endswith(
        ".onnx") else os.path.dirname(args.output)
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    if args.precision != Precision.FLOAT32:
        assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx"

    if args.precision == Precision.FLOAT16:
        assert args.use_gpu, "fp16 requires --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    model_class = MODEL_CLASSES[args.model_class][0]
    config = AutoConfig.from_pretrained(args.model_name_or_path,
                                        cache_dir=cache_dir)
    if hasattr(config, 'return_tuple'):
        config.return_tuple = True
    model = model_class.from_pretrained(args.model_name_or_path,
                                        config=config,
                                        cache_dir=cache_dir)

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.eval().to(device)

    onnx_model_paths = Gpt2Helper.get_onnx_paths(output_dir,
                                                 args.model_name_or_path,
                                                 args.model_class)
    raw_onnx_model = args.output if args.output.endswith(
        '.onnx') else onnx_model_paths["raw"]
    output_path = raw_onnx_model if (
        args.output.endswith('.onnx') or
        (args.precision == Precision.FLOAT32 and not args.optimize_onnx)
    ) else onnx_model_paths[str(args.precision)]

    Gpt2Helper.export_onnx(model, device, raw_onnx_model, args.verbose)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        Gpt2Helper.optimize_onnx(raw_onnx_model, output_path,
                                 args.precision == Precision.FLOAT16,
                                 model.config.num_attention_heads,
                                 model.config.hidden_size)

    if args.precision == Precision.INT8:
        logger.info("quantizing model...")
        QuantizeHelper.quantize_onnx_model(output_path, output_path)
        model = QuantizeHelper.quantize_torch_model(model)
        logger.info("finished quantizing model")

    session = create_onnxruntime_session(output_path,
                                         args.use_gpu,
                                         enable_all_optimization=False,
                                         verbose=args.verbose)
    if session is not None:
        Gpt2Helper.test_parity(session,
                               model,
                               device,
                               args.precision == Precision.FLOAT16,
                               rtol=args.tolerance,
                               atol=args.tolerance,
                               model_class=args.model_class)

    logger.info(f"Done. Output model: {output_path}")
Example #9
    def test_generation(session,
                        model,
                        device,
                        test_inputs,
                        precision=Precision.FLOAT32,
                        model_class='Gpt2LMHeadModel',
                        top_k=20,
                        top_k_no_order=True,
                        max_steps=24,
                        max_inputs=0,
                        verbose=False):
        """
        Test Generation using greedy beam search (without sampling) to compare PyTorch and ONNX model.
        It will print top 1 and top k errors on the given test inputs.
        """
        print(
            f"start test generation: (top_k={top_k} top_k_no_order={top_k_no_order} max_steps={max_steps} test_inputs={len(test_inputs)} max_inputs={max_inputs})"
        )
        n_layer = model.config.n_layer
        n_head = model.config.n_head
        n_embd = model.config.n_embd
        vocab_size = model.config.vocab_size
        eos_token_id = model.config.eos_token_id

        torch_float = torch.float16 if precision == Precision.FLOAT16 else torch.float32
        is_float16 = (precision == Precision.FLOAT16)
        if is_float16:
            assert 'float16' in session.get_outputs()[0].type

        model.eval().to(device).to(torch_float)

        # Allocate initial buffers for IO Binding of ONNX Runtime. The buffer size will automatically increase later.
        init_output_shapes = Gpt2Helper.get_output_shapes(
            batch_size=4,
            past_sequence_length=128,
            sequence_length=32,
            config=model.config,
            model_class=model_class)
        output_buffers = Gpt2Helper.get_output_buffers(
            init_output_shapes,
            device,
            is_float16=(torch_float == torch.float16))

        baseline_name = 'Torch'
        treatment_name = 'Quantized Onnx' if precision == Precision.INT8 else "Onnx"
        torch_metric = Gpt2Metric(baseline_name, baseline_name, top_k)
        onnx_metric = Gpt2Metric(treatment_name, baseline_name, top_k)
        onnx_io_metric = Gpt2Metric(treatment_name + ' with IO Binding',
                                    baseline_name, top_k)

        for i, inputs in enumerate(test_inputs):
            if (max_inputs > 0 and i == max_inputs):
                break
            if i % 10 == 0:
                print(f"{i}")
            input_ids = inputs["input_ids"]
            position_ids = inputs.get("position_ids")
            attention_mask = inputs.get("attention_mask")

            onnx_runner = Gpt2Tester(input_ids, position_ids, attention_mask,
                                     n_head, n_embd, n_layer, device,
                                     is_float16, top_k, not top_k_no_order)
            onnx_io_runner = Gpt2Tester(input_ids, position_ids,
                                        attention_mask, n_head, n_embd,
                                        n_layer, device, is_float16, top_k,
                                        not top_k_no_order)
            torch_runner = Gpt2Tester(input_ids, position_ids, attention_mask,
                                      n_head, n_embd, n_layer, device,
                                      is_float16, top_k, not top_k_no_order)

            batch_size = torch_runner.batch_size
            onnx_metric.start_batch(batch_size)
            onnx_io_metric.start_batch(batch_size)

            with torch.no_grad():
                done = torch.zeros(batch_size, dtype=torch.bool)
                for step in range(max_steps):
                    seq_len = list(onnx_runner.input_ids.size())[1]
                    past_seq_len = list(onnx_runner.past[0].size())[3]

                    start_time = timeit.default_timer()
                    pytorch_output = Gpt2Helper.pytorch_inference(
                        model, torch_runner.get_inputs())
                    torch_metric.add_latency(
                        past_seq_len,
                        timeit.default_timer() - start_time)
                    torch_runner.update(pytorch_output, step, device)

                    onnx_output, avg_latency_ms = Gpt2Helper.onnxruntime_inference(
                        session, onnx_runner.get_inputs(), total_runs=1)
                    onnx_metric.add_latency(past_seq_len,
                                            avg_latency_ms / 1000.0)
                    onnx_runner.update(onnx_output, step, device)

                    output_shapes = Gpt2Helper.get_output_shapes(
                        batch_size,
                        past_seq_len,
                        seq_len,
                        model.config,
                        model_class=model_class)
                    Gpt2Helper.auto_increase_buffer_size(
                        output_buffers, output_shapes)

                    onnx_io_output, avg_latency_ms = Gpt2Helper.onnxruntime_inference_with_binded_io(
                        session,
                        onnx_io_runner.get_inputs(),
                        output_buffers,
                        output_shapes,
                        total_runs=1,
                        return_numpy=False,
                        include_copy_output_latency=True)
                    onnx_io_metric.add_latency(past_seq_len,
                                               avg_latency_ms / 1000.0)
                    onnx_io_runner.update(onnx_io_output, step, device)

                    if verbose:
                        onnx_runner.diff(onnx_io_runner)
                        Gpt2Tester.diff_present(onnx_output, onnx_io_output,
                                                n_layer)

                        print("Top 1 tokens:")
                        print("\tTorch", torch_runner.top_1_tokens)
                        print("\ONNX", onnx_runner.top_1_tokens)
                        print("\ONNX with IO binding",
                              onnx_io_runner.top_1_tokens)

                    onnx_metric.eval_batch(torch_runner,
                                           onnx_runner,
                                           past_seq_len,
                                           verbose=verbose)
                    onnx_io_metric.eval_batch(torch_runner,
                                              onnx_io_runner,
                                              past_seq_len,
                                              verbose=verbose)

                    done = done | (torch_runner.top_1_tokens
                                   == eos_token_id).any()
                    if torch.all(done):
                        break

            onnx_metric.end_batch()
            onnx_io_metric.end_batch()

        torch_metric.print()
        onnx_metric.print()
        onnx_io_metric.print()
Example #10
    def prepare_io_binding(ort_session,
                           input_ids,
                           position_ids,
                           attention_mask,
                           past,
                           output_buffers,
                           output_shapes,
                           beam_select_idx=None,
                           input_log_probs=None,
                           input_unfinished_sents=None,
                           prev_step_results=None,
                           prev_step_scores=None):
        """Returnas IO binding object for a session."""

        # Bind inputs and outputs to onnxruntime session
        io_binding = Gpt2Helper.prepare_io_binding(
            ort_session,
            input_ids,
            position_ids,
            attention_mask,
            past=past,
            output_buffers=output_buffers,
            output_shapes=output_shapes)

        # Bind inputs
        data_type = output_buffers[ort_session.get_outputs()[1].name].dtype
        float_type = numpy.float16 if data_type == torch.float16 else numpy.float32

        if past is not None:
            for i, past_i in enumerate(past):
                assert past_i.is_contiguous()

                data_ptr = past_i.data_ptr()
                if data_ptr == 0:
                    # When past_sequence_length is 0, its data_ptr will be zero. IO Binding asserts that data_ptr shall not be zero.
                    # Here we workaround and pass data pointer of input_ids. Actual data is not used for past so it does not matter.
                    data_ptr = input_ids.data_ptr()

                io_binding.bind_input(f'past_{i}',
                                      past_i.device.type, 0, float_type,
                                      list(past_i.size()), data_ptr)

        if attention_mask is not None:
            assert attention_mask.is_contiguous()
            io_binding.bind_input('attention_mask', attention_mask.device.type,
                                  0, float_type, list(attention_mask.size()),
                                  attention_mask.data_ptr())

        if beam_select_idx is not None:
            assert beam_select_idx.is_contiguous()
            io_binding.bind_input(
                "beam_select_idx",
                beam_select_idx.device.type,
                0,
                numpy.longlong,
                list(beam_select_idx.size()),
                beam_select_idx.data_ptr(),
            )

        if input_log_probs is not None:
            assert input_log_probs.is_contiguous()
            io_binding.bind_input(
                "input_log_probs",
                input_log_probs.device.type,
                0,
                float_type,
                list(input_log_probs.size()),
                input_log_probs.data_ptr(),
            )

        if input_unfinished_sents is not None:
            assert input_unfinished_sents.is_contiguous()
            io_binding.bind_input(
                "input_unfinished_sents",
                input_unfinished_sents.device.type,
                0,
                numpy.bool_,
                list(input_unfinished_sents.size()),
                input_unfinished_sents.data_ptr(),
            )

        if prev_step_results is not None:
            assert prev_step_results.is_contiguous()
            io_binding.bind_input(
                "prev_step_results",
                prev_step_results.device.type,
                0,
                numpy.longlong,
                list(prev_step_results.size()),
                prev_step_results.data_ptr(),
            )

        if prev_step_scores is not None:
            assert prev_step_scores.is_contiguous()
            io_binding.bind_input(
                "prev_step_scores",
                prev_step_scores.device.type,
                0,
                float_type,
                list(prev_step_scores.size()),
                prev_step_scores.data_ptr(),
            )

        # Bind outputs
        for output in ort_session.get_outputs():
            output_name = output.name
            output_buffer = output_buffers[output_name]
            logger.debug(
                f"{output_name} device type={output_buffer.device.type} shape={list(output_buffer.size())}"
            )
            if (output_name == "output_selected_indices"
                    or output_name == "last_state"
                    or output_name == "current_step_results"):
                io_binding.bind_output(
                    output_name,
                    output_buffer.device.type,
                    0,
                    numpy.longlong,
                    output_shapes[output_name],
                    output_buffer.data_ptr(),
                )
            elif output_name == "output_unfinished_sents":
                io_binding.bind_output(
                    output_name,
                    output_buffer.device.type,
                    0,
                    numpy.bool_,
                    output_shapes[output_name],
                    output_buffer.data_ptr(),
                )
            else:
                io_binding.bind_output(
                    output_name,
                    output_buffer.device.type,
                    0,
                    float_type,
                    output_shapes[output_name],
                    output_buffer.data_ptr(),
                )

        return io_binding
Example #11
def main():
    args = parse_arguments()
    setup_logger(args.verbose)

    if args.tolerance == 0:
        args.tolerance = DEFAULT_TOLERANCE[args.precision]

    logger.info(f"Arguments:{args}")

    cache_dir = args.cache_dir
    output_dir = args.output if not args.output.endswith(
        ".onnx") else os.path.dirname(args.output)
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    if args.precision != Precision.FLOAT32:
        assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx"

    if args.precision == Precision.FLOAT16:
        assert args.use_gpu, "fp16 requires --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    model_class = MODEL_CLASSES[args.model_class][0]
    config = AutoConfig.from_pretrained(args.model_name_or_path,
                                        cache_dir=cache_dir)
    if hasattr(config, 'return_tuple'):
        config.return_tuple = True
    model = model_class.from_pretrained(args.model_name_or_path,
                                        config=config,
                                        cache_dir=cache_dir)

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.eval().to(device)

    onnx_model_paths = Gpt2Helper.get_onnx_paths(output_dir,
                                                 args.model_name_or_path,
                                                 args.model_class)
    raw_onnx_model = args.output if args.output.endswith(
        '.onnx') else onnx_model_paths["raw"]
    output_path = raw_onnx_model if (
        args.output.endswith('.onnx') or
        (args.precision == Precision.FLOAT32 and not args.optimize_onnx)
    ) else onnx_model_paths[str(args.precision)]

    Gpt2Helper.export_onnx(model, device, raw_onnx_model, args.verbose)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        Gpt2Helper.optimize_onnx(raw_onnx_model, output_path,
                                 args.precision == Precision.FLOAT16,
                                 model.config.num_attention_heads,
                                 model.config.hidden_size)

    if args.precision == Precision.INT8:
        logger.info("quantizing model...")
        QuantizeHelper.quantize_onnx_model(output_path, output_path)
        model = QuantizeHelper.quantize_torch_model(model)
        logger.info("finished quantizing model")

    session = create_onnxruntime_session(output_path,
                                         args.use_gpu,
                                         enable_all_optimization=False,
                                         verbose=args.verbose)
    if session is not None:
        Gpt2Helper.test_parity(session,
                               model,
                               device,
                               args.precision == Precision.FLOAT16,
                               rtol=args.tolerance,
                               atol=args.tolerance,
                               model_class=args.model_class)

    if args.input_test_file:
        test_inputs = []
        with open(args.input_test_file) as read_f:
            for i, line in enumerate(read_f):
                line = line.rstrip()
                data = json.loads(line)
                input_ids = torch.from_numpy(
                    numpy.asarray(data["input_ids"],
                                  dtype=numpy.int64)).to(device)
                position_ids = torch.from_numpy(
                    numpy.asarray(data["position_ids"],
                                  dtype=numpy.int64)).to(device)
                numpy_float = numpy.float16 if args.precision == Precision.FLOAT16 else numpy.float32
                attention_mask = torch.from_numpy(
                    numpy.asarray(data["attention_mask"],
                                  dtype=numpy_float)).to(device)
                inputs = {
                    "input_ids": input_ids,
                    "position_ids": position_ids,
                    "attention_mask": attention_mask
                }
                test_inputs.append(inputs)
        Gpt2Tester.test_generation(session,
                                   model,
                                   device,
                                   test_inputs,
                                   precision=args.precision,
                                   model_class=args.model_class,
                                   top_k=20,
                                   top_k_no_order=True,
                                   max_steps=24,
                                   max_inputs=0,
                                   verbose=args.verbose)

    logger.info(f"Done. Output model: {output_path}")
Example #12
def main():
    args = parse_arguments()
    setup_logger(args.verbose)

    logger.info(f"Arguments:{args}")
    if args.precision == Precision.FLOAT16:
        assert args.optimize_onnx and args.use_gpu, "fp16 requires --optimize_onnx --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    torch.set_num_threads(
        psutil.cpu_count(
            logical=True) if args.thread_num <= 0 else args.thread_num)
    print(torch.__config__.parallel_info())

    cache_dir = args.cache_dir
    output_dir = args.onnx_dir
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    model_class = MODEL_CLASSES[args.model_class][0]

    config = AutoConfig.from_pretrained(args.model_name,
                                        torchscript=args.torchscript,
                                        cache_dir=cache_dir)
    if hasattr(config, 'return_tuple'):
        config.return_tuple = True
    model = model_class.from_pretrained(args.model_name,
                                        config=config,
                                        cache_dir=cache_dir)

    # This script does not support float16 for PyTorch.
    #if args.float16:
    #    model.half()

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.to(device)

    onnx_model_paths = Gpt2Helper.get_onnx_paths(output_dir, args.model_name,
                                                 args.model_class)

    onnx_model_path = onnx_model_paths["raw"]
    Gpt2Helper.export_onnx(model, device, onnx_model_path, args.verbose)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        onnx_model_path = onnx_model_paths[str(args.precision)]
        Gpt2Helper.optimize_onnx(onnx_model_paths["raw"], onnx_model_path,
                                 args.precision == Precision.FLOAT16,
                                 model.config.num_attention_heads,
                                 model.config.hidden_size)

        if args.precision == Precision.INT8:
            logger.info("quantizing model...")
            QuantizeHelper.quantize_onnx_model(onnx_model_path,
                                               onnx_model_path)
            model = QuantizeHelper.quantize_torch_model(model)
            logger.info("finished quantizing model")

    if args.torchscript:
        model = Gpt2Helper.torchscript(model, config, device)

    session = create_onnxruntime_session(onnx_model_path,
                                         args.use_gpu,
                                         enable_all_optimization=False,
                                         num_threads=args.thread_num,
                                         verbose=args.verbose)
    if session is None:
        return

    # One word is generated for each inference. This length does not include that of past state.
    sequence_length = 1

    # Allocate output buffers for IO Binding
    max_output_shapes = Gpt2Helper.get_output_shapes(
        max(args.batch_sizes), max(args.past_sequence_lengths),
        sequence_length, config, args.model_class)
    output_buffers = Gpt2Helper.get_output_buffers(
        max_output_shapes, device, args.precision == Precision.FLOAT16)

    csv_filename = args.result_csv or "benchmark_result_{}.csv".format(
        datetime.now().strftime("%Y%m%d-%H%M%S"))
    with open(csv_filename, mode="a", newline='') as csv_file:
        column_names = [
            "model_name", "model_class", "gpu", "precision", "optimizer",
            "torchscript", "batch_size", "past_sequence_length",
            "torch_latency", "ort_latency", "ort_io_latency"
        ]
        csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        csv_writer.writeheader()

        for batch_size in args.batch_sizes:
            for past_sequence_length in args.past_sequence_lengths:
                logger.debug(
                    f"Running test for batch_size={batch_size} past_sequence_length={past_sequence_length}..."
                )
                dummy_inputs = Gpt2Helper.get_dummy_inputs(
                    batch_size, past_sequence_length, sequence_length,
                    config.num_attention_heads, config.hidden_size,
                    config.n_layer, config.vocab_size, device,
                    args.precision == Precision.FLOAT16)
                output_shapes = Gpt2Helper.get_output_shapes(
                    batch_size, past_sequence_length, sequence_length, config,
                    args.model_class)

                try:
                    outputs, torch_latency = Gpt2Helper.pytorch_inference(
                        model, dummy_inputs, args.test_times)
                    ort_outputs, ort_latency = Gpt2Helper.onnxruntime_inference(
                        session, dummy_inputs, args.test_times)
                    ort_io_outputs, ort_io_latency = Gpt2Helper.onnxruntime_inference_with_binded_io(
                        session, dummy_inputs, output_buffers, output_shapes,
                        args.test_times)
                    if args.validate_onnx:
                        if Gpt2Helper.compare_outputs(
                                outputs,
                                ort_outputs,
                                rtol=DEFAULT_TOLERANCE[args.precision],
                                atol=DEFAULT_TOLERANCE[args.precision]):
                            logger.info(
                                f'Pytorch and ONNX Runtime outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]}).'
                            )
                        if Gpt2Helper.compare_outputs(
                                outputs,
                                ort_io_outputs,
                                rtol=DEFAULT_TOLERANCE[args.precision],
                                atol=DEFAULT_TOLERANCE[args.precision]):
                            logger.info(
                                f'Pytorch and ONNX Runtime IO Binding outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]}).'
                            )

                    logger.info(
                        f"batch_size={batch_size}, past_sequence_length={past_sequence_length}, torch_latency={torch_latency:.2f}, ort_latency={ort_latency:.2f}, ort_io_latency={ort_io_latency:.2f}"
                    )

                    row = {
                        "model_name": args.model_name,
                        "model_class": args.model_class,
                        "gpu": args.use_gpu,
                        "precision": args.precision,
                        "optimizer": args.optimize_onnx,
                        "torchscript": args.torchscript,
                        "batch_size": batch_size,
                        "past_sequence_length": past_sequence_length,
                        "torch_latency": f"{torch_latency:.2f}",
                        "ort_latency": f"{ort_latency:.2f}",
                        "ort_io_latency": f"{ort_io_latency:.2f}"
                    }
                    csv_writer.writerow(row)
                except Exception:
                    logger.error("Exception", exc_info=True)

    logger.info(f"Results are saved to file {csv_filename}")
Example #13
def main():
    args = parse_arguments()
    setup_logger(args.verbose)

    if args.tolerance == 0:
        args.tolerance = DEFAULT_TOLERANCE[args.precision]

    logger.info(f"Arguments:{args}")

    cache_dir = args.cache_dir
    output_dir = args.output if not args.output.endswith(
        ".onnx") else os.path.dirname(args.output)
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    if args.precision != Precision.FLOAT32:
        assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx"

    if args.precision == Precision.FLOAT16:
        assert args.use_gpu, "fp16 requires --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    model_class = MODEL_CLASSES[args.model_class][0]
    config = AutoConfig.from_pretrained(args.model_name_or_path,
                                        cache_dir=cache_dir)
    model = model_class.from_pretrained(args.model_name_or_path,
                                        config=config,
                                        cache_dir=cache_dir)

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.eval().to(device)

    use_external_data_format = config.n_layer > 24  # TODO: find a way to check model size > 2GB
    onnx_model_paths = Gpt2Helper.get_onnx_paths(
        output_dir,
        args.model_name_or_path,
        args.model_class,
        new_folder=use_external_data_format)
    raw_onnx_model = args.output if args.output.endswith(
        '.onnx') else onnx_model_paths["raw"]
    output_path = raw_onnx_model if (
        args.output.endswith('.onnx') or
        (args.precision == Precision.FLOAT32 and not args.optimize_onnx)
    ) else onnx_model_paths[str(args.precision)]

    logger.info(f"Exporting ONNX model to {raw_onnx_model}")
    use_padding = MODEL_CLASSES[args.model_class][2]
    Gpt2Helper.export_onnx(model,
                           device,
                           raw_onnx_model,
                           args.verbose,
                           use_external_data_format,
                           has_position_ids=use_padding,
                           has_attention_mask=use_padding)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        logger.info(f"Optimizing model to {output_path}")
        Gpt2Helper.optimize_onnx(raw_onnx_model, output_path,
                                 args.precision == Precision.FLOAT16,
                                 model.config.num_attention_heads,
                                 model.config.hidden_size)

    if args.precision == Precision.INT8:
        logger.info("quantizing model...")
        QuantizeHelper.quantize_onnx_model(output_path, output_path)
        model = QuantizeHelper.quantize_torch_model(model)
        logger.info("finished quantizing model")

    session = create_onnxruntime_session(output_path,
                                         args.use_gpu,
                                         enable_all_optimization=True,
                                         verbose=args.verbose)
    if session is not None:
        Gpt2Helper.test_parity(session,
                               model,
                               device,
                               args.precision == Precision.FLOAT16,
                               rtol=args.tolerance,
                               atol=args.tolerance,
                               model_class=args.model_class,
                               has_position_ids=use_padding,
                               has_attention_mask=use_padding)

    if args.input_test_file:
        test_inputs = []
        # Each line of test file is a JSON string like:
        # {"input_ids": [[14698, 257, 1310, 13688, 319, 326]]}
        with open(args.input_test_file) as read_f:
            for i, line in enumerate(read_f):
                line = line.rstrip()
                data = json.loads(line)
                input_ids = torch.from_numpy(
                    numpy.asarray(data["input_ids"],
                                  dtype=numpy.int64)).to(device)

                if use_padding:
                    if "attention_mask" in data:
                        numpy_float = numpy.float16 if args.precision == Precision.FLOAT16 else numpy.float32
                        attention_mask = torch.from_numpy(
                            numpy.asarray(data["attention_mask"],
                                          dtype=numpy_float)).to(device)
                    else:
                        padding = -1
                        attention_mask = (input_ids != padding).type(
                            torch.float16 if args.precision == Precision.FLOAT16 else torch.float32)
                        input_ids.masked_fill_(input_ids == padding, 0)

                    if "position_ids" in data:
                        position_ids = torch.from_numpy(
                            numpy.asarray(data["position_ids"],
                                          dtype=numpy.int64)).to(device)
                    else:
                        position_ids = (attention_mask.long().cumsum(-1) - 1)
                        position_ids.masked_fill_(position_ids < 0, 0)

                    inputs = {
                        "input_ids": input_ids,
                        "position_ids": position_ids,
                        "attention_mask": attention_mask
                    }
                else:
                    inputs = {"input_ids": input_ids}

                test_inputs.append(inputs)

        Gpt2Tester.test_generation(session,
                                   model,
                                   device,
                                   test_inputs,
                                   precision=args.precision,
                                   model_class=args.model_class,
                                   top_k=20,
                                   top_k_no_order=True,
                                   max_steps=24,
                                   max_inputs=0,
                                   verbose=args.verbose)

    logger.info(f"Done. Output model: {output_path}")
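
# A typical command-line invocation of this script. The flags mirror those used in the
# parity helper example above (Example #7); whether this main() belongs to
# convert_to_onnx.py is an assumption based on those flags.
#   python convert_to_onnx.py -m gpt2 --output gpt2_fp32.onnx -o -p fp32 --use_gpu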