def inference(model_path, dummy_inputs, outputs_path, use_gpu):
    """Run one ONNX Runtime inference pass over ``dummy_inputs``.

    The process environment is first reset and then configured through the
    ``environ_*`` helpers (``outputs_path`` is forwarded to
    ``environ_setting_paths``). A session is then created for ``model_path``
    with ``enable_all_optimization=False`` and a single inference is executed.

    Args:
        model_path: Path of the ONNX model to load.
        dummy_inputs: Inputs handed to ``Gpt2Helper.onnxruntime_inference``.
        outputs_path: Path forwarded to ``environ_setting_paths``.
        use_gpu: Forwarded to ``create_onnxruntime_session``.

    Returns:
        None. The inference outputs are discarded; presumably the useful
        result is whatever the configured environment makes ONNX Runtime
        dump -- confirm against the ``environ_*`` helpers.
    """
    # The environment must be prepared before the session is created.
    environ_reset()
    environ_setting_nodes()
    environ_setting_paths(outputs_path)

    ort_session = create_onnxruntime_session(
        model_path,
        use_gpu,
        enable_all_optimization=False,
    )
    Gpt2Helper.onnxruntime_inference(ort_session, dummy_inputs)
def run_parity(task: ParityTask, args):
    """Run the baseline and mixed-precision parity experiments on ``task``.

    The FP32 baseline always runs. Everything else needs a GPU, so the
    function returns early when ``args.use_gpu`` is not set. Otherwise it runs
    the FP16 baseline followed by a series of mixed-precision candidates that
    differ only in their ``op_block_list``. ``args.all`` selects the tuning
    steps instead of the two fixed intermediate candidates.

    Args:
        task: Parity task that executes and records each configuration.
        args: Parsed command line arguments (``model_name_or_path``,
            ``use_external_data_format``, ``use_gpu`` and ``all`` are read
            here; the whole object is forwarded to the helpers).
    """
    onnx_model_paths = Gpt2Helper.get_onnx_paths(
        'onnx_models',
        args.model_name_or_path,
        new_folder=args.use_external_data_format,
        remove_existing=[],
    )
    fp32_baseline, fp16_baseline = get_baselines(args)

    task.run(fp32_baseline, "FP32 baseline")

    # Every remaining experiment uses fp16, which requires GPU.
    if not args.use_gpu:
        logger.info("skip mixed precision since --use_gpu is not specified")
        return

    task.run(fp16_baseline, "FP16 baseline")

    last_matmul = get_last_matmul_node_name(onnx_model_paths["raw"])

    def try_candidate(blocked_ops):
        # All candidates share everything except the list of blocked operators.
        run_candidate(task, args, last_matmul, op_block_list=blocked_ops)

    try_candidate([])  # Mixed precision baseline
    try_candidate(["Add"])  # Result from tuning step 1

    if args.all:
        run_tuning_step0(task, fp16_baseline)
        mixed_precision_baseline = get_mixed_precision_parameters(
            args, last_matmul, op_block_list=[])
        run_tuning_step1(task, mixed_precision_baseline)
        run_tuning_step2(task, mixed_precision_baseline)
    else:
        try_candidate(["LayerNormalization", "Add"])
        try_candidate(["FastGelu", "Add"])

    # Run a few good candidates
    good_candidates = (
        ["FastGelu", "LayerNormalization", "Add"],
        ["FastGelu", "LayerNormalization", "Add", "Gather"],
        ["FastGelu", "LayerNormalization", "Add", "Gather", "MatMul"],
    )
    for blocked_ops in good_candidates:
        try_candidate(blocked_ops)
def get_dummy_inputs(
        batch_size: int,
        past_sequence_length: int,
        sequence_length: int,
        num_attention_heads: int,
        hidden_size: int,
        num_layer: int,
        vocab_size: int,
        device: torch.device,
        float16: bool = False,
        has_position_ids: bool = True,
        has_attention_mask: bool = True) -> Gpt2BeamSearchInputs:
    """Build dummy inputs for the GPT-2 beam search model.

    The regular GPT-2 inputs (input_ids, past, position_ids, attention_mask)
    come from ``Gpt2Helper.get_dummy_inputs``; the beam search specific
    tensors are created here on ``device``.

    Returns:
        A ``Gpt2BeamSearchInputs`` holding the GPT-2 inputs plus
        ``beam_select_idx`` (zeros, ``[1, batch_size]``, long),
        ``input_log_probs`` and ``prev_step_scores`` (zeros,
        ``[batch_size, 1]``, float16 or float32 depending on ``float16``),
        ``input_unfinished_sents`` (ones, ``[batch_size, 1]``, bool) and
        ``prev_step_results`` (random int64 ids of shape
        ``(batch_size, sequence_length)``, or ``None`` when
        ``has_position_ids`` is false).
    """
    base_inputs = Gpt2Helper.get_dummy_inputs(
        batch_size,
        past_sequence_length,
        sequence_length,
        num_attention_heads,
        hidden_size,
        num_layer,
        vocab_size,
        device,
        float16,
        has_position_ids,
        has_attention_mask,
    )

    score_dtype = torch.float16 if float16 else torch.float32
    per_sample_shape = [batch_size, 1]

    beam_select_idx = torch.zeros([1, batch_size], device=device).long()
    input_log_probs = torch.zeros(per_sample_shape, dtype=score_dtype, device=device)
    input_unfinished_sents = torch.ones(per_sample_shape, dtype=torch.bool, device=device)

    # Previous step results are only generated when position ids are in use.
    prev_step_results = torch.randint(
        low=0,
        high=vocab_size - 1,
        size=(batch_size, sequence_length),
        dtype=torch.int64,
        device=device,
    ) if has_position_ids else None

    prev_step_scores = torch.zeros(per_sample_shape, dtype=score_dtype, device=device)

    return Gpt2BeamSearchInputs(
        base_inputs.input_ids,
        base_inputs.past,
        base_inputs.position_ids,
        base_inputs.attention_mask,
        beam_select_idx,
        input_log_probs,
        input_unfinished_sents,
        prev_step_results,
        prev_step_scores,
    )
def run_parity_disable_half2(task: ParityTask, args):
    """Run three mixed-precision candidates on ``task``.

    The candidates block, in order: no operators, ``Add``, and
    ``LayerNormalization`` together with ``Add``.

    Args:
        task: Parity task forwarded to ``run_candidate``.
        args: Parsed command line arguments (``model_name_or_path`` and
            ``use_external_data_format`` are read here).
    """
    onnx_model_paths = Gpt2Helper.get_onnx_paths(
        'onnx_models',
        args.model_name_or_path,
        new_folder=args.use_external_data_format,
        remove_existing=[],
    )
    last_matmul = get_last_matmul_node_name(onnx_model_paths["raw"])

    candidate_block_lists = (
        [],
        ["Add"],
        ["LayerNormalization", "Add"],
    )
    for blocked_ops in candidate_block_lists:
        run_candidate(task, args, last_matmul, op_block_list=blocked_ops)
def prepare_io_binding(
    ort_session,
    input_ids,
    position_ids,
    attention_mask,
    past,
    output_buffers,
    output_shapes,
    beam_select_idx=None,
    input_log_probs=None,
    input_unfinished_sents=None,
    prev_step_results=None,
    prev_step_scores=None,
):
    """Return an IO binding object for a session.

    ``Gpt2Helper.prepare_io_binding`` binds the regular GPT-2 inputs
    (input_ids, position_ids, attention_mask, past) and all outputs. The beam
    search inputs are bound here, skipping any that are ``None``. The numpy
    element type of each input is looked up by name in the session's IO type
    map.

    Raises:
        AssertionError: If a provided beam search tensor is not contiguous.
    """
    io_binding = Gpt2Helper.prepare_io_binding(
        ort_session,
        input_ids,
        position_ids,
        attention_mask,
        past=past,
        output_buffers=output_buffers,
        output_shapes=output_shapes,
    )

    # Remaining (beam search) inputs, in the order they are bound.
    beam_search_inputs = (
        ("beam_select_idx", beam_select_idx),
        ("input_log_probs", input_log_probs),
        ("input_unfinished_sents", input_unfinished_sents),
        ("prev_step_results", prev_step_results),
        ("prev_step_scores", prev_step_scores),
    )

    np_type_by_name = TypeHelper.get_io_numpy_type_map(ort_session)
    for input_name, value in beam_search_inputs:
        if value is None:
            continue

        # The binding receives a raw data pointer, so the tensor must be contiguous.
        assert value.is_contiguous()
        io_binding.bind_input(
            input_name,
            value.device.type,
            0,
            np_type_by_name[input_name],
            list(value.size()),
            value.data_ptr(),
        )

    return io_binding
def main(args):
    """Benchmark GPT-2 latency of PyTorch against ONNX Runtime and save a CSV.

    The model named by ``args.model_name_or_path`` is loaded and exported to
    ONNX, then optionally optimized and (for INT8) quantized. Every
    combination of ``args.batch_sizes``, ``args.sequence_lengths`` and
    ``args.past_sequence_lengths`` is then timed with PyTorch, ONNX Runtime,
    and ONNX Runtime with IO binding. One row per combination is appended to
    the result CSV; a header row is written on every invocation.

    Args:
        args: Parsed command line arguments.

    Returns:
        The name of the CSV file, or ``None`` when the ONNX Runtime session
        could not be created.

    Raises:
        AssertionError: For unsupported precision/device combinations, or
            non-positive batch size / sequence length.
    """
    logger.info(f"Arguments:{args}")

    is_fp16 = args.precision == Precision.FLOAT16
    is_int8 = args.precision == Precision.INT8

    if is_fp16:
        assert args.optimize_onnx and args.use_gpu, "fp16 requires --optimize_onnx --use_gpu"

    if is_int8:
        assert not args.use_gpu, "quantization only supports CPU"

    torch.set_num_threads(psutil.cpu_count(logical=True) if args.thread_num <= 0 else args.thread_num)
    print(torch.__config__.parallel_info())

    cache_dir = args.cache_dir
    output_dir = args.onnx_dir
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    model_class = MODEL_CLASSES[args.model_class][0]

    config = AutoConfig.from_pretrained(args.model_name_or_path, torchscript=args.torchscript, cache_dir=cache_dir)
    model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)

    # This script does not support float16 for PyTorch.
    #if args.float16:
    #    model.half()

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.to(device)

    use_external_data_format = (config.n_layer > 24)  #TODO: find a way to check model size > 2GB
    onnx_model_paths = Gpt2Helper.get_onnx_paths(output_dir,
                                                 args.model_name_or_path,
                                                 args.model_class,
                                                 has_past=True,
                                                 new_folder=use_external_data_format)

    onnx_model_path = onnx_model_paths["raw"]
    use_padding = MODEL_CLASSES[args.model_class][2]
    Gpt2Helper.export_onnx(model,
                           device,
                           onnx_model_path,
                           args.verbose,
                           use_external_data_format,
                           has_position_ids=use_padding,
                           has_attention_mask=use_padding)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        # INT8 is produced by quantizing the optimized fp32 model.
        onnx_model_path = onnx_model_paths[str(args.precision) if not is_int8 else 'fp32']
        Gpt2Helper.optimize_onnx(onnx_model_paths["raw"], onnx_model_path, is_fp16,
                                 model.config.num_attention_heads, model.config.hidden_size,
                                 use_external_data_format)

        if is_int8:
            logger.info("quantizing model...")
            QuantizeHelper.quantize_onnx_model(onnx_model_path, onnx_model_paths["int8"], use_external_data_format)
            model = QuantizeHelper.quantize_torch_model(model)
            logger.info("finished quantizing model")
            onnx_model_path = onnx_model_paths["int8"]

    if args.torchscript:
        model = Gpt2Helper.torchscript(model,
                                       config,
                                       device,
                                       has_position_ids=use_padding,
                                       has_attention_mask=use_padding)

    session = create_onnxruntime_session(onnx_model_path,
                                         args.use_gpu,
                                         enable_all_optimization=False,
                                         num_threads=args.thread_num,
                                         verbose=args.verbose)
    if session is None:
        return

    # Allocate output buffers for IO Binding, sized for the largest configuration.
    max_output_shapes = Gpt2Helper.get_output_shapes(max(args.batch_sizes), max(args.past_sequence_lengths),
                                                     max(args.sequence_lengths), config, args.model_class)
    output_buffers = Gpt2Helper.get_output_buffers(max_output_shapes, device, is_fp16)

    csv_filename = args.result_csv or "benchmark_result_{}.csv".format(datetime.now().strftime("%Y%m%d-%H%M%S"))
    with open(csv_filename, mode="a", newline='') as csv_file:
        column_names = [
            "model_name", "model_class", "gpu", "precision", "optimizer", "torchscript", "batch_size",
            "sequence_length", "past_sequence_length", "torch_latency", "onnxruntime_latency",
            "onnxruntime_io_binding_latency"
        ]
        csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        csv_writer.writeheader()

        for batch_size in args.batch_sizes:
            for sequence_length in args.sequence_lengths:
                for past_sequence_length in args.past_sequence_lengths:
                    assert batch_size > 0 and sequence_length > 0 and past_sequence_length >= 0
                    logger.debug(
                        f"Running test for batch_size={batch_size} sequence_length={sequence_length} past_sequence_length={past_sequence_length}..."
                    )
                    dummy_inputs = Gpt2Helper.get_dummy_inputs(batch_size,
                                                               past_sequence_length,
                                                               sequence_length,
                                                               config.num_attention_heads,
                                                               config.hidden_size,
                                                               config.n_layer,
                                                               config.vocab_size,
                                                               device,
                                                               float16=is_fp16,
                                                               has_position_ids=use_padding,
                                                               has_attention_mask=use_padding)
                    output_shapes = Gpt2Helper.get_output_shapes(batch_size, past_sequence_length, sequence_length,
                                                                 config, args.model_class)

                    try:
                        outputs, torch_latency = Gpt2Helper.pytorch_inference(model, dummy_inputs, args.test_times)
                        ort_outputs, ort_latency = Gpt2Helper.onnxruntime_inference(
                            session, dummy_inputs, args.test_times)
                        ort_io_outputs, ort_io_latency = Gpt2Helper.onnxruntime_inference_with_binded_io(
                            session,
                            dummy_inputs,
                            output_buffers,
                            output_shapes,
                            args.test_times,
                            return_numpy=False,
                            include_copy_output_latency=args.include_copy_output_latency)

                        if args.validate_onnx:
                            tolerance = DEFAULT_TOLERANCE[args.precision]
                            if Gpt2Helper.compare_outputs(outputs, ort_outputs, rtol=tolerance, atol=tolerance):
                                logger.info(
                                    f'Pytorch and ONNX Runtime outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]}).'
                                )

                            # Results of IO binding might be in GPU. Copy outputs to CPU for comparison.
                            copy_outputs = [output.cpu().numpy() for output in ort_io_outputs]

                            if Gpt2Helper.compare_outputs(outputs, copy_outputs, rtol=tolerance, atol=tolerance):
                                logger.info(
                                    f'Pytorch and ONNX Runtime IO Binding outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]}).'
                                )

                        logger.info(
                            f"batch_size={batch_size}, sequence_length={sequence_length}, past_sequence_length={past_sequence_length}, torch_latency={torch_latency:.2f}, onnxruntime_latency={ort_latency:.2f}, onnxruntime_io_binding_latency={ort_io_latency:.2f}"
                        )

                        row = {
                            "model_name": args.model_name_or_path,
                            "model_class": args.model_class,
                            "gpu": args.use_gpu,
                            "precision": args.precision,
                            "optimizer": args.optimize_onnx,
                            "torchscript": args.torchscript,
                            "batch_size": batch_size,
                            "sequence_length": sequence_length,
                            "past_sequence_length": past_sequence_length,
                            "torch_latency": f"{torch_latency:.2f}",
                            "onnxruntime_latency": f"{ort_latency:.2f}",
                            "onnxruntime_io_binding_latency": f"{ort_io_latency:.2f}"
                        }
                        csv_writer.writerow(row)
                    except Exception:
                        # Best effort: a failing configuration is logged and the
                        # benchmark moves on. Only Exception is caught so that
                        # KeyboardInterrupt / SystemExit still stop the run.
                        logger.error("Exception", exc_info=True)

    logger.info(f"Results are saved to file {csv_filename}")
    return csv_filename
if __name__ == '__main__':
    # Example usage of this helper: investigating a parity issue between the
    # fp32 and fp16 ONNX models of gpt-2.
    # Please build ORT with --cmake_extra_defines onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS=ON !!
    multiprocessing.set_start_method('spawn')

    # Generate Inputs
    batch_size, past_sequence_length, sequence_length = 5, 8, 8
    cpu_device = torch.device("cpu")
    # The four literals are presumably num_attention_heads, hidden_size,
    # num_layer and vocab_size -- confirm against Gpt2Helper.get_dummy_inputs.
    dummy_inputs_fp16 = Gpt2Helper.get_dummy_inputs(
        batch_size,
        past_sequence_length,
        sequence_length,
        12,
        768,
        12,
        50257,
        device=cpu_device,
        float16=True,
    )
    dummy_inputs_fp32 = dummy_inputs_fp16.to_fp32()

    # Get GPT-2 model from huggingface using convert_to_onnx.py (fp32 first, then fp16)
    for export_command in (
            'python convert_to_onnx.py -m gpt2 --output gpt2_fp32.onnx -o -p fp32 --use_gpu',
            'python convert_to_onnx.py -m gpt2 --output gpt2_fp16.onnx -o -p fp16 --use_gpu',
    ):
        os.system(export_command)

    # Specify the directory to dump the node's I/O
def main():
    """Export a pretrained GPT-2 model to ONNX and check parity with PyTorch.

    Steps: parse arguments, load the model, export the raw ONNX model,
    optionally optimize it (and quantize for INT8), then create an ONNX
    Runtime session and run ``Gpt2Helper.test_parity`` when the session is
    available.

    ``args.output`` may be a directory or a path ending in ``.onnx``. In the
    latter case that file is used for both the raw and the final model.

    Raises:
        AssertionError: For unsupported precision/option combinations.
    """
    args = parse_arguments()
    setup_logger(args.verbose)

    # A tolerance of 0 means "use the default for this precision".
    if args.tolerance == 0:
        args.tolerance = DEFAULT_TOLERANCE[args.precision]

    logger.info(f"Arguments:{args}")

    output_is_onnx_file = args.output.endswith(".onnx")
    cache_dir = args.cache_dir
    output_dir = os.path.dirname(args.output) if output_is_onnx_file else args.output
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    is_fp32 = args.precision == Precision.FLOAT32
    is_fp16 = args.precision == Precision.FLOAT16
    is_int8 = args.precision == Precision.INT8

    assert is_fp32 or args.optimize_onnx, "fp16/int8 requires --optimize_onnx"
    assert not is_fp16 or args.use_gpu, "fp16 requires --use_gpu"
    assert not (is_int8 and args.use_gpu), "quantization only supports CPU"

    model_class = MODEL_CLASSES[args.model_class][0]
    config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=cache_dir)
    if hasattr(config, 'return_tuple'):
        config.return_tuple = True
    model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.eval().to(device)

    onnx_model_paths = Gpt2Helper.get_onnx_paths(output_dir, args.model_name_or_path, args.model_class)

    needs_optimization = args.optimize_onnx or not is_fp32

    raw_onnx_model = args.output if output_is_onnx_file else onnx_model_paths["raw"]
    if output_is_onnx_file or not needs_optimization:
        output_path = raw_onnx_model
    else:
        output_path = onnx_model_paths[str(args.precision)]

    Gpt2Helper.export_onnx(model, device, raw_onnx_model, args.verbose)

    if needs_optimization:
        Gpt2Helper.optimize_onnx(
            raw_onnx_model,
            output_path,
            is_fp16,
            model.config.num_attention_heads,
            model.config.hidden_size,
        )

    if is_int8:
        logger.info("quantizing model...")
        # The quantized ONNX model overwrites the optimized one in place.
        QuantizeHelper.quantize_onnx_model(output_path, output_path)
        model = QuantizeHelper.quantize_torch_model(model)
        logger.info("finished quantizing model")

    session = create_onnxruntime_session(
        output_path,
        args.use_gpu,
        enable_all_optimization=False,
        verbose=args.verbose,
    )
    if session is not None:
        Gpt2Helper.test_parity(
            session,
            model,
            device,
            is_fp16,
            rtol=args.tolerance,
            atol=args.tolerance,
            model_class=args.model_class,
        )

    logger.info(f"Done. Output model: {output_path}")
def test_generation(session,
                    model,
                    device,
                    test_inputs,
                    precision=Precision.FLOAT32,
                    model_class='Gpt2LMHeadModel',
                    top_k=20,
                    top_k_no_order=True,
                    max_steps=24,
                    max_inputs=0,
                    verbose=False):
    """Compare PyTorch and ONNX Runtime token generation on ``test_inputs``.

    Test Generation using greedy beam search (without sampling) to compare
    PyTorch and ONNX model. Each input is run for up to ``max_steps`` steps
    with three runners: PyTorch, ONNX Runtime, and ONNX Runtime with IO
    binding. Top 1 and top k errors and latencies are printed at the end.

    Args:
        session: ONNX Runtime inference session of the exported model.
        model: PyTorch model; it is switched to eval mode and moved to
            ``device`` with the dtype matching ``precision``.
        device: Torch device used for the model and the IO binding buffers.
        test_inputs: Iterable of mappings with "input_ids" and optionally
            "position_ids" and "attention_mask".
        precision: ``Precision`` of the ONNX model.
        model_class: Model class name forwarded to ``Gpt2Helper``.
        top_k: Number of top tokens tracked by the metrics.
        top_k_no_order: When true, the order of the top k tokens is ignored.
        max_steps: Maximum number of generation steps per input.
        max_inputs: Stop after this many inputs; 0 or negative means no limit.
        verbose: Print per-step differences and the top 1 tokens.

    Raises:
        AssertionError: If ``precision`` is FLOAT16 but the first session
            output is not of a float16 type.
    """
    print(
        f"start test generation: (top_k={top_k} top_k_no_order={top_k_no_order} max_steps={max_steps} test_inputs={len(test_inputs)} max_inputs={max_inputs})"
    )
    n_layer = model.config.n_layer
    n_head = model.config.n_head
    n_embd = model.config.n_embd
    vocab_size = model.config.vocab_size
    eos_token_id = model.config.eos_token_id

    is_float16 = (precision == Precision.FLOAT16)
    torch_float = torch.float16 if is_float16 else torch.float32
    if is_float16:
        assert 'float16' in session.get_outputs()[0].type

    model.eval().to(device).to(torch_float)

    # Allocate initial buffers for IO Binding of ONNX Runtime. The buffer size will automatically increase later.
    init_output_shapes = Gpt2Helper.get_output_shapes(batch_size=4,
                                                      past_sequence_length=128,
                                                      sequence_length=32,
                                                      config=model.config,
                                                      model_class=model_class)
    output_buffers = Gpt2Helper.get_output_buffers(init_output_shapes, device, is_float16=is_float16)

    baseline_name = 'Torch'
    treatment_name = 'Quantized Onnx' if precision == Precision.INT8 else "Onnx"
    torch_metric = Gpt2Metric(baseline_name, baseline_name, top_k)
    onnx_metric = Gpt2Metric(treatment_name, baseline_name, top_k)
    onnx_io_metric = Gpt2Metric(treatment_name + ' with IO Binding', baseline_name, top_k)

    for i, inputs in enumerate(test_inputs):
        if (max_inputs > 0 and i == max_inputs):
            break
        # Lightweight progress indicator.
        if i % 10 == 0:
            print(f"{i}")
        input_ids = inputs["input_ids"]
        position_ids = inputs["position_ids"] if "position_ids" in inputs else None
        attention_mask = inputs["attention_mask"] if "attention_mask" in inputs else None

        def make_runner():
            # Every runner starts from the same inputs and keeps its own state.
            return Gpt2Tester(input_ids, position_ids, attention_mask, n_head, n_embd, n_layer, device, is_float16,
                              top_k, not top_k_no_order)

        onnx_runner = make_runner()
        onnx_io_runner = make_runner()
        torch_runner = make_runner()

        batch_size = torch_runner.batch_size
        onnx_metric.start_batch(batch_size)
        onnx_io_metric.start_batch(batch_size)

        with torch.no_grad():
            done = torch.zeros(batch_size, dtype=torch.bool)
            for step in range(max_steps):
                seq_len = list(onnx_runner.input_ids.size())[1]
                past_seq_len = list(onnx_runner.past[0].size())[3]

                # PyTorch latency is measured here, in seconds.
                start_time = timeit.default_timer()
                pytorch_output = Gpt2Helper.pytorch_inference(model, torch_runner.get_inputs())
                torch_metric.add_latency(past_seq_len, timeit.default_timer() - start_time)
                torch_runner.update(pytorch_output, step, device)

                # The helper's latency is divided by 1000 before it is recorded.
                onnx_output, avg_latency_ms = Gpt2Helper.onnxruntime_inference(session,
                                                                               onnx_runner.get_inputs(),
                                                                               total_runs=1)
                onnx_metric.add_latency(past_seq_len, avg_latency_ms / 1000.0)
                onnx_runner.update(onnx_output, step, device)

                output_shapes = Gpt2Helper.get_output_shapes(batch_size,
                                                             past_seq_len,
                                                             seq_len,
                                                             model.config,
                                                             model_class=model_class)
                Gpt2Helper.auto_increase_buffer_size(output_buffers, output_shapes)

                onnx_io_output, avg_latency_ms = Gpt2Helper.onnxruntime_inference_with_binded_io(
                    session,
                    onnx_io_runner.get_inputs(),
                    output_buffers,
                    output_shapes,
                    total_runs=1,
                    return_numpy=False,
                    include_copy_output_latency=True)
                onnx_io_metric.add_latency(past_seq_len, avg_latency_ms / 1000.0)
                onnx_io_runner.update(onnx_io_output, step, device)

                if verbose:
                    onnx_runner.diff(onnx_io_runner)
                    Gpt2Tester.diff_present(onnx_output, onnx_io_output, n_layer)

                    print("Top 1 tokens:")
                    print("\tTorch", torch_runner.top_1_tokens)
                    # Fixed: "\O" is an invalid escape sequence; a tab was intended.
                    print("\tONNX", onnx_runner.top_1_tokens)
                    print("\tONNX with IO binding", onnx_io_runner.top_1_tokens)

                onnx_metric.eval_batch(torch_runner, onnx_runner, past_seq_len, verbose=verbose)
                onnx_io_metric.eval_batch(torch_runner, onnx_io_runner, past_seq_len, verbose=verbose)

                # NOTE(review): `.any()` reduces the comparison to a single flag,
                # so every entry of `done` becomes True as soon as any sequence
                # in the batch emits EOS -- confirm that stopping the whole
                # batch at that point is intended.
                done = done | (torch_runner.top_1_tokens == eos_token_id).any()
                if torch.all(done):
                    break

        onnx_metric.end_batch()
        onnx_io_metric.end_batch()

    torch_metric.print()
    onnx_metric.print()
    onnx_io_metric.print()
def prepare_io_binding(ort_session,
                       input_ids,
                       position_ids,
                       attention_mask,
                       past,
                       output_buffers,
                       output_shapes,
                       beam_select_idx=None,
                       input_log_probs=None,
                       input_unfinished_sents=None,
                       prev_step_results=None,
                       prev_step_scores=None):
    """Return an IO binding object for a session.

    ``Gpt2Helper.prepare_io_binding`` creates the binding first. This function
    then binds the past states, the attention mask, the optional beam search
    inputs (any that are ``None`` are skipped), and every session output with
    an explicit numpy element type.

    The float type (float16 or float32) is taken from the dtype of the buffer
    that belongs to the session's second output.

    Raises:
        AssertionError: If a tensor that is being bound is not contiguous.
    """
    # Bind inputs and outputs to onnxruntime session
    io_binding = Gpt2Helper.prepare_io_binding(ort_session,
                                               input_ids,
                                               position_ids,
                                               attention_mask,
                                               past=past,
                                               output_buffers=output_buffers,
                                               output_shapes=output_shapes)

    # Bind inputs
    data_type = output_buffers[ort_session.get_outputs()[1].name].dtype
    float_type = numpy.float16 if data_type == torch.float16 else numpy.float32

    if past is not None:
        for i, past_i in enumerate(past):
            assert past_i.is_contiguous()

            data_ptr = past_i.data_ptr()
            if data_ptr == 0:
                # When past_sequence_length is 0, its data_ptr will be zero. IO Binding asserts that data_ptr shall not be zero.
                # Here we workaround and pass data pointer of input_ids. Actual data is not used for past so it does not matter.
                data_ptr = input_ids.data_ptr()

            io_binding.bind_input(f'past_{i}', past_i.device.type, 0, float_type, list(past_i.size()), data_ptr)

    # (input name, tensor, numpy element type), in binding order.
    # numpy.bool_ is used instead of numpy.bool: the latter is a deprecated
    # alias that is missing from NumPy 1.24-1.26.
    optional_inputs = (
        ("attention_mask", attention_mask, float_type),
        ("beam_select_idx", beam_select_idx, numpy.longlong),
        ("input_log_probs", input_log_probs, float_type),
        ("input_unfinished_sents", input_unfinished_sents, numpy.bool_),
        ("prev_step_results", prev_step_results, numpy.longlong),
        ("prev_step_scores", prev_step_scores, float_type),
    )
    for input_name, tensor, np_type in optional_inputs:
        if tensor is None:
            continue
        # The binding receives a raw data pointer, so the tensor must be contiguous.
        assert tensor.is_contiguous()
        io_binding.bind_input(
            input_name,
            tensor.device.type,
            0,
            np_type,
            list(tensor.size()),
            tensor.data_ptr(),
        )

    # Bind outputs
    int64_outputs = ("output_selected_indices", "last_state", "current_step_results")
    for output in ort_session.get_outputs():
        output_name = output.name
        output_buffer = output_buffers[output_name]
        logger.debug(f"{output_name} device type={output_buffer.device.type} shape={list(output_buffer.size())}")

        if output_name in int64_outputs:
            np_type = numpy.longlong
        elif output_name == "output_unfinished_sents":
            np_type = numpy.bool_
        else:
            np_type = float_type

        io_binding.bind_output(
            output_name,
            output_buffer.device.type,
            0,
            np_type,
            output_shapes[output_name],
            output_buffer.data_ptr(),
        )

    return io_binding
def main():
    """Export GPT-2 to ONNX, check parity, and optionally test generation.

    After parsing arguments, the model is exported to ONNX, optionally
    optimized (and quantized for INT8), and ``Gpt2Helper.test_parity`` is run
    when an ONNX Runtime session could be created. If ``args.input_test_file``
    is set, each line of that file is parsed as a JSON object with
    "input_ids", "position_ids" and "attention_mask", and the resulting
    inputs are passed to ``Gpt2Tester.test_generation``.

    Raises:
        AssertionError: For unsupported precision/option combinations.
    """
    args = parse_arguments()
    setup_logger(args.verbose)

    # A tolerance of 0 means "use the default for this precision".
    if args.tolerance == 0:
        args.tolerance = DEFAULT_TOLERANCE[args.precision]

    logger.info(f"Arguments:{args}")

    cache_dir = args.cache_dir
    if args.output.endswith(".onnx"):
        output_dir = os.path.dirname(args.output)
    else:
        output_dir = args.output
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    use_fp16 = args.precision == Precision.FLOAT16
    use_int8 = args.precision == Precision.INT8

    if args.precision != Precision.FLOAT32:
        assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx"
    if use_fp16:
        assert args.use_gpu, "fp16 requires --use_gpu"
    if use_int8:
        assert not args.use_gpu, "quantization only supports CPU"

    model_class = MODEL_CLASSES[args.model_class][0]
    config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=cache_dir)
    if hasattr(config, 'return_tuple'):
        config.return_tuple = True
    model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.eval().to(device)

    onnx_model_paths = Gpt2Helper.get_onnx_paths(output_dir, args.model_name_or_path, args.model_class)

    explicit_onnx_file = args.output.endswith('.onnx')
    raw_onnx_model = args.output if explicit_onnx_file else onnx_model_paths["raw"]

    plain_fp32_export = args.precision == Precision.FLOAT32 and not args.optimize_onnx
    if explicit_onnx_file or plain_fp32_export:
        output_path = raw_onnx_model
    else:
        output_path = onnx_model_paths[str(args.precision)]

    Gpt2Helper.export_onnx(model, device, raw_onnx_model, args.verbose)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        Gpt2Helper.optimize_onnx(raw_onnx_model, output_path, use_fp16, model.config.num_attention_heads,
                                 model.config.hidden_size)

    if use_int8:
        logger.info("quantizing model...")
        QuantizeHelper.quantize_onnx_model(output_path, output_path)
        model = QuantizeHelper.quantize_torch_model(model)
        logger.info("finished quantizing model")

    session = create_onnxruntime_session(output_path,
                                         args.use_gpu,
                                         enable_all_optimization=False,
                                         verbose=args.verbose)
    if session is not None:
        Gpt2Helper.test_parity(session,
                               model,
                               device,
                               use_fp16,
                               rtol=args.tolerance,
                               atol=args.tolerance,
                               model_class=args.model_class)

    if args.input_test_file:
        # The attention mask uses the float type that matches the model precision.
        mask_dtype = numpy.float16 if use_fp16 else numpy.float32

        def to_tensor(values, dtype):
            return torch.from_numpy(numpy.asarray(values, dtype=dtype)).to(device)

        test_inputs = []
        with open(args.input_test_file) as read_f:
            for line in read_f:
                data = json.loads(line.rstrip())
                input_ids = to_tensor(data["input_ids"], numpy.int64)
                position_ids = to_tensor(data["position_ids"], numpy.int64)
                attention_mask = to_tensor(data["attention_mask"], mask_dtype)
                test_inputs.append({
                    "input_ids": input_ids,
                    "position_ids": position_ids,
                    "attention_mask": attention_mask
                })

        Gpt2Tester.test_generation(session,
                                   model,
                                   device,
                                   test_inputs,
                                   precision=args.precision,
                                   model_class=args.model_class,
                                   top_k=20,
                                   top_k_no_order=True,
                                   max_steps=24,
                                   max_inputs=0,
                                   verbose=args.verbose)

    logger.info(f"Done. Output model: {output_path}")
def main():
    """Benchmark GPT-2 next-token latency of PyTorch against ONNX Runtime.

    The model named by ``args.model_name`` is loaded and exported to ONNX,
    then optionally optimized and (for INT8) quantized. Every combination of
    ``args.batch_sizes`` and ``args.past_sequence_lengths`` is timed with
    PyTorch, ONNX Runtime, and ONNX Runtime with IO binding, using a sequence
    length of 1. One row per combination is appended to the result CSV; a
    header row is written on every invocation.

    Returns:
        None. The function returns early when the ONNX Runtime session could
        not be created.

    Raises:
        AssertionError: For unsupported precision/device combinations.
    """
    args = parse_arguments()
    setup_logger(args.verbose)
    logger.info(f"Arguments:{args}")

    is_fp16 = args.precision == Precision.FLOAT16
    is_int8 = args.precision == Precision.INT8

    if is_fp16:
        assert args.optimize_onnx and args.use_gpu, "fp16 requires --optimize_onnx --use_gpu"

    if is_int8:
        assert not args.use_gpu, "quantization only supports CPU"

    torch.set_num_threads(psutil.cpu_count(logical=True) if args.thread_num <= 0 else args.thread_num)
    print(torch.__config__.parallel_info())

    cache_dir = args.cache_dir
    output_dir = args.onnx_dir
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    model_class = MODEL_CLASSES[args.model_class][0]

    config = AutoConfig.from_pretrained(args.model_name, torchscript=args.torchscript, cache_dir=cache_dir)
    if hasattr(config, 'return_tuple'):
        config.return_tuple = True
    model = model_class.from_pretrained(args.model_name, config=config, cache_dir=cache_dir)

    # This script does not support float16 for PyTorch.
    #if args.float16:
    #    model.half()

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.to(device)

    onnx_model_paths = Gpt2Helper.get_onnx_paths(output_dir, args.model_name, args.model_class)

    onnx_model_path = onnx_model_paths["raw"]
    Gpt2Helper.export_onnx(model, device, onnx_model_path, args.verbose)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        onnx_model_path = onnx_model_paths[str(args.precision)]
        Gpt2Helper.optimize_onnx(onnx_model_paths["raw"], onnx_model_path, is_fp16,
                                 model.config.num_attention_heads, model.config.hidden_size)

        if is_int8:
            logger.info("quantizing model...")
            # The quantized ONNX model overwrites the optimized one in place.
            QuantizeHelper.quantize_onnx_model(onnx_model_path, onnx_model_path)
            model = QuantizeHelper.quantize_torch_model(model)
            logger.info("finished quantizing model")

    if args.torchscript:
        model = Gpt2Helper.torchscript(model, config, device)

    session = create_onnxruntime_session(onnx_model_path,
                                         args.use_gpu,
                                         enable_all_optimization=False,
                                         num_threads=args.thread_num,
                                         verbose=args.verbose)
    if session is None:
        return

    # One word is generated for each inference. This length does not include that of past state.
    sequence_length = 1

    # Allocate output buffers for IO Binding, sized for the largest configuration.
    max_output_shapes = Gpt2Helper.get_output_shapes(max(args.batch_sizes), max(args.past_sequence_lengths),
                                                     sequence_length, config, args.model_class)
    output_buffers = Gpt2Helper.get_output_buffers(max_output_shapes, device, is_fp16)

    csv_filename = args.result_csv or "benchmark_result_{}.csv".format(datetime.now().strftime("%Y%m%d-%H%M%S"))
    with open(csv_filename, mode="a", newline='') as csv_file:
        column_names = [
            "model_name", "model_class", "gpu", "precision", "optimizer", "torchscript", "batch_size",
            "past_sequence_length", "torch_latency", "ort_latency", "ort_io_latency"
        ]
        csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        csv_writer.writeheader()

        for batch_size in args.batch_sizes:
            for past_sequence_length in args.past_sequence_lengths:
                logger.debug(
                    f"Running test for batch_size={batch_size} past_sequence_length={past_sequence_length}...")
                dummy_inputs = Gpt2Helper.get_dummy_inputs(batch_size, past_sequence_length, sequence_length,
                                                           config.num_attention_heads, config.hidden_size,
                                                           config.n_layer, config.vocab_size, device, is_fp16)
                output_shapes = Gpt2Helper.get_output_shapes(batch_size, past_sequence_length, sequence_length,
                                                             config, args.model_class)

                try:
                    outputs, torch_latency = Gpt2Helper.pytorch_inference(model, dummy_inputs, args.test_times)
                    ort_outputs, ort_latency = Gpt2Helper.onnxruntime_inference(session, dummy_inputs,
                                                                                args.test_times)
                    ort_io_outputs, ort_io_latency = Gpt2Helper.onnxruntime_inference_with_binded_io(
                        session, dummy_inputs, output_buffers, output_shapes, args.test_times)

                    if args.validate_onnx:
                        tolerance = DEFAULT_TOLERANCE[args.precision]
                        if Gpt2Helper.compare_outputs(outputs, ort_outputs, rtol=tolerance, atol=tolerance):
                            logger.info(
                                f'Pytorch and ONNX Runtime outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]}).'
                            )

                        if Gpt2Helper.compare_outputs(outputs, ort_io_outputs, rtol=tolerance, atol=tolerance):
                            logger.info(
                                f'Pytorch and ONNX Runtime IO Binding outputs are all close (tolerance={DEFAULT_TOLERANCE[args.precision]}).'
                            )

                    logger.info(
                        f"batch_size={batch_size}, past_sequence_length={past_sequence_length}, torch_latency={torch_latency:.2f}, ort_latency={ort_latency:.2f}, ort_io_latency={ort_io_latency:.2f}"
                    )

                    row = {
                        "model_name": args.model_name,
                        "model_class": args.model_class,
                        "gpu": args.use_gpu,
                        "precision": args.precision,
                        "optimizer": args.optimize_onnx,
                        "torchscript": args.torchscript,
                        "batch_size": batch_size,
                        "past_sequence_length": past_sequence_length,
                        "torch_latency": f"{torch_latency:.2f}",
                        "ort_latency": f"{ort_latency:.2f}",
                        "ort_io_latency": f"{ort_io_latency:.2f}"
                    }
                    csv_writer.writerow(row)
                except Exception:
                    # Best effort: a failing configuration is logged and the
                    # benchmark moves on. Only Exception is caught so that
                    # KeyboardInterrupt / SystemExit still stop the run.
                    logger.error("Exception", exc_info=True)

    logger.info(f"Results are saved to file {csv_filename}")
def main():
    """Export GPT-2 to ONNX, check parity, and optionally test generation.

    After parsing arguments, the model is exported to ONNX (using the external
    data format when the config has more than 24 layers), optionally optimized
    and, for INT8, quantized. ``Gpt2Helper.test_parity`` runs when an ONNX
    Runtime session could be created.

    If ``args.input_test_file`` is set, each line is parsed as a JSON object
    holding at least "input_ids". When the model class uses padding,
    "attention_mask" and "position_ids" are read from the line if present and
    derived from ``input_ids`` otherwise (token id -1 is treated as padding).
    The inputs are then passed to ``Gpt2Tester.test_generation``.

    Raises:
        AssertionError: For unsupported precision/option combinations.
    """
    args = parse_arguments()
    setup_logger(args.verbose)

    # A tolerance of 0 means "use the default for this precision".
    if args.tolerance == 0:
        args.tolerance = DEFAULT_TOLERANCE[args.precision]

    logger.info(f"Arguments:{args}")

    cache_dir = args.cache_dir
    if args.output.endswith(".onnx"):
        output_dir = os.path.dirname(args.output)
    else:
        output_dir = args.output
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    fp16_requested = args.precision == Precision.FLOAT16
    int8_requested = args.precision == Precision.INT8

    if args.precision != Precision.FLOAT32:
        assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx"
    if fp16_requested:
        assert args.use_gpu, "fp16 requires --use_gpu"
    if int8_requested:
        assert not args.use_gpu, "quantization only supports CPU"

    model_class = MODEL_CLASSES[args.model_class][0]
    config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=cache_dir)
    model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.eval().to(device)

    use_external_data_format = (config.n_layer > 24)  #TODO: find a way to check model size > 2GB
    onnx_model_paths = Gpt2Helper.get_onnx_paths(output_dir,
                                                 args.model_name_or_path,
                                                 args.model_class,
                                                 new_folder=use_external_data_format)

    explicit_onnx_file = args.output.endswith('.onnx')
    raw_onnx_model = args.output if explicit_onnx_file else onnx_model_paths["raw"]

    plain_fp32_export = args.precision == Precision.FLOAT32 and not args.optimize_onnx
    if explicit_onnx_file or plain_fp32_export:
        output_path = raw_onnx_model
    else:
        output_path = onnx_model_paths[str(args.precision)]

    logger.info(f"Exporting ONNX model to {raw_onnx_model}")
    use_padding = MODEL_CLASSES[args.model_class][2]
    Gpt2Helper.export_onnx(model,
                           device,
                           raw_onnx_model,
                           args.verbose,
                           use_external_data_format,
                           has_position_ids=use_padding,
                           has_attention_mask=use_padding)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        logger.info(f"Optimizing model to {output_path}")
        Gpt2Helper.optimize_onnx(raw_onnx_model, output_path, fp16_requested, model.config.num_attention_heads,
                                 model.config.hidden_size)

    if int8_requested:
        logger.info("quantizing model...")
        QuantizeHelper.quantize_onnx_model(output_path, output_path)
        model = QuantizeHelper.quantize_torch_model(model)
        logger.info("finished quantizing model")

    session = create_onnxruntime_session(output_path,
                                         args.use_gpu,
                                         enable_all_optimization=True,
                                         verbose=args.verbose)
    if session is not None:
        Gpt2Helper.test_parity(session,
                               model,
                               device,
                               fp16_requested,
                               rtol=args.tolerance,
                               atol=args.tolerance,
                               model_class=args.model_class,
                               has_position_ids=use_padding,
                               has_attention_mask=use_padding)

    if args.input_test_file:
        # The attention mask uses the float type that matches the model precision.
        numpy_float = numpy.float16 if fp16_requested else numpy.float32
        torch_float = torch.float16 if fp16_requested else torch.float32

        def to_tensor(values, dtype):
            return torch.from_numpy(numpy.asarray(values, dtype=dtype)).to(device)

        test_inputs = []
        # Each line of test file is a JSON string like:
        # {"input_ids": [[14698, 257, 1310, 13688, 319, 326]]}
        with open(args.input_test_file) as read_f:
            for line in read_f:
                data = json.loads(line.rstrip())
                input_ids = to_tensor(data["input_ids"], numpy.int64)

                if not use_padding:
                    test_inputs.append({"input_ids": input_ids})
                    continue

                if "attention_mask" in data:
                    attention_mask = to_tensor(data["attention_mask"], numpy_float)
                else:
                    # Derive the mask from the padding token, then replace padding by token 0.
                    padding = -1
                    attention_mask = (input_ids != padding).type(torch_float)
                    input_ids.masked_fill_(input_ids == padding, 0)

                if "position_ids" in data:
                    position_ids = to_tensor(data["position_ids"], numpy.int64)
                else:
                    # Running count of attended tokens minus one, clamped at zero.
                    position_ids = (attention_mask.long().cumsum(-1) - 1)
                    position_ids.masked_fill_(position_ids < 0, 0)

                test_inputs.append({
                    "input_ids": input_ids,
                    "position_ids": position_ids,
                    "attention_mask": attention_mask
                })

        Gpt2Tester.test_generation(session,
                                   model,
                                   device,
                                   test_inputs,
                                   precision=args.precision,
                                   model_class=args.model_class,
                                   top_k=20,
                                   top_k_no_order=True,
                                   max_steps=24,
                                   max_inputs=0,
                                   verbose=args.verbose)

    logger.info(f"Done. Output model: {output_path}")