def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths,
                    repeat_times, input_counts, optimize_onnx, validate_onnx, cache_dir, onnx_dir, verbose, overwrite,
                    disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics, model_source):
    import onnxruntime

    results = []
    if use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()):
        logger.error(
            "Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
        )
        return results

    if (not use_gpu) and ('CUDAExecutionProvider' in onnxruntime.get_available_providers()):
        logger.warning("Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance.")

    for model_name in model_names:
        all_input_names = MODELS[model_name][0]
        for num_inputs in input_counts:
            if num_inputs > len(all_input_names):
                break

            input_names = all_input_names[:num_inputs]

            if 'pt' in model_source:
                with torch.no_grad():
                    onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_pt(
                        model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class,
                        cache_dir, onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx,
                        use_raw_attention_mask, overwrite, model_fusion_statistics)
            if 'tf' in model_source:
                onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length = export_onnx_model_from_tf(
                    model_name, MODELS[model_name][1], MODELS[model_name][2], MODELS[model_name][3], model_class,
                    cache_dir, onnx_dir, input_names, use_gpu, precision, optimize_onnx, validate_onnx,
                    use_raw_attention_mask, overwrite, model_fusion_statistics)

            if not is_valid_onnx_model:
                continue

            ort_session = create_onnxruntime_session(onnx_model_file,
                                                     use_gpu,
                                                     enable_all_optimization=True,
                                                     num_threads=num_threads,
                                                     verbose=verbose)
            if ort_session is None:
                continue

            ort_output_names = [node_arg.name for node_arg in ort_session.get_outputs()]
            output_buffers = {"last_state": None, "pooler": None}
            device = "cuda" if use_gpu else "cpu"
            config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
            max_last_state_size = numpy.prod(
                [max(batch_sizes), max(sequence_lengths),
                 max(vocab_size, config.hidden_size)])
            max_pooler_size = numpy.prod([max(batch_sizes), config.hidden_size])

            for batch_size in batch_sizes:
                if batch_size <= 0:
                    continue

                for sequence_length in sequence_lengths:
                    if max_sequence_length is not None and sequence_length > max_sequence_length:
                        continue

                    input_value_type = numpy.int64 if 'pt' in model_source else numpy.int32
                    ort_inputs = create_onnxruntime_input(vocab_size, batch_size, sequence_length, input_names,
                                                          input_value_type)

                    result_template = {
                        "engine": "onnxruntime",
                        "version": onnxruntime.__version__,
                        "device": device,
                        "optimizer": optimize_onnx,
                        "precision": precision,
                        "io_binding": not disable_ort_io_binding,
                        "model_name": model_name,
                        "inputs": num_inputs,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "datetime": str(datetime.now()),
                    }

                    logger.info("Run onnxruntime on {} with input shape {}".format(model_name,
                                                                                   [batch_size, sequence_length]))
                    if disable_ort_io_binding:
                        result = inference_ort(ort_session, ort_inputs, result_template, repeat_times, batch_size)
                    else:
                        # Get output sizes from a dummy ort run
                        ort_outputs = ort_session.run(ort_output_names, ort_inputs)

                        data_type = numpy.longlong if 'pt' in model_source else numpy.int32
                        result = inference_ort_with_io_binding(ort_session, ort_inputs, result_template, repeat_times,
                                                               ort_output_names, ort_outputs, output_buffers,
                                                               max_last_state_size, max_pooler_size, batch_size,
                                                               device, data_type)

                    logger.info(result)
                    results.append(result)

    return results
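# Example usage (a minimal sketch, not part of the original script): run the
# CPU benchmark path for a single model. The model key "bert-base-cased", the
# directory names, and the "fp32" precision value are illustrative assumptions;
# any key defined in MODELS and any precision accepted by the export helpers
# works the same way.
def _example_run_onnxruntime_cpu():
    return run_onnxruntime(use_gpu=False,
                           model_names=["bert-base-cased"],  # assumed to be a key in MODELS
                           model_class="AutoModel",
                           precision="fp32",  # assumed precision value
                           num_threads=4,
                           batch_sizes=[1, 4],
                           sequence_lengths=[32, 128],
                           repeat_times=100,
                           input_counts=[1],
                           optimize_onnx=True,
                           validate_onnx=True,
                           cache_dir="./cache_models",
                           onnx_dir="./onnx_models",
                           verbose=False,
                           overwrite=False,
                           disable_ort_io_binding=False,
                           use_raw_attention_mask=True,
                           model_fusion_statistics={},
                           model_source="pt")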
def test_ort_latency(device,
                     model,
                     model_name,
                     description,
                     ort_session,
                     batch_sizes,
                     sequence_lengths,
                     global_lengths,
                     test_times,
                     num_threads,
                     optimizer=False,
                     precision='fp32',
                     validate_onnx=True,
                     disable_io_binding=False,
                     verbose=True):
    results = []
    for batch_size in batch_sizes:
        for sequence_length in sequence_lengths:
            for global_length in global_lengths:
                assert global_length <= model.config.attention_window[
                    0], "Limitation of current implementation: number of global token <= attention_window"

                print(
                    f"Testing batch_size={batch_size} sequence_length={sequence_length} global_length={global_length} optimizer={optimizer}, precision={precision} io_binding={not disable_io_binding}..."
                )
                dummy_inputs: LongformerInputs = LongformerHelper.get_dummy_inputs(batch_size, sequence_length,
                                                                                   global_length, device)

                # Run OnnxRuntime
                ort_inputs = dummy_inputs.get_ort_inputs()

                if verbose:
                    print(ort_inputs)

                # run one query for warm up
                ort_outputs = ort_session.run(None, ort_inputs)

                result_template = {
                    "model_name": model_name,
                    "description": description,
                    "inputs": 3,
                    "engine": "OnnxRuntime",
                    "version": onnxruntime.__version__,
                    "device": "cuda",
                    "precision": precision,
                    "optimizer": optimizer,
                    "threads": num_threads,
                    "batch_size": batch_size,
                    "sequence_length": sequence_length,
                    "global_length": global_length,
                    "test_times": test_times,
                    "datetime": str(datetime.now()),
                    "memory": "",
                }

                if not disable_io_binding:
                    max_last_state_size = max(batch_sizes) * max(sequence_lengths) * model.config.hidden_size
                    max_pooler_size = max(batch_sizes) * max(sequence_lengths)
                    result = benchmark_helper.inference_ort_with_io_binding(
                        ort_session,
                        ort_inputs,
                        result_template=result_template,
                        repeat_times=test_times,
                        ort_output_names=["last_state", "pooler"],
                        ort_outputs=ort_outputs,
                        output_buffers=[],
                        output_buffer_max_sizes=[max_last_state_size, max_pooler_size],
                        batch_size=batch_size,
                        device=device,
                        data_type=np.longlong,  # input data type
                    )
                else:
                    result = benchmark_helper.inference_ort(ort_session,
                                                            ort_inputs,
                                                            result_template=result_template,
                                                            repeat_times=test_times,
                                                            batch_size=batch_size)

                if validate_onnx:
                    max_diff = test_parity(device, model, ort_session, batch_size, sequence_length, global_length,
                                           verbose)
                    result["description"] += f"(max_diff={max_diff})"

                results.append(result)
    return results
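# For reference, a minimal standalone sketch of the IOBinding pattern that
# benchmark_helper.inference_ort_with_io_binding builds on. It uses only the
# public onnxruntime API; the helper additionally reuses pre-allocated output
# buffers across repeated runs, which this sketch does not show.
def _io_binding_sketch(ort_session, ort_inputs, output_names):
    io_binding = ort_session.io_binding()
    # Bind each input from its CPU numpy array.
    for name, value in ort_inputs.items():
        io_binding.bind_cpu_input(name, value)
    # Let onnxruntime allocate each output on the session's device.
    for name in output_names:
        io_binding.bind_output(name)
    # Run without the per-call input/output copies of the regular run() path.
    ort_session.run_with_iobinding(io_binding)
    return io_binding.copy_outputs_to_cpu()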
def run_onnxruntime(
    use_gpu,
    provider,
    model_names,
    model_class,
    config_modifier,
    precision,
    num_threads,
    batch_sizes,
    sequence_lengths,
    repeat_times,
    input_counts,
    optimizer_info,
    validate_onnx,
    cache_dir,
    onnx_dir,
    verbose,
    overwrite,
    disable_ort_io_binding,
    use_raw_attention_mask,
    model_fusion_statistics,
    model_source,
    args,
):
    import onnxruntime

    results = []
    if (use_gpu and ("CUDAExecutionProvider" not in onnxruntime.get_available_providers())
            and ("ROCMExecutionProvider" not in onnxruntime.get_available_providers())):
        logger.error(
            "Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
        )
        return results

    warm_up_repeat = 0
    if provider == "tensorrt":
        optimizer_info = OptimizerInfo.NOOPT
        warm_up_repeat = 5
        if "TensorrtExecutionProvider" not in onnxruntime.get_available_providers():
            logger.error(
                "Please install onnxruntime-gpu-tensorrt package, and use a machine with GPU for testing gpu performance."
            )
            return results

    if optimizer_info == OptimizerInfo.NOOPT:
        logger.warning(
            f"OptimizerInfo is set to {optimizer_info}, graph optimizations specified in FusionOptions are not applied."
        )

    for model_name in model_names:
        all_input_names = MODELS[model_name][0]
        for num_inputs in input_counts:
            if num_inputs > len(all_input_names):
                break

            input_names = all_input_names[:num_inputs]
            args.model_type = MODELS[model_name][3]
            fusion_options = FusionOptions.parse(args)

            if "pt" in model_source:
                with torch.no_grad():
                    (
                        onnx_model_file,
                        is_valid_onnx_model,
                        vocab_size,
                        max_sequence_length,
                    ) = export_onnx_model_from_pt(
                        model_name,
                        MODELS[model_name][1],
                        MODELS[model_name][2],
                        MODELS[model_name][3],
                        model_class,
                        config_modifier,
                        cache_dir,
                        onnx_dir,
                        input_names,
                        use_gpu,
                        precision,
                        optimizer_info,
                        validate_onnx,
                        use_raw_attention_mask,
                        overwrite,
                        model_fusion_statistics,
                        fusion_options,
                    )
            if "tf" in model_source:
                (
                    onnx_model_file,
                    is_valid_onnx_model,
                    vocab_size,
                    max_sequence_length,
                ) = export_onnx_model_from_tf(
                    model_name,
                    MODELS[model_name][1],
                    MODELS[model_name][2],
                    MODELS[model_name][3],
                    model_class,
                    config_modifier,
                    cache_dir,
                    onnx_dir,
                    input_names,
                    use_gpu,
                    precision,
                    optimizer_info,
                    validate_onnx,
                    use_raw_attention_mask,
                    overwrite,
                    model_fusion_statistics,
                    fusion_options,
                )

            if not is_valid_onnx_model:
                continue

            ort_session = create_onnxruntime_session(
                onnx_model_file,
                use_gpu,
                provider,
                enable_all_optimization=True,
                num_threads=num_threads,
                verbose=verbose,
            )
            if ort_session is None:
                continue

            ort_output_names = [node_arg.name for node_arg in ort_session.get_outputs()]
            output_buffers = []
            device = "cuda" if use_gpu else "cpu"
            config = AutoConfig.from_pretrained(model_name, cache_dir=cache_dir)
            max_last_state_size = numpy.prod([
                max(batch_sizes),
                max(sequence_lengths),
                max(vocab_size, config.hidden_size),
            ])
            max_pooler_size = numpy.prod([max(batch_sizes), config.hidden_size])

            for batch_size in batch_sizes:
                if batch_size <= 0:
                    continue

                for sequence_length in sequence_lengths:
                    if max_sequence_length is not None and sequence_length > max_sequence_length:
                        continue

                    input_value_type = numpy.int64 if "pt" in model_source else numpy.int32
                    ort_inputs = create_onnxruntime_input(
                        vocab_size,
                        batch_size,
                        sequence_length,
                        input_names,
                        config,
                        input_value_type,
                    )
                    result_template = {
                        "engine": "onnxruntime",
                        "version": onnxruntime.__version__,
                        "providers": provider,
                        "device": device,
                        "optimizer": optimizer_info,
                        "precision": precision,
                        "io_binding": not disable_ort_io_binding,
                        "model_name": model_name,
                        "inputs": num_inputs,
                        "threads": num_threads,
                        "batch_size": batch_size,
                        "sequence_length": sequence_length,
                        "custom_layer_num": config_modifier.get_layer_num(),
                        "datetime": str(datetime.now()),
                    }

                    logger.info("Run onnxruntime on {} with input shape {}".format(model_name,
                                                                                   [batch_size, sequence_length]))
                    if disable_ort_io_binding:
                        result = inference_ort(
                            ort_session,
                            ort_inputs,
                            result_template,
                            repeat_times,
                            batch_size,
                            warm_up_repeat,
                        )
                    else:
                        # Get output sizes from a dummy ort run
                        ort_outputs = ort_session.run(ort_output_names, ort_inputs)
                        output_buffer_max_sizes = [max_last_state_size]
                        for i in range(len(ort_outputs)):
                            if i == 2 and MODELS[model_name][3] == "gpt":
                                # past state output max size
                                output_buffer_max_sizes.append(max_pooler_size)
                            else:
                                output_buffer_max_sizes.append(max_last_state_size)

                        data_type = numpy.longlong if "pt" in model_source else numpy.intc
                        result = inference_ort_with_io_binding(
                            ort_session,
                            ort_inputs,
                            result_template,
                            repeat_times,
                            ort_output_names,
                            ort_outputs,
                            output_buffers,
                            output_buffer_max_sizes,
                            batch_size,
                            device,
                            data_type,
                            warm_up_repeat,
                        )

                    logger.info(result)
                    results.append(result)

    return results
def test_ort_latency(
    device,
    model,
    model_name,
    description,
    ort_session,
    batch_sizes,
    sequence_lengths,
    global_lengths,
    test_times,
    num_threads,
    optimizer=False,
    precision="fp32",
    disable_io_binding=False,
    verbose=True,
    use_compact_memory=False,
    use_half4=False,
    disable_parity=False,
) -> List[Dict[str, Any]]:
    results = []
    for batch_size in batch_sizes:
        for sequence_length in sequence_lengths:
            for global_length in global_lengths:
                assert (
                    global_length <= model.config.attention_window[0]
                ), "Limitation of current implementation: number of global token <= attention_window"

                logger.info(
                    f"Testing batch_size={batch_size} sequence_length={sequence_length} global_length={global_length} "
                    f"optimizer={optimizer}, precision={precision} io_binding={not disable_io_binding}...")
                dummy_inputs: LongformerInputs = LongformerHelper.get_dummy_inputs(batch_size, sequence_length,
                                                                                   global_length, device)

                # Run OnnxRuntime
                ort_inputs = dummy_inputs.get_ort_inputs()

                if verbose:
                    print(ort_inputs)

                # run one query for warm up
                ort_outputs = ort_session.run(None, ort_inputs)

                result_template = {
                    "model_name": model_name,
                    "description": description,
                    "inputs": 3,
                    "engine": "OnnxRuntime",
                    "version": str(onnxruntime.__version__),
                    "device": "cuda",
                    "precision": str(precision),
                    "optimizer": int(optimizer),
                    "threads": int(num_threads),
                    "batch_size": int(batch_size),
                    "sequence_length": int(sequence_length),
                    "global_length": int(global_length),
                    "test_times": int(test_times),
                    "datetime": str(datetime.now()),
                    "memory": "",
                    "diff_max": None,
                    "diff_90_percentile": None,
                    "diff_95_percentile": None,
                    "diff_99_percentile": None,
                    "use_compact_memory": use_compact_memory,
                    "use_half4": use_half4,
                }

                if not disable_io_binding:
                    max_last_state_size = max(batch_sizes) * max(sequence_lengths) * model.config.hidden_size
                    max_pooler_size = max(batch_sizes) * max(sequence_lengths)
                    result = benchmark_helper.inference_ort_with_io_binding(
                        ort_session,
                        ort_inputs,
                        result_template=result_template,
                        repeat_times=test_times,
                        ort_output_names=["last_state", "pooler"],
                        ort_outputs=ort_outputs,
                        output_buffers=[],
                        output_buffer_max_sizes=[max_last_state_size, max_pooler_size],
                        batch_size=batch_size,
                        device=device,
                        data_type=np.longlong,  # input data type
                    )
                else:
                    result = benchmark_helper.inference_ort(
                        ort_session,
                        ort_inputs,
                        result_template=result_template,
                        repeat_times=test_times,
                        batch_size=batch_size,
                    )

                # measure result difference between PyTorch and OnnxRuntime
                if not disable_parity:
                    diff_results = [
                        test_parity(
                            device,
                            model,
                            ort_session,
                            batch_size,
                            sequence_length,
                            global_length,
                            verbose,
                        ) for _ in range(test_times)
                    ]

                    result["diff_max"] = max(diff_results)
                    result["diff_90_percentile"] = np.percentile(diff_results, 90)
                    result["diff_95_percentile"] = np.percentile(diff_results, 95)
                    result["diff_99_percentile"] = np.percentile(diff_results, 99)

                results.append(result)
    return results
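# Example usage (a hypothetical sketch): measure latency for one Longformer
# configuration. The checkpoint name and the ONNX file path are assumptions for
# illustration; the real script builds the session from a model exported and
# optionally optimized by its own export helpers.
def _example_test_ort_latency():
    import torch
    from transformers import LongformerModel

    model_name = "allenai/longformer-base-4096"  # hypothetical checkpoint
    model = LongformerModel.from_pretrained(model_name)
    session = onnxruntime.InferenceSession("longformer-base-4096.onnx",  # hypothetical path
                                           providers=["CUDAExecutionProvider"])
    return test_ort_latency(
        device=torch.device("cuda"),
        model=model,
        model_name=model_name,
        description="fp32 baseline",
        ort_session=session,
        batch_sizes=[1],
        sequence_lengths=[1024],
        global_lengths=[8],
        test_times=100,
        num_threads=0,
    )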
def test_onnxruntime(device,
                     model,
                     model_name,
                     ort_session,
                     batch_sizes,
                     sequence_lengths,
                     global_lengths,
                     test_times,
                     num_threads,
                     optimizer=False,
                     precision='fp32',
                     verbose=True):
    results = []
    for batch_size in batch_sizes:
        for sequence_length in sequence_lengths:  # This is total length of <query, document>.
            # global_length is the length of <query>: a short query (8) for search keywords,
            # and a longer query (16) for question-like queries.
            for global_length in global_lengths:
                print(
                    f"Testing batch_size={batch_size} sequence_length={sequence_length} global_length={global_length} optimizer={optimizer}, precision={precision}..."
                )
                input_ids, attention_mask, global_attention_mask = get_dummy_inputs(batch_size, sequence_length,
                                                                                    global_length, device)

                # Run OnnxRuntime
                ort_inputs = {
                    "input_ids": input_ids.cpu().numpy(),
                    "attention_mask": attention_mask.cpu().numpy(),
                    "global_attention_mask": global_attention_mask.cpu().numpy()
                }

                if verbose:
                    pprint.pprint(ort_inputs)

                # run one query for warm up
                ort_outputs = ort_session.run(None, ort_inputs)

                if verbose:
                    # Run PyTorch then compare the results with OnnxRuntime.
                    torch_outputs = model(input_ids,
                                          attention_mask=attention_mask,
                                          global_attention_mask=global_attention_mask)

                    max_diff = diff_outputs(ort_outputs, torch_outputs)
                    print("max diff for outputs", max_diff)
                    if max(max_diff) > 0.001:
                        print("ort_inputs", ort_inputs)
                        print("ort_outputs", ort_outputs)

                device = input_ids.device
                result_template = {
                    "model_name": model_name,
                    "inputs": 3,
                    "engine": "OnnxRuntime",
                    "version": onnxruntime.__version__,
                    "device": "cuda",
                    "precision": precision,
                    "optimizer": optimizer,
                    "threads": num_threads,
                    "batch_size": batch_size,
                    "sequence_length": sequence_length,
                    "global_length": global_length,
                    "test_times": test_times,
                    "datetime": str(datetime.now()),
                }

                max_last_state_size = max(batch_sizes) * max(sequence_lengths) * model.config.hidden_size
                max_pooler_size = max(batch_sizes) * max(sequence_lengths)
                """
                result = benchmark_helper.inference_ort_with_io_binding(
                    ort_session,
                    ort_inputs,
                    result_template=result_template,
                    repeat_times=test_times,
                    ort_output_names=["last_state", "pooler"],
                    ort_outputs=ort_outputs,
                    output_buffers=[],
                    output_buffer_max_sizes=[max_last_state_size, max_pooler_size],
                    batch_size=batch_size,
                    device=device)
                """
                result = benchmark_helper.inference_ort(ort_session,
                                                        ort_inputs,
                                                        result_template=result_template,
                                                        repeat_times=test_times,
                                                        batch_size=batch_size)
                pprint.pprint(result)
                results.append(result)
    return results
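# A hypothetical sketch of what a get_dummy_inputs helper like the one called
# above might look like: random token ids, an attention mask marking every
# token as attended, and global attention on the first global_length positions
# (matching the "global tokens are the query" convention in the comments
# above). The vocab_size default is an assumption.
def _example_get_dummy_inputs(batch_size, sequence_length, global_length, device, vocab_size=30522):
    import torch

    input_ids = torch.randint(low=0,
                              high=vocab_size - 1,
                              size=(batch_size, sequence_length),
                              dtype=torch.long,
                              device=device)
    attention_mask = torch.ones(batch_size, sequence_length, dtype=torch.long, device=device)
    global_attention_mask = torch.zeros(batch_size, sequence_length, dtype=torch.long, device=device)
    # Longformer convention: 1 marks a position with global attention.
    global_attention_mask[:, :global_length] = 1
    return input_ids, attention_mask, global_attention_mask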