def optimize_onnx_model(onnx_model_filename, model_type, num_attention_heads, hidden_size, fp16):
    optimized_model_filename = onnx_model_filename.replace(".onnx", "_fp16.onnx" if fp16 else "_fp32.onnx")
    if not os.path.exists(optimized_model_filename):
        import bert_model_optimization as bert_opt

        # Use onnxruntime to optimize the model; the intermediate result is saved to *_ort_cpu.onnx.
        # Only the fused-operator statistics of this pass are recorded here.
        opt_model = bert_opt.optimize_model(onnx_model_filename,
                                            model_type,
                                            num_heads=num_attention_heads,
                                            hidden_size=hidden_size,
                                            opt_level=99,
                                            only_onnxruntime=True)
        optimize_model_statistics[onnx_model_filename] = opt_model.get_fused_operator_statistics()

        # Use the optimization script, and record its statistics as well.
        opt_model = bert_opt.optimize_model(onnx_model_filename,
                                            model_type,
                                            num_heads=num_attention_heads,
                                            hidden_size=hidden_size,
                                            opt_level=0)
        optimize_model_statistics[optimized_model_filename] = opt_model.get_fused_operator_statistics()

        if fp16:
            opt_model.convert_model_float32_to_float16()
        opt_model.save_model_to_file(optimized_model_filename)
    else:
        logger.info(f"Skip optimization since model exists: {optimized_model_filename}")
    return optimized_model_filename
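# A minimal usage sketch of optimize_onnx_model (the model path, head count, and
# hidden size below are illustrative assumptions, not values from this repo;
# `optimize_model_statistics` must already exist as a module-level dict):
#
#     optimized_path = optimize_onnx_model("bert_base_cased.onnx",
#                                          model_type="bert",
#                                          num_attention_heads=12,
#                                          hidden_size=768,
#                                          fp16=True)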
def test_pytorch_model_0_gpu(self):
    if 'CUDAExecutionProvider' not in onnxruntime.get_available_providers():
        print("skip test_pytorch_model_0_gpu since no gpu found")
        return

    input = BERT_TEST_MODELS['bert_pytorch_0']
    bert_model = optimize_model(input,
                                'bert',
                                gpu_only=True,
                                num_heads=2,
                                hidden_size=8,
                                sequence_length=10,
                                input_int32=False,
                                float16=False)

    expected_node_count = {
        'EmbedLayerNormalization': 1,
        'Attention': 12,
        'SkipLayerNormalization': 24,
        'FastGelu': 12,
        'Gelu': 0,
        'BiasGelu': 0
    }
    self.verify_node_count(bert_model, expected_node_count)
def test_tensorflow_model_1_cpu(self):
    input = self.get_model("tensorflow", 1)

    # The model needs constant folding. Use onnxruntime to do it for now.
    temp = 'temp.onnx'
    run_onnxruntime(input, use_gpu=False, optimized_model_path=temp)

    bert_model = optimize_model(temp,
                                framework='tensorflow',
                                gpu_only=False,
                                num_heads=2,
                                hidden_size=8,
                                sequence_length=7,
                                input_int32=False,
                                float16=False,
                                verbose=False)
    os.remove(temp)

    # Optimization for TensorFlow models is still in progress.
    # TODO: update the expected counts once it is complete.
    expected_node_count = {
        'EmbedLayerNormalization': 0,
        'Attention': 0,
        'LayerNormalization': 0,
        'SkipLayerNormalization': 25,
        'BiasGelu': 0,
        'Gelu': 12,
        'FastGelu': 0
    }
    self.verify_node_count(bert_model, expected_node_count)
def test_keras_squad_model(self):
    input = BERT_TEST_MODELS['bert_keras_squad']
    bert_model = optimize_model(input, 'bert_keras', num_heads=2, hidden_size=8)
    self.assertTrue(bert_model.is_fully_optimized())
def test_pytorch_model_2_cpu(self):
    input = BERT_TEST_MODELS['bert_squad_pytorch1.4_opset10_fp32']
    bert_model = optimize_model(input,
                                'bert',
                                gpu_only=False,
                                num_heads=2,
                                hidden_size=8,
                                sequence_length=10,
                                input_int32=False,
                                float16=False)
    self.assertTrue(bert_model.is_fully_optimized())
def test_keras_squad_model_cpu(self):
    input = BERT_TEST_MODELS['bert_keras_squad']
    bert_model = optimize_model(input,
                                'bert_keras',
                                gpu_only=False,
                                num_heads=2,
                                hidden_size=8,
                                sequence_length=7,
                                input_int32=False,
                                float16=False)
    self.assertTrue(bert_model.is_fully_optimized())
def test_pytorch_model_0(self):
    input = BERT_TEST_MODELS['bert_pytorch_0']
    bert_model = optimize_model(input, 'bert', num_heads=2, hidden_size=8)

    expected_node_count = {
        'EmbedLayerNormalization': 1,
        'Attention': 12,
        'SkipLayerNormalization': 24,
        'Gelu': 0,
        'FastGelu': 0,
        'BiasGelu': 12
    }
    self.verify_node_count(bert_model, expected_node_count)
def test_gpt2(self):
    input = BERT_TEST_MODELS['gpt2']
    bert_model = optimize_model(input, 'gpt2', num_heads=2, hidden_size=4)

    expected_node_count = {
        'EmbedLayerNormalization': 0,
        'Attention': 12,
        'Gelu': 0,
        'FastGelu': 12,
        'BiasGelu': 0,
        'LayerNormalization': 25,
        'SkipLayerNormalization': 0
    }
    self.verify_node_count(bert_model, expected_node_count)
def test_pytorch_model_0_cpu(self):
    input = BERT_TEST_MODELS['bert_pytorch_0']
    bert_model = optimize_model(input,
                                'bert',
                                gpu_only=False,
                                num_heads=2,
                                hidden_size=8,
                                sequence_length=10,
                                input_int32=False,
                                float16=False)

    expected_node_count = {
        'EmbedLayerNormalization': 1,
        'Attention': 12,
        'SkipLayerNormalization': 24,
        'Gelu': 0,
        'FastGelu': 0,
        'BiasGelu': 12
    }
    self.verify_node_count(bert_model, expected_node_count)
def test_gpt2(self):
    input = BERT_TEST_MODELS['gpt2']
    bert_model = optimize_model(input,
                                'gpt2',
                                gpu_only=False,
                                num_heads=2,
                                hidden_size=4,
                                sequence_length=2,
                                input_int32=False,
                                float16=False)

    expected_node_count = {
        'EmbedLayerNormalization': 0,
        'Attention': 12,
        'Gelu': 0,
        'FastGelu': 12,
        'BiasGelu': 0,
        'LayerNormalization': 25,
        'SkipLayerNormalization': 0
    }
    self.verify_node_count(bert_model, expected_node_count)
def test_pytorch_model_0_cpu(self):
    input = self.get_model("pytorch", 0)
    bert_model = optimize_model(input,
                                framework='pytorch',
                                gpu_only=False,
                                num_heads=2,
                                hidden_size=8,
                                sequence_length=10,
                                input_int32=False,
                                float16=False,
                                verbose=False)

    expected_node_count = {
        'EmbedLayerNormalization': 1,
        'Attention': 12,
        'SkipLayerNormalization': 24,
        'Gelu': 12,
        'FastGelu': 0,
        'BiasGelu': 0
    }
    self.verify_node_count(bert_model, expected_node_count)
def test_pytorch_model_2(self):
    input = BERT_TEST_MODELS['bert_squad_pytorch1.4_opset10_fp32']
    bert_model = optimize_model(input, 'bert', num_heads=2, hidden_size=8)
    self.assertTrue(bert_model.is_fully_optimized())
def main():
    args = parse_arguments()
    setup_logger(args.verbose)
    dump_environment()

    enable_past_input = args.enable_past_input

    cache_dir = args.cache_dir
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    output_dir = args.output_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    (model_class, tokenizer_class, model_name_or_path) = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(model_name_or_path, cache_dir=cache_dir)
    model = model_class.from_pretrained(model_name_or_path, cache_dir=cache_dir)
    model.eval().cpu()

    inputs = tokenizer.encode_plus("Here is an example input for GPT2 model",
                                   add_special_tokens=True,
                                   return_tensors='pt')
    input_ids = inputs['input_ids']
    outputs = model(input_ids=input_ids, past=None)

    num_layer = model.config.n_layer
    present_names = [f'present_{i}' for i in range(num_layer)]
    output_names = ["last_state"] + present_names

    input_names = ['input_ids']
    dynamic_axes = {
        'input_ids': {0: 'batch_size', 1: 'seq_len'},
        'last_state': {0: 'batch_size', 1: 'seq_len'}
    }
    for name in present_names:
        dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len'}

    if enable_past_input:
        past_names = [f'past_{i}' for i in range(num_layer)]
        input_names = ['input_ids'] + past_names
        dummy_past = [torch.zeros(list(outputs[1][0].shape)) for _ in range(num_layer)]
        for name in past_names:
            dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len'}
        export_inputs = (inputs['input_ids'], tuple(dummy_past))
    else:
        # Note the trailing comma: torch.onnx.export expects a tuple of inputs.
        export_inputs = (inputs['input_ids'], )

    export_model_path = os.path.join(output_dir, 'gpt2_past{}.onnx'.format(int(enable_past_input)))

    torch.onnx.export(model,
                      args=export_inputs,
                      f=export_model_path,
                      input_names=input_names,
                      output_names=output_names,
                      dynamic_axes=dynamic_axes,
                      opset_version=11,
                      do_constant_folding=True,
                      verbose=False)

    # Run the PyTorch performance test before updating environment variables.
    past = dummy_past if enable_past_input else None
    outputs = pytorch_inference(model, input_ids, past, total_runs=args.total_runs)

    # Set up environment variables before importing onnxruntime.
    setup_environment(args.use_openmp)
    import onnxruntime

    onnx_model_path = export_model_path if enable_past_input else remove_past_outputs(export_model_path)

    if args.enable_optimization:
        from bert_model_optimization import optimize_model
        m = optimize_model(onnx_model_path,
                           model_type='gpt2',
                           gpu_only=False,
                           num_heads=12,
                           hidden_size=768,
                           sequence_length=64,
                           input_int32=False,
                           float16=False,
                           opt_level=0)
        onnx_model_path = os.path.join(output_dir, 'gpt2_past{}_optimized.onnx'.format(int(enable_past_input)))
        m.save_model_to_file(onnx_model_path)

    if 'CUDAExecutionProvider' in onnxruntime.get_available_providers():
        logger.warning("onnxruntime-gpu is not built with OpenMP. You might try the onnxruntime package to test CPU inference.")

    sess_options = onnxruntime.SessionOptions()
    if args.use_openmp:
        sess_options.intra_op_num_threads = 1
    else:
        sess_options.intra_op_num_threads = psutil.cpu_count(logical=True)
    logger.info(f"session option: intra_op_num_threads={sess_options.intra_op_num_threads}")

    logger.info(f"Start inferencing onnx model: {onnx_model_path}")
    session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=['CPUExecutionProvider'])
    ort_outputs = onnxruntime_inference(session, input_ids, past, args.total_runs)

    if args.verify_outputs:
        logger.info('PyTorch and OnnxRuntime output 0 (last_state) are close: %s',
                    numpy.allclose(ort_outputs[0], outputs[0].cpu(), rtol=1e-05, atol=1e-04))
        for layer in range(model.config.n_layer):
            logger.info('PyTorch and OnnxRuntime layer %d state (present_%d) are close: %s', layer, layer,
                        numpy.allclose(ort_outputs[1 + layer], outputs[1][layer].cpu(), rtol=1e-05, atol=1e-04))
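# Standard script entry point (assumed; not shown in the excerpt above).
if __name__ == "__main__":
    main()

# Example invocation (a sketch: the script file name is hypothetical, and the
# flags assume the usual argparse convention that each attribute read from
# `args` above maps to a same-named command-line option):
#
#     python gpt2_export_benchmark.py --model_type gpt2 --cache_dir ./cache \
#         --output_dir ./onnx_models --total_runs 100 --enable_optimization \
#         --verify_outputs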