def test_attention_fusion(self):
    """Check that the optimized BERT attention graph matches the stored reference.

    The model returned by ``create_bert_attention()`` is saved to a temporary
    file in the current directory, optimized with ``optimize_model`` using its
    defaults, and the resulting graph is compared (as text) with
    ``test_data/models/attention_opt.onnx``.
    """
    model = create_bert_attention()
    output_dir = '.'
    model_path = os.path.join(output_dir, "attention.onnx")
    try:
        onnx.save(model, model_path)
        optimized_model = optimize_model(model_path)
    finally:
        # Always clean up the temporary model, even when saving or
        # optimization fails, so a failing run leaves no stray file behind.
        if os.path.exists(model_path):
            os.remove(model_path)

    expected_model_path = os.path.join(os.path.dirname(__file__), 'test_data', 'models',
                                       'attention_opt.onnx')
    expected = onnx.load(expected_model_path)
    self.assertEqual(str(optimized_model.model.graph), str(expected.graph))
def test_gpt2_past_mask(self):
    """Optimize the 'gpt2_past_mask' test model as 'gpt2' and check operator counts."""
    # Per-operator counts handed to verify_node_count for the optimized model.
    fused_op_counts = {
        'EmbedLayerNormalization': 0,
        'Attention': 1,
        'Gelu': 0,
        'FastGelu': 1,
        'BiasGelu': 0,
        'LayerNormalization': 2,
        'SkipLayerNormalization': 0
    }
    onnx_path = _get_test_model_path('gpt2_past_mask')
    optimized = optimize_model(onnx_path, 'gpt2', num_heads=2, hidden_size=4)
    self.verify_node_count(optimized, fused_op_counts, 'test_gpt2_past_mask')
def test_3d_attention_fusion_tf2onnx_model(self):
    """Check 3D attention fusion on a tf2onnx-style model against the stored reference.

    The model returned by ``create_tf2onnx_attention_3d()`` is saved to a
    temporary file in the current directory, optimized as model type
    ``'bert_tf'`` (4 heads, hidden size 16), and the resulting graph is
    compared (as text) with ``test_data/models/bert_3d_attention_opt.onnx``.
    """
    model = create_tf2onnx_attention_3d()
    output_dir = '.'
    model_path = os.path.join(output_dir, 'bert_3d_attention.onnx')
    try:
        onnx.save(model, model_path)
        optimized_model = optimize_model(model_path, model_type='bert_tf', num_heads=4, hidden_size=16)
    finally:
        # Always clean up the temporary model, even when saving or
        # optimization fails, so a failing run leaves no stray file behind.
        if os.path.exists(model_path):
            os.remove(model_path)

    expected_model_path = os.path.join(os.path.dirname(__file__), 'test_data', 'models',
                                       'bert_3d_attention_opt.onnx')
    expected = onnx.load(expected_model_path)
    self.assertEqual(str(optimized_model.model.graph), str(expected.graph))
def test_attention_fusion_for_varied_qkv_dimensions(self):
    """Check attention fusion when the Q/K and V hidden sizes differ.

    A model is built with ``pruned_qk_hidden_size=24`` and
    ``pruned_v_hidden_size=16``, saved to a temporary file in the current
    directory, optimized with ``optimize_model`` defaults, and the resulting
    graph is compared (as text) with
    ``test_data/models/attention_with_varied_qkv_opt.onnx``.
    """
    model = create_bert_attention(input_hidden_size=16,
                                  num_heads=2,
                                  pruned_qk_hidden_size=24,
                                  pruned_v_hidden_size=16)
    output_dir = '.'
    model_path = os.path.join(output_dir, "attention_with_varied_qkv.onnx")
    try:
        onnx.save(model, model_path)
        optimized_model = optimize_model(model_path)
    finally:
        # Always clean up the temporary model, even when saving or
        # optimization fails, so a failing run leaves no stray file behind.
        if os.path.exists(model_path):
            os.remove(model_path)

    expected_model_path = os.path.join(os.path.dirname(__file__), 'test_data', 'models',
                                       'attention_with_varied_qkv_opt.onnx')
    expected = onnx.load(expected_model_path)
    self.assertEqual(str(optimized_model.model.graph), str(expected.graph))
def test_multiple_embed(self):
    """Optimize the 'multiple_embed' test model as 'bert' and check operator counts."""
    # Per-operator counts handed to verify_node_count for the optimized model.
    fused_op_counts = {
        'EmbedLayerNormalization': 2,
        'Attention': 2,
        'Gelu': 0,
        'FastGelu': 0,
        'BiasGelu': 0,
        'LayerNormalization': 0,
        'SkipLayerNormalization': 0
    }
    source_path = _get_test_model_path('multiple_embed')
    optimized = optimize_model(source_path, 'bert', num_heads=2, hidden_size=4)
    self.verify_node_count(optimized, fused_op_counts, 'test_multiple_embed')
def test_attention_fusion_reverse_add_order(self):
    """Check that attention fusion does not depend on the Add input order.

    A pruned attention model is built with ``switch_add_inputs=True``, saved
    to a temporary file in the current directory, optimized with
    ``optimize_model`` defaults, and the resulting graph is compared (as text)
    with ``test_data/models/pruned_attention_opt.onnx``.
    """
    model = create_bert_attention(input_hidden_size=16,
                                  num_heads=2,
                                  pruned_qk_hidden_size=8,
                                  pruned_v_hidden_size=8,
                                  switch_add_inputs=True)
    output_dir = '.'
    model_path = os.path.join(output_dir, "bert_attention_reverse_add_order.onnx")
    try:
        onnx.save(model, model_path)
        optimized_model = optimize_model(model_path)
    finally:
        # Always clean up the temporary model, even when saving or
        # optimization fails, so a failing run leaves no stray file behind.
        if os.path.exists(model_path):
            os.remove(model_path)

    # reverse add input order will get same optimized model
    expected_model_path = os.path.join(os.path.dirname(__file__), 'test_data', 'models',
                                       'pruned_attention_opt.onnx')
    expected = onnx.load(expected_model_path)
    self.assertEqual(str(optimized_model.model.graph), str(expected.graph))
def test_fusions(self):
    """Export each model in ``test_cases`` to ONNX and check that its operator is fused.

    Every entry of ``test_cases`` is a ``(source, operator, model_class)``
    triple. The model is exported with a 3-element float32 input, optimized
    as model type ``'bert'``, and the optimized model is expected to contain
    exactly one node of type ``operator``.
    """
    # Make the parent directory importable before importing the optimizer.
    sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
    from onnxruntime.transformers.optimizer import optimize_model

    for source, operator, model_class in test_cases:
        model = model_class()
        dummy_input = torch.ones(3, dtype=torch.float32)
        test_name = f"{operator}_{source}"
        onnx_path = f"{test_name}.onnx"
        try:
            # Pass the example input as an explicit one-element tuple.
            torch.onnx.export(model, (dummy_input,), onnx_path, input_names=['input'], output_names=['output'])
            optimizer = optimize_model(onnx_path, 'bert')
            # optimizer.save_model_to_file(f"{operator}_{source}_opt.onnx")
        finally:
            # Remove the exported file even if export or optimization fails,
            # so one failing case does not leave artifacts behind.
            if os.path.exists(onnx_path):
                os.remove(onnx_path)

        expected_node_count = {operator: 1}
        self.verify_node_count(optimizer, expected_node_count, test_name)
dummy_dataloader, benchmark=True) latency = np.array(results).mean() / args.eval_batch_size print('Latency: {:.3f} ms'.format(latency * 1000)) print('Throughput: {:.3f} items/sec'.format(args.eval_batch_size * 1. / latency)) print('--------------------------------------------------------------') if args.tune: from onnxruntime.transformers import optimizer from onnxruntime.transformers.onnx_model_bert import BertOptimizationOptions opt_options = BertOptimizationOptions('bert') opt_options.enable_embed_layer_norm = False model_optimizer = optimizer.optimize_model( args.model_path, 'bert', num_heads=12, hidden_size=768, optimization_options=opt_options) model = model_optimizer.model from lpot.experimental import Quantization, common quantize = Quantization(args.config) quantize.model = common.Model(model) quantize.calib_dataloader = eval_dataloader quantize.eval_func = eval_func q_model = quantize() q_model.save(args.output_model)
def export_model_to_onnx(self, fpath, quantize=False, target_opset=None, verbose=1):
    """
    ```
    Export model to ONNX by first exporting to TFLite and then converting
    the TFLite file with the tf2onnx command-line tool.

    Args:
      fpath(str): String representing full path to model file where ONNX model will be saved.
                  Example: '/tmp/my_model.onnx'
      quantize(bool): If True, the exported ONNX model is additionally quantized using
                      transformers.convert_graph_to_onnx.quantize. For Hugging Face
                      TFDistilBertForSequenceClassification and TFBertForSequenceClassification
                      models, BERT-specific onnxruntime optimizations are attempted first
                      (best effort: a warning is issued if they fail) and the optimized
                      model overwrites fpath before quantization.
      target_opset: currently unused by this method; accepted for backward compatibility.
      verbose(bool): verbosity
    Returns:
      str: string representing path to the exported model.  If quantize=True, the returned
           path is that of the quantized model and differs from the supplied fpath.
    Raises:
      Exception: if the ONNX libraries are not installed, if the model was created with
                 text_classifier("bert",...) (keras_bert), or if a Hugging Face model is
                 exported with tensorflow<2.2.
    ```
    """
    try:
        import onnxruntime, onnx
    except ImportError as err:
        raise Exception('This method requires ONNX libraries to be installed: '+\
                        'pip install -q --upgrade onnxruntime==1.10.0 onnx sympy tf2onnx') from err
    import subprocess
    import sys
    from pathlib import Path

    if type(self.preproc).__name__ == 'BERTPreprocessor':
        raise Exception('currently_unsupported: BERT models created with text_classifier("bert",...) are not supported (i.e., keras_bert models). ' +\
                        'Only BERT models created with Transformer(...) are supported.')

    if verbose:
        print('converting to ONNX format by way of TFLite ... this may take a few moments...')
    if U.is_huggingface(model=self.model):
        tokenizer = self.preproc.get_tokenizer()
        maxlen = self.preproc.maxlen
        input_dict = tokenizer('Name', return_tensors='tf', padding='max_length', max_length=maxlen)

        if version.parse(tf.__version__) < version.parse('2.2'):
            raise Exception('export_model_to_tflite requires tensorflow>=2.2')
        self.model._saved_model_inputs_spec = None  # for tf > 2.2
        self.model._set_save_spec(input_dict)  # for tf > 2.2
        self.model._get_save_spec()

    tflite_model_path = self.export_model_to_tflite(fpath + '-TFLITE_TMP', verbose=verbose)

    if verbose:
        print('converting to ONNX using tf2onnx...')
    # Build the command as an argument list (rather than splitting a formatted
    # string) so that paths containing whitespace are passed through intact,
    # and run tf2onnx with the interpreter that is executing this code.
    convert_cmd = [
        sys.executable or 'python', '-m', 'tf2onnx.convert',
        '--tflite', str(tflite_model_path),
        '--output', str(fpath),
    ]
    proc = subprocess.run(convert_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if verbose:
        print(proc.returncode)
        # Tool output may contain non-ASCII bytes; never fail just to print it.
        print(proc.stdout.decode('ascii', errors='replace'))
        print(proc.stderr.decode('ascii', errors='replace'))
    if proc.returncode != 0:
        # Surface a failed conversion even when verbose output is disabled.
        warnings.warn('tf2onnx conversion exited with non-zero return code %s' % proc.returncode)
    return_fpath = fpath

    if quantize:
        # Imported under an alias so the `quantize` argument is not shadowed.
        from transformers.convert_graph_to_onnx import quantize as quantize_onnx_model

        if U.is_huggingface(model=self.model) and\
           type(self.model).__name__ in ['TFDistilBertForSequenceClassification', 'TFBertForSequenceClassification']:
            try:
                from onnxruntime.transformers import optimizer
                from onnxruntime.transformers.onnx_model_bert import BertOptimizationOptions
                # disable embedding layer norm optimization for better model size reduction
                opt_options = BertOptimizationOptions('bert')
                opt_options.enable_embed_layer_norm = False
                opt_model = optimizer.optimize_model(
                    fpath,
                    'bert',  # bert_keras causes error with transformers
                    num_heads=12,
                    hidden_size=768,
                    optimization_options=opt_options)
                opt_model.save_model_to_file(fpath)
            except Exception:
                # Best effort: quantization below still runs on the unoptimized
                # model. KeyboardInterrupt/SystemExit are no longer swallowed.
                warnings.warn('Could not run BERT-specific optimizations')
        quantize_path = quantize_onnx_model(Path(fpath))
        return_fpath = quantize_path.as_posix()
    if verbose:
        print('done.')
    return return_fpath