def main():
    args = parse_arguments()
    setup_logger(args.verbose)

    if args.tolerance == 0:
        args.tolerance = DEFAULT_TOLERANCE[args.precision]

    logger.info(f"Arguments:{args}")

    cache_dir = args.cache_dir
    output_dir = args.output if not args.output.endswith(".onnx") else os.path.dirname(args.output)
    prepare_environment(cache_dir, output_dir, args.use_gpu)

    if args.precision != Precision.FLOAT32:
        assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx"

    if args.precision == Precision.FLOAT16:
        assert args.use_gpu, "fp16 requires --use_gpu"

    if args.precision == Precision.INT8:
        assert not args.use_gpu, "quantization only supports CPU"

    model_class = MODEL_CLASSES[args.model_class][0]
    config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=cache_dir)
    model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir)

    device = torch.device("cuda:0" if args.use_gpu else "cpu")
    model.eval().to(device)

    use_external_data_format = (config.n_layer > 24)  # TODO: find a way to check model size > 2GB

    onnx_model_paths = Gpt2Helper.get_onnx_paths(output_dir,
                                                 args.model_name_or_path,
                                                 args.model_class,
                                                 new_folder=use_external_data_format)

    raw_onnx_model = args.output if args.output.endswith('.onnx') else onnx_model_paths["raw"]
    output_path = raw_onnx_model if (
        args.output.endswith('.onnx') or
        (args.precision == Precision.FLOAT32 and not args.optimize_onnx)) else onnx_model_paths[str(args.precision)]

    logger.info(f"Exporting ONNX model to {raw_onnx_model}")
    use_padding = MODEL_CLASSES[args.model_class][2]
    Gpt2Helper.export_onnx(model,
                           device,
                           raw_onnx_model,
                           args.verbose,
                           use_external_data_format,
                           has_position_ids=use_padding,
                           has_attention_mask=use_padding)

    if args.optimize_onnx or args.precision != Precision.FLOAT32:
        logger.info(f"Optimizing model to {output_path}")
        Gpt2Helper.optimize_onnx(raw_onnx_model, output_path, args.precision == Precision.FLOAT16,
                                 model.config.num_attention_heads, model.config.hidden_size)

    if args.precision == Precision.INT8:
        logger.info("quantizing model...")
        QuantizeHelper.quantize_onnx_model(output_path, output_path)
        model = QuantizeHelper.quantize_torch_model(model)
        logger.info("finished quantizing model")

    session = create_onnxruntime_session(output_path, args.use_gpu, enable_all_optimization=True, verbose=args.verbose)
    if session is not None:
        Gpt2Helper.test_parity(session,
                               model,
                               device,
                               args.precision == Precision.FLOAT16,
                               rtol=args.tolerance,
                               atol=args.tolerance,
                               model_class=args.model_class,
                               has_position_ids=use_padding,
                               has_attention_mask=use_padding)

    if args.input_test_file:
        test_inputs = []
        # Each line of test file is a JSON string like:
        # {"input_ids": [[14698, 257, 1310, 13688, 319, 326]]}
        with open(args.input_test_file) as read_f:
            for i, line in enumerate(read_f):
                line = line.rstrip()
                data = json.loads(line)
                input_ids = torch.from_numpy(numpy.asarray(data["input_ids"], dtype=numpy.int64)).to(device)

                if use_padding:
                    if "attention_mask" in data:
                        numpy_float = numpy.float16 if args.precision == Precision.FLOAT16 else numpy.float32
                        attention_mask = torch.from_numpy(numpy.asarray(data["attention_mask"],
                                                                        dtype=numpy_float)).to(device)
                    else:
                        padding = -1
                        attention_mask = (input_ids != padding).type(
                            torch.float16 if args.precision == Precision.FLOAT16 else torch.float32)
                        input_ids.masked_fill_(input_ids == padding, 0)

                    if "position_ids" in data:
                        position_ids = torch.from_numpy(numpy.asarray(data["position_ids"],
                                                                      dtype=numpy.int64)).to(device)
                    else:
                        position_ids = (attention_mask.long().cumsum(-1) - 1)
                        position_ids.masked_fill_(position_ids < 0, 0)

                    inputs = {
                        "input_ids": input_ids,
                        "position_ids": position_ids,
                        "attention_mask": attention_mask
                    }
                else:
                    inputs = {"input_ids": input_ids}

                test_inputs.append(inputs)

        Gpt2Tester.test_generation(session,
                                   model,
                                   device,
                                   test_inputs,
                                   precision=args.precision,
                                   model_class=args.model_class,
                                   top_k=20,
                                   top_k_no_order=True,
                                   max_steps=24,
                                   max_inputs=0,
                                   verbose=args.verbose)

    logger.info(f"Done. Output model: {output_path}")
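# Entry point and an illustrative invocation (an assumption, not part of the
# original script): the module is expected to be run from the command line,
# with --input_test_file pointing at a JSON Lines file where each record
# carries "input_ids" and, optionally, "position_ids" and "attention_mask":
#   {"input_ids": [[14698, 257, 1310, 13688, 319, 326]]}
# The flag spellings below are inferred from the args attributes and the
# assertion messages in main(), and the script name is hypothetical; the
# actual parser definition may differ:
#   python convert_to_onnx.py --model_name_or_path gpt2 --output gpt2.onnx \
#       --optimize_onnx --input_test_file test_inputs.jsonl
if __name__ == "__main__":
    main()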