def quantize(models_name_or_path):
    """
    Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPUs.

    Uses unsigned ints for activation values and signed ints for weights; per
    https://onnxruntime.ai/docs/performance/quantization.html#data-type-selection
    this is faster on most CPU architectures.

    Args:
        models_name_or_path: Paths to the locations where the exported ONNX models are stored

    Returns:
        The paths generated for the quantized models
    """
    from onnxruntime.quantization import quantize_dynamic, QuantType

    bar = Bar("Quantizing...", max=3)

    quant_model_paths = []
    for model in models_name_or_path:
        model_name = model.as_posix()
        output_model_name = f"{model_name[:-5]}-quantized.onnx"
        quantize_dynamic(
            model_input=model_name,
            model_output=output_model_name,
            per_channel=True,
            reduce_range=True,  # should be the same as per_channel
            activation_type=QuantType.QUInt8,
            weight_type=QuantType.QInt8,  # per docs, signed weights are faster on most CPUs
            optimize_model=False,
        )  # op_types_to_quantize=['MatMul', 'Relu', 'Add', 'Mul'],
        quant_model_paths.append(output_model_name)
        bar.next()

    bar.finish()
    return tuple(quant_model_paths)
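# A minimal usage sketch for the quantize() helper above, not part of the original
# snippet: the model file names are assumptions, and it presumes the helper's
# progress Bar class (e.g. progress.bar.Bar) is already imported in the module.
from pathlib import Path

import onnxruntime as ort

model_paths = [Path("encoder.onnx"), Path("decoder.onnx")]  # hypothetical exported models
if all(path.exists() for path in model_paths):
    quantized_paths = quantize(model_paths)
    # A dynamically quantized model loads like any other ONNX model; only the
    # weights and the integer operators inside the graph have changed.
    session = ort.InferenceSession(quantized_paths[0], providers=["CPUExecutionProvider"])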
def test_quantize_batch_size_2(self):
    batch = 2
    hidden_size = 4
    sequence_length = 4

    model_f32_path = 'test_embed_layer_norm_unit_test_batch2.onnx'
    model_uint8_path = 'test_embed_layer_norm_unit_test_batch2_uint8.onnx'

    self.construct_model(batch, hidden_size, sequence_length, model_f32_path)

    data_reader = self.input_feeds_int32(
        1, {
            'input_ids': [batch, sequence_length],
            'segment_ids': [batch, sequence_length]
        })

    quantize_dynamic(model_f32_path, model_uint8_path)

    # Quantization should not have any DequantizeLinear nodes:
    qnode_counts = {'DequantizeLinear': 0, 'QEmbedLayerNormalization': 1}
    check_op_type_count(self, model_uint8_path, **qnode_counts)

    data_reader.rewind()
    check_model_correctness(self, model_f32_path, model_uint8_path, data_reader.get_next())
def dynamic_quant_test(
    self,
    model_fp32_path,
    data_reader,
    activation_type,
    weight_type,
    extra_options={},
):
    activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
    activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8"
    weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8"
    model_int8_path = "gemm_fp32.quant_dynamic_{}{}.onnx".format(
        activation_type_str, weight_type_str)

    quantize_dynamic(
        model_fp32_path,
        model_int8_path,
        weight_type=weight_type,
        extra_options=extra_options,
    )

    quant_nodes = {"MatMulInteger": 2}
    check_op_type_count(self, model_int8_path, **quant_nodes)
    qnode_io_qtypes = {"MatMulInteger": [["i", 2, activation_proto_qtype]]}
    check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes)

    data_reader.rewind()
    check_model_correctness(
        self,
        model_fp32_path,
        model_int8_path,
        {"input": np.random.rand(5, 10).astype(np.float32)},
    )
def dynamic_quant_conv(self, model_fp32_path, model_int8_path):
    quantize_dynamic(model_fp32_path, model_int8_path)
    quant_nodes = {'ConvInteger': 2}
    check_op_type_count(self, model_int8_path, **quant_nodes)
    check_model_correctness(
        self, model_fp32_path, model_int8_path,
        {'input': np.random.rand(4, 2, 8, 8).astype(np.float32)})
def test_quantize_batch_size_1(self):
    batch = 1
    hidden_size = 4
    sequence_length = 4

    model_f32_path = "test_embed_layer_norm_unit_test_batch1.onnx"
    model_uint8_path = "test_embed_layer_norm_unit_test_batch1_uint8.onnx"

    self.construct_model(batch, hidden_size, sequence_length, model_f32_path)

    data_reader = self.input_feeds_int32(
        1,
        {
            "input_ids": [batch, sequence_length],
            "segment_ids": [batch, sequence_length],
        },
    )

    quantize_dynamic(model_f32_path, model_uint8_path)

    # Quantization should not have any DequantizeLinear nodes:
    qnode_counts = {"DequantizeLinear": 0, "QEmbedLayerNormalization": 1}
    check_op_type_count(self, model_uint8_path, **qnode_counts)

    data_reader.rewind()
    check_model_correctness(self, model_f32_path, model_uint8_path, data_reader.get_next())
def quantize(models_name_or_path):
    """
    Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPUs.

    Args:
        models_name_or_path: Paths to the locations where the exported ONNX models are stored

    Returns:
        The paths generated for the quantized models
    """
    import onnx
    from onnxruntime.quantization import quantize, quantize_dynamic, QuantType

    bar = Bar('Quantizing...', max=3)

    quant_model_paths = []
    for model in models_name_or_path:
        model_name = model.as_posix()
        output_model_name = f'{model_name[:-5]}-quantized.onnx'
        quantize_dynamic(
            model_input=model_name,
            model_output=output_model_name,
            per_channel=True,
            activation_type=QuantType.QUInt8,
            weight_type=QuantType.QUInt8,
            # optimize_model=False,
        )  # op_types_to_quantize=['MatMul', 'Relu', 'Add', 'Mul'],
        quant_model_paths.append(output_model_name)
        bar.next()

    bar.finish()
    return tuple(quant_model_paths)
def dynamic_quant_conv_test(self, activation_type, weight_type, extra_options={}):
    np.random.seed(1)
    model_fp32_path = 'conv_bias.fp32.onnx'
    self.construct_model(model_fp32_path)

    activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
    activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8'
    weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
    model_int8_path = 'conv_bias.quant.{}{}.onnx'.format(
        activation_type_str, weight_type_str)

    quantize_dynamic(model_fp32_path,
                     model_int8_path,
                     activation_type=activation_type,
                     weight_type=weight_type,
                     extra_options=extra_options)

    quant_nodes = {'ConvInteger': 2}
    check_op_type_count(self, model_int8_path, **quant_nodes)
    qnode_io_qtypes = {'ConvInteger': [['i', 2, activation_proto_qtype]]}
    check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes)
    check_model_correctness(
        self, model_fp32_path, model_int8_path,
        {'input': np.random.rand(4, 2, 8, 8).astype(np.float32)})
def dynamic_quant_test(self, model_fp32_path, data_reader, activation_type, weight_type, extra_options={}):
    activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
    activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8'
    weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
    model_int8_path = 'gemm_fp32.quant_dynamic_{}{}.onnx'.format(
        activation_type_str, weight_type_str)

    quantize_dynamic(model_fp32_path,
                     model_int8_path,
                     activation_type=activation_type,
                     weight_type=weight_type,
                     extra_options=extra_options)

    quant_nodes = {'MatMulInteger': 2}
    check_op_type_count(self, model_int8_path, **quant_nodes)
    qnode_io_qtypes = {'MatMulInteger': [['i', 2, activation_proto_qtype]]}
    check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes)

    data_reader.rewind()
    check_model_correctness(
        self, model_fp32_path, model_int8_path,
        {'input': np.random.rand(5, 10).astype(np.float32)})
def dynamic_quant_conv_test(self, weight_type, extra_options={}):
    np.random.seed(1)
    model_fp32_path = "conv_bias.fp32.onnx"
    self.construct_model(model_fp32_path)

    activation_proto_qtype = TensorProto.UINT8
    activation_type_str = "u8"
    weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8"
    model_int8_path = "conv_bias.quant.{}{}.onnx".format(
        activation_type_str, weight_type_str)

    quantize_dynamic(
        model_fp32_path,
        model_int8_path,
        weight_type=weight_type,
        extra_options=extra_options,
    )

    quant_nodes = {"ConvInteger": 2}
    check_op_type_count(self, model_int8_path, **quant_nodes)
    qnode_io_qtypes = {"ConvInteger": [["i", 2, activation_proto_qtype]]}
    check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes)
    check_model_correctness(
        self,
        model_fp32_path,
        model_int8_path,
        {"input": np.random.rand(4, 2, 8, 8).astype(np.float32)},
    )
def quantize_model(
    self,
    model_fp32_path,
    model_i8_path,
    data_reader=None,
    activation_type=QuantType.QUInt8,
    weight_type=QuantType.QUInt8,
    extra_options={},
):
    if data_reader is not None:
        quantize_static(
            model_fp32_path,
            model_i8_path,
            data_reader,
            reduce_range=True,
            quant_format=QuantFormat.QOperator,
            activation_type=activation_type,
            weight_type=weight_type,
            extra_options=extra_options,
        )
    else:
        quantize_dynamic(
            model_fp32_path,
            model_i8_path,
            reduce_range=True,
            weight_type=weight_type,
            extra_options=extra_options,
        )
def quantize_model(self, model_fp32_path, model_i8_path, data_reader=None):
    if data_reader is not None:
        quantize_static(model_fp32_path, model_i8_path, data_reader, reduce_range=True)
    else:
        quantize_dynamic(model_fp32_path, model_i8_path, reduce_range=True)
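# A minimal sketch (an assumption, not taken from the snippets above) of the
# calibration data reader that quantize_static() expects as its third argument;
# quantize_dynamic() needs no reader. The input name "input" and the shape used
# here are hypothetical placeholders for the model's real inputs.
import numpy as np
from onnxruntime.quantization import CalibrationDataReader


class RandomDataReader(CalibrationDataReader):
    def __init__(self, num_batches=8, shape=(1, 10)):
        self._data = [{"input": np.random.rand(*shape).astype(np.float32)}
                      for _ in range(num_batches)]
        self._iter = iter(self._data)

    def get_next(self):
        # Return one feed dict per call; None signals that calibration data is exhausted.
        return next(self._iter, None)

    def rewind(self):
        # The test helpers above also call rewind() on their readers before re-running inference.
        self._iter = iter(self._data)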
def quantize_onnx_model(onnx_model_path, quantized_model_path):
    print("Starting quantization...")

    from onnxruntime.quantization import quantize_dynamic, QuantType

    quantize_dynamic(onnx_model_path, quantized_model_path, weight_type=QuantType.QUInt8)
    print(f"Quantized model saved to: {quantized_model_path}")
def dynamic_attention_quant_test(self, model_fp32_path, model_int8_path, per_channel, reduce_range):
    quantize_dynamic(model_fp32_path,
                     model_int8_path,
                     per_channel=per_channel,
                     reduce_range=reduce_range)
    quant_nodes = {'QAttention': 1, 'MatMulInteger': 1}
    check_op_type_count(self, model_int8_path, **quant_nodes)
    check_model_correctness(
        self, model_fp32_path, model_int8_path,
        {'input': np.random.rand(1, 5, 10).astype(np.float32)})
def main():
    init_logging('quantizeOnnx.log')
    args = parse_args()
    modelFilePath = args.modelFilePath
    quantizedModelFilePath = args.quantizedModelFilePath
    datasetDir = args.datasetDir
    # dr = ModelCalibrationDataReader(datasetDir, modelFilePath)
    # quantize_static(modelFilePath, quantizedModelFilePath, dr)
    quantize_dynamic(modelFilePath, quantizedModelFilePath, weight_type=QuantType.QInt8)
    print('Calibrated and quantized model saved.')
def _maybe_quantize(cls):
    logger.info("Saving quantized model")

    # Download model
    model_fp32 = f"/tmp_models/{cls.__name__}/model.onnx"
    model_quant = f"/models/{cls.__name__}/1/model.onnx"
    os.makedirs(os.path.dirname(model_fp32), exist_ok=True)
    os.makedirs(os.path.dirname(model_quant), exist_ok=True)

    url = cls.MODEL_URL
    r = requests.get(url)
    # Persist the downloaded full-precision model so quantize_dynamic can read it.
    with open(model_fp32, "wb") as f:
        f.write(r.content)

    quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8)
    logger.info(f"Quantized model saved to: {model_quant}")

    full_size = os.path.getsize(model_fp32) / (1024 * 1024)
    quant_size = os.path.getsize(model_quant) / (1024 * 1024)
    logger.info(f"ONNX full precision model size (MB): {full_size}")
    logger.info(f"ONNX quantized model size (MB): {quant_size}")
def _preprocess(self):
    """Preprocess/preparation operations before the benchmarking.

    Return:
        True if _preprocess() succeeds.
    """
    if not super()._preprocess():
        return False

    import onnxruntime as ort
    self.__graph_opt_level = {
        0: ort.GraphOptimizationLevel.ORT_DISABLE_ALL,
        1: ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
        2: ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED,
        3: ort.GraphOptimizationLevel.ORT_ENABLE_ALL,
    }

    for model in self._args.pytorch_models:
        if hasattr(torchvision.models, model):
            data_type = Precision.FLOAT16.value if self._args.precision == Precision.FLOAT16 \
                else Precision.FLOAT32.value
            model_path = f'{self.__model_cache_path / (model + "." + data_type + ".onnx")}'
            torch.onnx.export(
                getattr(torchvision.models, model)(pretrained=True).to(
                    dtype=getattr(torch, data_type)).cuda(),
                torch.randn(self._args.batch_size, 3, 224, 224, device='cuda',
                            dtype=getattr(torch, data_type)),
                model_path,
                input_names=['input'],
            )
            if self._args.precision == Precision.INT8:
                file_name = '{model}.{precision}.onnx'.format(
                    model=model, precision=self._args.precision)
                # For quantization with ONNXRuntime, refer to
                # https://onnxruntime.ai/docs/performance/quantization.html#quantization-overview
                from onnxruntime.quantization import quantize_dynamic
                quantize_dynamic(model_path, f'{self.__model_cache_path / file_name}')
        else:
            logger.error('Cannot find PyTorch model %s.', model)
            return False

    return True
def convert_to_onnx(self, onnx_output_dir=None, set_onnx_arg=True):
    """Convert the model to ONNX format and save to output_dir

    Args:
        onnx_output_dir (str, optional): If specified, ONNX model will be saved to output_dir (else args.output_dir will be used). Defaults to None.
        set_onnx_arg (bool, optional): Updates the model args to set onnx=True. Defaults to True.
    """  # noqa
    if not onnx_output_dir:
        onnx_output_dir = os.path.join(self.options.output_dir, self.options.model_type,
                                       self.options.model_name, "onnx")
    os.makedirs(onnx_output_dir, exist_ok=True)

    if not os.listdir(onnx_output_dir):
        onnx_model_name = os.path.join(onnx_output_dir, "onnx_model.onnx")
        with tempfile.TemporaryDirectory() as temp_dir:
            basedir = os.path.basename(temp_dir)
            temp_dir = os.path.join(self.options.output_dir, basedir)
            self.save_model(output_dir=temp_dir, model=self.model)
            convert(
                framework="pt",
                model=temp_dir,
                tokenizer=self.tokenizer,
                output=Path(onnx_model_name),
                pipeline_name="ner",
                opset=11,
            )
        self.tokenizer.save_pretrained(onnx_output_dir)
        self.config.save_pretrained(onnx_output_dir)

    onnx_options = SessionOptions()
    use_cuda = True if self._device.type != 'cpu' else False
    onnx_execution_provider = "CUDAExecutionProvider" if use_cuda else "CPUExecutionProvider"
    onnx_options.intra_op_num_threads = 1
    onnx_options.execution_mode = ExecutionMode.ORT_SEQUENTIAL

    onnx_model_path = os.path.join(onnx_output_dir, "onnx_model.onnx")
    if self.options.dynamic_quantize:
        # Append "-quantized" at the end of the model's name
        quantized_model_path = generate_identified_filename(
            Path(onnx_model_path), "-quantized")
        quantize_dynamic(Path(onnx_model_path), quantized_model_path)
        onnx_model_path = quantized_model_path.as_posix()

    return InferenceSession(onnx_model_path, onnx_options, providers=[onnx_execution_provider])
def benchmark_onnxruntime(path_to_model, repeat=1000, number=1, warmup=100, quantize=False):
    """
    Parameters
    ----------
    path_to_model: str or onnx.ModelProto
        Path to an onnx model.
    repeat: int
        Repetition of experiment. Default: 1000
    number: int
        Number of forward passes in each experiment. Default: 1
    warmup: int
        Number of disregarded experiments. Default: 100
    quantize: bool
        Dynamically quantize the model with default parameters.

    Returns
    -------
    info: dict
        Information about the size and min, max, mean, std of the time of the experiments.
    """
    assert repeat >= 2 * warmup

    if quantize:
        import onnx
        from onnx import version_converter
        from onnxruntime.quantization import quantize_dynamic

        orig_model = onnx.load(path_to_model)
        if orig_model.opset_import[0].version < 11:
            converted_model = version_converter.convert_version(orig_model, 11)
            path_to_model = '/tmp/model_conv.onnx'
            with open(path_to_model, 'wb') as f:
                f.write(converted_model.SerializeToString())
            del orig_model, converted_model
        path_to_quant_model = "/tmp/model_quant.onnx"
        model = quantize_dynamic(path_to_model, path_to_quant_model)
        size = os.path.getsize(path_to_quant_model)
        sess = ort.InferenceSession(path_to_quant_model)
    else:
        size = os.path.getsize(path_to_model)
        sess = ort.InferenceSession(path_to_model)

    inputs = {
        x.name: np.random.randn(*get_shape(x)).astype(get_type(x))
        for x in sess.get_inputs()
    }

    def _benchmark():
        output = sess.run(None, inputs)

    res = dict(size=size, input_size=[tuple(x.shape) for x in inputs.values()])
    res.update(benchmark_speed(_benchmark, repeat, number, warmup))
    return res
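# A hypothetical comparison run with benchmark_onnxruntime() above (not part of the
# original snippet); "model.onnx" is an assumed path, and get_shape/get_type/
# benchmark_speed are assumed to come from the same module as the function.
baseline = benchmark_onnxruntime("model.onnx", repeat=200, warmup=20)
quantized = benchmark_onnxruntime("model.onnx", repeat=200, warmup=20, quantize=True)
# The returned dict always contains the on-disk model size, so the quantization
# savings can be reported alongside the timing statistics.
print(f"model size: {baseline['size']} -> {quantized['size']} bytes")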
def quantization_optimize(optimization_config):
    logger.info("ONNX model quantization started")
    base_dir = os.path.dirname(optimization_config.model_path)
    unquantized_model = os.path.join(base_dir, "unquantized_model.onnx")
    copy(optimization_config.model_path, unquantized_model)
    try:
        quantization.quantize_dynamic(unquantized_model, optimization_config.model_path)
        default_ep = "CUDAExecutionProvider" if "CUDAExecutionProvider" in ort.get_available_providers() \
            else "CPUExecutionProvider"
        ort.InferenceSession(optimization_config.model_path, providers=[default_ep])
        logger.info("ONNX model quantized successfully")
    except Exception as e:
        logger.info(
            "Quantization optimization failed with error {}. Original model will be used for optimization."
            .format(e))
        copy(unquantized_model, optimization_config.model_path)
def quantize_onnx_model(onnx_model_path, quantized_model_path, use_external_data_format=False):
    from pathlib import Path
    from onnxruntime.quantization import quantize_dynamic

    Path(quantized_model_path).parent.mkdir(parents=True, exist_ok=True)
    logger.info(
        f"Size of full precision ONNX model (MB): {os.path.getsize(onnx_model_path) / (1024 * 1024)}"
    )
    quantize_dynamic(
        onnx_model_path,
        quantized_model_path,
        use_external_data_format=use_external_data_format,
    )
    logger.info(f"Quantized model saved to: {quantized_model_path}")
    # TODO: include external data in total model size.
    logger.info(
        f"Size of quantized ONNX model (MB): {os.path.getsize(quantized_model_path) / (1024 * 1024)}"
    )
def quantize_model(onnx_model):
    import onnx
    from onnxruntime.quantization import quantize_dynamic, QuantType

    path = Path(onnx_model)
    parent_path = path.parent
    output_path = os.path.join(parent_path, 'model-dyn-quant.onnx')
    quantized_model = quantize_dynamic(onnx_model, output_path, weight_type=QuantType.QUInt8)
    return output_path
def quantize_onnx_model(
    onnx_model_path: Union[Path, str],
    quantized_model_path: Union[Path, str],
    qtype: str = "qint8",
    verbose: bool = False,
) -> None:
    """Takes a model converted to ONNX Runtime format and applies dynamic quantization.

    Args:
        onnx_model_path: path to onnx model.
        quantized_model_path: path to quantized model.
        qtype: Type of weights in quantized model. Can be `quint8` or `qint8`.
            Defaults to "qint8".
        verbose: If set to True prints model size before and after quantization.
            Defaults to False.

    Raises:
        ValueError: If qtype is not understood.
    """
    type_mapping = {
        "qint8": QuantType.QInt8,
        "quint8": QuantType.QUInt8,
    }
    if qtype not in type_mapping.keys():
        raise ValueError(
            "qtype should be a string, one of 'quint8' or 'qint8'. Got {}".format(qtype))

    quantize_dynamic(onnx_model_path, quantized_model_path, weight_type=type_mapping[qtype])

    if verbose:
        v_str = ("Model size before quantization (MB):"
                 f"{os.path.getsize(onnx_model_path) / 2**20:.2f}\n"
                 "Model size after quantization (MB):"
                 f"{os.path.getsize(quantized_model_path) / 2**20:.2f}")
        print("Done.")
        print(v_str)
        print(f"Quantized model saved to {quantized_model_path}.")
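# Hypothetical usage of quantize_onnx_model() above; the file names are assumptions.
# Signed int8 weights ("qint8") are the function's default and, per the ONNX Runtime
# docs linked earlier, are usually faster on most CPU architectures.
quantize_onnx_model("model.onnx", "model.quant.onnx", qtype="qint8", verbose=True)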
def create_predictor(cls, args):
    if args.use_onnxruntime:
        assert args.device != "xpu", "Running ONNXRuntime on XPU is temporarily not supported."
        if args.model_path.count(".onnx"):
            onnx_model = args.model_path
        else:
            import paddle2onnx
            onnx_model = paddle2onnx.command.c_paddle_to_onnx(
                model_file=args.model_path + ".pdmodel",
                params_file=args.model_path + ".pdiparams",
                opset_version=13,
                enable_onnx_checker=True)

        dynamic_quantize_model = onnx_model
        providers = ['CUDAExecutionProvider']
        if args.enable_quantize:
            from onnxruntime.quantization import QuantizationMode, quantize_dynamic

            float_onnx_file = "model.onnx"
            with open(float_onnx_file, "wb") as f:
                f.write(onnx_model)
            dynamic_quantize_model = "dynamic_quantize_model.onnx"
            quantize_dynamic(float_onnx_file, dynamic_quantize_model)
            providers = ['CPUExecutionProvider']

        sess_options = ort.SessionOptions()
        sess_options.intra_op_num_threads = args.num_threads
        sess_options.inter_op_num_threads = args.num_threads
        predictor = ort.InferenceSession(dynamic_quantize_model,
                                         sess_options=sess_options,
                                         providers=providers)
        input_name1 = predictor.get_inputs()[1].name
        input_name2 = predictor.get_inputs()[0].name
        input_handles = [input_name1, input_name2]
        return cls(predictor, input_handles, [])

    config = paddle.inference.Config(args.model_path + ".pdmodel",
                                     args.model_path + ".pdiparams")
    if args.device == "gpu":
        # set GPU configs accordingly
        config.enable_use_gpu(100, 0)
        cls.device = paddle.set_device("gpu")
    elif args.device == "cpu":
        # set CPU configs accordingly,
        # such as enable_mkldnn, set_cpu_math_library_num_threads
        config.disable_gpu()
        config.switch_ir_optim(True)
        config.enable_mkldnn()
        config.set_cpu_math_library_num_threads(args.num_threads)
        cls.device = paddle.set_device("cpu")
    elif args.device == "xpu":
        # set XPU configs accordingly
        config.enable_xpu(100)

    if args.use_trt:
        precision_map = {
            "int8": inference.PrecisionType.Int8,
            "fp16": inference.PrecisionType.Half,
            "fp32": inference.PrecisionType.Float32
        }
        config.enable_tensorrt_engine(
            workspace_size=1 << 30,
            precision_mode=precision_map[args.precision],
            max_batch_size=args.batch_size,
            min_subgraph_size=5,
            use_static=False,
            use_calib_mode=False)
        print("Enable TensorRT is: {}".format(config.tensorrt_engine_enabled()))
        if args.collect_shape:
            config.collect_shape_range_info(args.task_name + args.shape_file)
        else:
            config.enable_tuned_tensorrt_dynamic_shape(
                args.task_name + args.shape_file, True)

    config.delete_pass("embedding_eltwise_layernorm_fuse_pass")
    predictor = paddle.inference.create_predictor(config)
    input_handles = [
        predictor.get_input_handle(name) for name in predictor.get_input_names()
    ]
    output_handles = [
        predictor.get_output_handle(name) for name in predictor.get_output_names()
    ]
    return cls(predictor, input_handles, output_handles)
    maxlen=128, export_model_path="outdir/14_0.79_sentim.onnx")

config = AutoConfig.from_pretrained(
    args.config_name if args.config_name else args.model_name_or_path,
    num_labels=args.num_labels)
tokenizer = AutoTokenizer.from_pretrained(
    args.tokenizer_name if args.tokenizer_name else args.model_name_or_path)

### Quantization
if True:
    onnx_model_path = 'model.onnx'
    quantized_model_path = f"model-quantized.onnx"
    quantized_model = quantize_dynamic(onnx_model_path, quantized_model_path)

# text = preprocess_text("здравствуйте скажите пожалуйста как мне отключить данные номер на время")
# inputs = tokenizer(text, max_length=args.maxlen, padding="max_length", return_tensors="np")
# input_ids = inputs['input_ids']
# token_type_ids = inputs['token_type_ids']
# attention_mask = inputs['attention_mask']

ort_session = ort.InferenceSession('model-quantized.onnx')

# def to_numpy(tensor):
#     return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

# ort_inputs = {
#     ort_session.get_inputs()[0].name: input_ids,
#     ort_session.get_inputs()[1].name: token_type_ids,
def train(conf: omegaconf.DictConfig) -> None:
    # fancy logger
    console = Console()
    # reproducibility
    pl.seed_everything(conf.train.seed)
    console.log(
        f"Starting training for [bold cyan]{conf.train.model_name}[/bold cyan] model"
    )
    if conf.train.pl_trainer.fast_dev_run:
        console.log(
            f"Debug mode {conf.train.pl_trainer.fast_dev_run}. Forcing debugger configuration"
        )
        # Debuggers don't like GPUs nor multiprocessing
        conf.train.pl_trainer.accelerator = "cpu"
        conf.train.pl_trainer.devices = 1
        conf.train.pl_trainer.strategy = None
        conf.train.pl_trainer.precision = 32
        conf.data.datamodule.num_workers = {
            k: 0 for k in conf.data.datamodule.num_workers
        }
        # Switch wandb to offline mode to prevent online logging
        conf.logging.log = None
        # remove model checkpoint callback
        conf.train.model_checkpoint_callback = None

    # data module declaration
    console.log(f"Instantiating the Data Module")
    pl_data_module: NERDataModule = hydra.utils.instantiate(
        conf.data.datamodule, _recursive_=False
    )
    # force setup to get labels initialized for the model
    pl_data_module.prepare_data()

    # main module declaration
    model_kwargs = {"_recursive_": False, "labels": pl_data_module.labels}
    console.log(f"Instantiating the Model")
    pl_module: NERModule = hydra.utils.instantiate(conf.model, **model_kwargs)

    experiment_logger: Optional[WandbLogger] = None
    experiment_path: Optional[Path] = None
    if conf.logging.log:
        console.log(f"Instantiating Wandb Logger")
        experiment_logger = hydra.utils.instantiate(conf.logging.wandb_arg)
        experiment_logger.watch(pl_module, **conf.logging.watch)

    # callbacks declaration
    callbacks_store = [RichProgressBar()]

    if conf.train.early_stopping_callback is not None:
        early_stopping_callback: EarlyStopping = hydra.utils.instantiate(
            conf.train.early_stopping_callback
        )
        callbacks_store.append(early_stopping_callback)

    model_checkpoint_callback: Optional[ModelCheckpoint] = None
    if conf.train.model_checkpoint_callback is not None:
        model_checkpoint_callback = hydra.utils.instantiate(
            conf.train.model_checkpoint_callback,
            dirpath=experiment_path / "checkpoints" if experiment_path else None,
        )
        callbacks_store.append(model_checkpoint_callback)

    # trainer
    console.log(f"Instantiating the Trainer")
    trainer: Trainer = hydra.utils.instantiate(
        conf.train.pl_trainer, callbacks=callbacks_store, logger=experiment_logger
    )

    model_export: Optional[Path] = None
    if trainer.global_rank == 0:
        if conf.logging.log:
            experiment_path = Path(experiment_logger.experiment.dir)
            # Store the YaML config separately into the wandb dir
            yaml_conf: str = OmegaConf.to_yaml(cfg=conf)
            (experiment_path / "hparams.yaml").write_text(yaml_conf)
            # save labels before starting training
            model_export = experiment_path / "model_export"
            model_export.mkdir(exist_ok=True, parents=True)
            # save labels
            pl_data_module.labels.to_file(model_export / "labels.json")

    # module fit
    trainer.fit(pl_module, datamodule=pl_data_module)

    if trainer.global_rank == 0:
        if model_checkpoint_callback:
            # load best model for testing
            best_pl_module = NERModule.load_from_checkpoint(
                model_checkpoint_callback.best_model_path, labels=pl_data_module.labels
            )
        else:
            best_pl_module = pl_module
        # module test
        trainer.test(best_pl_module, datamodule=pl_data_module)

        if conf.train.export and not conf.train.pl_trainer.fast_dev_run:
            # export model stuff
            best_model = best_pl_module.model
            torch.save(
                best_model.state_dict(),
                model_export / "weights.pt",
            )
            if is_onnx_available():
                from onnxruntime.quantization import quantize_dynamic, QuantType

                inputs = next(iter(pl_data_module.train_dataloader()))
                dynamic_axes = {
                    "input_ids": {
                        0: "batch_size",
                        1: "batch_length",
                    },  # variable length axes
                    "attention_mask": {
                        0: "batch_size",
                        1: "batch_length",
                    },  # variable length axes
                    "offsets": {
                        0: "batch_size",
                        1: "batch_length",
                    },  # variable length axes
                    "ner_tags": {
                        0: "batch_size",
                        1: "batch_length",
                    },  # variable length axes
                }
                # onnx accepts only Tuples
                onnx_inputs = (
                    inputs.input_ids,
                    inputs.attention_mask,
                    inputs.offsets,
                )
                input_names = ["input_ids", "attention_mask", "offsets"]
                # export onnx
                torch.onnx.export(
                    best_model,
                    onnx_inputs,
                    model_export / "weights.onnx",
                    export_params=True,  # store the trained parameter weights inside the model file
                    opset_version=15,  # the ONNX version to export the model to
                    do_constant_folding=True,  # whether to execute constant folding for optimization
                    input_names=input_names,  # the model's input names
                    output_names=["ner_tags"],  # the model's output names
                    verbose=False,
                    dynamic_axes=dynamic_axes,
                )
                quantize_dynamic(
                    model_input=model_export / "weights.onnx",
                    model_output=model_export / "weights.quantized.onnx",
                    per_channel=True,
                    activation_type=QuantType.QUInt8,
                    weight_type=QuantType.QUInt8,
                    optimize_model=True,
                )
def dynamic_quantize(self, input_float_model, dynamic_quantized_model):
    from onnxruntime.quantization import QuantizationMode, quantize_dynamic
    quantize_dynamic(input_float_model, dynamic_quantized_model)