Example #1
def quantize(models_name_or_path):
    """
    Quantize the weights of the model from float32 to in8 to allow very efficient inference on modern CPU

    Uses unsigned ints for activation values, signed ints for weights, per
    https://onnxruntime.ai/docs/performance/quantization.html#data-type-selection
    it is faster on most CPU architectures
    Args:
        onnx_model_path: Path to location the exported ONNX model is stored
    Returns: The Path generated for the quantized
    """
    from onnxruntime.quantization import quantize_dynamic, QuantType

    bar = Bar("Quantizing...", max=3)

    quant_model_paths = []
    for model in models_name_or_path:
        model_name = model.as_posix()
        output_model_name = f"{model_name[:-5]}-quantized.onnx"
        quantize_dynamic(
            model_input=model_name,
            model_output=output_model_name,
            per_channel=True,
            reduce_range=True,  # should be the same as per_channel
            activation_type=QuantType.QUInt8,
            weight_type=QuantType.QInt8,  # per docs, signed is faster on most CPUs
            optimize_model=False,
        )  # op_types_to_quantize=['MatMul', 'Relu', 'Add', 'Mul' ],
        quant_model_paths.append(output_model_name)
        bar.next()

    bar.finish()

    return tuple(quant_model_paths)
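A minimal usage sketch for the function above, assuming `Bar` comes from the `progress` package imported elsewhere in the module; the ONNX file paths are hypothetical placeholders.

from pathlib import Path

# Hypothetical exported ONNX files; quantize() expects path-like objects
# that implement .as_posix(), so pathlib.Path works well here.
model_paths = (
    Path("onnx/encoder.onnx"),
    Path("onnx/decoder.onnx"),
    Path("onnx/decoder_with_past.onnx"),
)

quantized_paths = quantize(model_paths)
# -> ('onnx/encoder-quantized.onnx', 'onnx/decoder-quantized.onnx', ...)
print(quantized_paths)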
Example #2
    def test_quantize_batch_size_2(self):
        batch = 2
        hidden_size = 4
        sequence_length = 4

        model_f32_path = 'test_embed_layer_norm_unit_test_batch2.onnx'
        model_uint8_path = 'test_embed_layer_norm_unit_test_batch2_uint8.onnx'

        self.construct_model(batch, hidden_size, sequence_length,
                             model_f32_path)

        data_reader = self.input_feeds_int32(
            1, {
                'input_ids': [batch, sequence_length],
                'segment_ids': [batch, sequence_length]
            })

        quantize_dynamic(model_f32_path, model_uint8_path)

        # Quantization should not have any DequantizeLinear nodes:
        qnode_counts = {'DequantizeLinear': 0, 'QEmbedLayerNormalization': 1}
        check_op_type_count(self, model_uint8_path, **qnode_counts)
        data_reader.rewind()

        check_model_correctness(self, model_f32_path, model_uint8_path,
                                data_reader.get_next())
Example #3
    def dynamic_quant_test(
        self,
        model_fp32_path,
        data_reader,
        activation_type,
        weight_type,
        extra_options={},
    ):
        activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
        activation_type_str = "u8" if (activation_type
                                       == QuantType.QUInt8) else "s8"
        weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8"
        model_int8_path = "gemm_fp32.quant_dynamic_{}{}.onnx".format(
            activation_type_str, weight_type_str)

        quantize_dynamic(
            model_fp32_path,
            model_int8_path,
            weight_type=weight_type,
            extra_options=extra_options,
        )
        quant_nodes = {"MatMulInteger": 2}
        check_op_type_count(self, model_int8_path, **quant_nodes)
        qnode_io_qtypes = {"MatMulInteger": [["i", 2, activation_proto_qtype]]}
        check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes)
        data_reader.rewind()
        check_model_correctness(
            self,
            model_fp32_path,
            model_int8_path,
            {"input": np.random.rand(5, 10).astype(np.float32)},
        )
Example #4
 def dynamic_quant_conv(self, model_fp32_path, model_int8_path):
     quantize_dynamic(model_fp32_path, model_int8_path)
     quant_nodes = {'ConvInteger': 2}
     check_op_type_count(self, model_int8_path, **quant_nodes)
     check_model_correctness(
         self, model_fp32_path, model_int8_path,
         {'input': np.random.rand(4, 2, 8, 8).astype(np.float32)})
Example #5
    def test_quantize_batch_size_1(self):
        batch = 1
        hidden_size = 4
        sequence_length = 4

        model_f32_path = "test_embed_layer_norm_unit_test_batch1.onnx"
        model_uint8_path = "test_embed_layer_norm_unit_test_batch1_uint8.onnx"

        self.construct_model(batch, hidden_size, sequence_length, model_f32_path)

        data_reader = self.input_feeds_int32(
            1,
            {
                "input_ids": [batch, sequence_length],
                "segment_ids": [batch, sequence_length],
            },
        )

        quantize_dynamic(model_f32_path, model_uint8_path)

        # Quantization should not have any DequantizeLinear nodes:
        qnode_counts = {"DequantizeLinear": 0, "QEmbedLayerNormalization": 1}
        check_op_type_count(self, model_uint8_path, **qnode_counts)
        data_reader.rewind()

        check_model_correctness(self, model_f32_path, model_uint8_path, data_reader.get_next())
Example #6
def quantize(models_name_or_path):
    """
    Quantize the weights of the model from float32 to in8 to allow very efficient inference on modern CPU
    Args:
        onnx_model_path: Path to location the exported ONNX model is stored
    Returns: The Path generated for the quantized
    """
    import onnx
    from onnxruntime.quantization import quantize, quantize_dynamic, QuantType

    bar = Bar('Quantizing...', max=3)

    quant_model_paths = []
    for model in models_name_or_path:
        model_name = model.as_posix()
        output_model_name = f'{model_name[:-5]}-quantized.onnx'
        quantize_dynamic(
            model_input=model_name,
            model_output=output_model_name,
            per_channel=True,
            activation_type=QuantType.QUInt8,
            weight_type=QuantType.QUInt8,
            #  optimize_model=False,
        )  # op_types_to_quantize=['MatMul', 'Relu', 'Add', 'Mul' ],
        quant_model_paths.append(output_model_name)
        bar.next()

    bar.finish()

    return tuple(quant_model_paths)
Example #7
    def dynamic_quant_conv_test(self,
                                activation_type,
                                weight_type,
                                extra_options={}):
        np.random.seed(1)
        model_fp32_path = 'conv_bias.fp32.onnx'
        self.construct_model(model_fp32_path)

        activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
        activation_type_str = 'u8' if (activation_type
                                       == QuantType.QUInt8) else 's8'
        weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
        model_int8_path = 'conv_bias.quant.{}{}.onnx'.format(
            activation_type_str, weight_type_str)

        quantize_dynamic(model_fp32_path,
                         model_int8_path,
                         activation_type=activation_type,
                         weight_type=weight_type,
                         extra_options=extra_options)
        quant_nodes = {'ConvInteger': 2}
        check_op_type_count(self, model_int8_path, **quant_nodes)
        qnode_io_qtypes = {'ConvInteger': [['i', 2, activation_proto_qtype]]}
        check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes)
        check_model_correctness(
            self, model_fp32_path, model_int8_path,
            {'input': np.random.rand(4, 2, 8, 8).astype(np.float32)})
Example #8
    def dynamic_quant_test(self,
                           model_fp32_path,
                           data_reader,
                           activation_type,
                           weight_type,
                           extra_options={}):
        activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
        activation_type_str = 'u8' if (activation_type
                                       == QuantType.QUInt8) else 's8'
        weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
        model_int8_path = 'gemm_fp32.quant_dynamic_{}{}.onnx'.format(
            activation_type_str, weight_type_str)

        quantize_dynamic(model_fp32_path,
                         model_int8_path,
                         activation_type=activation_type,
                         weight_type=weight_type,
                         extra_options=extra_options)
        quant_nodes = {'MatMulInteger': 2}
        check_op_type_count(self, model_int8_path, **quant_nodes)
        qnode_io_qtypes = {'MatMulInteger': [['i', 2, activation_proto_qtype]]}
        check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes)
        data_reader.rewind()
        check_model_correctness(
            self, model_fp32_path, model_int8_path,
            {'input': np.random.rand(5, 10).astype(np.float32)})
Example #9
    def dynamic_quant_conv_test(self, weight_type, extra_options={}):
        np.random.seed(1)
        model_fp32_path = "conv_bias.fp32.onnx"
        self.construct_model(model_fp32_path)

        activation_proto_qtype = TensorProto.UINT8
        activation_type_str = "u8"
        weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8"
        model_int8_path = "conv_bias.quant.{}{}.onnx".format(
            activation_type_str, weight_type_str)

        quantize_dynamic(
            model_fp32_path,
            model_int8_path,
            weight_type=weight_type,
            extra_options=extra_options,
        )
        quant_nodes = {"ConvInteger": 2}
        check_op_type_count(self, model_int8_path, **quant_nodes)
        qnode_io_qtypes = {"ConvInteger": [["i", 2, activation_proto_qtype]]}
        check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes)
        check_model_correctness(
            self,
            model_fp32_path,
            model_int8_path,
            {"input": np.random.rand(4, 2, 8, 8).astype(np.float32)},
        )
Example #10
 def quantize_model(
     self,
     model_fp32_path,
     model_i8_path,
     data_reader=None,
     activation_type=QuantType.QUInt8,
     weight_type=QuantType.QUInt8,
     extra_options={},
 ):
     if data_reader is not None:
         quantize_static(
             model_fp32_path,
             model_i8_path,
             data_reader,
             reduce_range=True,
             quant_format=QuantFormat.QOperator,
             activation_type=activation_type,
             weight_type=weight_type,
             extra_options=extra_options,
         )
     else:
         quantize_dynamic(
             model_fp32_path,
             model_i8_path,
             reduce_range=True,
             weight_type=weight_type,
             extra_options=extra_options,
         )
Example #11
 def quantize_model(self, model_fp32_path, model_i8_path, data_reader=None):
     if data_reader is not None:
         quantize_static(model_fp32_path,
                         model_i8_path,
                         data_reader,
                         reduce_range=True)
     else:
         quantize_dynamic(model_fp32_path, model_i8_path, reduce_range=True)
Example #12
def quantize_onnx_model(onnx_model_path, quantized_model_path):
    print("Starting quantization...")
    from onnxruntime.quantization import quantize_dynamic, QuantType
    quantize_dynamic(onnx_model_path,
                     quantized_model_path,
                     weight_type=QuantType.QUInt8)

    print(f"Quantized model saved to: {quantized_model_path}")
Example #13
 def dynamic_attention_quant_test(self, model_fp32_path, model_int8_path,
                                  per_channel, reduce_range):
     quantize_dynamic(model_fp32_path,
                      model_int8_path,
                      per_channel=per_channel,
                      reduce_range=reduce_range)
     quant_nodes = {'QAttention': 1, 'MatMulInteger': 1}
     check_op_type_count(self, model_int8_path, **quant_nodes)
     check_model_correctness(
         self, model_fp32_path, model_int8_path,
         {'input': np.random.rand(1, 5, 10).astype(np.float32)})
Example #14
def main():
    init_logging('quantizeOnnx.log')

    args = parse_args()
    modelFilePath = args.modelFilePath
    quantizedModelFilePath = args.quantizedModelFilePath
    datasetDir = args.datasetDir

    # dr = ModelCalibrationDataReader(datasetDir, modelFilePath)
    # quantize_static(modelFilePath, quantizedModelFilePath, dr)
    quantize_dynamic(modelFilePath,
                     quantizedModelFilePath,
                     weight_type=QuantType.QInt8)
    print('Quantized model saved.')
Example #15
    def _maybe_quantize(cls):
        logger.info("Saving quantized model")
        # Download model
        model_fp32 = f"/tmp_models/{cls.__name__}/model.onnx"
        model_quant = f"/models/{cls.__name__}/1/model.onnx"
        os.makedirs(os.path.dirname(model_quant), exist_ok=True)

        url = cls.MODEL_URL
        r = requests.get(url)
        # Persist the downloaded float32 model so it can be quantized below.
        os.makedirs(os.path.dirname(model_fp32), exist_ok=True)
        with open(model_fp32, "wb") as f:
            f.write(r.content)
        quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QUInt8)
        logger.info(f"Quantized model saved to:{model_quant}")
        full_size = os.path.getsize(model_fp32) / (1024 * 1024)
        quant_size = os.path.getsize(model_quant) / (1024 * 1024)
        logger.info(f"ONNX full precision model size (MB): {full_size}")
        logger.info(f"ONNX quantized model size (MB): {quant_size}")
Example #16
    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        import onnxruntime as ort
        self.__graph_opt_level = {
            0: ort.GraphOptimizationLevel.ORT_DISABLE_ALL,
            1: ort.GraphOptimizationLevel.ORT_ENABLE_BASIC,
            2: ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED,
            3: ort.GraphOptimizationLevel.ORT_ENABLE_ALL,
        }

        for model in self._args.pytorch_models:
            if hasattr(torchvision.models, model):
                data_type = Precision.FLOAT16.value if self._args.precision == Precision.FLOAT16 \
                    else Precision.FLOAT32.value
                model_path = f'{self.__model_cache_path / (model + "." + data_type + ".onnx")}'
                torch.onnx.export(
                    getattr(torchvision.models, model)(pretrained=True).to(
                        dtype=getattr(torch, data_type)).cuda(),
                    torch.randn(self._args.batch_size,
                                3,
                                224,
                                224,
                                device='cuda',
                                dtype=getattr(torch, data_type)),
                    model_path,
                    input_names=['input'],
                )
                if self._args.precision == Precision.INT8:
                    file_name = '{model}.{precision}.onnx'.format(
                        model=model, precision=self._args.precision)
                    # For quantization of ONNXRuntime, refer
                    # https://onnxruntime.ai/docs/performance/quantization.html#quantization-overview
                    from onnxruntime.quantization import quantize_dynamic
                    quantize_dynamic(model_path,
                                     f'{self.__model_cache_path / file_name}')
            else:
                logger.error('Cannot find PyTorch model %s.', model)
                return False

        return True
Example #17
    def convert_to_onnx(self, onnx_output_dir=None, set_onnx_arg=True):
        """Convert the model to ONNX format and save to output_dir
        Args:
            onnx_output_dir (str, optional): If specified, ONNX model will be saved to output_dir (else args.output_dir will be used). Defaults to None.
            set_onnx_arg (bool, optional): Updates the model args to set onnx=True. Defaults to True.
        """  # noqa
        if not onnx_output_dir:
            onnx_output_dir = os.path.join(self.options.output_dir,
                                           self.options.model_type,
                                           self.options.model_name, "onnx")
        os.makedirs(onnx_output_dir, exist_ok=True)

        if not os.listdir(onnx_output_dir):
            onnx_model_name = os.path.join(onnx_output_dir, "onnx_model.onnx")
            with tempfile.TemporaryDirectory() as temp_dir:
                basedir = os.path.basename(temp_dir)
                temp_dir = os.path.join(self.options.output_dir, basedir)
                self.save_model(output_dir=temp_dir, model=self.model)

                convert(
                    framework="pt",
                    model=temp_dir,
                    tokenizer=self.tokenizer,
                    output=Path(onnx_model_name),
                    pipeline_name="ner",
                    opset=11,
                )
            self.tokenizer.save_pretrained(onnx_output_dir)
            self.config.save_pretrained(onnx_output_dir)

        onnx_options = SessionOptions()
        use_cuda = True if self._device.type != 'cpu' else False
        onnx_execution_provider = "CUDAExecutionProvider" if use_cuda else "CPUExecutionProvider"
        onnx_options.intra_op_num_threads = 1
        onnx_options.execution_mode = ExecutionMode.ORT_SEQUENTIAL
        onnx_model_path = os.path.join(onnx_output_dir, "onnx_model.onnx")
        if self.options.dynamic_quantize:
            # Append "-quantized" at the end of the model's name
            quantized_model_path = generate_identified_filename(
                Path(onnx_model_path), "-quantized")
            quantize_dynamic(Path(onnx_model_path), quantized_model_path)
            onnx_model_path = quantized_model_path.as_posix()

        return InferenceSession(onnx_model_path,
                                onnx_options,
                                providers=[onnx_execution_provider])
Example #18
def benchmark_onnxruntime(path_to_model,
                          repeat=1000,
                          number=1,
                          warmup=100,
                          quantize=False):
    """
    Parameters
    ----------
    path_to_model: str or onnx.ModelProto
        Path to an onnx model.
    repeat: int
        Repetition of experiment. Default: 1000
    number: int
        Number of forward passes in each experiment. Default: 1
    warmup: int
        Number of disregarded experiments. Default: 100
    quantize: bool
        Dynamically quantize the model with default parameters.

    Returns
    -------
    info: dict
        Information about the size and min, max, mean, std of the time
        of the experiments.
    """
    assert repeat >= 2 * warmup

    if quantize:
        import onnx
        from onnx import version_converter
        from onnxruntime.quantization import quantize_dynamic

        orig_model = onnx.load(path_to_model)
        if orig_model.opset_import[0].version < 11:
            converted_model = version_converter.convert_version(orig_model, 11)
            path_to_model = '/tmp/model_conv.onnx'
            with open(path_to_model, 'wb') as f:
                f.write(converted_model.SerializeToString())
            del orig_model, converted_model
        path_to_quant_model = "/tmp/model_quant.onnx"
        model = quantize_dynamic(path_to_model, path_to_quant_model)
        size = os.path.getsize(path_to_quant_model)
        sess = ort.InferenceSession(path_to_quant_model)
    else:
        size = os.path.getsize(path_to_model)
        sess = ort.InferenceSession(path_to_model)

    inputs = {
        x.name: np.random.randn(*get_shape(x)).astype(get_type(x))
        for x in sess.get_inputs()
    }

    def _benchmark():
        output = sess.run(None, inputs)

    res = dict(size=size, input_size=[tuple(x.shape) for x in inputs.values()])
    res.update(benchmark_speed(_benchmark, repeat, number, warmup))
    return res
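A usage sketch for the benchmark helper above, comparing the float32 model against its dynamically quantized counterpart; the model path is a hypothetical placeholder.

# Hypothetical model file; repeat must be at least 2 * warmup (see the assert above).
fp32_stats = benchmark_onnxruntime("model.onnx", repeat=500, warmup=50)
int8_stats = benchmark_onnxruntime("model.onnx", repeat=500, warmup=50, quantize=True)
print("fp32:", fp32_stats)
print("int8:", int8_stats)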
Example #19
def quantization_optimize(optimization_config):
    logger.info("ONNX model quantization started")
    base_dir = os.path.dirname(optimization_config.model_path)
    unquantized_model = os.path.join(base_dir, "unquantized_model.onnx")
    copy(optimization_config.model_path, unquantized_model)
    try:
        quantization.quantize_dynamic(unquantized_model,
                                      optimization_config.model_path)
        default_ep = "CUDAExecutionProvider" if "CUDAExecutionProvider" in ort.get_available_providers(
        ) else "CPUExecutionProvider"
        ort.InferenceSession(optimization_config.model_path,
                             providers=[default_ep])
        logger.info("ONNX model quantized successfully")
    except Exception as e:
        logger.info(
            "Quantization optimization failed with error {}. Original model will be used for optimization."
            .format(e))
        copy(unquantized_model, optimization_config.model_path)
Example #20
    def quantize_onnx_model(onnx_model_path,
                            quantized_model_path,
                            use_external_data_format=False):
        from pathlib import Path

        from onnxruntime.quantization import quantize_dynamic

        Path(quantized_model_path).parent.mkdir(parents=True, exist_ok=True)
        logger.info(
            f"Size of full precision ONNX model(MB):{os.path.getsize(onnx_model_path)/(1024*1024)}"
        )
        quantize_dynamic(
            onnx_model_path,
            quantized_model_path,
            use_external_data_format=use_external_data_format,
        )
        logger.info(f"quantized model saved to:{quantized_model_path}")
        # TODO: include external data in total model size.
        logger.info(
            f"Size of quantized ONNX model(MB):{os.path.getsize(quantized_model_path)/(1024*1024)}"
        )
Example #21
def quantize_model(onnx_model):
    import onnx
    from onnxruntime.quantization import quantize_dynamic, QuantType

    path = Path(onnx_model)
    parent_path = path.parent
    output_path = os.path.join(parent_path, 'model-dyn-quant.onnx')

    quantized_model = quantize_dynamic(onnx_model,
                                       output_path,
                                       weight_type=QuantType.QUInt8)

    return output_path
Example #22
def quantize_onnx_model(
    onnx_model_path: Union[Path, str],
    quantized_model_path: Union[Path, str],
    qtype: str = "qint8",
    verbose: bool = False,
) -> None:
    """Takes model converted to onnx runtime and applies pruning.

    Args:
        onnx_model_path: path to onnx model.
        quantized_model_path: path to quantized model.
        qtype: Type of weights in quantized model.
            Can be `quint8` or `qint8`. Defaults to "qint8".
        verbose: If set to True prints model size before
            and after quantization. Defaults to False.

    Raises:
        ValueError: If qtype is not understood.
    """
    type_mapping = {
        "qint8": QuantType.QInt8,
        "quint8": QuantType.QUInt8,
    }
    if qtype not in type_mapping.keys():
        raise ValueError(
            "type should be string one of 'quint8' or 'qint8'. Got {}".format(
                qtype))
    quantize_dynamic(onnx_model_path,
                     quantized_model_path,
                     weight_type=type_mapping[qtype])
    if verbose:
        v_str = ("Model size before quantization (MB):"
                 f"{os.path.getsize(onnx_model_path) / 2**20:.2f}\n"
                 "Model size after quantization (MB):"
                 f"{os.path.getsize(quantized_model_path) / 2**20:.2f}")
        print("Done.")
        print(v_str)
        print(f"Quantized model saved to {quantized_model_path}.")
Example #23
    def create_predictor(cls, args):
        if args.use_onnxruntime:
            assert args.device != "xpu", "Running ONNXRuntime on XPU is temporarily not supported."
            if args.model_path.count(".onnx"):
                onnx_model = args.model_path
            else:
                import paddle2onnx
                onnx_model = paddle2onnx.command.c_paddle_to_onnx(
                    model_file=args.model_path + ".pdmodel",
                    params_file=args.model_path + ".pdiparams",
                    opset_version=13,
                    enable_onnx_checker=True)
            dynamic_quantize_model = onnx_model
            providers = ['CUDAExecutionProvider']
            if args.enable_quantize:
                from onnxruntime.quantization import QuantizationMode, quantize_dynamic

                float_onnx_file = "model.onnx"
                with open(float_onnx_file, "wb") as f:
                    f.write(onnx_model)
                dynamic_quantize_model = "dynamic_quantize_model.onnx"
                quantize_dynamic(float_onnx_file, dynamic_quantize_model)
                providers = ['CPUExecutionProvider']
            sess_options = ort.SessionOptions()
            sess_options.intra_op_num_threads = args.num_threads
            sess_options.inter_op_num_threads = args.num_threads
            predictor = ort.InferenceSession(dynamic_quantize_model,
                                             sess_options=sess_options,
                                             providers=providers)
            input_name1 = predictor.get_inputs()[1].name
            input_name2 = predictor.get_inputs()[0].name
            input_handles = [input_name1, input_name2]
            return cls(predictor, input_handles, [])

        config = paddle.inference.Config(args.model_path + ".pdmodel",
                                         args.model_path + ".pdiparams")
        if args.device == "gpu":
            # set GPU configs accordingly
            config.enable_use_gpu(100, 0)
            cls.device = paddle.set_device("gpu")
        elif args.device == "cpu":
            # set CPU configs accordingly,
            # such as enable_mkldnn, set_cpu_math_library_num_threads
            config.disable_gpu()
            config.switch_ir_optim(True)
            config.enable_mkldnn()
            config.set_cpu_math_library_num_threads(args.num_threads)
            cls.device = paddle.set_device("cpu")
        elif args.device == "xpu":
            # set XPU configs accordingly
            config.enable_xpu(100)
        if args.use_trt:
            precision_map = {
                "int8": inference.PrecisionType.Int8,
                "fp16": inference.PrecisionType.Half,
                "fp32": inference.PrecisionType.Float32
            }
            config.enable_tensorrt_engine(
                workspace_size=1 << 30,
                precision_mode=precision_map[args.precision],
                max_batch_size=args.batch_size,
                min_subgraph_size=5,
                use_static=False,
                use_calib_mode=False)
            print("Enable TensorRT is: {}".format(
                config.tensorrt_engine_enabled()))

            if args.collect_shape:
                config.collect_shape_range_info(args.task_name +
                                                args.shape_file)
            else:
                config.enable_tuned_tensorrt_dynamic_shape(
                    args.task_name + args.shape_file, True)

        config.delete_pass("embedding_eltwise_layernorm_fuse_pass")
        predictor = paddle.inference.create_predictor(config)

        input_handles = [
            predictor.get_input_handle(name)
            for name in predictor.get_input_names()
        ]
        output_handles = [
            predictor.get_output_handle(name)
            for name in predictor.get_output_names()
        ]

        return cls(predictor, input_handles, output_handles)
Example #24
    maxlen=128,
    export_model_path="outdir/14_0.79_sentim.onnx")

config = AutoConfig.from_pretrained(
    args.config_name if args.config_name else args.model_name_or_path,
    num_labels=args.num_labels)

tokenizer = AutoTokenizer.from_pretrained(
    args.tokenizer_name if args.tokenizer_name else args.model_name_or_path)

### Quantization
if True:
    onnx_model_path = 'model.onnx'
    quantized_model_path = f"model-quantized.onnx"

    quantized_model = quantize_dynamic(onnx_model_path, quantized_model_path)

# text = preprocess_text("здравствуйте скажите пожалуйста как мне отключить данные номер на время")
# inputs = tokenizer(text, max_length=args.maxlen, padding="max_length", return_tensors="np")
# input_ids = inputs['input_ids']
# token_type_ids = inputs['token_type_ids']
# attention_mask = inputs['attention_mask']

ort_session = ort.InferenceSession('model-quantized.onnx')

# def to_numpy(tensor):
#     return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

# ort_inputs = {
#     ort_session.get_inputs()[0].name: input_ids,
#     ort_session.get_inputs()[1].name: token_type_ids,
Example #25
def train(conf: omegaconf.DictConfig) -> None:
    # fancy logger
    console = Console()
    # reproducibility
    pl.seed_everything(conf.train.seed)

    console.log(
        f"Starting training for [bold cyan]{conf.train.model_name}[/bold cyan] model"
    )
    if conf.train.pl_trainer.fast_dev_run:
        console.log(
            f"Debug mode {conf.train.pl_trainer.fast_dev_run}. Forcing debugger configuration"
        )
        # Debuggers don't like GPUs nor multiprocessing
        conf.train.pl_trainer.accelerator = "cpu"
        conf.train.pl_trainer.devices = 1
        conf.train.pl_trainer.strategy = None
        conf.train.pl_trainer.precision = 32
        conf.data.datamodule.num_workers = {
            k: 0 for k in conf.data.datamodule.num_workers
        }
        # Switch wandb to offline mode to prevent online logging
        conf.logging.log = None
        # remove model checkpoint callback
        conf.train.model_checkpoint_callback = None

    # data module declaration
    console.log(f"Instantiating the Data Module")
    pl_data_module: NERDataModule = hydra.utils.instantiate(
        conf.data.datamodule, _recursive_=False
    )
    # force setup to get labels initialized for the model
    pl_data_module.prepare_data()

    # main module declaration
    model_kwargs = {"_recursive_": False, "labels": pl_data_module.labels}
    console.log(f"Instantiating the Model")
    pl_module: NERModule = hydra.utils.instantiate(conf.model, **model_kwargs)

    experiment_logger: Optional[WandbLogger] = None
    experiment_path: Optional[Path] = None
    if conf.logging.log:
        console.log(f"Instantiating Wandb Logger")
        experiment_logger = hydra.utils.instantiate(conf.logging.wandb_arg)
        experiment_logger.watch(pl_module, **conf.logging.watch)

    # callbacks declaration
    callbacks_store = [RichProgressBar()]

    if conf.train.early_stopping_callback is not None:
        early_stopping_callback: EarlyStopping = hydra.utils.instantiate(
            conf.train.early_stopping_callback
        )
        callbacks_store.append(early_stopping_callback)

    model_checkpoint_callback: Optional[ModelCheckpoint] = None
    if conf.train.model_checkpoint_callback is not None:
        model_checkpoint_callback = hydra.utils.instantiate(
            conf.train.model_checkpoint_callback,
            dirpath=experiment_path / "checkpoints" if experiment_path else None,
        )
        callbacks_store.append(model_checkpoint_callback)

    # trainer
    console.log(f"Instantiating the Trainer")
    trainer: Trainer = hydra.utils.instantiate(
        conf.train.pl_trainer, callbacks=callbacks_store, logger=experiment_logger
    )

    model_export: Optional[Path] = None
    if trainer.global_rank == 0:
        if conf.logging.log:
            experiment_path = Path(experiment_logger.experiment.dir)
            # Store the YaML config separately into the wandb dir
            yaml_conf: str = OmegaConf.to_yaml(cfg=conf)
            (experiment_path / "hparams.yaml").write_text(yaml_conf)
            # save labels before starting training
            model_export = experiment_path / "model_export"
            model_export.mkdir(exist_ok=True, parents=True)
            # save labels
            pl_data_module.labels.to_file(model_export / "labels.json")

    # module fit
    trainer.fit(pl_module, datamodule=pl_data_module)

    if trainer.global_rank == 0:
        if model_checkpoint_callback:
            # load best model for testing
            best_pl_module = NERModule.load_from_checkpoint(
                model_checkpoint_callback.best_model_path, labels=pl_data_module.labels
            )
        else:
            best_pl_module = pl_module

        # module test
        trainer.test(best_pl_module, datamodule=pl_data_module)

        if conf.train.export and not conf.train.pl_trainer.fast_dev_run:
            # export model stuff
            best_model = best_pl_module.model
            torch.save(
                best_model.state_dict(),
                model_export / "weights.pt",
            )
            if is_onnx_available():
                from onnxruntime.quantization import quantize_dynamic, QuantType

                inputs = next(iter(pl_data_module.train_dataloader()))
                dynamic_axes = {
                    "input_ids": {
                        0: "batch_size",
                        1: "batch_length",
                    },  # variable length axes
                    "attention_mask": {
                        0: "batch_size",
                        1: "batch_length",
                    },  # variable length axes
                    "offsets": {
                        0: "batch_size",
                        1: "batch_length",
                    },  # variable length axes
                    "ner_tags": {
                        0: "batch_size",
                        1: "batch_length",
                    },  # variable length axes
                }
                # onnx accepts only Tuples
                onnx_inputs = (
                    inputs.input_ids,
                    inputs.attention_mask,
                    inputs.offsets,
                )
                input_names = ["input_ids", "attention_mask", "offsets"]

                # export onnx
                torch.onnx.export(
                    best_model,
                    onnx_inputs,
                    model_export / "weights.onnx",
                    export_params=True,  # store the trained parameter weights inside the model file
                    opset_version=15,  # the ONNX version to export the model to
                    do_constant_folding=True,  # whether to execute constant folding for optimization
                    input_names=input_names,  # the model's input names
                    output_names=["ner_tags"],  # the model's output names
                    verbose=False,
                    dynamic_axes=dynamic_axes,
                )
                quantize_dynamic(
                    model_input=model_export / "weights.onnx",
                    model_output=model_export / "weights.quantized.onnx",
                    per_channel=True,
                    activation_type=QuantType.QUInt8,
                    weight_type=QuantType.QUInt8,
                    optimize_model=True,
                )
Example #26
 def dynamic_quantize(self, input_float_model, dynamic_quantized_model):
     from onnxruntime.quantization import QuantizationMode, quantize_dynamic
     quantize_dynamic(input_float_model, dynamic_quantized_model)