Example #1
def convert(program, place, config=None, scope=None, save_int8=False):
    """
    Convert a quantized and well-trained ``program`` to the final quantized
    ``program`` that can be used to save an ``inference model``.
    
    Args:
        program(paddle.static.Program): quantized and well-trained ``test program``.
        place(paddle.CPUPlace or paddle.CUDAPlace): The device on which the
                executor runs.
        config(dict, optional): Configs for conversion. If set to None, the
                default config is used. It must be the same config that was
                used in ``quant_aware``. Default is None.
        scope(paddle.static.Scope, optional): Scope records the mapping between
                variable names and variables, similar to brackets in
                programming languages. Usually users can use
                `paddle.static.global_scope <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html>`_.
                When set to ``None``,
                `paddle.static.global_scope() <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html>`_
                is used. Default: ``None``.
        save_int8(bool, optional): Whether to also return a ``program`` whose
                model parameters are of dtype ``int8``. That program can only
                be used to measure model size. Default: ``False``.

    Returns:
        Program or Tuple: the frozen program, which can be used for inference.
                When ``save_int8`` is False, returns ``freezed_program(paddle.static.Program)``.
                When ``save_int8`` is True, returns a tuple of ``freezed_program(paddle.static.Program)``
                and ``freezed_program_int8(paddle.static.Program)``.
    """
    scope = paddle.static.global_scope() if not scope else scope

    if config is None:
        config = _quant_config_default
    else:
        assert isinstance(config, dict), "config must be dict"
        config = _parse_configs(config)
    _logger.info("convert config {}".format(config))
    test_graph = IrGraph(core.Graph(program.desc), for_test=True)

    out_scale_infer_pass = OutScaleForInferencePass(scope=scope)
    out_scale_infer_pass.apply(test_graph)

    # Freeze the graph after training by adjusting the quantize
    # operators' order for inference.
    freeze_pass = QuantizationFreezePass(
        scope=scope,
        place=place,
        weight_bits=config['weight_bits'],
        activation_bits=config['activation_bits'],
        weight_quantize_type=config['weight_quantize_type'])

    if os.path.exists(VARS_MAPPING_TABLE):
        test_graph.out_node_mapping_table = load_dict()

    freeze_pass.apply(test_graph)
    freezed_program = test_graph.to_program()

    if save_int8:
        convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
        convert_int8_pass.apply(test_graph)
        freezed_program_int8 = test_graph.to_program()
        return freezed_program, freezed_program_int8
    else:
        return freezed_program
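# --- Usage sketch (not from the source): a minimal, hedged example of calling
# ``convert``. ``quant_program`` and ``quant_config`` are hypothetical
# placeholders for the program and config produced by a prior ``quant_aware``
# step; the same config must be passed here.
import paddle

place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() \
    else paddle.CPUPlace()
# Default: a single frozen program ready to be saved as an inference model.
inference_program = convert(quant_program, place, config=quant_config)
# With save_int8=True, a second program with int8 weights is also returned;
# per the docstring it is only useful for measuring model size.
inference_program, int8_program = convert(
    quant_program, place, config=quant_config, save_int8=True)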
Example #2
    def check_output_with_option(self,
                                 use_gpu,
                                 atol=1e-5,
                                 flatten=False,
                                 quant=False,
                                 rtol=1e-5):
        '''
        Check that the outputs are the same when computed on CPU or GPU,
        with TensorRT enabled or disabled, and with MKLDNN enabled or
        disabled.
        '''
        place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
        executor = fluid.Executor(place)
        scope = fluid.Scope()
        device = "GPU" if use_gpu else "CPU"

        with fluid.scope_guard(scope):
            executor.run(self.startup_program)
            executor.run(self.test_startup_program)
        main_graph = IrGraph(core.Graph(self.main_program.desc),
                             for_test=False)
        test_graph = IrGraph(core.Graph(self.test_main_program.desc),
                             for_test=True)

        transform_pass = QuantizationTransformPass(
            scope=scope,
            place=place,
            activation_quantize_type=self.activation_quantize_type,
            weight_quantize_type=self.weight_quantize_type)
        transform_pass.apply(main_graph)
        transform_pass.apply(test_graph)

        add_quant_dequant_pass = AddQuantDequantPass(scope=scope, place=place)
        add_quant_dequant_pass.apply(main_graph)
        add_quant_dequant_pass.apply(test_graph)

        scale_training_pass = OutScaleForTrainingPass(scope=scope, place=place)
        scale_training_pass.apply(main_graph)

        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = False
        build_strategy.fuse_all_reduce_ops = False
        binary = fluid.CompiledProgram(main_graph.graph)

        iters = 10
        batch_size = 1
        train_reader = paddle.batch(paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=500),
                                    batch_size=batch_size)
        feeder = fluid.DataFeeder(feed_list=[self.data, self.label],
                                  place=place)
        with fluid.scope_guard(scope):
            for _ in range(iters):
                data = next(train_reader())
                loss_v = executor.run(binary,
                                      feed=feeder.feed(data),
                                      fetch_list=[self.loss])

        scale_inference_pass = OutScaleForInferencePass(scope=scope)
        scale_inference_pass.apply(test_graph)

        # Freeze the graph for inference, but the weights of fc/conv remain float.
        freeze_pass = QuantizationFreezePass(
            scope=scope,
            place=place,
            weight_quantize_type=self.weight_quantize_type)
        freeze_pass.apply(test_graph)

        self.main_program = test_graph.to_program()

        with fluid.scope_guard(scope):
            self.main_program = self._normalize_program(
                self.main_program, self.data, self.fetch_list)

        self._save_models(self.path, list(self.feeds.keys()), self.fetch_list,
                          executor, self.main_program, scope)

        paddle_outs = self._get_paddle_outs(self.feeds, self.fetch_list,
                                            executor, self.main_program, scope)
        inference_outs = self._get_inference_outs(
            self._get_analysis_config(use_gpu=use_gpu))

        # Check whether the results calculated on CPU and on GPU are the same.
        self.assertTrue(
            len(paddle_outs) == len(inference_outs),
            "The number of outputs is different between inference and training forward at {}"
            .format(device))

        for out, inference_out in zip(paddle_outs, inference_outs):
            paddle_out = np.array(out)

            if flatten:
                paddle_out = paddle_out.flatten()
                inference_out = inference_out.flatten()

            self.assertTrue(
                np.allclose(paddle_out, inference_out, atol=atol),
                "Output has diff between inference and training forward at {} "
                .format(device))

        # Check whether the trt results and the GPU results are the same.
        if use_gpu and self.enable_trt:
            tensorrt_outputs = self._get_inference_outs(
                self._get_analysis_config(use_gpu=use_gpu,
                                          use_trt=self.enable_trt))

            if self.trt_parameters.use_static:
                # Run inference again to exercise deserialization of the
                # serialized TensorRT engine.
                tensorrt_outputs = self._get_inference_outs(
                    self._get_analysis_config(use_gpu=use_gpu,
                                              use_trt=self.enable_trt))

            self.assertTrue(
                len(tensorrt_outputs) == len(paddle_outs),
                "The number of outputs is different between GPU and TensorRT. "
            )

            for paddle_out, tensorrt_output in zip(paddle_outs,
                                                   tensorrt_outputs):
                paddle_out = np.array(paddle_out)

                if flatten:
                    paddle_out = paddle_out.flatten()
                    tensorrt_output = tensorrt_output.flatten()

                self.assertTrue(
                    np.allclose(paddle_out,
                                tensorrt_output,
                                rtol=rtol,
                                atol=atol),
                    "Output has diff between GPU and TensorRT. ")

        # Check whether the mkldnn results and the CPU results are the same.
        if (not use_gpu) and self.enable_mkldnn:
            mkldnn_outputs = self._get_inference_outs(
                self._get_analysis_config(use_gpu=use_gpu,
                                          use_mkldnn=self.enable_mkldnn))

            self.assertTrue(
                len(paddle_outs) == len(mkldnn_outputs),
                "The number of outputs is different between CPU and MKLDNN. ")

            if self.enable_mkldnn_bfloat16:
                atol = 0.01
            for paddle_out, mkldnn_output in zip(paddle_outs, mkldnn_outputs):
                self.assertTrue(
                    np.allclose(np.array(paddle_out), mkldnn_output,
                                atol=atol),
                    "Output has diff between CPU and MKLDNN. ")
Example #3
    def quantization_scale(self,
                           use_cuda,
                           seed,
                           activation_quant_type,
                           weight_quant_type='abs_max',
                           for_ci=False,
                           act_preprocess_func=None,
                           weight_preprocess_func=None,
                           act_quantize_func=None,
                           weight_quantize_func=None):
        def build_program(main, startup, is_test):
            main.random_seed = seed
            startup.random_seed = seed
            with fluid.unique_name.guard():
                with fluid.program_guard(main, startup):
                    img = fluid.layers.data(name='image',
                                            shape=[1, 28, 28],
                                            dtype='float32')
                    img.stop_gradient = False
                    label = fluid.layers.data(name='label',
                                              shape=[1],
                                              dtype='int64')
                    loss = conv_net(img, label)
                    if not is_test:
                        opt = fluid.optimizer.SGD(learning_rate=0.0001)
                        opt.minimize(loss)
            return [img, label], loss

        def get_optimizer():
            return fluid.optimizer.MomentumOptimizer(0.0001, 0.9)

        def load_dict():
            with open('mapping_table_for_saving_inference_model', 'r') as file:
                data = file.read()
                data = json.loads(data)
                return data

        def save_dict(mapping):
            with open('mapping_table_for_saving_inference_model', 'w') as file:
                file.write(json.dumps(mapping))

        random.seed(0)
        np.random.seed(0)

        main = fluid.Program()
        startup = fluid.Program()
        test_program = fluid.Program()
        feeds, loss = build_program(main, startup, False)
        build_program(test_program, startup, True)
        test_program = test_program.clone(for_test=True)
        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
        test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        scope = fluid.Scope()
        with fluid.scope_guard(scope):
            exe.run(startup)
        train_transform_pass = QuantizationTransformPass(
            scope=scope,
            place=place,
            activation_quantize_type=activation_quant_type,
            weight_quantize_type=weight_quant_type,
            act_preprocess_func=act_preprocess_func,
            weight_preprocess_func=weight_preprocess_func,
            act_quantize_func=act_quantize_func,
            weight_quantize_func=weight_quantize_func,
            optimizer_func=get_optimizer,
            executor=exe)
        train_transform_pass.apply(main_graph)
        test_transform_pass = QuantizationTransformPass(
            scope=scope,
            place=place,
            activation_quantize_type=activation_quant_type,
            weight_quantize_type=weight_quant_type,
            act_preprocess_func=act_preprocess_func,
            weight_preprocess_func=weight_preprocess_func,
            act_quantize_func=act_quantize_func,
            weight_quantize_func=weight_quantize_func,
            optimizer_func=get_optimizer,
            executor=exe)

        test_transform_pass.apply(test_graph)
        save_dict(test_graph.out_node_mapping_table)

        add_quant_dequant_pass = AddQuantDequantPass(scope=scope, place=place)
        add_quant_dequant_pass.apply(main_graph)
        add_quant_dequant_pass.apply(test_graph)

        scale_training_pass = OutScaleForTrainingPass(scope=scope, place=place)
        scale_training_pass.apply(main_graph)

        dev_name = '_gpu' if use_cuda else '_cpu'

        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = False
        build_strategy.fuse_all_reduce_ops = False
        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)
        iters = 5
        batch_size = 8

        train_reader = paddle.batch(paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=500),
                                    batch_size=batch_size)
        feeder = fluid.DataFeeder(feed_list=feeds, place=place)
        with fluid.scope_guard(scope):
            for _ in range(iters):
                data = next(train_reader())
                loss_v = exe.run(binary,
                                 feed=feeder.feed(data),
                                 fetch_list=[loss])

        out_scale_infer_pass = OutScaleForInferencePass(scope=scope)
        out_scale_infer_pass.apply(test_graph)

        freeze_pass = QuantizationFreezePass(
            scope=scope,
            place=place,
            weight_bits=8,
            activation_bits=8,
            weight_quantize_type=weight_quant_type)

        mapping_table = load_dict()
        test_graph.out_node_mapping_table = mapping_table
        if act_quantize_func is None and weight_quantize_func is None:
            freeze_pass.apply(test_graph)
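# --- Hedged sketch (not from the source): the rough shape of a custom
# ``act_preprocess_func`` that could be passed to the
# QuantizationTransformPass calls above. The pass is assumed to call it with
# an activation variable and to use the returned variable in place of the
# original; the fixed clipping range here is purely illustrative.
def sample_act_preprocess(x):
    # Clip activations to a fixed range before fake quantization.
    return fluid.layers.clip(x, min=-8.0, max=8.0)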
Example #4
    def quantization_scale(self,
                           use_cuda,
                           seed,
                           activation_quant_type,
                           weight_quant_type='abs_max',
                           for_ci=False):
        def build_program(main, startup, is_test):
            main.random_seed = seed
            startup.random_seed = seed
            with fluid.unique_name.guard():
                with fluid.program_guard(main, startup):
                    img = fluid.layers.data(name='image',
                                            shape=[1, 28, 28],
                                            dtype='float32')
                    label = fluid.layers.data(name='label',
                                              shape=[1],
                                              dtype='int64')
                    loss = residual_block(img, label, 1)
                    if not is_test:
                        opt = fluid.optimizer.Adam(learning_rate=0.0001)
                        opt.minimize(loss)
            return [img, label], loss

        random.seed(0)
        np.random.seed(0)

        main = fluid.Program()
        startup = fluid.Program()
        test_program = fluid.Program()
        feeds, loss = build_program(main, startup, False)
        build_program(test_program, startup, True)
        test_program = test_program.clone(for_test=True)
        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
        test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        scope = fluid.Scope()
        with fluid.scope_guard(scope):
            exe.run(startup)

        transform_pass = QuantizationTransformPass(
            scope=scope,
            place=place,
            activation_quantize_type=activation_quant_type,
            weight_quantize_type=weight_quant_type)
        transform_pass.apply(main_graph)
        transform_pass.apply(test_graph)

        add_quant_dequant_pass = AddQuantDequantPass(scope=scope, place=place)
        add_quant_dequant_pass.apply(main_graph)
        add_quant_dequant_pass.apply(test_graph)

        scale_training_pass = OutScaleForTrainingPass(scope=scope, place=place)
        scale_training_pass.apply(main_graph)

        dev_name = '_gpu' if use_cuda else '_cpu'
        if not for_ci:
            marked_nodes = set()
            for op in main_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            main_graph.draw('.', 'main_scale' + dev_name, marked_nodes)
            marked_nodes = set()
            for op in test_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            test_graph.draw('.', 'test_scale' + dev_name, marked_nodes)

        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = False
        build_strategy.fuse_all_reduce_ops = False
        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)
        iters = 5
        batch_size = 8

        train_reader = paddle.batch(paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=500),
                                    batch_size=batch_size)
        feeder = fluid.DataFeeder(feed_list=feeds, place=place)
        with fluid.scope_guard(scope):
            for _ in range(iters):
                data = next(train_reader())
                loss_v = exe.run(binary,
                                 feed=feeder.feed(data),
                                 fetch_list=[loss])
                if not for_ci:
                    print('{}: {}'.format('loss' + dev_name, loss_v))

        scale_inference_pass = OutScaleForInferencePass(scope=scope)
        scale_inference_pass.apply(test_graph)

        # Freeze the graph for inference, but the weights of fc/conv remain float.
        freeze_pass = QuantizationFreezePass(
            scope=scope, place=place, weight_quantize_type=weight_quant_type)
        freeze_pass.apply(test_graph)
        server_program = test_graph.to_program()

        if not for_ci:
            marked_nodes = set()
            for op in test_graph.all_op_nodes():
                if op.name().find('quantize') > -1:
                    marked_nodes.add(op)
            test_graph.draw('.', 'quant_scale' + dev_name, marked_nodes)

        with open('quant_scale_model' + dev_name + '.txt', 'w') as f:
            f.write(str(server_program))

        with fluid.scope_guard(scope):
            fluid.io.save_inference_model('quant_scale_model' + dev_name,
                                          ['image', 'label'], [loss], exe,
                                          server_program)
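        # --- Usage sketch (not from the source): loading the inference model
        # saved just above, assuming ``scope``, ``exe``, and ``dev_name`` are
        # still live. Feed names should come back as ['image', 'label'] and
        # the fetch target as the loss, mirroring save_inference_model above.
        with fluid.scope_guard(scope):
            [loaded_program, feed_names, fetch_targets] = \
                fluid.io.load_inference_model('quant_scale_model' + dev_name,
                                              exe)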
Example #5
    def test_out_scale_acc(self):
        def _build_static_lenet(main, startup, is_test=False, seed=1000):
            with fluid.unique_name.guard():
                with fluid.program_guard(main, startup):
                    main.random_seed = seed
                    startup.random_seed = seed
                    img = fluid.layers.data(name='image',
                                            shape=[1, 28, 28],
                                            dtype='float32')
                    label = fluid.layers.data(name='label',
                                              shape=[1],
                                              dtype='int64')
                    prediction = StaticLenet(img)
                    if not is_test:
                        loss = fluid.layers.cross_entropy(input=prediction,
                                                          label=label)
                        avg_loss = fluid.layers.mean(loss)
                    else:
                        avg_loss = prediction
            return img, label, avg_loss

        reader = paddle.batch(paddle.dataset.mnist.test(),
                              batch_size=32,
                              drop_last=True)
        weight_quantize_type = 'abs_max'
        activation_quant_type = 'moving_average_abs_max'
        param_init_map = {}
        seed = 1000
        lr = 0.1
        dynamic_out_scale_list = []
        static_out_scale_list = []

        # imperative train
        _logger.info(
            "--------------------------dynamic graph qat--------------------------"
        )
        imperative_out_scale = ImperativeQuantAware()

        with fluid.dygraph.guard():
            np.random.seed(seed)
            fluid.default_main_program().random_seed = seed
            fluid.default_startup_program().random_seed = seed
            lenet = ImperativeLenet()
            fixed_state = {}
            for name, param in lenet.named_parameters():
                p_shape = param.numpy().shape
                p_value = param.numpy()
                if name.endswith("bias"):
                    value = np.zeros_like(p_value).astype('float32')
                else:
                    value = np.random.normal(loc=0.0,
                                             scale=0.01,
                                             size=np.prod(p_shape)).reshape(
                                                 p_shape).astype('float32')
                fixed_state[name] = value
                param_init_map[param.name] = value
            lenet.set_dict(fixed_state)
            imperative_out_scale.quantize(lenet)
            adam = AdamOptimizer(learning_rate=lr,
                                 parameter_list=lenet.parameters())
            dynamic_loss_rec = []
            lenet.train()
            for batch_id, data in enumerate(reader()):
                x_data = np.array([x[0].reshape(1, 28, 28)
                                   for x in data]).astype('float32')
                y_data = np.array([x[1] for x in data
                                   ]).astype('int64').reshape(-1, 1)

                img = fluid.dygraph.to_variable(x_data)
                label = fluid.dygraph.to_variable(y_data)

                out = lenet(img)
                loss = fluid.layers.cross_entropy(out, label)
                avg_loss = fluid.layers.mean(loss)
                avg_loss.backward()
                adam.minimize(avg_loss)
                lenet.clear_gradients()
                dynamic_loss_rec.append(avg_loss.numpy()[0])
                if batch_id % 100 == 0:
                    _logger.info('{}: {}'.format('loss', avg_loss.numpy()))

            lenet.eval()

        path = "./dynamic_outscale_infer_model/lenet"
        dynamic_save_dir = "./dynamic_outscale_infer_model"

        imperative_out_scale.save_quantized_model(
            layer=lenet,
            path=path,
            input_spec=[
                paddle.static.InputSpec(shape=[None, 1, 28, 28],
                                        dtype='float32')
            ])

        _logger.info(
            "--------------------------static graph qat--------------------------"
        )
        static_loss_rec = []
        if core.is_compiled_with_cuda():
            place = core.CUDAPlace(0)
        else:
            place = core.CPUPlace()
        exe = fluid.Executor(place)

        main = fluid.Program()
        infer = fluid.Program()
        startup = fluid.Program()
        static_img, static_label, static_loss = _build_static_lenet(
            main, startup, False, seed)
        infer_img, _, infer_pre = _build_static_lenet(infer, startup, True,
                                                      seed)
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                opt = AdamOptimizer(learning_rate=lr)
                opt.minimize(static_loss)

        scope = core.Scope()
        with fluid.scope_guard(scope):
            exe.run(startup)
        for param in main.all_parameters():
            param_tensor = scope.var(param.name).get_tensor()
            param_tensor.set(param_init_map[param.name], place)
        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
        infer_graph = IrGraph(core.Graph(infer.desc), for_test=True)
        transform_pass = QuantizationTransformPass(
            scope=scope,
            place=place,
            activation_quantize_type=activation_quant_type,
            weight_quantize_type=weight_quantize_type,
            quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul'])
        transform_pass.apply(main_graph)
        transform_pass.apply(infer_graph)
        outscale_pass = OutScaleForTrainingPass(scope=scope, place=place)
        outscale_pass.apply(main_graph)
        build_strategy = fluid.BuildStrategy()
        build_strategy.fuse_all_reduce_ops = False
        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
            loss_name=static_loss.name, build_strategy=build_strategy)

        feeder = fluid.DataFeeder(feed_list=[static_img, static_label],
                                  place=place)
        with fluid.scope_guard(scope):
            for batch_id, data in enumerate(reader()):
                loss_v, = exe.run(binary,
                                  feed=feeder.feed(data),
                                  fetch_list=[static_loss])
                static_loss_rec.append(loss_v[0])
                if batch_id % 100 == 0:
                    _logger.info('{}: {}'.format('loss', loss_v))
        scale_inference_pass = OutScaleForInferencePass(scope=scope)
        scale_inference_pass.apply(infer_graph)

        save_program = infer_graph.to_program()
        static_save_dir = "./static_outscale_infer_model"
        with fluid.scope_guard(scope):
            fluid.io.save_inference_model(
                dirname=static_save_dir,
                feeded_var_names=[infer_img.name],
                target_vars=[infer_pre],
                executor=exe,
                main_program=save_program,
                model_filename="lenet" + INFER_MODEL_SUFFIX,
                params_filename="lenet" + INFER_PARAMS_SUFFIX)

        rtol = 1e-05
        atol = 1e-08
        for i, (loss_d,
                loss_s) in enumerate(zip(dynamic_loss_rec, static_loss_rec)):
            diff = np.abs(loss_d - loss_s)
            if diff > (atol + rtol * np.abs(loss_s)):
                _logger.info(
                    "diff({}) at {}, dynamic loss = {}, static loss = {}".
                    format(diff, i, loss_d, loss_s))
                break

        self.assertTrue(np.allclose(np.array(dynamic_loss_rec),
                                    np.array(static_loss_rec),
                                    rtol=rtol,
                                    atol=atol,
                                    equal_nan=True),
                        msg='Failed to do the imperative qat.')

        # load dynamic model
        [dynamic_inference_program, feed_target_names,
         fetch_targets] = (fluid.io.load_inference_model(
             dirname=dynamic_save_dir,
             executor=exe,
             model_filename="lenet" + INFER_MODEL_SUFFIX,
             params_filename="lenet" + INFER_PARAMS_SUFFIX))
        # load static model
        [static_inference_program, feed_target_names,
         fetch_targets] = (fluid.io.load_inference_model(
             dirname=static_save_dir,
             executor=exe,
             model_filename="lenet" + INFER_MODEL_SUFFIX,
             params_filename="lenet" + INFER_PARAMS_SUFFIX))

        dynamic_ops = dynamic_inference_program.global_block().ops
        static_ops = static_inference_program.global_block().ops

        for op in dynamic_ops[:]:
            if op.type == "flatten2" or 'fake' in op.type:
                dynamic_ops.remove(op)

        for op in static_ops[:]:
            if 'fake' in op.type:
                static_ops.remove(op)

        for i in range(len(dynamic_ops)):
            if dynamic_ops[i].has_attr("out_threshold"):
                self.assertEqual(dynamic_ops[i].type, static_ops[i].type)
                self.assertEqual(dynamic_ops[i].attr("out_threshold"),
                                 static_ops[i].attr("out_threshold"))