def convert(program, place, config=None, scope=None, save_int8=False):
    """
    Convert a quantized and well-trained ``program`` to a final quantized
    ``program`` that can be used to save an ``inference model``.

    Args:
        program(fluid.Program): quantized and well-trained ``test program``.
        place(fluid.CPUPlace or fluid.CUDAPlace): This parameter represents
            the device on which the executor runs.
        config(dict, optional): configs for convert. If set to None, the
            default config is used. It must be the same config that was
            used in 'quant_aware'. Default: None.
        scope(fluid.Scope, optional): Scope records the mapping between
            variable names and variables, similar to brackets in
            programming languages. Usually users can use
            `fluid.global_scope <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html>`_.
            When ``None``, `fluid.global_scope() <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api_cn/executor_cn/global_scope_cn.html>`_
            is used. Default: ``None``.
        save_int8(bool, optional): Whether to additionally return a
            ``program`` whose model parameters are of dtype ``int8``. That
            program can only be used to measure model size. Default:
            ``False``.

    Returns:
        Tuple: a frozen program which can be used for inference.
        When ``save_int8`` is False, returns ``freezed_program(fluid.Program)``.
        When ``save_int8`` is True, returns ``freezed_program(fluid.Program)``
        and ``freezed_program_int8(fluid.Program)``.
    """
    scope = fluid.global_scope() if not scope else scope

    if config is None:
        config = _quant_config_default
    else:
        assert isinstance(config, dict), "config must be dict"
        config = _parse_configs(config)
    _logger.info("convert config {}".format(config))

    test_graph = IrGraph(core.Graph(program.desc), for_test=True)
    support_op_types = []
    for op in config['quantize_op_types']:
        if op in QuantizationFreezePass._supported_quantizable_op_type:
            support_op_types.append(op)

    # Freeze the graph after training by adjusting the quantize
    # operators' order for the inference.
    freeze_pass = QuantizationFreezePass(
        scope=scope,
        place=place,
        weight_bits=config['weight_bits'],
        activation_bits=config['activation_bits'],
        weight_quantize_type=config['weight_quantize_type'],
        quantizable_op_type=support_op_types)
    freeze_pass.apply(test_graph)
    freezed_program = test_graph.to_program()

    if save_int8:
        # Use the same (already normalized) scope as the freeze pass.
        convert_int8_pass = ConvertToInt8Pass(
            scope=scope,
            place=place,
            quantizable_op_type=support_op_types)
        convert_int8_pass.apply(test_graph)
        freezed_program_int8 = test_graph.to_program()
        return freezed_program, freezed_program_int8
    else:
        return freezed_program
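
# Usage sketch (not part of the original source): `convert` is meant to run
# after a `quant_aware`-style transform, with the same `config`. The
# `quant_program`, `exe`, `fetch_targets`, feed names, and output directory
# below are hypothetical placeholders.
def _example_convert_usage(quant_program, exe, place, fetch_targets):
    # `quant_program` is assumed to come from quantization-aware training.
    freezed_program, freezed_program_int8 = convert(
        quant_program, place, config=None, save_int8=True)
    # The int8 variant is only useful for measuring model size; save the
    # float-weight frozen program for inference.
    fluid.io.save_inference_model(
        dirname='quant_infer_model',   # hypothetical output directory
        feeded_var_names=['image'],    # hypothetical feed names
        target_vars=fetch_targets,
        executor=exe,
        main_program=freezed_program)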
def freeze_graph(self,
                 use_cuda,
                 seed,
                 activation_quant_type,
                 weight_quant_type='abs_max',
                 for_ci=True,
                 quant_skip_pattern='skip_quant'):
    def build_program(main, startup, is_test):
        main.random_seed = seed
        startup.random_seed = seed
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                img = fluid.layers.data(
                    name='image', shape=[1, 28, 28], dtype='float32')
                label = fluid.layers.data(
                    name='label', shape=[1], dtype='int64')
                loss = conv_net(img, label, quant_skip_pattern)
                if not is_test:
                    opt = fluid.optimizer.Adam(learning_rate=0.001)
                    opt.minimize(loss)
        return [img, label], loss

    random.seed(0)
    np.random.seed(0)

    main = fluid.Program()
    startup = fluid.Program()
    test_program = fluid.Program()
    feeds, loss = build_program(main, startup, False)
    build_program(test_program, startup, True)
    test_program = test_program.clone(for_test=True)
    main_graph = IrGraph(core.Graph(main.desc), for_test=False)
    test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    scope = fluid.Scope()
    with fluid.scope_guard(scope):
        exe.run(startup)
    transform_pass = QuantizationTransformPass(
        scope=scope,
        place=place,
        activation_quantize_type=activation_quant_type,
        weight_quantize_type=weight_quant_type,
        skip_pattern=quant_skip_pattern)
    transform_pass.apply(main_graph)
    transform_pass.apply(test_graph)
    dev_name = '_gpu_' if use_cuda else '_cpu_'
    if not for_ci:
        marked_nodes = set()
        for op in main_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        main_graph.draw('.', 'main' + dev_name + activation_quant_type +
                        '_' + weight_quant_type, marked_nodes)
        marked_nodes = set()
        for op in test_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw('.', 'test' + dev_name + activation_quant_type +
                        '_' + weight_quant_type, marked_nodes)

    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = False
    build_strategy.enable_inplace = False
    binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)
    quantized_test_program = test_graph.to_program()
    iters = 5
    batch_size = 8

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=500),
        batch_size=batch_size)
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=batch_size)
    feeder = fluid.DataFeeder(feed_list=feeds, place=place)
    with fluid.scope_guard(scope):
        for _ in range(iters):
            data = next(train_reader())
            loss_v = exe.run(binary,
                             feed=feeder.feed(data),
                             fetch_list=[loss])
            if not for_ci:
                print('{}: {}'.format('loss' + dev_name +
                                      activation_quant_type + '_' +
                                      weight_quant_type, loss_v))

    test_data = next(test_reader())
    with fluid.program_guard(quantized_test_program):
        w_var = fluid.framework._get_var('conv2d_1.w_0.quantized',
                                         quantized_test_program)
    # Testing
    with fluid.scope_guard(scope):
        test_loss1, w_quant = exe.run(program=quantized_test_program,
                                      feed=feeder.feed(test_data),
                                      fetch_list=[loss, w_var])

    # Freeze graph for inference, but the weight of fc/conv is still float type.
    freeze_pass = QuantizationFreezePass(
        scope=scope, place=place, weight_quantize_type=weight_quant_type)
    freeze_pass.apply(test_graph)
    if not for_ci:
        marked_nodes = set()
        for op in test_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw('.', 'test_freeze' + dev_name +
                        activation_quant_type + '_' + weight_quant_type,
                        marked_nodes)

    server_program = test_graph.to_program()
    with fluid.scope_guard(scope):
        test_loss2, = exe.run(program=server_program,
                              feed=feeder.feed(test_data),
                              fetch_list=[loss])
    self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
    if not for_ci:
        print('{}: {}'.format('test_loss1' + dev_name +
                              activation_quant_type + '_' +
                              weight_quant_type, test_loss1))
        print('{}: {}'.format('test_loss2' + dev_name +
                              activation_quant_type + '_' +
                              weight_quant_type, test_loss2))
    w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
    # This assertion may fail because of calculation precision:
    # self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
    if not for_ci:
        print('{}: {}'.format('w_freeze' + dev_name +
                              activation_quant_type + '_' +
                              weight_quant_type, np.sum(w_freeze)))
        print('{}: {}'.format('w_quant' + dev_name +
                              activation_quant_type + '_' +
                              weight_quant_type, np.sum(w_quant)))

    # Convert the parameters to 8-bit.
    convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
    convert_int8_pass.apply(test_graph)
    if not for_ci:
        marked_nodes = set()
        for op in test_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw('.', 'test_int8' + dev_name +
                        activation_quant_type + '_' + weight_quant_type,
                        marked_nodes)
    server_program_int8 = test_graph.to_program()
    # Save the 8-bit parameters and model file.
    with fluid.scope_guard(scope):
        fluid.io.save_inference_model(
            'server_int8' + dev_name + activation_quant_type + '_' +
            weight_quant_type, ['image', 'label'], [loss], exe,
            server_program_int8)
        # Test whether the 8-bit parameters and model file can be loaded
        # successfully.
        [infer, feed, fetch] = fluid.io.load_inference_model(
            'server_int8' + dev_name + activation_quant_type + '_' +
            weight_quant_type, exe)
    # Check the loaded 8-bit weights.
    w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor())
    self.assertEqual(w_8bit.dtype, np.int8)
    self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
    if not for_ci:
        print('{}: {}'.format('w_8bit' + dev_name +
                              activation_quant_type + '_' +
                              weight_quant_type, np.sum(w_8bit)))
        print('{}: {}'.format('w_freeze' + dev_name +
                              activation_quant_type + '_' +
                              weight_quant_type, np.sum(w_freeze)))

    mobile_pass = TransformForMobilePass()
    mobile_pass.apply(test_graph)
    if not for_ci:
        marked_nodes = set()
        for op in test_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw('.', 'test_mobile' + dev_name +
                        activation_quant_type + '_' + weight_quant_type,
                        marked_nodes)

    mobile_program = test_graph.to_program()
    with fluid.scope_guard(scope):
        fluid.io.save_inference_model(
            'mobile_int8' + dev_name + activation_quant_type + '_' +
            weight_quant_type, ['image', 'label'], [loss], exe,
            mobile_program)
def check_output_with_option(self,
                             use_gpu,
                             atol=1e-5,
                             flatten=False,
                             quant=False,
                             rtol=1e-5):
    '''
    Check that the results are the same whether computed on CPU or GPU,
    with TensorRT enabled or disabled, and with MKLDNN enabled or disabled.
    '''
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    executor = fluid.Executor(place)
    scope = fluid.Scope()
    device = "GPU" if use_gpu else "CPU"
    with fluid.scope_guard(scope):
        executor.run(self.startup_program)
        executor.run(self.test_startup_program)
    main_graph = IrGraph(core.Graph(self.main_program.desc), for_test=False)
    test_graph = IrGraph(core.Graph(self.test_main_program.desc),
                         for_test=True)

    transform_pass = QuantizationTransformPass(
        scope=scope,
        place=place,
        activation_quantize_type=self.activation_quantize_type,
        weight_quantize_type=self.weight_quantize_type)
    transform_pass.apply(main_graph)
    transform_pass.apply(test_graph)

    add_quant_dequant_pass = AddQuantDequantPass(scope=scope, place=place)
    add_quant_dequant_pass.apply(main_graph)
    add_quant_dequant_pass.apply(test_graph)

    scale_training_pass = OutScaleForTrainingPass(scope=scope, place=place)
    scale_training_pass.apply(main_graph)

    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = False
    build_strategy.enable_inplace = False
    build_strategy.fuse_all_reduce_ops = False
    binary = fluid.CompiledProgram(main_graph.graph)

    iters = 10
    batch_size = 1
    train_reader = paddle.batch(paddle.reader.shuffle(
        paddle.dataset.mnist.train(), buf_size=500),
        batch_size=batch_size)
    feeder = fluid.DataFeeder(feed_list=[self.data, self.label],
                              place=place)
    with fluid.scope_guard(scope):
        for _ in range(iters):
            data = next(train_reader())
            loss_v = executor.run(binary,
                                  feed=feeder.feed(data),
                                  fetch_list=[self.loss])

    scale_inference_pass = OutScaleForInferencePass(scope=scope)
    scale_inference_pass.apply(test_graph)

    # Freeze graph for inference, but the weight of fc/conv is still float type.
    freeze_pass = QuantizationFreezePass(
        scope=scope,
        place=place,
        weight_quantize_type=self.weight_quantize_type)
    freeze_pass.apply(test_graph)

    self.main_program = test_graph.to_program()

    with fluid.scope_guard(scope):
        self.main_program = self._normalize_program(
            self.main_program, self.data, self.fetch_list)

    self._save_models(self.path, list(self.feeds.keys()), self.fetch_list,
                      executor, self.main_program, scope)

    paddle_outs = self._get_paddle_outs(self.feeds, self.fetch_list,
                                        executor, self.main_program, scope)
    inference_outs = self._get_inference_outs(
        self._get_analysis_config(use_gpu=use_gpu))

    # Check whether the results calculated on CPU and on GPU are the same.
    self.assertTrue(
        len(paddle_outs) == len(inference_outs),
        "The number of outputs is different between inference and training "
        "forward at {}".format(device))

    for out, inference_out in zip(paddle_outs, inference_outs):
        paddle_out = np.array(out)
        if flatten:
            paddle_out = paddle_out.flatten()
            inference_out = inference_out.flatten()
        self.assertTrue(
            np.allclose(paddle_out, inference_out, atol=atol),
            "Output has diff between inference and training forward at {} "
            .format(device))

    # Check whether the TensorRT results and the GPU results are the same.
    if use_gpu and self.enable_trt:
        tensorrt_outputs = self._get_inference_outs(
            self._get_analysis_config(use_gpu=use_gpu,
                                      use_trt=self.enable_trt))
        if self.trt_parameters.use_static:
            # Run again so the deserialized (static) engine path is covered.
            tensorrt_outputs = self._get_inference_outs(
                self._get_analysis_config(use_gpu=use_gpu,
                                          use_trt=self.enable_trt))

        self.assertTrue(
            len(tensorrt_outputs) == len(paddle_outs),
            "The number of outputs is different between GPU and TensorRT. ")

        for paddle_out, tensorrt_output in zip(paddle_outs,
                                               tensorrt_outputs):
            paddle_out = np.array(paddle_out)
            if flatten:
                paddle_out = paddle_out.flatten()
                tensorrt_output = tensorrt_output.flatten()
            self.assertTrue(
                np.allclose(paddle_out, tensorrt_output, rtol=rtol,
                            atol=atol),
                "Output has diff between GPU and TensorRT. ")

    # Check whether the MKLDNN results and the CPU results are the same.
    if (not use_gpu) and self.enable_mkldnn:
        mkldnn_outputs = self._get_inference_outs(
            self._get_analysis_config(use_gpu=use_gpu,
                                      use_mkldnn=self.enable_mkldnn))
        self.assertTrue(
            len(paddle_outs) == len(mkldnn_outputs),
            "The number of outputs is different between CPU and MKLDNN. ")

        if self.enable_mkldnn_bfloat16:
            atol = 0.01
        for paddle_out, mkldnn_output in zip(paddle_outs, mkldnn_outputs):
            self.assertTrue(
                np.allclose(np.array(paddle_out), mkldnn_output, atol=atol),
                "Output has diff between CPU and MKLDNN. ")
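
# For reference (not part of the original source): the tolerance checks
# above follow standard NumPy semantics. np.allclose(a, b, rtol, atol) is
# true iff |a - b| <= atol + rtol * |b| elementwise, so `rtol` scales with
# the reference value while `atol` bounds the error near zero.
def _example_allclose_semantics():
    a = np.array([1.0, 1e-6])
    b = np.array([1.00001, 0.0])
    # Both elements pass: 1e-5 <= 1e-5 + 1e-5 * 1.00001, and 1e-6 <= 1e-5.
    assert np.allclose(a, b, rtol=1e-5, atol=1e-5)
    # With a tight atol, the near-zero element fails: 1e-6 > 1e-8.
    assert not np.allclose(a, b, rtol=1e-5, atol=1e-8)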
def quantization_scale(self,
                       use_cuda,
                       seed,
                       activation_quant_type,
                       weight_quant_type='abs_max',
                       for_ci=False):
    def build_program(main, startup, is_test):
        main.random_seed = seed
        startup.random_seed = seed
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                img = fluid.layers.data(
                    name='image', shape=[1, 28, 28], dtype='float32')
                label = fluid.layers.data(
                    name='label', shape=[1], dtype='int64')
                loss = residual_block(img, label, 1)
                if not is_test:
                    opt = fluid.optimizer.Adam(learning_rate=0.0001)
                    opt.minimize(loss)
        return [img, label], loss

    random.seed(0)
    np.random.seed(0)

    main = fluid.Program()
    startup = fluid.Program()
    test_program = fluid.Program()
    feeds, loss = build_program(main, startup, False)
    build_program(test_program, startup, True)
    test_program = test_program.clone(for_test=True)
    main_graph = IrGraph(core.Graph(main.desc), for_test=False)
    test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    scope = fluid.Scope()
    with fluid.scope_guard(scope):
        exe.run(startup)

    transform_pass = QuantizationTransformPass(
        scope=scope,
        place=place,
        activation_quantize_type=activation_quant_type,
        weight_quantize_type=weight_quant_type)
    transform_pass.apply(main_graph)
    transform_pass.apply(test_graph)

    add_quant_dequant_pass = AddQuantDequantPass(scope=scope, place=place)
    add_quant_dequant_pass.apply(main_graph)
    add_quant_dequant_pass.apply(test_graph)

    scale_training_pass = OutScaleForTrainingPass(scope=scope, place=place)
    scale_training_pass.apply(main_graph)

    dev_name = '_gpu' if use_cuda else '_cpu'
    if not for_ci:
        marked_nodes = set()
        for op in main_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        main_graph.draw('.', 'main_scale' + dev_name, marked_nodes)
        marked_nodes = set()
        for op in test_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw('.', 'test_scale' + dev_name, marked_nodes)

    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = False
    build_strategy.enable_inplace = False
    build_strategy.fuse_all_reduce_ops = False
    binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)
    iters = 5
    batch_size = 8

    train_reader = paddle.batch(paddle.reader.shuffle(
        paddle.dataset.mnist.train(), buf_size=500),
        batch_size=batch_size)
    feeder = fluid.DataFeeder(feed_list=feeds, place=place)
    with fluid.scope_guard(scope):
        for _ in range(iters):
            data = next(train_reader())
            loss_v = exe.run(binary,
                             feed=feeder.feed(data),
                             fetch_list=[loss])
            if not for_ci:
                print('{}: {}'.format('loss' + dev_name, loss_v))

    scale_inference_pass = OutScaleForInferencePass(scope=scope)
    scale_inference_pass.apply(test_graph)

    # Freeze graph for inference, but the weight of fc/conv is still float type.
    freeze_pass = QuantizationFreezePass(
        scope=scope, place=place, weight_quantize_type=weight_quant_type)
    freeze_pass.apply(test_graph)
    server_program = test_graph.to_program()

    if not for_ci:
        marked_nodes = set()
        for op in test_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw('.', 'quant_scale' + dev_name, marked_nodes)

        with open('quant_scale_model' + dev_name + '.txt', 'w') as f:
            f.write(str(server_program))

    with fluid.scope_guard(scope):
        fluid.io.save_inference_model('quant_scale_model' + dev_name,
                                      ['image', 'label'], [loss], exe,
                                      server_program)
def check_output_with_option(self,
                             use_gpu,
                             atol=1e-5,
                             flatten=False,
                             quant=False):
    '''
    Check that the results are the same whether computed on CPU or GPU,
    with TensorRT enabled or disabled, and with MKLDNN enabled or disabled.
    '''
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    executor = fluid.Executor(place)
    scope = fluid.Scope()
    device = "GPU" if use_gpu else "CPU"
    with fluid.scope_guard(scope):
        executor.run(self.startup_program)

    if quant:
        main_graph = IrGraph(core.Graph(self.main_program.desc),
                             for_test=True)

        transform_pass = QuantizationTransformPass(
            scope=scope,
            place=place,
            activation_quantize_type=self.activation_quant_type,
            weight_quantize_type=self.weight_quant_type,
            quantizable_op_type=[
                'conv2d', 'mul', 'depthwise_conv2d', 'conv2d_transpose'
            ])
        transform_pass.apply(main_graph)

        weight_scale_map = {
            "conv2d": "conv2d_0.w_0.scale",
            "mul": "fc_0.w_0.scale"
        }

        weight_scale_tensor = scope.var(
            weight_scale_map[self.quantized_op_type]).get_tensor()
        weight_scale = np.ones(self.channels).astype("float32")
        weight_scale_tensor.set(weight_scale, place)

        op_nodes = main_graph.all_op_nodes()
        for op_node in op_nodes:
            if op_node.name() in [self.quantized_op_type, "relu"]:
                op_node.op()._set_attr("out_threshold", 0.5)

        with fluid.scope_guard(scope):
            executor.run(program=self.main_program,
                         feed=self.feeds,
                         fetch_list=self.fetch_list)

        freeze_pass = QuantizationFreezePass(
            scope=scope,
            place=place,
            weight_quantize_type=self.weight_quant_type)
        freeze_pass.apply(main_graph)
        self.main_program = main_graph.to_program()

    outs = self._save_models(executor, self.main_program, scope)
    analysis_outputs = self._get_analysis_outputs(
        self._get_analysis_config(use_gpu=use_gpu))

    # Check whether the results calculated on CPU and on GPU are the same.
    self.assertTrue(
        len(outs) == len(analysis_outputs),
        "The number of outputs is different between inference and training "
        "forward at {}".format(device))
    for out, analysis_output in zip(outs, analysis_outputs):
        out = np.array(out)
        if flatten:
            out = out.flatten()
            analysis_output = analysis_output.flatten()
        self.assertTrue(
            np.allclose(out, analysis_output, atol=atol),
            "Output has diff between inference and training forward at {} "
            .format(device))

    # Check whether the TensorRT results and the GPU results are the same.
    if use_gpu and self.enable_trt:
        tensorrt_outputs = self._get_analysis_outputs(
            self._get_analysis_config(use_gpu=use_gpu,
                                      use_trt=self.enable_trt))
        if self.trt_parameters.use_static:
            # Run again so the deserialized (static) engine path is covered.
            tensorrt_outputs = self._get_analysis_outputs(
                self._get_analysis_config(use_gpu=use_gpu,
                                          use_trt=self.enable_trt))
        self.assertTrue(
            len(tensorrt_outputs) == len(outs),
            "The number of outputs is different between GPU and TensorRT. ")
        for out, tensorrt_output in zip(outs, tensorrt_outputs):
            out = np.array(out)
            if flatten:
                out = out.flatten()
                tensorrt_output = tensorrt_output.flatten()
            self.assertTrue(
                np.allclose(out, tensorrt_output, atol=atol),
                "Output has diff between GPU and TensorRT. ")

    # Check whether the MKLDNN results and the CPU results are the same.
    if (not use_gpu) and self.enable_mkldnn:
        mkldnn_outputs = self._get_analysis_outputs(
            self._get_analysis_config(use_gpu=use_gpu,
                                      use_mkldnn=self.enable_mkldnn))
        self.assertTrue(
            len(outs) == len(mkldnn_outputs),
            "The number of outputs is different between CPU and MKLDNN. ")
        if self.enable_mkldnn_bfloat16:
            atol = 0.01
        for out, mkldnn_output in zip(outs, mkldnn_outputs):
            self.assertTrue(
                np.allclose(np.array(out), mkldnn_output, atol=atol),
                "Output has diff between CPU and MKLDNN. ")
def train(args):
    # parameters from arguments
    model_name = args.model
    pretrained_fp32_model = args.pretrained_fp32_model
    checkpoint = args.checkpoint
    model_save_dir = args.model_save_dir
    data_dir = args.data_dir
    activation_quant_type = args.act_quant_type
    weight_quant_type = args.wt_quant_type
    print("Using %s as the activation quantize type." %
          activation_quant_type)
    print("Using %s as the weight quantize type." % weight_quant_type)

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()

    _, _, train_py_reader, train_cost, train_acc1, train_acc5, global_lr = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    image, out, test_py_reader, test_cost, test_acc1, test_acc5 = build_program(
        is_train=False,
        main_prog=test_prog,
        startup_prog=startup_prog,
        args=args)
    test_prog = test_prog.clone(for_test=True)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    main_graph = IrGraph(core.Graph(train_prog.desc), for_test=False)
    test_graph = IrGraph(core.Graph(test_prog.desc), for_test=True)

    if pretrained_fp32_model:
        def if_exist(var):
            return os.path.exists(
                os.path.join(pretrained_fp32_model, var.name))

        fluid.io.load_vars(
            exe,
            pretrained_fp32_model,
            main_program=train_prog,
            predicate=if_exist)

    if args.use_gpu:
        visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
        if visible_device:
            device_num = len(visible_device.split(','))
        else:
            device_num = subprocess.check_output(
                ['nvidia-smi', '-L']).decode().count('\n')
    else:
        device_num = 1

    # Integer division keeps the batch size an int under Python 3.
    train_batch_size = args.batch_size // device_num
    test_batch_size = 1 if activation_quant_type == 'abs_max' else 8
    train_reader = paddle.batch(
        reader.train(data_dir=data_dir),
        batch_size=train_batch_size,
        drop_last=True)
    test_reader = paddle.batch(
        reader.val(data_dir=data_dir), batch_size=test_batch_size)

    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)

    train_fetch_list = [
        train_cost.name, train_acc1.name, train_acc5.name, global_lr.name
    ]
    test_fetch_list = [test_cost.name, test_acc1.name, test_acc5.name]

    # 1. Make some quantization transforms in the graph before training and
    #    testing. According to the weight and activation quantization type,
    #    fake quantize and fake dequantize operators will be inserted into
    #    the graph.
    transform_pass = QuantizationTransformPass(
        scope=fluid.global_scope(),
        place=place,
        activation_quantize_type=activation_quant_type,
        weight_quantize_type=weight_quant_type)

    transform_pass.apply(main_graph)
    transform_pass.apply(test_graph)

    if checkpoint:
        load_persistable_nodes(exe, checkpoint, main_graph)

    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = False
    build_strategy.enable_inplace = False
    build_strategy.fuse_all_reduce_ops = False
    binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
        loss_name=train_cost.name, build_strategy=build_strategy)
    test_prog = test_graph.to_program()

    params = models.__dict__[args.model]().params
    for pass_id in range(params["num_epochs"]):
        train_py_reader.start()

        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []
        batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc1, acc5, lr = exe.run(
                    binary, fetch_list=train_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(np.array(loss))
                acc1 = np.mean(np.array(acc1))
                acc5 = np.mean(np.array(acc5))
                train_info[0].append(loss)
                train_info[1].append(acc1)
                train_info[2].append(acc5)
                lr = np.mean(np.array(lr))
                train_time.append(period)

                if batch_id % 10 == 0:
                    print("Pass {0}, trainbatch {1}, loss {2}, "
                          "acc1 {3}, acc5 {4}, lr {5}, time {6}".format(
                              pass_id, batch_id, loss, acc1, acc5,
                              "%.6f" % lr, "%2.2f sec" % period))
                    sys.stdout.flush()
                batch_id += 1
        except fluid.core.EOFException:
            train_py_reader.reset()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()

        test_py_reader.start()

        test_batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc1, acc5 = exe.run(
                    program=test_prog, fetch_list=test_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(loss)
                acc1 = np.mean(acc1)
                acc5 = np.mean(acc5)
                test_info[0].append(loss)
                test_info[1].append(acc1)
                test_info[2].append(acc5)
                if test_batch_id % 10 == 0:
                    print("Pass {0}, testbatch {1}, loss {2}, "
                          "acc1 {3}, acc5 {4}, time {5}".format(
                              pass_id, test_batch_id, loss, acc1, acc5,
                              "%2.2f sec" % period))
                    sys.stdout.flush()
                test_batch_id += 1
        except fluid.core.EOFException:
            test_py_reader.reset()

        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()

        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
              "test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(
                  pass_id, train_loss, train_acc1, train_acc5, test_loss,
                  test_acc1, test_acc5))
        sys.stdout.flush()

        save_checkpoint_path = os.path.join(model_save_dir, model_name,
                                            str(pass_id))
        if not os.path.isdir(save_checkpoint_path):
            os.makedirs(save_checkpoint_path)
        save_persistable_nodes(exe, save_checkpoint_path, main_graph)

    model_path = os.path.join(model_save_dir, model_name,
                              args.act_quant_type)
    float_path = os.path.join(model_path, 'float')
    int8_path = os.path.join(model_path, 'int8')
    mobile_path = os.path.join(model_path, 'mobile')
    if not os.path.isdir(model_path):
        os.makedirs(model_path)

    # 2. Freeze the graph after training by adjusting the quantize
    #    operators' order for the inference.
    freeze_pass = QuantizationFreezePass(
        scope=fluid.global_scope(),
        place=place,
        weight_quantize_type=weight_quant_type)
    freeze_pass.apply(test_graph)
    server_program = test_graph.to_program()
    fluid.io.save_inference_model(
        dirname=float_path,
        feeded_var_names=[image.name],
        target_vars=[out],
        executor=exe,
        main_program=server_program)

    # 3. Convert the weights into int8_t type.
    #    (This step is optional.)
    convert_int8_pass = ConvertToInt8Pass(
        scope=fluid.global_scope(), place=place)
    convert_int8_pass.apply(test_graph)
    server_int8_program = test_graph.to_program()
    fluid.io.save_inference_model(
        dirname=int8_path,
        feeded_var_names=[image.name],
        target_vars=[out],
        executor=exe,
        main_program=server_int8_program)

    # 4. Convert the frozen graph for paddle-mobile execution.
    #    (This step is optional.)
    mobile_pass = TransformForMobilePass()
    mobile_pass.apply(test_graph)
    mobile_program = test_graph.to_program()
    fluid.io.save_inference_model(
        dirname=mobile_path,
        feeded_var_names=[image.name],
        target_vars=[out],
        executor=exe,
        main_program=mobile_program)
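
# Condensed sketch (not part of the original source): steps 2-4 above form a
# fixed pass order on the test graph after training. `scope`, `place`,
# `test_graph`, and `weight_quant_type` are assumed to exist as in the
# training code above.
def _example_deploy_pass_order(scope, place, test_graph, weight_quant_type):
    # 2. Reorder fake quant/dequant ops for inference; weights stay float.
    freeze_pass = QuantizationFreezePass(
        scope=scope, place=place, weight_quantize_type=weight_quant_type)
    freeze_pass.apply(test_graph)
    float_program = test_graph.to_program()
    # 3. Optionally cast the weights to int8.
    convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
    convert_int8_pass.apply(test_graph)
    int8_program = test_graph.to_program()
    # 4. Optionally rewrite quantize ops for paddle-mobile execution.
    mobile_pass = TransformForMobilePass()
    mobile_pass.apply(test_graph)
    mobile_program = test_graph.to_program()
    return float_program, int8_program, mobile_program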
def main():
    cfg = load_config(FLAGS.config)
    if 'architecture' in cfg:
        main_arch = cfg.architecture
    else:
        raise ValueError("'architecture' not specified in config file.")

    merge_config(FLAGS.opt)
    if 'log_iter' not in cfg:
        cfg.log_iter = 20

    # Check whether use_gpu=True is set in a CPU-only PaddlePaddle build.
    check_gpu(cfg.use_gpu)

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(
            os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    if 'eval_feed' not in cfg:
        eval_feed = create(main_arch + 'EvalFeed')
    else:
        eval_feed = create(cfg.eval_feed)

    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    _, test_feed_vars = create_feed(eval_feed, False)

    eval_reader = create_reader(eval_feed, args_path=FLAGS.dataset_dir)
    # eval_pyreader.decorate_sample_list_generator(eval_reader, place)
    test_data_feed = fluid.DataFeeder(test_feed_vars.values(), place)

    assert os.path.exists(FLAGS.model_path)
    infer_prog, feed_names, fetch_targets = fluid.io.load_inference_model(
        dirname=FLAGS.model_path,
        executor=exe,
        model_filename='__model__.infer',
        params_filename='__params__')

    eval_keys = ['bbox', 'gt_box', 'gt_label', 'is_difficult']
    eval_values = [
        'multiclass_nms_0.tmp_0', 'gt_box', 'gt_label', 'is_difficult'
    ]
    eval_cls = []
    eval_values[0] = fetch_targets[0]

    results = eval_run(exe, infer_prog, eval_reader, eval_keys, eval_values,
                       eval_cls, test_data_feed)

    resolution = None
    if 'mask' in results[0]:
        # NOTE: `model` is not constructed in this function; this branch
        # assumes an architecture object with a `mask_head` is in scope.
        resolution = model.mask_head.resolution
    box_ap_stats = eval_results(results, eval_feed, cfg.metric,
                                cfg.num_classes, resolution, False,
                                FLAGS.output_eval)

    logger.info("freeze the graph for inference")
    test_graph = IrGraph(core.Graph(infer_prog.desc), for_test=True)

    freeze_pass = QuantizationFreezePass(
        scope=fluid.global_scope(),
        place=place,
        weight_quantize_type=FLAGS.weight_quant_type)
    freeze_pass.apply(test_graph)
    server_program = test_graph.to_program()
    fluid.io.save_inference_model(
        dirname=os.path.join(FLAGS.save_path, 'float'),
        feeded_var_names=feed_names,
        target_vars=fetch_targets,
        executor=exe,
        main_program=server_program,
        model_filename='model',
        params_filename='weights')

    logger.info("convert the weights into int8 type")
    convert_int8_pass = ConvertToInt8Pass(
        scope=fluid.global_scope(), place=place)
    convert_int8_pass.apply(test_graph)
    server_int8_program = test_graph.to_program()
    fluid.io.save_inference_model(
        dirname=os.path.join(FLAGS.save_path, 'int8'),
        feeded_var_names=feed_names,
        target_vars=fetch_targets,
        executor=exe,
        main_program=server_int8_program,
        model_filename='model',
        params_filename='weights')
def quantization_scale(self,
                       use_cuda,
                       seed,
                       activation_quant_type,
                       weight_quant_type='abs_max',
                       for_ci=False,
                       act_preprocess_func=None,
                       weight_preprocess_func=None,
                       act_quantize_func=None,
                       weight_quantize_func=None):
    def build_program(main, startup, is_test):
        main.random_seed = seed
        startup.random_seed = seed
        with fluid.unique_name.guard():
            with fluid.program_guard(main, startup):
                img = fluid.layers.data(
                    name='image', shape=[1, 28, 28], dtype='float32')
                img.stop_gradient = False
                label = fluid.layers.data(
                    name='label', shape=[1], dtype='int64')
                loss = conv_net(img, label)
                if not is_test:
                    opt = fluid.optimizer.SGD(learning_rate=0.0001)
                    opt.minimize(loss)
        return [img, label], loss

    def get_optimizer():
        return fluid.optimizer.MomentumOptimizer(0.0001, 0.9)

    def load_dict():
        with open('mapping_table_for_saving_inference_model', 'r') as file:
            data = file.read()
            return json.loads(data)

    def save_dict(table):
        with open('mapping_table_for_saving_inference_model', 'w') as file:
            file.write(json.dumps(table))

    random.seed(0)
    np.random.seed(0)

    main = fluid.Program()
    startup = fluid.Program()
    test_program = fluid.Program()
    feeds, loss = build_program(main, startup, False)
    build_program(test_program, startup, True)
    test_program = test_program.clone(for_test=True)
    main_graph = IrGraph(core.Graph(main.desc), for_test=False)
    test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    scope = fluid.Scope()
    with fluid.scope_guard(scope):
        exe.run(startup)

    train_transform_pass = QuantizationTransformPass(
        scope=scope,
        place=place,
        activation_quantize_type=activation_quant_type,
        weight_quantize_type=weight_quant_type,
        act_preprocess_func=act_preprocess_func,
        weight_preprocess_func=weight_preprocess_func,
        act_quantize_func=act_quantize_func,
        weight_quantize_func=weight_quantize_func,
        optimizer_func=get_optimizer,
        executor=exe)
    train_transform_pass.apply(main_graph)

    test_transform_pass = QuantizationTransformPass(
        scope=scope,
        place=place,
        activation_quantize_type=activation_quant_type,
        weight_quantize_type=weight_quant_type,
        act_preprocess_func=act_preprocess_func,
        weight_preprocess_func=weight_preprocess_func,
        act_quantize_func=act_quantize_func,
        weight_quantize_func=weight_quantize_func,
        optimizer_func=get_optimizer,
        executor=exe)
    test_transform_pass.apply(test_graph)
    save_dict(test_graph.out_node_mapping_table)

    add_quant_dequant_pass = AddQuantDequantPass(scope=scope, place=place)
    add_quant_dequant_pass.apply(main_graph)
    add_quant_dequant_pass.apply(test_graph)

    scale_training_pass = OutScaleForTrainingPass(scope=scope, place=place)
    scale_training_pass.apply(main_graph)

    dev_name = '_gpu' if use_cuda else '_cpu'

    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = False
    build_strategy.enable_inplace = False
    build_strategy.fuse_all_reduce_ops = False
    binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)
    iters = 5
    batch_size = 8

    train_reader = paddle.batch(paddle.reader.shuffle(
        paddle.dataset.mnist.train(), buf_size=500),
        batch_size=batch_size)
    feeder = fluid.DataFeeder(feed_list=feeds, place=place)
    with fluid.scope_guard(scope):
        for _ in range(iters):
            data = next(train_reader())
            loss_v = exe.run(binary,
                             feed=feeder.feed(data),
                             fetch_list=[loss])

    out_scale_infer_pass = OutScaleForInferencePass(scope=scope)
    out_scale_infer_pass.apply(test_graph)

    freeze_pass = QuantizationFreezePass(
        scope=scope,
        place=place,
        weight_bits=8,
        activation_bits=8,
        weight_quantize_type=weight_quant_type)

    mapping_table = load_dict()
    test_graph.out_node_mapping_table = mapping_table
    if act_quantize_func is None and weight_quantize_func is None:
        freeze_pass.apply(test_graph)
def mkldnn_based_freeze_graph(self,
                              use_cuda,
                              seed,
                              activation_quant_type,
                              weight_quant_type='abs_max',
                              quant_perf=False,
                              for_ci=False):
    random.seed(0)
    np.random.seed(0)

    main = fluid.Program()
    startup = fluid.Program()
    test_program = fluid.Program()
    feeds, loss = self.build_program(main, startup, False, seed)
    self.build_program(test_program, startup, True, seed)
    test_program = test_program.clone(for_test=True)
    main_graph = IrGraph(core.Graph(main.desc), for_test=False)
    test_graph = IrGraph(core.Graph(test_program.desc), for_test=True)

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    scope = fluid.Scope()
    with fluid.scope_guard(scope):
        exe.run(startup)

    # Apply the QuantizationTransformPass
    transform_pass = QuantizationTransformPass(
        scope=scope,
        place=place,
        activation_quantize_type=activation_quant_type,
        weight_quantize_type=weight_quant_type)
    transform_pass.apply(main_graph)
    transform_pass.apply(test_graph)

    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = False
    build_strategy.enable_inplace = False
    binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)
    quantized_test_program = test_graph.to_program()
    iters = 5
    batch_size = 8

    train_reader = paddle.batch(paddle.reader.shuffle(
        paddle.dataset.mnist.train(), buf_size=500),
        batch_size=batch_size)
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=batch_size)
    feeder = fluid.DataFeeder(feed_list=feeds, place=place)

    # Train the model to get the weight values
    with fluid.scope_guard(scope):
        for _ in range(iters):
            data = next(train_reader())
            loss_v = exe.run(binary,
                             feed=feeder.feed(data),
                             fetch_list=[loss])

    # Freeze graph for inference, but the weight of fc/conv is still float type.
    freeze_pass = QuantizationFreezePass(
        scope=scope, place=place, weight_quantize_type=weight_quant_type)
    freeze_pass.apply(test_graph)

    # Transform the quantized graph for MKL-DNN INT8 inference
    mkldnn_int8_pass = QuantInt8MkldnnPass(_scope=scope, _place=place)
    mkldnn_int8_pass.apply(test_graph)

    dev_name = '_cpu_'
    if not for_ci:
        marked_nodes = set()
        for op in test_graph.all_op_nodes():
            if op.name().find('quantize') > -1:
                marked_nodes.add(op)
        test_graph.draw('.', 'test_mkldnn' + dev_name +
                        activation_quant_type + '_' + weight_quant_type,
                        marked_nodes)

    mkldnn_program = test_graph.to_program()

    # Check the transformed weights of conv2d and mul
    conv_w_mkldnn = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
    mul_w_mkldnn = np.array(scope.find_var('fc_0.w_0').get_tensor())
    # Check if the weights are still integer
    self.assertFalse(self.isinteger(np.sum(conv_w_mkldnn)))
    self.assertFalse(self.isinteger(np.sum(mul_w_mkldnn)))

    # Check if the conv2d output and mul output are correctly linked to
    # fake_dequantize's output
    self.check_program(mkldnn_program)
    if not for_ci:
        print('{}: {}'.format(
            'conv_w_mkldnn' + dev_name + activation_quant_type + '_' +
            weight_quant_type, np.sum(conv_w_mkldnn)))
        print('{}: {}'.format(
            'mul_w_mkldnn' + dev_name + activation_quant_type + '_' +
            weight_quant_type, np.sum(mul_w_mkldnn)))
def eval(args):
    # parameters from arguments
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    val_program, feed_names, fetch_targets = fluid.io.load_inference_model(
        args.model_path,
        exe,
        model_filename="__model__.infer",
        params_filename="__params__")
    val_reader = paddle.batch(reader.val(), batch_size=128)
    feeder = fluid.DataFeeder(
        place=place, feed_list=feed_names, program=val_program)

    results = []
    for batch_id, data in enumerate(val_reader()):
        image = [[d[0]] for d in data]
        label = [[d[1]] for d in data]
        feed_data = feeder.feed(image)
        pred = exe.run(val_program,
                       feed=feed_data,
                       fetch_list=fetch_targets)
        pred = np.array(pred[0])
        label = np.array(label)
        sort_array = pred.argsort(axis=1)
        top_1_pred = sort_array[:, -1:][:, ::-1]
        top_1 = np.mean(label == top_1_pred)
        top_5_pred = sort_array[:, -5:][:, ::-1]
        acc_num = 0
        for i in range(len(label)):
            if label[i][0] in top_5_pred[i]:
                acc_num += 1
        top_5 = acc_num / len(label)
        results.append([top_1, top_5])

    result = np.mean(np.array(results), axis=0)
    print("top1_acc/top5_acc= {}".format(result))
    sys.stdout.flush()

    _logger.info("freeze the graph for inference")
    test_graph = IrGraph(core.Graph(val_program.desc), for_test=True)

    freeze_pass = QuantizationFreezePass(
        scope=fluid.global_scope(),
        place=place,
        weight_quantize_type=args.weight_quant_type)
    freeze_pass.apply(test_graph)
    server_program = test_graph.to_program()
    fluid.io.save_inference_model(
        dirname=os.path.join(args.save_path, 'float'),
        feeded_var_names=feed_names,
        target_vars=fetch_targets,
        executor=exe,
        main_program=server_program,
        model_filename='model',
        params_filename='weights')

    _logger.info("convert the weights into int8 type")
    convert_int8_pass = ConvertToInt8Pass(
        scope=fluid.global_scope(), place=place)
    convert_int8_pass.apply(test_graph)
    server_int8_program = test_graph.to_program()
    fluid.io.save_inference_model(
        dirname=os.path.join(args.save_path, 'int8'),
        feeded_var_names=feed_names,
        target_vars=fetch_targets,
        executor=exe,
        main_program=server_int8_program,
        model_filename='model',
        params_filename='weights')
def create_quant_model(model,
                       params,
                       activation_quantize_type='moving_average_abs_max',
                       weight_quantize_type='channel_wise_abs_max',
                       save=False):
    place = paddle.CUDAPlace(0)
    scope = global_scope()
    exe = paddle.static.Executor(place)
    [inference_program, feed_target_names,
     fetch_targets] = paddle.static.load_inference_model(
         path_prefix=None,
         executor=exe,
         model_filename=model,
         params_filename=params)
    graph = IrGraph(core.Graph(inference_program.desc), for_test=True)

    out_scale_op_list = [
        "conv2d", "depthwise_conv2d", "mul", "matmul", "relu", "leaky_relu",
        "relu6", "sigmoid", "tanh", "prelu", "swish", "softmax",
        "batch_norm", "layer_norm", "elementwise_add", "pool2d", "reshape2",
        "transpose2", "concat", "elementwise_mul", "scale", "slice",
        "hard_swish", "hard_sigmoid", "conv2d_transpose", "gru",
        "bilinear_interp", "nearest_interp", "trilinear_interp", "flatten",
        "flatten2", "transpose", "pad2d", "reshape",
    ]
    op_real_in_out_name = {
        "conv2d": [["Input", "Filter"], ["Output"]],
        "depthwise_conv2d": [["Input", "Filter"], ["Output"]],
        "conv2d_transpose": [["Input", "Filter"], ["Output"]],
        "mul": [["X", "Y"], ["Out"]],
        "matmul": [["X", "Y"], ["Out"]],
        "pool2d": [["X"], ["Out"]],
        "elementwise_add": [["X", "Y"], ["Out"]],
        "concat": [["X"], ["Out"]],
        "softmax": [["X"], ["Out"]],
        "argmax": [["X"], ["Out"]],
        "transpose": [["X"], ["Out"]],
        "equal": [["X", "Y"], ["Out"]],
        "gather": [["X"], ["Out"]],
        "greater_equal": [["X", "Y"], ["Out"]],
        "greater_than": [["X", "Y"], ["Out"]],
        "less_equal": [["X", "Y"], ["Out"]],
        "less_than": [["X", "Y"], ["Out"]],
        "mean": [["X"], ["Out"]],
        "not_equal": [["X", "Y"], ["Out"]],
        "reshape": [["X"], ["Out"]],
        "reshape2": [["X"], ["Out"]],
        "transpose2": [["X"], ["Out"]],
        "bilinear_interp": [["X"], ["Out"]],
        "nearest_interp": [["X"], ["Out"]],
        "trilinear_interp": [["X"], ["Out"]],
        "slice": [["Input"], ["Out"]],
        "squeeze": [["X"], ["Out"]],
        "elementwise_sub": [["X", "Y"], ["Out"]],
        "relu": [["X"], ["Out"]],
        "relu6": [["X"], ["Out"]],
        "leaky_relu": [["X"], ["Out"]],
        "prelu": [["X"], ["Out"]],
        "tanh": [["X"], ["Out"]],
        "swish": [["X"], ["Out"]],
        "dropout": [["X"], ["Out"]],
        "batch_norm": [["X"], ["Y"]],
        "layer_norm": [["X"], ["Y"]],
        "sigmoid": [["X"], ["Out"]],
        "elementwise_mul": [["X", "Y"], ["Out"]],
        "scale": [["X"], ["Out"]],
        "hard_swish": [["X"], ["Out"]],
        "hard_sigmoid": [["X"], ["Out"]],
        "gru": [["Input", "Weight"], ["Hidden"]],
        "lstm": [["Input", "Weight"], ["Hidden"]],
        "pad2d": [["X"], ["Out"]],
        "flatten": [["X"], ["Out"]],
        "flatten2": [["X"], ["Out"]],
    }

    def _get_op_output_var_names(op):
        """Return the output variable names of ``op``."""
        assert isinstance(op, (IrNode, Operator)), \
            "The input op should be IrNode or Operator."
        var_names = []
        op_name = op.name() if isinstance(op, IrNode) \
            else op.type
        if op_name not in op_real_in_out_name:
            return []

        name_list = op_real_in_out_name[op_name][1]
        for name in name_list:
            var_name = op.output(name)
            if isinstance(var_name, list):
                var_names.extend(var_name)
            else:
                var_names.append(var_name)
        return var_names

    transform_pass = QuantizationTransformPass(
        scope=scope,
        place=place,
        activation_quantize_type=activation_quantize_type,
        weight_quantize_type=weight_quantize_type)
    transform_pass.apply(graph)

    op_nodes = graph.all_op_nodes()
    for op_node in op_nodes:
        if op_node.name() in out_scale_op_list:
            var_names = _get_op_output_var_names(op_node)
            for var_name in var_names:
                out_node = graph._find_node_by_name(op_node.outputs,
                                                    var_name)
                if out_node.dtype() not in \
                        [core.VarDesc.VarType.FP64,
                         core.VarDesc.VarType.FP32]:
                    continue
                op_node.op()._set_attr("out_threshold", 3.0)

    # Freeze graph for inference, but the weight of fc/conv is still float type.
    freeze_pass = QuantizationFreezePass(
        scope=scope, place=place, weight_quantize_type=weight_quantize_type)
    freeze_pass.apply(graph)
    main_program = graph.to_program()

    # Modify fake_quantize_moving_average_abs_max (InScale) and
    # fake_channel_wise_dequantize_max_abs (Scales) with dummy scale values.
    op_nodes = graph.all_op_nodes()
    for op_node in op_nodes:
        if op_node.name() == 'fake_quantize_moving_average_abs_max':
            var_name = op_node.input("InScale")[0]
            tensor = scope.var(var_name).get_tensor()
            tensor.set(np.array([1], dtype=np.float32), place)
        elif op_node.name() == 'fake_channel_wise_dequantize_max_abs':
            var_name = op_node.input("Scales")[0]
            tensor = scope.var(var_name).get_tensor()
            tensor.set(np.ones(tensor.shape(), dtype=np.float32), place)

    if save:
        fluid.io.save_inference_model('test_inference_model',
                                      feed_target_names,
                                      fetch_targets,
                                      exe,
                                      main_program=main_program)

    feed_vars = [
        main_program.global_block().var(name) for name in feed_target_names
    ]
    serialized_program = paddle.static.serialize_program(
        feed_vars, fetch_targets, program=main_program)
    serialized_params = paddle.static.serialize_persistables(
        feed_vars, fetch_targets, executor=exe, program=main_program)
    return serialized_program, serialized_params
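
# Possible invocation (not part of the original source): the file names
# below are hypothetical placeholders for a saved inference model.
def _example_create_quant_model_usage():
    program_bytes, params_bytes = create_quant_model(
        model='mobilenet/__model__',       # hypothetical model file
        params='mobilenet/__params__',     # hypothetical params file
        activation_quantize_type='moving_average_abs_max',
        weight_quantize_type='channel_wise_abs_max',
        save=False)
    # The returned buffers are serialized; write them out for later loading.
    with open('quant.pdmodel', 'wb') as f:
        f.write(program_bytes)
    with open('quant.pdiparams', 'wb') as f:
        f.write(params_bytes)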