def build_distill_prog_with_infermodel(executor, place, train_config):
    """Build a distillation training program from saved inference models.

    Loads the student and teacher inference models, merges the teacher
    graph into the student graph, attaches the distillation loss described
    by ``train_config`` and appends the backward pass and optimizer ops.

    Args:
        executor: Executor used by ``paddle.static.load_inference_model``.
        place: Device place used when merging the teacher parameters.
        train_config (dict): Must provide "model_path_prefix" and
            "teacher_model_path_prefix", plus the loss/optimizer settings
            consumed by ``_parse_distill_loss`` and ``_create_optimizer``.

    Returns:
        DistillProgramInfo: startup/train/test programs, feed/fetch
        information and the optimizer.
    """
    # Load the student inference model and strip its fetch ops so that
    # training outputs can be appended later.
    [train_program, feed_target_names, fetch_targets]= paddle.static.load_inference_model( \
        path_prefix=train_config["model_path_prefix"], \
        executor=executor)
    _remove_fetch_node(train_program)
    # Load the teacher inference model the same way.
    [teacher_program, teacher_feed_target_names, teacher_fetch_targets]= paddle.static.load_inference_model( \
        path_prefix=train_config["teacher_model_path_prefix"], \
        executor=executor)
    _remove_fetch_node(teacher_program)
    # Clone for evaluation BEFORE the program is mutated for training.
    test_program = train_program.clone(for_test=True)
    # Inference models drop parameter attributes and BN reserve-space
    # variables; restore both so the program is trainable again.
    train_program = _recover_param_attr(train_program)
    train_program = _recover_reserve_space_with_bn(train_program)
    for var in train_program.list_vars():
        var.stop_gradient = False
    # Flip every op back into training mode (is_test=False).
    train_graph = GraphWrapper(train_program)
    for op in train_graph.ops():
        op._op._set_attr("is_test", False)
    ############################################################################
    # distill
    ############################################################################
    # Map each teacher feed name to the matching student feed name, then
    # merge the teacher graph into the student program.
    data_name_map = {}
    assert len(feed_target_names) == len(teacher_feed_target_names), \
        "the number of feed nodes in the teacher model is not equal to the student model"
    for i, name in enumerate(feed_target_names):
        data_name_map[teacher_feed_target_names[i]] = name
    merge(teacher_program, train_program, data_name_map, place)
    # all feed node should set stop_gradient is False, for using pact quant algo.
    for var in train_program.list_vars():
        if var.name in data_name_map.values(
        ) or var.name in data_name_map.keys():
            var.stop_gradient = False
    train_fetch_list = []
    train_fetch_name_list = []
    startup_program = paddle.static.Program()
    with paddle.static.program_guard(train_program, startup_program):
        # 'merge' prefix keeps newly created var names from colliding with
        # the merged teacher/student graph.
        with fluid.unique_name.guard('merge'):
            optimizer = _create_optimizer(train_config)
            distill_loss = _parse_distill_loss(train_config)
            loss = paddle.mean(distill_loss)
            loss.stop_gradient = False
            p_g_list = paddle.static.append_backward(loss=loss)
            opts = optimizer.apply_gradients(p_g_list)
            train_fetch_list.append(loss)
            train_fetch_name_list.append(loss.name)
    # NOTE(review): feed_target_names is passed for both the train and the
    # test feed slots — confirm DistillProgramInfo expects this.
    return DistillProgramInfo(startup_program, train_program, \
        feed_target_names, train_fetch_list, optimizer, \
        test_program, feed_target_names, fetch_targets)
def test_loss(self):
    """Merge a teacher program into a student program, attach a custom
    adaptation loss via ``loss``, and check the op-type delta."""
    student_main = fluid.Program()
    student_startup = fluid.Program()
    with fluid.program_guard(student_main, student_startup):
        image = fluid.data(name="image", shape=[None, 3, 224, 224])
        s_conv1 = conv_bn_layer(image, 8, 3, "conv1")
        s_conv2 = conv_bn_layer(s_conv1, 8, 3, "conv2")
        student_predict = s_conv1 + s_conv2

    teacher_main = fluid.Program()
    teacher_startup = fluid.Program()
    with fluid.program_guard(teacher_main, teacher_startup):
        image = fluid.data(name="image", shape=[None, 3, 224, 224])
        t_conv1 = conv_bn_layer(image, 8, 3, "conv1")
        t_conv2 = conv_bn_layer(t_conv1, 8, 3, "conv2")
        t_sum1 = t_conv1 + t_conv2
        t_conv3 = conv_bn_layer(t_sum1, 8, 3, "conv3")
        t_conv4 = conv_bn_layer(t_conv3, 8, 3, "conv4")
        t_sum2 = t_conv4 + t_sum1
        t_conv5 = conv_bn_layer(t_sum2, 8, 3, "conv5")
        teacher_predict = conv_bn_layer(t_conv5, 8, 3, "conv6")

    place = fluid.CPUPlace()
    merge(teacher_main, student_main, {'image': 'image'}, place)
    # Op types present right after the merge, before the loss is added.
    merged_ops = [op.type for blk in student_main.blocks for op in blk.ops]

    def adaptation_loss(t_var, s_var):
        # Project the student feature to the teacher channel count, then
        # penalize the mean squared difference.
        teacher_channel = t_var.shape[1]
        s_hint = fluid.layers.conv2d(s_var, teacher_channel, 1)
        hint_loss = fluid.layers.reduce_mean(
            fluid.layers.square(s_hint - t_var))
        return hint_loss

    with fluid.program_guard(student_main):
        distill_loss = loss(
            adaptation_loss,
            student_main,
            t_var='teacher_conv6_bn_output.tmp_2',
            s_var='conv2_bn_output.tmp_2')
    loss_ops = [op.type for blk in student_main.blocks for op in blk.ops]

    # The merged ops must all survive ...
    self.assertTrue(set(merged_ops).difference(set(loss_ops)) == set())
    # ... and the loss should only introduce these op types.
    self.assertTrue(
        set(loss_ops).difference(set(merged_ops)) ==
        {'reduce_mean', 'elementwise_sub', 'square'})
def test_loss(self):
    """Build student/teacher graphs with the paddle.static API (student
    on the default main program), merge them, attach an MSE adaptation
    loss, and verify the op-type delta."""
    image = paddle.static.data(name="image", shape=[None, 3, 224, 224])
    s_conv1 = conv_bn_layer(image, 8, 3, "conv1")
    s_conv2 = conv_bn_layer(s_conv1, 8, 3, "conv2")
    student_predict = s_conv1 + s_conv2

    teacher_main = paddle.static.Program()
    teacher_startup = paddle.static.Program()
    with paddle.static.program_guard(teacher_main, teacher_startup):
        image = paddle.static.data(name="image", shape=[None, 3, 224, 224])
        t_conv1 = conv_bn_layer(image, 8, 3, "conv1")
        t_conv2 = conv_bn_layer(t_conv1, 8, 3, "conv2")
        t_sum1 = t_conv1 + t_conv2
        t_conv3 = conv_bn_layer(t_sum1, 8, 3, "conv3")
        t_conv4 = conv_bn_layer(t_conv3, 8, 3, "conv4")
        t_sum2 = t_conv4 + t_sum1
        t_conv5 = conv_bn_layer(t_sum2, 8, 3, "conv5")
        teacher_predict = conv_bn_layer(t_conv5, 8, 3, "conv6")

    place = paddle.CPUPlace()
    merge(teacher_main, paddle.static.default_main_program(),
          {'image': 'image'}, place)
    merged_ops = [
        op.type for blk in paddle.static.default_main_program().blocks
        for op in blk.ops
    ]

    def adaptation_loss(t_var, s_var):
        # Mean squared error between the student and teacher activations.
        return paddle.mean(
            paddle.nn.functional.square_error_cost(s_var, t_var))

    distill_loss = loss(
        adaptation_loss,
        t_var='teacher_conv6_bn_output.tmp_2',
        s_var='conv2_bn_output.tmp_2')
    loss_ops = [
        op.type for blk in paddle.static.default_main_program().blocks
        for op in blk.ops
    ]

    # The merged ops must all survive ...
    self.assertTrue(set(merged_ops).difference(set(loss_ops)) == set())
    # ... and the loss should only introduce these op types.
    self.assertTrue(
        set(loss_ops).difference(set(merged_ops)) ==
        {'reduce_mean', 'elementwise_sub', 'square'})
def test_merge(self):
    """Merge a teacher program into a student program, attach fsp_loss,
    and check exactly which op types the loss adds."""
    student_main = fluid.Program()
    student_startup = fluid.Program()
    with fluid.program_guard(student_main, student_startup):
        image = fluid.data(name="image", shape=[None, 3, 224, 224])
        s_conv1 = conv_bn_layer(image, 8, 3, "conv1")
        s_conv2 = conv_bn_layer(s_conv1, 8, 3, "conv2")
        student_predict = s_conv1 + s_conv2

    teacher_main = fluid.Program()
    teacher_startup = fluid.Program()
    with fluid.program_guard(teacher_main, teacher_startup):
        image = fluid.data(name="image", shape=[None, 3, 224, 224])
        t_conv1 = conv_bn_layer(image, 8, 3, "conv1")
        t_conv2 = conv_bn_layer(t_conv1, 8, 3, "conv2")
        t_sum1 = t_conv1 + t_conv2
        t_conv3 = conv_bn_layer(t_sum1, 8, 3, "conv3")
        t_conv4 = conv_bn_layer(t_conv3, 8, 3, "conv4")
        t_sum2 = t_conv4 + t_sum1
        t_conv5 = conv_bn_layer(t_sum2, 8, 3, "conv5")
        teacher_predict = conv_bn_layer(t_conv5, 8, 3, "conv6")

    place = fluid.CPUPlace()
    merge(teacher_main, student_main, {'image': 'image'}, place)
    merged_ops = [op.type for blk in student_main.blocks for op in blk.ops]

    with fluid.program_guard(student_main):
        distill_loss = fsp_loss('teacher_conv5_bn_output.tmp_2',
                                'teacher_conv6_bn_output.tmp_2',
                                'conv1_bn_output.tmp_2',
                                'conv2_bn_output.tmp_2', student_main)
    loss_ops = [op.type for blk in student_main.blocks for op in blk.ops]

    # Nothing from the merged graph may disappear ...
    self.assertTrue(set(merged_ops).difference(set(loss_ops)) == set())
    # ... and fsp_loss adds exactly these op types.
    self.assertTrue(
        set(loss_ops).difference(set(merged_ops)) ==
        {'elementwise_sub', 'reduce_mean', 'square', 'fsp'})
def test_merge(self):
    """After merging, the student program must contain exactly the sum of
    the original student and teacher ops — nothing dropped or added."""
    student_main = fluid.Program()
    student_startup = fluid.Program()
    with fluid.program_guard(student_main, student_startup):
        image = fluid.data(name="image", shape=[None, 3, 224, 224])
        s_conv1 = conv_bn_layer(image, 8, 3, "conv1")
        s_conv2 = conv_bn_layer(s_conv1, 8, 3, "conv2")
        student_predict = s_conv1 + s_conv2
    student_ops = [op for blk in student_main.blocks for op in blk.ops]

    teacher_main = fluid.Program()
    teacher_startup = fluid.Program()
    with fluid.program_guard(teacher_main, teacher_startup):
        image = fluid.data(name="image", shape=[None, 3, 224, 224])
        t_conv1 = conv_bn_layer(image, 8, 3, "conv1")
        t_conv2 = conv_bn_layer(t_conv1, 8, 3, "conv2")
        t_sum1 = t_conv1 + t_conv2
        t_conv3 = conv_bn_layer(t_sum1, 8, 3, "conv3")
        t_conv4 = conv_bn_layer(t_conv3, 8, 3, "conv4")
        t_sum2 = t_conv4 + t_sum1
        t_conv5 = conv_bn_layer(t_sum2, 8, 3, "conv5")
        teacher_predict = conv_bn_layer(t_conv5, 8, 3, "conv6")
    teacher_ops = [op for blk in teacher_main.blocks for op in blk.ops]

    place = fluid.CPUPlace()
    merge(teacher_main, student_main, {'image': 'image'}, place)
    merged_ops = [op for blk in student_main.blocks for op in blk.ops]

    self.assertTrue(len(student_ops) + len(teacher_ops) == len(merged_ops))
def test_soft_label_loss(self):
    """soft_label_loss on a merged student/teacher graph should add
    exactly the softmax/cross_entropy/reduce_mean/scale ops."""
    image = paddle.static.data(name="image", shape=[None, 3, 224, 224])
    s_conv1 = conv_bn_layer(image, 8, 3, "conv1")
    s_conv2 = conv_bn_layer(s_conv1, 8, 3, "conv2")
    student_predict = s_conv1 + s_conv2

    teacher_main = paddle.static.Program()
    teacher_startup = paddle.static.Program()
    with paddle.static.program_guard(teacher_main, teacher_startup):
        image = paddle.static.data(name="image", shape=[None, 3, 224, 224])
        t_conv1 = conv_bn_layer(image, 8, 3, "conv1")
        t_conv2 = conv_bn_layer(t_conv1, 8, 3, "conv2")
        t_sum1 = t_conv1 + t_conv2
        t_conv3 = conv_bn_layer(t_sum1, 8, 3, "conv3")
        t_conv4 = conv_bn_layer(t_conv3, 8, 3, "conv4")
        t_sum2 = t_conv4 + t_sum1
        t_conv5 = conv_bn_layer(t_sum2, 8, 3, "conv5")
        teacher_predict = conv_bn_layer(t_conv5, 8, 3, "conv6")

    place = paddle.CPUPlace()
    merge(teacher_main, paddle.static.default_main_program(),
          {'image': 'image'}, place)
    merged_ops = [
        op.type for blk in paddle.static.default_main_program().blocks
        for op in blk.ops
    ]

    distill_loss = soft_label_loss('teacher_conv6_bn_output.tmp_2',
                                   'conv2_bn_output.tmp_2')
    loss_ops = [
        op.type for blk in paddle.static.default_main_program().blocks
        for op in blk.ops
    ]

    # Nothing from the merged graph may disappear ...
    self.assertTrue(set(merged_ops).difference(set(loss_ops)) == set())
    # ... and soft_label_loss adds exactly these op types.
    self.assertTrue(
        set(loss_ops).difference(set(merged_ops)) ==
        {'cross_entropy', 'softmax', 'reduce_mean', 'scale'})
def compress(args):
    """Distillation training loop (fluid API).

    Trains the student ``args.model`` supervised by both the ground-truth
    cross-entropy loss and a soft-label distillation loss against
    ``args.teacher_model``, evaluating on the validation set each epoch.

    Args:
        args: Parsed command-line options; uses data, model, teacher_model,
            teacher_pretrained_model, use_gpu, batch_size, num_epochs and
            log_period.
    """
    # Pick dataset readers and the matching class count / input shape.
    if args.data == "cifar10":
        import paddle.dataset.cifar as reader
        train_reader = reader.train10()
        val_reader = reader.test10()
        class_dim = 10
        image_shape = "3,32,32"
    elif args.data == "imagenet":
        import imagenet_reader as reader
        train_reader = reader.train()
        val_reader = reader.val()
        class_dim = 1000
        image_shape = "3,224,224"
    else:
        raise ValueError("{} is not supported.".format(args.data))
    image_shape = [int(m) for m in image_shape.split(",")]
    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)
    # Build the student program: data layers, loaders, network and metrics.
    student_program = fluid.Program()
    s_startup = fluid.Program()
    with fluid.program_guard(student_program, s_startup):
        with fluid.unique_name.guard():
            image = fluid.layers.data(
                name='image', shape=image_shape, dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            train_loader = fluid.io.DataLoader.from_generator(
                feed_list=[image, label],
                capacity=64,
                use_double_buffer=True,
                iterable=True)
            valid_loader = fluid.io.DataLoader.from_generator(
                feed_list=[image, label],
                capacity=64,
                use_double_buffer=True,
                iterable=True)
            # model definition
            model = models.__dict__[args.model]()
            out = model.net(input=image, class_dim=class_dim)
            cost = fluid.layers.cross_entropy(input=out, label=label)
            avg_cost = fluid.layers.mean(x=cost)
            acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
            acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    train_reader = paddle.batch(
        train_reader, batch_size=args.batch_size, drop_last=True)
    val_reader = paddle.batch(
        val_reader, batch_size=args.batch_size, drop_last=True)
    # Clone for evaluation before distillation ops are added below.
    val_program = student_program.clone(for_test=True)
    # NOTE(review): training feeds all places, validation only one — this
    # looks intentional (validation runs un-parallelized), but confirm.
    places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
    train_loader.set_sample_list_generator(train_reader, places)
    valid_loader.set_sample_list_generator(val_reader, place)
    teacher_model = models.__dict__[args.teacher_model]()
    # define teacher program
    teacher_program = fluid.Program()
    t_startup = fluid.Program()
    with fluid.program_guard(teacher_program, t_startup):
        with fluid.unique_name.guard():
            image = fluid.layers.data(
                name='image', shape=image_shape, dtype='float32')
            predict = teacher_model.net(image, class_dim=class_dim)
    exe.run(t_startup)
    # Download pretrained teacher weights if they are not present locally.
    if not os.path.exists(args.teacher_pretrained_model):
        _download(
            'http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar',
            '.')
        _decompress('./ResNet50_vd_pretrained.tar')
    assert args.teacher_pretrained_model and os.path.exists(
        args.teacher_pretrained_model
    ), "teacher_pretrained_model should be set when teacher_model is not None."

    def if_exist(var):
        # Only load variables that have a matching file in the checkpoint.
        return os.path.exists(
            os.path.join(args.teacher_pretrained_model, var.name))

    fluid.io.load_vars(
        exe,
        args.teacher_pretrained_model,
        main_program=teacher_program,
        predicate=if_exist)
    # Merge the (frozen) teacher graph into the student program, then add
    # the soft-label distillation loss on the two fc outputs.
    data_name_map = {'image': 'image'}
    merge(teacher_program, student_program, data_name_map, place)
    with fluid.program_guard(student_program, s_startup):
        distill_loss = soft_label_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0",
                                       student_program)
        loss = avg_cost + distill_loss
        lr, opt = create_optimizer(args)
        opt.minimize(loss)
    exe.run(s_startup)
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_reduce_ops = False
    parallel_main = fluid.CompiledProgram(student_program).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)
    # Train, then evaluate, once per epoch.
    for epoch_id in range(args.num_epochs):
        for step_id, data in enumerate(train_loader):
            lr_np, loss_1, loss_2, loss_3 = exe.run(
                parallel_main,
                feed=data,
                fetch_list=[
                    lr.name, loss.name, avg_cost.name, distill_loss.name
                ])
            if step_id % args.log_period == 0:
                _logger.info(
                    "train_epoch {} step {} lr {:.6f}, loss {:.6f}, class loss {:.6f}, distill loss {:.6f}"
                    .format(epoch_id, step_id, lr_np[0], loss_1[0], loss_2[0],
                            loss_3[0]))
        val_acc1s = []
        val_acc5s = []
        for step_id, data in enumerate(valid_loader):
            val_loss, val_acc1, val_acc5 = exe.run(
                val_program,
                data,
                fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
            val_acc1s.append(val_acc1)
            val_acc5s.append(val_acc5)
            if step_id % args.log_period == 0:
                _logger.info(
                    "valid_epoch {} step {} loss {:.6f}, top1 {:.6f}, top5 {:.6f}"
                    .format(epoch_id, step_id, val_loss[0], val_acc1[0],
                            val_acc5[0]))
        _logger.info("epoch {} top1 {:.6f}, top5 {:.6f}".format(
            epoch_id, np.mean(val_acc1s), np.mean(val_acc5s)))
def compress(args):
    """Distillation training loop (paddle 2.x static API).

    Trains the student ``args.model`` supervised by both the ground-truth
    cross-entropy loss and a soft-label distillation loss against
    ``args.teacher_model``, evaluating each epoch and optionally saving an
    inference model per epoch.

    Args:
        args: Parsed command-line options; uses data, model, teacher_model,
            teacher_pretrained_model, use_gpu, batch_size, num_epochs,
            log_period and save_inference.
    """
    # Pick datasets and the matching class count / input shape.
    if args.data == "cifar10":
        train_dataset = paddle.vision.datasets.Cifar10(mode='train')
        val_dataset = paddle.vision.datasets.Cifar10(mode='test')
        class_dim = 10
        image_shape = "3,32,32"
    elif args.data == "imagenet":
        import imagenet_reader as reader
        train_dataset = reader.ImageNetDataset(mode='train')
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
        image_shape = "3,224,224"
    else:
        raise ValueError("{} is not supported.".format(args.data))
    image_shape = [int(m) for m in image_shape.split(",")]
    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)
    # Build the student program: data layers, loaders, network and metrics.
    student_program = paddle.static.Program()
    s_startup = paddle.static.Program()
    places = paddle.static.cuda_places(
    ) if args.use_gpu else paddle.static.cpu_places()
    place = places[0]
    with paddle.static.program_guard(student_program, s_startup):
        with paddle.fluid.unique_name.guard():
            image = paddle.static.data(
                name='image', shape=[None] + image_shape, dtype='float32')
            label = paddle.static.data(
                name='label', shape=[None, 1], dtype='int64')
            train_loader = paddle.io.DataLoader(
                train_dataset,
                places=places,
                feed_list=[image, label],
                drop_last=True,
                batch_size=args.batch_size,
                return_list=False,
                shuffle=True,
                use_shared_memory=True,
                num_workers=4)
            valid_loader = paddle.io.DataLoader(
                val_dataset,
                places=place,
                feed_list=[image, label],
                drop_last=False,
                return_list=False,
                use_shared_memory=True,
                batch_size=args.batch_size,
                shuffle=False)
            # model definition
            model = models.__dict__[args.model]()
            out = model.net(input=image, class_dim=class_dim)
            cost = paddle.nn.functional.loss.cross_entropy(
                input=out, label=label)
            avg_cost = paddle.mean(x=cost)
            acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
            acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)
    # Clone for evaluation before distillation ops are added below.
    val_program = student_program.clone(for_test=True)
    exe = paddle.static.Executor(place)
    teacher_model = models.__dict__[args.teacher_model]()
    # define teacher program
    teacher_program = paddle.static.Program()
    t_startup = paddle.static.Program()
    with paddle.static.program_guard(teacher_program, t_startup):
        with paddle.fluid.unique_name.guard():
            image = paddle.static.data(
                name='image', shape=[None] + image_shape, dtype='float32')
            predict = teacher_model.net(image, class_dim=class_dim)
    exe.run(t_startup)
    # Download pretrained teacher weights if they are not present locally.
    if not os.path.exists(args.teacher_pretrained_model):
        _download(
            'http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar',
            '.')
        _decompress('./ResNet50_vd_pretrained.tar')
    assert args.teacher_pretrained_model and os.path.exists(
        args.teacher_pretrained_model
    ), "teacher_pretrained_model should be set when teacher_model is not None."

    # NOTE(review): if_exist is defined but no longer used — the
    # paddle.static.load call below ignores it. Presumably a leftover from
    # the fluid.io.load_vars version; confirm before removing.
    def if_exist(var):
        # Skip the ImageNet fc head when fine-tuning on cifar10.
        exist = os.path.exists(
            os.path.join(args.teacher_pretrained_model, var.name))
        if args.data == "cifar10" and (var.name == 'fc_0.w_0' or
                                       var.name == 'fc_0.b_0'):
            exist = False
        return exist

    paddle.static.load(teacher_program, args.teacher_pretrained_model, exe)
    # Merge the (frozen) teacher graph into the student program, then add
    # the soft-label distillation loss on the two fc outputs.
    data_name_map = {'image': 'image'}
    merge(teacher_program, student_program, data_name_map, place)
    with paddle.static.program_guard(student_program, s_startup):
        distill_loss = soft_label_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0",
                                       student_program)
        loss = avg_cost + distill_loss
        lr, opt = create_optimizer(args)
        opt.minimize(loss)
    exe.run(s_startup)
    build_strategy = paddle.static.BuildStrategy()
    build_strategy.fuse_all_reduce_ops = False
    parallel_main = paddle.static.CompiledProgram(
        student_program).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)
    # Train, then evaluate, once per epoch.
    for epoch_id in range(args.num_epochs):
        for step_id, data in enumerate(train_loader):
            loss_1, loss_2, loss_3 = exe.run(
                parallel_main,
                feed=data,
                fetch_list=[loss.name, avg_cost.name, distill_loss.name])
            if step_id % args.log_period == 0:
                _logger.info(
                    "train_epoch {} step {} lr {:.6f}, loss {:.6f}, class loss {:.6f}, distill loss {:.6f}".
                    format(epoch_id, step_id,
                           lr.get_lr(), loss_1[0], loss_2[0], loss_3[0]))
            # Advance the learning-rate scheduler once per step.
            lr.step()
        val_acc1s = []
        val_acc5s = []
        for step_id, data in enumerate(valid_loader):
            val_loss, val_acc1, val_acc5 = exe.run(
                val_program,
                data,
                fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
            val_acc1s.append(val_acc1)
            val_acc5s.append(val_acc5)
            if step_id % args.log_period == 0:
                _logger.info(
                    "valid_epoch {} step {} loss {:.6f}, top1 {:.6f}, top5 {:.6f}".
                    format(epoch_id, step_id, val_loss[0], val_acc1[0],
                           val_acc5[0]))
        if args.save_inference:
            paddle.fluid.io.save_inference_model(
                os.path.join("./saved_models", str(epoch_id)), ["image"],
                [out], exe, student_program)
        _logger.info("epoch {} top1 {:.6f}, top5 {:.6f}".format(
            epoch_id, np.mean(val_acc1s), np.mean(val_acc5s)))