def fpgm_speedup(masks_file, model_checkpoint):
    from fpgm_torch_mnist import Mnist
    device = torch.device('cpu')
    model = Mnist()
    model.to(device)
    model.print_conv_filter_sparsity()
    dummy_input = torch.randn(64, 1, 28, 28)
    if use_mask:
        apply_compression_results(model, masks_file)
        dummy_input = dummy_input.to(device)
        start = time.time()
        for _ in range(40):
            out = model(dummy_input)
        print('mask elapsed time: ', time.time() - start)
        #print(out.size(), out)
        return
    else:
        m_speedup = ModelSpeedup(model, dummy_input.to(device), masks_file)
        m_speedup.speedup_model()
        dummy_input = dummy_input.to(device)
        start = time.time()
        for _ in range(40):
            out = model(dummy_input)
        print('speedup elapsed time: ', time.time() - start)
        #print(out.size(), out)
        return
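# A minimal, self-contained sketch of the workflow the snippets in this file
# share: prune, export a mask file, then either apply the masks in place
# (apply_compression_results: shapes unchanged, pruned weights zeroed) or
# physically shrink the layers (ModelSpeedup). Import paths follow the
# NNI v1-style paths used later in this file; TinyNet and the file names
# tiny.pth / tiny_mask.pth are hypothetical.
import torch
import torch.nn as nn
from nni.compression.torch import L1FilterPruner, apply_compression_results
from nni.compression.speedup.torch import ModelSpeedup

class TinyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(1, 8, 3, padding=1)
        self.fc = nn.Linear(8 * 28 * 28, 10)

    def forward(self, x):
        x = torch.relu(self.conv(x))
        return self.fc(x.view(x.size(0), -1))

model = TinyNet()
pruner = L1FilterPruner(model, [{'sparsity': 0.5, 'op_types': ['Conv2d']}])
pruner.compress()
pruner.export_model(model_path='tiny.pth', mask_path='tiny_mask.pth')

# Load the exported (already masked) weights into a clean, unwrapped copy.
model = TinyNet()
model.load_state_dict(torch.load('tiny.pth'))
model.eval()
dummy_input = torch.randn(4, 1, 28, 28)
apply_compression_results(model, 'tiny_mask.pth')
mask_out = model(dummy_input)
# ModelSpeedup modifies the model in place, so run the masked forward first.
ModelSpeedup(model, dummy_input, 'tiny_mask.pth').speedup_model()
speedup_out = model(dummy_input)
assert torch.allclose(mask_out, speedup_out, atol=1e-5)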
def test_speedup_bigmodel(self):
    prune_model_l1(BigModel())
    model = BigModel()
    apply_compression_results(model, MASK_FILE, 'cpu')
    model.eval()
    mask_out = model(dummy_input)

    model.train()
    ms = ModelSpeedup(model, dummy_input, MASK_FILE)
    ms.speedup_model()
    assert model.training

    model.eval()
    speedup_out = model(dummy_input)
    if not torch.allclose(mask_out, speedup_out, atol=1e-07):
        print('input:', dummy_input.size(), torch.abs(dummy_input).sum((2, 3)))
        print('mask_out:', mask_out)
        print('speedup_out:', speedup_out)
        raise RuntimeError('model speedup inference result is incorrect!')

    orig_model = BigModel()
    assert model.backbone2.conv1.out_channels == int(orig_model.backbone2.conv1.out_channels * SPARSITY)
    assert model.backbone2.conv2.in_channels == int(orig_model.backbone2.conv2.in_channels * SPARSITY)
    assert model.backbone2.conv2.out_channels == int(orig_model.backbone2.conv2.out_channels * SPARSITY)
    assert model.backbone2.fc1.in_features == int(orig_model.backbone2.fc1.in_features * SPARSITY)
def test_channel_prune(self):
    orig_net = resnet18(num_classes=10).to(device)
    channel_prune(orig_net)
    state_dict = torch.load(MODEL_FILE)
    orig_net = resnet18(num_classes=10).to(device)
    orig_net.load_state_dict(state_dict)
    apply_compression_results(orig_net, MASK_FILE)
    orig_net.eval()

    net = resnet18(num_classes=10).to(device)
    net.load_state_dict(state_dict)
    net.eval()

    data = torch.randn(BATCH_SIZE, 3, 224, 224).to(device)
    ms = ModelSpeedup(net, data, MASK_FILE)
    ms.speedup_model()
    ms.bound_model(data)  # one forward pass through the speeded-up model ModelSpeedup holds
    net.eval()

    ori_sum = orig_net(data).abs().sum().item()
    speeded_sum = net(data).abs().sum().item()
    print(ori_sum, speeded_sum)
    assert (abs(ori_sum - speeded_sum) / abs(ori_sum) < RELATIVE_THRESHOLD) or \
        (abs(ori_sum - speeded_sum) < ABSOLUTE_THRESHOLD)
def slim_speedup(masks_file, model_checkpoint):
    device = torch.device('cuda')
    model = VGG(depth=19)
    model.to(device)
    model.eval()
    dummy_input = torch.randn(64, 3, 32, 32)
    if use_mask:
        apply_compression_results(model, masks_file)
        dummy_input = dummy_input.to(device)
        start = time.time()
        for _ in range(32):
            out = model(dummy_input)
        #print(out.size(), out)
        print('mask elapsed time: ', time.time() - start)
        return
    else:
        #print("model before: ", model)
        m_speedup = ModelSpeedup(model, dummy_input.to(device), masks_file)
        m_speedup.speedup_model()
        #print("model after: ", model)
        dummy_input = dummy_input.to(device)
        start = time.time()
        for _ in range(32):
            out = model(dummy_input)
        #print(out.size(), out)
        print('speedup elapsed time: ', time.time() - start)
        return
def model_inference(config):
    masks_file = config['masks_file']
    device = torch.device(config['device'])
    if config['model_name'] == 'unet':
        model = UNet(3, 1)
    elif config['model_name'] == 'vgg19':
        model = VGG(depth=19)
    elif config['model_name'] == 'naive':
        from model_prune_torch import NaiveModel
        model = NaiveModel()
    model.to(device)
    model.load_state_dict(torch.load(config['model_file'], map_location=device))
    model.eval()

    dummy_input = torch.randn(config['input_shape']).to(device)
    use_mask_out = use_speedup_out = None
    # must run use_mask before use_speedup because use_speedup modifies the model
    if use_mask:
        apply_compression_results(model, masks_file, device)
        start = time.time()
        for _ in range(1):
            use_mask_out = model(dummy_input)
        print('elapsed time when use mask: ', time.time() - start)
    if use_speedup:
        m_speedup = ModelSpeedup(model, dummy_input, masks_file, device)
        m_speedup.speedup_model()
        start = time.time()
        for _ in range(1):
            use_speedup_out = model(dummy_input)
        print('elapsed time when use speedup: ', time.time() - start)
    if compare_results:
        if torch.allclose(use_mask_out, use_speedup_out, atol=1e-05):
            torch.save(model, config['save_dir_for_speedup'])
            print('the outputs from use_mask and use_speedup are the same')
        else:
            raise RuntimeError(
                'the outputs from use_mask and use_speedup are different')
def model_inference(config):
    masks_file = config['masks_file']
    device = torch.device(config['device'])
    if config['model_name'] == 'unet':
        model = UNet(3, 1)
    elif config['model_name'] == 'testNet':
        model = testNet()
    elif config['model_name'] == 'naive':
        from model_prune_torch import NaiveModel
        model = NaiveModel()
    model.to(device)
    model.eval()

    dummy_input = torch.randn(config['input_shape']).to(device)
    use_mask_out = use_speedup_out = None
    # must run use_mask before use_speedup because use_speedup modifies the model
    if use_mask:
        apply_compression_results(model, masks_file,
                                  'cpu' if config['device'] == 'cpu' else None)
        start = time.time()
        for _ in range(1):
            use_mask_out = model(dummy_input)
        print('elapsed time when use mask: ', time.time() - start)
    if use_speedup:
        m_speedup = ModelSpeedup(model, dummy_input, masks_file,
                                 'cpu' if config['device'] == 'cpu' else None)
        m_speedup.speedup_model()
        start = time.time()
        for _ in range(1):
            use_speedup_out = model(dummy_input)
        print('elapsed time when use speedup: ', time.time() - start)
    if compare_results:
        if torch.allclose(use_mask_out, use_speedup_out, atol=1e-07):
            print('the outputs from use_mask and use_speedup are the same')
        else:
            raise RuntimeError(
                'the outputs from use_mask and use_speedup are different')
def compress(model, dummy, pruner_cls, config_list, ori_metric=1.00,
             metric_thres=0.01, sensitivity=None, trace=None, verbose=True):
    if sensitivity:
        config_list = update_sparsity_by_sensitivity(config_list, ori_metric,
                                                     metric_thres, sensitivity)
    compressed_model = copy.deepcopy(model)
    pruner = pruner_cls(compressed_model, config_list)
    compressed_model = pruner.compress()
    mask_path = "/tmp/mask.pth"
    pruner.export_model(model_path='/tmp/model.pth', mask_path=mask_path)
    pruner._unwrap_model()

    print("fixing mask conflict...")
    fixed_mask = fix_mask_conflict(mask_path, compressed_model, dummy, trace)
    # mask = torch.load(mask_path)
    compressed_model.load_state_dict(model.state_dict())
    apply_compression_results(compressed_model, fixed_mask)

    if verbose:
        count_zero(compressed_model, verbose=False)
        from thop import profile
        # thop expects the inputs as a tuple; a bare tensor would be unpacked
        macs, params = profile(compressed_model, inputs=(dummy,), verbose=False)
        print("MACs: {} G, Params: {} M".format(macs / 1e9, params / 1e6))

    speedup_model = speedup(compressed_model, dummy, fixed_mask, trace)
    if verbose:
        count_zero(speedup_model, verbose=False)
    return speedup_model, fixed_mask
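# Why the fix_mask_conflict step above matters: layers whose outputs are
# combined by an elementwise add (e.g. residual connections) must end up with
# the same set of pruned channels, or the shrunken tensors no longer line up
# once ModelSpeedup removes them. A hedged sketch of calling it directly,
# assuming the v1-style util path (the exact module path varies across NNI
# versions); /tmp paths are illustrative.
import torch
from torchvision.models import resnet18
from nni.compression.torch import L1FilterPruner
from nni.compression.torch.utils.mask_conflict import fix_mask_conflict

net = resnet18()
pruner = L1FilterPruner(net, [{'sparsity': 0.5, 'op_types': ['Conv2d']}])
pruner.compress()
pruner.export_model(model_path='/tmp/model.pth', mask_path='/tmp/mask.pth')
pruner._unwrap_model()  # remove pruner wrappers before tracing, as above
# Align the channel masks across layers joined by residual additions.
fixed_mask = fix_mask_conflict('/tmp/mask.pth', net,
                               torch.randn(1, 3, 224, 224))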
def test_nni():
    model = load_t_net()
    config_list = [{'sparsity': 0.5, 'op_types': ['Conv2d']}]
    pruner = SlimPruner(model, config_list)
    model = pruner.compress()
    print(model)
    masks_file = "./nni/mask.pth"
    pruner.export_model(model_path="./nni/nni_mod.pth", mask_path=masks_file)
    print("export ok")

    apply_compression_results(model, masks_file)
    # model: the model to be sped up
    # dummy_input: an example input for the model, passed to `jit.trace`
    # masks_file: the mask file produced by the pruning algorithm
    dummy_input = torch.randn(1, 3, 384, 224)
    model.cuda()  # the traced model must be on the same device as dummy_input
    m_speedup = ModelSpeedup(model, dummy_input.cuda(), masks_file)
    m_speedup.speedup_model()

    dummy_input = dummy_input.cuda()
    start = time.time()
    out = model(dummy_input)
    summary(model, dummy_input)
    print('elapsed time: ', time.time() - start)
net = UNet(n_channels=3, n_classes=1)
logging.info("Loading model {}".format(args.model))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# logging.info(f'Using device {device}')
net.to(device=device)
# select the pruned-model checkpoint to load
net.load_state_dict(
    torch.load("./save_pruner/pruned_model.pt", map_location=device))
logging.info("Model loaded !")
net.eval()
apply_compression_results(net, "./save_pruner/pruned_mask.pt", 'cuda')

import time
start = time.time()
for i, fn in enumerate(in_files):
    logging.info("\nPredicting image {} ...".format(fn))
    img = Image.open(fn)
    mask = predict_img(net=net,
                       full_img=img,
                       scale_factor=args.scale,
                       out_threshold=args.mask_threshold,
                       device=device)
test(model, device, test_data_loader)
torch.save(model.state_dict(), 'pretrained_model.pth')

print("start model pruning...")
optimizer = torch.optim.SGD(model.parameters(), lr=0.001,
                            momentum=0.9, weight_decay=1e-4)
best_top1 = 0
# pruner = SlimPruner(model, config_list, optimizer)
pruner = ActivationMeanRankFilterPruner(model, config_list, optimizer)
model = pruner.compress()
for epoch in range(prune_epochs):
    pruner.update_epoch(epoch)
    print("# Epoch {} #".format(epoch))
    train(model, device, train_data_loader, optimizer)
    top1 = test(model, device, test_data_loader)
    if top1 > best_top1:
        best_top1 = top1  # track the best accuracy so only improvements are exported
        pruner.export_model(model_path='pruned_model.pth',
                            mask_path='pruned_mask.pth')

from nni.compression.torch import apply_compression_results
from nni.compression.speedup.torch import ModelSpeedup

model = MobileModel().cuda()
model.eval()
apply_compression_results(model, 'pruned_mask.pth', None)
m_speedup = ModelSpeedup(model, torch.randn(1, 3, 224, 224).cuda(),
                         'pruned_mask.pth', None)
m_speedup.speedup_model()
torch.save(model.state_dict(), 'pruned_speedup_model.pth')
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

dummy_input = next(iter(validate_loader))
dummy_input = dummy_input['img'].to(device)
checkpoint = torch.load(args.model_file, map_location=device)
model.load_state_dict(checkpoint, strict=False)
model.to(device)
model.eval()

use_mask_out = use_speedup_out = None
# must run use_mask before use_speedup because use_speedup modifies the model
if use_mask:
    apply_compression_results(model, args.masks_file, device)
    start = time.time()
    for _ in range(32):
        use_mask_out = model(dummy_input)
    print('elapsed time when use mask: ', time.time() - start)
if use_speedup:
    m_speedup = ModelSpeedup(model, dummy_input, args.masks_file, device)
    m_speedup.speedup_model()
    start = time.time()
    for _ in range(32):
        use_speedup_out = model(dummy_input)
    print('elapsed time when use speedup: ', time.time() - start)
torch.save(
    model.state_dict(),
    "output/DBNet_opensource_nni_resnet18_fpn_db/checkpoint/pruner_speed.pth")
def model_inference(config):
    masks_file = './speedup_test/mask_new.pth'
    shape_mask = './speedup_test/mask_new.pth'
    org_mask = './speedup_test/mask.pth'

    rn50 = models.resnet50()
    m_paras = torch.load('./speedup_test/model_fine_tuned.pth')
    # drop the mask tensors and the DataParallel 'module.' prefix from the checkpoint
    m_new = collections.OrderedDict()
    for key in m_paras:
        if 'mask' in key:
            continue
        if 'module' in key:
            m_new[key.replace('module.', '')] = m_paras[key]
        else:
            m_new[key] = m_paras[key]
    rn50.load_state_dict(m_new)
    rn50.cuda()
    rn50.eval()

    dummy_input = torch.randn(64, 3, 224, 224).cuda()
    use_mask_out = use_speedup_out = None
    rn = rn50
    apply_compression_results(rn, org_mask, 'cuda')
    rn_mask_out = rn(dummy_input)
    model = rn50
    # must run use_mask before use_speedup because use_speedup modifies the model
    if use_mask:
        apply_compression_results(model, masks_file, 'cuda')
        torch.onnx.export(model, dummy_input, 'resnet_masked.onnx',
                          export_params=True,
                          opset_version=12,
                          do_constant_folding=True,
                          input_names=['inputs'],
                          output_names=['proba'],
                          dynamic_axes={'inputs': [0], 'mask': [0]},
                          keep_initializers_as_inputs=True)
        start = time.time()
        for _ in range(32):
            use_mask_out = model(dummy_input)
        print('elapsed time when use mask: ', time.time() - start)

    print('Model is ', model)
    print('before speed up===================')
    # print(para)
    # print(model.state_dict()[para])
    # print(model.state_dict()[para].shape)
    flops, paras = count_flops_params(model, (1, 3, 224, 224))
    print('flops and parameters before speedup: {} FLOPs and {} params'.format(
        flops, paras))

    if use_speedup:
        dummy_input = dummy_input.cuda()
        m_speedup = ModelSpeedup(model, dummy_input, shape_mask, 'cuda')
        m_speedup.speedup_model()
        print('==' * 20)
        print('Start inference')
        torch.onnx.export(model, dummy_input, 'resnet_fpgm.onnx',
                          export_params=True,
                          opset_version=12,
                          do_constant_folding=True,
                          input_names=['inputs'],
                          output_names=['proba'],
                          dynamic_axes={'inputs': [0], 'mask': [0]},
                          keep_initializers_as_inputs=True)
        start = time.time()
        for _ in range(32):
            use_speedup_out = model(dummy_input)
        print('elapsed time when use speedup: ', time.time() - start)

    print('After speedup model is ', model)
    print('=================')
    print('After speedup')
    flops, paras = count_flops_params(model, (1, 3, 224, 224))
    print('flops and parameters after speedup: {} FLOPs and {} params'.format(
        flops, paras))
    # for para in model.state_dict():
    #     print(para)
    #     print(model.state_dict()[para])
    #     print(model.state_dict()[para].shape)

    if compare_results:
        print(rn_mask_out)
        print('another is', use_speedup_out)
        if torch.allclose(rn_mask_out, use_speedup_out, atol=1e-6):
            print('the outputs from use_mask and use_speedup are the same')
        else:
            raise RuntimeError(
                'the outputs from use_mask and use_speedup are different')

    # start the accuracy check
    criterion = nn.CrossEntropyLoss()
    with torch.no_grad():
        start = time.time()
        evaluate(model, criterion, data_loader_test,
                 device="cuda", print_freq=20)
        print('elapsed time is ', time.time() - start)
import time

import torch

from nni.compression.torch import apply_compression_results
from nni.compression.speedup.torch import ModelSpeedup

dummy_input = torch.randn((64, 3, 224, 224)).cuda()
model = MobileNetV2(n_class=config.num_classes, width_mult=1.0)
model.cuda()

start = time.time()
for i in range(32):
    output = model(dummy_input)
end = time.time()
print("Time for original model:", end - start)

model.load_state_dict(torch.load('results/pruned/pruned_model.pth'))
mask_file = './results/pruned/pruned_mask.pth'
apply_compression_results(model, mask_file, 'cuda')
start = time.time()
for i in range(32):
    mask_output = model(dummy_input)
end = time.time()
print("Time for masked model:", end - start)

m_speedup = ModelSpeedup(model, dummy_input, mask_file, 'cuda')
m_speedup.speedup_model()
start = time.time()
for i in range(32):
    speedup_output = model(dummy_input)
end = time.time()
print("Time for speedup model:", end - start)
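# Note on persisting the result: ModelSpeedup changes layer shapes, so a
# state_dict saved after speedup (as in several snippets above) will not load
# into a freshly constructed original model. Either save the whole module, or
# re-run ModelSpeedup on a fresh model before calling load_state_dict. The
# path below is hypothetical.
torch.save(model, 'speedup_model_full.pth')   # pickles the module with its new shapes
model = torch.load('speedup_model_full.pth')  # reload directly, no class rebuild needed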