def test_flops_params(self):
    class Model1(nn.Module):
        def __init__(self):
            super(Model1, self).__init__()
            self.conv = nn.Conv2d(3, 5, 1, 1)
            self.bn = nn.BatchNorm2d(5)
            self.relu = nn.LeakyReLU()
            self.linear = nn.Linear(20, 10)
            self.upsample = nn.UpsamplingBilinear2d(size=2)
            self.pool = nn.AdaptiveAvgPool2d((2, 2))

        def forward(self, x):
            x = self.conv(x)
            x = self.bn(x)
            x = self.relu(x)
            x = self.upsample(x)
            x = self.pool(x)
            x = x.view(x.size(0), -1)
            x = self.linear(x)
            return x

    class Model2(nn.Module):
        def __init__(self):
            super(Model2, self).__init__()
            self.conv = nn.Conv2d(3, 5, 1, 1)
            self.conv2 = nn.Conv2d(5, 5, 1, 1)

        def forward(self, x):
            x = self.conv(x)
            for _ in range(5):
                x = self.conv2(x)
            return x

    for bs in [1, 2]:
        flops, params, results = count_flops_params(Model1(), (bs, 3, 2, 2), mode='full', verbose=False)
        assert (flops, params) == (610, 240)

        flops, params, results = count_flops_params(Model2(), (bs, 3, 2, 2), verbose=False)
        assert (flops, params) == (560, 50)

        from torchvision.models import resnet50
        flops, params, results = count_flops_params(resnet50(), (bs, 3, 224, 224), verbose=False)
        assert (flops, params) == (4089184256, 25503912)
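# --- Illustration (not part of the test above) ---
# A minimal sketch of inspecting the third return value of count_flops_params.
# It assumes each entry in `results` is a dict carrying at least the 'name',
# 'flops' and 'params' keys, which is how the environment code below indexes
# it. The import path matches NNI 2.x.
import torch
from torchvision.models import resnet18
from nni.compression.pytorch.utils.counter import count_flops_params

flops, params, results = count_flops_params(resnet18(), (1, 3, 224, 224), verbose=False)
per_layer = {entry['name']: entry for entry in results}
for name, entry in list(per_layer.items())[:5]:
    print(f"{name}: {entry['flops']} FLOPs, {entry['params']} params")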
def step(self, action: float, model: Module):
    _, _, current_statistics = count_flops_params(model, self.dummy_input, verbose=False)
    current_statistics = {result['name']: result for result in current_statistics}
    index = self.pruning_op_names.index(self.current_op_name)

    action = 1 - current_statistics[self.current_op_name][self.target] / self.current_op_target

    total_current_target = sum([current_statistics[name][self.target] for name in self.pruning_op_names])
    previous_pruning_target = self.under_pruning_target - total_current_target
    rest_target = sum([current_statistics[name][self.target] for name in self.pruning_op_names[index + 1:]])

    self.layer_embedding[index][-3] = previous_pruning_target / self.under_pruning_target  # reduced
    self.layer_embedding[index][-2] = rest_target / self.under_pruning_target  # rest
    self.layer_embedding[index][-1] = action  # last action

    observation = self.layer_embedding[index, :].copy()

    return action, 0, observation, self.is_final_layer()
def correct_action(self, action: float, model: Module):
    try:
        op_name = next(self.ops_iter)
        index = self.pruning_op_names.index(op_name)

        _, _, current_statistics = count_flops_params(model, self.dummy_input, verbose=False)
        current_statistics = {result['name']: result for result in current_statistics}
        total_current_target = sum([current_statistics[name][self.target] for name in self.pruning_op_names])
        previous_pruning_target = self.under_pruning_target - total_current_target
        max_rest_pruning_target = sum([current_statistics[name][self.target] * self.max_sparsity_per_layer[name]
                                       for name in self.pruning_op_names[index + 1:]])
        min_current_pruning_target = self.excepted_pruning_target - previous_pruning_target - max_rest_pruning_target
        max_current_pruning_target_1 = (self.origin_statistics[op_name][self.target] * self.max_sparsity_per_layer[op_name]
                                        - (self.origin_statistics[op_name][self.target] - current_statistics[op_name][self.target]))
        max_current_pruning_target_2 = self.excepted_pruning_target - previous_pruning_target
        max_current_pruning_target = min(max_current_pruning_target_1, max_current_pruning_target_2)

        min_action = min_current_pruning_target / current_statistics[op_name][self.target]
        max_action = max_current_pruning_target / current_statistics[op_name][self.target]
        if min_action > self.max_sparsity_per_layer[op_name]:
            _logger.warning('[%s] min action > max sparsity per layer: %f > %f',
                            op_name, min_action, self.max_sparsity_per_layer[op_name])
        action = max(0., min(max_action, max(min_action, action)))

        self.current_op_name = op_name
        self.current_op_target = current_statistics[op_name][self.target]
    except StopIteration:
        raise Error('Something went wrong; this should not happen.')

    return action
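# --- Illustration (hypothetical numbers, not from the original code) ---
# How correct_action clamps a raw agent action. Suppose the expected total
# pruning target is 600 MFLOPs, earlier layers already removed 200 MFLOPs,
# later layers can remove at most 250 MFLOPs more, the current layer holds
# 300 MFLOPs (none pruned yet), and its max sparsity per layer is 0.8.
excepted_pruning_target = 600.0
previous_pruning_target = 200.0
max_rest_pruning_target = 250.0
current_layer_target = 300.0
max_sparsity = 0.8

min_action = (excepted_pruning_target - previous_pruning_target - max_rest_pruning_target) / current_layer_target  # 0.5
max_action = min(max_sparsity * current_layer_target,
                 excepted_pruning_target - previous_pruning_target) / current_layer_target  # 0.8

raw_action = 0.3
action = max(0., min(max_action, max(min_action, raw_action)))
print(action)  # 0.5 -- raised to the minimum needed to keep the total target reachable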
def __init__(self, model: Module, config_list: List[Dict], dummy_input: Tensor, total_sparsity: float,
             max_sparsity_per_layer: Dict[str, float], target: str = 'flops'):
    pruning_op_names = []
    for config in config_list_canonical(model, config_list):
        pruning_op_names.extend(config['op_names'])

    self.pruning_ops = OrderedDict()
    self.pruning_types = []
    for i, (name, layer) in enumerate(model.named_modules()):
        if name in pruning_op_names:
            op_type = type(layer).__name__
            stride = np.power(np.prod(layer.stride), 1 / len(layer.stride)) if hasattr(layer, 'stride') else 0  # type: ignore
            kernel_size = np.power(np.prod(layer.kernel_size), 1 / len(layer.kernel_size)) if hasattr(layer, 'kernel_size') else 1  # type: ignore
            self.pruning_ops[name] = (i, op_type, stride, kernel_size)
            self.pruning_types.append(op_type)
    self.pruning_types = list(set(self.pruning_types))
    self.pruning_op_names = list(self.pruning_ops.keys())

    self.dummy_input = dummy_input
    self.total_sparsity = total_sparsity
    self.max_sparsity_per_layer = max_sparsity_per_layer
    assert target in ['flops', 'params']
    self.target = target

    self.origin_target, self.origin_params_num, origin_statistics = count_flops_params(model, dummy_input, verbose=False)
    self.origin_statistics = {result['name']: result for result in origin_statistics}

    self.under_pruning_target = sum([self.origin_statistics[name][self.target] for name in self.pruning_op_names])
    self.excepted_pruning_target = self.total_sparsity * self.under_pruning_target
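# --- Illustration (not part of the class above) ---
# The stride and kernel_size features are geometric means over the spatial
# dimensions, so asymmetric kernels still map to a single scalar, e.g.:
import numpy as np
kernel_size = (3, 5)
print(np.power(np.prod(kernel_size), 1 / len(kernel_size)))  # sqrt(15) ~= 3.873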
def count_flops(model, log=None, device=None):
    dummy_input = torch.rand([1, 3, 256, 256])
    if device is not None:
        dummy_input = dummy_input.to(device)
    flops, params, results = count_flops_params(model, dummy_input)
    print(f"FLOPs: {flops}, params: {params}")
    if log is not None:
        log.write(f"FLOPs: {flops}, params: {params}\n")
    return flops, params
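# --- Example usage of the helper above (illustrative) ---
# Note the 1 x 3 x 256 x 256 dummy input is hard-coded inside count_flops, so
# the helper only fits models that accept that shape.
from torchvision.models import mobilenet_v2
flops, params = count_flops(mobilenet_v2())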
def generate_compression_search_space(config: CompressionConfig, vessel: CompressionVessel) -> Dict[str, Dict]:
    """
    Use the config (constraints & priors) and the vessel (model-related data) to generate the HPO search space.
    """
    search_space = {}
    model, _, evaluator, dummy_input, _, _, _, _ = vessel.export()
    flops, params, results = count_flops_params(model, dummy_input, verbose=False, mode='full')
    metric = evaluator(model)
    module_names_summary = _summary_module_names(model, config.module_types, config.module_names, config.exclude_module_names)

    for module_name in module_names_summary:
        search_space['{}{}'.format(KEY_MODULE_NAME, module_name)] = {'_type': 'uniform', '_value': [0, 1]}

    assert not config.pruners or not config.quantizers
    # TODO: hard-coded for step 1, needs refactoring
    search_space[KEY_PRUNERS] = {'_type': 'choice', '_value': [pruner_config.json() for pruner_config in config.pruners]}

    original_target = {'flops': flops, 'params': params, 'metric': metric, 'results': results}
    # TODO: the following functions need improvement
    flops_theta = _flops_theta_helper(config.flops, flops)
    params_theta = _flops_theta_helper(config.params, params)
    metric_theta = _metric_theta_helper(config.metric, metric)
    thetas = {'flops': flops_theta, 'params': params_theta, 'metric': metric_theta}

    search_space[KEY_VESSEL] = {'_type': 'choice', '_value': [vessel.json()]}
    search_space[KEY_ORIGINAL_TARGET] = {'_type': 'choice', '_value': [original_target]}
    search_space[KEY_THETAS] = {'_type': 'choice', '_value': [thetas]}
    return search_space
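# --- Rough shape of the emitted search space (illustrative only; the KEY_*
# constants are defined elsewhere in this module and the module name below is
# hypothetical):
# {
#     '<KEY_MODULE_NAME>backbone.conv1': {'_type': 'uniform', '_value': [0, 1]},
#     '<KEY_PRUNERS>':         {'_type': 'choice', '_value': [<pruner configs>]},
#     '<KEY_VESSEL>':          {'_type': 'choice', '_value': [<vessel json>]},
#     '<KEY_ORIGINAL_TARGET>': {'_type': 'choice', '_value': [<original target>]},
#     '<KEY_THETAS>':          {'_type': 'choice', '_value': [<thetas>]},
# }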
def _calculate_flops(self, eps=0.001):
    """FLOPs cost."""
    flops_lut = [{} for _ in range(self.cnt_layers)]
    layer_id = 0

    for stage_name in self.lut_ops:
        stage_ops = self.lut_ops[stage_name]
        ops_num = self.layer_num[stage_name]

        for _ in range(ops_num):
            for op_name in stage_ops:
                layer_config = self.layer_configs[layer_id]
                key_params = {"fm_size": layer_config[3]}
                op = stage_ops[op_name](*layer_config[0:3], **key_params)

                # measured in FLOPs
                in_shape = self.layer_in_shapes[layer_id]
                x = (1, in_shape[0], in_shape[1], in_shape[2])
                flops, _, _ = count_flops_params(op, x, verbose=False)
                flops = eps if flops == 0.0 else flops
                flops_lut[layer_id][op_name] = float(flops)
            layer_id += 1

    return flops_lut
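# --- Illustration (hypothetical, not from the original file) ---
# A LUT like the one built above is typically consumed by weighting each
# candidate op's cost with its sampling probability and summing over layers:
def expected_flops(flops_lut, op_probs):
    """op_probs[layer_id] maps op_name -> sampling probability."""
    total = 0.0
    for layer_id, op_costs in enumerate(flops_lut):
        for op_name, cost in op_costs.items():
            total = total + op_probs[layer_id][op_name] * cost
    return total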
def generate_tasks(self, task_result: TaskResult) -> List[Task]:
    # append experience & update agent policy
    if self.action is not None:
        action, reward, observation, done = self.env.step(self.action, task_result.compact_model)
        self.T.append([reward, self.observation, observation, self.action, done])
        self.observation = observation.copy()

        if done:
            assert task_result.score is not None, 'task_result.score should not be None if environment is done.'
            final_reward = task_result.score - 1

            # the agent observes and updates its policy
            for _, s_t, s_t1, a_t, d_t in self.T:
                self.agent.observe(final_reward, s_t, s_t1, a_t, d_t)
                if self.current_episode > self.warmup_episode:
                    self.agent.update_policy()

            self.current_episode += 1
            self.T = []
            self.action = None
            self.observation = None

    # update current2origin_sparsity in the log file
    origin_model = torch.load(self._origin_model_path)
    compact_model = task_result.compact_model
    compact_model_masks = task_result.compact_model_masks
    current2origin_sparsity, _, _ = compute_sparsity(origin_model, compact_model, compact_model_masks, self.temp_config_list)
    self._tasks[task_result.task_id].state['current2origin_sparsity'] = current2origin_sparsity
    current2origin_sparsity, _, _ = compute_sparsity(origin_model, compact_model, compact_model_masks, self.config_list_copy)
    self._tasks[task_result.task_id].state['current_total_sparsity'] = current2origin_sparsity
    flops, params, _ = count_flops_params(compact_model, self.dummy_input, verbose=False)
    self._tasks[task_result.task_id].state['current_flops'] = '{:.2f} M'.format(flops / 1e6)
    self._tasks[task_result.task_id].state['current_params'] = '{:.2f} M'.format(params / 1e6)

    # generate a new action
    if self.current_episode < self.total_episode:
        if self.observation is None:
            self.observation = self.env.reset().copy()
            self.temp_config_list = []
            compact_model = torch.load(self._origin_model_path)
            compact_model_masks = torch.load(self._origin_masks_path)
        else:
            compact_model = task_result.compact_model
            compact_model_masks = task_result.compact_model_masks

        if self.current_episode <= self.warmup_episode:
            action = self.agent.random_action()
        else:
            action = self.agent.select_action(self.observation, episode=self.current_episode)
        action = action.tolist()[0]
        self.action = self.env.correct_action(action, compact_model)
        sub_config_list = [{'op_names': [self.env.current_op_name], 'total_sparsity': self.action}]
        self.temp_config_list.extend(sub_config_list)

        task_id = self._task_id_candidate
        if self.env.is_first_layer() or self.env.is_final_layer():
            task_config_list = self.temp_config_list
        else:
            task_config_list = sub_config_list

        config_list_path = Path(self._intermediate_result_dir, '{}_config_list.json'.format(task_id))
        with Path(config_list_path).open('w') as f:
            json_tricks.dump(task_config_list, f, indent=4)

        model_path = Path(self._intermediate_result_dir, '{}_compact_model.pth'.format(task_result.task_id))
        masks_path = Path(self._intermediate_result_dir, '{}_compact_model_masks.pth'.format(task_result.task_id))
        torch.save(compact_model, model_path)
        torch.save(compact_model_masks, masks_path)

        task = Task(task_id, model_path, masks_path, config_list_path)
        if not self.env.is_final_layer():
            task.finetune = False
            task.evaluate = False

        self._tasks[task_id] = task
        self._task_id_candidate += 1
        return [task]
    else:
        return []
        model, total_epoch=args.pretrain_epochs)
    criterion = torch.nn.CrossEntropyLoss()
    pre_best_acc = 0.0
    best_state_dict = None

    for i in range(args.pretrain_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        acc = evaluator(model)
        if acc > pre_best_acc:
            pre_best_acc = acc
            best_state_dict = model.state_dict()
    print("Best accuracy: {}".format(pre_best_acc))
    model.load_state_dict(best_state_dict)

    pre_flops, pre_params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
    g_epoch = 0

    # Start to prune and speedup
    print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
    config_list = [{'sparsity': 0.5, 'op_types': ['Conv2d']}]
    pruner = FPGMPruner(model, config_list)
    _, masks = pruner.compress()
    pruner.show_pruned_weights()
    pruner._unwrap_model()
    ModelSpeedup(model, dummy_input=torch.rand([10, 3, 32, 32]).to(device), masks_file=masks).speedup_model()

    print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER SPEEDUP ' + '=' * 50)
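    # --- Sketch (assumption, not from the original file): the evaluation step
    # after the banner above presumably reuses the evaluator and FLOPs counter
    # from the pretraining loop to report the compression effect.
    acc = evaluator(model)
    flops, params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
    print('Accuracy: {}, FLOPs: {} -> {}, params: {} -> {}'.format(acc, pre_flops, flops, pre_params, params))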
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args = parse_args()

    #########################################################################
    # Prepare model, tokenizer, dataset, optimizer, and the scheduler
    logger.setLevel(logging.INFO)
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()

    # Load dataset and tokenizer, and then preprocess the dataset
    raw_dataset, is_regression, num_labels = get_raw_dataset(args.task_name)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True)
    processed_datasets = preprocess(args, tokenizer, raw_dataset)
    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation_matched" if args.task_name == "mnli" else "validation"]

    # Load pretrained model
    config = AutoConfig.from_pretrained(args.model_name, num_labels=num_labels, finetuning_task=args.task_name)
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name, config=config)
    model.to(device)

    #########################################################################
    # Finetune on the target GLUE task before pruning
    optimizer, train_dataloader, eval_dataloader, data_collator = get_dataloader_and_optimizer(
        args, tokenizer, model, train_dataset, eval_dataset)
    train_steps = args.num_train_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer,
                                 num_warmup_steps=args.num_warmup_steps, num_training_steps=train_steps)
    metric = load_metric("glue", args.task_name)

    logger.info("================= Finetuning before pruning =================")
    train_model(args, model, is_regression, train_dataloader, eval_dataloader, optimizer, lr_scheduler, metric, device)

    if args.output_dir is not None:
        torch.save(model.state_dict(), args.output_dir + "/model_before_pruning.pt")

    if args.task_name == "mnli":
        final_eval_for_mnli(args, model, processed_datasets, metric, data_collator)

    #########################################################################
    # Pruning
    optimizer, train_dataloader, eval_dataloader, data_collator = get_dataloader_and_optimizer(
        args, tokenizer, model, train_dataset, eval_dataset)

    dummy_input = next(iter(train_dataloader))["input_ids"].to(device)
    flops, params, results = count_flops_params(model, dummy_input)
    print(f"Initial model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f}M")

    # Here the criterion is embedded in the model. Upper levels can just pass None to the trainer.
    def trainer(model, optimizer, criterion, epoch):
        return trainer_helper(model, train_dataloader, optimizer, device)

    def forward_runner(model):
        return forward_runner_helper(model, train_dataloader, device)

    # example: prune different layers with different sparsity
    attention_name_groups = list(zip(["bert.encoder.layer.{}.attention.self.query".format(i) for i in range(12)],
                                     ["bert.encoder.layer.{}.attention.self.key".format(i) for i in range(12)],
                                     ["bert.encoder.layer.{}.attention.self.value".format(i) for i in range(12)],
                                     ["bert.encoder.layer.{}.attention.output.dense".format(i) for i in range(12)]))

    kwargs = {"ranking_criterion": args.ranking_criterion,
              "global_sort": args.global_sort,
              "num_iterations": args.num_iterations,
              "epochs_per_iteration": args.epochs_per_iteration,
              "attention_name_groups": attention_name_groups,
              "head_hidden_dim": 64,
              "trainer": trainer,
              "optimizer": optimizer,
              "forward_runner": forward_runner}

    config_list = [{
        "sparsity": args.sparsity,
        "op_types": ["Linear"],
        "op_names": [x for layer in attention_name_groups[:6] for x in layer]
    }, {
        "sparsity": args.sparsity / 2,
        "op_types": ["Linear"],
        "op_names": [x for layer in attention_name_groups[6:] for x in layer]
    }]

    pruner = TransformerHeadPruner(model, config_list, **kwargs)
    pruner.compress()

    #########################################################################
    # uncomment the following part to export the pruned model masks
    # model_path = os.path.join(args.output_dir, "pruned_{}_{}.pth".format(args.model_name, args.task_name))
    # mask_path = os.path.join(args.output_dir, "mask_{}_{}.pth".format(args.model_name, args.task_name))
    # pruner.export_model(model_path=model_path, mask_path=mask_path)

    #########################################################################
    # Speedup
    # Currently, speeding up Transformers through NNI ModelSpeedup is not supported because of shape inference issues.
    # However, if you are using the transformers library, you can use the following workaround:
    # the code below gets the head pruning decisions from the pruner and calls the _prune_heads() function
    # implemented in models from the transformers library to speed up the model.
    if args.speedup:
        speedup_rules = {}
        for group_idx, group in enumerate(pruner.attention_name_groups):
            # get the layer index
            layer_idx = None
            for part in group[0].split("."):
                try:
                    layer_idx = int(part)
                    break
                except ValueError:
                    continue
            if layer_idx is not None:
                speedup_rules[layer_idx] = pruner.pruned_heads[group_idx]
        pruner._unwrap_model()
        model.bert._prune_heads(speedup_rules)
        print(model)

    #########################################################################
    # After pruning, finetune again on the target task
    # Get the metric function
    metric = load_metric("glue", args.task_name)

    # re-initialize the optimizer and the scheduler
    optimizer, _, _, data_collator = get_dataloader_and_optimizer(args, tokenizer, model, train_dataset, eval_dataset)
    lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer,
                                 num_warmup_steps=args.num_warmup_steps, num_training_steps=train_steps)

    logger.info("================= Finetuning after Pruning =================")
    train_model(args, model, is_regression, train_dataloader, eval_dataloader, optimizer, lr_scheduler, metric, device)

    if args.output_dir is not None:
        torch.save(model.state_dict(), args.output_dir + "/model_after_pruning.pt")

    if args.task_name == "mnli":
        final_eval_for_mnli(args, model, processed_datasets, metric, data_collator)

    flops, params, results = count_flops_params(model, dummy_input)
    print(f"Final model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f}M")
def main(args):
    # prepare dataset
    torch.manual_seed(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_loader, val_loader, criterion = get_data(args.dataset, args.data_dir, args.batch_size, args.test_batch_size)
    model, optimizer = get_trained_model_optimizer(args, device, train_loader, val_loader, criterion)

    def short_term_fine_tuner(model, epochs=1):
        for epoch in range(epochs):
            train(args, model, device, train_loader, criterion, optimizer, epoch)

    def trainer(model, optimizer, criterion, epoch):
        return train(args, model, device, train_loader, criterion, optimizer, epoch=epoch)

    def evaluator(model):
        return test(model, device, criterion, val_loader)

    # used to save the performance of the original, pruned, and finetuned models
    result = {'flops': {}, 'params': {}, 'performance': {}}

    flops, params, _ = count_flops_params(model, get_input_size(args.dataset))
    result['flops']['original'] = flops
    result['params']['original'] = params

    evaluation_result = evaluator(model)
    print('Evaluation result (original model): %s' % evaluation_result)
    result['performance']['original'] = evaluation_result

    # module types to prune; only "Conv2d" is supported for channel pruning
    if args.base_algo in ['l1', 'l2', 'fpgm']:
        op_types = ['Conv2d']
    elif args.base_algo == 'level':
        op_types = ['default']

    config_list = [{
        'sparsity': args.sparsity,
        'op_types': op_types
    }]
    dummy_input = get_dummy_input(args, device)

    if args.pruner == 'L1FilterPruner':
        pruner = L1FilterPruner(model, config_list)
    elif args.pruner == 'L2FilterPruner':
        pruner = L2FilterPruner(model, config_list)
    elif args.pruner == 'FPGMPruner':
        pruner = FPGMPruner(model, config_list)
    elif args.pruner == 'NetAdaptPruner':
        pruner = NetAdaptPruner(model, config_list, short_term_fine_tuner=short_term_fine_tuner, evaluator=evaluator,
                                base_algo=args.base_algo, experiment_data_dir=args.experiment_data_dir)
    elif args.pruner == 'ADMMPruner':
        # users are free to change the config here
        if args.model == 'LeNet':
            if args.base_algo in ['l1', 'l2', 'fpgm']:
                config_list = [{
                    'sparsity': 0.8,
                    'op_types': ['Conv2d'],
                    'op_names': ['conv1']
                }, {
                    'sparsity': 0.92,
                    'op_types': ['Conv2d'],
                    'op_names': ['conv2']
                }]
            elif args.base_algo == 'level':
                config_list = [{
                    'sparsity': 0.8,
                    'op_names': ['conv1']
                }, {
                    'sparsity': 0.92,
                    'op_names': ['conv2']
                }, {
                    'sparsity': 0.991,
                    'op_names': ['fc1']
                }, {
                    'sparsity': 0.93,
                    'op_names': ['fc2']
                }]
        else:
            raise ValueError('Example only implemented for LeNet.')
        pruner = ADMMPruner(model, config_list, trainer=trainer, num_iterations=2, epochs_per_iteration=2)
    elif args.pruner == 'SimulatedAnnealingPruner':
        pruner = SimulatedAnnealingPruner(
            model, config_list, evaluator=evaluator, base_algo=args.base_algo,
            cool_down_rate=args.cool_down_rate, experiment_data_dir=args.experiment_data_dir)
    elif args.pruner == 'AutoCompressPruner':
        pruner = AutoCompressPruner(
            model, config_list, trainer=trainer, evaluator=evaluator, dummy_input=dummy_input,
            num_iterations=3, optimize_mode='maximize', base_algo=args.base_algo,
            cool_down_rate=args.cool_down_rate, admm_num_iterations=30, admm_epochs_per_iteration=5,
            experiment_data_dir=args.experiment_data_dir)
    else:
        raise ValueError("Pruner not supported.")

    # Pruner.compress() returns the masked model,
    # but for AutoCompressPruner, Pruner.compress() directly returns the pruned model.
    model = pruner.compress()
    evaluation_result = evaluator(model)
    print('Evaluation result (masked model): %s' % evaluation_result)
    result['performance']['pruned'] = evaluation_result

    if args.save_model:
        pruner.export_model(
            os.path.join(args.experiment_data_dir, 'model_masked.pth'),
            os.path.join(args.experiment_data_dir, 'mask.pth'))
        print('Masked model saved to %s' % args.experiment_data_dir)

    # model speedup
    if args.speedup:
        if args.pruner != 'AutoCompressPruner':
            if args.model == 'LeNet':
                model = LeNet().to(device)
            elif args.model == 'vgg16':
                model = VGG(depth=16).to(device)
            elif args.model == 'resnet18':
                model = ResNet18().to(device)
            elif args.model == 'resnet50':
                model = ResNet50().to(device)

            model.load_state_dict(torch.load(os.path.join(args.experiment_data_dir, 'model_masked.pth')))
            masks_file = os.path.join(args.experiment_data_dir, 'mask.pth')

            m_speedup = ModelSpeedup(model, dummy_input, masks_file, device)
            m_speedup.speedup_model()
            evaluation_result = evaluator(model)
            print('Evaluation result (speedup model): %s' % evaluation_result)
            result['performance']['speedup'] = evaluation_result

            torch.save(model.state_dict(), os.path.join(args.experiment_data_dir, 'model_speedup.pth'))
            print('Speedup model saved to %s' % args.experiment_data_dir)

        flops, params, _ = count_flops_params(model, get_input_size(args.dataset))
        result['flops']['speedup'] = flops
        result['params']['speedup'] = params

    if args.fine_tune:
        if args.dataset == 'mnist':
            optimizer = torch.optim.Adadelta(model.parameters(), lr=1)
            scheduler = StepLR(optimizer, step_size=1, gamma=0.7)
        elif args.dataset == 'cifar10' and args.model == 'vgg16':
            optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
            scheduler = MultiStepLR(
                optimizer, milestones=[int(args.fine_tune_epochs * 0.5), int(args.fine_tune_epochs * 0.75)], gamma=0.1)
        elif args.dataset == 'cifar10' and args.model == 'resnet18':
            optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
            scheduler = MultiStepLR(
                optimizer, milestones=[int(args.fine_tune_epochs * 0.5), int(args.fine_tune_epochs * 0.75)], gamma=0.1)
        elif args.dataset == 'cifar10' and args.model == 'resnet50':
            optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
            scheduler = MultiStepLR(
                optimizer, milestones=[int(args.fine_tune_epochs * 0.5), int(args.fine_tune_epochs * 0.75)], gamma=0.1)

        best_acc = 0
        for epoch in range(args.fine_tune_epochs):
            train(args, model, device, train_loader, criterion, optimizer, epoch)
            scheduler.step()
            acc = evaluator(model)
            if acc > best_acc:
                best_acc = acc
                torch.save(model.state_dict(), os.path.join(args.experiment_data_dir, 'model_fine_tuned.pth'))

        print('Evaluation result (fine tuned): %s' % best_acc)
        print('Fine-tuned model saved to %s' % args.experiment_data_dir)
        result['performance']['finetuned'] = best_acc

    with open(os.path.join(args.experiment_data_dir, 'result.json'), 'w+') as f:
        json.dump(result, f)
from .config.utils import parse_params, parse_basic_pruner

# TODO: move this function to the evaluate module
def sigmoid(x: float, theta0: float = -0.5, theta1: float = 10) -> float:
    return 1 / (1 + math.exp(-theta1 * (x + theta0)))

if __name__ == '__main__':
    kwargs = nni.get_next_parameter()
    pruner_config, config_list, vessel, original_target, thetas = parse_params(kwargs)
    basic_pruner, model, finetuner, evaluator, dummy_input, device = parse_basic_pruner(pruner_config, config_list, vessel)

    # TODO: move the following logic to the execution engine
    log_dir = Path(os.environ['NNI_OUTPUT_DIR']) if 'NNI_OUTPUT_DIR' in os.environ else Path('nni_outputs', 'log')
    task_generator = AGPTaskGenerator(total_iteration=3, origin_model=model, origin_config_list=config_list,
                                      skip_first_iteration=True, log_dir=log_dir)
    speedup = dummy_input is not None
    scheduler = PruningScheduler(pruner=basic_pruner, task_generator=task_generator, finetuner=finetuner,
                                 speedup=speedup, dummy_input=dummy_input, evaluator=None)
    scheduler.compress()

    _, model, _, _, _ = scheduler.get_best_result()
    metric = evaluator(model)
    flops, params, _ = count_flops_params(model, dummy_input, verbose=False, mode='full')

    # TODO: a more efficient way to calculate or combine these scores
    flops_score = sigmoid(flops / original_target['flops'], *thetas['flops'])
    params_score = sigmoid(params / original_target['params'], *thetas['params'])
    metric_score = sigmoid(metric / original_target['metric'], *thetas['metric'])
    final_result = flops_score + params_score + metric_score
    nni.report_final_result({'default': final_result, 'flops': flops, 'params': params, 'metric': metric})
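    # --- Quick check of the scoring sigmoid above (illustrative) ---
    # With the default thetas (theta0=-0.5, theta1=10) it is a smooth step
    # centered at a ratio of 0.5:
    # print(sigmoid(0.3))  # ~0.119
    # print(sigmoid(0.5))  # 0.5
    # print(sigmoid(0.7))  # ~0.881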
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs(args.experiment_data_dir, exist_ok=True)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True, transform=transform),
        batch_size=64)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=False, transform=transform),
        batch_size=1000)

    # Step1. Model Pretraining
    model = NaiveModel().to(device)
    criterion = torch.nn.NLLLoss()
    optimizer = optim.Adadelta(model.parameters(), lr=args.pretrain_lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)
    flops, params, _ = count_flops_params(model, (1, 1, 28, 28), verbose=False)

    if args.pretrained_model_dir is None:
        args.pretrained_model_dir = os.path.join(args.experiment_data_dir, 'pretrained.pth')

        best_acc = 0
        for epoch in range(args.pretrain_epochs):
            train(args, model, device, train_loader, criterion, optimizer, epoch)
            scheduler.step()
            acc = test(args, model, device, criterion, test_loader)
            if acc > best_acc:
                best_acc = acc
                state_dict = model.state_dict()

        model.load_state_dict(state_dict)
        torch.save(state_dict, args.pretrained_model_dir)
        print(f'Model saved to {args.pretrained_model_dir}')
    else:
        state_dict = torch.load(args.pretrained_model_dir)
        model.load_state_dict(state_dict)
        best_acc = test(args, model, device, criterion, test_loader)

    dummy_input = torch.randn([1000, 1, 28, 28]).to(device)
    time_cost = get_model_time_cost(model, dummy_input)

    # 125.49 M, 0.85M, 93.29, 1.1012
    print(f'Pretrained model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}, Time Cost: {time_cost}')

    # Step2. Model Pruning
    config_list = [{
        'sparsity': args.sparsity,
        'op_types': ['Conv2d']
    }]

    kw_args = {}
    if args.dependency_aware:
        dummy_input = torch.randn([1000, 1, 28, 28]).to(device)
        print('Enable the dependency_aware mode')
        # note that not all pruners support the dependency_aware mode
        kw_args['dependency_aware'] = True
        kw_args['dummy_input'] = dummy_input

    pruner = L1FilterPruner(model, config_list, **kw_args)
    model = pruner.compress()
    pruner.get_pruned_weights()

    mask_path = os.path.join(args.experiment_data_dir, 'mask.pth')
    model_path = os.path.join(args.experiment_data_dir, 'pruned.pth')
    pruner.export_model(model_path=model_path, mask_path=mask_path)
    pruner._unwrap_model()  # unwrap all modules to normal state

    # Step3. Model Speedup
    m_speedup = ModelSpeedup(model, dummy_input, mask_path, device)
    m_speedup.speedup_model()
    print('model after speedup', model)

    flops, params, _ = count_flops_params(model, dummy_input, verbose=False)
    acc = test(args, model, device, criterion, test_loader)
    time_cost = get_model_time_cost(model, dummy_input)
    print(f'Pruned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {acc: .2f}, Time Cost: {time_cost}')

    # Step4. Model Finetuning
    optimizer = optim.Adadelta(model.parameters(), lr=args.pretrain_lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)
    best_acc = 0

    for epoch in range(args.finetune_epochs):
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        acc = test(args, model, device, criterion, test_loader)
        if acc > best_acc:
            best_acc = acc
            state_dict = model.state_dict()

    model.load_state_dict(state_dict)
    save_path = os.path.join(args.experiment_data_dir, 'finetuned.pth')
    torch.save(state_dict, save_path)

    flops, params, _ = count_flops_params(model, dummy_input, verbose=True)
    time_cost = get_model_time_cost(model, dummy_input)

    # FLOPs 28.48 M, #Params: 0.18M, Accuracy: 89.03, Time Cost: 1.03
    print(f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}, Time Cost: {time_cost}')
    print(f'Model saved to {save_path}')

    # Step5. Model Quantization via QAT
    config_list = [{
        'quant_types': ['weight', 'output'],
        'quant_bits': {'weight': 8, 'output': 8},
        'op_names': ['conv1']
    }, {
        'quant_types': ['output'],
        'quant_bits': {'output': 8},
        'op_names': ['relu1']
    }, {
        'quant_types': ['weight', 'output'],
        'quant_bits': {'weight': 8, 'output': 8},
        'op_names': ['conv2']
    }, {
        'quant_types': ['output'],
        'quant_bits': {'output': 8},
        'op_names': ['relu2']
    }]

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    quantizer = QAT_Quantizer(model, config_list, optimizer)
    quantizer.compress()

    # Step6. Quantization Aware Training
    best_acc = 0
    for epoch in range(1):
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        acc = test(args, model, device, criterion, test_loader)
        if acc > best_acc:
            best_acc = acc
            state_dict = model.state_dict()

    calibration_path = os.path.join(args.experiment_data_dir, 'calibration.pth')
    calibration_config = quantizer.export_model(model_path, calibration_path)
    print("calibration_config: ", calibration_config)

    # Step7. Model Speedup
    batch_size = 32
    input_shape = (batch_size, 1, 28, 28)
    engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)
    engine.compress()
    test_trt(engine, test_loader)
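# --- Sketch (hypothetical) ---
# get_model_time_cost is defined elsewhere in the example above; a minimal
# helper with the same signature might time repeated forward passes like this:
import time
import torch

def get_model_time_cost(model, dummy_input, runs=100):
    model.eval()
    with torch.no_grad():
        for _ in range(10):  # warm-up iterations
            model(dummy_input)
        start = time.time()
        for _ in range(runs):
            model(dummy_input)
    return (time.time() - start) / runs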
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs(args.experiment_data_dir, exist_ok=True)

    # prepare model and data
    train_loader, test_loader, criterion = get_data(args.dataset, args.data_dir, args.batch_size, args.test_batch_size)
    model, optimizer, _ = get_model_optimizer_scheduler(args, device, train_loader, test_loader, criterion)

    dummy_input = get_dummy_input(args, device)
    flops, params, _ = count_flops_params(model, dummy_input)
    print(f"FLOPs: {flops}, params: {params}")

    print(f'start {args.pruner} pruning...')

    def trainer(model, optimizer, criterion, epoch):
        return train(args, model, device, train_loader, criterion, optimizer, epoch=epoch)

    pruner_cls = str2pruner[args.pruner]

    kw_args = {}
    config_list = [{'sparsity': args.sparsity, 'op_types': ['Conv2d']}]

    if args.pruner == 'level':
        config_list = [{'sparsity': args.sparsity, 'op_types': ['default']}]
    else:
        if args.global_sort:
            print('Enable the global_sort mode')
            # only the taylor pruner supports the global sort mode currently
            kw_args['global_sort'] = True
        if args.dependency_aware:
            dummy_input = get_dummy_input(args, device)
            print('Enable the dependency_aware mode')
            # note that not all pruners support the dependency_aware mode
            kw_args['dependency_aware'] = True
            kw_args['dummy_input'] = dummy_input
        if args.pruner not in ('l1filter', 'l2filter', 'fpgm'):
            # the following settings only work for training-aware pruners
            kw_args['trainer'] = trainer
            kw_args['optimizer'] = optimizer
            kw_args['criterion'] = criterion

        if args.pruner in ('mean_activation', 'apoz', 'taylorfo'):
            kw_args['sparsifying_training_batches'] = 1

        if args.pruner == 'slim':
            kw_args['sparsifying_training_epochs'] = 1

        if args.pruner == 'agp':
            kw_args['pruning_algorithm'] = 'l1'
            kw_args['num_iterations'] = 2
            kw_args['epochs_per_iteration'] = 1

        # Reproduces the result in the paper 'PRUNING FILTERS FOR EFFICIENT CONVNETS':
        # Conv_1, Conv_8, Conv_9, Conv_10, Conv_11, Conv_12 are pruned with 50% sparsity, as 'VGG-16-pruned-A'.
        # If you want to skip some layers, you can use 'exclude' as follows.
        if args.pruner == 'slim':
            config_list = [{
                'sparsity': args.sparsity,
                'op_types': ['BatchNorm2d'],
            }]
        elif args.model == 'resnet18':
            config_list = [{
                'sparsity': args.sparsity,
                'op_types': ['Conv2d']
            }, {
                'exclude': True,
                'op_names': ['layer1.0.conv1', 'layer1.0.conv2']
            }]
        else:
            config_list = [{
                'sparsity': args.sparsity,
                'op_types': ['Conv2d'],
                'op_names': ['feature.0', 'feature.24', 'feature.27', 'feature.30', 'feature.34', 'feature.37']
            }]

    pruner = pruner_cls(model, config_list, **kw_args)

    # Pruner.compress() returns the masked model
    model = pruner.compress()
    pruner.get_pruned_weights()

    # export the pruned model masks for model speedup
    model_path = os.path.join(args.experiment_data_dir, 'pruned_{}_{}_{}.pth'.format(args.model, args.dataset, args.pruner))
    mask_path = os.path.join(args.experiment_data_dir, 'mask_{}_{}_{}.pth'.format(args.model, args.dataset, args.pruner))
    pruner.export_model(model_path=model_path, mask_path=mask_path)

    if args.test_only:
        test(args, model, device, criterion, test_loader)

    if args.speedup:
        # unwrap all modules to normal state
        pruner._unwrap_model()
        m_speedup = ModelSpeedup(model, dummy_input, mask_path, device)
        m_speedup.speedup_model()

    print('start finetuning...')

    # The optimizer used in the pruner might be patched, so it is recommended to create a new optimizer for the fine-tuning stage.
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
    scheduler = MultiStepLR(optimizer,
                            milestones=[int(args.pretrain_epochs * 0.5), int(args.pretrain_epochs * 0.75)],
                            gamma=0.1)

    best_top1 = 0
    save_path = os.path.join(args.experiment_data_dir, 'finetuned.pth')
    for epoch in range(args.fine_tune_epochs):
        print('# Epoch {} #'.format(epoch))
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        top1 = test(args, model, device, criterion, test_loader)
        if top1 > best_top1:
            best_top1 = top1
            torch.save(model.state_dict(), save_path)

    flops, params, results = count_flops_params(model, dummy_input)
    print(f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_top1: .2f}')

    if args.nni:
        nni.report_final_result(best_top1)
if __name__ == '__main__':
    # model = MobileNetV2(n_class=10).to(device)
    model = VGG().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
    scheduler = MultiStepLR(optimizer, milestones=[50, 75], gamma=0.1)
    criterion = torch.nn.CrossEntropyLoss()

    for i in range(100):
        trainer(model, optimizer, criterion, i)

    pre_best_acc = evaluator(model)
    dummy_input = torch.rand(10, 3, 32, 32).to(device)
    pre_flops, pre_params, _ = count_flops_params(model, dummy_input)

    config_list = [{
        'op_types': ['Conv2d'],
        'total_sparsity': 0.5,
        'max_sparsity_per_layer': 0.8
    }]

    # If you just want to keep the final result as the best result, pass None as the evaluator;
    # otherwise the result with the highest score (given by the evaluator) will be kept as the best result.
    ddpg_params = {
        'hidden1': 300,
        'hidden2': 300,
        'lr_c': 1e-3,
        'lr_a': 1e-4,
        'warmup': 100,