def evaluate_model(model, criterion, test_loader, loggers, activations_collectors, args):
    """Evaluate ``model`` on the test dataset, optionally quantizing it first.

    This sample application can be invoked to evaluate the accuracy of your
    model on the test dataset.  You can optionally quantize the model before
    evaluation.  For example:

        python3 compress_classifier.py --arch resnet20_cifar ../data.cifar10 -p=50 --resume=checkpoint.pth.tar --evaluate

    When ``args.quantize_eval`` is set, the model is moved to the CPU,
    prepared by a ``SymmetricLinearQuantizer`` configured from the
    ``args.qe_*`` options, moved back to the GPU, and - after testing - saved
    as a checkpoint under ``msglogger.logdir``.

    Args:
        model: the model to evaluate.
        criterion: loss function forwarded to ``test``.
        test_loader: data loader forwarded to ``test``.
        loggers: a single logger or a list of loggers; a single logger is
            wrapped in a list.
        activations_collectors: forwarded unchanged to ``test``.
        args: parsed command-line arguments (reads ``quantize_eval``,
            ``qe_*``, ``arch`` and ``name``).
    """
    if not isinstance(loggers, list):
        loggers = [loggers]

    if args.quantize_eval:
        # Quantizer preparation is done on the CPU, then the model goes back to the GPU.
        model.cpu()
        quantizer = quantization.SymmetricLinearQuantizer(model, args.qe_bits_acts, args.qe_bits_wts,
                                                          args.qe_bits_accum, args.qe_clip_acts,
                                                          args.qe_no_clip_layers)
        quantizer.prepare_model()
        model.cuda()

    top1, _, _ = test(test_loader, model, criterion, loggers, activations_collectors, args=args)

    if args.quantize_eval:
        checkpoint_name = 'quantized'
        # Build "<name>_quantized".  The previous code called
        # '_'.split(args.name, checkpoint_name), which raises TypeError because
        # the second argument of str.split is the integer maxsplit.
        if args.name:
            checkpoint_name = '_'.join([args.name, checkpoint_name])
        apputils.save_checkpoint(0, args.arch, model, optimizer=None, best_top1=top1,
                                 name=checkpoint_name, dir=msglogger.logdir)
def test_load_gpu_model_on_cpu_with_thinning():
    """Check that a thinned model checkpoint can be loaded into a CPU model (issue #148).

    Steps:
      1. create a GPU model and remove 50% of the filters in one of the
         layers (thinning)
      2. save the thinned model in a checkpoint file
      3. load the checkpoint and place it on the CPU
    """
    CPU_DEVICE_ID = -1
    conv_pname = "module.layer1.0.conv1.weight"

    # (1) Build the model and compute a 50% filter-pruning mask for one convolution.
    gpu_model = create_model(False, 'cifar10', 'resnet20_cifar')
    conv_p = distiller.model_find_param(gpu_model, conv_pname)
    pruner = distiller.pruning.L1RankedStructureParameterPruner("test_pruner", group_type="Filters",
                                                                desired_sparsity=0.5, weights=conv_pname)
    zeros_mask_dict = distiller.create_model_masks_dict(gpu_model)
    pruner.set_param_mask(conv_p, conv_pname, zeros_mask_dict, meta=None)

    # Use the mask to prune, then thin the model based on the zeroed filters.
    zeros_mask_dict[conv_pname].apply_mask(conv_p)
    distiller.remove_filters(gpu_model, zeros_mask_dict, 'resnet20_cifar', 'cifar10', optimizer=None)
    assert hasattr(gpu_model, 'thinning_recipes')

    # (2) Save the thinned model.
    scheduler = distiller.CompressionScheduler(gpu_model)
    save_checkpoint(epoch=0, arch='resnet20_cifar', model=gpu_model, scheduler=scheduler, optimizer=None)

    # (3) Load the checkpoint into a freshly created CPU model.
    cpu_model = create_model(False, 'cifar10', 'resnet20_cifar', device_ids=CPU_DEVICE_ID)
    load_checkpoint(cpu_model, "checkpoint.pth.tar")
    assert distiller.model_device(cpu_model) == 'cpu'
def evaluate_model(model, criterion, train_loader, test_loader, loggers, args):
    """Evaluate ``model`` on the test dataset, optionally quantizing it first.

    This sample application can be invoked to evaluate the accuracy of your
    model on the test dataset.  You can optionally quantize the model before
    evaluation.  For example:

        python3 compress_classifier.py --arch resnet20_cifar ../data.cifar10 -p=50 --resume=checkpoint.pth.tar --evaluate

    Supported values of ``args.quantize_method`` are ``"linear"``
    (``SymmetricLinearQuantizer``) and ``"ocs"`` (``OCSQuantizer``).  For
    ``"ocs"`` the activations are profiled on ``train_loader`` before testing.
    When quantization is enabled, a checkpoint of the prepared model is saved
    under ``msglogger.logdir`` after the test run.

    Args:
        model: the model to evaluate.
        criterion: loss function forwarded to the profiling and test runs.
        train_loader: data loader used only for OCS activation profiling.
        test_loader: data loader forwarded to ``test``.
        loggers: a single logger or a list of loggers; a single logger is
            wrapped in a list.
        args: parsed command-line arguments.

    Raises:
        ValueError: if ``args.quantize_method`` is set to an unsupported
            value.  Previously this surfaced later as an unbound ``quantizer``.
    """
    if not isinstance(loggers, list):
        loggers = [loggers]

    method = args.quantize_method
    if method:
        if method == "linear":
            quantizer = quantization.SymmetricLinearQuantizer(
                model, args.act_bits, args.weight_bits)
        elif method == "ocs":
            quantizer = quantization.OCSQuantizer(
                model, args.act_bits, args.weight_bits,
                weight_expand_ratio=args.weight_expand_ratio,
                weight_clip_threshold=args.weight_clip_threshold,
                act_expand_ratio=args.act_expand_ratio,
                act_clip_threshold=args.act_clip_threshold)
        else:
            # Fail before touching the model rather than with an unbound local later.
            raise ValueError(
                "Unsupported quantize_method {!r}; expected 'linear' or 'ocs'".format(method))

        # Quantizer preparation is done on the CPU, then the model goes back to the GPU.
        model.cpu()
        quantizer.prepare_model()
        model.cuda()

        if method == "ocs":
            # Profile the activation first for ocs
            quantization.ocs_set_profile_mode(True)
            _ = profile_for_quantization(train_loader, model, criterion, loggers, args)
            quantization.ocs_set_profile_mode(False)

    end = time.time()
    top1, _, _ = test(test_loader, model, criterion, loggers, args=args)
    msglogger.info('==> Test runtime: %d' % (time.time() - end))

    if method:
        checkpoint_name = 'quantized'
        apputils.save_checkpoint(
            0, args.arch, model, optimizer=None, best_top1=top1,
            name=args.name if args.name else checkpoint_name,
            dir=msglogger.logdir)
def main():
    """Entry point of the classifier compression sample application.

    Parses the command line, configures logging, determinism and GPU
    selection, creates the model (optionally loading weights from
    ``args.resume``), and then dispatches to one of: automated deep
    compression, model summary, sensitivity analysis, evaluation, or the
    main train/validate loop (optionally with a compression schedule and
    knowledge distillation).  The training loop ends with a run on the test
    set.
    """
    global msglogger
    check_pytorch_version()
    args = parser.parse_args()
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    msglogger = apputils.config_pylogger(os.path.join(script_dir, 'logging.conf'), args.name, args.output_dir)

    # Log various details about the execution environment.  It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(sys.argv, gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    start_epoch = 0
    # Kept sorted in ascending top1 order: best_epochs[0] is the lowest tracked
    # score and best_epochs[-1] is the best one.
    best_epochs = [distiller.MutableNamedTuple({'epoch': 0, 'top1': 0, 'sparsity': 0})
                   for i in range(args.num_best_scores)]

    if args.deterministic:
        # Experiment reproducibility is sometimes important.  Pete Warden expounded about this
        # in his blog: https://petewarden.com/2018/03/19/the-machine-learning-reproducibility-crisis/
        # In Pytorch, support for deterministic execution is still a bit clunky.
        if args.workers > 1:
            msglogger.error('ERROR: Setting --deterministic requires setting --workers/-j to 0 or 1')
            exit(1)
        # Use a well-known seed, for repeatability of experiments
        torch.manual_seed(0)
        random.seed(0)
        np.random.seed(0)
        cudnn.deterministic = True
    else:
        # This issue: https://github.com/pytorch/pytorch/issues/3659
        # Implies that cudnn.benchmark should respect cudnn.deterministic, but empirically we see that
        # results are not re-produced when benchmark is set. So enabling only if deterministic mode disabled.
        cudnn.benchmark = True

    if args.gpus is not None:
        try:
            args.gpus = [int(s) for s in args.gpus.split(',')]
        except ValueError:
            msglogger.error('ERROR: Argument --gpus must be a comma-separated list of integers only')
            exit(1)
        available_gpus = torch.cuda.device_count()
        for dev_id in args.gpus:
            if dev_id >= available_gpus:
                msglogger.error('ERROR: GPU device ID {0} requested, but only {1} devices available'
                                .format(dev_id, available_gpus))
                exit(1)
        # Set default device in case the first one on the list != 0
        torch.cuda.set_device(args.gpus[0])

    # Infer the dataset from the model name
    if 'cinic' in args.arch:
        args.dataset = 'cinic10'
    else:
        args.dataset = 'cifar10' if 'cifar' in args.arch else 'imagenet'
    args.num_classes = 10 if args.dataset in ['cifar10', 'cinic10'] else 1000

    if args.earlyexit_thresholds:
        args.num_exits = len(args.earlyexit_thresholds) + 1
        args.loss_exits = [0] * args.num_exits
        args.losses_exits = []
        args.exiterrors = []

    # Create the model (always without pretrained weights; they may be loaded below).
    model = create_model(False, args.dataset, args.arch, device_ids=args.gpus)
    compression_scheduler = None

    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # capture thresholds for early-exit training
    if args.earlyexit_thresholds:
        msglogger.info('=> using early-exit threshold values of %s', args.earlyexit_thresholds)

    # We can optionally resume from a checkpoint.  Only the weights stored under
    # the 'net' key are restored; no scheduler or start epoch is recovered.
    if args.resume:
        chkpt_file = args.resume
        print("=> loading checkpoint %s" % chkpt_file)
        checkpoint = torch.load(chkpt_file)
        model.load_state_dict(checkpoint['net'])

    # Define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    msglogger.info('Optimizer Type: %s', type(optimizer))
    msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.ADC:
        return automated_deep_compression(model, criterion, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        return summarize_model(model, args.dataset, which_summary=args.summary)

    # Load the datasets: the dataset to load was inferred above from the model
    # name passed in args.arch.
    train_loader, val_loader, test_loader, _ = apputils.load_data(
        args.dataset, os.path.expanduser(args.data), args.batch_size,
        args.workers, args.validation_size, args.deterministic)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler), len(test_loader.sampler))

    activations_collectors = create_activation_stats_collectors(model, collection_phase=args.activation_stats)

    if args.sensitivity is not None:
        return sensitivity_analysis(model, criterion, test_loader, pylogger, args)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger, activations_collectors, args)

    if args.compress:
        # The main use-case for this sample application is CNN compression. Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(model, optimizer, args.compress)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.cuda()
    else:
        compression_scheduler = distiller.CompressionScheduler(model)

    args.kd_policy = None
    if args.kd_teacher:
        teacher = create_model(args.kd_pretrained, args.dataset, args.kd_teacher, device_ids=args.gpus)
        if args.kd_resume:
            teacher, _, _ = apputils.load_checkpoint(teacher, chkpt_file=args.kd_resume)
        dlw = distiller.DistillationLossWeights(args.kd_distill_wt, args.kd_student_wt, args.kd_teacher_wt)
        args.kd_policy = distiller.KnowledgeDistillationPolicy(model, teacher, args.kd_temp, dlw)
        compression_scheduler.add_policy(args.kd_policy, starting_epoch=args.kd_start_epoch,
                                         ending_epoch=args.epochs, frequency=1)

        msglogger.info('\nStudent-Teacher knowledge distillation enabled:')
        msglogger.info('\tTeacher Model: %s', args.kd_teacher)
        msglogger.info('\tTemperature: %s', args.kd_temp)
        msglogger.info('\tLoss Weights (distillation | student | teacher): %s',
                       ' | '.join(['{:.2f}'.format(val) for val in dlw]))
        msglogger.info('\tStarting from Epoch: %s', args.kd_start_epoch)

    for epoch in range(start_epoch, start_epoch + args.epochs):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)

        # Train for one epoch
        with collectors_context(activations_collectors["train"]) as collectors:
            train(train_loader, model, criterion, optimizer, epoch, compression_scheduler,
                  loggers=[tflogger, pylogger], args=args)
            distiller.log_weights_sparsity(model, epoch, loggers=[tflogger, pylogger])
            distiller.log_activation_statsitics(epoch, "train", loggers=[tflogger],
                                                collector=collectors["sparsity"])
            if args.masks_sparsity:
                msglogger.info(distiller.masks_sparsity_tbl_summary(model, compression_scheduler))

        # evaluate on validation set
        with collectors_context(activations_collectors["valid"]) as collectors:
            top1, top5, vloss = validate(val_loader, model, criterion, [pylogger], args, epoch)
            distiller.log_activation_statsitics(epoch, "valid", loggers=[tflogger],
                                                collector=collectors["sparsity"])
            save_collectors_data(collectors, msglogger.logdir)

        stats = ('Peformance/Validation/',
                 OrderedDict([('Loss', vloss),
                              ('Top1', top1),
                              ('Top5', top5)]))
        distiller.log_training_progress(stats, None, epoch, steps_completed=0, total_steps=1, log_freq=1,
                                        loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        # Remember the best top1 scores and save a checkpoint.
        # "Best" means better than the highest score seen so far (best_epochs[-1]);
        # comparing against best_epochs[0] (the lowest tracked score) would flag
        # non-best epochs as best whenever num_best_scores > 1.
        is_best = top1 > best_epochs[-1].top1
        if top1 > best_epochs[0].top1:
            # Replace the lowest tracked score and restore ascending order.
            best_epochs[0].epoch = epoch
            best_epochs[0].top1 = top1
            best_epochs = sorted(best_epochs, key=lambda score: score.top1)
        for score in reversed(best_epochs):
            if score.top1 > 0:
                msglogger.info('==> Best Top1: %.3f on Epoch: %d', score.top1, score.epoch)
        apputils.save_checkpoint(epoch, args.arch, model, optimizer, compression_scheduler,
                                 best_epochs[-1].top1, is_best, args.name, msglogger.logdir)

    # Finally run results on the test set
    test(test_loader, model, criterion, [pylogger], activations_collectors, args=args)
def main():
    """Entry point of the classifier compression sample application.

    Parses the command line, configures logging, determinism and GPU
    selection, creates the model (optionally resuming from a checkpoint), and
    then runs one of: a model summary, sensitivity analysis, evaluation
    (optionally on a quantized model), or the main train/validate loop with
    an optional compression schedule.  The training loop ends with a run on
    the test set.
    """
    global msglogger
    check_pytorch_version()
    args = parser.parse_args()
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    msglogger = apputils.config_pylogger(
        os.path.join(script_dir, 'logging.conf'), args.name, args.output_dir)

    # Log various details about the execution environment.  It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(sys.argv, gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    start_epoch = 0
    best_top1 = 0

    if args.deterministic:
        # Experiment reproducibility is sometimes important.  Pete Warden expounded about this
        # in his blog: https://petewarden.com/2018/03/19/the-machine-learning-reproducibility-crisis/
        # In Pytorch, support for deterministic execution is still a bit clunky.
        if args.workers > 1:
            msglogger.error(
                'ERROR: Setting --deterministic requires setting --workers/-j to 0 or 1'
            )
            exit(1)
        # Use a well-known seed, for repeatability of experiments
        torch.manual_seed(0)
        random.seed(0)
        np.random.seed(0)
        cudnn.deterministic = True
    else:
        # This issue: https://github.com/pytorch/pytorch/issues/3659
        # Implies that cudnn.benchmark should respect cudnn.deterministic, but empirically we see that
        # results are not re-produced when benchmark is set. So enabling only if deterministic mode disabled.
        cudnn.benchmark = True

    if args.gpus is not None:
        try:
            args.gpus = [int(s) for s in args.gpus.split(',')]
        except ValueError:
            msglogger.error(
                'ERROR: Argument --gpus must be a comma-separated list of integers only'
            )
            exit(1)
        available_gpus = torch.cuda.device_count()
        for dev_id in args.gpus:
            if dev_id >= available_gpus:
                msglogger.error(
                    'ERROR: GPU device ID {0} requested, but only {1} devices available'
                    .format(dev_id, available_gpus))
                exit(1)
        # Set default device in case the first one on the list != 0
        torch.cuda.set_device(args.gpus[0])

    # Infer the dataset from the model name
    args.dataset = 'cifar10' if 'cifar' in args.arch else 'imagenet'

    # Create the model
    png_summary = args.summary is not None and args.summary.startswith('png')
    # For PNG summary, parallel graphs are illegible
    is_parallel = not png_summary and args.summary != 'compute'
    model = create_model(args.pretrained, args.dataset, args.arch,
                         parallel=is_parallel, device_ids=args.gpus)
    compression_scheduler = None

    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # We can optionally resume from a checkpoint
    if args.resume:
        model, compression_scheduler, start_epoch = apputils.load_checkpoint(
            model, chkpt_file=args.resume)

        # NOTE(review): nesting of this step under `if args.resume` is
        # reconstructed from the flattened source - confirm.
        if 'resnet' in args.arch and 'preact' not in args.arch and 'cifar' in args.arch:
            distiller.resnet_cifar_remove_layers(model)

    # Define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    msglogger.info('Optimizer Type: %s', type(optimizer))
    msglogger.info('Optimizer Args: %s', optimizer.defaults)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        which_summary = args.summary
        if which_summary.startswith('png'):
            apputils.draw_img_classifier_to_file(
                model, 'model.png', args.dataset, which_summary == 'png_w_params')
        else:
            distiller.model_summary(model, which_summary, args.dataset)
        exit()

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch.  The default dataset is ImageNet, but if args.arch contains the
    # substring "cifar", then cifar10 is used.
    train_loader, val_loader, test_loader, _ = apputils.load_data(
        args.dataset, os.path.expanduser(args.data), args.batch_size,
        args.workers, args.validation_size, args.deterministic)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler),
                   len(test_loader.sampler))

    activations_sparsity = None
    if args.activation_stats:
        # If your model has ReLU layers, then those layers have sparse activations.
        # ActivationSparsityCollector will collect information about this sparsity.
        # WARNING! Enabling activation sparsity collection will significantly slow down training!
        activations_sparsity = ActivationSparsityCollector(model)

    if args.sensitivity is not None:
        # This sample application can be invoked to execute Sensitivity Analysis on your
        # model.  The output is saved to CSV and PNG.
        msglogger.info("Running sensitivity tests")
        test_fnc = partial(test, test_loader=test_loader, criterion=criterion,
                           loggers=[pylogger], print_freq=args.print_freq)
        which_params = [
            param_name for param_name, _ in model.named_parameters()
        ]
        # Filter pruning is swept over a narrower sparsity range than the other groups.
        if args.sensitivity == 'filter':
            sparsities = np.arange(0.0, 0.50, 0.05)
        else:
            sparsities = np.arange(0.0, 0.95, 0.05)
        sensitivity = distiller.perform_sensitivity_analysis(
            model,
            net_params=which_params,
            sparsities=sparsities,
            test_func=test_fnc,
            group=args.sensitivity)
        distiller.sensitivities_to_png(sensitivity, 'sensitivity.png')
        distiller.sensitivities_to_csv(sensitivity, 'sensitivity.csv')
        exit()

    if args.evaluate:
        # This sample application can be invoked to evaluate the accuracy of your model on
        # the test dataset.
        # You can optionally quantize the model to 8-bit integer before evaluation.
        # For example:
        # python3 compress_classifier.py --arch resnet20_cifar ../data.cifar10 -p=50 --resume=checkpoint.pth.tar --evaluate
        if args.quantize:
            model.cpu()
            quantizer = quantization.SymmetricLinearQuantizer(model, 8, 8)
            quantizer.prepare_model()
            model.cuda()
        top1, _, _ = test(test_loader, model, criterion, [pylogger], args.print_freq)
        if args.quantize:
            checkpoint_name = 'quantized'
            # Build "<name>_quantized".  The previous '_'.split(args.name, checkpoint_name)
            # raised TypeError because str.split's second argument is the integer maxsplit.
            if args.name:
                checkpoint_name = '_'.join([args.name, checkpoint_name])
            apputils.save_checkpoint(0, args.arch, model, optimizer=None, best_top1=top1,
                                     name=checkpoint_name, dir=msglogger.logdir)
        exit()

    if args.compress:
        # The main use-case for this sample application is CNN compression. Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(model, optimizer, args.compress)

    for epoch in range(start_epoch, start_epoch + args.epochs):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)

        # Train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, compression_scheduler,
              loggers=[tflogger, pylogger], print_freq=args.print_freq,
              log_params_hist=args.log_params_histograms)
        distiller.log_weights_sparsity(model, epoch, loggers=[tflogger, pylogger])
        if args.activation_stats:
            distiller.log_activation_sparsity(epoch, loggers=[tflogger, pylogger],
                                              collector=activations_sparsity)

        # evaluate on validation set
        top1, top5, vloss = validate(val_loader, model, criterion, [pylogger],
                                     args.print_freq, epoch)
        stats = ('Peformance/Validation/',
                 OrderedDict([('Loss', vloss),
                              ('Top1', top1),
                              ('Top5', top5)]))
        distiller.log_training_progress(stats, None, epoch, steps_completed=0, total_steps=1,
                                        log_freq=1, loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch)

        # remember best top1 and save checkpoint
        is_best = top1 > best_top1
        best_top1 = max(top1, best_top1)
        apputils.save_checkpoint(epoch, args.arch, model, optimizer, compression_scheduler,
                                 best_top1, is_best, args.name, msglogger.logdir)

    # Finally run results on the test set
    test(test_loader, model, criterion, [pylogger], args.print_freq)
def objective(space):
    """Train one pruning configuration and return its best (lowest) score.

    ``space`` maps parameter names to sparsity levels.  A fresh model is
    created, a ``SparsityLevelParameterPruner`` built from ``space`` is
    scheduled at epoch ``PrunerEpoch`` together with a step LR policy, and
    the model is trained for up to ``args.epochs`` epochs.  After every epoch
    the score

        score = (1 - val_accuracy / 100) + alpha * (1 - sparsity / 100)

    is computed (lower is better); a checkpoint named 'best' is written
    whenever the score improves on the module-level ``global_min_score``.

    Args:
        space: dict-like object whose ``items()`` yields
            ``(param_name, sparsity_level)`` pairs.

    Returns:
        The lowest score observed during this trial (2.0 if no epoch ran).
    """
    global model
    global count
    global global_min_score

    # Explore new model
    model = create_model(False, args.dataset, args.arch, device_ids=args.gpus)
    count += 1

    alpha = 0.3  # Super-parameter: weight of the sparsity term in the score
    sparsity = 0.0
    # Initialized so the final report does not fail when args.epochs == 0.
    train_accuracy = 0.0
    val_accuracy = 0.0

    # Training hyperparameter
    if args.resume:
        # Only the model is used; the scheduler is rebuilt below.
        model, _, _ = apputils.load_checkpoint(model, chkpt_file=args.resume)
        print('resume mode: {}'.format(args.resume))

    # NOTE(review): placement of this print outside the resume branch is
    # reconstructed from the flattened source - confirm.
    print(global_min_score)

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Reference (distiller/distiller/config.py), element-wise sparsity:
    #   sparsity_levels = {net_param: sparsity_level}
    #   pruner = distiller.pruning.SparsityLevelParameterPruner(name='sensitivity', levels=sparsity_levels)
    #   policy = distiller.PruningPolicy(pruner, pruner_args=None)
    #   scheduler = distiller.CompressionScheduler(model)
    #   scheduler.add_policy(policy, epochs=[0, 2, 4])
    # Local search: add multiple pruners, one for each layer.
    sparsity_levels = dict(space.items())

    pruner = distiller.pruning.SparsityLevelParameterPruner(name='sensitivity', levels=sparsity_levels)
    policy = distiller.PruningPolicy(pruner, pruner_args=None)
    lrpolicy = distiller.LRPolicy(torch.optim.lr_scheduler.StepLR(optimizer, step_size=6, gamma=0.1))
    compression_scheduler = distiller.CompressionScheduler(model)
    compression_scheduler.add_policy(policy, epochs=[PrunerEpoch])
    compression_scheduler.add_policy(lrpolicy, starting_epoch=0, ending_epoch=50, frequency=1)

    # Reference (distiller/example/classifier_compression/compress_classifier.py):
    #   For each epoch:
    #       compression_scheduler.on_epoch_begin(epoch)
    #       train()
    #       save_checkpoint()
    #       compression_scheduler.on_epoch_end(epoch)
    #   train():
    #       For each training step:
    #           compression_scheduler.on_minibatch_begin(epoch)
    #           output = model(input)
    #           loss = criterion(output, target)
    #           compression_scheduler.before_backward_pass(epoch)
    #           loss.backward()
    #           optimizer.step()
    #           compression_scheduler.on_minibatch_end(epoch)
    local_min_score = 2.
    for i in range(args.epochs):
        compression_scheduler.on_epoch_begin(i)
        train_accuracy = train(i, criterion, optimizer, compression_scheduler)
        val_accuracy = validate()  # Validate hyperparameter setting
        t, sparsity = distiller.weights_sparsity_tbl_summary(model, return_total_sparsity=True)
        compression_scheduler.on_epoch_end(i, optimizer)
        apputils.save_checkpoint(i, args.arch, model, optimizer, compression_scheduler,
                                 train_accuracy, False, 'hyperopt', './')
        print('Epoch: {}, train_acc: {:.4f}, val_acc: {:.4f}, sparsity: {:.4f}'.format(
            i, train_accuracy, val_accuracy, sparsity))

        # Objective function: lower is better.
        score = (1 - (val_accuracy / 100.)) + (alpha * (1 - sparsity / 100.))
        if score < global_min_score:
            global_min_score = score
            apputils.save_checkpoint(i, args.arch, model, optimizer, compression_scheduler,
                                     train_accuracy, True, 'best', './')
        if score < local_min_score:
            local_min_score = score

        # Abandon the trial once pruning has happened and the resulting
        # sparsity is outside the expected window.
        if (PrunerConstraint == True and i >= PrunerEpoch and
                (sparsity < Expected_Sparsity_Level_Low or sparsity > Expected_Sparsity_Level_High)):
            break

    test_accuracy = test()  # Validate hyperparameter setting
    print('{} trials: score: {:.4f}, train_acc:{:.4f}, val_acc:{:.4f}, test_acc:{:.4f}, sparsity:{:.4f}'.format(
        count, local_min_score, train_accuracy, val_accuracy, test_accuracy, sparsity))
    return local_min_score
def arbitrary_channel_pruning(config, channels_to_remove, is_parallel):
    """Test removal of arbitrary channels.

    The test receives a specification of channels to remove.  Based on this
    specification, the channels are pruned and then physically removed from
    the model (via a "thinning" process).  Afterwards the thinned model is
    saved and loaded several times to exercise the checkpoint flows.

    Args:
        config: test configuration; reads ``arch``, ``dataset``,
            ``module_pairs`` (exactly one pair of module names) and
            ``bn_name`` (may be None).
        channels_to_remove: collection of input-channel indices to remove
            from the second module of the pair.
        is_parallel: forwarded to the model-creation helpers.
    """
    model, zeros_mask_dict = common.setup_test(config.arch, config.dataset, is_parallel)

    assert len(config.module_pairs) == 1  # This is a temporary restriction on the test
    pair = config.module_pairs[0]
    conv2_pname = pair[1] + ".weight"
    conv2 = common.find_module_by_name(model, pair[1])
    assert conv2 is not None

    # Test that we can access the weights tensor of the second module of the pair
    conv2_p = distiller.model_find_param(model, conv2_pname)
    assert conv2_p is not None

    assert conv2_p.dim() == 4
    num_channels = conv2_p.size(1)
    cnt_nnz_channels = num_channels - len(channels_to_remove)
    mask = create_channels_mask(conv2_p, channels_to_remove)
    assert distiller.density_ch(mask) == (
        conv2.in_channels - len(channels_to_remove)) / conv2.in_channels

    # Cool, so now we have a mask for pruning our channels.
    # Use the mask to prune
    zeros_mask_dict[conv2_pname].mask = mask
    zeros_mask_dict[conv2_pname].apply_mask(conv2_p)
    all_channels = set(range(num_channels))
    nnz_channels = set(distiller.find_nonzero_channels_list(conv2_p, conv2_pname))
    channels_removed = all_channels - nnz_channels
    logger.info("Channels removed {}".format(channels_removed))

    # Now, let's do the actual network thinning
    distiller.remove_channels(model, zeros_mask_dict, config.arch, config.dataset, optimizer=None)
    conv1 = common.find_module_by_name(model, pair[0])
    assert conv1.out_channels == cnt_nnz_channels
    assert conv2.in_channels == cnt_nnz_channels
    assert conv1.weight.size(0) == cnt_nnz_channels
    assert conv2.weight.size(1) == cnt_nnz_channels
    if config.bn_name is not None:
        bn1 = common.find_module_by_name(model, config.bn_name)
        assert bn1.running_var.size(0) == cnt_nnz_channels
        assert bn1.running_mean.size(0) == cnt_nnz_channels
        assert bn1.num_features == cnt_nnz_channels
        assert bn1.bias.size(0) == cnt_nnz_channels
        assert bn1.weight.size(0) == cnt_nnz_channels

    dummy_input = torch.randn(1, 3, 32, 32).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=0.1)
    run_forward_backward(model, optimizer, dummy_input)

    # Let's test saving and loading a thinned model.
    # We save 3 times, and load twice, to make sure to cover some corner cases:
    #   - Make sure that after loading, the model still has hold of the thinning recipes
    #   - Make sure that after a 2nd load, there is no problem loading (in this case, the
    #     tensors are already thin, so this is a new flow)

    # (1) Save without a scheduler: loading is expected to raise KeyError.
    save_checkpoint(epoch=0, arch=config.arch, model=model, optimizer=None)
    model_2 = create_model(False, config.dataset, config.arch, parallel=is_parallel)
    model(dummy_input)
    model_2(dummy_input)
    conv2 = common.find_module_by_name(model_2, pair[1])
    assert conv2 is not None
    with pytest.raises(KeyError):
        model_2, compression_scheduler, start_epoch = load_checkpoint(
            model_2, 'checkpoint.pth.tar')
    compression_scheduler = distiller.CompressionScheduler(model)
    # This check used to be a bare expression whose result was discarded.
    assert hasattr(model, 'thinning_recipes')

    run_forward_backward(model, optimizer, dummy_input)

    # (2) Save with a scheduler, then load into the not-yet-thinned model_2.
    save_checkpoint(epoch=0, arch=config.arch, model=model, optimizer=None,
                    scheduler=compression_scheduler)
    model_2, compression_scheduler, start_epoch = load_checkpoint(
        model_2, 'checkpoint.pth.tar')
    assert hasattr(model_2, 'thinning_recipes')
    logger.info("test_arbitrary_channel_pruning - Done")

    # (3) Save the loaded model and load it again (tensors are already thin).
    save_checkpoint(epoch=0, arch=config.arch, model=model_2, optimizer=None,
                    scheduler=compression_scheduler)
    model_2, compression_scheduler, start_epoch = load_checkpoint(
        model_2, 'checkpoint.pth.tar')
    assert hasattr(model_2, 'thinning_recipes')
    logger.info("test_arbitrary_channel_pruning - Done 2")
def main():
    """Entry point of the classifier compression sample application (PeleeNet variant).

    Parses the command line, configures logging, determinism and GPU
    selection, imports the module named by ``args.arch`` and builds its
    ``PeleeNet`` model wrapped in ``DataParallel`` (optionally resuming from a
    checkpoint), and then dispatches to one of: automated deep compression,
    model summary, sensitivity analysis, evaluation, or the main
    train/validate loop with an optional compression schedule.  The training
    loop ends with a run on the test set.
    """
    global msglogger
    check_pytorch_version()
    args = parser.parse_args()
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    msglogger = apputils.config_pylogger(
        os.path.join(script_dir, 'logging.conf'), args.name, args.output_dir)

    # Log various details about the execution environment.  It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(sys.argv, gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    start_epoch = 0
    best_top1 = 0
    # Initialized so the per-epoch "best" log line is valid even when no epoch
    # has improved on best_top1 yet (previously an unbound local).
    best_epoch = 0

    if args.deterministic:
        # Experiment reproducibility is sometimes important.  Pete Warden expounded about this
        # in his blog: https://petewarden.com/2018/03/19/the-machine-learning-reproducibility-crisis/
        # In Pytorch, support for deterministic execution is still a bit clunky.
        if args.workers > 1:
            msglogger.error(
                'ERROR: Setting --deterministic requires setting --workers/-j to 0 or 1'
            )
            exit(1)
        # Use a well-known seed, for repeatability of experiments
        torch.manual_seed(0)
        random.seed(0)
        np.random.seed(0)
        cudnn.deterministic = True
    else:
        # This issue: https://github.com/pytorch/pytorch/issues/3659
        # Implies that cudnn.benchmark should respect cudnn.deterministic, but empirically we see that
        # results are not re-produced when benchmark is set. So enabling only if deterministic mode disabled.
        cudnn.benchmark = True

    if args.gpus is not None:
        try:
            args.gpus = [int(s) for s in args.gpus.split(',')]
        except ValueError:
            msglogger.error(
                'ERROR: Argument --gpus must be a comma-separated list of integers only'
            )
            exit(1)
        available_gpus = torch.cuda.device_count()
        for dev_id in args.gpus:
            if dev_id >= available_gpus:
                msglogger.error(
                    'ERROR: GPU device ID {0} requested, but only {1} devices available'
                    .format(dev_id, available_gpus))
                exit(1)
        # Set default device in case the first one on the list != 0
        torch.cuda.set_device(args.gpus[0])

    if args.earlyexit_thresholds:
        args.num_exits = len(args.earlyexit_thresholds) + 1
        args.loss_exits = [0] * args.num_exits
        args.losses_exits = []
        args.exiterrors = []

    # The dataset is fixed for this variant rather than inferred from the model name.
    args.dataset = 'mmr'

    # Create the model: args.arch names an importable module that provides PeleeNet.
    # NOTE(review): args.num_classes is not set in this function - presumably
    # supplied by the argument parser; confirm.
    from importlib import import_module
    peleenet = import_module(args.arch)
    model = peleenet.PeleeNet(num_classes=args.num_classes)
    model = torch.nn.DataParallel(model, device_ids=args.gpus)
    model.cuda()
    compression_scheduler = None

    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # capture thresholds for early-exit training
    if args.earlyexit_thresholds:
        msglogger.info('=> using early-exit threshold values of %s', args.earlyexit_thresholds)

    # We can optionally resume from a checkpoint
    if args.resume:
        model, compression_scheduler, start_epoch = apputils.load_checkpoint(
            model, chkpt_file=args.resume)

    # Define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    msglogger.info('Optimizer Type: %s', type(optimizer))
    msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.ADC:
        return automated_deep_compression(model, criterion, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        return summarize_model(model, args.dataset, which_summary=args.summary)

    # Load the datasets selected by args.dataset (set above).
    train_loader, val_loader, test_loader, _ = apputils.load_data(
        args.dataset, os.path.expanduser(args.data), args.batch_size,
        args.workers, args.validation_size, args.deterministic)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler),
                   len(test_loader.sampler))

    activations_sparsity = None
    if args.activation_stats:
        # If your model has ReLU layers, then those layers have sparse activations.
        # ActivationSparsityCollector will collect information about this sparsity.
        # WARNING! Enabling activation sparsity collection will significantly slow down training!
        activations_sparsity = ActivationSparsityCollector(model)

    if args.sensitivity is not None:
        return sensitivity_analysis(model, criterion, test_loader, pylogger, args)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger, args)

    if args.compress:
        # The main use-case for this sample application is CNN compression. Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(model, optimizer, args.compress)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.cuda()

    for epoch in range(start_epoch, start_epoch + args.epochs):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)

        # Train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, compression_scheduler,
              loggers=[tflogger, pylogger], args=args)
        distiller.log_weights_sparsity(model, epoch, loggers=[tflogger, pylogger])
        if args.activation_stats:
            distiller.log_activation_sparsity(epoch, loggers=[tflogger, pylogger],
                                              collector=activations_sparsity)

        # evaluate on validation set
        top1, top5, vloss = validate(val_loader, model, criterion, [pylogger], args, epoch)
        stats = ('Peformance/Validation/',
                 OrderedDict([('Loss', vloss),
                              ('Top1', top1),
                              ('Top5', top5)]))
        distiller.log_training_progress(stats, None, epoch, steps_completed=0, total_steps=1,
                                        log_freq=1, loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        # remember best top1 and save checkpoint
        is_best = top1 > best_top1
        if is_best:
            best_epoch = epoch
            best_top1 = top1
        msglogger.info('==> Best validation Top1: %.3f Epoch: %d', best_top1, best_epoch)
        apputils.save_checkpoint(epoch, args.arch, model, optimizer, compression_scheduler,
                                 best_top1, is_best, args.name, msglogger.logdir)

    # Finally run results on the test set
    test(test_loader, model, criterion, [pylogger], args=args)
def greedy_pruner(pruned_model, app_args, fraction_to_prune, pruning_step, test_fn, train_fn):
    """Greedily prune a model until its compute budget reaches a target fraction.

    Every iteration asks ``find_most_robust_layer`` for the next pruned model,
    re-measures the compute budget (MACs), records the iteration's results and
    saves a checkpoint.  Once the loop ends, the final model is evaluated with
    ``test_fn`` and saved one more time.

    Args:
        pruned_model: the model to prune; it is replaced on every iteration by
            the model returned from ``find_most_robust_layer``.
        app_args: application arguments.  This function reads ``dataset``,
            ``arch``, ``greedy_finetuning_policy`` and, for the "constant"
            policy, ``effective_train_size``.
        fraction_to_prune: pruning continues while the current MACs exceed
            ``fraction_to_prune`` times the MACs of the incoming model.
        pruning_step: forwarded unchanged to ``find_most_robust_layer``.
        test_fn: evaluation callable, invoked here as ``test_fn(model=...)``
            and unpacked as ``(prec1, prec5, loss)``.
        train_fn: forwarded unchanged to ``find_most_robust_layer``.

    Raises:
        ValueError: if ``app_args.greedy_finetuning_policy`` is neither
            "constant" nor "linear-grow".
        AssertionError: if no pruning iteration was executed.
    """
    dataset = app_args.dataset
    arch = app_args.arch
    create_network_record_file()

    # Temporary ugly hack!
    resnet_layers = None
    resnet_params = None
    if arch == "resnet20_cifar":
        resnet_params = resnet20_params
    elif arch == "resnet56_cifar":
        resnet_params = resnet56_params
    elif arch == "resnet50":
        resnet_params = resnet50_params
    if resnet_params is not None:
        # Layer names are the parameter names with the trailing ".weight" cut off
        resnet_layers = [param[:-len(".weight")] for param in resnet_params]

    total_macs = dense_total_macs = get_model_compute_budget(pruned_model, dataset, resnet_layers)
    iteration = 0
    # Keep a handle on the incoming model: densities are computed against it
    model = pruned_model

    while total_macs > fraction_to_prune * dense_total_macs:
        iteration += 1
        policy = app_args.greedy_finetuning_policy
        if policy == "constant":
            effective_train_size = app_args.effective_train_size
        elif policy == "linear-grow":
            effective_train_size = 1 - (total_macs / dense_total_macs)
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on effective_train_size.
            raise ValueError(
                "Unsupported greedy_finetuning_policy: {!r} "
                "(expected 'constant' or 'linear-grow')".format(policy))

        prec1, prec5, param_name, pruned_model, zeros_mask_dict = find_most_robust_layer(
            iteration, pruned_model, pruning_step, test_fn, train_fn,
            app_args, resnet_params, effective_train_size)
        total_macs = get_model_compute_budget(pruned_model, dataset, resnet_layers)
        densities = get_param_densities(model, pruned_model, resnet_params)
        compute_density = total_macs / dense_total_macs
        results = (iteration, prec1, param_name, compute_density, total_macs, densities)
        record_network_details(results)

        scheduler = create_scheduler(pruned_model, zeros_mask_dict)
        save_checkpoint(0, arch, pruned_model, optimizer=None, best_top1=prec1, scheduler=scheduler,
                        name="greedy__{}__{:.1f}__{:.1f}".format(
                            str(iteration).zfill(3), compute_density * 100, prec1),
                        dir=msglogger.logdir)
        del scheduler
        # zeros_mask_dict is deliberately NOT deleted here: the masks of the last
        # iteration are needed for the final checkpoint after the loop.  The name
        # is rebound on the next iteration, which releases the previous masks.
        msglogger.info("Iteration {}: top1-{:.2f} {} compute-{:.2f}".format(*results[0:4]))

    assert iteration > 0, "the model already satisfies the requested compute budget"
    prec1, prec5, loss = test_fn(model=pruned_model)
    print(prec1, prec5, loss)

    scheduler = create_scheduler(pruned_model, zeros_mask_dict)
    save_checkpoint(0, arch, pruned_model, optimizer=None, best_top1=prec1, scheduler=scheduler,
                    name='_'.join(("greedy", str(fraction_to_prune))),
                    dir=msglogger.logdir)
def objective(space):
    """Run a single search trial and return its score (lower is better).

    A fresh model is created (and restored from ``args.resume`` when that is
    set), pruned to the sparsity levels given in ``space`` and trained for
    ``args.epochs`` epochs, saving a checkpoint after every epoch.  The trial
    is scored from the last epoch's validation accuracy and total weight
    sparsity, and ``best_dict`` is refreshed whenever the new score is lower
    than the one stored there.

    Args:
        space: mapping whose items are copied into the ``levels`` argument of
            ``SparsityLevelParameterPruner``.

    Returns:
        The score of this trial.
    """
    global model, count, best_dict

    # Explore a new model
    model = create_model(False, args.dataset, args.arch, device_ids=args.gpus)
    if args.resume:
        model, _, _ = apputils.load_checkpoint(model, chkpt_file=args.resume)

    count += 1
    print('{} trial starting...'.format(count))

    # alpha weights the sparsity term of the score (an earlier setting was 0.2)
    alpha = 1.0
    sparsity = 0.0

    # Training hyperparameters
    loss_fn = nn.CrossEntropyLoss().cuda()
    sgd = torch.optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum, weight_decay=args.weight_decay)

    # Mirrors the scheduler setup shown in distiller/distiller/config.py for
    # element-wise sparsity:
    #   pruner = distiller.pruning.SparsityLevelParameterPruner(name=..., levels=...)
    #   policy = distiller.PruningPolicy(pruner, pruner_args=None)
    #   scheduler = distiller.CompressionScheduler(model)
    #   scheduler.add_policy(policy, epochs=[...])
    levels = {name: level for name, level in space.items()}
    pruner = distiller.pruning.SparsityLevelParameterPruner(name='sensitivity', levels=levels)
    pruning_policy = distiller.PruningPolicy(pruner, pruner_args=None)
    step_lr = torch.optim.lr_scheduler.StepLR(sgd, step_size=30, gamma=0.1)
    lr_policy = distiller.LRPolicy(step_lr)

    compression_scheduler = distiller.CompressionScheduler(model)
    compression_scheduler.add_policy(pruning_policy, epochs=[0])
    compression_scheduler.add_policy(lr_policy, starting_epoch=0, ending_epoch=90, frequency=1)

    # Per-epoch flow, following
    # distiller/example/classifier_compression/compress_classifier.py:
    #   on_epoch_begin -> train -> validate -> on_epoch_end -> save_checkpoint
    # (the per-minibatch scheduler callbacks are expected to live in train()).
    for epoch in range(args.epochs):
        compression_scheduler.on_epoch_begin(epoch)
        train_accuracy = train(epoch, loss_fn, sgd, compression_scheduler)
        val_accuracy = validate()  # validate the hyperparameter setting
        _, sparsity = distiller.weights_sparsity_tbl_summary(model, return_total_sparsity=True)
        compression_scheduler.on_epoch_end(epoch, sgd)
        apputils.save_checkpoint(epoch, args.arch, model, sgd, compression_scheduler,
                                 train_accuracy, False, 'hyperopt', './')
        print('{} epochs => train acc:{:.2f}%, val acc:{:.2f}%'.format(
            epoch, train_accuracy, val_accuracy))

    test_accuracy = validate(test_loader)

    # Score: negated sum of the squared accuracy / sparsity fractions, each
    # offset by the square of its reference value (0.9 and 0.5 respectively).
    accuracy_term = (val_accuracy / 100.)**2 - 0.9**2
    sparsity_term = alpha * ((sparsity / 100.)**2 - 0.5**2)
    score = -(accuracy_term + sparsity_term)
    print('{} trials: score: {:.2f}\ttrain acc:{:.2f}%\tval acc:{:.2f}%\ttest acc:{:.2f}%\tsparsity:{:.2f}%'.format(
        count, score, train_accuracy, val_accuracy, test_accuracy, sparsity))

    if score < best_dict['score']:
        # New best trial: remember its metrics and a private copy of the model
        best_entries = (
            ('trial', count),
            ('score', score),
            ('tr_acc', train_accuracy),
            ('v_acc', val_accuracy),
            ('te_acc', test_accuracy),
            ('sparsity', sparsity),
            ('model_best', copy.deepcopy(model)),
        )
        for field, value in best_entries:
            best_dict[field] = value
    return score
def arbitrary_channel_pruning(config, channels_to_remove):
    """Test removal of arbitrary channels.

    The test receives a specification of channels to remove.
    Based on this specification, the channels are pruned and then physically
    removed from the model (via a "thinning" process).  Afterwards the thinned
    model is saved and re-loaded several times.

    Args:
        config: test configuration; this function reads ``arch``, ``dataset``,
            ``conv1_name``, ``conv2_name`` and ``bn_name`` (``bn_name`` may be
            None, in which case the batch-norm checks are skipped).
        channels_to_remove: indices of the input channels of the ``conv2``
            weights tensor that should be zeroed and removed.
    """
    model, zeros_mask_dict = common.setup_test(config.arch, config.dataset)
    conv2_pname = config.conv2_name + ".weight"

    conv2 = common.find_module_by_name(model, config.conv2_name)
    assert conv2 is not None

    # Test that we can access the weights tensor of conv2
    conv2_p = distiller.model_find_param(model, conv2_pname)
    assert conv2_p is not None

    assert conv2_p.dim() == 4
    num_filters = conv2_p.size(0)
    num_channels = conv2_p.size(1)
    kernel_height = conv2_p.size(2)
    kernel_width = conv2_p.size(3)
    cnt_nnz_channels = num_channels - len(channels_to_remove)

    # Let's build our 4D mask.
    # We start with a 1D mask of channels, with all but our specified channels set to one
    channels = torch.ones(num_channels)
    for ch in channels_to_remove:
        channels[ch] = 0

    # Now let's expand back up to a 4D mask
    mask = channels.expand(num_filters, num_channels)
    mask.unsqueeze_(-1)
    mask.unsqueeze_(-1)
    mask = mask.expand(num_filters, num_channels, kernel_height, kernel_width).contiguous()

    assert mask.shape == conv2_p.shape
    assert distiller.density_ch(mask) == (conv2.in_channels - len(channels_to_remove)) / conv2.in_channels

    # Cool, so now we have a mask for pruning our channels.
    # Use the mask to prune
    zeros_mask_dict[conv2_pname].mask = mask
    zeros_mask_dict[conv2_pname].apply_mask(conv2_p)
    all_channels = set(range(num_channels))
    nnz_channels = set(distiller.find_nonzero_channels_list(conv2_p, conv2_pname))
    channels_removed = all_channels - nnz_channels
    logger.info("Channels removed {}".format(channels_removed))

    # Now, let's do the actual network thinning
    distiller.remove_channels(model, zeros_mask_dict, config.arch, config.dataset)
    conv1 = common.find_module_by_name(model, config.conv1_name)
    logger.info(conv1)
    logger.info(conv2)
    assert conv1.out_channels == cnt_nnz_channels
    assert conv2.in_channels == cnt_nnz_channels
    assert conv1.weight.size(0) == cnt_nnz_channels
    assert conv2.weight.size(1) == cnt_nnz_channels
    if config.bn_name is not None:
        bn1 = common.find_module_by_name(model, config.bn_name)
        assert bn1.running_var.size(0) == cnt_nnz_channels
        assert bn1.running_mean.size(0) == cnt_nnz_channels
        assert bn1.num_features == cnt_nnz_channels
        assert bn1.bias.size(0) == cnt_nnz_channels
        assert bn1.weight.size(0) == cnt_nnz_channels

    # Let's test saving and loading a thinned model.
    # We save 3 times, and load twice, to make sure to cover some corner cases:
    #   - Make sure that after loading, the model still has hold of the thinning recipes
    #   - Make sure that after a 2nd load, there is no problem loading (in this case, the
    #     tensors are already thin, so this is a new flow)

    # (1) Checkpoint saved without a scheduler: loading it is expected to raise KeyError
    save_checkpoint(epoch=0, arch=config.arch, model=model, optimizer=None)
    model_2 = create_model(False, config.dataset, config.arch, parallel=False)
    dummy_input = torch.randn(1, 3, 32, 32)
    model(dummy_input)
    model_2(dummy_input)
    conv2 = common.find_module_by_name(model_2, config.conv2_name)
    assert conv2 is not None
    with pytest.raises(KeyError):
        load_checkpoint(model_2, 'checkpoint.pth.tar')
    compression_scheduler = distiller.CompressionScheduler(model)
    # The original evaluated this hasattr() and threw the result away
    assert hasattr(model, 'thinning_recipes')

    # (2) Checkpoint saved with a scheduler: the loaded model must carry the recipes
    save_checkpoint(epoch=0, arch=config.arch, model=model, optimizer=None,
                    scheduler=compression_scheduler)
    model_2, compression_scheduler, _ = load_checkpoint(model_2, 'checkpoint.pth.tar')
    assert hasattr(model_2, 'thinning_recipes')
    logger.info("test_arbitrary_channel_pruning - Done")

    # (3) Save the already-thinned, loaded model and load it once more
    save_checkpoint(epoch=0, arch=config.arch, model=model_2, optimizer=None,
                    scheduler=compression_scheduler)
    model_2, compression_scheduler, _ = load_checkpoint(model_2, 'checkpoint.pth.tar')
    assert hasattr(model_2, 'thinning_recipes')
    logger.info("test_arbitrary_channel_pruning - Done 2")
def test_arbitrary_channel_pruning():
    """Prune two channels of a resnet20_cifar convolution, thin the model, and
    check that the thinned model survives repeated checkpoint save/load cycles.
    """
    ARCH = "resnet20_cifar"
    DATASET = "cifar10"
    CONV2_PNAME = "layer1.0.conv2.weight"
    # Channel count that the assertions below expect to remain after thinning
    EXPECTED_CHANNELS = 14

    model, zeros_mask_dict = setup_test(ARCH, DATASET)

    conv2 = find_module_by_name(model, "layer1.0.conv2")
    assert conv2 is not None

    # Test that we can access the weights tensor of layer1.0.conv2
    conv2_p = distiller.model_find_param(model, CONV2_PNAME)
    assert conv2_p is not None

    assert conv2_p.dim() == 4
    num_filters = conv2_p.size(0)
    num_channels = conv2_p.size(1)
    kernel_height = conv2_p.size(2)
    kernel_width = conv2_p.size(3)

    channels_to_remove = [0, 2]

    # Let's build our 4D mask.
    # We start with a 1D mask of channels, with all but our specified channels set to one
    channels = torch.ones(num_channels)
    for ch in channels_to_remove:
        channels[ch] = 0

    # Now let's expand back up to a 4D mask
    mask = channels.expand(num_filters, num_channels)
    mask.unsqueeze_(-1)
    mask.unsqueeze_(-1)
    mask = mask.expand(num_filters, num_channels, kernel_height, kernel_width).contiguous()

    assert mask.shape == conv2_p.shape
    assert distiller.density_ch(mask) == (
        conv2.in_channels - len(channels_to_remove)) / conv2.in_channels

    # Cool, so now we have a mask for pruning our channels.
    # Use the mask to prune
    zeros_mask_dict[CONV2_PNAME].mask = mask
    zeros_mask_dict[CONV2_PNAME].apply_mask(conv2_p)
    all_channels = set(range(num_channels))
    channels_removed = all_channels - set(
        distiller.find_nonzero_channels(conv2_p, CONV2_PNAME))
    logger.info(channels_removed)

    # Now, let's do the actual network thinning
    distiller.remove_channels(model, zeros_mask_dict, ARCH, DATASET)
    conv1 = find_module_by_name(model, "layer1.0.conv1")
    logger.info(conv1)
    logger.info(conv2)
    assert conv1.out_channels == EXPECTED_CHANNELS
    assert conv2.in_channels == EXPECTED_CHANNELS
    assert conv1.weight.size(0) == EXPECTED_CHANNELS
    assert conv2.weight.size(1) == EXPECTED_CHANNELS
    bn1 = find_module_by_name(model, "layer1.0.bn1")
    assert bn1.running_var.size(0) == EXPECTED_CHANNELS
    assert bn1.running_mean.size(0) == EXPECTED_CHANNELS
    assert bn1.num_features == EXPECTED_CHANNELS
    assert bn1.bias.size(0) == EXPECTED_CHANNELS
    assert bn1.weight.size(0) == EXPECTED_CHANNELS

    # Let's test saving and loading a thinned model.
    # We save 3 times, and load twice, to make sure to cover some corner cases:
    #   - Make sure that after loading, the model still has hold of the thinning recipes
    #   - Make sure that after a 2nd load, there is no problem loading (in this case, the
    #     tensors are already thin, so this is a new flow)

    # (1) Checkpoint saved without a scheduler: loading it is expected to raise KeyError
    save_checkpoint(epoch=0, arch=ARCH, model=model, optimizer=None)
    model_2 = create_model(False, DATASET, ARCH, parallel=False)
    dummy_input = torch.randn(1, 3, 32, 32)
    model(dummy_input)
    model_2(dummy_input)
    conv2 = find_module_by_name(model_2, "layer1.0.conv2")
    assert conv2 is not None
    with pytest.raises(KeyError):
        load_checkpoint(model_2, 'checkpoint.pth.tar')
    compression_scheduler = distiller.CompressionScheduler(model)
    # The original evaluated this hasattr() and threw the result away
    assert hasattr(model, 'thinning_recipes')

    # (2) Checkpoint saved with a scheduler: the loaded model must carry the recipes
    save_checkpoint(epoch=0, arch=ARCH, model=model, optimizer=None,
                    scheduler=compression_scheduler)
    model_2, compression_scheduler, _ = load_checkpoint(model_2, 'checkpoint.pth.tar')
    assert hasattr(model_2, 'thinning_recipes')
    logger.info("test_arbitrary_channel_pruning - Done")

    # (3) Save the already-thinned, loaded model and load it once more
    save_checkpoint(epoch=0, arch=ARCH, model=model_2, optimizer=None,
                    scheduler=compression_scheduler)
    model_2, compression_scheduler, _ = load_checkpoint(model_2, 'checkpoint.pth.tar')
    assert hasattr(model_2, 'thinning_recipes')
    logger.info("test_arbitrary_channel_pruning - Done 2")