def main():
    """Entry point of the classifier compression sample application.

    Parses command-line arguments, configures logging and determinism,
    selects the compute device, builds the model, optionally resumes from a
    checkpoint, and then either dispatches to a one-shot mode (summary,
    ONNX export, calibration, sensitivity analysis, evaluation, thinning)
    or runs the main train/validate loop followed by a final test pass.
    """
    script_dir = os.path.dirname(__file__)
    module_path = os.path.abspath(os.path.join(script_dir, '..', '..'))
    global msglogger

    # Parse arguments
    args = parser.get_parser().parse_args()
    if args.epochs is None:
        args.epochs = 90

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    msglogger = apputils.config_pylogger(
        os.path.join(script_dir, 'logging.conf'), args.name, args.output_dir)

    # Log various details about the execution environment.  It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(
        filter(None, [args.compress, args.qe_stats_file]),  # remove both None and empty strings
        msglogger.logdir,
        gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    if args.evaluate:
        # Evaluation runs are forced deterministic so results are reproducible.
        args.deterministic = True
    if args.deterministic:
        distiller.set_deterministic(args.seed)  # For experiment reproducability
    else:
        if args.seed is not None:
            distiller.set_seed(args.seed)
        # Turn on CUDNN benchmark mode for best performance. This is usually "safe" for image
        # classification models, as the input sizes don't change during the run
        # See here: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/3
        cudnn.benchmark = True

    start_epoch = 0
    ending_epoch = args.epochs
    perf_scores_history = []

    if args.cpu or not torch.cuda.is_available():
        # Set GPU index to -1 if using CPU
        args.device = 'cpu'
        args.gpus = -1
    else:
        args.device = 'cuda'
        if args.gpus is not None:
            try:
                args.gpus = [int(s) for s in args.gpus.split(',')]
            except ValueError:
                raise ValueError(
                    'ERROR: Argument --gpus must be a comma-separated list of integers only'
                )
            available_gpus = torch.cuda.device_count()
            for dev_id in args.gpus:
                if dev_id >= available_gpus:
                    raise ValueError(
                        'ERROR: GPU device ID {0} requested, but only {1} devices available'
                        .format(dev_id, available_gpus))
            # Set default device in case the first one on the list != 0
            torch.cuda.set_device(args.gpus[0])

    # Infer the dataset from the model name
    args.dataset = distiller.apputils.classification_dataset_str_from_arch(
        args.arch)
    args.num_classes = distiller.apputils.classification_num_classes(
        args.dataset)

    if args.earlyexit_thresholds:
        # Early-exit bookkeeping is stored on `args` so downstream train/validate
        # helpers can share it without extra plumbing.
        args.num_exits = len(args.earlyexit_thresholds) + 1
        args.loss_exits = [0] * args.num_exits
        args.losses_exits = []
        args.exiterrors = []

    # Create the model
    model = create_model(args.pretrained,
                         args.dataset,
                         args.arch,
                         parallel=not args.load_serialized,
                         device_ids=args.gpus)
    compression_scheduler = None

    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # capture thresholds for early-exit training
    if args.earlyexit_thresholds:
        msglogger.info('=> using early-exit threshold values of %s',
                       args.earlyexit_thresholds)

    # TODO(barrh): args.deprecated_resume is deprecated since v0.3.1
    if args.deprecated_resume:
        msglogger.warning(
            'The "--resume" flag is deprecated. Please use "--resume-from=YOUR_PATH" instead.'
        )
        if not args.reset_optimizer:
            msglogger.warning(
                'If you wish to also reset the optimizer, call with: --reset-optimizer'
            )
            args.reset_optimizer = True
        args.resumed_checkpoint_path = args.deprecated_resume

    # We can optionally resume from a checkpoint
    optimizer = None
    if args.resumed_checkpoint_path:
        # Full resume: restores model, scheduler, optimizer and epoch count.
        model, compression_scheduler, optimizer, start_epoch = apputils.load_checkpoint(
            model, args.resumed_checkpoint_path, model_device=args.device)
    elif args.load_model_path:
        # Lean checkpoint: weights only, no training state.
        model = apputils.load_lean_checkpoint(model,
                                              args.load_model_path,
                                              model_device=args.device)
    if args.reset_optimizer:
        start_epoch = 0
        if optimizer is not None:
            optimizer = None
            msglogger.info(
                '\nreset_optimizer flag set: Overriding resumed optimizer and resetting epoch count to 0'
            )

    # Define loss function (criterion)
    criterion = nn.CrossEntropyLoss().to(args.device)

    if optimizer is None:
        # No optimizer was resumed (or it was reset) - create a fresh SGD optimizer.
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
        msglogger.info('Optimizer Type: %s', type(optimizer))
        msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.AMC:
        return automated_deep_compression(model, criterion, optimizer,
                                          pylogger, args)
    if args.greedy:
        return greedy(model, criterion, optimizer, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        for summary in args.summary:
            distiller.model_summary(model, summary, args.dataset)
        return

    if args.export_onnx is not None:
        return distiller.export_img_classifier_to_onnx(model,
                                                       os.path.join(
                                                           msglogger.logdir,
                                                           args.export_onnx),
                                                       args.dataset,
                                                       add_softmax=True,
                                                       verbose=False)

    if args.qe_calibration:
        return acts_quant_stats_collection(model, criterion, pylogger, args)

    if args.activation_histograms:
        return acts_histogram_collection(model, criterion, pylogger, args)

    activations_collectors = create_activation_stats_collectors(
        model, *args.activation_stats)

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch.  The default dataset is ImageNet, but if args.arch contains the
    # substring "_cifar", then cifar10 is used.
    train_loader, val_loader, test_loader, _ = load_data(args)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler),
                   len(test_loader.sampler))

    if args.sensitivity is not None:
        sensitivities = np.arange(args.sensitivity_range[0],
                                  args.sensitivity_range[1],
                                  args.sensitivity_range[2])
        return sensitivity_analysis(model, criterion, test_loader, pylogger,
                                    args, sensitivities)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger,
                              activations_collectors, args,
                              compression_scheduler)

    if args.compress:
        # The main use-case for this sample application is CNN compression.  Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(
            model, optimizer, args.compress, compression_scheduler,
            (start_epoch - 1) if args.resumed_checkpoint_path else None)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.to(args.device)
    elif compression_scheduler is None:
        compression_scheduler = distiller.CompressionScheduler(model)

    if args.thinnify:
        #zeros_mask_dict = distiller.create_model_masks_dict(model)
        assert args.resumed_checkpoint_path is not None, \
            "You must use --resume-from to provide a checkpoint file to thinnify"
        distiller.remove_filters(model,
                                 compression_scheduler.zeros_mask_dict,
                                 args.arch,
                                 args.dataset,
                                 optimizer=None)
        apputils.save_checkpoint(0,
                                 args.arch,
                                 model,
                                 optimizer=None,
                                 scheduler=compression_scheduler,
                                 name="{}_thinned".format(
                                     args.resumed_checkpoint_path.replace(
                                         ".pth.tar", "")),
                                 dir=msglogger.logdir)
        print(
            "Note: your model may have collapsed to random inference, so you may want to fine-tune"
        )
        return

    args.kd_policy = None
    if args.kd_teacher:
        # Knowledge-distillation: create and (optionally) resume the teacher model,
        # then attach a distillation policy to the compression scheduler.
        teacher = create_model(args.kd_pretrained,
                               args.dataset,
                               args.kd_teacher,
                               device_ids=args.gpus)
        if args.kd_resume:
            teacher = apputils.load_lean_checkpoint(teacher, args.kd_resume)
        dlw = distiller.DistillationLossWeights(args.kd_distill_wt,
                                                args.kd_student_wt,
                                                args.kd_teacher_wt)
        args.kd_policy = distiller.KnowledgeDistillationPolicy(
            model, teacher, args.kd_temp, dlw)
        compression_scheduler.add_policy(args.kd_policy,
                                         starting_epoch=args.kd_start_epoch,
                                         ending_epoch=args.epochs,
                                         frequency=1)
        msglogger.info('\nStudent-Teacher knowledge distillation enabled:')
        msglogger.info('\tTeacher Model: %s', args.kd_teacher)
        msglogger.info('\tTemperature: %s', args.kd_temp)
        msglogger.info('\tLoss Weights (distillation | student | teacher): %s',
                       ' | '.join(['{:.2f}'.format(val) for val in dlw]))
        msglogger.info('\tStarting from Epoch: %s', args.kd_start_epoch)

    if start_epoch >= ending_epoch:
        msglogger.error(
            'epoch count is too low, starting epoch is {} but total epochs set to {}'
            .format(start_epoch, ending_epoch))
        raise ValueError('Epochs parameter is too low. Nothing to do.')

    for epoch in range(start_epoch, ending_epoch):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)

        # Train for one epoch
        with collectors_context(activations_collectors["train"]) as collectors:
            train(train_loader,
                  model,
                  criterion,
                  optimizer,
                  epoch,
                  compression_scheduler,
                  loggers=[tflogger, pylogger],
                  args=args)
            distiller.log_weights_sparsity(model,
                                           epoch,
                                           loggers=[tflogger, pylogger])
            distiller.log_activation_statsitics(
                epoch,
                "train",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            if args.masks_sparsity:
                msglogger.info(
                    distiller.masks_sparsity_tbl_summary(
                        model, compression_scheduler))

        # evaluate on validation set
        with collectors_context(activations_collectors["valid"]) as collectors:
            top1, top5, vloss = validate(val_loader, model, criterion,
                                         [pylogger], args, epoch)
            distiller.log_activation_statsitics(
                epoch,
                "valid",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            save_collectors_data(collectors, msglogger.logdir)

        stats = ('Performance/Validation/',
                 OrderedDict([('Loss', vloss), ('Top1', top1),
                              ('Top5', top5)]))
        distiller.log_training_progress(stats,
                                        None,
                                        epoch,
                                        steps_completed=0,
                                        total_steps=1,
                                        log_freq=1,
                                        loggers=[tflogger])

        if compression_scheduler:
            # `metrics` lets schedule policies (e.g. AGP) react to validation results.
            compression_scheduler.on_epoch_end(epoch,
                                               optimizer,
                                               metrics={
                                                   'min': vloss,
                                                   'max': top1
                                               })

        # Update the list of top scores achieved so far, and save the checkpoint
        update_training_scores_history(perf_scores_history, model, top1, top5,
                                       epoch, args.num_best_scores)
        is_best = epoch == perf_scores_history[0].epoch
        checkpoint_extras = {
            'current_top1': top1,
            'best_top1': perf_scores_history[0].top1,
            'best_epoch': perf_scores_history[0].epoch
        }
        apputils.save_checkpoint(epoch,
                                 args.arch,
                                 model,
                                 optimizer=optimizer,
                                 scheduler=compression_scheduler,
                                 extras=checkpoint_extras,
                                 is_best=is_best,
                                 name=args.name,
                                 dir=msglogger.logdir)

    # Finally run results on the test set
    test(test_loader,
         model,
         criterion, [pylogger],
         activations_collectors,
         args=args)
def main():
    """Entry point (older variant of the compression sample application).

    Differs from the newer variants: uses `check_pytorch_version()`, seeds
    PyTorch/NumPy/random directly for determinism, has no explicit CPU/device
    handling (CUDA assumed), tracks only a single best top-1 score, and uses
    the positional `apputils.save_checkpoint` signature.
    """
    global msglogger
    check_pytorch_version()
    args = parser.parse_args()
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    msglogger = apputils.config_pylogger(
        os.path.join(script_dir, 'logging.conf'), args.name, args.output_dir)

    # Log various details about the execution environment.  It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(sys.argv, gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    start_epoch = 0
    best_top1 = 0
    best_epoch = 0

    if args.deterministic:
        # Experiment reproducibility is sometimes important.  Pete Warden expounded about this
        # in his blog: https://petewarden.com/2018/03/19/the-machine-learning-reproducibility-crisis/
        # In Pytorch, support for deterministic execution is still a bit clunky.
        if args.workers > 1:
            msglogger.error(
                'ERROR: Setting --deterministic requires setting --workers/-j to 0 or 1'
            )
            exit(1)
        # Use a well-known seed, for repeatability of experiments
        torch.manual_seed(0)
        random.seed(0)
        np.random.seed(0)
        cudnn.deterministic = True
    else:
        # This issue: https://github.com/pytorch/pytorch/issues/3659
        # Implies that cudnn.benchmark should respect cudnn.deterministic, but empirically we see that
        # results are not re-produced when benchmark is set.  So enabling only if deterministic mode disabled.
        cudnn.benchmark = True

    if args.gpus is not None:
        try:
            args.gpus = [int(s) for s in args.gpus.split(',')]
        except ValueError:
            msglogger.error(
                'ERROR: Argument --gpus must be a comma-separated list of integers only'
            )
            exit(1)
        available_gpus = torch.cuda.device_count()
        for dev_id in args.gpus:
            if dev_id >= available_gpus:
                msglogger.error(
                    'ERROR: GPU device ID {0} requested, but only {1} devices available'
                    .format(dev_id, available_gpus))
                exit(1)
        # Set default device in case the first one on the list != 0
        torch.cuda.set_device(args.gpus[0])

    # Infer the dataset from the model name
    args.dataset = 'cifar10' if 'cifar' in args.arch else 'imagenet'
    args.num_classes = 10 if args.dataset == 'cifar10' else 1000

    if args.earlyexit_thresholds:
        # Early-exit bookkeeping shared via `args` with train/validate helpers.
        args.num_exits = len(args.earlyexit_thresholds) + 1
        args.loss_exits = [0] * args.num_exits
        args.losses_exits = []
        args.exiterrors = []

    # Create the model
    model = create_model(args.pretrained,
                         args.dataset,
                         args.arch,
                         device_ids=args.gpus)
    compression_scheduler = None

    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # capture thresholds for early-exit training
    if args.earlyexit_thresholds:
        msglogger.info('=> using early-exit threshold values of %s',
                       args.earlyexit_thresholds)

    # We can optionally resume from a checkpoint
    if args.resume:
        model, compression_scheduler, start_epoch = apputils.load_checkpoint(
            model, chkpt_file=args.resume)

    # Define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    msglogger.info('Optimizer Type: %s', type(optimizer))
    msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.ADC:
        return automated_deep_compression(model, criterion, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        return summarize_model(model,
                               args.dataset,
                               which_summary=args.summary)

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch.  The default dataset is ImageNet, but if args.arch contains the
    # substring "_cifar", then cifar10 is used.
    train_loader, val_loader, test_loader, _ = apputils.load_data(
        args.dataset, os.path.expanduser(args.data), args.batch_size,
        args.workers, args.validation_size, args.deterministic)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler),
                   len(test_loader.sampler))

    activations_sparsity = None
    if args.activation_stats:
        # If your model has ReLU layers, then those layers have sparse activations.
        # ActivationSparsityCollector will collect information about this sparsity.
        # WARNING! Enabling activation sparsity collection will significantly slow down training!
        activations_sparsity = ActivationSparsityCollector(model)

    if args.sensitivity is not None:
        return sensitivity_analysis(model, criterion, test_loader, pylogger,
                                    args)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger, args)

    if args.compress:
        # The main use-case for this sample application is CNN compression.  Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(model, optimizer,
                                                      args.compress)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.cuda()
    else:
        compression_scheduler = distiller.CompressionScheduler(model)

    args.kd_policy = None
    if args.kd_teacher:
        # Knowledge-distillation: build/restore the teacher and register the policy.
        teacher = create_model(args.kd_pretrained,
                               args.dataset,
                               args.kd_teacher,
                               device_ids=args.gpus)
        if args.kd_resume:
            teacher, _, _ = apputils.load_checkpoint(teacher,
                                                     chkpt_file=args.kd_resume)
        dlw = distiller.DistillationLossWeights(args.kd_distill_wt,
                                                args.kd_student_wt,
                                                args.kd_teacher_wt)
        args.kd_policy = distiller.KnowledgeDistillationPolicy(
            model, teacher, args.kd_temp, dlw)
        compression_scheduler.add_policy(args.kd_policy,
                                         starting_epoch=args.kd_start_epoch,
                                         ending_epoch=args.epochs,
                                         frequency=1)
        msglogger.info('\nStudent-Teacher knowledge distillation enabled:')
        msglogger.info('\tTeacher Model: %s', args.kd_teacher)
        msglogger.info('\tTemperature: %s', args.kd_temp)
        msglogger.info('\tLoss Weights (distillation | student | teacher): %s',
                       ' | '.join(['{:.2f}'.format(val) for val in dlw]))
        msglogger.info('\tStarting from Epoch: %s', args.kd_start_epoch)

    for epoch in range(start_epoch, start_epoch + args.epochs):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)

        # Train for one epoch
        train(train_loader,
              model,
              criterion,
              optimizer,
              epoch,
              compression_scheduler,
              loggers=[tflogger, pylogger],
              args=args)
        distiller.log_weights_sparsity(model,
                                       epoch,
                                       loggers=[tflogger, pylogger])
        if args.activation_stats:
            distiller.log_activation_sparsity(epoch,
                                              loggers=[tflogger, pylogger],
                                              collector=activations_sparsity)

        # evaluate on validation set
        top1, top5, vloss = validate(val_loader, model, criterion, [pylogger],
                                     args, epoch)

        # NOTE(review): 'Peformance' typo is preserved - it is a runtime string
        # (TensorBoard tag) and changing it would alter log output.
        stats = ('Peformance/Validation/',
                 OrderedDict([('Loss', vloss), ('Top1', top1),
                              ('Top5', top5)]))
        distiller.log_training_progress(stats,
                                        None,
                                        epoch,
                                        steps_completed=0,
                                        total_steps=1,
                                        log_freq=1,
                                        loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        # remember best top1 and save checkpoint
        is_best = top1 > best_top1
        if is_best:
            best_epoch = epoch
            best_top1 = top1
        msglogger.info('==> Best Top1: %.3f On Epoch: %d\n', best_top1,
                       best_epoch)
        apputils.save_checkpoint(epoch, args.arch, model, optimizer,
                                 compression_scheduler, best_top1, is_best,
                                 args.name, msglogger.logdir)

    # Finally run results on the test set
    test(test_loader, model, criterion, [pylogger], args=args)
def _validate(data_loader, model, criterion, loggers, args, epoch=-1):
    """Execute the validation/test loop.

    Runs `model` in eval mode over `data_loader`, accumulating loss and
    top-1/top-5 error meters (or per-exit meters when early-exit thresholds
    are configured), and periodically logs progress via `loggers`.

    Returns:
        (top1, top5, loss) - either the overall meters, or, in early-exit
        mode, the aggregated stats from `earlyexit_validate_stats` with the
        loss of the last exit.
    """
    losses = {'objective_loss': tnt.AverageValueMeter()}
    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))

    if args.earlyexit_thresholds:
        # for Early Exit, we have a list of errors and losses for each of the exits.
        args.exiterrors = []
        args.losses_exits = []
        for exitnum in range(args.num_exits):
            args.exiterrors.append(
                tnt.ClassErrorMeter(accuracy=True, topk=(1, 5)))
            args.losses_exits.append(tnt.AverageValueMeter())
        args.exit_taken = [0] * args.num_exits

    batch_time = tnt.AverageValueMeter()
    total_samples = len(data_loader.sampler)
    batch_size = data_loader.batch_size
    if args.display_confusion:
        confusion = tnt.ConfusionMeter(args.num_classes)
    # NOTE(review): true division - total_steps may be fractional when the last
    # batch is partial; only used as a logging denominator.
    total_steps = total_samples / batch_size
    msglogger.info('%d samples (%d per mini-batch)', total_samples, batch_size)

    # Switch to evaluation mode
    model.eval()

    end = time.time()
    for validation_step, (inputs, target) in enumerate(data_loader):
        with torch.no_grad():
            inputs, target = inputs.to(args.device), target.to(args.device)
            # compute output from model
            output = model(inputs)

            if not args.earlyexit_thresholds:
                # compute loss
                loss = criterion(output, target)
                # measure accuracy and record loss
                losses['objective_loss'].add(loss.item())
                classerr.add(output.data, target)
                if args.display_confusion:
                    confusion.add(output.data, target)
            else:
                earlyexit_validate_loss(output, target, criterion, args)

            # measure elapsed time
            batch_time.add(time.time() - end)
            end = time.time()

            steps_completed = (validation_step + 1)
            if steps_completed % args.print_freq == 0:
                if not args.earlyexit_thresholds:
                    stats = ('',
                             OrderedDict([('Loss',
                                           losses['objective_loss'].mean),
                                          ('Top1', classerr.value(1)),
                                          ('Top5', classerr.value(5))]))
                else:
                    stats_dict = OrderedDict()
                    stats_dict['Test'] = validation_step
                    for exitnum in range(args.num_exits):
                        la_string = 'LossAvg' + str(exitnum)
                        stats_dict[la_string] = args.losses_exits[exitnum].mean
                        # Because of the nature of ClassErrorMeter, if an exit is never taken during the batch,
                        # then accessing the value(k) will cause a divide by zero.  So we'll build the OrderedDict
                        # accordingly and we will not print for an exit error when that exit is never taken.
                        if args.exit_taken[exitnum]:
                            t1 = 'Top1_exit' + str(exitnum)
                            t5 = 'Top5_exit' + str(exitnum)
                            stats_dict[t1] = args.exiterrors[exitnum].value(1)
                            stats_dict[t5] = args.exiterrors[exitnum].value(5)
                    stats = ('Performance/Validation/', stats_dict)
                distiller.log_training_progress(stats, None, epoch,
                                                steps_completed, total_steps,
                                                args.print_freq, loggers)

    if not args.earlyexit_thresholds:
        msglogger.info('==> Top1: %.3f Top5: %.3f Loss: %.3f\n',
                       classerr.value()[0],
                       classerr.value()[1], losses['objective_loss'].mean)
        if args.display_confusion:
            msglogger.info('==> Confusion:\n%s\n', str(confusion.value()))
        return classerr.value(1), classerr.value(
            5), losses['objective_loss'].mean
    else:
        total_top1, total_top5, losses_exits_stats = earlyexit_validate_stats(
            args)
        return total_top1, total_top5, losses_exits_stats[args.num_exits - 1]
def train(self, epoch, compression_scheduler):
    """Train the model for one epoch.

    Arguments:
        epoch (int): index of the current training epoch.
        compression_scheduler (CompressionScheduler): scheduler object built
            from a compression-schedule definition file (``*.yaml``).

    The per-minibatch flow mirrors Distiller's scheduler protocol:

    Examples:
        >>> compression_scheduler.on_minibatch_begin(epoch)
        >>> output = self.model(data)              # model inference
        >>> loss = self.criterion(output, target)  # compute the loss
        >>> compression_scheduler.before_backward_pass(epoch)
        >>> loss.backward()                        # backward pass (gradients)
        >>> self.optimizer.step()                  # update weights from gradients
        >>> compression_scheduler.on_minibatch_end(epoch)
    """
    losses = OrderedDict([(OVERALL_LOSS_KEY, tnt.AverageValueMeter()),
                          (OBJECTIVE_LOSS_KEY, tnt.AverageValueMeter())])
    batch_time = tnt.AverageValueMeter()
    self.model.train()
    start_time = time.time()
    for batch_num, (data, target) in enumerate(self.training_loader):
        data = self.img_preprocess(data)  # resize input image size
        data, target = data.to(self.device), target.to(self.device)
        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(
                epoch=epoch,
                minibatch_id=batch_num,
                minibatches_per_epoch=len(self.training_loader),
                optimizer=self.optimizer)
        # self.model = self.model.to(self.device)
        loss = self.criterion(self.model(data), target)
        losses[OBJECTIVE_LOSS_KEY].add(loss.item())
        if compression_scheduler:
            # Before running the backward phase, we allow the scheduler to modify the loss
            # (e.g. add regularization terms); we then track each loss component separately.
            agg_loss = compression_scheduler.before_backward_pass(
                epoch,
                minibatch_id=batch_num,
                minibatches_per_epoch=len(self.training_loader),
                loss=loss,
                optimizer=self.optimizer,
                return_loss_components=True)
            loss = agg_loss.overall_loss
            losses[OVERALL_LOSS_KEY].add(loss.item())
            for lc in agg_loss.loss_components:
                if lc.name not in losses:
                    losses[lc.name] = tnt.AverageValueMeter()
                losses[lc.name].add(lc.value.item())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if compression_scheduler:
            compression_scheduler.on_minibatch_end(
                epoch=epoch,
                minibatch_id=batch_num,
                minibatches_per_epoch=len(self.training_loader),
                optimizer=self.optimizer)
        # debug
        # for param_name, param in self.model.named_parameters():
        #     print(param_name)
        #     # print(param)
        # self.draw_model_to_file('arch_after_quantize.png')
        # dummy_input_test = torch.rand((1, 1, 5, 5), requires_grad=False).cuda()
        # # dummy_input_test = dummy_input_test * 2 - 1
        # test_drop_res = self.model(dummy_input_test) - dummy_input_test
        # x = dummy_input_test
        # for mod_name, layer in self.model.named_modules():
        #     if not distiller.has_children(layer):
        #         test_input = x
        #         x = layer(test_input)
        #         print('1')
        # progress_bar(batch_num, len(self.training_loader), 'Loss: %.4f' % (train_loss / (batch_num + 1)))
        # msglogger.info('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.4f} | ms/batch {:5.2f} | loss {:5.2f}'
        #                .format(epoch, batch_num, len(self.training_loader), lr, elapsed * 1000, cur_loss))

        # Periodically push accumulated stats to the loggers.
        stats_dict = OrderedDict()
        batch_time.add(time.time() - start_time)
        steps_completed = batch_num + 1
        lr = self.optimizer.param_groups[0]['lr']
        if steps_completed % self.freq == 0:
            for loss_name, meter in losses.items():
                stats_dict[loss_name] = meter.mean
            stats_dict['LR'] = lr
            stats_dict['Batch Time'] = batch_time.mean * 1000
            stats = ('Performance/Training', stats_dict)
            # NOTE(review): tflogger/pylogger appear to be module-level globals
            # here (unlike other variants that pass loggers in) - confirm.
            distiller.log_training_progress(stats,
                                            self.model.named_parameters(),
                                            epoch, steps_completed,
                                            len(self.training_loader),
                                            self.freq, [tflogger, pylogger])
        start_time = time.time()
def main():
    """Entry point (intermediate variant of the compression sample application).

    Compared with the other variants in this file: tracks the N best epochs in
    a `best_epochs` list (N = args.num_best_scores), supports quantization
    calibration (`--qe-calibration`), and resumes with the 3-tuple
    `apputils.load_checkpoint` signature.
    """
    global msglogger

    # Parse arguments
    args = parser.get_parser().parse_args()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    msglogger = apputils.config_pylogger(
        os.path.join(script_dir, 'logging.conf'), args.name, args.output_dir)

    # Log various details about the execution environment.  It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(args.compress,
                                     msglogger.logdir,
                                     gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    start_epoch = 0
    best_epochs = [
        distiller.MutableNamedTuple({'epoch': 0, 'top1': 0, 'sparsity': 0})
        for i in range(args.num_best_scores)
    ]

    if args.deterministic:
        # Experiment reproducibility is sometimes important.  Pete Warden expounded about this
        # in his blog: https://petewarden.com/2018/03/19/the-machine-learning-reproducibility-crisis/
        # In Pytorch, support for deterministic execution is still a bit clunky.
        if args.workers > 1:
            msglogger.error(
                'ERROR: Setting --deterministic requires setting --workers/-j to 0 or 1'
            )
            exit(1)
        # Use a well-known seed, for repeatability of experiments
        distiller.set_deterministic()
    else:
        # This issue: https://github.com/pytorch/pytorch/issues/3659
        # Implies that cudnn.benchmark should respect cudnn.deterministic, but empirically we see that
        # results are not re-produced when benchmark is set.  So enabling only if deterministic mode disabled.
        cudnn.benchmark = True

    if args.cpu or not torch.cuda.is_available():
        # Set GPU index to -1 if using CPU
        args.device = 'cpu'
        args.gpus = -1
    else:
        args.device = 'cuda'
        if args.gpus is not None:
            try:
                args.gpus = [int(s) for s in args.gpus.split(',')]
            except ValueError:
                msglogger.error(
                    'ERROR: Argument --gpus must be a comma-separated list of integers only'
                )
                exit(1)
            available_gpus = torch.cuda.device_count()
            for dev_id in args.gpus:
                if dev_id >= available_gpus:
                    msglogger.error(
                        'ERROR: GPU device ID {0} requested, but only {1} devices available'
                        .format(dev_id, available_gpus))
                    exit(1)
            # Set default device in case the first one on the list != 0
            torch.cuda.set_device(args.gpus[0])

    # Infer the dataset from the model name
    args.dataset = 'cifar10' if 'cifar' in args.arch else 'imagenet'
    args.num_classes = 10 if args.dataset == 'cifar10' else 1000

    if args.earlyexit_thresholds:
        # Early-exit bookkeeping shared via `args` with train/validate helpers.
        args.num_exits = len(args.earlyexit_thresholds) + 1
        args.loss_exits = [0] * args.num_exits
        args.losses_exits = []
        args.exiterrors = []

    # Create the model
    model = create_model(args.pretrained,
                         args.dataset,
                         args.arch,
                         parallel=not args.load_serialized,
                         device_ids=args.gpus)
    compression_scheduler = None

    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # capture thresholds for early-exit training
    if args.earlyexit_thresholds:
        msglogger.info('=> using early-exit threshold values of %s',
                       args.earlyexit_thresholds)

    # We can optionally resume from a checkpoint
    if args.resume:
        model, compression_scheduler, start_epoch = apputils.load_checkpoint(
            model, chkpt_file=args.resume)
        model.to(args.device)

    # Define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(args.device)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    msglogger.info('Optimizer Type: %s', type(optimizer))
    msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.AMC:
        return automated_deep_compression(model, criterion, optimizer,
                                          pylogger, args)

    if args.greedy:
        return greedy(model, criterion, optimizer, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        return summarize_model(model,
                               args.dataset,
                               which_summary=args.summary)

    activations_collectors = create_activation_stats_collectors(
        model, *args.activation_stats)

    if args.qe_calibration:
        # Quantization calibration: run a (partial) deterministic evaluation while
        # collecting per-layer activation statistics on a non-parallel model copy.
        msglogger.info('Quantization calibration stats collection enabled:')
        msglogger.info(
            '\tStats will be collected for {:.1%} of test dataset'.format(
                args.qe_calibration))
        msglogger.info(
            '\tSetting constant seeds and converting model to serialized execution'
        )
        distiller.set_deterministic()
        model = distiller.make_non_parallel_copy(model)
        activations_collectors.update(
            create_quantization_stats_collector(model))
        args.evaluate = True
        args.effective_test_size = args.qe_calibration

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch.  The default dataset is ImageNet, but if args.arch contains the
    # substring "_cifar", then cifar10 is used.
    train_loader, val_loader, test_loader, _ = apputils.load_data(
        args.dataset, os.path.expanduser(args.data), args.batch_size,
        args.workers, args.validation_split, args.deterministic,
        args.effective_train_size, args.effective_valid_size,
        args.effective_test_size)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler),
                   len(test_loader.sampler))

    if args.sensitivity is not None:
        sensitivities = np.arange(args.sensitivity_range[0],
                                  args.sensitivity_range[1],
                                  args.sensitivity_range[2])
        return sensitivity_analysis(model, criterion, test_loader, pylogger,
                                    args, sensitivities)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger,
                              activations_collectors, args,
                              compression_scheduler)

    if args.compress:
        # The main use-case for this sample application is CNN compression.  Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(model, optimizer,
                                                      args.compress,
                                                      compression_scheduler)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.to(args.device)
    elif compression_scheduler is None:
        compression_scheduler = distiller.CompressionScheduler(model)

    if args.thinnify:
        #zeros_mask_dict = distiller.create_model_masks_dict(model)
        assert args.resume is not None, "You must use --resume to provide a checkpoint file to thinnify"
        distiller.remove_filters(model,
                                 compression_scheduler.zeros_mask_dict,
                                 args.arch,
                                 args.dataset,
                                 optimizer=None)
        apputils.save_checkpoint(0,
                                 args.arch,
                                 model,
                                 optimizer=None,
                                 scheduler=compression_scheduler,
                                 name="{}_thinned".format(
                                     args.resume.replace(".pth.tar", "")),
                                 dir=msglogger.logdir)
        print(
            "Note: your model may have collapsed to random inference, so you may want to fine-tune"
        )
        return

    args.kd_policy = None
    if args.kd_teacher:
        # Knowledge-distillation: build/restore the teacher and register the policy.
        teacher = create_model(args.kd_pretrained,
                               args.dataset,
                               args.kd_teacher,
                               device_ids=args.gpus)
        if args.kd_resume:
            teacher, _, _ = apputils.load_checkpoint(teacher,
                                                     chkpt_file=args.kd_resume)
        dlw = distiller.DistillationLossWeights(args.kd_distill_wt,
                                                args.kd_student_wt,
                                                args.kd_teacher_wt)
        args.kd_policy = distiller.KnowledgeDistillationPolicy(
            model, teacher, args.kd_temp, dlw)
        compression_scheduler.add_policy(args.kd_policy,
                                         starting_epoch=args.kd_start_epoch,
                                         ending_epoch=args.epochs,
                                         frequency=1)
        msglogger.info('\nStudent-Teacher knowledge distillation enabled:')
        msglogger.info('\tTeacher Model: %s', args.kd_teacher)
        msglogger.info('\tTemperature: %s', args.kd_temp)
        msglogger.info('\tLoss Weights (distillation | student | teacher): %s',
                       ' | '.join(['{:.2f}'.format(val) for val in dlw]))
        msglogger.info('\tStarting from Epoch: %s', args.kd_start_epoch)

    for epoch in range(start_epoch, start_epoch + args.epochs):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)

        # Train for one epoch
        with collectors_context(activations_collectors["train"]) as collectors:
            train(train_loader,
                  model,
                  criterion,
                  optimizer,
                  epoch,
                  compression_scheduler,
                  loggers=[tflogger, pylogger],
                  args=args)
            distiller.log_weights_sparsity(model,
                                           epoch,
                                           loggers=[tflogger, pylogger])
            distiller.log_activation_statsitics(
                epoch,
                "train",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            if args.masks_sparsity:
                msglogger.info(
                    distiller.masks_sparsity_tbl_summary(
                        model, compression_scheduler))

        # evaluate on validation set
        with collectors_context(activations_collectors["valid"]) as collectors:
            top1, top5, vloss = validate(val_loader, model, criterion,
                                         [pylogger], args, epoch)
            distiller.log_activation_statsitics(
                epoch,
                "valid",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            save_collectors_data(collectors, msglogger.logdir)

        # NOTE(review): 'Peformance' typo is preserved - it is a runtime string
        # (TensorBoard tag) and changing it would alter log output.
        stats = ('Peformance/Validation/',
                 OrderedDict([('Loss', vloss), ('Top1', top1),
                              ('Top5', top5)]))
        distiller.log_training_progress(stats,
                                        None,
                                        epoch,
                                        steps_completed=0,
                                        total_steps=1,
                                        log_freq=1,
                                        loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        # Update the list of top scores achieved so far, and save the checkpoint
        is_best = top1 > best_epochs[-1].top1
        if top1 > best_epochs[0].top1:
            best_epochs[0].epoch = epoch
            best_epochs[0].top1 = top1
            # Keep best_epochs sorted such that best_epochs[0] is the lowest top1 in the best_epochs list
            best_epochs = sorted(best_epochs, key=lambda score: score.top1)
        for score in reversed(best_epochs):
            if score.top1 > 0:
                msglogger.info('==> Best Top1: %.3f on Epoch: %d', score.top1,
                               score.epoch)
        apputils.save_checkpoint(epoch, args.arch, model, optimizer,
                                 compression_scheduler, best_epochs[-1].top1,
                                 is_best, args.name, msglogger.logdir)

    # Finally run results on the test set
    test(test_loader,
         model,
         criterion, [pylogger],
         activations_collectors,
         args=args)
def train(train_loader, model, criterion, optimizer, epoch, compression_scheduler, loggers, args):
    """Training loop for one epoch.

    Trains ``model`` on ``train_loader`` with an experimental sin^2
    quantization regularizer (DoReFa-style grid) added to the task loss,
    and logs progress through ``loggers``.  Also supports Distiller
    compression scheduling, knowledge distillation (``args.kd_policy``)
    and early-exit training.

    Side effects: dumps conv1 weights to ``w1_cifar.npy`` and appends the
    final (loss, reg_loss) pair via the module-level ``write_to_csv2``.
    """
    # Meters: aggregated loss (incl. scheduler components) and bare task loss.
    losses = OrderedDict([(OVERALL_LOSS_KEY, tnt.AverageValueMeter()),
                          (OBJECTIVE_LOSS_KEY, tnt.AverageValueMeter())])
    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))
    batch_time = tnt.AverageValueMeter()
    data_time = tnt.AverageValueMeter()

    # For Early Exit, we define statistics for each exit
    # So exiterrors is analogous to classerr for the non-Early Exit case
    if args.earlyexit_lossweights:
        args.exiterrors = []
        for exitnum in range(args.num_exits):
            args.exiterrors.append(tnt.ClassErrorMeter(accuracy=True, topk=(1, 5)))

    total_samples = len(train_loader.sampler)
    batch_size = train_loader.batch_size
    steps_per_epoch = math.ceil(total_samples / batch_size)
    msglogger.info('Training epoch: %d samples (%d per mini-batch)', total_samples, batch_size)
    # Optionally train on only a fraction of the epoch (args.partial_epoch);
    # the loop breaks once steps_per_frac_epoch mini-batches have been seen.
    epoch_frac = args.partial_epoch
    steps_per_frac_epoch = math.ceil((total_samples * epoch_frac) / batch_size)

    # Switch to train mode
    model.train()
    end = time.time()
    for train_step, (inputs, target) in enumerate(train_loader):
        # Measure data loading time
        data_time.add(time.time() - end)
        # NOTE(review): device is hard-coded to 'cuda' here; args.device is ignored.
        inputs, target = inputs.to('cuda'), target.to('cuda')
        if train_step == steps_per_frac_epoch:
            break  # partial-epoch early stop

        # Execute the forward phase, compute the output and measure loss
        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(epoch, train_step, steps_per_epoch, optimizer)

        if args.kd_policy is None:
            # Amir: Running
            output = model(inputs)
        else:
            # Knowledge-distillation path: the policy wraps the forward pass.
            output = args.kd_policy.forward(inputs)

        if not args.earlyexit_lossweights:
            # ------------------------------------------------------------------ AHMED edit sin2-reg - April19
            """ adding sin2 regularization here"""
            # Per-layer bit-widths for the quantization grid (all fixed to `bw` bits).
            qbits_dict = {}
            sin2_reg_loss = 0
            #print('weights:', (model.module.conv2.weight.size()))
            bw = 3
            qbits_dict['conv1'] = bw
            qbits_dict['conv2'] = bw
            qbits_dict['fc1'] = bw
            qbits_dict['fc2'] = bw
            qbits_dict['fc3'] = bw
            # ----------------------------------
            # sin^2 penalty parameters: q-bit DoReFa-style grid of spacing `step`,
            # shifted by half a step so grid points fall at penalty minima.
            # NOTE(review): the regularizer uses q = 4, not the qbits_dict values above.
            q = 4
            power = 2
            step = 1/(2**(q)-0.5) # dorefa
            shift = step/2
            #step = 1/(2**(q)-1) # wrpn
            #shift = 0
            #amplitude = (np.sin(pi*(weight+step/2)/(step)))**2
            # NOTE(review): the first assignment is immediately overwritten — conv1
            # deliberately(?) uses .weight while all other layers use .float_weight.
            kernel = model.module.conv1.float_weight
            kernel = model.module.conv1.weight
            #sin2_func_1 = torch.mean(torch.pow(torch.sin(pi*kernel/(2**(-(qbits_dict['conv1']))-1)),2))
            sin2_func_1 = torch.mean((torch.sin(pi*(kernel+shift)/(step)))**power) # dorefa
            #print(sin2_func_1.data[0])
            kernel = model.module.conv2.float_weight
            #kernel = model.module.conv2.weight
            #sin2_func_2 = torch.mean(torch.pow(torch.sin(pi*kernel/(2**(-(qbits_dict['conv2']))-1)),2))
            sin2_func_2 = torch.mean(torch.pow(torch.sin(pi*(kernel+shift)/step),power)) # dorefa
            kernel = model.module.fc1.float_weight
            #kernel = model.module.fc1.weight
            #sin2_func_3 = torch.mean(torch.pow(torch.sin(pi*kernel/(2**(-(qbits_dict['fc1']))-1)),2))
            sin2_func_3 = torch.mean(torch.pow(torch.sin(pi*(kernel+shift)/step),power)) # dorefa
            kernel = model.module.fc2.float_weight
            #kernel = model.module.fc2.weight
            #sin2_func_4 = torch.mean(torch.pow(torch.sin(pi*kernel/(2**(-(qbits_dict['fc2']))-1)),2))
            sin2_func_4 = torch.mean(torch.pow(torch.sin(pi*(kernel+shift)/step),power)) # dorefa
            kernel = model.module.fc3.float_weight
            #kernel = model.module.fc3.weight
            #sin2_func_5 = torch.mean(torch.pow(torch.sin(pi*kernel/(2**(-(qbits_dict['fc3']))-1)),2))
            sin2_func_5 = torch.mean(torch.pow(torch.sin(pi*(kernel+shift)/step),power)) # dorefa
            # ----------------------------------
            # Sum the per-layer penalties and add them to the task loss.
            sin2_reg_loss = sin2_func_1 + sin2_func_2 + sin2_func_3 + sin2_func_4 + sin2_func_5
            #loss = criterion(output, target)
            cost_factor = 1
            reg_loss = cost_factor*sin2_reg_loss
            loss = criterion(output, target) + reg_loss
            #print('sin2_reg_LOSS:', sin2_reg_loss.data[0])
            #print('total_LOSS:', loss.data[0])
            #print('MODEL:', (model.state_dict()))
            # ------------------------------------------------------------------ AHMED edit sin2-reg - April19
            # Measure accuracy and record loss
            classerr.add(output.data, target)
        else:
            # Measure accuracy and record loss
            loss = earlyexit_loss(output, target, criterion, args)
        losses[OBJECTIVE_LOSS_KEY].add(loss.item())
        #print('sin2_reg_LOSS:', sin2_reg_loss.data[0])

        if compression_scheduler:
            # Before running the backward phase, we allow the scheduler to modify the loss
            # (e.g. add regularization loss)
            agg_loss = compression_scheduler.before_backward_pass(epoch, train_step, steps_per_epoch, loss,
                                                                  optimizer=optimizer,
                                                                  return_loss_components=True)
            loss = agg_loss.overall_loss
            losses[OVERALL_LOSS_KEY].add(loss.item())
            # Track every named loss component the scheduler reports.
            for lc in agg_loss.loss_components:
                if lc.name not in losses:
                    losses[lc.name] = tnt.AverageValueMeter()
                losses[lc.name].add(lc.value.item())

        # Compute the gradient and do SGD step
        optimizer.zero_grad()
        # NOTE(review): retain_graph=True keeps the graph alive after backward;
        # unclear why it is needed here — confirm before removing (it costs memory).
        loss.backward(retain_graph=True)
        optimizer.step()
        if compression_scheduler:
            compression_scheduler.on_minibatch_end(epoch, train_step, steps_per_epoch, optimizer)

        # measure elapsed time
        batch_time.add(time.time() - end)
        steps_completed = (train_step+1)

        if steps_completed % args.print_freq == 0:
            # Log some statistics
            errs = OrderedDict()
            if not args.earlyexit_lossweights:
                errs['Top1'] = classerr.value(1)
                errs['Top5'] = classerr.value(5)
            else:
                # for Early Exit case, the Top1 and Top5 stats are computed for each exit.
                for exitnum in range(args.num_exits):
                    errs['Top1_exit' + str(exitnum)] = args.exiterrors[exitnum].value(1)
                    errs['Top5_exit' + str(exitnum)] = args.exiterrors[exitnum].value(5)

            stats_dict = OrderedDict()
            for loss_name, meter in losses.items():
                stats_dict[loss_name] = meter.mean
            stats_dict.update(errs)
            stats_dict['LR'] = optimizer.param_groups[0]['lr']
            stats_dict['Time'] = batch_time.mean
            stats = ('Peformance/Training/', stats_dict)
            params = model.named_parameters() if args.log_params_histograms else None
            distiller.log_training_progress(stats, params, epoch, steps_completed, steps_per_epoch,
                                            args.print_freq, loggers)
        end = time.time()
    # Post-epoch debug dump: save conv1 weights and the last batch's losses.
    # NOTE(review): `loss`/`reg_loss` are only bound inside the loop, so an empty
    # loader (or the early-exit path, which never sets reg_loss) raises NameError
    # here; `reg_loss.data[0]` also fails on 0-dim tensors in torch >= 0.5.
    kernel = model.module.conv1.float_weight
    print('00000000000000000000')
    w1 = kernel.data.cpu().numpy()
    np.save('w1_cifar', w1)
    print('======================================', reg_loss.data[0])
    write_to_csv2([loss.data.cpu().numpy(), reg_loss.data.cpu().numpy()])
def train(train_loader, model, criterion, optimizer, epoch, compression_scheduler, loggers, print_freq, log_params_hist):
    """Training loop for one epoch.

    Runs the forward/backward pass over ``train_loader``, letting the
    Distiller ``compression_scheduler`` (if provided) hook mini-batch
    begin/end and add a regularization term to the loss before backward.

    Args:
        train_loader: iterable yielding (inputs, target) mini-batches.
        model: network being trained (expected on GPU; inputs are moved here).
        criterion: loss function.
        optimizer: optimizer whose param_groups carry the learning rate.
        epoch: current epoch index (for the scheduler and logging).
        compression_scheduler: Distiller CompressionScheduler or None.
        loggers: list of Distiller data loggers.
        print_freq: log statistics every ``print_freq`` steps.
        log_params_hist: if True, also log parameter histograms.
    """
    losses = {'objective_loss': tnt.AverageValueMeter(),
              'regularizer_loss': tnt.AverageValueMeter()}
    if compression_scheduler is None:
        # Initialize the regularizer loss to zero so its `.mean` is defined
        # in the logging block below even when no scheduler is used.
        losses['regularizer_loss'].add(0)

    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))
    batch_time = tnt.AverageValueMeter()
    data_time = tnt.AverageValueMeter()

    total_samples = len(train_loader.sampler)
    batch_size = train_loader.batch_size
    steps_per_epoch = math.ceil(total_samples / batch_size)
    msglogger.info('Training epoch: %d samples (%d per mini-batch)',
                   total_samples, batch_size)

    # Switch to train mode
    model.train()
    end = time.time()

    for train_step, (inputs, target) in enumerate(train_loader):
        # Measure data loading time
        data_time.add(time.time() - end)

        # Fix: `async` became a reserved keyword in Python 3.7, making
        # `target.cuda(async=True)` a SyntaxError; PyTorch (>= 0.4) renamed
        # the argument to `non_blocking`, which is the drop-in replacement.
        target = target.cuda(non_blocking=True)
        input_var = inputs.cuda()
        # NOTE: Variable is a no-op wrapper on modern torch; kept for
        # behavioral compatibility with the rest of this legacy loop.
        target_var = torch.autograd.Variable(target)

        # Execute the forward phase, compute the output and measure loss
        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(epoch, train_step,
                                                     steps_per_epoch, optimizer)
        output = model(input_var)
        loss = criterion(output, target_var)

        # Measure accuracy and record loss
        classerr.add(output.data, target)
        losses['objective_loss'].add(loss.item())

        if compression_scheduler:
            # Before running the backward phase, we add any regularization
            # loss computed by the scheduler
            regularizer_loss = compression_scheduler.before_backward_pass(
                epoch, train_step, steps_per_epoch, loss, optimizer)
            loss += regularizer_loss
            losses['regularizer_loss'].add(regularizer_loss.item())

        # Compute the gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if compression_scheduler:
            compression_scheduler.on_minibatch_end(epoch, train_step,
                                                   steps_per_epoch, optimizer)

        # measure elapsed time
        batch_time.add(time.time() - end)
        steps_completed = (train_step + 1)

        if steps_completed % print_freq == 0:
            # Log some statistics
            lr = optimizer.param_groups[0]['lr']
            # 'Peformance' (sic) kept byte-identical: dashboards key off it.
            stats = ('Peformance/Training/',
                     OrderedDict([('Loss', losses['objective_loss'].mean),
                                  ('Reg Loss', losses['regularizer_loss'].mean),
                                  ('Top1', classerr.value(1)),
                                  ('Top5', classerr.value(5)),
                                  ('LR', lr),
                                  ('Time', batch_time.mean)]))
            distiller.log_training_progress(
                stats,
                model.named_parameters() if log_params_hist else None,
                epoch, steps_completed, steps_per_epoch, print_freq, loggers)
        end = time.time()
def _validate(data_loader, model, criterion, loggers, args, epoch=-1):
    """Execute the validation/test loop.

    Runs the pose-estimation model over ``data_loader`` with gradients
    disabled, averages the criterion's total loss, logs progress every
    ``args.print_freq`` steps, and returns the mean loss.
    """
    batch_time = tnt.AverageValueMeter()
    data_time = tnt.AverageValueMeter()
    losses = tnt.AverageValueMeter()
    # Per-component meters matching the keys the pose criterion reports.
    # NOTE(review): these meters are created but never updated in this loop.
    meter_dict = {
        'paf': tnt.AverageValueMeter(),
        'heatmap': tnt.AverageValueMeter(),
        'max_ht': tnt.AverageValueMeter(),
        'min_ht': tnt.AverageValueMeter(),
        'max_paf': tnt.AverageValueMeter(),
        'min_paf': tnt.AverageValueMeter()
    }

    total_samples = len(data_loader.sampler)
    batch_size = data_loader.batch_size
    # NOTE(review): not math.ceil'd, so total_steps may be fractional here.
    total_steps = total_samples / batch_size
    msglogger.info('%d samples (%d per mini-batch)', total_samples, batch_size)

    model.eval()  # TODO: model.train() in original repo
    end = time.time()

    # model = torch.nn.DataParallel(model, device_ids=args.gpus)
    # run_eval(image_dir=args.data, anno_dir=args.anno_dir, vis_dir=args.vis_dir,
    #          image_list_txt=args.image_list_txt,
    #          model=model, preprocess='vgg' if args.arch == 'vgg19' else 'rtpose')

    for validation_step, (inputs, heatmap_target, heat_mask, paf_target,
                          paf_mask) in enumerate(data_loader):
        with torch.no_grad():
            data_time.add(time.time() - end)
            inputs = inputs.to(args.device)
            heatmap_target = heatmap_target.to(args.device)
            heat_mask = heat_mask.to(args.device)
            paf_target = paf_target.to(args.device)
            paf_mask = paf_mask.to(args.device)

            # Model returns (final outputs, intermediate maps for the loss).
            _, saved_for_loss = model(inputs)
            total_loss, saved_for_log = criterion(saved_for_loss, heatmap_target,
                                                  heat_mask, paf_target, paf_mask)
            # Weight the running mean by the batch size.
            losses.add(total_loss.item(), inputs.size(0))

            batch_time.add(time.time() - end)
            end = time.time()

            steps_completed = (validation_step + 1)
            if steps_completed % args.print_freq == 0:
                stats = ('', OrderedDict([
                    ('Loss', losses.mean),
                ]))
                distiller.log_training_progress(stats, None, epoch, steps_completed,
                                                total_steps, args.print_freq, loggers)

    msglogger.info('==> Loss: %.6f\n', losses.mean)
    # TODO: refactor me
    # NOTE(review): hard-coded absolute, user-specific path — breaks on any
    # other machine and silently overwrites previous results; should be
    # derived from msglogger.logdir instead.
    with open(
            '/home/CORP.PKUSC.ORG/hatsu3/research/compression/distiller/examples/openpose_compression/notebooks/results.txt',
            'w') as f:
        f.write('%.6f' % losses.mean)
    return losses.mean
def main():
    """Entry point for the pose-estimation compression sample.

    Parses CLI args, sets up logging/determinism/devices, builds the model,
    optionally resumes from a checkpoint, dispatches to the various one-shot
    modes (AMC, greedy, summary, ONNX export, calibration, sensitivity,
    evaluate, thinnify), and otherwise runs the train/validate loop with
    checkpointing, finishing with a test-set run.
    """
    script_dir = os.path.dirname(__file__)
    module_path = os.path.abspath(os.path.join(script_dir, '..', '..'))
    global msglogger

    # Parse arguments
    args = parser.get_parser().parse_args()
    if args.epochs is None:
        args.epochs = 200

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    msglogger = apputils.config_pylogger(
        os.path.join(script_dir, 'logging.conf'), args.name, args.output_dir)

    # Log various details about the execution environment. It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(
        filter(None, [args.compress, args.qe_stats_file
                      ]),  # remove both None and empty strings
        msglogger.logdir,
        gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    if args.evaluate:
        args.deterministic = True
    if args.deterministic:
        distiller.set_deterministic(
            args.seed)  # For experiment reproducability
    else:
        if args.seed is not None:
            distiller.set_seed(args.seed)
        # Turn on CUDNN benchmark mode for best performance. This is usually "safe" for image
        # classification models, as the input sizes don't change during the run
        # See here: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/3
        cudnn.benchmark = True

    start_epoch = 0
    ending_epoch = args.epochs
    perf_scores_history = []

    if args.cpu or not torch.cuda.is_available():
        # Set GPU index to -1 if using CPU
        args.device = 'cpu'
        args.gpus = -1
    else:
        args.device = 'cuda'
        if args.gpus is not None:
            # args.gpus arrives as a comma-separated string; convert to ints
            # and validate each requested device exists.
            try:
                args.gpus = [int(s) for s in args.gpus.split(',')]
            except ValueError:
                raise ValueError(
                    'ERROR: Argument --gpus must be a comma-separated list of integers only'
                )
            available_gpus = torch.cuda.device_count()
            for dev_id in args.gpus:
                if dev_id >= available_gpus:
                    raise ValueError(
                        'ERROR: GPU device ID {0} requested, but only {1} devices available'
                        .format(dev_id, available_gpus))
            # Set default device in case the first one on the list != 0
            torch.cuda.set_device(args.gpus[0])

    # Infer the dataset from the model name
    # TODO
    args.dataset = 'coco'
    # args.num_classes = 21  # wc -l ~/data/VOC2012/voc-model-labels.txt

    if args.load_vgg19 and args.arch != 'vgg19':
        raise ValueError(
            '``load_vgg19`` should be set only when vgg19 is used')

    model = create_pose_estimation_model(args.pretrained,
                                         args.dataset,
                                         args.arch,
                                         load_vgg19=args.load_vgg19,
                                         parallel=not args.load_serialized,
                                         device_ids=args.gpus)
    compression_scheduler = None
    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board. PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # <editor-fold desc=">>> Load Model">
    # We can optionally resume from a checkpoint
    optimizer = None
    if args.resumed_checkpoint_path:
        # Full resume: restores scheduler, optimizer and epoch counter too.
        model, compression_scheduler, optimizer, start_epoch = apputils.load_checkpoint(
            model, args.resumed_checkpoint_path, model_device=args.device)
    elif args.load_model_path:
        # Lean load: weights only.
        model = apputils.load_lean_checkpoint(model,
                                              args.load_model_path,
                                              model_device=args.device)
    if args.reset_optimizer:
        start_epoch = 0
        if optimizer is not None:
            optimizer = None
            msglogger.info(
                '\nreset_optimizer flag set: Overriding resumed optimizer and resetting epoch count to 0'
            )
    # </editor-fold>

    # Define loss function (criterion)
    # get_loss(saved_for_loss, heat_temp, heat_weight,vec_temp, vec_weight)
    criterion = {
        'shufflenetv2': shufflenetv2_get_loss,
        'vgg19': vgg19_get_loss,
        'hourglass': hourglass_get_loss,
    }[args.arch]

    if optimizer is None:
        # No optimizer came from a checkpoint: build a fresh SGD over the
        # trainable parameters only.
        trainable_vars = [
            param for param in model.parameters() if param.requires_grad
        ]
        optimizer = torch.optim.SGD(trainable_vars,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
        msglogger.info('Optimizer Type: %s', type(optimizer))
        msglogger.info('Optimizer Args: %s', optimizer.defaults)

    # TODO: load lr_scheduler
    lr_scheduler = ReduceLROnPlateau(optimizer,
                                     mode='min',
                                     factor=0.8,
                                     patience=5,
                                     verbose=True,
                                     threshold=0.0001,
                                     threshold_mode='rel',
                                     cooldown=3,
                                     min_lr=0,
                                     eps=1e-08)

    if args.AMC:
        return automated_deep_compression(model, criterion, optimizer,
                                          pylogger, args)
    if args.greedy:
        return greedy(model, criterion, optimizer, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        for summary in args.summary:
            distiller.model_summary(model, summary, args.dataset)
        return

    if args.export_onnx is not None:
        return distiller.export_img_classifier_to_onnx(model,
                                                       os.path.join(
                                                           msglogger.logdir,
                                                           args.export_onnx),
                                                       args.dataset,
                                                       add_softmax=True,
                                                       verbose=False)

    if args.qe_calibration:
        return acts_quant_stats_collection(model, criterion, pylogger, args)

    if args.activation_histograms:
        return acts_histogram_collection(model, criterion, pylogger, args)

    print('Building activations_collectors...')
    activations_collectors = create_activation_stats_collectors(
        model, *args.activation_stats)

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch. The default dataset is ImageNet, but if args.arch contains the
    # substring "_cifar", then cifar10 is used.
    print('Loading data...')
    train_loader, val_loader, test_loader, _ = load_data(args)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler),
                   len(test_loader.sampler))

    if args.sensitivity is not None:
        sensitivities = np.arange(args.sensitivity_range[0],
                                  args.sensitivity_range[1],
                                  args.sensitivity_range[2])
        return sensitivity_analysis(model, criterion, test_loader, pylogger,
                                    args, sensitivities)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger,
                              activations_collectors, args,
                              compression_scheduler)

    if args.compress:
        # The main use-case for this sample application is CNN compression. Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(
            model, optimizer, args.compress, compression_scheduler,
            (start_epoch - 1) if args.resumed_checkpoint_path else None)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.to(args.device)
    elif compression_scheduler is None:
        compression_scheduler = distiller.CompressionScheduler(model)

    if args.thinnify:
        # zeros_mask_dict = distiller.create_model_masks_dict(model)
        assert args.resumed_checkpoint_path is not None, \
            "You must use --resume-from to provide a checkpoint file to thinnify"
        distiller.remove_filters(model,
                                 compression_scheduler.zeros_mask_dict,
                                 args.arch,
                                 args.dataset,
                                 optimizer=None)
        apputils.save_checkpoint(0,
                                 args.arch,
                                 model,
                                 optimizer=None,
                                 scheduler=compression_scheduler,
                                 name="{}_thinned".format(
                                     args.resumed_checkpoint_path.replace(
                                         ".pth.tar", "")),
                                 dir=msglogger.logdir)
        print(
            "Note: your model may have collapsed to random inference, so you may want to fine-tune"
        )
        return

    if start_epoch >= ending_epoch:
        msglogger.error(
            'epoch count is too low, starting epoch is {} but total epochs set to {}'
            .format(start_epoch, ending_epoch))
        raise ValueError('Epochs parameter is too low. Nothing to do.')

    for epoch in range(start_epoch, ending_epoch):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            # NOTE(review): `total_loss` is never assigned in this function, so
            # unless a module-level `total_loss` exists, this raises NameError
            # from the second epoch on.  It likely should hold the previous
            # epoch's validation `loss` (10**6 is the first-epoch sentinel).
            compression_scheduler.on_epoch_begin(
                epoch,
                metrics=(total_loss if (epoch != start_epoch) else 10**6))

        # Train for one epoch
        with collectors_context(activations_collectors["train"]) as collectors:
            # NOTE(review): train() returns the epoch's mean loss, discarded here.
            train(train_loader,
                  model,
                  criterion,
                  optimizer,
                  epoch,
                  compression_scheduler,
                  loggers=[tflogger, pylogger],
                  args=args)
            distiller.log_weights_sparsity(model,
                                           epoch,
                                           loggers=[tflogger, pylogger])
            distiller.log_activation_statsitics(
                epoch,
                "train",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            if args.masks_sparsity:
                msglogger.info(
                    distiller.masks_sparsity_tbl_summary(
                        model, compression_scheduler))

        # evaluate on validation set
        with collectors_context(activations_collectors["valid"]) as collectors:
            loss = validate(val_loader, model, criterion, [pylogger], args,
                            epoch)
            distiller.log_activation_statsitics(
                epoch,
                "valid",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            save_collectors_data(collectors, msglogger.logdir)

        # Reduce LR when the validation loss plateaus.
        lr_scheduler.step(loss)

        stats = ('Performance/Validation/', OrderedDict([('Loss', loss)]))
        distiller.log_training_progress(stats,
                                        None,
                                        epoch,
                                        steps_completed=0,
                                        total_steps=1,
                                        log_freq=1,
                                        loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        # Update the list of top scores achieved so far, and save the checkpoint
        update_training_scores_history(perf_scores_history, model, loss,
                                       epoch, args.num_best_scores)
        is_best = epoch == perf_scores_history[0].epoch
        checkpoint_extras = {
            'current_loss': loss,
            'best_loss': perf_scores_history[0].loss,
            'best_epoch': perf_scores_history[0].epoch
        }
        apputils.save_checkpoint(epoch,
                                 args.arch,
                                 model,
                                 optimizer=optimizer,
                                 scheduler=compression_scheduler,
                                 extras=checkpoint_extras,
                                 is_best=is_best,
                                 name=args.name,
                                 dir=msglogger.logdir)

    # Finally run results on the test set
    test(test_loader,
         model,
         criterion, [pylogger],
         activations_collectors,
         args=args)
def step(self, pruning_action):
    """Take a step, given an action.

    The action represents the desired sparsity for the "current" layer
    (i.e. the percentage of weights to remove).
    This function is invoked by the Agent.

    Returns the usual Gym-style tuple:
        (observation, reward, done, info)
    """
    if self.current_state_id == 0:
        msglogger.info("+" + "-" * 50 + "+")
        msglogger.info("Episode %d is starting" % self.episode)

    pruning_action = float(pruning_action[0])
    msglogger.debug(
        "env.step - current_state_id=%d (%s) episode=%d action=%.2f" %
        (self.current_state_id, self.current_layer().name, self.episode,
         pruning_action))
    # Record the raw (pre-clipping) action the agent proposed.
    self.agent_action_history.append(pruning_action)

    if is_using_continuous_action_space(self.amc_cfg.agent_algo):
        if self.amc_cfg.agent_algo == "ClippedPPO-continuous":
            # We need to map PPO's infinite action-space (actions sampled from a Gaussian) to our action-space.
            pruning_action = adjust_ppo_output(pruning_action,
                                               self.action_high,
                                               self.action_low)
        else:
            pruning_action = np.clip(pruning_action, self.action_low,
                                     self.action_high)
    else:
        # Divide the action space into 10 discrete levels (0%, 10%, 20%,....90% sparsity)
        pruning_action = pruning_action / 10
    msglogger.debug(
        "\tAgent clipped pruning_action={}".format(pruning_action))

    if self.amc_cfg.action_constrain_fn is not None:
        # Optional user hook that further restricts the action (e.g. to keep
        # the remaining budget feasible).
        pruning_action = self.amc_cfg.action_constrain_fn(
            self, pruning_action=pruning_action)
        msglogger.debug(
            "Constrained pruning_action={}".format(pruning_action))

    # Calculate the final compression rate
    total_macs_before, _ = self.net_wrapper.get_resources_requirements()
    layer_macs = self.net_wrapper.layer_macs(self.current_layer())
    msglogger.debug("\tlayer_macs={:.2f}".format(layer_macs /
                                                 self.original_model_macs))
    msglogger.debug("\tremoved_macs={:.2f}".format(self.removed_macs_pct))
    msglogger.debug("\trest_macs={:.2f}".format(self.rest_macs()))
    msglogger.debug("\tcurrent_layer_id = %d" % self.current_layer_id)
    self.current_state_id += 1
    if pruning_action > 0:
        # Actually prune; remove_structures returns the fraction that was
        # really removed (may differ from the request due to group sizes).
        pruning_action = self.net_wrapper.remove_structures(
            self.current_layer_id,
            fraction_to_prune=pruning_action,
            prune_what=self.amc_cfg.pruning_pattern,
            prune_how=self.amc_cfg.pruning_method,
            group_size=self.amc_cfg.group_size,
            apply_thinning=self.episode_is_done(),
            ranking_noise=self.amc_cfg.ranking_noise)
            #random_state=self.random_state)
    else:
        pruning_action = 0

    self.action_history.append(pruning_action)
    total_macs_after_act, total_nnz_after_act = self.net_wrapper.get_resources_requirements(
    )
    layer_macs_after_action = self.net_wrapper.layer_macs(
        self.current_layer())

    # Update the various counters after taking the step
    self.removed_macs += (total_macs_before - total_macs_after_act)

    msglogger.debug("\tactual_action={}".format(pruning_action))
    msglogger.debug(
        "\tlayer_macs={} layer_macs_after_action={} removed now={}".format(
            layer_macs, layer_macs_after_action,
            (layer_macs - layer_macs_after_action)))
    msglogger.debug("\tself._removed_macs={}".format(self.removed_macs))
    # Sanity: the layer's MAC reduction must match the applied action.
    # NOTE(review): math.isclose with default rel_tol=1e-9 — confirm this is
    # not too strict for group-rounded pruning fractions.
    assert math.isclose(layer_macs_after_action / layer_macs,
                        1 - pruning_action)

    stats = ('Performance/Validation/',
             OrderedDict([('requested_action', pruning_action)]))
    distiller.log_training_progress(
        stats,
        None,
        self.episode,
        steps_completed=self.current_state_id,
        total_steps=self.net_wrapper.num_pruned_layers(),
        log_freq=1,
        loggers=[self.tflogger])

    if self.episode_is_done():
        # End of episode: evaluate the fully-pruned network and close out.
        msglogger.info("Episode %d is ending" % self.episode)
        observation = self.get_final_obs()
        reward, top1, top5, vloss = self.compute_reward(
            total_macs_after_act, total_nnz_after_act)
        self.finalize_episode(reward, (top1, top5, vloss),
                              total_macs_after_act, total_nnz_after_act,
                              self.action_history, self.agent_action_history)
        self.episode += 1
    else:
        # Advance to the next prunable layer.
        self.current_layer_id = self.net_wrapper.model_metadata.pruned_idxs[
            self.current_state_id]
        # Optional periodic fine-tuning between steps.
        if self.amc_cfg.ft_frequency is not None and self.current_state_id % self.amc_cfg.ft_frequency == 0:
            self.net_wrapper.train(1, self.episode)
        observation = self.get_obs()
        # Optionally emit intermediate rewards instead of only at episode end.
        if self.amc_cfg.reward_frequency is not None and self.current_state_id % self.amc_cfg.reward_frequency == 0:
            reward, top1, top5, vloss = self.compute_reward(
                total_macs_after_act, total_nnz_after_act)
        else:
            reward = 0

    self.prev_action = pruning_action
    if self.episode_is_done():
        normalized_macs = total_macs_after_act / self.original_model_macs * 100
        info = {"accuracy": top1, "compress_ratio": normalized_macs}
        if self.amc_cfg.protocol == "mac-constrained":
            # Sanity check (special case only for "mac-constrained")
            assert self.removed_macs_pct >= 1 - self.amc_cfg.target_density - 0.002  # 0.01
            pass
    else:
        info = {}

    return observation, reward, self.episode_is_done(), info
def train(train_loader, model, criterion, optimizer, epoch, compression_scheduler, loggers, args):
    """Training loop for one epoch of pose-estimation training.

    Runs the forward/backward pass over ``train_loader``, lets the Distiller
    ``compression_scheduler`` (if any) participate in the mini-batch
    lifecycle, and logs progress every ``args.print_freq`` steps.

    Returns:
        Mean of the overall (possibly regularized) loss over the epoch.
    """
    batch_time = tnt.AverageValueMeter()
    data_time = tnt.AverageValueMeter()
    # Fix: the original bound `losses` to a single AverageValueMeter and then
    # indexed it like a dict (losses['overall_loss']) in BOTH branches below,
    # raising TypeError on the first batch.  Use a dict of named meters, the
    # same layout the classification training loops in this file use.
    losses = OrderedDict([('overall_loss', tnt.AverageValueMeter())])
    # Per-component stats reported by the pose criterion via saved_for_log.
    meter_dict = {
        'paf': tnt.AverageValueMeter(),
        'heatmap': tnt.AverageValueMeter(),
        'max_ht': tnt.AverageValueMeter(),
        'min_ht': tnt.AverageValueMeter(),
        'max_paf': tnt.AverageValueMeter(),
        'min_paf': tnt.AverageValueMeter()
    }

    total_samples = len(train_loader.sampler)
    batch_size = train_loader.batch_size
    steps_per_epoch = math.ceil(total_samples / batch_size)
    msglogger.info('Training epoch: %d samples (%d per mini-batch)',
                   total_samples, batch_size)

    model.train()
    end = time.time()
    for train_step, (inputs, heatmap_target, heat_mask, paf_target,
                     paf_mask) in enumerate(train_loader):
        data_time.add(time.time() - end)
        inputs = inputs.to(args.device)
        heatmap_target = heatmap_target.to(args.device)
        heat_mask = heat_mask.to(args.device)
        paf_target = paf_target.to(args.device)
        paf_mask = paf_mask.to(args.device)

        # Execute the forward phase, compute the output and measure loss
        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(epoch, train_step,
                                                     steps_per_epoch,
                                                     optimizer)
        _, saved_for_loss = model(inputs)
        # criterion: get_loss(saved_for_loss, heat_target, heat_mask, paf_target, paf_mask)
        total_loss, saved_for_log = criterion(saved_for_loss, heatmap_target,
                                              heat_mask, paf_target, paf_mask)
        for name in meter_dict:
            meter_dict[name].add(saved_for_log[name], inputs.size(0))

        if compression_scheduler:
            # Before running the backward phase, we allow the scheduler to
            # modify the loss (e.g. add regularization loss).
            agg_loss = compression_scheduler.before_backward_pass(
                epoch, train_step, steps_per_epoch, total_loss,
                optimizer=optimizer, return_loss_components=True)
            # Fix: back-propagate the aggregated loss.  The original assigned
            # it to a throwaway `loss` and still called total_loss.backward(),
            # silently dropping the scheduler's regularization terms.
            total_loss = agg_loss.overall_loss
            losses['overall_loss'].add(total_loss.item())
            for lc in agg_loss.loss_components:
                if lc.name not in losses:
                    losses[lc.name] = tnt.AverageValueMeter()
                losses[lc.name].add(lc.value.item())
        else:
            losses['overall_loss'].add(total_loss.item())

        # Compute the gradient and do SGD step
        optimizer.zero_grad()
        total_loss.backward()
        if compression_scheduler:
            compression_scheduler.before_parameter_optimization(
                epoch, train_step, steps_per_epoch, optimizer)
        optimizer.step()
        if compression_scheduler:
            compression_scheduler.on_minibatch_end(epoch, train_step,
                                                   steps_per_epoch, optimizer)

        batch_time.add(time.time() - end)
        steps_completed = (train_step + 1)
        if steps_completed % args.print_freq == 0:
            stats_dict = OrderedDict({
                'loss': losses['overall_loss'].mean,
                'LR': optimizer.param_groups[0]['lr'],
                'Time': batch_time.mean,
            })
            stats = ('Performance/Training/', stats_dict)
            params = model.named_parameters(
            ) if args.log_params_histograms else None
            distiller.log_training_progress(stats, params, epoch,
                                            steps_completed, steps_per_epoch,
                                            args.print_freq, loggers)
        end = time.time()
    return losses['overall_loss'].mean
def train(train_loader, model, original_model, criterion, optimizer, epoch, compression_scheduler, loggers, args):
    """Training loop for one epoch.

    Experimental variant: instead of training against ground-truth labels,
    `model` (the quantized/compressed network) is trained to match the
    output distribution of `original_model` (a frozen full-precision copy)
    using a PoissonNLLLoss between softmaxed outputs.  `criterion` is only
    used on the early-exit path.  Also writes per-log-interval accuracy rows
    via the module-level `csvlogger`.
    """
    losses = OrderedDict([(OVERALL_LOSS_KEY, tnt.AverageValueMeter()),
                          (OBJECTIVE_LOSS_KEY, tnt.AverageValueMeter())])

    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))
    batch_time = tnt.AverageValueMeter()
    data_time = tnt.AverageValueMeter()

    # For Early Exit, we define statistics for each exit
    # So exiterrors is analogous to classerr for the non-Early Exit case
    if args.earlyexit_lossweights:
        args.exiterrors = []
        for exitnum in range(args.num_exits):
            args.exiterrors.append(
                tnt.ClassErrorMeter(accuracy=True, topk=(1, 5)))

    total_samples = len(train_loader.sampler)
    batch_size = train_loader.batch_size
    steps_per_epoch = math.ceil(total_samples / batch_size)
    msglogger.info('Training epoch: %d samples (%d per mini-batch)',
                   total_samples, batch_size)
    # Optionally train on only a fraction of the epoch (args.partial_epoch).
    epoch_frac = args.partial_epoch
    steps_per_frac_epoch = math.ceil((total_samples * epoch_frac) / batch_size)

    # Switch to train mode
    model.train()
    end = time.time()
    for train_step, (inputs, target) in enumerate(train_loader):
        # Measure data loading time
        data_time.add(time.time() - end)
        # NOTE(review): device hard-coded to 'cuda'; args.device is ignored.
        inputs, target = inputs.to('cuda'), target.to('cuda')
        if train_step == steps_per_frac_epoch:
            break  # partial-epoch early stop

        # Execute the forward phase, compute the output and measure loss
        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(epoch, train_step,
                                                     steps_per_epoch,
                                                     optimizer)

        if args.kd_policy is None:
            torch.set_printoptions(precision=10)
            #model.freeze_partial([0, 4])
            #model.module.freeze_partial([0, 3])
            #print("Quantized")
            # Forward through the quantized model; its output is the "student"
            # tensor to be matched against the frozen original model below.
            output = model(inputs)
            #new_tensor = model.module.act_conv2
            #print(model)
            #new_tensor = model.act_conv2
            new_tensor = output
            #print(new_tensor)
            #model.module.freeze()
            #model.module.fc1.weight.requires_grad = False
            #model.module.fc2.weight.requires_grad = False
            #original_model.module.freeze()
            # Freeze the reference model so it provides fixed targets.
            original_model.freeze()
            #output_new = original_model.original_forward(inputs)
            #output_new = original_model.module.original_forward(inputs)
            #print("Original")
            output_new = original_model(inputs)
            #old_tensor = original_model.module.act_conv2
            #old_tensor = original_model.act_conv2
            old_tensor = output_new
            #print(torch.sum(model.module.fc3.weight), torch.sum(model.module.fc2.weight), torch.sum(model.module.fc1.weight), torch.sum(model.module.conv2.weight), torch.sum(model.module.conv1.weight))
            #print(torch.sum(original_model.module.fc2.weight), torch.sum(original_model.module.fc1.weight), torch.sum(original_model.module.conv2.weight), torch.sum(original_model.module.conv1.weight))
            #print(set(model.module.conv2.weight.data.cpu().numpy().ravel()))
            #print("Difference")
            #print(old_tensor - new_tensor)
        else:
            output = args.kd_policy.forward(inputs)

        if not args.earlyexit_lossweights:
            #loss = criterion(output, target)
            # Match the student's softmax to the teacher's log-softmax.
            # NOTE(review): PoissonNLLLoss(input, target) expects log-rates by
            # default; passing softmax/log-softmax pairs this way is unusual —
            # confirm this is the intended objective (several alternatives are
            # left commented out).
            new_criterion = nn.PoissonNLLLoss(
            )  #nn.PoissonNLLLoss() #nn.L1Loss() #torch.nn.KLDivLoss() #torch.nn.MSELoss(size_average = False)
            old_tensor = torch.nn.functional.log_softmax(old_tensor)
            new_tensor = torch.nn.functional.softmax(new_tensor)
            loss = new_criterion(new_tensor, old_tensor)
            #loss = torch.sum(new_tensor - old_tensor)
            #print('loss >>>>>> ', loss)
            # Measure accuracy and record loss
            classerr.add(output.data, target)
        else:
            # Measure accuracy and record loss
            loss = earlyexit_loss(output, target, criterion, args)
        losses[OBJECTIVE_LOSS_KEY].add(loss.item())

        if compression_scheduler:
            # Before running the backward phase, we allow the scheduler to modify the loss
            # (e.g. add regularization loss)
            agg_loss = compression_scheduler.before_backward_pass(
                epoch, train_step, steps_per_epoch, loss,
                optimizer=optimizer, return_loss_components=True)
            loss = agg_loss.overall_loss
            losses[OVERALL_LOSS_KEY].add(loss.item())
            # Track each named loss component the scheduler reports.
            for lc in agg_loss.loss_components:
                if lc.name not in losses:
                    losses[lc.name] = tnt.AverageValueMeter()
                losses[lc.name].add(lc.value.item())

        # Compute the gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if compression_scheduler:
            compression_scheduler.on_minibatch_end(epoch, train_step,
                                                   steps_per_epoch, optimizer)

        # measure elapsed time
        batch_time.add(time.time() - end)
        steps_completed = (train_step + 1)

        if steps_completed % args.print_freq == 0:
            # Log some statistics
            errs = OrderedDict()
            if not args.earlyexit_lossweights:
                errs['Top1'] = classerr.value(1)
                # Append (epoch, step, top1, loss) to the module-level CSV log.
                csvlogger.writerow(
                    [epoch, steps_completed, classerr.value(1), loss])
                errs['Top5'] = classerr.value(5)
            else:
                # for Early Exit case, the Top1 and Top5 stats are computed for each exit.
                for exitnum in range(args.num_exits):
                    errs['Top1_exit' + str(exitnum)] = args.exiterrors[exitnum].value(1)
                    errs['Top5_exit' + str(exitnum)] = args.exiterrors[exitnum].value(5)

            stats_dict = OrderedDict()
            for loss_name, meter in losses.items():
                stats_dict[loss_name] = meter.mean
            stats_dict.update(errs)
            stats_dict['LR'] = optimizer.param_groups[0]['lr']
            stats_dict['Time'] = batch_time.mean
            stats = ('Peformance/Training/', stats_dict)
            params = model.named_parameters(
            ) if args.log_params_histograms else None
            distiller.log_training_progress(stats, params, epoch,
                                            steps_completed, steps_per_epoch,
                                            args.print_freq, loggers)
        end = time.time()
collector=collectors["sparsity"]) if args.masks_sparsity: msglogger.info(distiller.masks_sparsity_tbl_summary(model, compression_scheduler)) # evaluate on validation set with collectors_context(activations_collectors["valid"]) as collectors: top1, top5, vloss = validate(val_loader, model, criterion, [pylogger], args, epoch) distiller.log_activation_statsitics(epoch, "valid", loggers=[tflogger], collector=collectors["sparsity"]) save_collectors_data(collectors, msglogger.logdir) stats = ('Performance/Validation/', OrderedDict([('Loss', vloss), ('Top1', top1), ('Top5', top5)])) distiller.log_training_progress(stats, None, epoch, steps_completed=0, total_steps=1, log_freq=1, loggers=[tflogger]) if compression_scheduler: compression_scheduler.on_epoch_end(epoch, optimizer) # Update the list of top scores achieved so far, and save the checkpoint update_training_scores_history(perf_scores_history, model, top1, top5, epoch, args.num_best_scores) is_best = epoch == perf_scores_history[0].epoch apputils.save_checkpoint(epoch, args.arch, model, optimizer, compression_scheduler, perf_scores_history[0].top1, is_best, args.name, msglogger.logdir) # Finally run results on the test set test(test_loader, model, criterion, [pylogger], activations_collectors, args=args) OVERALL_LOSS_KEY = 'Overall Loss'
def train(epoch, optimizer, compression_scheduler=None):
    """Train the language model for one epoch.

    Iterates over ``train_data`` in BPTT-sized slices, invoking the
    compression scheduler's per-minibatch callbacks around the
    forward/backward/step sequence.

    Args:
        epoch: current epoch index (used for scheduler callbacks and logging).
        optimizer: the optimizer whose ``step()`` is applied each minibatch.
        compression_scheduler: optional distiller CompressionScheduler; when
            given, it may modify the loss before backward (e.g. add a
            regularization term).

    Uses module-level globals: ``model``, ``train_data``, ``args``, ``corpus``,
    ``criterion``, ``msglogger``, ``tflogger`` — assumed set up by the caller.
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_samples = train_data.size(0)
    steps_per_epoch = math.ceil(total_samples / args.bptt)
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    # The line below was fixed as per: https://github.com/pytorch/examples/issues/214
    for batch, i in enumerate(range(0, train_data.size(0), args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(epoch, minibatch_id=batch,
                                                     minibatches_per_epoch=steps_per_epoch)
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        if compression_scheduler:
            # Before running the backward phase, we allow the scheduler to modify the loss
            # (e.g. add regularization loss)
            loss = compression_scheduler.before_backward_pass(epoch, minibatch_id=batch,
                                                              minibatches_per_epoch=steps_per_epoch,
                                                              loss=loss,
                                                              return_loss_components=False)
        optimizer.zero_grad()
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        total_loss += loss.item()
        if compression_scheduler:
            compression_scheduler.on_minibatch_end(epoch, minibatch_id=batch,
                                                   minibatches_per_epoch=steps_per_epoch)
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            lr = optimizer.param_groups[0]['lr']
            msglogger.info(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.4f} | ms/batch {:5.2f} '
                '| loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            # Reset the running counters before logging to TensorBoard; the
            # values to log were already captured in cur_loss/elapsed above.
            total_loss = 0
            start_time = time.time()
            # Fixed typo: was 'Peformance/Training/' (other training loops in
            # this file use the correctly spelled group name).
            stats = ('Performance/Training/', OrderedDict([
                ('Loss', cur_loss),
                ('Perplexity', math.exp(cur_loss)),
                ('LR', lr),
                ('Batch Time', elapsed * 1000)])
            )
            steps_completed = batch + 1
            distiller.log_training_progress(stats, model.named_parameters(),
                                            epoch, steps_completed,
                                            steps_per_epoch, args.log_interval,
                                            [tflogger])
def step(self, pruning_action):
    """Take a step, given an action.

    The action represents the desired sparsity for the current layer.
    This function is invoked by the Agent.

    Args:
        pruning_action: requested sparsity — for continuous-action agents an
            array whose first element is clipped to [action_low, action_high];
            for discrete agents an integer level that is divided by 10.

    Returns:
        (observation, reward, done, info) — the usual gym-style 4-tuple.
    """
    msglogger.info("env.step - current_layer_id={} episode={}".format(
        self.current_layer_id, self.episode))
    msglogger.info("\tAgent pruning_action={}".format(pruning_action))
    if is_using_continuous_action_space(self.amc_cfg.agent_algo):
        pruning_action = np.clip(pruning_action[0], self.action_low,
                                 self.action_high)
    else:
        # Divide the action space into 10 discrete levels (0%, 10%, 20%,....90% sparsity)
        pruning_action = pruning_action / 10
    msglogger.info(
        "\tAgent clipped pruning_action={}".format(pruning_action))
    self.agent_action_history.append(pruning_action)
    # Optionally constrain the action (e.g. to meet a resource budget).
    if self.amc_cfg.action_constrain_fn is not None:
        pruning_action = self.amc_cfg.action_constrain_fn(
            self, pruning_action=pruning_action)
        msglogger.info(
            "Constrained pruning_action={}".format(pruning_action))
    # Snapshot MACs before pruning so we can account for what was removed.
    total_macs_before, _ = self.net_wrapper.get_model_resources_requirements(
        self.model)
    layer_macs = self.net_wrapper.get_layer_macs(self.current_layer())
    msglogger.info("\tlayer_macs={:.2f}".format(layer_macs /
                                                self.dense_model_macs))
    msglogger.info("\tremoved_macs={:.2f}".format(self.removed_macs()))
    msglogger.info("\trest_macs={:.2f}".format(self.rest_macs()))
    if pruning_action > 0:
        # remove_structures returns the fraction actually pruned, which may
        # differ from the request due to rounding to whole filters.
        pruning_action = self.net_wrapper.remove_structures(
            self.current_layer_id,
            fraction_to_prune=pruning_action,
            prune_what="filters")
    else:
        pruning_action = 0
    self.action_history.append(pruning_action)
    total_macs_after, _ = self.net_wrapper.get_model_resources_requirements(
        self.model)
    layer_macs_after_action = self.net_wrapper.get_layer_macs(
        self.current_layer())
    # Update the various counters after taking the step
    self.current_layer_id += 1
    self._removed_macs += (total_macs_before - total_macs_after)
    msglogger.info("actual_action={}".format(pruning_action))
    msglogger.info(
        "layer_macs={} layer_macs_after_action={} removed now={}".format(
            layer_macs, layer_macs_after_action,
            (layer_macs - layer_macs_after_action)))
    msglogger.info("self._removed_macs={}".format(self._removed_macs))
    # Sanity check: the layer's MACs must shrink by exactly the pruned fraction.
    assert math.isclose(layer_macs_after_action / layer_macs,
                        1 - pruning_action)
    # Fixed typo: was 'Peformance/Validation/'.
    stats = ('Performance/Validation/',
             OrderedDict([('requested_action', pruning_action)]))
    distiller.log_training_progress(stats, None, self.episode,
                                    steps_completed=self.current_layer_id,
                                    total_steps=self.amc_cfg.conv_cnt,
                                    log_freq=1, loggers=[self.tflogger])
    if self.episode_is_done():
        msglogger.info("Episode is ending")
        observation = self.get_final_obs()
        reward, top1, total_macs, total_nnz = self.compute_reward()
        normalized_macs = total_macs / self.dense_model_macs * 100
        normalized_nnz = total_nnz / self.dense_model_size * 100
        self.finalize_episode(top1, reward, total_macs, normalized_macs,
                              normalized_nnz, self.action_history,
                              self.agent_action_history)
        self.episode += 1
    else:
        observation = self.get_obs()
        # Intermediate rewards are computed only every reward_frequency layers;
        # otherwise the step yields zero reward.
        if self.amc_cfg.reward_frequency > 0 and self.current_layer_id % self.amc_cfg.reward_frequency == 0:
            reward, top1, total_macs, total_nnz = self.compute_reward(
                False)
        else:
            reward = 0
    self.prev_action = pruning_action
    info = {}
    return observation, reward, self.episode_is_done(), info
def main():
    """Entry point: configure logging/devices, build the model, then dispatch
    to summary / sensitivity / evaluation / ADC, or run the training loop.

    Uses module-level globals: ``parser``, ``script_dir``, ``module_path``,
    and the distiller/apputils helpers — assumed imported at file scope.
    """
    global msglogger
    check_pytorch_version()
    args = parser.parse_args()
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    msglogger = apputils.config_pylogger(
        os.path.join(script_dir, 'logging.conf'), args.name, args.output_dir)

    # Log various details about the execution environment. It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(sys.argv, gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    start_epoch = 0
    best_top1 = 0

    if args.deterministic:
        # Experiment reproducibility is sometimes important. Pete Warden expounded about this
        # in his blog: https://petewarden.com/2018/03/19/the-machine-learning-reproducibility-crisis/
        # In Pytorch, support for deterministic execution is still a bit clunky.
        if args.workers > 1:
            msglogger.error(
                'ERROR: Setting --deterministic requires setting --workers/-j to 0 or 1'
            )
            exit(1)
        # Use a well-known seed, for repeatability of experiments
        torch.manual_seed(0)
        random.seed(0)
        np.random.seed(0)
        cudnn.deterministic = True
    else:
        # This issue: https://github.com/pytorch/pytorch/issues/3659
        # Implies that cudnn.benchmark should respect cudnn.deterministic, but empirically we see that
        # results are not re-produced when benchmark is set. So enabling only if deterministic mode disabled.
        cudnn.benchmark = True

    if args.gpus is not None:
        try:
            args.gpus = [int(s) for s in args.gpus.split(',')]
        except ValueError:
            msglogger.error(
                'ERROR: Argument --gpus must be a comma-separated list of integers only'
            )
            exit(1)
        available_gpus = torch.cuda.device_count()
        for dev_id in args.gpus:
            if dev_id >= available_gpus:
                msglogger.error(
                    'ERROR: GPU device ID {0} requested, but only {1} devices available'
                    .format(dev_id, available_gpus))
                exit(1)
        # Set default device in case the first one on the list != 0
        torch.cuda.set_device(args.gpus[0])

    # Infer the dataset from the model name
    args.dataset = 'cifar10' if 'cifar' in args.arch else 'imagenet'

    # Create the model
    model = create_model(args.pretrained, args.dataset, args.arch,
                         device_ids=args.gpus)
    compression_scheduler = None

    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # We can optionally resume from a checkpoint
    if args.resume:
        model, compression_scheduler, start_epoch = apputils.load_checkpoint(
            model, chkpt_file=args.resume)

    # Define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    msglogger.info('Optimizer Type: %s', type(optimizer))
    msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.ADC:
        HAVE_GYM_INSTALLED = False
        if not HAVE_GYM_INSTALLED:
            raise ValueError(
                "ADC is currently experimental and uses non-public Coach features"
            )
        import examples.automated_deep_compression.ADC as ADC
        train_loader, val_loader, test_loader, _ = apputils.load_data(
            args.dataset, os.path.expanduser(args.data), args.batch_size,
            args.workers, args.validation_size, args.deterministic)
        validate_fn = partial(validate,
                              val_loader=test_loader,
                              criterion=criterion,
                              loggers=[pylogger],
                              print_freq=args.print_freq)
        save_checkpoint_fn = partial(apputils.save_checkpoint,
                                     arch=args.arch,
                                     name='adc')
        ADC.do_adc(model, args.dataset, args.arch, val_loader, validate_fn,
                   save_checkpoint_fn)
        exit()

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        which_summary = args.summary
        if which_summary.startswith('png'):
            apputils.draw_img_classifier_to_file(
                model, 'model.png', args.dataset,
                which_summary == 'png_w_params')
        else:
            distiller.model_summary(model, which_summary, args.dataset)
        exit()

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch.  The default dataset is ImageNet, but if args.arch contains the
    # substring "_cifar", then cifar10 is used.
    train_loader, val_loader, test_loader, _ = apputils.load_data(
        args.dataset, os.path.expanduser(args.data), args.batch_size,
        args.workers, args.validation_size, args.deterministic)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler),
                   len(test_loader.sampler))

    activations_sparsity = None
    if args.activation_stats:
        # If your model has ReLU layers, then those layers have sparse activations.
        # ActivationSparsityCollector will collect information about this sparsity.
        # WARNING! Enabling activation sparsity collection will significantly slow down training!
        activations_sparsity = ActivationSparsityCollector(model)

    if args.sensitivity is not None:
        # This sample application can be invoked to execute Sensitivity Analysis on your
        # model.  The output is saved to CSV and PNG.
        msglogger.info("Running sensitivity tests")
        test_fnc = partial(test,
                           test_loader=test_loader,
                           criterion=criterion,
                           loggers=[pylogger],
                           print_freq=args.print_freq)
        which_params = [
            param_name for param_name, _ in model.named_parameters()
        ]
        sensitivity = distiller.perform_sensitivity_analysis(
            model,
            net_params=which_params,
            sparsities=np.arange(0.0, 0.95, 0.05),
            test_func=test_fnc,
            group=args.sensitivity)
        distiller.sensitivities_to_png(sensitivity, 'sensitivity.png')
        distiller.sensitivities_to_csv(sensitivity, 'sensitivity.csv')
        exit()

    if args.evaluate:
        # This sample application can be invoked to evaluate the accuracy of your model on
        # the test dataset.
        # You can optionally quantize the model to 8-bit integer before evaluation.
        # For example:
        # python3 compress_classifier.py --arch resnet20_cifar ../data.cifar10 -p=50 --resume=checkpoint.pth.tar --evaluate
        if args.quantize:
            model.cpu()
            quantizer = quantization.SymmetricLinearQuantizer(model, 8, 8)
            quantizer.prepare_model()
            model.cuda()
        top1, _, _ = test(test_loader, model, criterion, [pylogger],
                          args.print_freq)
        if args.quantize:
            checkpoint_name = 'quantized'
            # BUG FIX: was '_'.split(args.name, checkpoint_name), which calls
            # str.split with a string maxsplit and raises TypeError; the intent
            # is to join the two parts with an underscore.
            apputils.save_checkpoint(0,
                                     args.arch,
                                     model,
                                     optimizer=None,
                                     best_top1=top1,
                                     name='_'.join([args.name, checkpoint_name])
                                     if args.name else checkpoint_name,
                                     dir=msglogger.logdir)
        exit()

    if args.compress:
        # The main use-case for this sample application is CNN compression.  Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(model, optimizer,
                                                      args.compress)

    best_epoch = start_epoch
    for epoch in range(start_epoch, start_epoch + args.epochs):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)

        # Train for one epoch
        train(train_loader,
              model,
              criterion,
              optimizer,
              epoch,
              compression_scheduler,
              loggers=[tflogger, pylogger],
              print_freq=args.print_freq,
              log_params_hist=args.log_params_histograms)
        distiller.log_weights_sparsity(model,
                                       epoch,
                                       loggers=[tflogger, pylogger])
        if args.activation_stats:
            distiller.log_activation_sparsity(epoch,
                                              loggers=[tflogger, pylogger],
                                              collector=activations_sparsity)

        # evaluate on validation set
        top1, top5, vloss = validate(val_loader, model, criterion, [pylogger],
                                     args.print_freq, epoch)
        # Fixed typo: was 'Peformance/Validation/'.
        stats = ('Performance/Validation/',
                 OrderedDict([('Loss', vloss), ('Top1', top1),
                              ('Top5', top5)]))
        distiller.log_training_progress(stats,
                                        None,
                                        epoch,
                                        steps_completed=0,
                                        total_steps=1,
                                        log_freq=1,
                                        loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        # remember best top1 and save checkpoint
        is_best = top1 > best_top1
        if is_best:
            best_epoch = epoch
            best_top1 = top1
        msglogger.info('==> Best validation Top1: %.3f   Epoch: %d',
                       best_top1, best_epoch)
        apputils.save_checkpoint(epoch, args.arch, model, optimizer,
                                 compression_scheduler, best_top1, is_best,
                                 args.name, msglogger.logdir)

    # Finally run results on the test set
    test(test_loader, model, criterion, [pylogger], args.print_freq)
def train(train_loader, model, criterion, optimizer, epoch,
          compression_scheduler, loggers, args):
    """Training-with-compression loop for one epoch.

    For each training step in epoch:
        compression_scheduler.on_minibatch_begin(epoch)
        output = model(input)
        loss = criterion(output, target)
        compression_scheduler.before_backward_pass(epoch)
        loss.backward()
        compression_scheduler.before_parameter_optimization(epoch)
        optimizer.step()
        compression_scheduler.on_minibatch_end(epoch)

    Returns the final (Top1, Top5) accuracy values and the overall-loss meter.
    """
    OVERALL_LOSS_KEY = 'Overall Loss'
    OBJECTIVE_LOSS_KEY = 'Objective Loss'

    # Running-average meters: objective loss, overall (possibly regularized)
    # loss, and later any per-component losses reported by the scheduler.
    losses = OrderedDict([(OVERALL_LOSS_KEY, tnt.AverageValueMeter()),
                          (OBJECTIVE_LOSS_KEY, tnt.AverageValueMeter())])

    classerr = tnt.ClassErrorMeter(accuracy=True, topk=(1, 5))
    batch_time = tnt.AverageValueMeter()
    data_time = tnt.AverageValueMeter()

    # For Early Exit, we define statistics for each exit
    # So exiterrors is analogous to classerr for the non-Early Exit case
    if early_exit_mode(args):
        args.exiterrors = []
        for exitnum in range(args.num_exits):
            args.exiterrors.append(tnt.ClassErrorMeter(accuracy=True, topk=(1, 5)))

    total_samples = len(train_loader.sampler)
    batch_size = train_loader.batch_size
    steps_per_epoch = math.ceil(total_samples / batch_size)
    msglogger.info('Training epoch: %d samples (%d per mini-batch)',
                   total_samples, batch_size)

    # Switch to train mode
    model.train()
    acc_stats = []
    end = time.time()
    for train_step, (inputs, target) in enumerate(train_loader):
        # Measure data loading time
        data_time.add(time.time() - end)
        inputs, target = inputs.to(args.device), target.to(args.device)

        # Execute the forward phase, compute the output and measure loss
        if compression_scheduler:
            compression_scheduler.on_minibatch_begin(epoch, train_step, steps_per_epoch, optimizer)

        # When knowledge distillation is enabled, the KD policy wraps the
        # forward pass (it runs both student and teacher internally).
        if not hasattr(args, 'kd_policy') or args.kd_policy is None:
            output = model(inputs)
        else:
            output = args.kd_policy.forward(inputs)

        if not early_exit_mode(args):
            loss = criterion(output, target)
            # Measure accuracy
            classerr.add(output.data, target)
            acc_stats.append([classerr.value(1), classerr.value(5)])
        else:
            # Measure accuracy and record loss
            loss = earlyexit_loss(output, target, criterion, args)
        # Record loss
        losses[OBJECTIVE_LOSS_KEY].add(loss.item())

        if compression_scheduler:
            # Before running the backward phase, we allow the scheduler to modify the loss
            # (e.g. add regularization loss)
            agg_loss = compression_scheduler.before_backward_pass(epoch, train_step, steps_per_epoch, loss,
                                                                  optimizer=optimizer, return_loss_components=True)
            loss = agg_loss.overall_loss
            losses[OVERALL_LOSS_KEY].add(loss.item())

            # Track each named loss component (e.g. regularizers) separately,
            # creating a meter on first sight of a component name.
            for lc in agg_loss.loss_components:
                if lc.name not in losses:
                    losses[lc.name] = tnt.AverageValueMeter()
                losses[lc.name].add(lc.value.item())
        else:
            losses[OVERALL_LOSS_KEY].add(loss.item())

        # Compute the gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        if compression_scheduler:
            compression_scheduler.before_parameter_optimization(epoch, train_step, steps_per_epoch, optimizer)
        optimizer.step()
        if compression_scheduler:
            compression_scheduler.on_minibatch_end(epoch, train_step, steps_per_epoch, optimizer)

        # measure elapsed time
        batch_time.add(time.time() - end)
        steps_completed = (train_step+1)

        if steps_completed % args.print_freq == 0:
            # Log some statistics
            errs = OrderedDict()
            if not early_exit_mode(args):
                errs['Top1'] = classerr.value(1)
                errs['Top5'] = classerr.value(5)
            else:
                # for Early Exit case, the Top1 and Top5 stats are computed for each exit.
                for exitnum in range(args.num_exits):
                    errs['Top1_exit' + str(exitnum)] = args.exiterrors[exitnum].value(1)
                    errs['Top5_exit' + str(exitnum)] = args.exiterrors[exitnum].value(5)

            stats_dict = OrderedDict()
            for loss_name, meter in losses.items():
                stats_dict[loss_name] = meter.mean
            stats_dict.update(errs)
            stats_dict['LR'] = optimizer.param_groups[0]['lr']
            stats_dict['Time'] = batch_time.mean
            stats = ('Performance/Training/', stats_dict)

            params = model.named_parameters() if args.log_params_histograms else None
            distiller.log_training_progress(stats,
                                            params,
                                            epoch, steps_completed,
                                            steps_per_epoch, args.print_freq,
                                            loggers)
        end = time.time()
    #return acc_stats
    # NOTE: this breaks previous behavior, which returned a history of (top1, top5) values
    # NOTE(review): the third return value is the AverageValueMeter itself,
    # not its .mean — confirm callers expect a meter here.
    return classerr.value(1), classerr.value(5), losses[OVERALL_LOSS_KEY]
def main():
    """Entry point for a customized (ResNet152/Adam) compression run.

    Configures logging and devices, builds the model, then dispatches to
    AMC / greedy / summary / sensitivity / evaluation, or runs the main
    training loop with an LR-decay-on-non-best heuristic.
    """
    script_dir = os.path.dirname(__file__)
    module_path = os.path.abspath(os.path.join(script_dir, '..', '..'))
    global msglogger

    # Parse arguments
    args = parser.get_parser().parse_args()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    msglogger = apputils.config_pylogger(
        os.path.join(script_dir, 'logging.conf'), args.name, args.output_dir)

    # Log various details about the execution environment.  It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(args.compress,
                                     msglogger.logdir,
                                     gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    start_epoch = 0
    perf_scores_history = []
    if args.deterministic:
        # Experiment reproducibility is sometimes important. Pete Warden expounded about this
        # in his blog: https://petewarden.com/2018/03/19/the-machine-learning-reproducibility-crisis/
        # In Pytorch, support for deterministic execution is still a bit clunky.
        if args.workers > 1:
            msglogger.error(
                'ERROR: Setting --deterministic requires setting --workers/-j to 0 or 1'
            )
            exit(1)
        # Use a well-known seed, for repeatability of experiments
        distiller.set_deterministic()
    else:
        # This issue: https://github.com/pytorch/pytorch/issues/3659
        # Implies that cudnn.benchmark should respect cudnn.deterministic, but empirically we see that
        # results are not re-produced when benchmark is set. So enabling only if deterministic mode disabled.
        cudnn.benchmark = True

    if args.cpu or not torch.cuda.is_available():
        # Set GPU index to -1 if using CPU
        args.device = 'cpu'
        args.gpus = -1
    else:
        args.device = 'cuda'
        if args.gpus is not None:
            try:
                args.gpus = [int(s) for s in args.gpus.split(',')]
            except ValueError:
                msglogger.error(
                    'ERROR: Argument --gpus must be a comma-separated list of integers only'
                )
                exit(1)
            available_gpus = torch.cuda.device_count()
            for dev_id in args.gpus:
                if dev_id >= available_gpus:
                    msglogger.error(
                        'ERROR: GPU device ID {0} requested, but only {1} devices available'
                        .format(dev_id, available_gpus))
                    exit(1)
            # Set default device in case the first one on the list != 0
            torch.cuda.set_device(args.gpus[0])

    # Infer the dataset from the model name
    args.dataset = 'cousm'

    if args.earlyexit_thresholds:
        args.num_exits = len(args.earlyexit_thresholds) + 1
        args.loss_exits = [0] * args.num_exits
        args.losses_exits = []
        args.exiterrors = []

    # Create the model
    model = ResNet152()
    # model = torch.nn.DataParallel(model, device_ids=args.gpus)  # parallel GPU execution
    model.to(args.device)
    compression_scheduler = None  # compression scheduler

    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # capture thresholds for early-exit training
    if args.earlyexit_thresholds:
        msglogger.info('=> using early-exit threshold values of %s',
                       args.earlyexit_thresholds)

    # We can optionally resume from a checkpoint
    if args.resume:
        # checkpoint = torch.load(args.resume)
        # model.load_state_dict(checkpoint['state_dict'])
        model, compression_scheduler, start_epoch = apputils.load_checkpoint(
            model, chkpt_file=args.resume)
        model.to(args.device)

    # Define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(args.device)
    # optimizer = torch.optim.SGD(model.fc.parameters(), lr=args.lr,
    #                             momentum=args.momentum,
    #                             weight_decay=args.weight_decay)
    # NOTE(review): only the final FC layer's parameters are optimized —
    # presumably intentional fine-tuning of a pretrained ResNet152; confirm.
    optimizer = torch.optim.Adam(model.model.fc.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    msglogger.info('Optimizer Type: %s', type(optimizer))
    msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.AMC:
        # Automated deep compression
        return automated_deep_compression(model, criterion, optimizer,
                                          pylogger, args)
    if args.greedy:
        # Greedy pruning
        return greedy(model, criterion, optimizer, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        return summarize_model(model, args.dataset,
                               which_summary=args.summary)

    # Activation statistics collectors
    activations_collectors = create_activation_stats_collectors(
        model, *args.activation_stats)

    if args.qe_calibration:
        msglogger.info('Quantization calibration stats collection enabled:')
        msglogger.info(
            '\tStats will be collected for {:.1%} of test dataset'.format(
                args.qe_calibration))
        msglogger.info(
            '\tSetting constant seeds and converting model to serialized execution'
        )
        distiller.set_deterministic()
        model = distiller.make_non_parallel_copy(model)
        activations_collectors.update(
            create_quantization_stats_collector(model))  # quantization stats collector
        args.evaluate = True
        args.effective_test_size = args.qe_calibration

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch.  The default dataset is ImageNet, but if args.arch contains the
    # substring "_cifar", then cifar10 is used.
    # NOTE(review): the dataset root is a hard-coded local path — parameterize
    # before sharing this script.
    train_loader, val_loader, test_loader, _ = get_data_loaders(
        datasets_fn, r'/home/tian/Desktop/image_yasuo', args.batch_size,
        args.workers, args.validation_split, args.deterministic,
        args.effective_train_size, args.effective_valid_size,
        args.effective_test_size)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler),
                   len(test_loader.sampler))

    # This sample application can be invoked to execute Sensitivity Analysis
    # on the model.  The output is saved to CSV and PNG.
    if args.sensitivity is not None:
        sensitivities = np.arange(args.sensitivity_range[0],
                                  args.sensitivity_range[1],
                                  args.sensitivity_range[2])
        return sensitivity_analysis(model, criterion, test_loader, pylogger,
                                    args, sensitivities)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger,
                              activations_collectors, args,
                              compression_scheduler)

    if args.compress:
        # The main use-case for this sample application is CNN compression.
        # Compression requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(model, optimizer,
                                                      args.compress,
                                                      compression_scheduler)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.to(args.device)
    elif compression_scheduler is None:
        compression_scheduler = distiller.CompressionScheduler(model)

    if args.thinnify:
        # zeros_mask_dict = distiller.create_model_masks_dict(model)
        assert args.resume is not None, "You must use --resume to provide a checkpoint file to thinnify"
        distiller.remove_filters(model,
                                 compression_scheduler.zeros_mask_dict,
                                 args.arch,
                                 args.dataset,
                                 optimizer=None)
        apputils.save_checkpoint(0,
                                 args.arch,
                                 model,
                                 optimizer=None,
                                 scheduler=compression_scheduler,
                                 name="{}_thinned".format(
                                     args.resume.replace(".pth.tar", "")),
                                 dir=msglogger.logdir)
        print(
            "Note: your model may have collapsed to random inference, so you may want to fine-tune"
        )
        return

    args.kd_policy = None  # knowledge distillation
    if args.kd_teacher:
        teacher = create_model(args.kd_pretrained,
                               args.dataset,
                               args.kd_teacher,
                               device_ids=args.gpus)
        if args.kd_resume:
            teacher, _, _ = apputils.load_checkpoint(teacher,
                                                     chkpt_file=args.kd_resume)
        dlw = distiller.DistillationLossWeights(args.kd_distill_wt,
                                                args.kd_student_wt,
                                                args.kd_teacher_wt)
        args.kd_policy = distiller.KnowledgeDistillationPolicy(
            model, teacher, args.kd_temp, dlw)
        compression_scheduler.add_policy(args.kd_policy,
                                         starting_epoch=args.kd_start_epoch,
                                         ending_epoch=args.epochs,
                                         frequency=1)
        msglogger.info('\nStudent-Teacher knowledge distillation enabled:')
        msglogger.info('\tTeacher Model: %s', args.kd_teacher)
        msglogger.info('\tTemperature: %s', args.kd_temp)
        msglogger.info('\tLoss Weights (distillation | student | teacher): %s',
                       ' | '.join(['{:.2f}'.format(val) for val in dlw]))
        msglogger.info('\tStarting from Epoch: %s', args.kd_start_epoch)

    lr = args.lr
    lr_decay = 0.5
    for epoch in range(start_epoch, args.epochs):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)

        # Train for one epoch
        with collectors_context(activations_collectors["train"]) as collectors:
            train(train_loader,
                  model,
                  criterion,
                  optimizer,
                  epoch,
                  compression_scheduler,
                  loggers=[tflogger, pylogger],
                  args=args)
            distiller.log_weights_sparsity(model,
                                           epoch,
                                           loggers=[tflogger, pylogger])
            distiller.log_activation_statsitics(
                epoch,
                "train",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            if args.masks_sparsity:
                # Print the masks-sparsity table at the end of each epoch.
                msglogger.info(
                    distiller.masks_sparsity_tbl_summary(
                        model, compression_scheduler))

        # evaluate on validation set
        with collectors_context(activations_collectors["valid"]) as collectors:
            top1, top5, vloss = validate(val_loader, model, criterion,
                                         [pylogger], args, epoch)
            distiller.log_activation_statsitics(
                epoch,
                "valid",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            save_collectors_data(collectors, msglogger.logdir)

        # Fixed typo: was 'Peformance/Validation/' (other blocks in this file
        # use the correctly spelled group name).
        stats = ('Performance/Validation/',
                 OrderedDict([('Loss', vloss), ('Top1', top1),
                              ('Top5', top5)]))
        distiller.log_training_progress(stats,
                                        None,
                                        epoch,
                                        steps_completed=0,
                                        total_steps=1,
                                        log_freq=1,
                                        loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        # Update the list of top scores achieved so far, and save the checkpoint
        sparsity = distiller.model_sparsity(model)
        perf_scores_history.append(
            distiller.MutableNamedTuple({
                'sparsity': sparsity,
                'top1': top1,
                'top5': top5,
                'epoch': epoch
            }))
        # Keep perf_scores_history sorted from best to worst
        # Sort by sparsity as main sort key, then sort by top1, top5 and epoch
        perf_scores_history.sort(key=operator.attrgetter(
            'sparsity', 'top1', 'top5', 'epoch'),
                                 reverse=True)
        for score in perf_scores_history[:args.num_best_scores]:
            msglogger.info(
                '==> Best [Top1: %.3f   Top5: %.3f   Sparsity: %.2f on epoch: %d]',
                score.top1, score.top5, score.sparsity, score.epoch)
        is_best = epoch == perf_scores_history[0].epoch
        apputils.save_checkpoint(epoch, args.arch, model, optimizer,
                                 compression_scheduler,
                                 perf_scores_history[0].top1, is_best,
                                 args.name, msglogger.logdir)
        if not is_best:
            # Decay the learning rate whenever this epoch did not produce a
            # new best score.
            lr = lr * lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

    # Finally run results on the test set
    test(test_loader,
         model,
         criterion, [pylogger],
         activations_collectors,
         args=args)