def main(argv):
    parser = get_argument_parser()
    args = parser.parse_args(args=argv)
    config = create_sample_config(args, parser)

    if config.dist_url == "env://":
        config.update_from_env()

    configure_paths(config)
    copyfile(args.config, osp.join(config.log_dir, 'config.json'))
    source_root = Path(__file__).absolute().parents[2]  # nncf root
    create_code_snapshot(source_root, osp.join(config.log_dir, "snapshot.tar.gz"))

    if config.seed is not None:
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    config.execution_mode = get_execution_mode(config)

    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    if not is_staged_quantization(config):
        start_worker(main_worker, config)
    else:
        from examples.classification.staged_quantization_worker import staged_quantization_main_worker
        start_worker(staged_quantization_main_worker, config)
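# Usage note (a minimal sketch, not part of the sample code above): the sample is normally
# launched as a script, so `main` receives the command-line arguments, e.g.:
#
#     if __name__ == '__main__':
#         main(sys.argv[1:])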
def validate(val_loader, model, criterion, config):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (input_, target) in enumerate(val_loader):
            input_ = input_.to(config.device)
            target = target.to(config.device)

            # compute output
            output = model(input_)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss, input_.size(0))
            top1.update(acc1, input_.size(0))
            top5.update(acc5, input_.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % config.print_freq == 0:
                logger.info(
                    '{rank}'
                    'Test: [{0}/{1}] '
                    'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) '
                    'Loss: {loss.val:.4f} ({loss.avg:.4f}) '
                    'Acc@1: {top1.val:.3f} ({top1.avg:.3f}) '
                    'Acc@5: {top5.val:.3f} ({top5.avg:.3f})'.format(
                        i, len(val_loader), batch_time=batch_time, loss=losses,
                        top1=top1, top5=top5,
                        rank='{}:'.format(config.rank) if config.multiprocessing_distributed else ''
                    ))

        if is_main_process():
            config.tb.add_scalar("val/loss", losses.avg, len(val_loader) * config.get('cur_epoch', 0))
            config.tb.add_scalar("val/top1", top1.avg, len(val_loader) * config.get('cur_epoch', 0))
            config.tb.add_scalar("val/top5", top5.avg, len(val_loader) * config.get('cur_epoch', 0))

    logger.info(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}\n'.format(top1=top1, top5=top5))

    acc = top1.avg / 100
    if config.metrics_dump is not None:
        write_metrics(acc, config.metrics_dump)

    return top1.avg, top5.avg
def train(config, compression_ctrl, model, criterion, is_inception, lr_scheduler, model_name, optimizer,
          train_loader, train_sampler, val_loader, best_acc1=0):
    for epoch in range(config.start_epoch, config.epochs):
        config.cur_epoch = epoch
        if config.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train_epoch(train_loader, model, criterion, optimizer, compression_ctrl, epoch, config, is_inception)

        # Learning rate scheduling should be applied after optimizer's update
        lr_scheduler.step(epoch if not isinstance(lr_scheduler, ReduceLROnPlateau) else best_acc1)

        # update compression scheduler state at the end of the epoch
        compression_ctrl.scheduler.epoch_step()

        # compute compression algo statistics
        stats = compression_ctrl.statistics()

        acc1 = best_acc1
        if epoch % config.test_every_n_epochs == 0:
            # evaluate on validation set
            acc1, _ = validate(val_loader, model, criterion, config)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        acc = best_acc1 / 100
        if config.metrics_dump is not None:
            write_metrics(acc, config.metrics_dump)

        if is_main_process():
            print_statistics(stats)

            checkpoint_path = osp.join(config.checkpoint_save_dir, get_name(config) + '_last.pth')
            checkpoint = {
                'epoch': epoch + 1,
                'arch': model_name,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'acc1': acc1,
                'optimizer': optimizer.state_dict(),
                'scheduler': compression_ctrl.scheduler.state_dict()
            }

            torch.save(checkpoint, checkpoint_path)
            make_additional_checkpoints(checkpoint_path, is_best, epoch + 1, config)

            for key, value in stats.items():
                if isinstance(value, (int, float)):
                    config.tb.add_scalar("compression/statistics/{0}".format(key), value,
                                         len(train_loader) * epoch)
def test(model, test_loader, class_weights, class_encoding, config):
    logger.info("\nTesting...\n")

    _, criterion = get_aux_loss_dependent_params(model, class_weights, 0, config)

    # Evaluation metric
    ignore_index = None
    ignore_unlabeled = config.get("ignore_unlabeled", True)
    if ignore_unlabeled and ('unlabeled' in class_encoding):
        ignore_index = list(class_encoding).index('unlabeled')

    metric = IoU(len(class_encoding), ignore_index=ignore_index)

    # Test the trained model on the test set
    test_obj = Test(model, test_loader, criterion, metric, config.device, config.model)

    logger.info(">>>> Running test dataset")

    loss, (iou, miou) = test_obj.run_epoch(config.print_step)
    class_iou = dict(zip(class_encoding.keys(), iou))

    logger.info(">>>> Avg. loss: {0:.4f} | Mean IoU: {1:.4f}".format(loss, miou))

    if config.metrics_dump is not None:
        write_metrics(miou, config.metrics_dump)

    # Print per class IoU
    for key, class_iou in zip(class_encoding.keys(), iou):
        logger.info("{0}: {1:.4f}".format(key, class_iou))

    # Show a batch of samples and labels
    if config.imshow_batch:
        logger.info("A batch of predictions from the test set...")
        images, gt_labels = iter(test_loader).next()
        color_predictions = predict(model, images, class_encoding, config)

        from examples.common.models.segmentation.unet import UNet, center_crop
        if isinstance(model, UNet):
            # UNet predicts center image crops
            outputs_size_hw = (color_predictions.size()[2], color_predictions.size()[3])
            gt_labels = center_crop(gt_labels, outputs_size_hw).contiguous()

        data_utils.show_ground_truth_vs_prediction(images, gt_labels, color_predictions, class_encoding)
def train_epoch_end(config, compression_algo, net, epoch, iteration, epoch_size, lr_scheduler, optimizer,
                    test_data_loader, best_mAp):
    is_best = False
    test_freq_in_epochs = max(config.test_interval // epoch_size, 1)
    compression_algo.scheduler.epoch_step(epoch)
    if not isinstance(lr_scheduler, ReduceLROnPlateau):
        lr_scheduler.step(epoch)

    if epoch % test_freq_in_epochs == 0 and iteration != 0:
        if is_on_first_rank(config):
            print_statistics(compression_algo.statistics())
        with torch.no_grad():
            net.eval()
            mAP = test_net(net, config.device, test_data_loader, distributed=config.multiprocessing_distributed)
            if mAP > best_mAp:
                is_best = True
                best_mAp = mAP
            if config.metrics_dump is not None:
                write_metrics(mAP, config.metrics_dump)
            if isinstance(lr_scheduler, ReduceLROnPlateau):
                lr_scheduler.step(mAP)
            net.train()

    if is_on_first_rank(config):
        checkpoint_file_path = osp.join(config.checkpoint_save_dir, "{}_last.pth".format(get_name(config)))
        torch.save({
            'state_dict': net.state_dict(),
            'optimizer': optimizer.state_dict(),
            'iter': iteration,
            'scheduler': compression_algo.scheduler.state_dict()
        }, str(checkpoint_file_path))
        make_additional_checkpoints(checkpoint_file_path, is_best=is_best, epoch=epoch + 1, config=config)

    return best_mAp
def main(argv):
    parser = get_argument_parser()
    args = parser.parse_args(args=argv)
    config = Config.from_json(args.config)
    config.update_from_args(args, parser)

    if config.dist_url == "env://":
        config.update_from_env()

    configure_paths(config)
    source_root = Path(__file__).absolute().parents[2]  # nncf root
    create_code_snapshot(source_root, osp.join(config.log_dir, "snapshot.tar.gz"))

    if config.seed is not None:
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    config.execution_mode = get_execution_mode(config)

    if config.metrics_dump is not None:
        avg = 0
        if config.resuming_checkpoint is None:
            model_name = os.path.basename(args.config).replace(".json", ".pth")
        else:
            model_name = os.path.basename(config.resuming_checkpoint)
        metrics = {model_name: avg}
        write_metrics(config, metrics)

    if not is_binarization(config):
        start_worker(main_worker, config)
    else:
        from examples.classification.binarization_worker import main_worker_binarization
        start_worker(main_worker_binarization, config)
def main_worker(current_gpu, config):
    configure_device(current_gpu, config)
    config.mlflow = SafeMLFLow(config)
    if is_main_process():
        configure_logging(logger, config)
        print_args(config)

    logger.info(config)

    dataset = get_dataset(config.dataset)
    color_encoding = dataset.color_encoding
    num_classes = len(color_encoding)

    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    train_loader = val_loader = criterion = None
    resuming_checkpoint_path = config.resuming_checkpoint_path
    nncf_config = config.nncf_config

    pretrained = is_pretrained_model_requested(config)

    def criterion_fn(model_outputs, target, criterion_):
        labels, loss_outputs, _ = \
            loss_funcs.do_model_specific_postprocessing(config.model, target, model_outputs)
        return criterion_(loss_outputs, labels)

    if config.to_onnx is not None:
        assert pretrained or (resuming_checkpoint_path is not None)
    else:
        loaders, w_class = load_dataset(dataset, config)
        train_loader, val_loader, init_loader = loaders
        criterion = get_criterion(w_class, config)

        def autoq_test_fn(model, eval_loader):
            return test(model, eval_loader, criterion, color_encoding, config)

        nncf_config = register_default_init_args(nncf_config, init_loader, criterion, criterion_fn,
                                                 autoq_test_fn, val_loader, config.device)

    model = load_model(config.model,
                       pretrained=pretrained,
                       num_classes=num_classes,
                       model_params=config.get('model_params', {}),
                       weights_path=config.get('weights'))

    model.to(config.device)

    resuming_model_sd = None
    resuming_checkpoint = None
    if resuming_checkpoint_path is not None:
        resuming_model_sd, resuming_checkpoint = load_resuming_model_state_dict_and_checkpoint_from_path(
            resuming_checkpoint_path)

    compression_ctrl, model = create_compressed_model(model, nncf_config, resuming_state_dict=resuming_model_sd)
    model, model_without_dp = prepare_model_for_execution(model, config)

    if config.distributed:
        compression_ctrl.distributed()

    log_common_mlflow_params(config)

    if config.to_onnx:
        compression_ctrl.export_model(config.to_onnx)
        logger.info("Saved to {}".format(config.to_onnx))
        return

    if is_main_process():
        print_statistics(compression_ctrl.statistics())

    if config.mode.lower() == 'test':
        logger.info(model)
        model_parameters = filter(lambda p: p.requires_grad, model.parameters())
        params = sum([np.prod(p.size()) for p in model_parameters])
        logger.info("Trainable argument count:{params}".format(params=params))
        model = model.to(config.device)
        test(model, val_loader, criterion, color_encoding, config)
    elif config.mode.lower() == 'train':
        train(model, model_without_dp, compression_ctrl, train_loader, val_loader, criterion, color_encoding,
              config, resuming_checkpoint)
    else:
        # Should never happen...but just in case it does
        raise RuntimeError("\"{0}\" is not a valid choice for execution mode.".format(config.mode))
def train(model, model_without_dp, compression_ctrl, train_loader, val_loader, criterion, class_encoding, config,
          resuming_checkpoint):
    logger.info("\nTraining...\n")

    # Check if the network architecture is correct
    logger.info(model)

    optim_config = config.get('optimizer', {})
    optim_params = optim_config.get('optimizer_params', {})
    lr = optim_params.get("lr", 1e-4)

    params_to_optimize = get_params_to_optimize(model_without_dp, lr * 10, config)
    optimizer, lr_scheduler = make_optimizer(params_to_optimize, config)

    # Evaluation metric
    ignore_index = None
    ignore_unlabeled = config.get("ignore_unlabeled", True)
    if ignore_unlabeled and ('unlabeled' in class_encoding):
        ignore_index = list(class_encoding).index('unlabeled')

    metric = IoU(len(class_encoding), ignore_index=ignore_index)

    best_miou = -1
    best_compression_level = CompressionLevel.NONE

    # Optionally resume from a checkpoint
    if resuming_checkpoint is not None:
        if optimizer is not None:
            optimizer.load_state_dict(resuming_checkpoint['optimizer'])
        start_epoch = resuming_checkpoint['epoch']
        best_miou = resuming_checkpoint['miou']

        if "scheduler" in resuming_checkpoint and compression_ctrl.scheduler is not None:
            compression_ctrl.scheduler.load_state_dict(resuming_checkpoint['scheduler'])

        logger.info("Resuming from model: Start epoch = {0} "
                    "| Best mean IoU = {1:.4f}".format(start_epoch, best_miou))
        config.start_epoch = start_epoch

    # Start Training
    train_obj = Train(model, train_loader, optimizer, criterion, compression_ctrl, metric, config.device,
                      config.model)
    val_obj = Test(model, val_loader, criterion, metric, config.device, config.model)

    for epoch in range(config.start_epoch, config.epochs):
        compression_ctrl.scheduler.epoch_step()
        logger.info(">>>> [Epoch: {0:d}] Training".format(epoch))

        if config.distributed:
            train_loader.sampler.set_epoch(epoch)

        epoch_loss, (iou, miou) = train_obj.run_epoch(config.print_step)
        if not isinstance(lr_scheduler, ReduceLROnPlateau):
            # Learning rate scheduling should be applied after optimizer's update
            lr_scheduler.step(epoch)

        logger.info(">>>> [Epoch: {0:d}] Avg. loss: {1:.4f} | Mean IoU: {2:.4f}".format(epoch, epoch_loss, miou))

        if is_main_process():
            config.tb.add_scalar("train/loss", epoch_loss, epoch)
            config.tb.add_scalar("train/mIoU", miou, epoch)
            config.tb.add_scalar("train/learning_rate", optimizer.param_groups[0]['lr'], epoch)
            config.tb.add_scalar("train/compression_loss", compression_ctrl.loss(), epoch)

            for key, value in compression_ctrl.statistics(quickly_collected_only=True).items():
                if isinstance(value, (int, float)):
                    config.tb.add_scalar("compression/statistics/{0}".format(key), value, epoch)

        if (epoch + 1) % config.save_freq == 0 or epoch + 1 == config.epochs:
            logger.info(">>>> [Epoch: {0:d}] Validation".format(epoch))

            loss, (iou, miou) = val_obj.run_epoch(config.print_step)

            logger.info(">>>> [Epoch: {0:d}] Avg. loss: {1:.4f} | Mean IoU: {2:.4f}".format(epoch, loss, miou))

            if is_main_process():
                config.tb.add_scalar("val/mIoU", miou, epoch)
                config.tb.add_scalar("val/loss", loss, epoch)
                for i, (key, class_iou) in enumerate(zip(class_encoding.keys(), iou)):
                    config.tb.add_scalar("{}/mIoU_Cls{}_{}".format(config.dataset, i, key), class_iou, epoch)

            compression_level = compression_ctrl.compression_level()
            is_best_by_miou = miou > best_miou and compression_level == best_compression_level
            is_best = is_best_by_miou or compression_level > best_compression_level
            if is_best:
                best_miou = miou
            best_compression_level = max(compression_level, best_compression_level)

            if config.metrics_dump is not None:
                write_metrics(best_miou, config.metrics_dump)

            if isinstance(lr_scheduler, ReduceLROnPlateau):
                # Learning rate scheduling should be applied after optimizer's update
                lr_scheduler.step(best_miou)

            # Print per class IoU on last epoch or if best iou
            if epoch + 1 == config.epochs or is_best:
                for key, class_iou in zip(class_encoding.keys(), iou):
                    logger.info("{0}: {1:.4f}".format(key, class_iou))

            # Save the model if it's the best thus far
            if is_main_process():
                checkpoint_path = save_checkpoint(model, optimizer, epoch, best_miou, compression_level,
                                                  compression_ctrl.scheduler, config)

                make_additional_checkpoints(checkpoint_path, is_best, epoch, config)
                print_statistics(compression_ctrl.statistics())

    return model
def main_worker(current_gpu, config):
    config.current_gpu = current_gpu
    config.distributed = config.execution_mode in (ExecutionMode.DISTRIBUTED,
                                                   ExecutionMode.MULTIPROCESSING_DISTRIBUTED)
    if config.distributed:
        configure_distributed(config)

    if is_main_process():
        configure_logging(logger, config)
        print_args(config)

    logger.info(config)

    config.device = get_device(config)
    dataset = get_dataset(config.dataset)
    color_encoding = dataset.color_encoding
    num_classes = len(color_encoding)

    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    weights = config.get('weights')
    model = load_model(config.model,
                       pretrained=config.get('pretrained', True) if weights is None else False,
                       num_classes=num_classes,
                       model_params=config.get('model_params', {}))

    compression_ctrl, model = create_compressed_model(model, config)
    if weights:
        sd = torch.load(weights, map_location='cpu')
        load_state(model, sd)

    model, model_without_dp = prepare_model_for_execution(model, config)

    if config.distributed:
        compression_ctrl.distributed()

    resuming_checkpoint = config.resuming_checkpoint

    if resuming_checkpoint is not None:
        if not config.pretrained:
            # Load the previously saved model state
            model, _, _, _, _ = \
                load_checkpoint(model, resuming_checkpoint, config.device,
                                compression_scheduler=compression_ctrl.scheduler)

    if config.to_onnx is not None:
        compression_ctrl.export_model(config.to_onnx)
        logger.info("Saved to {}".format(config.to_onnx))
        return

    if config.mode.lower() == 'test':
        logger.info(model)
        model_parameters = filter(lambda p: p.requires_grad, model.parameters())
        params = sum([np.prod(p.size()) for p in model_parameters])
        logger.info("Trainable argument count:{params}".format(params=params))

        model = model.to(config.device)
        loaders, w_class = load_dataset(dataset, config)
        _, val_loader = loaders
        test(model, val_loader, w_class, color_encoding, config)
        print_statistics(compression_ctrl.statistics())
    elif config.mode.lower() == 'train':
        loaders, w_class = load_dataset(dataset, config)
        train_loader, val_loader = loaders
        if not resuming_checkpoint:
            compression_ctrl.initialize(train_loader)
        train(model, model_without_dp, compression_ctrl, train_loader, val_loader, w_class, color_encoding, config)
    else:
        # Should never happen...but just in case it does
        raise RuntimeError("\"{0}\" is not a valid choice for execution mode.".format(config.mode))
def main_worker(current_gpu, config):
    #################################
    # Setup experiment environment
    #################################
    config.current_gpu = current_gpu
    config.distributed = config.execution_mode in (ExecutionMode.DISTRIBUTED,
                                                   ExecutionMode.MULTIPROCESSING_DISTRIBUTED)
    if config.distributed:
        configure_distributed(config)
    if is_on_first_rank(config):
        configure_logging(config)
        print_args(config)

    config.device = get_device(config)
    config.start_iter = 0

    ##########################
    # Prepare metrics log file
    ##########################
    if config.metrics_dump and config.resuming_checkpoint is not None:
        avg = 0
        metrics = {os.path.basename(config.resuming_checkpoint): avg}
        write_metrics(config, metrics)

    ##################
    # Prepare model
    ##################
    compression_algo, net = create_model(config)
    if config.distributed:
        config.batch_size //= config.ngpus_per_node
        config.workers //= config.ngpus_per_node
        compression_algo.distributed()

    ###########################
    # Criterion and optimizer
    ###########################
    params_to_optimize = get_parameter_groups(net, config)
    optimizer, lr_scheduler = make_optimizer(params_to_optimize, config)

    criterion = MultiBoxLoss(
        config,
        config['num_classes'],
        overlap_thresh=0.5,
        prior_for_matching=True,
        bkg_label=0,
        neg_mining=True,
        neg_pos=3,
        neg_overlap=0.5,
        encode_target=False,
        device=config.device
    )

    ###########################
    # Load checkpoint
    ###########################
    resuming_checkpoint = config.resuming_checkpoint
    if resuming_checkpoint:
        print('Resuming training, loading {}...'.format(resuming_checkpoint))
        checkpoint = torch.load(resuming_checkpoint, map_location='cpu')
        # use the checkpoint itself in case only the state dict was saved,
        # i.e. the checkpoint was created with `torch.save(module.state_dict())`
        state_dict = checkpoint.get('state_dict', checkpoint)
        load_state(net, state_dict, is_resume=True)
        if config.mode.lower() == 'train' and config.to_onnx is None:
            compression_algo.scheduler.load_state_dict(checkpoint['scheduler'])
            optimizer.load_state_dict(checkpoint.get('optimizer', optimizer.state_dict()))
            config.start_iter = checkpoint.get('iter', 0) + 1

    if config.to_onnx:
        compression_algo.export_model(config.to_onnx)
        print("Saved to {}".format(config.to_onnx))
        return

    ###########################
    # Prepare data
    ###########################
    test_data_loader, train_data_loader = create_dataloaders(config)

    if config.mode.lower() == 'test':
        with torch.no_grad():
            print_statistics(compression_algo.statistics())
            net.eval()
            mAp = test_net(net, config.device, test_data_loader, distributed=config.distributed)
            if config.metrics_dump and config.resuming_checkpoint is not None:
                avg = mAp * 100
                metrics = {os.path.basename(config.resuming_checkpoint): round(avg, 2)}
                write_metrics(config, metrics)
            return

    if not resuming_checkpoint:
        compression_algo.initialize(train_data_loader)

    train(net, compression_algo, train_data_loader, test_data_loader, criterion, optimizer, config, lr_scheduler)
def main_worker(current_gpu, config):
    #################################
    # Setup experiment environment
    #################################
    config.current_gpu = current_gpu
    config.distributed = config.execution_mode in (ExecutionMode.DISTRIBUTED,
                                                   ExecutionMode.MULTIPROCESSING_DISTRIBUTED)
    if config.distributed:
        configure_distributed(config)
    if is_on_first_rank(config):
        configure_logging(logger, config)
        print_args(config)

    config.device = get_device(config)
    config.start_iter = 0

    ##########################
    # Prepare metrics log file
    ##########################
    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    ###########################
    # Criterion
    ###########################
    criterion = MultiBoxLoss(
        config,
        config['num_classes'],
        overlap_thresh=0.5,
        prior_for_matching=True,
        bkg_label=0,
        neg_mining=True,
        neg_pos=3,
        neg_overlap=0.5,
        encode_target=False,
        device=config.device
    )

    train_data_loader = test_data_loader = None
    resuming_checkpoint_path = config.resuming_checkpoint_path

    ###########################
    # Prepare data
    ###########################
    pretrained = is_pretrained_model_requested(config)

    if config.to_onnx is not None:
        assert pretrained or (resuming_checkpoint_path is not None)
    else:
        test_data_loader, train_data_loader = create_dataloaders(config)
        config.nncf_config = register_default_init_args(config.nncf_config, criterion, train_data_loader)

    ##################
    # Prepare model
    ##################
    resuming_checkpoint_path = config.resuming_checkpoint_path
    resuming_checkpoint = None
    resuming_model_state_dict = None
    if resuming_checkpoint_path:
        logger.info('Resuming from checkpoint {}...'.format(resuming_checkpoint_path))
        resuming_checkpoint = torch.load(resuming_checkpoint_path, map_location='cpu')
        # use the checkpoint itself in case only the state dict was saved,
        # i.e. the checkpoint was created with `torch.save(module.state_dict())`
        resuming_model_state_dict = resuming_checkpoint.get('state_dict', resuming_checkpoint)

    compression_ctrl, net = create_model(config, resuming_model_state_dict)
    if config.distributed:
        config.batch_size //= config.ngpus_per_node
        config.workers //= config.ngpus_per_node
        compression_ctrl.distributed()

    ###########################
    # Optimizer
    ###########################
    params_to_optimize = get_parameter_groups(net, config)
    optimizer, lr_scheduler = make_optimizer(params_to_optimize, config)

    #################################
    # Load additional checkpoint data
    #################################
    if resuming_checkpoint is not None and config.mode.lower() == 'train' and config.to_onnx is None:
        compression_ctrl.scheduler.load_state_dict(resuming_checkpoint['scheduler'])
        optimizer.load_state_dict(resuming_checkpoint.get('optimizer', optimizer.state_dict()))
        config.start_iter = resuming_checkpoint.get('iter', 0) + 1

    if config.to_onnx:
        compression_ctrl.export_model(config.to_onnx)
        logger.info("Saved to {}".format(config.to_onnx))
        return

    if config.mode.lower() == 'test':
        with torch.no_grad():
            print_statistics(compression_ctrl.statistics())
            net.eval()
            mAp = test_net(net, config.device, test_data_loader, distributed=config.distributed)
            if config.metrics_dump is not None:
                write_metrics(mAp, config.metrics_dump)
            return

    train(net, compression_ctrl, train_data_loader, test_data_loader, criterion, optimizer, config, lr_scheduler)
def train(net, compression_ctrl, train_data_loader, test_data_loader, criterion, optimizer, config, lr_scheduler):
    net.train()
    # loss counters
    loc_loss = 0  # epoch
    conf_loss = 0

    epoch_size = len(train_data_loader)
    logger.info('Training {} on {} dataset...'.format(config.model, train_data_loader.dataset.name))
    batch_iterator = None

    t_start = time.time()
    print_statistics(compression_ctrl.statistics())

    best_mAp = 0
    best_compression_level = CompressionLevel.NONE
    test_freq_in_epochs = max(config.test_interval // epoch_size, 1)

    for iteration in range(config.start_iter, config['max_iter']):
        if (not batch_iterator) or (iteration % epoch_size == 0):
            # create batch iterator
            batch_iterator = iter(train_data_loader)

        epoch = iteration // epoch_size

        if (iteration + 1) % epoch_size == 0:
            compression_ctrl.scheduler.epoch_step(epoch)
            compression_level = compression_ctrl.compression_level()
            is_best = False

            if (epoch + 1) % test_freq_in_epochs == 0:
                if is_on_first_rank(config):
                    print_statistics(compression_ctrl.statistics())
                with torch.no_grad():
                    net.eval()
                    mAP = test_net(net, config.device, test_data_loader,
                                   distributed=config.multiprocessing_distributed)
                    is_best_by_mAP = mAP > best_mAp and compression_level == best_compression_level
                    is_best = is_best_by_mAP or compression_level > best_compression_level
                    if is_best:
                        best_mAp = mAP
                    best_compression_level = max(compression_level, best_compression_level)
                    net.train()

            # Learning rate scheduling should be applied after optimizer's update
            if not isinstance(lr_scheduler, ReduceLROnPlateau):
                lr_scheduler.step(epoch)
            else:
                lr_scheduler.step(mAP)

            if is_on_first_rank(config):
                logger.info('Saving state, iter: {}'.format(iteration))

                checkpoint_file_path = osp.join(config.checkpoint_save_dir,
                                                "{}_last.pth".format(get_name(config)))
                torch.save({
                    'state_dict': net.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'iter': config['max_iter'],
                    'scheduler': compression_ctrl.scheduler.state_dict(),
                    'compression_level': compression_level,
                }, str(checkpoint_file_path))
                make_additional_checkpoints(checkpoint_file_path, is_best=is_best, epoch=epoch + 1, config=config)

        compression_ctrl.scheduler.step(iteration - config.start_iter)
        optimizer.zero_grad()
        batch_iterator, batch_loss, batch_loss_c, batch_loss_l, loss_comp = train_step(
            batch_iterator, compression_ctrl, config, criterion, net, train_data_loader)
        optimizer.step()

        batch_loss_l = batch_loss_l / config.iter_size
        batch_loss_c = batch_loss_c / config.iter_size
        model_loss = (batch_loss_l + batch_loss_c) / config.iter_size
        batch_loss = batch_loss / config.iter_size

        loc_loss += batch_loss_l.item()
        conf_loss += batch_loss_c.item()

        ###########################
        # Logging
        ###########################
        if is_on_first_rank(config):
            config.tb.add_scalar("train/loss_l", batch_loss_l.item(), iteration)
            config.tb.add_scalar("train/loss_c", batch_loss_c.item(), iteration)
            config.tb.add_scalar("train/loss", batch_loss.item(), iteration)

        if iteration % config.print_freq == 0:
            t_finish = time.time()
            t_elapsed = t_finish - t_start
            t_start = time.time()
            logger.info('{}: iter {} epoch {} || Loss: {:.4} || Time {:.4}s || lr: {} || CR loss: {}'.format(
                config.rank, iteration, epoch, model_loss.item(), t_elapsed,
                optimizer.param_groups[0]['lr'],
                loss_comp.item() if isinstance(loss_comp, torch.Tensor) else loss_comp))

    if config.metrics_dump is not None:
        write_metrics(best_mAp, config.metrics_dump)
def main_worker(current_gpu, config):
    config.current_gpu = current_gpu
    config.distributed = config.execution_mode in (ExecutionMode.DISTRIBUTED,
                                                   ExecutionMode.MULTIPROCESSING_DISTRIBUTED)
    if config.distributed:
        configure_distributed(config)

    if is_main_process():
        configure_logging(logger, config)
        print_args(config)

    logger.info(config)

    config.device = get_device(config)
    dataset = get_dataset(config.dataset)
    color_encoding = dataset.color_encoding
    num_classes = len(color_encoding)

    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    train_loader = val_loader = criterion = None
    resuming_checkpoint_path = config.resuming_checkpoint_path
    nncf_config = config.nncf_config

    pretrained = is_pretrained_model_requested(config)

    if config.to_onnx is not None:
        assert pretrained or (resuming_checkpoint_path is not None)
    else:
        loaders, w_class = load_dataset(dataset, config)
        train_loader, val_loader = loaders
        criterion = get_criterion(w_class, config)

        if not resuming_checkpoint_path:
            nncf_config = register_default_init_args(nncf_config, criterion, train_loader)

    model = load_model(config.model,
                       pretrained=pretrained,
                       num_classes=num_classes,
                       model_params=config.get('model_params', {}),
                       weights_path=config.get('weights'))

    model.to(config.device)

    compression_ctrl, model = create_compressed_model(model, nncf_config)
    model, model_without_dp = prepare_model_for_execution(model, config)

    if config.distributed:
        compression_ctrl.distributed()

    if resuming_checkpoint_path:
        if not config.pretrained:
            # Load the previously saved model state
            model, _, _, _, _ = \
                load_checkpoint(model, resuming_checkpoint_path, config.device,
                                compression_scheduler=compression_ctrl.scheduler)

    if config.to_onnx:
        compression_ctrl.export_model(config.to_onnx)
        logger.info("Saved to {}".format(config.to_onnx))
        return

    if config.mode.lower() == 'test':
        logger.info(model)
        model_parameters = filter(lambda p: p.requires_grad, model.parameters())
        params = sum([np.prod(p.size()) for p in model_parameters])
        logger.info("Trainable argument count:{params}".format(params=params))
        model = model.to(config.device)
        test(model, val_loader, criterion, color_encoding, config)
        print_statistics(compression_ctrl.statistics())
    elif config.mode.lower() == 'train':
        train(model, model_without_dp, compression_ctrl, train_loader, val_loader, criterion, color_encoding, config)
    else:
        # Should never happen...but just in case it does
        raise RuntimeError("\"{0}\" is not a valid choice for execution mode.".format(config.mode))
def train(config, compression_ctrl, model, criterion, criterion_fn, lr_scheduler, model_name, optimizer,
          train_loader, train_sampler, val_loader, best_acc1=0):
    best_compression_level = CompressionLevel.NONE
    for epoch in range(config.start_epoch, config.epochs):
        # update compression scheduler state at the beginning of the epoch
        compression_ctrl.scheduler.epoch_step()

        config.cur_epoch = epoch
        if config.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train_epoch(train_loader, model, criterion, criterion_fn, optimizer, compression_ctrl, epoch, config)

        # Learning rate scheduling should be applied after optimizer's update
        lr_scheduler.step(epoch if not isinstance(lr_scheduler, ReduceLROnPlateau) else best_acc1)

        # compute compression algo statistics
        stats = compression_ctrl.statistics()

        acc1 = best_acc1
        if epoch % config.test_every_n_epochs == 0:
            # evaluate on validation set
            acc1, _ = validate(val_loader, model, criterion, config)

        compression_level = compression_ctrl.compression_level()
        # remember the best acc@1, taking the compression level into account. Even if the current acc@1 is lower
        # than the best acc@1 so far, the checkpoint can still be the best one if its compression level is higher.
        # Compression levels in ascending order: NONE, PARTIAL, FULL.
        is_best_by_accuracy = acc1 > best_acc1 and compression_level == best_compression_level
        is_best = is_best_by_accuracy or compression_level > best_compression_level
        if is_best:
            best_acc1 = acc1
            config.mlflow.safe_call('log_metric', "best_acc1", best_acc1)
        best_compression_level = max(compression_level, best_compression_level)

        acc = best_acc1 / 100
        if config.metrics_dump is not None:
            write_metrics(acc, config.metrics_dump)

        if is_main_process():
            print_statistics(stats)

            checkpoint_path = osp.join(config.checkpoint_save_dir, get_name(config) + '_last.pth')
            checkpoint = {
                'epoch': epoch + 1,
                'arch': model_name,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'compression_level': compression_level,
                'acc1': acc1,
                'optimizer': optimizer.state_dict(),
                'scheduler': compression_ctrl.scheduler.state_dict()
            }

            torch.save(checkpoint, checkpoint_path)
            make_additional_checkpoints(checkpoint_path, is_best, epoch + 1, config)

            for key, value in stats.items():
                if isinstance(value, (int, float)):
                    config.mlflow.safe_call('log_metric', 'compression/statistics/{0}'.format(key), value, epoch)
                    config.tb.add_scalar("compression/statistics/{0}".format(key), value,
                                         len(train_loader) * epoch)
def main_worker(current_gpu, config):
    #################################
    # Setup experiment environment
    #################################
    configure_device(current_gpu, config)
    config.mlflow = SafeMLFLow(config)
    if is_on_first_rank(config):
        configure_logging(logger, config)
        print_args(config)

    config.start_iter = 0
    nncf_config = config.nncf_config

    ##########################
    # Prepare metrics log file
    ##########################
    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    ###########################
    # Criterion
    ###########################
    criterion = MultiBoxLoss(
        config,
        config['num_classes'],
        overlap_thresh=0.5,
        prior_for_matching=True,
        bkg_label=0,
        neg_mining=True,
        neg_pos=3,
        neg_overlap=0.5,
        encode_target=False,
        device=config.device
    )

    train_data_loader = test_data_loader = None
    resuming_checkpoint_path = config.resuming_checkpoint_path

    ###########################
    # Prepare data
    ###########################
    pretrained = is_pretrained_model_requested(config)

    if config.to_onnx is not None:
        assert pretrained or (resuming_checkpoint_path is not None)
    else:
        test_data_loader, train_data_loader, init_data_loader = create_dataloaders(config)

        def criterion_fn(model_outputs, target, criterion):
            loss_l, loss_c = criterion(model_outputs, target)
            return loss_l + loss_c

        def autoq_test_fn(model, eval_loader):
            # RL is maximization, change the loss polarity
            return -1 * test_net(model, config.device, eval_loader, distributed=config.distributed,
                                 loss_inference=True, criterion=criterion)

        nncf_config = register_default_init_args(nncf_config, init_data_loader, criterion, criterion_fn,
                                                 autoq_test_fn, test_data_loader, config.device)

    ##################
    # Prepare model
    ##################
    resuming_checkpoint_path = config.resuming_checkpoint_path
    resuming_model_sd = None
    if resuming_checkpoint_path is not None:
        resuming_model_sd, resuming_checkpoint = load_resuming_model_state_dict_and_checkpoint_from_path(
            resuming_checkpoint_path)

    compression_ctrl, net = create_model(config, resuming_model_sd)
    if config.distributed:
        config.batch_size //= config.ngpus_per_node
        config.workers //= config.ngpus_per_node
        compression_ctrl.distributed()

    ###########################
    # Optimizer
    ###########################
    params_to_optimize = get_parameter_groups(net, config)
    optimizer, lr_scheduler = make_optimizer(params_to_optimize, config)

    #################################
    # Load additional checkpoint data
    #################################
    if resuming_checkpoint_path is not None and config.mode.lower() == 'train' and config.to_onnx is None:
        compression_ctrl.scheduler.load_state_dict(resuming_checkpoint['scheduler'])
        optimizer.load_state_dict(resuming_checkpoint.get('optimizer', optimizer.state_dict()))
        config.start_iter = resuming_checkpoint.get('iter', 0) + 1

    log_common_mlflow_params(config)

    if config.to_onnx:
        compression_ctrl.export_model(config.to_onnx)
        logger.info("Saved to {}".format(config.to_onnx))
        return

    if is_main_process():
        print_statistics(compression_ctrl.statistics())

    if config.mode.lower() == 'test':
        with torch.no_grad():
            net.eval()
            if config['ssd_params'].get('loss_inference', False):
                model_loss = test_net(net, config.device, test_data_loader, distributed=config.distributed,
                                      loss_inference=True, criterion=criterion)
                logger.info("Final model loss: {:.3f}".format(model_loss))
            else:
                mAp = test_net(net, config.device, test_data_loader, distributed=config.distributed)
                if config.metrics_dump is not None:
                    write_metrics(mAp, config.metrics_dump)
            return

    train(net, compression_ctrl, train_data_loader, test_data_loader, criterion, optimizer, config, lr_scheduler)