def test_resume(config, tmp_path, multiprocessing_distributed, case_common_dirs):
    c = config
    checkpoint_save_dir = os.path.join(str(tmp_path), "models")
    config_factory = ConfigFactory(config['config'], tmp_path / 'config.json')
    ckpt_path = os.path.join(case_common_dirs["checkpoint_save_dir"],
                             "distributed" if multiprocessing_distributed else "data_parallel",
                             get_name(config_factory.config) + "_last.pth")
    if "max_iter" in config_factory.config:
        config_factory.config["max_iter"] += 2
    args = {
        "--mode": "train",
        "--data": c["dataset_path"],
        "--config": config_factory.serialize(),
        "--log-dir": tmp_path,
        "--batch-size": c["batch_size"] * torch.cuda.device_count(),
        "--workers": 1,
        "--epochs": 2,
        "--checkpoint-save-dir": checkpoint_save_dir,
        "--resume": ckpt_path,
    }

    if multiprocessing_distributed:
        args["--multiprocessing-distributed"] = None

    runner = Command(create_command_line(args, c["sample_type"]))
    res = runner.run()
    assert res == 0
    assert os.path.exists(os.path.join(checkpoint_save_dir, get_name(config_factory.config) + "_last.pth"))
def test_resume(config, tmp_path, multiprocessing_distributed, case_common_dirs):
    checkpoint_save_dir = os.path.join(str(tmp_path), "models")
    config_factory = ConfigFactory(config['nncf_config'], tmp_path / 'config.json')
    ckpt_path = get_resuming_checkpoint_path(config_factory, multiprocessing_distributed,
                                             case_common_dirs["checkpoint_save_dir"])
    if "max_iter" in config_factory.config:
        config_factory.config["max_iter"] += 2
    args = {
        "--mode": "train",
        "--data": config["dataset_path"],
        "--config": config_factory.serialize(),
        "--log-dir": tmp_path,
        "--batch-size": config["batch_size"] * torch.cuda.device_count(),
        "--workers": 0,  # Workaround for the PyTorch MultiProcessingDataLoader issue
        "--epochs": 2,
        "--checkpoint-save-dir": checkpoint_save_dir,
        "--resume": ckpt_path,
    }

    if multiprocessing_distributed:
        args["--multiprocessing-distributed"] = None

    runner = Command(create_command_line(args, config["sample_type"]))
    res = runner.run()
    assert res == 0

    last_checkpoint_path = os.path.join(checkpoint_save_dir, get_name(config_factory.config) + "_last.pth")
    assert os.path.exists(last_checkpoint_path)
    assert torch.load(last_checkpoint_path)['compression_level'] in (CompressionLevel.FULL, CompressionLevel.PARTIAL)
def test_trained_model_eval(config, tmp_path, multiprocessing_distributed, case_common_dirs):
    config_factory = ConfigFactory(config['nncf_config'], tmp_path / 'config.json')
    ckpt_path = os.path.join(case_common_dirs["checkpoint_save_dir"],
                             "distributed" if multiprocessing_distributed else "data_parallel",
                             get_name(config_factory.config) + "_last.pth")
    args = {
        "--mode": "test",
        "--data": config["dataset_path"],
        "--config": config_factory.serialize(),
        "--log-dir": tmp_path,
        "--batch-size": config["batch_size"] * torch.cuda.device_count(),
        "--workers": 0,  # Workaround for the PyTorch MultiProcessingDataLoader issue
        "--weights": ckpt_path,
    }

    if multiprocessing_distributed:
        args["--multiprocessing-distributed"] = None

    runner = Command(create_command_line(args, config["sample_type"]))
    res = runner.run()
    assert res == 0
def test_pretrained_model_train(config, tmp_path, multiprocessing_distributed, case_common_dirs):
    checkpoint_save_dir = os.path.join(case_common_dirs["checkpoint_save_dir"],
                                       "distributed" if multiprocessing_distributed else "data_parallel")
    config_factory = ConfigFactory(config['nncf_config'], tmp_path / 'config.json')
    args = {
        "--mode": "train",
        "--data": config["dataset_path"],
        "--config": config_factory.serialize(),
        "--log-dir": tmp_path,
        "--batch-size": config["batch_size"] * torch.cuda.device_count(),
        "--workers": 0,  # Workaround for the PyTorch MultiProcessingDataLoader issue
        "--epochs": 2,
        "--checkpoint-save-dir": checkpoint_save_dir,
        "--dist-url": "tcp://127.0.0.1:8989"
    }

    if multiprocessing_distributed:
        args["--multiprocessing-distributed"] = None

    runner = Command(create_command_line(args, config["sample_type"]))
    res = runner.run()
    assert res == 0

    last_checkpoint_path = os.path.join(checkpoint_save_dir, get_name(config_factory.config) + "_last.pth")
    assert os.path.exists(last_checkpoint_path)
    assert torch.load(last_checkpoint_path)['compression_level'] in (CompressionLevel.FULL, CompressionLevel.PARTIAL)
def test_pretrained_model_train(config, tmp_path, multiprocessing_distributed):
    c = config
    checkpoint_save_dir = os.path.join(c["checkpoint_save_dir"],
                                       "distributed" if multiprocessing_distributed else "data_parallel")
    config_factory = ConfigFactory(config['config'], tmp_path / 'config.json')
    args = {
        "--mode": "train",
        "--data": c["dataset_path"],
        "--config": config_factory.serialize(),
        "--log-dir": tmp_path,
        "--batch-size": c["batch_size"] * torch.cuda.device_count(),
        "--workers": 1,
        "--epochs": 1,
        "--checkpoint-save-dir": checkpoint_save_dir,
        "--dist-url": "tcp://127.0.0.1:8989"
    }

    if multiprocessing_distributed:
        args["--multiprocessing-distributed"] = None

    runner = Command(create_command_line(args, c["sample_type"]))
    res = runner.run()
    assert res == 0
    assert os.path.exists(os.path.join(checkpoint_save_dir, get_name(config_factory.config) + "_last.pth"))
def train(config, compression_algo, model, criterion, is_inception, lr_scheduler, model_name, optimizer,
          train_loader, train_sampler, val_loader):
    global best_acc1
    for epoch in range(config.start_epoch, config.epochs):
        config.cur_epoch = epoch
        if config.distributed:
            train_sampler.set_epoch(epoch)
        lr_scheduler.step(epoch if not isinstance(lr_scheduler, ReduceLROnPlateau) else best_acc1)

        # train for one epoch
        train_epoch(train_loader, model, criterion, optimizer, compression_algo, epoch, config, is_inception)

        # compute compression algo statistics
        stats = compression_algo.statistics()

        acc1 = best_acc1
        if epoch % config.test_every_n_epochs == 0:
            # evaluate on validation set
            acc1, _ = validate(val_loader, model, criterion, config)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # update compression scheduler state at the end of the epoch
        compression_algo.scheduler.epoch_step()

        if is_main_process():
            print_statistics(stats)

            checkpoint_path = osp.join(config.checkpoint_save_dir, get_name(config) + '_last.pth')
            checkpoint = {
                'epoch': epoch + 1,
                'arch': model_name,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
                'scheduler': compression_algo.scheduler.state_dict()
            }

            torch.save(checkpoint, checkpoint_path)
            make_additional_checkpoints(checkpoint_path, is_best, epoch + 1, config)

            for key, value in stats.items():
                if isinstance(value, (int, float)):
                    config.tb.add_scalar("compression/statistics/{0}".format(key), value, len(train_loader) * epoch)
def test_trained_model_export(config, tmp_path, multiprocessing_distributed):
    c = config
    config_factory = ConfigFactory(config['config'], tmp_path / 'config.json')
    ckpt_path = os.path.join(c["checkpoint_save_dir"],
                             "distributed" if multiprocessing_distributed else "data_parallel",
                             get_name(config_factory.config) + "_last.pth")
    onnx_path = os.path.join(str(tmp_path), "model.onnx")
    args = {
        "--mode": "test",
        "--config": config_factory.serialize(),
        "--to-onnx": onnx_path,
        "--weights": ckpt_path
    }

    runner = Command(create_command_line(args, c["sample_type"]))
    res = runner.run()
    assert res == 0
    assert os.path.exists(onnx_path)
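# Optional follow-up sketch (illustrative, not part of the original test suite): test_trained_model_export
# above only asserts that the ONNX file exists. Assuming the `onnx` package is available, the exported graph
# could additionally be sanity-checked as below; `check_exported_model` is a hypothetical helper, not an
# existing function in this suite.
def check_exported_model(onnx_path):
    import onnx

    # Parse the exported protobuf and validate the graph structure; check_model raises if it is malformed.
    model = onnx.load(onnx_path)
    onnx.checker.check_model(model)
    # Return the graph output names, e.g. to assert that the expected heads are present.
    return [output.name for output in model.graph.output]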
def train_epoch_end(config, compression_algo, net, epoch, iteration, epoch_size, lr_scheduler,
                    optimizer, test_data_loader, best_mAp):
    is_best = False
    test_freq_in_epochs = max(config.test_interval // epoch_size, 1)
    compression_algo.scheduler.epoch_step(epoch)
    if not isinstance(lr_scheduler, ReduceLROnPlateau):
        lr_scheduler.step(epoch)

    if epoch % test_freq_in_epochs == 0 and iteration != 0:
        if is_on_first_rank(config):
            print_statistics(compression_algo.statistics())
        with torch.no_grad():
            net.eval()
            mAP = test_net(net, config.device, test_data_loader,
                           distributed=config.multiprocessing_distributed)
            if mAP > best_mAp:
                is_best = True
                best_mAp = mAP
            if config.metrics_dump is not None:
                write_metrics(mAP, config)
            if isinstance(lr_scheduler, ReduceLROnPlateau):
                lr_scheduler.step(mAP)
            net.train()

    if is_on_first_rank(config):
        checkpoint_file_path = osp.join(config.checkpoint_save_dir, "{}_last.pth".format(get_name(config)))
        torch.save({
            'state_dict': net.state_dict(),
            'optimizer': optimizer.state_dict(),
            'iter': iteration,
            'scheduler': compression_algo.scheduler.state_dict()
        }, str(checkpoint_file_path))
        make_additional_checkpoints(checkpoint_file_path, is_best=is_best, epoch=epoch + 1, config=config)

    return best_mAp
def get_resuming_checkpoint_path(config_factory, multiprocessing_distributed, checkpoint_save_dir):
    return os.path.join(checkpoint_save_dir,
                        "distributed" if multiprocessing_distributed else "data_parallel",
                        get_name(config_factory.config) + "_last.pth")
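# Usage sketch (the config name below is hypothetical): for a config named "squeezenet1_1_int8" trained in
# distributed mode, the helper above resolves to
#     <checkpoint_save_dir>/distributed/squeezenet1_1_int8_last.pth
# i.e. the "_last.pth" checkpoint written by the corresponding test_pretrained_model_train run, which the
# resume/eval/export tests then pick up via "--resume" or "--weights".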
jconfig = json.load(config_path_.open())
args_ = {
    'data': dataset_path,
    'weights': weights_path_,
    'config': str(config_path_)
}

if batch_size:
    args_['batch-size'] = batch_size
if epochs:
    args_['epochs'] = epochs

test_config_ = {
    'sample_type': sample_type_,
    'expected_accuracy': expected_accuracy_,
    'absolute_tolerance_train': absolute_tolerance_train_,
    'absolute_tolerance_eval': absolute_tolerance_eval_,
    'checkpoint_name': get_name(jconfig)
}

CONFIG_PARAMS.append(tuple([test_config_, args_, execution_arg_, dataset_name_]))


def get_config_name(config_path):
    base = os.path.basename(config_path)
    return os.path.splitext(base)[0]


@pytest.fixture(
    scope='module',
    params=CONFIG_PARAMS,
    ids=[
def train(net, compression_algo, train_data_loader, test_data_loader, criterion, optimizer, config, lr_scheduler):
    net.train()
    # loss counters
    loc_loss = 0  # epoch
    conf_loss = 0

    epoch_size = len(train_data_loader)
    print('Training ', config.model, ' on ', train_data_loader.dataset.name, ' dataset...')
    batch_iterator = None

    t_start = time.time()
    print_statistics(compression_algo.statistics())

    for iteration in range(config.start_iter, config['max_iter']):
        if (not batch_iterator) or (iteration % epoch_size == 0):
            # create batch iterator
            batch_iterator = iter(train_data_loader)

        epoch = iteration // epoch_size

        if iteration % epoch_size == 0:
            train_epoch_end(config, compression_algo, net, epoch, iteration, epoch_size, lr_scheduler,
                            optimizer, test_data_loader)

        compression_algo.scheduler.step(iteration - config.start_iter)

        optimizer.zero_grad()
        batch_iterator, batch_loss, batch_loss_c, batch_loss_l, loss_comp = train_step(
            batch_iterator, compression_algo, config, criterion, net, train_data_loader
        )
        optimizer.step()

        batch_loss_l = batch_loss_l / config.iter_size
        batch_loss_c = batch_loss_c / config.iter_size
        model_loss = (batch_loss_l + batch_loss_c) / config.iter_size
        batch_loss = batch_loss / config.iter_size

        loc_loss += batch_loss_l.item()
        conf_loss += batch_loss_c.item()

        ###########################
        # Logging
        ###########################
        if is_on_first_rank(config):
            config.tb.add_scalar("train/loss_l", batch_loss_l.item(), iteration)
            config.tb.add_scalar("train/loss_c", batch_loss_c.item(), iteration)
            config.tb.add_scalar("train/loss", batch_loss.item(), iteration)

            checkpoint_file_path = osp.join(config.checkpoint_save_dir, "{}_last.pth".format(get_name(config)))
            torch.save({
                'state_dict': net.state_dict(),
                'optimizer': optimizer.state_dict(),
                'iter': config['max_iter'],
                'scheduler': compression_algo.scheduler.state_dict()
            }, str(checkpoint_file_path))
            make_additional_checkpoints(checkpoint_file_path, is_best=True, epoch=epoch + 1, config=config)

        if iteration % config.print_freq == 0:
            t_finish = time.time()
            t_elapsed = t_finish - t_start
            t_start = time.time()
            print('{}: iter {} epoch {} || Loss: {:.4} || Time {:.4}s || lr: {} || CR loss: {}'.format(
                config.rank, iteration, epoch, model_loss.item(), t_elapsed,
                optimizer.param_groups[0]['lr'],
                loss_comp.item() if isinstance(loss_comp, torch.Tensor) else loss_comp
            ))
def train(config, compression_ctrl, model, criterion, is_inception, lr_scheduler, model_name, optimizer,
          train_loader, train_sampler, val_loader, best_acc1=0):
    best_compression_level = CompressionLevel.NONE
    for epoch in range(config.start_epoch, config.epochs):
        config.cur_epoch = epoch
        if config.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train_epoch(train_loader, model, criterion, optimizer, compression_ctrl, epoch, config, is_inception)

        # Learning rate scheduling should be applied after the optimizer's update
        lr_scheduler.step(epoch if not isinstance(lr_scheduler, ReduceLROnPlateau) else best_acc1)

        # update compression scheduler state at the end of the epoch
        compression_ctrl.scheduler.epoch_step()

        # compute compression algo statistics
        stats = compression_ctrl.statistics()

        acc1 = best_acc1
        if epoch % config.test_every_n_epochs == 0:
            # evaluate on validation set
            acc1, _ = validate(val_loader, model, criterion, config)

        compression_level = compression_ctrl.compression_level()
        # Remember the best acc@1, taking the compression level into account: even if the current acc@1 is
        # lower than the best one so far, the checkpoint can still be the best if its compression level is
        # higher. Compression levels in ascending order: NONE, PARTIAL, FULL.
        is_best_by_accuracy = acc1 > best_acc1 and compression_level == best_compression_level
        is_best = is_best_by_accuracy or compression_level > best_compression_level
        if is_best:
            best_acc1 = acc1
        best_compression_level = max(compression_level, best_compression_level)

        acc = best_acc1 / 100
        if config.metrics_dump is not None:
            write_metrics(acc, config.metrics_dump)

        if is_main_process():
            print_statistics(stats)

            checkpoint_path = osp.join(config.checkpoint_save_dir, get_name(config) + '_last.pth')
            checkpoint = {
                'epoch': epoch + 1,
                'arch': model_name,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'compression_level': compression_level,
                'acc1': acc1,
                'optimizer': optimizer.state_dict(),
                'scheduler': compression_ctrl.scheduler.state_dict()
            }

            torch.save(checkpoint, checkpoint_path)
            make_additional_checkpoints(checkpoint_path, is_best, epoch + 1, config)

            for key, value in stats.items():
                if isinstance(value, (int, float)):
                    config.tb.add_scalar("compression/statistics/{0}".format(key), value, len(train_loader) * epoch)
def train(net, compression_ctrl, train_data_loader, test_data_loader, criterion, optimizer, config, lr_scheduler):
    net.train()
    # loss counters
    loc_loss = 0  # epoch
    conf_loss = 0

    epoch_size = len(train_data_loader)
    logger.info('Training {} on {} dataset...'.format(config.model, train_data_loader.dataset.name))
    batch_iterator = None

    t_start = time.time()
    print_statistics(compression_ctrl.statistics())

    best_mAp = 0
    best_compression_level = CompressionLevel.NONE
    test_freq_in_epochs = max(config.test_interval // epoch_size, 1)

    for iteration in range(config.start_iter, config['max_iter']):
        if (not batch_iterator) or (iteration % epoch_size == 0):
            # create batch iterator
            batch_iterator = iter(train_data_loader)

        epoch = iteration // epoch_size

        if (iteration + 1) % epoch_size == 0:
            compression_ctrl.scheduler.epoch_step(epoch)
            compression_level = compression_ctrl.compression_level()
            is_best = False

            if (epoch + 1) % test_freq_in_epochs == 0:
                if is_on_first_rank(config):
                    print_statistics(compression_ctrl.statistics())
                with torch.no_grad():
                    net.eval()
                    mAP = test_net(net, config.device, test_data_loader,
                                   distributed=config.multiprocessing_distributed)
                    is_best_by_mAP = mAP > best_mAp and compression_level == best_compression_level
                    is_best = is_best_by_mAP or compression_level > best_compression_level
                    if is_best:
                        best_mAp = mAP
                    best_compression_level = max(compression_level, best_compression_level)
                    net.train()

            # Learning rate scheduling should be applied after the optimizer's update
            if not isinstance(lr_scheduler, ReduceLROnPlateau):
                lr_scheduler.step(epoch)
            else:
                lr_scheduler.step(mAP)

            if is_on_first_rank(config):
                logger.info('Saving state, iter: {}'.format(iteration))

                checkpoint_file_path = osp.join(config.checkpoint_save_dir,
                                                "{}_last.pth".format(get_name(config)))
                torch.save({
                    'state_dict': net.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'iter': config['max_iter'],
                    'scheduler': compression_ctrl.scheduler.state_dict(),
                    'compression_level': compression_level,
                }, str(checkpoint_file_path))

                make_additional_checkpoints(checkpoint_file_path, is_best=is_best, epoch=epoch + 1, config=config)

        compression_ctrl.scheduler.step(iteration - config.start_iter)

        optimizer.zero_grad()
        batch_iterator, batch_loss, batch_loss_c, batch_loss_l, loss_comp = train_step(
            batch_iterator, compression_ctrl, config, criterion, net, train_data_loader)
        optimizer.step()

        batch_loss_l = batch_loss_l / config.iter_size
        batch_loss_c = batch_loss_c / config.iter_size
        model_loss = (batch_loss_l + batch_loss_c) / config.iter_size
        batch_loss = batch_loss / config.iter_size

        loc_loss += batch_loss_l.item()
        conf_loss += batch_loss_c.item()

        ###########################
        # Logging
        ###########################
        if is_on_first_rank(config):
            config.tb.add_scalar("train/loss_l", batch_loss_l.item(), iteration)
            config.tb.add_scalar("train/loss_c", batch_loss_c.item(), iteration)
            config.tb.add_scalar("train/loss", batch_loss.item(), iteration)

        if iteration % config.print_freq == 0:
            t_finish = time.time()
            t_elapsed = t_finish - t_start
            t_start = time.time()
            logger.info('{}: iter {} epoch {} || Loss: {:.4} || Time {:.4}s || lr: {} || CR loss: {}'.format(
                config.rank, iteration, epoch, model_loss.item(), t_elapsed,
                optimizer.param_groups[0]['lr'],
                loss_comp.item() if isinstance(loss_comp, torch.Tensor) else loss_comp))

    if config.metrics_dump is not None:
        write_metrics(best_mAp, config.metrics_dump)
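# Worked example of the scheduling arithmetic in the detection train() above (numbers are illustrative):
# with epoch_size = len(train_data_loader) = 500 batches and config.test_interval = 5000 iterations,
# test_freq_in_epochs = max(5000 // 500, 1) = 10. The epoch-end block (compression epoch_step, LR scheduling,
# checkpoint saving) runs on the last iteration of every epoch, i.e. when (iteration + 1) % epoch_size == 0,
# while evaluation and best-checkpoint selection run only every 10th epoch, when
# (epoch + 1) % test_freq_in_epochs == 0.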
def train_staged(config, compression_ctrl, model, criterion, is_inception, optimizer_scheduler, model_name,
                 optimizer, train_loader, train_sampler, val_loader, kd_loss_calculator, batch_multiplier,
                 best_acc1=0):
    best_compression_level = CompressionLevel.NONE
    for epoch in range(config.start_epoch, config.epochs):
        config.cur_epoch = epoch
        if config.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train_epoch_staged(train_loader, batch_multiplier, model, criterion, optimizer, optimizer_scheduler,
                           kd_loss_calculator, compression_ctrl, epoch, config, is_inception)

        # compute compression algo statistics
        stats = compression_ctrl.statistics()

        acc1 = best_acc1
        if epoch % config.test_every_n_epochs == 0:
            # evaluate on validation set
            acc1, _ = validate(val_loader, model, criterion, config)

        compression_level = compression_ctrl.compression_level()
        # Remember the best acc@1, taking the compression level into account: even if the current acc@1 is
        # lower than the best one so far, the checkpoint can still be the best if its compression level is
        # higher. Compression levels in ascending order: NONE, PARTIAL, FULL.
        is_best_by_accuracy = acc1 > best_acc1 and compression_level == best_compression_level
        is_best = is_best_by_accuracy or compression_level > best_compression_level
        best_acc1 = max(acc1, best_acc1)
        best_compression_level = max(compression_level, best_compression_level)

        # The statistics (e.g. the portion of enabled quantizers) relate to the epoch that has just finished,
        # so they must be printed before epoch_step(), which may already reflect the state of the next epoch
        # (e.g. the next portion of enabled quantizers).
        if is_main_process():
            print_statistics(stats)

        # update compression scheduler state at the end of the epoch
        compression_ctrl.scheduler.epoch_step()
        optimizer_scheduler.epoch_step()

        if is_main_process():
            checkpoint_path = osp.join(config.checkpoint_save_dir, get_name(config) + '_last.pth')
            checkpoint = {
                'epoch': epoch + 1,
                'arch': model_name,
                'state_dict': model.state_dict(),
                'original_model_state_dict': kd_loss_calculator.original_model.state_dict(),
                'best_acc1': best_acc1,
                'compression_level': compression_level,
                'optimizer': optimizer.state_dict(),
                'compression_scheduler': compression_ctrl.scheduler.state_dict(),
                'optimizer_scheduler': optimizer_scheduler.state_dict()
            }

            torch.save(checkpoint, checkpoint_path)
            make_additional_checkpoints(checkpoint_path, is_best, epoch + 1, config)

            for key, value in stats.items():
                if isinstance(value, (int, float)):
                    config.tb.add_scalar("compression/statistics/{0}".format(key), value, len(train_loader) * epoch)
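# A minimal, self-contained sketch of the "best checkpoint" rule used by train(), train_staged() and the
# detection loop above, assuming the compression level is an ordered value with NONE < PARTIAL < FULL.
# The enum and helper below are illustrative stand-ins, not the nncf implementation: a checkpoint counts as
# best either when it improves the metric at the current best compression level, or when it reaches a
# compression level higher than any previous checkpoint, regardless of the metric.
from enum import IntEnum


class _CompressionLevel(IntEnum):  # illustrative stand-in for the ordered compression level
    NONE = 0
    PARTIAL = 1
    FULL = 2


def _is_best_checkpoint(metric, best_metric, level, best_level):
    # A better metric only counts while the compression level has not regressed;
    # a strictly higher compression level wins regardless of the metric.
    is_best_by_metric = metric > best_metric and level == best_level
    return is_best_by_metric or level > best_level


# Lower accuracy but higher compression level: still the best checkpoint.
assert _is_best_checkpoint(75.0, 76.0, _CompressionLevel.FULL, _CompressionLevel.PARTIAL)
# Higher accuracy at the same compression level: best by metric.
assert _is_best_checkpoint(77.0, 76.0, _CompressionLevel.PARTIAL, _CompressionLevel.PARTIAL)
# Lower accuracy at the same compression level: not the best.
assert not _is_best_checkpoint(75.0, 76.0, _CompressionLevel.PARTIAL, _CompressionLevel.PARTIAL)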