def test_accuracy_aware_training_pipeline(accuracy_aware_config, tmp_path, multiprocessing_distributed):
    config_factory = ConfigFactory(accuracy_aware_config['nncf_config'], tmp_path / 'config.json')
    args = {
        "--mode": "train",
        "--data": accuracy_aware_config["dataset_path"],
        "--config": config_factory.serialize(),
        "--log-dir": tmp_path,
        "--batch-size": accuracy_aware_config["batch_size"] * NUM_DEVICES,
        "--workers": 0,  # Workaround for the PyTorch MultiProcessingDataLoader issue
        "--epochs": 2,
        "--dist-url": "tcp://127.0.0.1:8989"
    }

    if not torch.cuda.is_available():
        args["--cpu-only"] = True
    elif multiprocessing_distributed:
        args["--multiprocessing-distributed"] = True

    runner = Command(create_command_line(args, accuracy_aware_config["sample_type"]))
    runner.run()

    # The sample writes its checkpoints under
    # <log-dir>/<config name>/<run dir>/accuracy_aware_training/<run dir>/,
    # so resolve the two timestamped run directories before checking for the checkpoint.
    from glob import glob
    time_dir_1 = glob(os.path.join(tmp_path, get_name(config_factory.config), '*/'))[0].split('/')[-2]
    time_dir_2 = glob(os.path.join(tmp_path, get_name(config_factory.config), time_dir_1,
                                   'accuracy_aware_training', '*/'))[0].split('/')[-2]
    last_checkpoint_path = os.path.join(tmp_path, get_name(config_factory.config), time_dir_1,
                                        'accuracy_aware_training', time_dir_2,
                                        'acc_aware_checkpoint_last.pth')
    assert os.path.exists(last_checkpoint_path)

    if 'compression' in accuracy_aware_config['nncf_config']:
        allowed_compression_stages = (CompressionStage.FULLY_COMPRESSED,
                                      CompressionStage.PARTIALLY_COMPRESSED)
    else:
        allowed_compression_stages = (CompressionStage.UNCOMPRESSED,)
    compression_stage = extract_compression_stage_from_checkpoint(last_checkpoint_path)
    assert compression_stage in allowed_compression_stages

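# Illustrative sketch only -- not the helper from the shared test utilities. create_command_line()
# is assumed here to turn the args dict above into a shell command for the chosen sample type,
# emitting boolean True values (e.g. "--cpu-only") as bare flags and everything else as
# "--key value"; the entry-point path below is a guess for illustration, not the real layout.
def create_command_line_sketch(args, sample_type):
    executable = 'python examples/torch/{}/main.py'.format(sample_type)  # hypothetical entry point
    parts = []
    for key, value in args.items():
        flag = key if key.startswith('--') else '--{}'.format(key)
        parts.append(flag if value is True else '{} {}'.format(flag, value))
    return ' '.join([executable] + parts)
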
def test_trained_model_eval(request, config, tmp_path, multiprocessing_distributed, case_common_dirs):
    depends_on_pretrained_train(request, config["test_case_id"], multiprocessing_distributed)
    config_factory = ConfigFactory(config['nncf_config'], tmp_path / 'config.json')
    config_factory.config = update_compression_algo_dict_with_legr_save_load_params(
        config_factory.config, case_common_dirs['save_coeffs_path'])
    ckpt_path = os.path.join(case_common_dirs["checkpoint_save_dir"],
                             "distributed" if multiprocessing_distributed else "data_parallel",
                             get_name(config_factory.config) + "_last.pth")
    args = {
        "--mode": "test",
        "--data": config["dataset_path"],
        "--config": config_factory.serialize(),
        "--log-dir": tmp_path,
        "--batch-size": config["batch_size"] * NUM_DEVICES,
        "--workers": 0,  # Workaround for the PyTorch MultiProcessingDataLoader issue
        "--weights": ckpt_path,
        "--dist-url": "tcp://127.0.0.1:8987"
    }

    if not torch.cuda.is_available():
        args["--cpu-only"] = True
    elif multiprocessing_distributed:
        args["--multiprocessing-distributed"] = True

    runner = Command(create_command_line(args, config["sample_type"]))
    runner.run()

def test_resume(request, config, tmp_path, multiprocessing_distributed, case_common_dirs):
    depends_on_pretrained_train(request, config["test_case_id"], multiprocessing_distributed)
    checkpoint_save_dir = os.path.join(str(tmp_path), "models")
    config_factory = ConfigFactory(config['nncf_config'], tmp_path / 'config.json')
    config_factory.config = update_compression_algo_dict_with_legr_save_load_params(
        config_factory.config, case_common_dirs['save_coeffs_path'], False)
    ckpt_path = get_resuming_checkpoint_path(config_factory, multiprocessing_distributed,
                                             case_common_dirs["checkpoint_save_dir"])
    if "max_iter" in config_factory.config:
        config_factory.config["max_iter"] += 2
    args = {
        "--mode": "train",
        "--data": config["dataset_path"],
        "--config": config_factory.serialize(),
        "--log-dir": tmp_path,
        "--batch-size": config["batch_size"] * NUM_DEVICES,
        "--workers": 0,  # Workaround for the PyTorch MultiProcessingDataLoader issue
        "--epochs": 3,
        "--checkpoint-save-dir": checkpoint_save_dir,
        "--resume": ckpt_path,
        "--dist-url": "tcp://127.0.0.1:8986"
    }

    if not torch.cuda.is_available():
        args["--cpu-only"] = True
    elif multiprocessing_distributed:
        args["--multiprocessing-distributed"] = True

    runner = Command(create_command_line(args, config["sample_type"]))
    runner.run()

    last_checkpoint_path = os.path.join(checkpoint_save_dir, get_name(config_factory.config) + "_last.pth")
    assert os.path.exists(last_checkpoint_path)

    if 'compression' in config['nncf_config']:
        allowed_compression_stages = (CompressionStage.FULLY_COMPRESSED,
                                      CompressionStage.PARTIALLY_COMPRESSED)
    else:
        allowed_compression_stages = (CompressionStage.UNCOMPRESSED,)
    compression_stage = extract_compression_stage_from_checkpoint(last_checkpoint_path)
    assert compression_stage in allowed_compression_stages

def test_pretrained_model_train(config, tmp_path, multiprocessing_distributed, case_common_dirs):
    checkpoint_save_dir = os.path.join(case_common_dirs["checkpoint_save_dir"],
                                       "distributed" if multiprocessing_distributed else "data_parallel")
    config_factory = ConfigFactory(config['nncf_config'], tmp_path / 'config.json')
    config_factory.config = update_compression_algo_dict_with_legr_save_load_params(
        config_factory.config, case_common_dirs['save_coeffs_path'])
    args = {
        "--mode": "train",
        "--data": config["dataset_path"],
        "--config": config_factory.serialize(),
        "--log-dir": tmp_path,
        "--batch-size": config["batch_size"] * NUM_DEVICES,
        "--workers": 0,  # Workaround for the PyTorch MultiProcessingDataLoader issue
        "--epochs": 2,
        "--checkpoint-save-dir": checkpoint_save_dir,
        "--dist-url": "tcp://127.0.0.1:8989"
    }

    if not torch.cuda.is_available():
        args["--cpu-only"] = True
    elif multiprocessing_distributed:
        args["--multiprocessing-distributed"] = True
    elif config['nncf_config']["model"] == "inception_v3":
        pytest.skip("InceptionV3 cannot be trained in DataParallel "
                    "because it outputs a namedtuple, which DataParallel "
                    "still seems unable to support.")

    runner = Command(create_command_line(args, config["sample_type"]))
    runner.run()

    last_checkpoint_path = os.path.join(checkpoint_save_dir, get_name(config_factory.config) + "_last.pth")
    assert os.path.exists(last_checkpoint_path)

    if 'compression' in config['nncf_config']:
        allowed_compression_stages = (CompressionStage.FULLY_COMPRESSED,
                                      CompressionStage.PARTIALLY_COMPRESSED)
    else:
        allowed_compression_stages = (CompressionStage.UNCOMPRESSED,)
    compression_stage = extract_compression_stage_from_checkpoint(last_checkpoint_path)
    assert compression_stage in allowed_compression_stages

def get_resuming_checkpoint_path(config_factory, multiprocessing_distributed, checkpoint_save_dir):
    return os.path.join(checkpoint_save_dir,
                        "distributed" if multiprocessing_distributed else "data_parallel",
                        get_name(config_factory.config) + "_last.pth")

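# Usage sketch with hypothetical values: with checkpoint_save_dir='/tmp/ckpts',
# multiprocessing_distributed=True and a config whose get_name() resolves to 'resnet18_int8'
# (an assumed name), get_resuming_checkpoint_path() would return
# '/tmp/ckpts/distributed/resnet18_int8_last.pth', i.e. the "last" checkpoint written by
# test_pretrained_model_train for the same configuration.
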
jconfig = json.load(config_path_.open())
args_ = {
    'data': dataset_path,
    'weights': weights_path_,
    'config': str(config_path_)
}
if batch_size:
    args_['batch-size'] = batch_size
if epochs:
    args_['epochs'] = epochs
test_config_ = {
    'sample_type': sample_type_,
    'expected_accuracy': expected_accuracy_,
    'absolute_tolerance_train': absolute_tolerance_train_,
    'absolute_tolerance_eval': absolute_tolerance_eval_,
    'checkpoint_name': get_name(jconfig)
}
CONFIG_PARAMS.append(tuple([test_config_, args_, execution_arg_, dataset_name_]))


def get_config_name(config_path):
    base = os.path.basename(config_path)
    return os.path.splitext(base)[0]


@pytest.fixture(
    scope='module',
    params=CONFIG_PARAMS,
    ids=[
def train_staged(config, compression_ctrl, model, criterion, criterion_fn, optimizer_scheduler, model_name,
                 optimizer, train_loader, train_sampler, val_loader, kd_loss_calculator, batch_multiplier,
                 best_acc1=0):
    best_compression_stage = CompressionStage.UNCOMPRESSED
    for epoch in range(config.start_epoch, config.epochs):
        # update compression scheduler state at the start of the epoch
        compression_ctrl.scheduler.epoch_step()

        if config.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train_epoch_staged(train_loader, batch_multiplier, model, criterion, criterion_fn, optimizer,
                           optimizer_scheduler, kd_loss_calculator, compression_ctrl, epoch, config)

        # compute compression algo statistics
        statistics = compression_ctrl.statistics()

        acc1 = best_acc1
        if epoch % config.test_every_n_epochs == 0:
            # evaluate on validation set
            # pylint: disable=E1123
            acc1, _, _ = validate(val_loader, model, criterion, config, epoch=epoch)

        compression_stage = compression_ctrl.compression_stage()
        # remember the best acc@1, taking the compression stage into account. Even if the current acc@1 is
        # lower than the best acc@1, the checkpoint can still be considered best if its compression stage is
        # higher than the best one seen so far. Compression stages in ascending order:
        # UNCOMPRESSED, PARTIALLY_COMPRESSED, FULLY_COMPRESSED.
        is_best_by_accuracy = acc1 > best_acc1 and compression_stage == best_compression_stage
        is_best = is_best_by_accuracy or compression_stage > best_compression_stage
        best_acc1 = max(acc1, best_acc1)
        best_compression_stage = max(compression_stage, best_compression_stage)

        # statistics (e.g. the portion of enabled quantizers) refer to the finished epoch, so they are
        # printed before epoch_step, which may already report the state of the next epoch
        # (e.g. the next portion of enabled quantizers)
        if is_main_process():
            logger.info(statistics.to_str())

        optimizer_scheduler.epoch_step()

        if is_main_process():
            checkpoint_path = osp.join(config.checkpoint_save_dir, get_name(config) + '_last.pth')
            checkpoint = {
                'epoch': epoch + 1,
                'arch': model_name,
                MODEL_STATE_ATTR: model.state_dict(),
                COMPRESSION_STATE_ATTR: compression_ctrl.get_compression_state(),
                'original_model_state_dict': kd_loss_calculator.original_model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
                'optimizer_scheduler': optimizer_scheduler.state_dict()
            }

            torch.save(checkpoint, checkpoint_path)
            make_additional_checkpoints(checkpoint_path, is_best, epoch + 1, config)

            for key, value in prepare_for_tensorboard(statistics).items():
                config.mlflow.safe_call('log_metric', 'compression/statistics/{0}'.format(key),
                                        value, epoch)
                config.tb.add_scalar("compression/statistics/{0}".format(key), value,
                                     len(train_loader) * epoch)

def train(net, compression_ctrl, train_data_loader, test_data_loader, criterion, optimizer, config,
          lr_scheduler):
    net.train()
    # loss counters
    loc_loss = 0
    conf_loss = 0

    epoch_size = len(train_data_loader)
    logger.info('Training {} on {} dataset...'.format(config.model, train_data_loader.dataset.name))

    best_mAp = 0
    best_compression_stage = CompressionStage.UNCOMPRESSED
    test_freq_in_epochs = config.test_interval
    if config.test_interval is None:
        test_freq_in_epochs = 1

    max_epochs = config['epochs']

    for epoch in range(config.start_epoch, max_epochs):
        compression_ctrl.scheduler.epoch_step(epoch)

        train_epoch(compression_ctrl, net, config, train_data_loader, criterion, optimizer,
                    epoch_size, epoch, loc_loss, conf_loss)

        if is_main_process():
            statistics = compression_ctrl.statistics()
            logger.info(statistics.to_str())

        compression_stage = compression_ctrl.compression_stage()
        is_best = False
        if (epoch + 1) % test_freq_in_epochs == 0:
            with torch.no_grad():
                net.eval()
                mAP = test_net(net, config.device, test_data_loader,
                               distributed=config.multiprocessing_distributed)
                is_best_by_mAP = mAP > best_mAp and compression_stage == best_compression_stage
                is_best = is_best_by_mAP or compression_stage > best_compression_stage
                if is_best:
                    best_mAp = mAP
                best_compression_stage = max(compression_stage, best_compression_stage)
                if isinstance(lr_scheduler, ReduceLROnPlateau):
                    lr_scheduler.step(mAP)
                net.train()

        if is_on_first_rank(config):
            logger.info('Saving state, epoch: {}'.format(epoch))

            checkpoint_file_path = osp.join(config.checkpoint_save_dir,
                                            "{}_last.pth".format(get_name(config)))
            torch.save({
                MODEL_STATE_ATTR: net.state_dict(),
                COMPRESSION_STATE_ATTR: compression_ctrl.get_compression_state(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }, str(checkpoint_file_path))
            make_additional_checkpoints(checkpoint_file_path, is_best=is_best, epoch=epoch + 1, config=config)

        # Learning rate scheduling should be applied after optimizer's update
        if not isinstance(lr_scheduler, ReduceLROnPlateau):
            lr_scheduler.step(epoch)

    if config.metrics_dump is not None:
        write_metrics(best_mAp, config.metrics_dump)

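# Illustrative sketch only: make_additional_checkpoints(), used by both training loops above, is
# assumed to mirror the freshly written "<name>_last.pth" checkpoint as "<name>_best.pth" when
# is_best is set, and possibly to keep periodic per-epoch snapshots; the exact behaviour and the
# save-interval attribute used below are assumptions, not the samples' actual implementation.
import shutil


def make_additional_checkpoints_sketch(checkpoint_path, is_best, epoch, config):
    if is_best:
        shutil.copyfile(checkpoint_path, checkpoint_path.replace('_last.pth', '_best.pth'))
    save_freq = getattr(config, 'save_freq', None)  # hypothetical attribute
    if save_freq and epoch % save_freq == 0:
        shutil.copyfile(checkpoint_path,
                        checkpoint_path.replace('_last.pth', '_epoch_{}.pth'.format(epoch)))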