Example #1
def test_accuracy_aware_training_pipeline(accuracy_aware_config, tmp_path,
                                          multiprocessing_distributed):
    config_factory = ConfigFactory(accuracy_aware_config['nncf_config'],
                                   tmp_path / 'config.json')

    args = {
        "--mode": "train",
        "--data": accuracy_aware_config["dataset_path"],
        "--config": config_factory.serialize(),
        "--log-dir": tmp_path,
        "--batch-size": accuracy_aware_config["batch_size"] * NUM_DEVICES,
        "--workers":
        0,  # Workaround for the PyTorch MultiProcessingDataLoader issue
        "--epochs": 2,
        "--dist-url": "tcp://127.0.0.1:8989"
    }

    if not torch.cuda.is_available():
        args["--cpu-only"] = True
    elif multiprocessing_distributed:
        args["--multiprocessing-distributed"] = True

    runner = Command(
        create_command_line(args, accuracy_aware_config["sample_type"]))
    runner.run()

    # Locate the timestamped run directory created by the sample and the
    # nested accuracy-aware training directory inside it.
    from glob import glob
    time_dir_1 = glob(
        os.path.join(tmp_path, get_name(config_factory.config),
                     '*/'))[0].split('/')[-2]
    time_dir_2 = glob(
        os.path.join(tmp_path, get_name(config_factory.config), time_dir_1,
                     'accuracy_aware_training', '*/'))[0].split('/')[-2]
    last_checkpoint_path = os.path.join(tmp_path,
                                        get_name(config_factory.config),
                                        time_dir_1, 'accuracy_aware_training',
                                        time_dir_2,
                                        'acc_aware_checkpoint_last.pth')

    assert os.path.exists(last_checkpoint_path)
    if 'compression' in accuracy_aware_config['nncf_config']:
        allowed_compression_stages = (CompressionStage.FULLY_COMPRESSED,
                                      CompressionStage.PARTIALLY_COMPRESSED)
    else:
        allowed_compression_stages = (CompressionStage.UNCOMPRESSED, )
    compression_stage = extract_compression_stage_from_checkpoint(
        last_checkpoint_path)
    assert compression_stage in allowed_compression_stages
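Note: create_command_line turns the args dict above into a shell command for the chosen sample. A minimal sketch of that flattening, assuming True-valued entries become bare switches and everything else becomes an option/value pair (the real helper also resolves the sample entry point from sample_type, which is only guessed at here):

def create_command_line_sketch(args, sample_type):
    # Hypothetical flattening of the args dict used by the tests above.
    parts = ["python", "examples/{}/main.py".format(sample_type)]  # assumed layout
    for option, value in args.items():
        if value is True:
            parts.append(option)  # bare flag, e.g. "--cpu-only"
        else:
            parts.extend([option, str(value)])
    return " ".join(parts)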
Example #2
def test_trained_model_eval(request, config, tmp_path,
                            multiprocessing_distributed, case_common_dirs):
    depends_on_pretrained_train(request, config["test_case_id"],
                                multiprocessing_distributed)
    config_factory = ConfigFactory(config['nncf_config'],
                                   tmp_path / 'config.json')
    config_factory.config = update_compression_algo_dict_with_legr_save_load_params(
        config_factory.config, case_common_dirs['save_coeffs_path'])

    ckpt_path = os.path.join(
        case_common_dirs["checkpoint_save_dir"],
        "distributed" if multiprocessing_distributed else "data_parallel",
        get_name(config_factory.config) + "_last.pth")
    args = {
        "--mode": "test",
        "--data": config["dataset_path"],
        "--config": config_factory.serialize(),
        "--log-dir": tmp_path,
        "--batch-size": config["batch_size"] * NUM_DEVICES,
        "--workers":
        0,  # Workaround for the PyTorch MultiProcessingDataLoader issue
        "--weights": ckpt_path,
        "--dist-url": "tcp://127.0.0.1:8987"
    }

    if not torch.cuda.is_available():
        args["--cpu-only"] = True
    elif multiprocessing_distributed:
        args["--multiprocessing-distributed"] = True

    runner = Command(create_command_line(args, config["sample_type"]))
    runner.run()
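For reference, ConfigFactory as used in these tests wraps an NNCF config dict, exposes it for modification via .config, and serialize() writes the dict to the path given at construction and returns that path for the --config option. A minimal stand-in with that contract (a sketch, not the actual test helper):

import json

class ConfigFactorySketch:
    """Hypothetical minimal equivalent of the ConfigFactory test helper."""

    def __init__(self, base_config, config_path):
        self.config = base_config
        self._config_path = config_path

    def serialize(self):
        # Persist the (possibly modified) config and return the file path,
        # which the tests pass to the sample as --config.
        with open(str(self._config_path), 'w', encoding='utf8') as f:
            json.dump(self.config, f)
        return str(self._config_path)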
Example #3
def test_resume(request, config, tmp_path, multiprocessing_distributed,
                case_common_dirs):
    depends_on_pretrained_train(request, config["test_case_id"],
                                multiprocessing_distributed)
    checkpoint_save_dir = os.path.join(str(tmp_path), "models")
    config_factory = ConfigFactory(config['nncf_config'],
                                   tmp_path / 'config.json')
    config_factory.config = update_compression_algo_dict_with_legr_save_load_params(
        config_factory.config, case_common_dirs['save_coeffs_path'], False)

    ckpt_path = get_resuming_checkpoint_path(
        config_factory, multiprocessing_distributed,
        case_common_dirs["checkpoint_save_dir"])
    if "max_iter" in config_factory.config:
        config_factory.config["max_iter"] += 2
    args = {
        "--mode": "train",
        "--data": config["dataset_path"],
        "--config": config_factory.serialize(),
        "--log-dir": tmp_path,
        "--batch-size": config["batch_size"] * NUM_DEVICES,
        "--workers":
        0,  # Workaround for the PyTorch MultiProcessingDataLoader issue
        "--epochs": 3,
        "--checkpoint-save-dir": checkpoint_save_dir,
        "--resume": ckpt_path,
        "--dist-url": "tcp://127.0.0.1:8986"
    }

    if not torch.cuda.is_available():
        args["--cpu-only"] = True
    elif multiprocessing_distributed:
        args["--multiprocessing-distributed"] = True

    runner = Command(create_command_line(args, config["sample_type"]))
    runner.run()
    last_checkpoint_path = os.path.join(
        checkpoint_save_dir,
        get_name(config_factory.config) + "_last.pth")
    assert os.path.exists(last_checkpoint_path)
    if 'compression' in config['nncf_config']:
        allowed_compression_stages = (CompressionStage.FULLY_COMPRESSED,
                                      CompressionStage.PARTIALLY_COMPRESSED)
    else:
        allowed_compression_stages = (CompressionStage.UNCOMPRESSED, )
    compression_stage = extract_compression_stage_from_checkpoint(
        last_checkpoint_path)
    assert compression_stage in allowed_compression_stages
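The stage-gating assertion repeated in these tests can be read as a small predicate: configs with a 'compression' section must end up at least PARTIALLY_COMPRESSED, while configs without one must stay UNCOMPRESSED. Factored out (hypothetical helper name):

def allowed_stages_for(nncf_config):
    # Mirrors the branching used in the tests above.
    if 'compression' in nncf_config:
        return (CompressionStage.FULLY_COMPRESSED,
                CompressionStage.PARTIALLY_COMPRESSED)
    return (CompressionStage.UNCOMPRESSED,)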
Example #4
def test_pretrained_model_train(config, tmp_path, multiprocessing_distributed,
                                case_common_dirs):
    checkpoint_save_dir = os.path.join(
        case_common_dirs["checkpoint_save_dir"],
        "distributed" if multiprocessing_distributed else "data_parallel")
    config_factory = ConfigFactory(config['nncf_config'],
                                   tmp_path / 'config.json')
    config_factory.config = update_compression_algo_dict_with_legr_save_load_params(
        config_factory.config, case_common_dirs['save_coeffs_path'])

    args = {
        "--mode": "train",
        "--data": config["dataset_path"],
        "--config": config_factory.serialize(),
        "--log-dir": tmp_path,
        "--batch-size": config["batch_size"] * NUM_DEVICES,
        "--workers":
        0,  # Workaround for the PyTorch MultiProcessingDataLoader issue
        "--epochs": 2,
        "--checkpoint-save-dir": checkpoint_save_dir,
        "--dist-url": "tcp://127.0.0.1:8989"
    }

    if not torch.cuda.is_available():
        args["--cpu-only"] = True
    elif multiprocessing_distributed:
        args["--multiprocessing-distributed"] = True
    elif config['nncf_config']["model"] == "inception_v3":
        pytest.skip("InceptionV3 cannot be trained with DataParallel "
                    "because its forward pass returns a namedtuple, which "
                    "DataParallel still does not support.")

    runner = Command(create_command_line(args, config["sample_type"]))
    runner.run()
    last_checkpoint_path = os.path.join(
        checkpoint_save_dir,
        get_name(config_factory.config) + "_last.pth")
    assert os.path.exists(last_checkpoint_path)
    if 'compression' in config['nncf_config']:
        allowed_compression_stages = (CompressionStage.FULLY_COMPRESSED,
                                      CompressionStage.PARTIALLY_COMPRESSED)
    else:
        allowed_compression_stages = (CompressionStage.UNCOMPRESSED, )
    compression_stage = extract_compression_stage_from_checkpoint(
        last_checkpoint_path)
    assert compression_stage in allowed_compression_stages
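The CUDA/distributed branching is identical across these tests; as a reusable helper it would look like the sketch below (hypothetical name; Example #4 additionally skips inception_v3 on the DataParallel path):

def apply_device_mode(args, multiprocessing_distributed):
    # CPU-only when CUDA is unavailable; otherwise spawn one process per GPU
    # or fall back to single-process DataParallel.
    if not torch.cuda.is_available():
        args["--cpu-only"] = True
    elif multiprocessing_distributed:
        args["--multiprocessing-distributed"] = True
    return args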
Example #5
def get_resuming_checkpoint_path(config_factory, multiprocessing_distributed,
                                 checkpoint_save_dir):
    return os.path.join(
        checkpoint_save_dir,
        "distributed" if multiprocessing_distributed else "data_parallel",
        get_name(config_factory.config) + "_last.pth")
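Usage note: with multiprocessing_distributed=True and a config whose get_name() is, say, 'mobilenet_v2_int8' (hypothetical value), the helper resolves to <checkpoint_save_dir>/distributed/mobilenet_v2_int8_last.pth, and to the data_parallel subdirectory otherwise.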
Example #6
                jconfig = json.load(config_path_.open())
                args_ = {
                    'data': dataset_path,
                    'weights': weights_path_,
                    'config': str(config_path_)
                }
                if batch_size:
                    args_['batch-size'] = batch_size
                if epochs:
                    args_['epochs'] = epochs
                test_config_ = {
                    'sample_type': sample_type_,
                    'expected_accuracy': expected_accuracy_,
                    'absolute_tolerance_train': absolute_tolerance_train_,
                    'absolute_tolerance_eval': absolute_tolerance_eval_,
                    'checkpoint_name': get_name(jconfig)
                }
                CONFIG_PARAMS.append(
                    (test_config_, args_, execution_arg_, dataset_name_))


def get_config_name(config_path):
    base = os.path.basename(config_path)
    return os.path.splitext(base)[0]
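Usage example: get_config_name('/configs/quantization/mobilenet_v2_int8.json') returns 'mobilenet_v2_int8' (the basename with its extension stripped; the path is hypothetical).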


@pytest.fixture(
    scope='module',
    params=CONFIG_PARAMS,
    ids=[
Example #7
def train_staged(config,
                 compression_ctrl,
                 model,
                 criterion,
                 criterion_fn,
                 optimizer_scheduler,
                 model_name,
                 optimizer,
                 train_loader,
                 train_sampler,
                 val_loader,
                 kd_loss_calculator,
                 batch_multiplier,
                 best_acc1=0):
    best_compression_stage = CompressionStage.UNCOMPRESSED
    for epoch in range(config.start_epoch, config.epochs):
        # update compression scheduler state at the start of the epoch
        compression_ctrl.scheduler.epoch_step()

        if config.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train_epoch_staged(train_loader, batch_multiplier, model, criterion,
                           criterion_fn, optimizer, optimizer_scheduler,
                           kd_loss_calculator, compression_ctrl, epoch, config)

        # compute compression algo statistics
        statistics = compression_ctrl.statistics()

        acc1 = best_acc1
        if epoch % config.test_every_n_epochs == 0:
            # evaluate on validation set
            # pylint: disable=E1123
            acc1, _, _ = validate(val_loader,
                                  model,
                                  criterion,
                                  config,
                                  epoch=epoch)

        compression_stage = compression_ctrl.compression_stage()
        # Remember the best acc@1 while taking the compression stage into account.
        # Even if the current acc@1 is lower than the best one, the checkpoint can
        # still be the best if its compression stage is higher. Compression stages
        # in ascending order: UNCOMPRESSED, PARTIALLY_COMPRESSED, FULLY_COMPRESSED.
        is_best_by_accuracy = acc1 > best_acc1 and compression_stage == best_compression_stage
        is_best = is_best_by_accuracy or compression_stage > best_compression_stage
        best_acc1 = max(acc1, best_acc1)
        best_compression_stage = max(compression_stage, best_compression_stage)

        # The statistics (e.g. the portion of enabled quantizers) refer to the epoch
        # that has just finished, so they must be printed before epoch_step, which
        # may already report the state of the next epoch (e.g. the next portion of
        # enabled quantizers).
        if is_main_process():
            logger.info(statistics.to_str())

        optimizer_scheduler.epoch_step()

        if is_main_process():
            checkpoint_path = osp.join(config.checkpoint_save_dir,
                                       get_name(config) + '_last.pth')
            checkpoint = {
                'epoch': epoch + 1,
                'arch': model_name,
                MODEL_STATE_ATTR: model.state_dict(),
                COMPRESSION_STATE_ATTR: compression_ctrl.get_compression_state(),
                'original_model_state_dict':
                    kd_loss_calculator.original_model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
                'optimizer_scheduler': optimizer_scheduler.state_dict()
            }

            torch.save(checkpoint, checkpoint_path)
            make_additional_checkpoints(checkpoint_path, is_best, epoch + 1,
                                        config)

            for key, value in prepare_for_tensorboard(statistics).items():
                config.mlflow.safe_call(
                    'log_metric', 'compression/statistics/{0}'.format(key),
                    value, epoch)
                config.tb.add_scalar("compression/statistics/{0}".format(key),
                                     value,
                                     len(train_loader) * epoch)
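A checkpoint written by train_staged can be restored through the same keys it saves. A minimal resume sketch, assuming the same MODEL_STATE_ATTR constant and already-constructed model, optimizer and optimizer_scheduler objects (restoring COMPRESSION_STATE_ATTR goes through NNCF model creation and is out of scope here):

def resume_from_checkpoint(path, model, optimizer, optimizer_scheduler):
    # Load on CPU; the caller is responsible for moving the model to device.
    ckpt = torch.load(path, map_location='cpu')
    model.load_state_dict(ckpt[MODEL_STATE_ATTR])
    optimizer.load_state_dict(ckpt['optimizer'])
    optimizer_scheduler.load_state_dict(ckpt['optimizer_scheduler'])
    # 'epoch' was saved as epoch + 1, i.e. the next epoch to run.
    return ckpt['epoch'], ckpt['best_acc1']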
Example #8
def train(net, compression_ctrl, train_data_loader, test_data_loader,
          criterion, optimizer, config, lr_scheduler):
    net.train()
    loc_loss = 0
    conf_loss = 0

    epoch_size = len(train_data_loader)
    logger.info('Training {} on {} dataset...'.format(
        config.model, train_data_loader.dataset.name))

    best_mAp = 0
    best_compression_stage = CompressionStage.UNCOMPRESSED
    # Evaluate every test_interval epochs, defaulting to every epoch.
    test_freq_in_epochs = config.test_interval if config.test_interval is not None else 1

    max_epochs = config['epochs']

    for epoch in range(config.start_epoch, max_epochs):
        compression_ctrl.scheduler.epoch_step(epoch)

        train_epoch(compression_ctrl, net, config, train_data_loader,
                    criterion, optimizer, epoch_size, epoch, loc_loss,
                    conf_loss)

        if is_main_process():
            logger.info(compression_ctrl.statistics().to_str())

        compression_stage = compression_ctrl.compression_stage()
        is_best = False
        if (epoch + 1) % test_freq_in_epochs == 0:
            with torch.no_grad():
                net.eval()
                mAP = test_net(net,
                               config.device,
                               test_data_loader,
                               distributed=config.multiprocessing_distributed)
                is_best_by_mAP = mAP > best_mAp and compression_stage == best_compression_stage
                is_best = is_best_by_mAP or compression_stage > best_compression_stage
                if is_best:
                    best_mAp = mAP
                best_compression_stage = max(compression_stage,
                                             best_compression_stage)
                if isinstance(lr_scheduler, ReduceLROnPlateau):
                    lr_scheduler.step(mAP)
                net.train()

        if is_on_first_rank(config):
            logger.info('Saving state, epoch: {}'.format(epoch))

            checkpoint_file_path = osp.join(
                config.checkpoint_save_dir,
                "{}_last.pth".format(get_name(config)))
            torch.save(
                {
                    MODEL_STATE_ATTR: net.state_dict(),
                    COMPRESSION_STATE_ATTR: compression_ctrl.get_compression_state(),
                    'optimizer': optimizer.state_dict(),
                    'epoch': epoch,
                }, str(checkpoint_file_path))
            make_additional_checkpoints(checkpoint_file_path,
                                        is_best=is_best,
                                        epoch=epoch + 1,
                                        config=config)

        # Learning rate scheduling should be applied after the optimizer's update
        if not isinstance(lr_scheduler, ReduceLROnPlateau):
            lr_scheduler.step(epoch)

    if config.metrics_dump is not None:
        write_metrics(best_mAp, config.metrics_dump)
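Both training loops use the same "best checkpoint" rule: a higher compression stage always wins, and the accuracy metric (acc@1 or mAP) only breaks ties within the same stage. Extracted as a pure function (hypothetical name):

def is_better_checkpoint(metric, stage, best_metric, best_stage):
    # Mirrors is_best_by_accuracy / is_best_by_mAP in the loops above.
    if stage != best_stage:
        return stage > best_stage
    return metric > best_metric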