def test_training_loop(random_data_loader):
    """Run two training epochs and verify hooks fire and the loss changes."""
    device = torch.device('cpu')
    model = QLeNet5(F.nll_loss).to(device)
    metrics = {'Loss': LossMetric(model.loss_fn, accumulate=False)}
    optimizer = get_optimizer(
        model.parameters(), {'algorithm': 'sgd', 'lr': 0.1})
    scheduler = get_lr_scheduler(
        optimizer,
        {'scheduler': 'step_lr', 'step_size': 1, 'gamma': 0.7},
        3,
        len(random_data_loader),
    )
    hook_spy = mock.MagicMock()
    hooks = [hook_spy]
    epoch_losses = []
    for epoch in (1, 2):
        train(
            model=model,
            train_loader=random_data_loader,
            metrics=metrics,
            optimizer=optimizer,
            scheduler=scheduler,
            device=device,
            epoch=epoch,
            log_interval=4,
            hooks=hooks,
        )
        epoch_losses.append(metrics['Loss'].compute())
    # The mock hook must have been invoked, and training must actually
    # move the loss between the two epochs.
    assert hook_spy.called
    assert epoch_losses[1] != epoch_losses[0]
def test_get_multi_step_lr_scheduler():
    """Check a multi-step schedule decays the lr at each milestone."""
    model = QLeNet5(F.nll_loss)
    optimizer = get_optimizer(
        model.parameters(), {'algorithm': 'sgd', 'lr': 0.1})
    scheduler = get_lr_scheduler(
        optimizer,
        {'scheduler': 'multi_step_lr', 'milestones': [30, 70], 'gamma': 0.7},
        70,
        100,
    )
    assert isinstance(scheduler, lr_scheduler.MultiStepLR)

    def lr_now():
        return optimizer.param_groups[0]['lr']

    # Two phases: 30 epochs (x100 steps) at the base lr, then 40 epochs
    # after one gamma decay.
    for n_steps, expected_lr in ((30 * 100, 0.1), (40 * 100, 0.7 * 0.1)):
        for _ in range(n_steps):
            assert lr_now() == expected_lr
            optimizer.step()
            scheduler.step()
    # Past the second milestone the lr has been decayed twice.
    assert lr_now() == 0.7 * 0.7 * 0.1
def test_get_linear_lr_scheduler():
    """Check the factory builds a LinearLR scheduler from config."""
    model = QLeNet5(F.nll_loss)
    sgd = get_optimizer(model.parameters(), {'algorithm': 'sgd', 'lr': 0.1})
    schedule = get_lr_scheduler(
        sgd, {'scheduler': 'linear_lr', 'min_lr': 1e-5}, 80, 100)
    assert isinstance(schedule, LinearLR)
def test_get_step_lr_scheduler():
    """Check a step schedule holds the base lr for one epoch, then decays."""
    model = QLeNet5(F.nll_loss)
    optimizer = get_optimizer(
        model.parameters(), {'algorithm': 'sgd', 'lr': 0.1})
    scheduler = get_lr_scheduler(
        optimizer,
        {'scheduler': 'step_lr', 'step_size': 1, 'gamma': 0.7},
        5,
        100,
    )
    assert isinstance(scheduler, lr_scheduler.StepLR)
    # Across the first epoch (100 steps) the lr stays at its initial value.
    for _step in range(100):
        assert optimizer.param_groups[0]['lr'] == 0.1
        optimizer.step()
        scheduler.step()
    # One step_size boundary crossed: the lr has decayed once by gamma.
    assert optimizer.param_groups[0]['lr'] == 0.7 * 0.1
def test_get_lambda_lr_scheduler():
    """Check a lambda schedule supplied as source text drives the lr."""
    model = QLeNet5(F.nll_loss)
    optimizer = get_optimizer(
        model.parameters(), {'algorithm': 'sgd', 'lr': 0.1})
    # Piecewise-constant multiplier, passed to the factory as source code.
    lr_lambda = """lambda s: next( v for (a, b), v in {(0, 200): 1, (200, 1000): 0.75}.items() if a <= s < b )"""
    scheduler = get_lr_scheduler(
        optimizer, {'scheduler': 'lambda_lr', 'lr_lambda': lr_lambda}, 10, 100)
    assert isinstance(scheduler, lr_scheduler.LambdaLR)
    # For the first 200 steps the multiplier is 1, so the lr is unchanged.
    for _ in range(200):
        assert optimizer.param_groups[0]['lr'] == 0.1
        optimizer.step()
        scheduler.step()
    # From step 200 onward the multiplier drops to 0.75.
    assert optimizer.param_groups[0]['lr'] == 0.75 * 0.1
def classification_task(
    config: dict,
    experiment_root_directory: Path,
    data_loader_cls: Type[QuantDataLoader],
    get_hooks: Callable[[dict, Path, MetricDict, MetricDict],
                        Tuple[List[Hook], List[Hook]]],
    restore_experiment: Optional[Path] = None,
) -> Tuple[List[Dict[str, float]], List[Dict[str, float]]]:
    """
    Driver program for running classification task.

    Args:
        config: merged config with CLI args
        experiment_root_directory: root directory for storing logs,
            checkpoints, etc.
        data_loader_cls: The QuantDataLoader class
        get_hooks: a function that returns lists of training and testing hooks
        restore_experiment: path to experiment to restore, None for do not
            restore

    Returns:
        (List of training set metrics for each epoch,
         list of test set metrics for each epoch).
    """
    env_config = config['environment']
    data_config = config['data']
    model_config = config['model']
    optimization_config = config['optimization']
    log_config = config['log']

    init_logging(log_config['level'])
    device = get_device(env_config['ngpus'], config.get('seed'),
                        **env_config.get('cuda', {}))

    data_loader = data_loader_cls(**data_config)
    # The train loader is only built when training is enabled.
    train_loader = data_loader.get_train_loader(
    ) if not config.get('skip_training') else None
    test_loader = data_loader.get_test_loader()

    epochs = optimization_config['epochs']

    # Optional knowledge distillation: a teacher model plus a KD loss that
    # replaces the plain classification loss during training.
    teacher = None
    use_kd = 'kd_config' in model_config
    if use_kd:
        teacher, kd_loss = get_teacher_and_kd_loss(
            device=device,
            ngpus=env_config['ngpus'],
            strict_keys=model_config.get('strict_keys', True),
            **model_config['kd_config'])

    loss_fn = get_loss_fn(model_config['loss']) if not use_kd else kd_loss
    model = get_model(
        architecture=model_config['architecture'],
        loss_fn=loss_fn,
        arch_config=model_config['arch_config'],
        device=device,
        ngpus=env_config['ngpus'],
    )

    optimizer, scheduler = None, None
    if not config.get('skip_training'):
        optimizer = get_optimizer(model.parameters(),
                                  optimization_config['optimizer'])
        scheduler = get_lr_scheduler(
            optimizer, optimization_config['lr_scheduler'], epochs,
            len(train_loader))  # type: ignore # noqa: E501

    if restore_experiment is not None:
        # Resume a previous run: model, optimizer and scheduler state are all
        # restored, and training continues after the checkpointed epoch.
        checkpoint_path = get_path_to_checkpoint(restore_experiment)
        model, restored_optimizer, restored_scheduler, start_epoch = \
            restore_from_checkpoint(
                model,
                optimizer,
                scheduler,
                checkpoint_path,
                device,
                model_config.get('strict_keys', True),
            )
        optimizer, scheduler = restored_optimizer, restored_scheduler
        start_epoch += 1
    elif config.get('init_from_checkpoint'):
        # Warm-start the weights only; optimizer/scheduler start fresh.
        model, _, _, _ = restore_from_checkpoint(
            model,
            None,
            None,
            config['init_from_checkpoint'],
            device,
            model_config.get('strict_keys', True),
        )
        start_epoch = 1
    else:
        start_epoch = 1

    train_metrics = {
        'Loss': LossMetric(loss_fn, accumulate=True),
        'Top-1 Accuracy': Top1Accuracy(accumulate=True),
        'Top-5 Accuracy': TopKAccuracy(5, accumulate=True),
    }
    # Test loss always uses the plain classification loss, even under KD.
    test_metrics = {
        'Loss': LossMetric(get_loss_fn(model_config['loss']), accumulate=True),
        'Top-1 Accuracy': Top1Accuracy(accumulate=True),
        'Top-5 Accuracy': TopKAccuracy(5, accumulate=True),
    }
    train_hooks, test_hooks = get_hooks(config, experiment_root_directory,
                                        train_metrics, test_metrics)

    train_epoch_metrics, test_epoch_metrics = [], []
    try:
        if config.get('skip_training'):
            # Evaluation-only mode: a single pass over the test set.
            computed_test_metrics = evaluate(
                model=model,
                test_loader=test_loader,
                metrics=test_metrics,
                device=device,
                epoch=1,
                hooks=test_hooks,
            )
            test_epoch_metrics.append(computed_test_metrics)
        else:
            # The run spans [start_epoch, final_epoch]. BUGFIX: the previous
            # `epoch == epochs` end-of-run check was wrong for resumed runs
            # (start_epoch > 1), which could skip the final checkpoint.
            final_epoch = start_epoch + epochs - 1
            for epoch in range(start_epoch, final_epoch + 1):
                computed_train_metrics = train(
                    model=model,
                    train_loader=train_loader,  # type: ignore
                    metrics=train_metrics,
                    optimizer=optimizer,
                    scheduler=scheduler,  # type: ignore
                    device=device,
                    epoch=epoch,
                    log_interval=log_config['interval'],
                    hooks=train_hooks,
                    teacher=teacher,
                )
                computed_test_metrics = evaluate(
                    model=model,
                    test_loader=test_loader,
                    metrics=test_metrics,
                    device=device,
                    epoch=epoch,
                    hooks=test_hooks,
                )
                train_epoch_metrics.append(computed_train_metrics)
                test_epoch_metrics.append(computed_test_metrics)
                # Periodic checkpointing, plus a guaranteed save at the end.
                if (epoch % log_config['save_model_freq'] == 0
                        or epoch == final_epoch):
                    log_checkpoints(
                        experiment_root_directory / config['experiment_name']
                        / 'checkpoints',
                        model,
                        optimizer,  # type: ignore
                        scheduler,  # type: ignore
                        epoch,
                    )
    finally:
        # BUGFIX: cleanup previously ran only on the success path; release
        # data-loader resources even when training/evaluation raises.
        data_loader.cleanup()

    return train_epoch_metrics, test_epoch_metrics