示例#1
0
def train(run_dir: str = './run',
          datasets_dir: str = './data',
          dataset: str = 'cifar10',
          augmentation: bool = True,
          validation: int = 0,
          shuffle: bool = True,
          arch: str = 'preact20',
          optimizer: str = 'sgd',
          epochs: Tuple[int, ...] = (1, 90, 45, 45),
          learning_rates: Tuple[float, ...] = (0.01, 0.1, 0.01, 0.001),
          momentum: float = 0.9,
          weight_decay: float = 5e-4,
          batch_size: int = 128,
          eval_batch_size: int = 128,
          cuda: bool = True,
          device_ids: Tuple[int, ...] = tuple(range(cuda.device_count())),
          num_workers: int = 0,
          eval_num_workers: int = 0,
          seed: Optional[int] = None,
          checkpoint: str = 'best',
          track_test_acc: bool = True):
    """
    Train deep learning models (e.g., ResNet) on CIFAR10 and CIFAR100.

    Parameters
    ----------
    run_dir : str, default './run'
        Path to log results and other artifacts.
    datasets_dir : str, default './data'
        Path to datasets.
    dataset : str, default 'cifar10'
        Dataset to use in experiment (i.e., CIFAR10 or CIFAR100)
    augmentation : bool, default True
        Add data augmentation (i.e., random crop and horizontal flip).
    validation : int, default 0
        Number of examples from training set to use for valdiation.
    shuffle : bool, default True
        Shuffle training data before splitting into training and validation.
    arch : str, default 'preact20'
        Model architecture. `preact20` is short for ResNet20 w/ Pre-Activation.
    optimizer : str, default = 'sgd'
        Optimizer for training.
    epochs : Tuple[int, ...], default (1, 90, 45, 45)
        Epochs for training. Each number corresponds to a learning rate below.
    learning_rates : Tuple[float, ...], default (0.01, 0.1, 0.01, 0.001)
        Learning rates for training. Each learning rate is used for the
        corresponding number of epochs above.
    momentum : float, default 0.9
        Momentum for SGD.
    weight_decay : float, default 5e-4
        Weight decay for SGD.
    batch_size : int, default 128
        Minibatch size for training.
    eval_batch_size : int, default 128
        Minibatch size for evaluation (validation and testing).
    cuda : bool, default True
        Enable or disable use of available GPUs
    device_ids : Tuple[int, ...], default True
        GPU device ids to use.
    num_workers : int, default 0
        Number of data loading workers for training.
    eval_num_workers : int, default 0
        Number of data loading workers for evaluation.
    seed : Optional[int], default None
        Random seed for numpy, torch, and others. If None, a random int is
        chosen and logged in the experiments config file.
    checkpoint : str, default 'best'
        Specify when to create a checkpoint for the model: only checkpoint the
        best performing model on the validation data or the training data if
        `validation == 0` ("best"), after every epoch ("all"), or only the last
        epoch of each segment of the learning rate schedule ("last").
    track_test_acc : bool, default True
        Calculate performance of the models on the test data in addition or
        instead of the validation dataset.'

    Returns
    -------
    model : nn.Module
        Trained model.
    accuracies : Tuple[float, ...]
        The best accuracies from the model on the train, dev, and test splits.
    times : Tuple[timedelta, ...]
        Time spent training or evaluating on the train, dev, and test splits.
    """
    # Set seeds for reproducibility.
    seed = utils.set_random_seed(seed)
    # Capture all of the arguments to save alongside the results.
    config = utils.capture_config(**locals())
    # Create a unique timestamped directory for this experiment.
    run_dir = utils.create_run_dir(run_dir, timestamp=config['timestamp'])
    utils.save_config(config, run_dir)
    # Update the computing arguments based on the runtime system.
    use_cuda, device, device_ids, num_workers = utils.config_run_env(
        cuda=cuda, device_ids=device_ids, num_workers=num_workers)

    # Create the training dataset.
    train_dataset = create_dataset(dataset,
                                   datasets_dir,
                                   train=True,
                                   augmentation=augmentation)
    # Create the test dataset.
    test_dataset = None
    if track_test_acc:
        test_dataset = create_dataset(dataset,
                                      datasets_dir,
                                      train=False,
                                      augmentation=False)

    # Create data loaders
    train_loader, dev_loader, test_loader = create_loaders(
        train_dataset,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        validation=validation,
        run_dir=run_dir,
        test_dataset=test_dataset,
        use_cuda=use_cuda,
        shuffle=shuffle,
        num_workers=num_workers,
        eval_num_workers=eval_num_workers)

    # Calculate the number of classes (e.g., 10 or 100) so the model has
    #   the right dimension for its output.
    num_classes = len(set(train_dataset.targets))  # type: ignore

    # Create the model and optimizer for training.
    model, _optimizer = create_model_and_optimizer(
        run_dir=run_dir,
        arch=arch,
        num_classes=num_classes,
        optimizer=optimizer,
        learning_rate=learning_rates[0],
        momentum=momentum,
        weight_decay=weight_decay)

    # Create the loss criterion.
    criterion = nn.CrossEntropyLoss()

    # Move the model and loss to the appropriate devices.
    model = model.to(device)
    if use_cuda:
        model = nn.DataParallel(model, device_ids=device_ids)
    criterion = criterion.to(device)

    # Run training.
    return run_training(model=model,
                        optimizer=_optimizer,
                        criterion=criterion,
                        device=device,
                        train_loader=train_loader,
                        epochs=epochs,
                        learning_rates=learning_rates,
                        dev_loader=dev_loader,
                        test_loader=test_loader,
                        run_dir=run_dir,
                        checkpoint=checkpoint)
示例#2
0
    seed = utils.set_random_seed(seed)
    # Capture all of the arguments to save alongside the results.
    config = utils.capture_config(**locals())
    if scale_learning_rates:
        # For convenience, scale the learning rate for large-batch SGD
        learning_rates = tuple(np.array(learning_rates) * (batch_size / 256))
        config['scaled_learning_rates'] = learning_rates
    if proxy_scale_learning_rates:
        # For convenience, scale the learning rate for large-batch SGD
        proxy_learning_rates = tuple(
            np.array(proxy_learning_rates) *
            (proxy_batch_size / 256))  # noqa: E501
        config['proxy_scaled_learning_rates'] = proxy_learning_rates
    # Create a unique timestamped directory for this experiment.
    run_dir = utils.create_run_dir(run_dir, timestamp=config['timestamp'])
    utils.save_config(config, run_dir)
    # Update the computing arguments based on the runtime system.
    use_cuda, device, device_ids, num_workers = utils.config_run_env(
        cuda=cuda, device_ids=device_ids, num_workers=num_workers)

    # Create the training dataset.
    train_dataset = create_dataset(dataset,
                                   datasets_dir,
                                   train=True,
                                   augmentation=augmentation)
    # Verify there is enough training data for validation and
    #   the final selected subset.
    validate_splits(train_dataset, validation, subset)

    # Create the test dataset.
    test_dataset = None
示例#3
0
def train(run_dir: str = './run',
          datasets_dir: str = './data',
          dataset: str = 'imagenet',
          augmentation: bool = True,
          validation: int = 0,
          shuffle: bool = True,
          arch: str = 'resnet18',
          optimizer: str = 'sgd',
          epochs: Tuple[int, ...] = (1, 1, 1, 1, 1, 25, 30, 20, 20),
          learning_rates: Tuple[float,
                                ...] = (0.0167, 0.0333, 0.05, 0.0667, 0.0833,
                                        0.1, 0.01, 0.001, 0.0001),
          scale_learning_rates: bool = True,
          momentum: float = 0.9,
          weight_decay: float = 1e-4,
          batch_size: int = 256,
          eval_batch_size: int = 256,
          fp16: bool = False,
          label_smoothing: float = 0.1,
          loss_scale: float = 256.0,
          cuda: bool = True,
          device_ids: Tuple[int, ...] = tuple(range(cuda.device_count())),
          num_workers: int = 0,
          eval_num_workers: int = 0,
          seed: Optional[int] = None,
          checkpoint: str = 'best',
          track_test_acc: bool = True):
    """
    Train deep learning models (e.g., ResNet) on ImageNet.

    Parameters
    ----------
    run_dir : str, default './run'
        Path to log results and other artifacts.
    datasets_dir : str, default './data'
        Path to datasets.
    dataset : str, default 'imagenet'
        Dataset to use in experiment (unnecessary but kept for consistency)
    augmentation : bool, default True
        Add data augmentation (i.e., random crop and horizontal flip).
    validation : int, default 0
        Number of examples from training set to use for valdiation.
    shuffle : bool, default True
        Shuffle training data before splitting into training and validation.
    arch : str, default 'resnet18'
        Model architecture. `resnet18` is short for ResNet18. Other models are
        pulled from `torchvision.models`.
    optimizer : str, default = 'sgd'
        Optimizer for training.
    epochs : Tuple[int, ...], default (1, 1, 1, 1, 1, 25, 30, 20, 20)
        Epochs for training. Each number corresponds to a learning rate below.
    learning_rates : Tuple[float, ...], default (
            0.0167, 0.0333, 0.05, 0.0667, 0.0833,  0.1, 0.01, 0.001, 0.0001)
        Learning rates for training. Each learning rate is used for the
        corresponding number of epochs above.
    scale_learning_rates : bool, default True
        Scale learning rates above based on (`batch_size / 256`). Mainly for
        convenience with large minibatch training.
    momentum : float, default 0.9
        Momentum for SGD.
    weight_decay : float, default 1e-4
        Weight decay for SGD.
    batch_size : int, default 256
        Minibatch size for training.
    eval_batch_size : int, default 256
        Minibatch size for evaluation (validation and testing).
    fp16 : bool, default False
        Use mixed precision training.
    label_smoothing : float, default 0.1
        Amount to smooth labels for loss.
    loss_scale : float, default 256
        Amount to scale loss for mixed precision training.
    cuda : bool, default True
        Enable or disable use of available GPUs
    device_ids : Tuple[int, ...], default True
        GPU device ids to use.
    num_workers : int, default 0
        Number of data loading workers for training.
    eval_num_workers : int, default 0
        Number of data loading workers for evaluation.
    seed : Optional[int], default None
        Random seed for numpy, torch, and others. If None, a random int is
        chosen and logged in the experiments config file.
    checkpoint : str, default 'best'
        Specify when to create a checkpoint for the model: only checkpoint the
        best performing model on the validation data or the training data if
        `validation == 0` ("best"), after every epoch ("all"), or only the last
        epoch of each segment of the learning rate schedule ("last").
    track_test_acc : bool, default True
        Calculate performance of the models on the test data in addition or
        instead of the validation dataset.'

    Returns
    -------
    model : nn.Module
        Trained model.
    accuracies : Tuple[float, ...]
        The best accuracies from the model on the train, dev, and test splits.
    times : Tuple[timedelta, ...]
        Time spent training or evaluating on the train, dev, and test splits.
    """
    # Set seeds for reproducibility.
    seed = utils.set_random_seed(seed)
    # Capture all of the arguments to save alongside the results.
    config = utils.capture_config(**locals())
    if scale_learning_rates:
        # For convenience, scale the learning rate for large-batch SGD
        learning_rates = tuple(np.array(learning_rates) * (batch_size / 256))
        config['scaled_learning_rates'] = learning_rates
    # Create a unique timestamped directory for this experiment.
    run_dir = utils.create_run_dir(run_dir, timestamp=config['timestamp'])
    utils.save_config(config, run_dir)
    # Update the computing arguments based on the runtime system.
    use_cuda, device, device_ids, num_workers = utils.config_run_env(
        cuda=cuda, device_ids=device_ids, num_workers=num_workers)

    # Create the training dataset.
    train_dataset = create_dataset(dataset,
                                   datasets_dir,
                                   train=True,
                                   augmentation=augmentation)
    # Create the test dataset.
    test_dataset = None
    if track_test_acc:
        test_dataset = create_dataset(dataset,
                                      datasets_dir,
                                      train=False,
                                      augmentation=False)

    # Create data loaders
    train_loader, dev_loader, test_loader = create_loaders(
        train_dataset,
        batch_size=batch_size,
        eval_batch_size=eval_batch_size,
        validation=validation,
        run_dir=run_dir,
        test_dataset=test_dataset,
        use_cuda=use_cuda,
        shuffle=shuffle,
        num_workers=num_workers,
        eval_num_workers=eval_num_workers)

    # Calculate the number of classes (e.g., 1000) so the model has
    #   the right dimension for its output.
    num_classes = 1_000  # type: ignore

    # Create the model and optimizer for training.
    model, _optimizer = create_model_and_optimizer(
        run_dir=run_dir,
        arch=arch,
        num_classes=num_classes,
        optimizer=optimizer,
        learning_rate=learning_rates[0],
        momentum=momentum,
        weight_decay=weight_decay)

    # Create the loss criterion.
    criterion = _LabelSmoothing(label_smoothing)

    # Move the model and loss to the appropriate devices.
    model = model.to(device)
    criterion = criterion.to(device)

    if fp16:
        from apex import amp  # avoid dependency unless necessary.
        model, _optimizer = amp.initialize(model,
                                           _optimizer,
                                           loss_scale=loss_scale)
    if use_cuda:
        model = nn.DataParallel(model, device_ids=device_ids)

    # Run training.
    return run_training(model=model,
                        optimizer=_optimizer,
                        criterion=criterion,
                        device=device,
                        train_loader=train_loader,
                        epochs=epochs,
                        learning_rates=learning_rates,
                        dev_loader=dev_loader,
                        test_loader=test_loader,
                        fp16=fp16,
                        run_dir=run_dir,
                        checkpoint=checkpoint)