Example #1
import os
from typing import Tuple, Union

import torch
from torch.nn import Module
from torch.optim import Optimizer

# ModuleExporter, ModuleRunResults, and DEFAULT_LOSS_KEY come from sparseml's
# PyTorch utilities; the import path below is assumed from the sparseml package
from sparseml.pytorch.utils import (
    DEFAULT_LOSS_KEY,
    ModuleExporter,
    ModuleRunResults,
)


def save_model_training(
    model: Module,
    optim: Optimizer,
    input_shape: Tuple[int, ...],
    save_name: str,
    save_dir: str,
    epoch: int,
    val_res: Union[ModuleRunResults, None],
    convert_qat: bool = False,
):
    """
    :param model: model architecture
    :param optim: The optimizer used
    :param input_shape: A tuple of integers representing the input shape
    :param save_name: name to save model to
    :param save_dir: directory to save results in
    :param epoch: the epoch the model has been trained to
    :param val_res: results from validation run
    :param convert_qat: True if model is to be quantized before saving
    """
    if val_res is not None:
        has_top1 = "top1acc" in val_res.results
        metric_name = "top-1 accuracy" if has_top1 else "val_loss"
        metric = val_res.result_mean(
            "top1acc" if has_top1 else DEFAULT_LOSS_KEY
        ).item()
        print(f"Saving model for epoch {epoch} and {metric_name} "
              f"{metric} to {save_dir} for {save_name}")
    else:
        print(f"Saving model for epoch {epoch} to {save_dir} for {save_name}")
    # export the PyTorch checkpoint and an ONNX graph traced from a dummy input
    exporter = ModuleExporter(model, save_dir)
    exporter.export_pytorch(optim, epoch, f"{save_name}.pth")
    exporter.export_onnx(
        torch.randn(1, *input_shape),  # dummy batch of size 1 for tracing
        f"{save_name}.onnx",
        convert_qat=convert_qat,
    )

    info_path = os.path.join(save_dir, f"{save_name}.txt")

    with open(info_path, "w") as info_file:
        info_lines = [
            f"epoch: {epoch}",
        ]

        if val_res is not None:
            for loss in val_res.results.keys():
                info_lines.append(
                    f"{loss}: {val_res.result_mean(loss).item()}")

        info_file.write("\n".join(info_lines))
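A minimal call-site sketch for the helper above; `model`, `optimizer`, and `val_results` are assumed to come from the surrounding training loop, and the input shape and paths are placeholders:

# hypothetical usage; (3, 224, 224) assumes an ImageNet-style classifier
save_model_training(
    model=model,
    optim=optimizer,
    input_shape=(3, 224, 224),
    save_name="resnet50-epoch-10",
    save_dir="./checkpoints",
    epoch=10,
    val_res=val_results,  # ModuleRunResults from a validation epoch, or None
)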
Example #2
import logging

from torch.nn import Module
from torch.utils.data import DataLoader
from tqdm import tqdm

from sparseml.pytorch.utils import ModuleExporter

# ExportArgs is a script-level config object assumed to be defined alongside
# this function; the logger below stands in for the script's own LOGGER
LOGGER = logging.getLogger(__name__)


def export(args: ExportArgs, model: Module, val_loader: DataLoader,
           save_dir: str) -> None:
    """
    Utility method to export the model and data

    :param args: an ExportArgs object containing the config for the export task
    :param model: loaded model architecture to export
    :param val_loader: A DataLoader for validation data
    :param save_dir: Directory to store checkpoints at during exporting process
    """
    exporter = ModuleExporter(model, save_dir)

    # export PyTorch state dict
    LOGGER.info(f"exporting pytorch in {save_dir}")

    exporter.export_pytorch(use_zipfile_serialization_if_available=(
        args.use_zipfile_serialization_if_available))
    onnx_exported = False

    for batch, data in tqdm(
            enumerate(val_loader),
            desc="Exporting samples",
            total=args.num_samples if args.num_samples > 1 else 1,
    ):
        if not onnx_exported:
            # export onnx file using first sample for graph freezing
            LOGGER.info(f"exporting onnx in {save_dir}")
            exporter.export_onnx(data[0],
                                 opset=args.onnx_opset,
                                 convert_qat=True)
            onnx_exported = True

        if args.num_samples > 0:
            exporter.export_samples(sample_batches=[data[0]],
                                    sample_labels=[data[1]],
                                    exp_counter=batch)

        # stop once the requested number of sample batches has been exported;
        # when num_samples <= 0, the first batch (used for ONNX) is enough
        if batch + 1 >= args.num_samples:
            break
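A hedged call-site sketch; constructing `ExportArgs` directly like this is an assumption based on the fields the function reads, and `model` and `val_loader` are assumed to come from the surrounding script:

# hypothetical usage; ExportArgs fields mirror those accessed above
args = ExportArgs(
    use_zipfile_serialization_if_available=True,
    num_samples=20,  # export 20 sample batches alongside the model
    onnx_opset=11,
)
export(args, model, val_loader, save_dir="./export")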
Example #3
import os

import torch
from torch.nn import CrossEntropyLoss
from torch.optim import SGD

# import paths assumed from the sparseml package layout
from sparseml.pytorch.optim import ScheduledModifierManager, ScheduledOptimizer
from sparseml.pytorch.utils import ModuleExporter, PythonLogger

# _create_imagefolder_dataloader, _get_torchvision_model, create_dirs, and
# train_model are helpers assumed to be defined elsewhere in this script


def main(args):
    ############################
    # logging and saving setup #
    ############################
    save_dir = os.path.abspath(os.path.expanduser(args.save_dir))

    # get unique model tag, defaults to '{model_name}'
    if not args.model_tag:
        model_tag = args.model.replace("/", ".")
        model_id = model_tag
        model_inc = 0

        while os.path.exists(os.path.join(save_dir, model_id)):
            model_inc += 1
            model_id = "{}__{:02d}".format(model_tag, model_inc)
    else:
        model_id = args.model_tag
    save_dir = os.path.join(save_dir, model_id)
    create_dirs(save_dir)
    print("Model id is set to {}".format(model_id))

    ###########################
    # standard training setup #
    ###########################

    # create data loaders
    train_loader, _, _ = _create_imagefolder_dataloader(args, train=True)
    val_loader, num_classes, image_shape = _create_imagefolder_dataloader(
        args, train=False
    )
    dataloaders = {"train": train_loader, "val": val_loader}

    # create model
    model = _get_torchvision_model(
        args.model,
        num_classes,
        args.pretrained,
        args.checkpoint_path,
    )
    print("created model: {}".format(model))
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    print("using device: {}".format(device))

    # create standard SGD optimizer and cross entropy loss function
    criterion = CrossEntropyLoss()
    optimizer = SGD(
        model.parameters(), lr=0.001, momentum=0.9
    )  # lr will be overridden by recipe

    ##########################
    # add sparseml modifiers #
    ##########################
    manager = ScheduledModifierManager.from_yaml(args.recipe_path)
    optimizer = ScheduledOptimizer(
        optimizer,
        model,
        manager,
        steps_per_epoch=len(train_loader),
        loggers=[PythonLogger()],
    )

    ########################
    # torchvision training #
    ########################
    model, val_acc_history = train_model(
        model,
        dataloaders,
        criterion,
        optimizer,
        device,
        num_epochs=manager.max_epochs,
        is_inception="inception" in args.model,
    )

    ########################
    # export trained model #
    ########################
    exporter = ModuleExporter(model, save_dir)
    sample_input = torch.randn(image_shape).unsqueeze(0)  # sample batch for ONNX export
    exporter.export_onnx(sample_input)
    exporter.export_pytorch()
    print("Model ONNX export and PyTorch weights saved to {}".format(save_dir))
Example #4
import logging
import os
from typing import Any, Callable, Union

from torch import Tensor
from torch.nn import Module
from torch.optim import Optimizer
from torch.utils.data import DataLoader, Dataset

# import paths assumed from the sparseml package layout
from sparseml.pytorch.optim import ScheduledModifierManager, ScheduledOptimizer
from sparseml.pytorch.utils import (
    LossWrapper,
    ModuleExporter,
    ModuleTester,
    ModuleTrainer,
    PythonLogger,
    TensorBoardLogger,
    model_to_device,
)
from sparseml.utils import clean_path

LOGGER = logging.getLogger(__name__)


def train(
    working_dir: str,
    config_path: str,
    model: Module,
    train_dataset: Dataset,
    val_dataset: Dataset,
    batch_size: int,
    optim_const: Callable[[Module], Optimizer],
    loss: Union[LossWrapper, Callable[[Any, Any], Tensor]],
    devices: str,
):
    """
    Dataset setup
    """
    LOGGER.info("batch_size set to {}".format(batch_size))
    LOGGER.info("train_dataset set to {}".format(train_dataset))
    LOGGER.info("val_dataset set to {}".format(val_dataset))

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=8,
        pin_memory=True,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=8,
        pin_memory=True,
    )

    """
    Model, optimizer, loss setup
    """
    model_dir = clean_path(os.path.join(working_dir, "model"))
    optim = optim_const(model)

    LOGGER.info("model set to {}".format(model))
    LOGGER.info("optimizer set to {}".format(optim))
    LOGGER.info("loss set to {}".format(loss))
    LOGGER.info("devices set to {}".format(devices))

    """
    Manager and config setup
    """
    manager = ScheduledModifierManager.from_yaml(config_path)
    logs_dir = clean_path(os.path.join(working_dir, "logs"))
    loggers = [TensorBoardLogger(logs_dir), PythonLogger()]
    optim = ScheduledOptimizer(
        optim, model, manager, steps_per_epoch=len(train_loader), loggers=loggers
    )

    """
    Training and testing
    """
    model, device, device_ids = model_to_device(model, devices)
    trainer = ModuleTrainer(model, device, loss, optim, loggers=loggers)
    tester = ModuleTester(model, device, loss, loggers=loggers, log_steps=-1)

    # baseline validation run before any training (logged as epoch -1)
    epoch = -1
    tester.run_epoch(val_loader, epoch=epoch)

    for epoch in range(manager.max_epochs):
        LOGGER.info("starting training epoch {}".format(epoch))
        train_res = trainer.run_epoch(train_loader, epoch)
        LOGGER.info("finished training epoch {}: {}".format(epoch, train_res))
        val_res = tester.run_epoch(val_loader, epoch)
        LOGGER.info("finished validation epoch {}: {}".format(epoch, val_res))

    exporter = ModuleExporter(model, model_dir)
    exporter.export_pytorch(optim, epoch)

    # export ONNX once, traced from a single validation batch
    for data in val_loader:
        exporter.export_onnx(data)
        break
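A hedged call-site sketch for `train`; the datasets, loss wrapper, and optimizer constructor are assumptions for illustration:

from torch.nn import CrossEntropyLoss
from torch.optim import SGD

from sparseml.pytorch.utils import LossWrapper

# hypothetical invocation; train_dataset and val_dataset are torch Datasets
train(
    working_dir="./runs/prune",
    config_path="recipe.yaml",  # sparseml recipe with the modifier schedule
    model=model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    batch_size=64,
    optim_const=lambda m: SGD(m.parameters(), lr=0.1, momentum=0.9),
    loss=LossWrapper(CrossEntropyLoss()),  # assumes LossWrapper wraps a loss fn
    devices="cuda",
)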