Example #1
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torchvision
from torch.utils.tensorboard import SummaryWriter

# Timer, setup_logger_tqdm, get_data_loaders, train_model, test_model and a
# module-level `logger` are project-local helpers assumed to be in scope
# throughout these examples.


def batch_size_linear_search():
    min_batch_size = 8  # renamed from min/max to avoid shadowing built-ins
    max_batch_size = 600
    step_size = 8

    optimizer = lambda x: torch.optim.SGD(x, lr=0.1)
    experiment_name = "batch_size_linear_search"
    t = Timer()

    batch_size_times = {}
    for i, batch_size in enumerate(
            range(min_batch_size, max_batch_size, step_size)):
        t.start()
        # All iterations share one checkpoint directory, so after the first
        # run (2 epochs) each call resumes and trains exactly one additional
        # epoch at the new batch size.
        main(experiment_name, optimizer, epochs=i + 2, batch_size=batch_size)
        elapsed_time = t.stop()
        batch_size_times[batch_size] = elapsed_time

    with open("batch_size_times.pickle", "wb") as f:
        pickle.dump(batch_size_times, f)

    # Plot
    batch_sizes = []
    times = []
    for k in sorted(batch_size_times):
        batch_sizes.append(k)
        times.append(batch_size_times[k])

    plt.plot(np.array(batch_sizes), np.array(times))
    plt.xlabel("Batch Size")
    plt.ylabel("Epoch Time")
    plt.title("Batch Size vs Epoch Time")
    plt.show()
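
The main driver below (and the later examples) delegates per-epoch work to
project helpers train_model and test_model whose definitions are not shown.
Minimal hypothetical sketches consistent with how they are called here
(train_model returning average loss and top-1 accuracy, test_model returning
top-1 accuracy and tolerating the extra optimizer argument some call sites
pass) might look like this; the project's real helpers may differ:

import torch
import torch.nn.functional as F

def train_model(device, model, train_set_loader, optimizer):
    # One epoch of training; returns (average loss, top-1 accuracy).
    model.train()
    total_loss, correct, seen = 0.0, 0, 0
    for images, labels in train_set_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(images)
        loss = F.cross_entropy(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * labels.size(0)
        correct += (logits.argmax(dim=1) == labels).sum().item()
        seen += labels.size(0)
    return total_loss / seen, correct / seen

def test_model(device, model, test_set_loader, optimizer=None):
    # Top-1 accuracy on the test set; optimizer is accepted but unused.
    model.eval()
    correct, seen = 0, 0
    with torch.no_grad():
        for images, labels in test_set_loader:
            images, labels = images.to(device), labels.to(device)
            logits = model(images)
            correct += (logits.argmax(dim=1) == labels).sum().item()
            seen += labels.size(0)
    return correct / seen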


def main(experiment_name,
         optimizer,
         output_directory_root="experiments/resnet18_logistic_cifar10",
         epochs=60,
         batch_size=512,
         num_workers=1):

    output_directory = os.path.join(output_directory_root, experiment_name)
    os.makedirs(output_directory, exist_ok=True)

    # Setup regular log file + tensorboard
    logfile_path = os.path.join(output_directory, "logfile.txt")
    setup_logger_tqdm(logfile_path)

    tensorboard_log_directory = os.path.join("runs",
                                             "resnet18_logistic_cifar10",
                                             experiment_name)
    tensorboard_summary_writer = SummaryWriter(
        log_dir=tensorboard_log_directory)

    # Choose Training Device
    use_cuda = torch.cuda.is_available()
    logger.info(f"CUDA Available? {use_cuda}")
    device = "cuda" if use_cuda else "cpu"

    # Datasets and Loaders
    train_set_loader, test_set_loader = get_data_loaders(
        batch_size, num_workers)

    # Create Model & Optimizer
    model = torchvision.models.resnet18(pretrained=True)
    for param in model.parameters():
        param.requires_grad = False  # freeze the pretrained backbone
    num_classes = 10
    model.fc = nn.Linear(model.fc.in_features, num_classes)  # trainable head
    model.to(device)
    optimizer = optimizer(model.parameters())

    logger.info("=========== Commencing Training ===========")
    logger.info(f"Epoch Count: {epochs}")
    logger.info(f"Batch Size: {batch_size}")

    # Load Checkpoint
    checkpoint_file_path = os.path.join(output_directory, "checkpoint.pth")
    start_epoch = 0
    if os.path.exists(checkpoint_file_path):
        logger.info("Checkpoint Found - Loading!")

        checkpoint = torch.load(checkpoint_file_path)
        logger.info(f"Last completed epoch: {checkpoint['epoch']}")
        logger.info(f"Average Train Loss: {checkpoint['train_loss']}")
        logger.info(f"Top-1 Train Accuracy: {checkpoint['train_accuracy']}")
        logger.info(f"Top-1 Test Accuracy: {checkpoint['test_accuracy']}")
        start_epoch = checkpoint["epoch"] + 1
        logger.info(f"Resuming at epoch {start_epoch}")

        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    else:
        logger.info("No checkpoint found, starting from scratch.")

    # Training Loop
    t = Timer()
    for epoch in range(start_epoch, epochs):
        t.start()
        logger.info("-" * 10)
        logger.info(f"Epoch {epoch}")
        logger.info("-" * 10)

        train_loss, train_accuracy = train_model(device, model,
                                                 train_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("train_loss", train_loss, epoch)
        tensorboard_summary_writer.add_scalar("train_accuracy", train_accuracy,
                                              epoch)

        test_accuracy = test_model(device, model, test_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("test_accuracy", test_accuracy,
                                              epoch)

        # Save Checkpoint
        logger.info("Saving checkpoint.")
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': train_loss,
                'train_accuracy': train_accuracy,
                'test_accuracy': test_accuracy
            }, checkpoint_file_path)

        elapsed_time = t.stop()
        logger.info(f"End of epoch {epoch}, took {elapsed_time:0.4f} seconds.")
        logger.info(f"Average Train Loss: {train_loss}")
        logger.info(f"Top-1 Train Accuracy: {train_accuracy}")
        logger.info(f"Top-1 Test Accuracy: {test_accuracy}")
        logger.info("")
Example #3
import torch.distributed as dist


def main(device, mp_args, dataloader_func, model, optimizer_callback,
         output_directory, tensorboard_log_directory, epochs):

    global_rank = mp_args.nr * mp_args.gpus + device
    dist.init_process_group(backend='nccl',
                            init_method='env://',
                            world_size=mp_args.world_size,
                            rank=global_rank)

    output_directory = os.path.join(output_directory, f"rank_{global_rank}")
    os.makedirs(output_directory, exist_ok=True)

    # Setup regular log file
    logfile_path = os.path.join(output_directory, "logfile.txt")
    setup_logger_tqdm(logfile_path)

    # Setup TensorBoard logging
    tensorboard_log_directory = os.path.join(tensorboard_log_directory,
                                             f"rank_{global_rank}")
    tensorboard_summary_writer = SummaryWriter(
        log_dir=tensorboard_log_directory)

    # Dataloaders
    train_set_loader, test_set_loader = dataloader_func(
        mp_args.world_size, global_rank)

    # Model & Optimizer
    model.to(device)
    optimizer = optimizer_callback(model)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[device])

    logger.info(f"Epoch Count: {epochs}")

    # Load Checkpoint
    checkpoint_file_path = os.path.join(output_directory, "checkpoint.pth")
    start_epoch = 0
    if os.path.exists(checkpoint_file_path):
        logger.info("Checkpoint Found - Loading!")

        checkpoint = torch.load(checkpoint_file_path)
        logger.info(f"Last completed epoch: {checkpoint['epoch']}")
        logger.info(f"Average Train Loss: {checkpoint['train_loss']}")
        logger.info(f"Top-1 Train Accuracy: {checkpoint['train_accuracy']}")
        logger.info(f"Top-1 Test Accuracy: {checkpoint['test_accuracy']}")
        start_epoch = checkpoint["epoch"] + 1
        logger.info(f"Resuming at epoch {start_epoch}")

        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    else:
        logger.info("No checkpoint found, starting from scratch.")

    # Training Loop
    t = Timer()
    #progress = tqdm(total=epochs, initial=start_epoch, desc="Epochs")
    for epoch in range(start_epoch, epochs):
        t.start()
        logger.info(f"Commence EPOCH {epoch}")

        # Train
        train_loss, train_accuracy = train_model(device, model,
                                                 train_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("train_loss", train_loss, epoch)
        tensorboard_summary_writer.add_scalar("train_accuracy", train_accuracy,
                                              epoch)

        # Test
        test_accuracy = test_model(device, model, test_set_loader)
        tensorboard_summary_writer.add_scalar("test_accuracy", test_accuracy,
                                              epoch)

        # Save Checkpoint
        logger.info("Saving checkpoint.")
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': train_loss,
                'train_accuracy': train_accuracy,
                'test_accuracy': test_accuracy
            }, checkpoint_file_path)

        elapsed_time = t.stop()
        logger.info(f"End of epoch {epoch}, took {elapsed_time:0.4f} seconds.")
        logger.info(f"Average Train Loss: {train_loss}")
        logger.info(f"Top-1 Train Accuracy: {train_accuracy}")
        logger.info(f"Top-1 Test Accuracy: {test_accuracy}")
Example #4
def main(dataloader_func,
         model,
         optimizer_callback,
         output_directory,
         tensorboard_log_directory,
         lr_scheduler=None,
         epochs=150):

    os.makedirs(output_directory, exist_ok=True)

    # Setup regular log file
    logfile_path = os.path.join(output_directory, "logfile.txt")
    setup_logger_tqdm(logfile_path)

    # Setup TensorBoard logging
    tensorboard_summary_writer = SummaryWriter(
        log_dir=tensorboard_log_directory)

    # Choose Training Device
    use_cuda = torch.cuda.is_available()
    logger.info(f"CUDA Available? {use_cuda}")
    device = "cuda" if use_cuda else "cpu"

    # Dataloaders
    train_set_loader, test_set_loader = dataloader_func()

    # Model & Optimizer
    model.to(device)
    optimizer = optimizer_callback(model)
    if lr_scheduler:
        lr_scheduler = lr_scheduler(optimizer)

    logger.info(f"Epoch Count: {epochs}")

    # Load Checkpoint
    checkpoint_file_path = os.path.join(output_directory, "checkpoint.pth")
    start_epoch = 0
    if os.path.exists(checkpoint_file_path):
        logger.info("Checkpoint Found - Loading!")

        checkpoint = torch.load(checkpoint_file_path)
        logger.info(f"Last completed epoch: {checkpoint['epoch']}")
        logger.info(f"Average Train Loss: {checkpoint['train_loss']}")
        logger.info(f"Top-1 Train Accuracy: {checkpoint['train_accuracy']}")
        logger.info(f"Top-1 Test Accuracy: {checkpoint['test_accuracy']}")
        start_epoch = checkpoint["epoch"] + 1
        logger.info(f"Resuming at epoch {start_epoch}")

        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        if lr_scheduler:
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state_dict"])
    else:
        logger.info("No checkpoint found, starting from scratch.")

    # Training Loop
    t = Timer()
    for epoch in range(start_epoch, epochs):
        t.start()
        logger.info(f"Commence EPOCH {epoch}")

        # Train
        train_loss, train_accuracy = train_model(device, model,
                                                 train_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("train_loss", train_loss, epoch)
        tensorboard_summary_writer.add_scalar("train_accuracy", train_accuracy,
                                              epoch)

        # Test
        test_accuracy = test_model(device, model, test_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("test_accuracy", test_accuracy,
                                              epoch)

        scheduler_dict = None
        if lr_scheduler:
            lr_scheduler.step()
            scheduler_dict = lr_scheduler.state_dict()

        # Save Checkpoint
        logger.info("Saving checkpoint.")
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'lr_scheduler_state_dict': scheduler_dict,
                'train_loss': train_loss,
                'train_accuracy': train_accuracy,
                'test_accuracy': test_accuracy
            }, checkpoint_file_path)

        elapsed_time = t.stop()
        logger.info(f"End of epoch {epoch}, took {elapsed_time:0.4f} seconds.")
        logger.info(f"Average Train Loss: {train_loss}")
        logger.info(f"Top-1 Train Accuracy: {train_accuracy}")
        logger.info(f"Top-1 Test Accuracy: {test_accuracy}")
Example #5
from scipy.stats import combine_pvalues

# extract_features, cosine_pvalue and get_data_loader are assumed to be
# project-local helpers available in scope.


def main(carrier_path,
         marking_network,
         target_network,
         target_checkpoint,
         batch_size=256,
         num_workers=1,
         align=True,
         test_set_loader=None):

    # Setup Device
    use_cuda = torch.cuda.is_available()
    logger.info(f"CUDA Available? {use_cuda}")
    device = torch.device("cuda" if use_cuda else "cpu")

    # Load Carrier
    carrier = torch.load(carrier_path).numpy()

    t = Timer()
    t.start()

    # Align spaces (Or Not)
    W = target_checkpoint["model_state_dict"]["fc.weight"].cpu().numpy()
    if align:
        logger.info(
            "Aligning marking and target network feature space with least squares"
        )

        marking_network.to(device)
        marking_network.eval()

        target_network.to(device)
        target_network.eval()

        # Setup Dataloader
        if not test_set_loader:
            test_set_loader = get_data_loader(batch_size, num_workers)

        logger.info(
            "Extracting image features from marking and target networks.")
        features_marking, _ = extract_features(test_set_loader,
                                               marking_network,
                                               device,
                                               verbose=False)
        features_target, _ = extract_features(test_set_loader,
                                              target_network,
                                              device,
                                              verbose=False)
        features_marking = features_marking.numpy()
        features_target = features_target.numpy()

        X, residuals, rank, s = np.linalg.lstsq(features_marking,
                                                features_target,
                                                rcond=None)
        logger.info(
            "Squared norm of residual: %.4e" %
            np.linalg.norm(np.dot(features_marking, X) - features_target)**2)
        W = np.matmul(W, X.T)

    # Computing scores
    W /= np.linalg.norm(W, axis=1, keepdims=True)
    scores = np.sum(W * carrier, axis=1)
    #print(f"SCORES: {scores}")

    logger.info("Mean p-value is at %d times sigma" %
                int(scores.mean() * np.sqrt(W.shape[0] * carrier.shape[1])))
    logger.info("Epoch of the model: %d" % target_checkpoint["epoch"])

    p_vals = [cosine_pvalue(c, d=carrier.shape[1]) for c in list(scores)]
    #print(f"Cosine P values: {p_vals}")
    #print(f"np.sum(np.log(p_vals)): {np.sum(np.log(p_vals))}")

    #logger.info(p_vals)
    combined_pval = combine_pvalues(p_vals)[1]
    logger.info(f"log10(p)={np.log10(combined_pval)}")

    elapsed_time = t.stop()
    logger.info("Total took %.2f" % (elapsed_time))

    return (scores, p_vals, combined_pval)
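
The alignment step above solves the least-squares problem
min_X ||features_marking @ X - features_target||^2 and then transports the
classifier weights with W @ X.T. A self-contained toy illustration of that
step on synthetic features:

import numpy as np

rng = np.random.default_rng(0)
n, d, num_classes = 1000, 512, 10

# Synthetic features: the target features are an exact linear map of the
# marking features, so the fitted residual should be ~0.
features_marking = rng.normal(size=(n, d))
true_map = rng.normal(size=(d, d))
features_target = features_marking @ true_map

X, residuals, rank, s = np.linalg.lstsq(features_marking, features_target,
                                        rcond=None)

W = rng.normal(size=(num_classes, d))  # one weight row per class
W_aligned = W @ X.T                    # same operation as np.matmul(W, X.T)

print(np.linalg.norm(features_marking @ X - features_target))  # ~0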
Example #6
import torch.distributed as dist


def main(device,
         mp_args,
         experiment_name,
         optimizer,
         output_directory_root="experiments/resnet18_distributed",
         lr_scheduler=None,
         epochs=150,
         batch_size=512,
         num_workers=1):

    global_rank = mp_args.nr * mp_args.gpus + device
    dist.init_process_group(backend='nccl',
                            init_method='env://',
                            world_size=mp_args.world_size,
                            rank=global_rank)

    output_directory = os.path.join(output_directory_root, experiment_name,
                                    f"rank_{global_rank}")
    os.makedirs(output_directory, exist_ok=True)

    # Setup regular log file + tensorboard
    logfile_path = os.path.join(output_directory, "logfile.txt")
    setup_logger_tqdm(logfile_path)

    tensorboard_log_directory = os.path.join("runs", "resnet18_distributed",
                                             experiment_name,
                                             f"rank_{global_rank}")
    tensorboard_summary_writer = SummaryWriter(
        log_dir=tensorboard_log_directory)

    # Datasets and Loaders
    train_set_loader, test_set_loader = get_data_loaders(
        mp_args.world_size, global_rank, batch_size, num_workers)

    # Create Model & Optimizer (uses Partial Functions)
    model = torchvision.models.resnet18(pretrained=False, num_classes=10)
    model.to(device)
    optimizer = optimizer(model.parameters())
    model = nn.parallel.DistributedDataParallel(model, device_ids=[device])

    if lr_scheduler:
        lr_scheduler = lr_scheduler(optimizer)

    logger.info("=========== Commencing Training ===========")
    logger.info(f"Epoch Count: {epochs}")
    logger.info(f"Batch Size: {batch_size}")

    # Load Checkpoint
    checkpoint_file_path = os.path.join(output_directory, "checkpoint.pth")
    start_epoch = 0
    if os.path.exists(checkpoint_file_path):
        logger.info("Checkpoint Found - Loading!")
        checkpoint = torch.load(checkpoint_file_path)
        logger.info(f"Last completed epoch: {checkpoint['epoch']}")
        logger.info(f"Average Train Loss: {checkpoint['train_loss']}")
        logger.info(f"Top-1 Train Accuracy: {checkpoint['train_accuracy']}")
        logger.info(f"Top-1 Test Accuracy: {checkpoint['test_accuracy']}")
        start_epoch = checkpoint["epoch"] + 1
        logger.info(f"Resuming at epoch {start_epoch}")

        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        if lr_scheduler:
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state_dict"])
    else:
        logger.info("No checkpoint found, starting from scratch.")

    # Training Loop
    t = Timer()
    for epoch in range(start_epoch, epochs):
        t.start()
        logger.info("-" * 10)
        logger.info(f"Epoch {epoch}")
        logger.info("-" * 10)

        train_loss, train_accuracy = train_model(device, model,
                                                 train_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("train_loss", train_loss, epoch)
        tensorboard_summary_writer.add_scalar("train_accuracy", train_accuracy,
                                              epoch)

        test_accuracy = test_model(device, model, test_set_loader, optimizer)
        tensorboard_summary_writer.add_scalar("test_accuracy", test_accuracy,
                                              epoch)

        scheduler_dict = None
        if lr_scheduler:
            lr_scheduler.step()
            scheduler_dict = lr_scheduler.state_dict()

        # Save Checkpoint
        logger.info("Saving checkpoint.")
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'lr_scheduler_state_dict': scheduler_dict,
                'train_loss': train_loss,
                'train_accuracy': train_accuracy,
                'test_accuracy': test_accuracy
            }, checkpoint_file_path)

        elapsed_time = t.stop()
        logger.info(f"End of epoch {epoch}, took {elapsed_time:0.4f} seconds.")
        logger.info(f"Average Train Loss: {train_loss}")
        logger.info(f"Top-1 Train Accuracy: {train_accuracy}")
        logger.info(f"Top-1 Test Accuracy: {test_accuracy}")
        logger.info("")