Example #1
def create_handwriting_dataloaders(config,
                                   train_json_path,
                                   val_json_path,
                                   test_json_path,
                                   unlabelled_json_path="",
                                   twohead=False):
    assert config.batchnorm_track  # recommended (for test time invariance to batch size)

    # Transforms:
    if config.sobel:
        tf1, tf2, tf3 = sobel_make_transforms(config)
    else:
        tf1, tf2, tf3 = greyscale_make_transforms(config)
    actual_dataset_root = os.path.join(config.dataset_root, config.dataset)

    if config.leave_out_unlabelled:
        train_files = [train_json_path]
    else:
        assert unlabelled_json_path != ""
        train_files = [train_json_path, unlabelled_json_path]

    # Training data:
    dataloader_list = [
        _create_hw_dataloaders(config, train_files, actual_dataset_root, tf1,
                               tf2)
    ]
    if twohead:
        # the second head trains on the same data
        dataloader_list.append(
            _create_hw_dataloaders(config, train_files, actual_dataset_root,
                                   tf1, tf2))

    # Testing data (labelled):
    mapping_assignment_dataloader = _create_hw_mapping_loader(
        config, [val_json_path], actual_dataset_root, tf3)
    mapping_test_dataloader = _create_hw_mapping_loader(
        config, [test_json_path], actual_dataset_root, tf3)

    return dataloader_list, mapping_assignment_dataloader, mapping_test_dataloader
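
A minimal usage sketch for the function above. The config fields shown are inferred from the function body (the transform helpers and the _create_hw_* helpers will need further fields of their own, not shown here), and all paths are hypothetical placeholders:

from types import SimpleNamespace

# Hypothetical config; only the fields read directly by
# create_handwriting_dataloaders are shown.
config = SimpleNamespace(
    batchnorm_track=True,        # asserted at the top of the function
    sobel=False,                 # False selects greyscale_make_transforms
    dataset_root="/data",        # placeholder
    dataset="handwriting",       # joined onto dataset_root
    leave_out_unlabelled=True,   # True: train on the labelled JSON only
)

train_loaders, assign_loader, test_loader = create_handwriting_dataloaders(
    config,
    train_json_path="train.json",
    val_json_path="val.json",
    test_json_path="test.json",
    twohead=True)  # with twohead=True, train_loaders holds one group per head
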
Example #2
def make_triplets_data(config):
    target_transform = None

    if "CIFAR" in config.dataset:
        config.train_partitions_head_A = [True, False]
        config.train_partitions_head_B = config.train_partitions_head_A

        config.mapping_assignment_partitions = [True, False]
        config.mapping_test_partitions = [True, False]

        if config.dataset == "CIFAR10":
            dataset_class = torchvision.datasets.CIFAR10
        elif config.dataset == "CIFAR100":
            dataset_class = torchvision.datasets.CIFAR100
        elif config.dataset == "CIFAR20":
            dataset_class = torchvision.datasets.CIFAR100
            target_transform = _cifar100_to_cifar20
        else:
            assert False

        # datasets produce either 2 or 5 channel images based on config.include_rgb
        tf1, tf2, tf3 = sobel_make_transforms(config)

    elif config.dataset == "STL10":
        assert (config.mix_train)
        if not config.stl_leave_out_unlabelled:
            print("adding unlabelled data for STL10")
            config.train_partitions_head_A = ["train+unlabeled", "test"]
        else:
            print("not using unlabelled data for STL10")
            config.train_partitions_head_A = ["train", "test"]

        config.train_partitions_head_B = ["train", "test"]

        config.mapping_assignment_partitions = ["train", "test"]
        config.mapping_test_partitions = ["train", "test"]

        dataset_class = torchvision.datasets.STL10

        # datasets produce either 2 or 5 channel images based on config.include_rgb
        tf1, tf2, tf3 = sobel_make_transforms(config)

    elif config.dataset == "MNIST":
        config.train_partitions_head_A = [True, False]
        config.train_partitions_head_B = config.train_partitions_head_A

        config.mapping_assignment_partitions = [True, False]
        config.mapping_test_partitions = [True, False]

        dataset_class = torchvision.datasets.MNIST

        tf1, tf2, tf3 = greyscale_make_transforms(config)

    else:
        assert False

    dataloaders = \
        _create_dataloaders(config, dataset_class, tf1, tf2,
                            partitions=config.train_partitions_head_A,
                            target_transform=target_transform)

    dataloader_original = dataloaders[0]
    dataloader_positive = dataloaders[1]

    shuffled_dataloaders = \
        _create_dataloaders(config, dataset_class, tf1, tf2,
                            partitions=config.train_partitions_head_A,
                            target_transform=target_transform,
                            shuffle=True)

    dataloader_negative = shuffled_dataloaders[0]

    # since this is fully unsupervised, assign dataloader = test dataloader
    dataloader_test = \
        _create_mapping_loader(config, dataset_class, tf3,
                               partitions=config.mapping_test_partitions,
                               target_transform=target_transform)

    return dataloader_original, dataloader_positive, dataloader_negative, \
        dataloader_test
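
The helpers _create_dataloaders and _create_mapping_loader are not shown in this snippet, so the batch format below is an assumption, and net is a hypothetical embedding network. A sketch of how the three returned training loaders could feed a standard triplet objective:

import torch

# Assumes each loader yields (images, targets) batches in a deterministic,
# shared order, so position i of the original and positive loaders holds
# two views (tf1 vs tf2) of the same underlying image.
triplet_loss = torch.nn.TripletMarginLoss(margin=1.0, p=2)

for (anchor, _), (positive, _), (negative, _) in zip(
        dataloader_original, dataloader_positive, dataloader_negative):
    # negatives come from the shuffled loader, so with high probability
    # they are different images from the anchors
    loss = triplet_loss(net(anchor), net(positive), net(negative))
    loss.backward()  # optimiser step omitted
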
Example #3
def create_basic_clustering_dataloaders(config):
    """
    My original data loading code is complex to cover all my experiments. Here is a simple version.
    Use it to replace cluster_twohead_create_dataloaders() in the scripts.

    This uses ImageFolder but you could use your own subclass of torch.utils.data.Dataset.
    (ImageFolder data is not shuffled so an ideally deterministic random sampler is needed.)

    :param config: Requires num_dataloaders and values used by *make_transforms(), e.g. crop size,
    input size etc.
    :return: Training and testing dataloaders
    """

    # Change these according to your data:
    greyscale = False
    train_data_path = os.path.join(config.dataset_root, "train")
    # "none" placeholders: if these paths don't exist, the labelled
    # evaluation dataloaders below are skipped
    test_val_data_path = os.path.join(config.dataset_root, "none")
    test_data_path = os.path.join(config.dataset_root, "none")
    assert config.batchnorm_track  # recommended (for test time invariance to batch size)

    # Transforms:
    if greyscale:
        tf1, tf2, tf3 = greyscale_make_transforms(config)
    else:
        tf1, tf2, tf3 = sobel_make_transforms(config)

    # Training data:
    # main output head (B), auxiliary overclustering head (A), same data for both
    dataset_head_B = torchvision.datasets.ImageFolder(root=train_data_path,
                                                      transform=tf1)
    datasets_tf_head_B = [
        torchvision.datasets.ImageFolder(root=train_data_path, transform=tf2)
        for _ in range(config.num_dataloaders)
    ]
    dataloaders_head_B = [
        torch.utils.data.DataLoader(
            dataset_head_B,
            batch_size=config.dataloader_batch_sz,
            shuffle=False,
            sampler=DeterministicRandomSampler(dataset_head_B),
            num_workers=0,
            drop_last=False)
    ] + [
        torch.utils.data.DataLoader(
            datasets_tf_head_B[i],
            batch_size=config.dataloader_batch_sz,
            shuffle=False,
            sampler=DeterministicRandomSampler(datasets_tf_head_B[i]),
            num_workers=0,
            drop_last=False)
        for i in range(config.num_dataloaders)
    ]

    dataset_head_A = torchvision.datasets.ImageFolder(root=train_data_path,
                                                      transform=tf1)
    datasets_tf_head_A = [
        torchvision.datasets.ImageFolder(root=train_data_path, transform=tf2)
        for _ in range(config.num_dataloaders)
    ]
    dataloaders_head_A = [
        torch.utils.data.DataLoader(
            dataset_head_A,
            batch_size=config.dataloader_batch_sz,
            shuffle=False,
            sampler=DeterministicRandomSampler(dataset_head_A),
            num_workers=0,
            drop_last=False)
    ] + [
        torch.utils.data.DataLoader(
            datasets_tf_head_A[i],
            batch_size=config.dataloader_batch_sz,
            shuffle=False,
            sampler=DeterministicRandomSampler(datasets_tf_head_A[i]),
            num_workers=0,
            drop_last=False)
        for i in range(config.num_dataloaders)
    ]

    # Testing data (labelled):
    mapping_assignment_dataloader, mapping_test_dataloader = None, None
    if os.path.exists(test_data_path):
        mapping_assignment_dataset = torchvision.datasets.ImageFolder(
            test_val_data_path, transform=tf3)
        mapping_assignment_dataloader = torch.utils.data.DataLoader(
            mapping_assignment_dataset,
            batch_size=config.batch_sz,
            shuffle=False,
            sampler=DeterministicRandomSampler(mapping_assignment_dataset),
            num_workers=0,
            drop_last=False)

        mapping_test_dataset = torchvision.datasets.ImageFolder(test_data_path,
                                                                transform=tf3)
        mapping_test_dataloader = torch.utils.data.DataLoader(
            mapping_test_dataset,
            batch_size=config.batch_sz,
            shuffle=False,
            sampler=DeterministicRandomSampler(mapping_test_dataset),
            num_workers=0,
            drop_last=False)

    return dataloaders_head_A, dataloaders_head_B, mapping_assignment_dataloader, mapping_test_dataloader
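
DeterministicRandomSampler is referenced above but not defined in this snippet. A minimal sketch of one, assuming the intent stated in the docstring: a shuffled order that is reproducible, so parallel dataloaders over same-length datasets traverse their samples in the same order:

import torch
from torch.utils.data import Sampler

class DeterministicRandomSampler(Sampler):
    # Yields a permutation drawn from a fixed seed; same-length datasets
    # therefore get the same order, keeping paired loaders aligned.
    def __init__(self, data_source, seed=0):
        self.num_samples = len(data_source)
        self.seed = seed

    def __iter__(self):
        g = torch.Generator()
        g.manual_seed(self.seed)  # same seed, same permutation every epoch
        return iter(torch.randperm(self.num_samples, generator=g).tolist())

    def __len__(self):
        return self.num_samples
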
Example #4
def cluster_twohead_create_dataloaders(config):
    assert (config.mode == "IID")
    assert config.twohead

    target_transform = None

    if "CIFAR" in config.dataset:
        config.train_partitions_head_A = [True, False]
        config.train_partitions_head_B = config.train_partitions_head_A

        config.mapping_assignment_partitions = [True, False]
        config.mapping_test_partitions = [True, False]

        if config.dataset == "CIFAR10":
            dataset_class = torchvision.datasets.CIFAR10
        elif config.dataset == "CIFAR100":
            dataset_class = torchvision.datasets.CIFAR100
        elif config.dataset == "CIFAR20":
            dataset_class = torchvision.datasets.CIFAR100
            target_transform = _cifar100_to_cifar20
        else:
            assert False

        # datasets produce either 2 or 5 channel images based on config.include_rgb
        tf1, tf2, tf3 = sobel_make_transforms(config)

    elif config.dataset == "STL10":
        assert (config.mix_train)
        if not config.leave_out_unlabelled:
            print("adding unlabelled data for STL10")
            config.train_partitions_head_A = ["train+unlabeled", "test"]
        else:
            print("not using unlabelled data for STL10")
            config.train_partitions_head_A = ["train", "test"]

        config.train_partitions_head_B = ["train", "test"]

        config.mapping_assignment_partitions = ["train", "test"]
        config.mapping_test_partitions = ["train", "test"]

        dataset_class = torchvision.datasets.STL10

        # datasets produce either 2 or 5 channel images based on config.include_rgb
        tf1, tf2, tf3 = sobel_make_transforms(config)

    elif config.dataset == "MNIST":
        config.train_partitions_head_A = [True, False]
        config.train_partitions_head_B = config.train_partitions_head_A

        config.mapping_assignment_partitions = [True, False]
        config.mapping_test_partitions = [True, False]

        dataset_class = torchvision.datasets.MNIST

        tf1, tf2, tf3 = greyscale_make_transforms(config)

    else:
        assert False

    print("Making datasets with %s and %s" % (dataset_class, target_transform))
    sys.stdout.flush()

    dataloaders_head_A = _create_dataloaders(
        config,
        dataset_class,
        tf1,
        tf2,
        partitions=config.train_partitions_head_A,
        target_transform=target_transform)

    dataloaders_head_B = _create_dataloaders(
        config,
        dataset_class,
        tf1,
        tf2,
        partitions=config.train_partitions_head_B,
        target_transform=target_transform)

    mapping_assignment_dataloader = _create_mapping_loader(
        config,
        dataset_class,
        tf3,
        partitions=config.mapping_assignment_partitions,
        target_transform=target_transform)

    mapping_test_dataloader = _create_mapping_loader(
        config,
        dataset_class,
        tf3,
        partitions=config.mapping_test_partitions,
        target_transform=target_transform)

    return dataloaders_head_A, dataloaders_head_B, mapping_assignment_dataloader, mapping_test_dataloader
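
The _create_dataloaders helper shared by these examples is not shown. Below is a minimal sketch consistent with its call sites, under these assumptions: boolean partitions map to the train= flag of CIFAR/MNIST, string partitions to the split= argument of STL10, and the first returned loader uses the base transform tf1 while the rest use the augmentation tf2. The _sketch suffix marks it as a reconstruction, not the real helper:

from torch.utils.data import ConcatDataset, DataLoader

def _create_dataloaders_sketch(config, dataset_class, tf1, tf2,
                               partitions, target_transform=None,
                               shuffle=False):
    def make_dataset(partition, tf):
        kwargs = dict(root=config.dataset_root, transform=tf,
                      target_transform=target_transform, download=True)
        if isinstance(partition, bool):   # CIFAR/MNIST: train flag
            return dataset_class(train=partition, **kwargs)
        return dataset_class(split=partition, **kwargs)  # STL10: split name

    def make_loader(tf):
        dataset = ConcatDataset([make_dataset(p, tf) for p in partitions])
        # paired loaders must traverse the data in the same order, so
        # shuffling is off unless explicitly requested (e.g. for negatives)
        return DataLoader(dataset,
                          batch_size=config.dataloader_batch_sz,
                          shuffle=shuffle,
                          num_workers=0,
                          drop_last=False)

    # loader 0: base views (tf1); loaders 1..num_dataloaders: augmented views
    return [make_loader(tf1)] + [make_loader(tf2)
                                 for _ in range(config.num_dataloaders)]
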
Example #5
def cluster_create_dataloaders(config):
    assert (config.mode == "IID+")
    assert (not config.twohead)

    target_transform = None

    # separate train/test sets
    if "CIFAR" in config.dataset:
        config.train_partitions = [True]
        config.mapping_assignment_partitions = [True]
        config.mapping_test_partitions = [False]

        if config.dataset == "CIFAR10":
            dataset_class = torchvision.datasets.CIFAR10
        elif config.dataset == "CIFAR100":
            dataset_class = torchvision.datasets.CIFAR100
        elif config.dataset == "CIFAR20":
            dataset_class = torchvision.datasets.CIFAR100
            target_transform = _cifar100_to_cifar20
        else:
            assert False

        # datasets produce either 2 or 5 channel images based on config.include_rgb
        tf1, tf2, tf3 = sobel_make_transforms(config)

    elif config.dataset == "STL10":
        config.train_partitions = ["train+unlabeled"]
        config.mapping_assignment_partitions = ["train"]
        config.mapping_test_partitions = ["test"]

        dataset_class = torchvision.datasets.STL10

        # datasets produce either 2 or 5 channel images based on config.include_rgb
        tf1, tf2, tf3 = sobel_make_transforms(config)

    elif config.dataset == "MNIST":
        config.train_partitions = [True]
        config.mapping_assignment_partitions = [True]
        config.mapping_test_partitions = [False]

        dataset_class = torchvision.datasets.MNIST

        tf1, tf2, tf3 = greyscale_make_transforms(config)

    else:
        assert False

    print("Making datasets with %s and %s" % (dataset_class, target_transform))
    sys.stdout.flush()

    dataloaders = \
        _create_dataloaders(config, dataset_class, tf1, tf2,
                            partitions=config.train_partitions,
                            target_transform=target_transform)

    mapping_assignment_dataloader = \
        _create_mapping_loader(config, dataset_class, tf3,
                               partitions=config.mapping_assignment_partitions,
                               target_transform=target_transform)

    mapping_test_dataloader = \
        _create_mapping_loader(config, dataset_class, tf3,
                               partitions=config.mapping_test_partitions,
                               target_transform=target_transform)

    return dataloaders, mapping_assignment_dataloader, mapping_test_dataloader
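
_create_mapping_loader is likewise not shown; here is a sketch consistent with its call sites here and in the last snippet below, where a truncate option limits the size of the assignment set. The truncation rule (keep the first truncate_pc fraction of the data) is an assumption:

from torch.utils.data import ConcatDataset, DataLoader, Subset

def _create_mapping_loader_sketch(config, dataset_class, tf3, partitions,
                                  target_transform=None,
                                  truncate=False, truncate_pc=1.0):
    def make_dataset(partition):
        kwargs = dict(root=config.dataset_root, transform=tf3,
                      target_transform=target_transform, download=True)
        if isinstance(partition, bool):
            return dataset_class(train=partition, **kwargs)
        return dataset_class(split=partition, **kwargs)

    dataset = ConcatDataset([make_dataset(p) for p in partitions])
    if truncate:
        # assumption: keep only the first truncate_pc fraction of the data
        dataset = Subset(dataset, range(int(truncate_pc * len(dataset))))

    return DataLoader(dataset,
                      batch_size=config.batch_sz,
                      shuffle=False,  # fixed order for evaluation
                      num_workers=0,
                      drop_last=False)
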
assert ("MNIST" == config.dataset)
dataset_class = torchvision.datasets.MNIST
assert (config.train_partitions == [True])
assert (config.mapping_assignment_partitions == [True])
assert (config.mapping_test_partitions == [False])

# append to old results, unless a rewrite is requested via given_config
if not hasattr(config, "assign_set_szs_pc_acc") or given_config.rewrite:
    print("resetting config.assign_set_szs_pc_acc to empty")
    config.assign_set_szs_pc_acc = {}

for pc in new_assign_set_szs_pc:
    print("doing %f" % pc)
    sys.stdout.flush()

    tf1, tf2, tf3 = greyscale_make_transforms(config)

    mapping_assignment_dataloader = \
      _create_mapping_loader(config, dataset_class, tf3,
                             partitions=config.mapping_assignment_partitions,
                             truncate=True, truncate_pc=pc)

    mapping_test_dataloader = \
      _create_mapping_loader(config, dataset_class, tf3,
                             partitions=config.mapping_test_partitions)

    print("num assign batches: %d" % len(mapping_assignment_dataloader))
    num_imgs = len(mapping_assignment_dataloader.dataset)
    print("num imgs in assign dataset: %d" % num_imgs)

    # networks and optimisers