Example #1
def main():
    # move up and down
    train_data_sub_translation = []
    for i in range(0, 3000):
        train_data_sub_translation.extend(
            map(lambda x: img_translation(trainset_import.train_data[i], x),
                np.random.randint(1, 4, 10)))

    # add noise
    train_data_sub_noise = []
    for i in range(0, 3000):
        train_data_sub_noise.extend(
            map(lambda x: add_noise(trainset_import.train_data[i], x),
                range(0, 10)))

    # rotate
    train_data_sub_rotate = []
    for i in range(0, 3000):
        train_data_sub_rotate.extend(rotate_random(trainset_import.train_data[i]))

    train_data_sub = train_data_sub_rotate + train_data_sub_translation + train_data_sub_noise

    # label array
    train_labels_sub = []
    for i in range(0, 3000):
        train_labels_sub.extend([trainset_import.train_labels[i]] * 10)
    train_labels = train_labels_sub + train_labels_sub + train_labels_sub
    train_labels = torch.from_numpy(np.array(train_labels))

    transform = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.1307,), (0.3081,))
                             ])
    trainset_new = subMNIST(root='./data/', train=True, download=True, transform=transform, k=90000)  # 3000 images x 10 copies x 3 methods

    trainset_new.train_data = torch.from_numpy(np.array(train_data_sub))  # stack the list of images into one tensor
    trainset_new.train_labels = train_labels
    pickle.dump(trainset_new, open("./data/train_labeled_allmethod.p", "wb" ))

Example #2
def loadDataForLabeling():
    # trainset_new_unl, trainset_new, trainedModel, transform and testAndLabel
    # are assumed to be defined in the enclosing scope
    train_loader2 = torch.utils.data.DataLoader(
        trainset_new_unl, batch_size=1,
        shuffle=False)  # careful: must not be shuffled, predictions are matched back to samples by index

    #Predict labels for the unlabeled set
    testAndLabel(1, trainedModel, train_loader2)

    #Train on full newly labeled set
    print('Beginning training loops phase 2')

    #Load new Labels
    train_labels_sub_unl = torch.from_numpy(
        np.load("trainset_np_unlLabels.npy"))
    train_labels_sub_unl = train_labels_sub_unl.type(torch.long)
    trainset_new_unl.train_labels = train_labels_sub_unl

    trainset_full = subMNIST(root='./data',
                             train=True,
                             transform=transform,
                             k=50000)
    trainset_full.train_data = torch.cat(
        (trainset_new_unl.train_data, trainset_new.train_data), 0)
    trainset_full.train_labels = torch.cat(
        (trainset_new_unl.train_labels, trainset_new.train_labels), 0)

    train_loader3 = torch.utils.data.DataLoader(trainset_full,
                                                batch_size=64,
                                                shuffle=False)
    print('Trainset train_data ' + str(trainset_full.train_data.size()))
    return train_loader3
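
A possible shape for the testAndLabel helper used above (it is not shown in the example): run the trained model over the unshuffled loader and save the argmax predictions, index-aligned with the dataset. The body below is an assumption, which also shows why the loader must not be shuffled.

import numpy as np
import torch

def testAndLabel(epoch, model, loader):
    # sketch of the assumed helper: predict a label for every unlabeled sample
    model.eval()
    preds = []
    with torch.no_grad():
        for data, _ in loader:  # batch_size=1, unshuffled, so order == index
            preds.append(model(data).argmax(dim=1).item())
    np.save("trainset_np_unlLabels.npy", np.array(preds))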
Example #3
def join_MNIST_tensors(in1, in2):
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])

    joint_data = torch.cat((in1.train_data, in2.train_data), 0)
    joint_labels = torch.cat((in1.train_labels, in2.train_labels), 0)

    joint_total = subMNIST(root='./data',
                           train=True,
                           download=True,
                           transform=transform,
                           k=in1.train_data.size()[0] +
                           in2.train_data.size()[0])
    joint_total.train_data = joint_data.clone()
    joint_total.train_labels = joint_labels.clone()

    return joint_total
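
A hypothetical usage of join_MNIST_tensors, assuming the pickles produced by the other examples exist on disk:

import pickle

labeled = pickle.load(open("data/train_labeled.p", "rb"))
unlabeled = pickle.load(open("data/train_unlabeled.p", "rb"))
joint = join_MNIST_tensors(labeled, unlabeled)
print(joint.train_data.size())  # both sets stacked along dim 0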
Example #4
da = DataAug()

augmented_data, augmented_label = da.dataaug(train_data_sub[0],
                                             train_labels_sub[0])
for i in range(1, train_data_sub.shape[0]):
    tdata, tlabel = da.dataaug(train_data_sub[i], train_labels_sub[i])
    augmented_data = np.append(augmented_data, tdata, axis=0)
    augmented_label = np.append(augmented_label, tlabel)

train_data_sub = np.append(train_data_sub, augmented_data, axis=0)
train_labels_sub = np.append(train_labels_sub, augmented_label, axis=0)

augdata = train_data_sub
auglabel = train_labels_sub
print(augdata.shape)
print(auglabel.shape)

train_data_sub = torch.from_numpy(augdata)
train_labels_sub = torch.from_numpy(auglabel)
print(train_labels_sub.size())
print(train_data_sub.size())

trainset_new = subMNIST(root='./data',
                        train=True,
                        download=True,
                        transform=transform,
                        k=18000)
trainset_new.train_data = train_data_sub.clone()
trainset_new.train_labels = train_labels_sub.clone()

pickle.dump(trainset_new, open("data/train_labeled_aug.p", "wb"))
Example #5
def dataLoader():
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])
    trainset_original = datasets.MNIST('../data',
                                       train=True,
                                       download=True,
                                       transform=transform)
    train_label_index = []
    valid_label_index = []
    for i in range(10):
        train_label_list = trainset_original.train_labels.numpy()
        label_index = np.where(train_label_list == i)[0]
        label_subindex = list(label_index[:300])
        valid_subindex = list(label_index[300:1000 + 300])
        train_label_index += label_subindex
        valid_label_index += valid_subindex

    #Train Set
    trainset_np = trainset_original.train_data.numpy()
    trainset_label_np = trainset_original.train_labels.numpy()
    train_data_sub = torch.from_numpy(trainset_np[train_label_index])
    train_labels_sub = torch.from_numpy(trainset_label_np[train_label_index])

    trainset_new = subMNIST(root='./data',
                            train=True,
                            download=True,
                            transform=transform,
                            k=3000)
    trainset_new.train_data = train_data_sub.clone()
    trainset_new.train_labels = train_labels_sub.clone()

    pickle.dump(trainset_new, open("train_labeled.p", "wb"))

    #### Augmenting training set
    ## initialize trainset as usual
    trainset_aug = subMNIST(root='./data',
                            train=True,
                            download=True,
                            transform=transform,
                            k=30000)
    ## turns out you can just repeat a tensor, cool
    ## http://pytorch.org/docs/tensors.html#torch.Tensor.repeat
    trainset_aug.train_data = train_data_sub.clone().repeat(
        10, 1, 1)  ## 10 copies along the first axis
    print(train_data_sub.size())
    print(trainset_aug.train_data.size())
    trainset_aug.train_labels = train_labels_sub.clone().repeat(
        10)  ## labels have only one axis
    print(train_labels_sub.size())
    print(trainset_aug.train_labels.size())
    ## dims look correct!

    ## load scipy image tools and distributions for sampling
    from scipy import ndimage
    from scipy import stats
    import random
    random.seed(1337)
    #from math import ceil, floor

    ## iterate through and augment
    n = trainset_aug.train_data.size()[0] // 10  # integer division so range() gets an int
    print(n)
    iter_vals = range(0, n)
    #iter_vals = range(0, 5)
    ''' 
    for i in iter_vals:
        image_in = trainset_aug.train_data[i]
        trainset_aug.train_data[i+n] = zoom_image(image_in)
        trainset_aug.train_data[i+(n*2)] = translate_image(image_in)
        trainset_aug.train_data[i+(n*3)] = rotate_image(image_in)           
    '''
    for i in iter_vals:
        image_in = trainset_aug.train_data[i]
        trainset_aug.train_data[i + n] = zoom_image(image_in)
        trainset_aug.train_data[i + (n * 2)] = zoom_image(image_in)
        trainset_aug.train_data[i + (n * 3)] = zoom_image(image_in)
        trainset_aug.train_data[i + (n * 4)] = translate_image(image_in)
        trainset_aug.train_data[i + (n * 5)] = translate_image(image_in)
        trainset_aug.train_data[i + (n * 6)] = translate_image(image_in)
        trainset_aug.train_data[i + (n * 7)] = rotate_image(image_in)
        trainset_aug.train_data[i + (n * 8)] = rotate_image(image_in)
        trainset_aug.train_data[i + (n * 9)] = rotate_image(image_in)

    ## dump to pickle
    pickle.dump(trainset_aug, open("train_labeled_aug.p", "wb"))

    #Validation Set
    validset_np = trainset_original.train_data.numpy()
    validset_label_np = trainset_original.train_labels.numpy()
    valid_data_sub = torch.from_numpy(validset_np[valid_label_index])
    valid_labels_sub = torch.from_numpy(validset_label_np[valid_label_index])

    validset = subMNIST(root='./data',
                        train=False,
                        download=True,
                        transform=transform,
                        k=10000)
    validset.test_data = valid_data_sub.clone()
    validset.test_labels = valid_labels_sub.clone()

    pickle.dump(validset, open("validation.p", "wb"))

    #Unlabeled Data
    labeled_index = set(train_label_index) | set(valid_label_index)
    train_unlabel_index = [i for i in range(60000) if i not in labeled_index]

    trainset_np = trainset_original.train_data.numpy()
    trainset_label_np = trainset_original.train_labels.numpy()
    train_data_sub_unl = torch.from_numpy(trainset_np[train_unlabel_index])
    #train_labels_sub_unl = torch.from_numpy(trainset_label_np[train_unlabel_index])
    # -1 marks a sample as unlabeled; use an integer dtype so the labels
    # can serve as class indices later
    temp = np.full(47000, -1, dtype=np.int64)
    train_labels_sub_unl = torch.from_numpy(temp)

    trainset_new_unl = subMNIST(root='./data',
                                train=True,
                                download=True,
                                transform=transform,
                                k=47000)
    trainset_new_unl.train_data = train_data_sub_unl.clone()
    trainset_new_unl.train_labels = train_labels_sub_unl.clone()

    pickle.dump(trainset_new_unl, open("train_unlabeled.p", "wb"))
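
zoom_image, translate_image and rotate_image are not defined in dataLoader. Rough sketches follow, assuming 28x28 byte tensors; the shift and angle ranges and the crop/pad strategy are assumptions.

import random
import numpy as np
import torch
from scipy import ndimage

def rotate_image(img_tensor):
    # assumed helper: rotate by a small random angle, keeping the 28x28 shape
    out = ndimage.rotate(img_tensor.numpy(), random.uniform(-20, 20),
                         reshape=False)
    return torch.from_numpy(out)

def translate_image(img_tensor):
    # assumed helper: shift by up to 3 pixels in each direction
    shift = (random.randint(-3, 3), random.randint(-3, 3))
    return torch.from_numpy(ndimage.shift(img_tensor.numpy(), shift))

def zoom_image(img_tensor):
    # assumed helper: zoom about the centre, then crop or zero-pad back to size
    img = img_tensor.numpy()
    zoomed = ndimage.zoom(img, random.uniform(0.9, 1.1))
    out = np.zeros_like(img)
    h = min(img.shape[0], zoomed.shape[0])
    w = min(img.shape[1], zoomed.shape[1])
    oy, ox = (img.shape[0] - h) // 2, (img.shape[1] - w) // 2
    zy, zx = (zoomed.shape[0] - h) // 2, (zoomed.shape[1] - w) // 2
    out[oy:oy + h, ox:ox + w] = zoomed[zy:zy + h, zx:zx + w]
    return torch.from_numpy(out)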
Example #6
def split_dataset(trainset_original,
                  n_train_labels_pc=10,
                  n_train_unlabeled_pc=None,
                  n_validation_pc=1000):
    """

    Parameters
    ----------
    trainset_original : torch.utils.data.Dataset
        A dataset object as defined by torch

    n_train_labels_pc : int
        Number of labeled samples per class to use for training.

    n_train_unlabeled_pc : int or None
        Number of unlabeled samples per class to use for training. If None,
        all remaining samples of the class are used.

    n_validation_pc : int
        Number of labeled samples per class to use for validation.

    Returns
    -------
    trainset_new, trainset_new_unl, validset : torch.utils.data.Dataset objects

    """

    train_label_index = []
    train_unlabel_index = []
    valid_label_index = []

    classes = np.unique(trainset_original.train_labels.numpy())
    n_classes = len(classes)

    for i in range(n_classes):
        train_label_list = trainset_original.train_labels.numpy()
        label_index = np.where(train_label_list == i)[0]
        n_class_samples = len(label_index)

        n_tv = n_train_labels_pc + n_validation_pc
        if n_train_unlabeled_pc is not None:
            n_tv += n_train_unlabeled_pc

        if n_tv > n_class_samples:
            raise ValueError('Class {} does not have enough samples ({}) to '
                             'split into labeled training, unlabeled training '
                             'and validation sets'.format(classes[i],
                                                          n_class_samples))

        label_subindex = list(label_index[:n_train_labels_pc])
        ind_end = n_train_labels_pc + n_validation_pc
        valid_subindex = list(label_index[n_train_labels_pc:ind_end])
        ind_start = ind_end

        if n_train_unlabeled_pc is not None:
            ind_end += n_train_unlabeled_pc
        else:
            ind_end = n_class_samples

        unlabel_subindex = list(label_index[ind_start:ind_end])
        train_label_index += label_subindex
        valid_label_index += valid_subindex
        train_unlabel_index += unlabel_subindex

    trainset_np = trainset_original.train_data.numpy()
    trainset_label_np = trainset_original.train_labels.numpy()
    train_data_sub = torch.from_numpy(trainset_np[train_label_index])
    train_labels_sub = torch.from_numpy(trainset_label_np[train_label_index])

    trainset_new = subMNIST(root='./../data',
                            train=True,
                            download=True,
                            transform=mnist_transform,
                            k=n_train_labels_pc * n_classes)
    trainset_new.train_data = train_data_sub.clone()
    trainset_new.train_labels = train_labels_sub.clone()

    # pickle.dump(trainset_new, open("./../data/train_labeled.p", "wb"))

    validset_np = trainset_original.train_data.numpy()
    validset_label_np = trainset_original.train_labels.numpy()
    valid_data_sub = torch.from_numpy(validset_np[valid_label_index])
    valid_labels_sub = torch.from_numpy(validset_label_np[valid_label_index])

    validset = subMNIST(root='./../data',
                        train=False,
                        download=True,
                        transform=mnist_transform,
                        k=n_validation_pc * n_classes)
    validset.test_data = valid_data_sub.clone()
    validset.test_labels = valid_labels_sub.clone()

    # pickle.dump(validset, open("./../data/validation.p", "wb"))

    n_unlabeled_set = len(train_unlabel_index)
    trainset_np = trainset_original.train_data.numpy()
    trainset_label_np = trainset_original.train_labels.numpy()
    train_data_sub_unl = torch.from_numpy(trainset_np[train_unlabel_index])
    # train_labels_sub_unl = torch.from_numpy(trainset_label_np[train_unlabel_index])

    trainset_new_unl = subMNIST(root='./../data',
                                train=True,
                                download=True,
                                transform=mnist_transform,
                                k=n_unlabeled_set)
    trainset_new_unl.train_data = train_data_sub_unl.clone()
    trainset_new_unl.train_labels = None  # Unlabeled

    # pickle.dump(trainset_new_unl, open("./../data/train_unlabeled.p", "wb"))

    return trainset_new, trainset_new_unl, validset
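
A hypothetical call, reproducing a 100-label semi-supervised MNIST split (10 labels per class); mnist_transform used inside split_dataset is assumed to be defined elsewhere in the module.

from torchvision import datasets

mnist = datasets.MNIST('./../data', train=True, download=True)
trainset_new, trainset_new_unl, validset = split_dataset(
    mnist, n_train_labels_pc=10, n_validation_pc=1000)
print(trainset_new.train_data.size())      # 100 labeled training images
print(trainset_new_unl.train_data.size())  # the rest, kept unlabeled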
Example #7
train_label_index = []
valid_label_index = []
for i in range(10):
    train_label_list = trainset_original.train_labels.numpy()
    label_index = np.where(train_label_list == i)[0]
    label_subindex = list(label_index[:300])
    valid_subindex = list(label_index[300: 1000 + 300])
    train_label_index += label_subindex
    valid_label_index += valid_subindex

trainset_np = trainset_original.train_data.numpy()
trainset_label_np = trainset_original.train_labels.numpy()
train_data_sub = torch.from_numpy(trainset_np[train_label_index])
train_labels_sub = torch.from_numpy(trainset_label_np[train_label_index])

trainset_new = subMNIST(root='data', train=True, download=True, transform=transform, k=3000)
trainset_new.data = train_data_sub.clone()
trainset_new.targets = train_labels_sub.clone()

pickle.dump(trainset_new, open("data/train_labeled.p", "wb"))


validset_np = trainset_original.train_data.numpy()
validset_label_np = trainset_original.train_labels.numpy()
valid_data_sub = torch.from_numpy(validset_np[valid_label_index])
valid_labels_sub = torch.from_numpy(validset_label_np[valid_label_index])


validset = subMNIST(root='data', train=False, download=True, transform=transform, k=10000)
validset.data = valid_data_sub.clone()
validset.targets = valid_labels_sub.clone()
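
Note that this example writes to trainset_new.data and trainset_new.targets rather than train_data and train_labels: newer torchvision releases renamed the MNIST attributes to data and targets and kept the old names only as deprecated aliases, so which pair to assign depends on the installed version.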
Example #8
def all_transformations(dataset_pickle,
                        data_type,
                        type_transformation=['rotation'],
                        value_rotation=45.0,
                        value_scale=0.8,
                        distance_translation=0.1,
                        direction_translation='right',
                        horizontal_translation=True,
                        elastic_alpha=34,
                        elastic_sigma=4):
    '''
    This function takes pickled MNIST data and pre-processes it with one or
    more data-augmentation techniques. The default value of each
    transformation can be overridden through the keyword arguments.

    INPUT: dataset_pickle (pickled pytorch dataset),
           data_type = 'train' or 'train_unlabel' (other values are not handled below),
           type_transformation = ['rotation'], ['scale'], ['translation'], ['elastic'],
               one of the combinations handled below (e.g. ['rotation', 'scale']), or ['all']

    RETURNS: transformed data, ready to load with torch.utils.data.DataLoader
    '''
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])
    if data_type in ('train', 'train_unlabel'):
        dataset_loader_numpy = dataset_pickle.train_data.numpy()
        dataset_label_numpy = dataset_pickle.train_labels.numpy()
        dataset_import = subMNIST(root='./data',
                                  train=True,
                                  transform=transform,
                                  download=True,
                                  k=dataset_pickle.train_data.size()[0])

    if type_transformation == ['rotation']:
        dataset_loader_numpy_transformed = np.array(
            [rotation(x, value_rotation) for x in dataset_loader_numpy])
    if type_transformation == ['elastic']:
        dataset_loader_numpy_transformed = np.array([
            elastic_transform(x, elastic_alpha, elastic_sigma)
            for x in dataset_loader_numpy
        ])
    if type_transformation == ['scale']:
        dataset_loader_numpy_transformed = np.array(
            [scale(x, value_scale) for x in dataset_loader_numpy])
    if type_transformation == ['translation']:
        dataset_loader_numpy_transformed = np.array([
            translation(x,
                        distance_translation,
                        direction=direction_translation,
                        horizontal=horizontal_translation)
            for x in dataset_loader_numpy
        ])
    if type_transformation == ['rotation', 'scale'] or type_transformation == [
            'scale', 'rotation'
    ]:
        dataset_loader_numpy_transformed = np.array(
            [rotation(x, value_rotation) for x in dataset_loader_numpy])
        dataset_loader_numpy_transformed = np.array(
            [scale(x, value_scale) for x in dataset_loader_numpy_transformed])
    if type_transformation == [
            'rotation', 'translation'
    ] or type_transformation == ['translation', 'rotation']:
        dataset_loader_numpy_transformed = np.array(
            [rotation(x, value_rotation) for x in dataset_loader_numpy])
        dataset_loader_numpy_transformed = np.array([
            translation(x,
                        distance_translation,
                        direction=direction_translation,
                        horizontal=horizontal_translation)
            for x in dataset_loader_numpy_transformed
        ])
    if type_transformation == [
            'scale', 'translation'
    ] or type_transformation == ['translation', 'scale']:
        dataset_loader_numpy_transformed = np.array(
            [scale(x, value_scale) for x in dataset_loader_numpy])
        dataset_loader_numpy_transformed = np.array([
            translation(x,
                        distance_translation,
                        direction=direction_translation,
                        horizontal=horizontal_translation)
            for x in dataset_loader_numpy_transformed
        ])
    if type_transformation == ['rotation', 'scale', 'translation', 'elastic']:
        dataset_loader_numpy_transformed = np.array(
            [rotation(x, value_rotation) for x in dataset_loader_numpy])
        dataset_loader_numpy_transformed = np.array(
            [scale(x, value_scale) for x in dataset_loader_numpy_transformed])
        dataset_loader_numpy_transformed = np.array([
            translation(x,
                        distance_translation,
                        direction=direction_translation,
                        horizontal=horizontal_translation)
            for x in dataset_loader_numpy_transformed
        ])
        dataset_loader_numpy_transformed = np.array([
            elastic_transform(x, elastic_alpha, elastic_sigma)
            for x in dataset_loader_numpy_transformed
        ])
    if type_transformation == ['rotation', 'elastic']:
        dataset_loader_numpy_transformed = np.array(
            [rotation(x, value_rotation) for x in dataset_loader_numpy])
        dataset_loader_numpy_transformed = np.array([
            elastic_transform(x, elastic_alpha, elastic_sigma)
            for x in dataset_loader_numpy_transformed
        ])

    # note: 'all' applies rotation, scale and translation, but not elastic
    if type_transformation == ['all']:
        dataset_loader_numpy_transformed = np.array(
            [rotation(x, value_rotation) for x in dataset_loader_numpy])
        dataset_loader_numpy_transformed = np.array(
            [scale(x, value_scale) for x in dataset_loader_numpy_transformed])
        dataset_loader_numpy_transformed = np.array([
            translation(x,
                        distance_translation,
                        direction=direction_translation,
                        horizontal=horizontal_translation)
            for x in dataset_loader_numpy_transformed
        ])

    dataset_loader_preprocessed = torch.from_numpy(
        dataset_loader_numpy_transformed)
    dataset_loader_preprocessed2 = dataset_loader_preprocessed.type(
        torch.ByteTensor)

    if data_type == 'train':
        print("TRAIN TYPE")
        dataset_import.train_data = dataset_loader_preprocessed2.clone()
        dataset_import.train_labels = torch.from_numpy(
            dataset_label_numpy).clone()
    if data_type == 'train_unlabel':
        dataset_import.train_data = dataset_loader_preprocessed2.clone()
        dataset_import.train_labels = torch.from_numpy(
            np.repeat(-1,
                      dataset_import.train_data.size()[0])).clone()

    return dataset_import
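
The rotation, scale and translation helpers used above resemble the sketches shown after Example #5; elastic_transform is the most distinctive one, and the defaults alpha=34, sigma=4 are values commonly used for Simard-style elastic deformation on MNIST. The implementation below is an assumption, not the example's own.

import numpy as np
from scipy.ndimage import gaussian_filter, map_coordinates

def elastic_transform(img, alpha, sigma):
    # assumed implementation: smooth a random displacement field, then
    # resample the image through it with bilinear interpolation
    dx = gaussian_filter(np.random.uniform(-1, 1, img.shape), sigma) * alpha
    dy = gaussian_filter(np.random.uniform(-1, 1, img.shape), sigma) * alpha
    x, y = np.meshgrid(np.arange(img.shape[1]), np.arange(img.shape[0]))
    coords = (y + dy).ravel(), (x + dx).ravel()
    return map_coordinates(img, coords, order=1).reshape(img.shape)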