Example #1
def get_ardis_dataset():
    # load the data from csv's
    ardis_images=np.loadtxt('./data/ARDIS/ARDIS_train_2828.csv', dtype='float')
    ardis_labels=np.loadtxt('./data/ARDIS/ARDIS_train_labels.csv', dtype='float')


    #### reshape to be [samples][width][height]
    ardis_images = ardis_images.reshape(ardis_images.shape[0], 28, 28).astype('float32')

    # labels are one-hot encoded
    indices_seven = np.where(ardis_labels[:,7] == 1)[0]
    images_seven = ardis_images[indices_seven,:]
    images_seven = torch.tensor(images_seven).type(torch.uint8)

    # one backdoor label (7) per selected image, so targets align with data
    labels_seven = torch.tensor([7] * len(images_seven))

    ardis_dataset = EMNIST('./data', split="digits", train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ]))

    ardis_dataset.data = images_seven
    ardis_dataset.targets = labels_seven

    return ardis_dataset
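A hedged usage sketch for get_ardis_dataset above: the returned object is a regular torchvision dataset (holding only ARDIS "7"s), so it drops straight into a DataLoader. It assumes the ARDIS CSV files are already present under ./data/ARDIS/.

from torch.utils.data import DataLoader

ardis_sevens = get_ardis_dataset()
loader = DataLoader(ardis_sevens, batch_size=64, shuffle=True)
images, labels = next(iter(loader))  # images: [64, 1, 28, 28]; every label is 7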
Example #2
def get_single_task(dataroot, task):
    tf = transforms.ToTensor()

    if task.startswith('EMNIST'):
        split = task.split('/', maxsplit=2)[1]
        dataroot = join(dataroot, 'emnist')
        tf_target = (lambda x: x - 1) if split == 'letters' else None
        output_size = 26 if split == 'letters' else 10
        trainset = EMNIST(dataroot,
                          split=split,
                          train=True,
                          transform=tf,
                          target_transform=tf_target)
        trainset = stratified_subset(trainset, trainset.targets.tolist(), 500)
        testset = EMNIST(dataroot,
                         split=split,
                         train=False,
                         transform=tf,
                         target_transform=tf_target)
    elif task == 'KMNIST':
        dataroot = join(dataroot, 'kmnist')
        output_size = 10
        trainset = KMNIST(dataroot, train=True, transform=tf)
        trainset = stratified_subset(trainset, trainset.targets.tolist(), 500)
        testset = KMNIST(dataroot, train=False, transform=tf)
    else:
        raise ValueError(task)

    return trainset, testset, output_size
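A brief usage sketch (hedged): the EMNIST constructors above do not pass download=True, so this assumes the EMNIST files already sit under ./data/emnist.

from torch.utils.data import DataLoader

trainset, testset, output_size = get_single_task('./data', 'EMNIST/letters')
train_loader = DataLoader(trainset, batch_size=128, shuffle=True)
test_loader = DataLoader(testset, batch_size=256)
print(output_size)  # 26 classes for the letters split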
Example #3
def emnist_flat(split, batch_size, dev):
	kwargs = {} if dev == torch.device('cpu') \
			else { 'num_workers': 4, 'pin_memory': True }

	train = EMNIST(EMNIST_ROOT, split, train=True, download=True, transform=flat_transform)
	test = EMNIST(EMNIST_ROOT, split, train=False, download=True, transform=flat_transform)

	train_loader = DataLoader(train, batch_size=batch_size, shuffle=True, **kwargs)
	test_loader = DataLoader(test, batch_size=batch_size, **kwargs)

	return train_loader, test_loader
def main(args):
    device = torch.device("cpu" if args.gpu < 0 else "cuda:" + str(args.gpu))

    # create the dataset
    transform = transforms.Compose([transforms.ToTensor()])
    if args.mode == 'train':
        train_dataset = EMNIST(root='./data',
                               split='mnist',
                               train=True,
                               download=True,
                               transform=transform)
        # train_dataset, _ = train_test_split(train_dataset, train_size=int(0.1*len(train_dataset)))
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=args.batch_size,
                                      shuffle=True)
    if args.mode != 'custom_mnist':
        test_dataset = EMNIST(root='./data',
                              split='mnist',
                              train=False,
                              download=True,
                              transform=transform)
    if args.mode == 'custom_mnist':
        custom_transform = transforms.Compose([
            lambda x: np.asarray(x),
            lambda x: cv2.cvtColor(255 - x, cv2.COLOR_RGB2GRAY),
            lambda x: cv2.GaussianBlur(x, (3, 3), 1), lambda x: get_roi(x),
            transforms.ToTensor(),
            transforms.Resize((28, 28)), lambda x: torch.transpose(x, 1, 2)
        ])
        test_dataset = ImageFolder('./data/CustomMNIST',
                                   transform=custom_transform)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=True)
    n_classes = len(test_dataset.classes)

    # create the model, loss function and optimizer
    model = MNISTClassifier(n_classes).to(device)
    loss_fcn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # train and test
    if args.mode == "train":
        train(model, loss_fcn, device, optimizer, train_dataloader,
              test_dataloader, args)
        torch.save(model.state_dict(), MODEL_STATE_FILE)
    model.load_state_dict(torch.load(MODEL_STATE_FILE))
    if args.mode != 'train':
        visualization_dataset64, _ = train_test_split(test_dataset,
                                                      train_size=64)
        visualize_predictions(model, visualization_dataset64, device)
    return test(model, loss_fcn, device, test_dataloader)
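A hedged sketch of invoking main() without the repo's command-line parser; the attribute names are inferred from the function body, and MODEL_STATE_FILE, MNISTClassifier and the train/test helpers come from the original module.

from argparse import Namespace

args = Namespace(mode='train', gpu=-1, batch_size=64, lr=1e-3)
test_result = main(args)  # returns whatever the module-level test() returns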
Example #5
def get_datasets(split='balanced', save=False):
    download_folder = './data'

    transform = Compose([ToTensor()])

    dataset = ConcatDataset([
        EMNIST(root=download_folder,
               split=split,
               download=True,
               train=False,
               transform=transform),
        EMNIST(root=download_folder,
               split=split,
               download=True,
               train=True,
               transform=transform)
    ])

    # Ignore the code below with argument 'save'
    if save:
        random_seed = 4211  # do not change
        n_samples = len(dataset)
        eval_size = 0.2
        indices = list(range(n_samples))
        split_idx = int(np.floor(eval_size * n_samples))  # avoid shadowing the `split` argument

        np.random.seed(random_seed)
        np.random.shuffle(indices)

        train_indices, eval_indices = indices[split_idx:], indices[:split_idx]

        # cut to half
        train_indices = train_indices[:len(train_indices) // 2]
        eval_indices = eval_indices[:len(eval_indices) // 2]

        np.savez('train_test_split.npz',
                 train=train_indices,
                 test=eval_indices)

    # just use save=False for students
    # load train test split indices
    else:
        with np.load('./train_test_split.npz') as f:
            train_indices = f['train']
            eval_indices = f['test']

    train_dataset = Subset(dataset, indices=train_indices)
    eval_dataset = Subset(dataset, indices=eval_indices)

    return train_dataset, eval_dataset
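A minimal sketch of consuming the split returned by get_datasets above, assuming the train_test_split.npz index file already exists next to the script (the save=False path).

from torch.utils.data import DataLoader

train_dataset, eval_dataset = get_datasets(split='balanced', save=False)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=128)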
def letters(root):
    from torchvision.datasets import EMNIST
    transform = transforms.Compose([
        lambda x: x.convert("RGB"),
        transforms.Resize(224),
        transforms.ToTensor(),
        # transforms.Normalize((0.5, 0.5, 0.5), (1., 1., 1.)),
    ])
    trainset = EMNIST(root,
                      train=True,
                      split='letters',
                      transform=transform,
                      download=True)
    testset = EMNIST(root, train=False, split='letters', transform=transform)
    return trainset, testset
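A hedged usage sketch: because letters() converts to RGB and resizes to 224x224, the loader below yields ImageNet-sized batches suitable for a pretrained backbone.

from torch.utils.data import DataLoader

trainset, testset = letters('./data')
train_loader = DataLoader(trainset, batch_size=64, shuffle=True)
images, targets = next(iter(train_loader))  # images: [64, 3, 224, 224]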
Example #7
def get_dataloaders_using_pytorch():
    # download dataset
    root = '../data/emnist_pthdata'

    # transforms for train and test
    train_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomAffine(degrees=10,
                                translate=(0.2, 0.2),
                                scale=(0.8, 1.2)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5])
    ])

    test_transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize(mean=[0.5], std=[0.5])])

    train_dataset = EMNIST(root=root,
                           split='balanced',
                           download=True,
                           train=True,
                           transform=train_transform)
    test_dataset = EMNIST(root=root,
                          split='balanced',
                          download=True,
                          train=False,
                          transform=test_transform)

    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=192,
                              drop_last=True,
                              sampler=ImbalancedDatasetSampler(train_dataset))
    test_loader = DataLoader(dataset=test_dataset,
                             shuffle=False,
                             batch_size=64)
    return train_loader, test_loader
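A quick sanity-check sketch for the loaders above (hedged; ImbalancedDatasetSampler is presumably the sampler from the external imbalanced-dataset-sampler package).

train_loader, test_loader = get_dataloaders_using_pytorch()
images, labels = next(iter(train_loader))
print(images.shape, labels.shape)  # torch.Size([192, 1, 28, 28]) torch.Size([192])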
Example #8
def get_emnist_data(transform=None, RGB=True):
    """Returns EMNIST train and test datasets.

    This function is assumed to be primarily used as seed data.
    DataLoaders and data splits are in synthesis.py

    Parameters:
        transform: Relevant Torchvision transforms to apply to EMNIST
        RGB: A boolean value that decides if the images are RGB

    Returns:
        Two Torchvision datasets with the EMNIST train and test sets
    """

    if transform is None and (RGB is True):
        transform = transforms.Compose([
            transforms.Lambda(lambda image: image.convert("RGB")),
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, )),
        ])
    elif transform is None and (RGB is False):
        # assign the composed transform (the original dropped the assignment)
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])

    emnist_train = EMNIST(os.getcwd(),
                          split="digits",
                          train=True,
                          download=True,
                          transform=transform)
    emnist_test = EMNIST(os.getcwd(),
                         split="digits",
                         train=False,
                         download=True,
                         transform=transform)
    return emnist_train, emnist_test
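A hedged usage sketch matching the docstring above: the datasets are intended as seed data, so only a simple DataLoader is shown here.

from torch.utils.data import DataLoader

emnist_train, emnist_test = get_emnist_data(RGB=True)
seed_loader = DataLoader(emnist_train, batch_size=128, shuffle=True)
images, _ = next(iter(seed_loader))  # RGB copies of the digit images: [128, 3, 28, 28]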
Example #9
    def get_test_loader(self, filename=None):
        dataset = EMNIST(os.path.join(datasets_path, 'emnist'),
                         split='balanced',
                         download=True,
                         train=False)

        def transform(x):
            return torch.bernoulli(x.unsqueeze(-3).float().div(255))

        filename = self.testfile if filename is None else filename
        classes = torch.arange(20, 47) if self.novel else None
        return get_saved_data_loader(dataset,
                                     filename,
                                     TEST_NUM_PER_CLASS,
                                     transform=transform,
                                     classes=classes)
Example #10
    def __build_truncated_dataset__(self):
        emnist_dataobj = EMNIST(self.root, split="digits", train=self.train, 
                                transform=self.transform, 
                                target_transform=self.target_transform, 
                                download=self.download)

        # `.data` / `.targets` replace the deprecated train_data/train_labels
        # and test_data/test_labels attributes; `train` was already passed above.
        data = emnist_dataobj.data
        target = emnist_dataobj.targets

        if self.dataidxs is not None:
            data = data[self.dataidxs]
            target = target[self.dataidxs]

        return data, target
Example #11
    def get_train_loader(self):
        dataset = EMNIST(os.path.join(datasets_path, 'emnist'),
                         split='balanced',
                         download=True,
                         train=True)

        def transform(x):
            return torch.bernoulli(x.unsqueeze(-3).float().div(255))

        classes = torch.arange(20) if self.novel else None
        return get_random_data_loader(dataset,
                                      self.B,
                                      self.N,
                                      self.K,
                                      self.num_steps,
                                      TRAIN_NUM_PER_CLASS,
                                      transform=transform,
                                      classes=classes)
Example #12
    def load_or_generate_data(self) -> None:
        """Fetch the EMNIST dataset."""
        dataset = EMNIST(
            root=DATA_DIRNAME,
            split="byclass",
            train=self.train,
            download=False,
            transform=None,
            target_transform=None,
        )

        self._data = dataset.data
        self._targets = dataset.targets

        if self.sample_to_balance:
            self._sample_to_balance()

        if self.subsample_fraction is not None:
            self._subsample()
Example #13
def load_dataset(data_transforms,
                 batch_size=1024,
                 num_workers=0,
                 root=DATA_ROOT,
                 split='digits'):

    datasets = {}
    for name in ('train', 'valid'):
        is_training = name == 'train'
        dataset = EMNIST(root=root,
                         split=split,
                         train=is_training,
                         download=True,
                         transform=data_transforms[name])
        loader = DataLoader(dataset,
                            batch_size=batch_size,
                            num_workers=num_workers)
        datasets[name] = {'dataset': dataset, 'loader': loader}
    return datasets
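A minimal sketch of the data_transforms argument expected by load_dataset above (hedged; the original project supplies its own dict and a DATA_ROOT constant, so the root is passed explicitly here).

from torchvision import transforms

data_transforms = {
    'train': transforms.Compose([transforms.ToTensor()]),
    'valid': transforms.Compose([transforms.ToTensor()]),
}
datasets = load_dataset(data_transforms, batch_size=256, root='./data', split='digits')
train_loader = datasets['train']['loader']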
Example #14
    def __init__(self, letters, n_cap):
        """
        Create a data object to house aspects of the data
        :param letters: a tuple of the English letters wanting to plot
        :param n_cap: capped observations
        """
        self.letters = letters

        # Required PyTorch transform for this project
        transform = transforms.Compose([
            transforms.ToTensor(),
            # EMNIST is single-channel, so use one mean/std pair
            transforms.Normalize((0.5,), (0.5,))
        ])

        # Fetch the dataset, applying the transform declared above
        self.emnist = EMNIST(root="./data", split='bymerge', download=True,
                             transform=transform)

        # Split
        self.data_ind, self.targ, self.flat_X, self.labels, self.X = self.filter_by_label(
            n_cap)
Example #15
    def __build_truncated_dataset__(self):

        emnist_dataobj = EMNIST(self.root, split="digits", train=self.train, 
                                transform=self.transform, 
                                target_transform=self.target_transform, 
                                download=self.download)

        if self.train:
            data = emnist_dataobj.data
            target = np.array(emnist_dataobj.targets)
        else:
            data = emnist_dataobj.data
            target = np.array(emnist_dataobj.targets)

        if self.dataidxs is not None:
            data = data[self.dataidxs]
            target = target[self.dataidxs]

        data = np.append(data, self.saved_ardis_dataset_train, axis=0)
        target = np.append(target, self.saved_ardis_label_train, axis=0)
        return data, target
Example #16
    def sample(self, B, N, K, **kwargs):
        dataset = EMNIST(os.path.join(datasets_path, 'emnist'),
                         split='balanced',
                         download=True,
                         train=False)

        def transform(x):
            return torch.bernoulli(x.unsqueeze(-3).float().div(255))

        loader = get_random_data_loader(
            dataset,
            B,
            N,
            K,
            1,
            TEST_NUM_PER_CLASS,
            transform=transform,
            classes=(torch.arange(20, 47) if self.novel else None),
            **kwargs)
        return next(iter(loader))
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='run approximation to LeNet on Mnist')
    parser.add_argument('--batch-size',
                        type=int,
                        default=512,
                        metavar='N',
                        help='input batch size for training (default: 512)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for testing (default: 64)')
    parser.add_argument('--approx-epochs',
                        type=int,
                        default=200,
                        metavar='N',
                        help='number of epochs to approx (default: 200)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-2,
                        metavar='LR',
                        help='learning rate (default: 1e-2)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--dropout-rate',
                        type=float,
                        default=0.5,
                        metavar='p_drop',
                        help='dropout rate')
    parser.add_argument(
        '--S',
        type=int,
        default=100,
        metavar='N',
        help='number of posterior samples from the Bayesian model')
    parser.add_argument(
        '--model-path',
        type=str,
        default='../saved_models/emnist_mcdp/',
        help='directory holding the saved teacher/approximation models')
    parser.add_argument('--save-approx-model',
                        type=int,
                        default=0,
                        metavar='N',
                        help='save approx model or not? default not')
    parser.add_argument('--from-approx-model',
                        type=int,
                        default=1,
                        metavar='N',
                        help='if our model is loaded or trained')
    parser.add_argument('--test-ood-from-disk',
                        type=int,
                        default=1,
                        help='generate test samples or load from disk')

    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 8, 'pin_memory': False} if use_cuda else {}

    tr_data = EMNIST(
        '../../data',
        split='balanced',
        train=True,
        transform=transforms.Compose([
            # transforms.ToPILImage(),
            transforms.Resize((28, 28)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, ), (0.5, ))
        ]),
        download=True)

    te_data = EMNIST(
        '../../data',
        split='balanced',
        train=False,
        transform=transforms.Compose([
            # transforms.ToPILImage(),
            transforms.Resize((28, 28)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, ), (0.5, ))
        ]),
        download=True)

    ood_data = datasets.Omniglot('../data',
                                 download=True,
                                 transform=transforms.Compose([
                                     transforms.Resize((28, 28)),
                                     transforms.ToTensor(),
                                     transforms.Normalize((0.5, ), (0.5, )),
                                 ]))

    train_loader = torch.utils.data.DataLoader(tr_data,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               **kwargs)

    test_loader = torch.utils.data.DataLoader(te_data,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              **kwargs)

    ood_loader = torch.utils.data.DataLoader(ood_data,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             **kwargs)

    model = mnist_net().to(device)

    model.load_state_dict(torch.load(args.model_path + 'mcdp-emnist.pt'))

    test(args, model, device, test_loader)

    if args.from_approx_model == 0:
        output_samples = torch.load(args.model_path + 'emnist-mcdp-samples.pt')

    # --------------- training approx ---------

    print('approximating ...')
    fmodel = mnist_net_f().to(device)
    gmodel = mnist_net_g().to(device)

    if args.from_approx_model == 0:
        g_optimizer = optim.SGD(gmodel.parameters(),
                                lr=args.lr,
                                momentum=args.momentum)
        f_optimizer = optim.SGD(fmodel.parameters(),
                                lr=args.lr,
                                momentum=args.momentum)

        best_acc = 0
        for epoch in range(1, args.approx_epochs + 1):
            train_approx(args, fmodel, gmodel, device, train_loader,
                         f_optimizer, g_optimizer, output_samples, epoch)
            acc = test(args, fmodel, device, test_loader)
            if acc > best_acc:
                torch.save(fmodel.state_dict(),
                           args.model_path + 'mcdp-emnist-mean-emd.pt')
                torch.save(gmodel.state_dict(),
                           args.model_path + 'mcdp-emnist-conc-emd.pt')
                best_acc = acc

    else:
        fmodel.load_state_dict(
            torch.load(args.model_path + 'mcdp-emnist-mean-emd.pt'))
        gmodel.load_state_dict(
            torch.load(args.model_path + 'mcdp-emnist-conc-emd.pt'))

    print('generating teacher particles for testing&ood data ...')
    # generate particles for test and ood dataset
    model.train()
    if args.test_ood_from_disk == 1:
        teacher_test_samples = torch.load(args.model_path +
                                          'emnist-mcdp-test-samples.pt')
    else:
        with torch.no_grad():
            # obtain ensemble outputs
            all_samples = []
            for i in range(500):
                samples_a_round = []
                for data, target in test_loader:
                    data = data.to(device)
                    output = F.softmax(model(data), dim=1)
                    samples_a_round.append(output)
                samples_a_round = torch.cat(samples_a_round).cpu()
                all_samples.append(samples_a_round)
            teacher_test_samples = torch.stack(all_samples).permute(1, 0, 2)

            # save the stacked/permuted tensor that the load path above expects
            torch.save(teacher_test_samples,
                       args.model_path + 'emnist-mcdp-test-samples.pt')

    if args.test_ood_from_disk == 1:
        teacher_ood_samples = torch.load(
            args.model_path + 'omniglot-mcdp-ood-samples-trd-emnist.pt')
    else:
        with torch.no_grad():
            # obtain ensemble outputs
            all_samples = []
            for i in range(500):
                samples_a_round = []
                for data, target in ood_loader:
                    data = data.to(device)
                    output = F.softmax(model(data), dim=1)
                    samples_a_round.append(output)
                samples_a_round = torch.cat(samples_a_round).cpu()
                all_samples.append(samples_a_round)
            teacher_ood_samples = torch.stack(all_samples).permute(1, 0, 2)

            # save the stacked/permuted tensor that the load path above expects
            torch.save(
                teacher_ood_samples,
                args.model_path + 'omniglot-mcdp-ood-samples-trd-emnist.pt')

    eval_approx(args, fmodel, gmodel, device, test_loader, ood_loader,
                teacher_test_samples, teacher_ood_samples)
Example #18
def get_loader(dataset, path, bsz, cifar_c_type=None):

    if dataset == 'cifar':
        mean = (0.4914, 0.4822, 0.4465)
        std = (0.2023, 0.1944, 0.2010)

        train_transform = transforms.Compose(
            [transforms.ToTensor(),
             transforms.Normalize(mean=mean, std=std)])

        cifar_dataset = datasets.CIFAR10(root=path,
                                         train=True,
                                         download=True,
                                         transform=train_transform)

        num_train = len(cifar_dataset)
        indices = torch.randperm(num_train).tolist()
        valid_size = 2048

        train_idx, valid_idx = indices[valid_size:], indices[:valid_size]

        train_dataset = data.Subset(cifar_dataset, train_idx)
        valid_dataset = data.Subset(cifar_dataset, valid_idx)

        train_loader = data.DataLoader(train_dataset,
                                       batch_size=bsz,
                                       shuffle=True,
                                       drop_last=True)

        valid_loader = data.DataLoader(valid_dataset,
                                       batch_size=2000,
                                       shuffle=True)

        return (train_loader, valid_loader)

    elif dataset == 'cifar_c':
        mean = (0.4914, 0.4822, 0.4465)
        std = (0.2023, 0.1944, 0.2010)

        train_transform = transforms.Compose(
            [transforms.ToTensor(),
             transforms.Normalize(mean=mean, std=std)])

        cifar_c = CIFAR10_C(path,
                            c_type=cifar_c_type,
                            transform=train_transform)

        valid_c_loader = data.DataLoader(cifar_c,
                                         batch_size=500,
                                         shuffle=False)

        return valid_c_loader
    elif dataset == 'mnist':
        train_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=0.1307, std=0.3081)
        ])
        mnist_dataset = MNIST(root=path, transform=train_transform)

        num_train = len(mnist_dataset)
        indices = torch.randperm(num_train).tolist()
        valid_size = 2048

        train_idx, valid_idx = indices[valid_size:], indices[:valid_size]

        train_dataset = data.Subset(mnist_dataset, train_idx)
        valid_dataset = data.Subset(mnist_dataset, valid_idx)

        train_loader = data.DataLoader(train_dataset,
                                       batch_size=bsz,
                                       shuffle=True,
                                       drop_last=True)

        valid_loader = data.DataLoader(valid_dataset,
                                       batch_size=2000,
                                       shuffle=True,
                                       drop_last=True)

        return (train_loader, valid_loader)

    elif dataset == 'mnist_r':
        rot = [15, 30, 45, 60, 75]
        rot_loader = []
        for i in range(5):
            rotation = transforms.Compose([
                transforms.RandomRotation(degrees=(rot[i], rot[i])),
                transforms.ToTensor(),
                transforms.Normalize(mean=0.1307, std=0.3081)
            ])
            rotation_dataset = MNIST(root=path, transform=rotation)
            rot_loader.append(
                data.DataLoader(rotation_dataset,
                                batch_size=500,
                                shuffle=False))

        return rot_loader
    elif dataset == 'svhn':
        mean = (0.4914, 0.4822, 0.4465)
        std = (0.2023, 0.1944, 0.2010)

        train_transform = transforms.Compose(
            [transforms.ToTensor(),
             transforms.Normalize(mean=mean, std=std)])

        svhn_dataset = SVHN(path, download=True, transform=train_transform)

        svhn_loader = data.DataLoader(dataset=svhn_dataset,
                                      batch_size=500,
                                      shuffle=True)

        return svhn_loader

    elif dataset == 'lsun':
        mean = (0.4914, 0.4822, 0.4465)
        std = (0.2023, 0.1944, 0.2010)

        train_transform = transforms.Compose(
            [transforms.ToTensor(),
             transforms.Normalize(mean=mean, std=std)])

        lsun_dataset = LSUN(path, transform=train_transform)

        lsun_loader = data.DataLoader(dataset=lsun_dataset,
                                      batch_size=500,
                                      shuffle=True)
        return lsun_loader

    elif dataset == 'tiny':
        mean = (0.4914, 0.4822, 0.4465)
        std = (0.2023, 0.1944, 0.2010)

        train_transform = transforms.Compose(
            [transforms.ToTensor(),
             transforms.Normalize(mean=mean, std=std)])

        tiny_dataset = TINY(path, transform=train_transform)

        tiny_loader = data.DataLoader(dataset=tiny_dataset,
                                      batch_size=500,
                                      shuffle=True)

        return tiny_loader

    elif dataset == 'fmnist':
        train_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=0.1307, std=0.3081)
        ])

        fmnist_dataset = FashionMNIST(root=path,
                                      train=True,
                                      download=True,
                                      transform=train_transform)

        fmnist_loader = data.DataLoader(dataset=fmnist_dataset, batch_size=500)

        return fmnist_loader
    elif dataset == 'emnist':
        train_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=0.1307, std=0.3081)
        ])

        emnist_dataset = EMNIST(root=path,
                                train=True,
                                split='letters',
                                download=True,
                                transform=train_transform)

        emnist_loader = data.DataLoader(dataset=emnist_dataset, batch_size=500)

        return emnist_loader

    elif dataset == 'nmnist':
        train_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=0.1307, std=0.3081)
        ])

        nmnist_dataset = ImageFolder(root=path, transform=train_transform)

        nmnist_loader = data.DataLoader(dataset=nmnist_dataset, batch_size=500)

        return nmnist_loader
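A hedged usage sketch for the dispatcher above: in-distribution names return a (train, valid) loader pair, while the OOD branches return a single loader. The mnist branch does not pass download=True, so the data is assumed to exist under the given root.

train_loader, valid_loader = get_loader('mnist', './data', bsz=128)
emnist_letters_loader = get_loader('emnist', './data', bsz=128)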
Example #19
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

from torchvision.datasets import EMNIST

from experiments import ROOT_DIR

dataset = EMNIST(os.path.join(ROOT_DIR, 'data', 'EMNIST'),
                 split='letters',
                 train=True,
                 download=True)


class CNN(nn.Module):
    def __init__(self):
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels=1,
                               out_channels=32,
                               kernel_size=5,
                               stride=1)
        self.mp1 = nn.MaxPool2d(kernel_size=2)
        self.conv2 = nn.Conv2d(in_channels=32,
                               out_channels=64,
                               kernel_size=3,
                               stride=1)
        self.mp2 = nn.MaxPool2d(kernel_size=2)

    def forward(self, x):
        # The original listing is truncated here; a minimal, hedged completion
        # simply passes the input through the layers defined in __init__.
        x = self.mp1(F.relu(self.conv1(x)))
        x = self.mp2(F.relu(self.conv2(x)))
        return x
Example #20
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='Training MCDP Bayes teacher and sampling')
    parser.add_argument('--batch-size',
                        type=int,
                        default=256,
                        metavar='N',
                        help='input batch size for training (default: 256)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=100,
                        metavar='N',
                        help='input batch size for testing (default: 100)')
    parser.add_argument('--epochs',
                        type=int,
                        default=300,
                        metavar='N',
                        help='number of epochs to train (default: 300)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--dropout-rate',
                        type=float,
                        default=0.5,
                        metavar='p_drop',
                        help='dropout rate')
    parser.add_argument('--S',
                        type=int,
                        default=500,
                        metavar='N',
                        help='number of posterior samples')
    parser.add_argument(
        '--model-path',
        type=str,
        default='../saved_models/emnist_mcdp/',
        metavar='N',
        help='directory in which the teacher model and its samples are saved')
    parser.add_argument('--from-model',
                        type=int,
                        default=1,
                        metavar='N',
                        help='if our model is loaded or trained')

    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 8, 'pin_memory': True} if use_cuda else {}

    tr_data = EMNIST('../data',
                     split='balanced',
                     train=True,
                     transform=transforms.Compose([
                         transforms.Resize((28, 28)),
                         transforms.ToTensor(),
                         transforms.Normalize((0.5, ), (0.5, ))
                     ]),
                     download=True)

    te_data = EMNIST('../data',
                     split='balanced',
                     train=False,
                     transform=transforms.Compose([
                         transforms.Resize((28, 28)),
                         transforms.ToTensor(),
                         transforms.Normalize((0.5, ), (0.5, ))
                     ]),
                     download=True)

    train_loader = torch.utils.data.DataLoader(tr_data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               **kwargs)

    test_loader = torch.utils.data.DataLoader(te_data,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              **kwargs)

    model = mnist_net().to(device)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=args.momentum)

    # --------------- train or load teacher -----------
    if args.from_model == 1:
        print('loading teacher model ...')
        model.load_state_dict(torch.load(args.model_path + 'mcdp-emnist.pt'))
    else:
        print('training teacher model ...')
        schedule = [50, 100, 150, 200, 250]
        best = 0
        for epoch in range(1, args.epochs + 1):
            if epoch in schedule:
                for g in optimizer.param_groups:
                    g['lr'] *= 0.5
            train_bayesian(args, model, device, train_loader, optimizer, epoch)
            print("teacher training epoch: {}".format(epoch))
            test_acc = test(args, model, device, test_loader)
            if test_acc > best:
                torch.save(model.state_dict(),
                           args.model_path + 'mcdp-emnist.pt')
                best = test_acc

    train_loader = torch.utils.data.DataLoader(tr_data,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               **kwargs)

    print('generating particles for training data ...')
    # for an easier training of amortized approximation,
    # instead of sampling param. during approx,
    # get particles on simplex and store them first.
    with torch.no_grad():
        all_samples = []
        for i in range(500):
            samples_a_round = []
            for data, target in train_loader:
                data = data.to(device)
                output = F.softmax(model(data), dim=1)
                samples_a_round.append(output)
            samples_a_round = torch.cat(samples_a_round).cpu()
            all_samples.append(samples_a_round)
        all_samples = torch.stack(all_samples).permute(1, 0, 2)

        torch.save(all_samples, args.model_path + 'emnist-mcdp-samples.pt')
def main(seed=0,
         n_neurons=100,
         n_train=60000,
         n_test=10000,
         inhib=100,
         lr=1e-2,
         lr_decay=1,
         time=350,
         dt=1,
         theta_plus=0.05,
         theta_decay=1e-7,
         intensity=1,
         progress_interval=10,
         update_interval=250,
         plot=False,
         train=True,
         gpu=False):

    assert n_train % update_interval == 0 and n_test % update_interval == 0, \
        'No. examples must be divisible by update_interval'

    params = [
        seed, n_neurons, n_train, inhib, lr, lr_decay, time, dt, theta_plus,
        theta_decay, intensity, progress_interval, update_interval
    ]

    test_params = [
        seed, n_neurons, n_train, n_test, inhib, lr, lr_decay, time, dt,
        theta_plus, theta_decay, intensity, progress_interval, update_interval
    ]

    model_name = '_'.join([str(x) for x in params])

    np.random.seed(seed)

    if gpu:
        torch.set_default_tensor_type('torch.cuda.FloatTensor')
        torch.cuda.manual_seed_all(seed)
    else:
        torch.manual_seed(seed)

    n_examples = n_train if train else n_test
    n_sqrt = int(np.ceil(np.sqrt(n_neurons)))
    n_classes = 26

    # Build network.
    if train:
        network = DiehlAndCook2015v2(n_inpt=784,
                                     n_neurons=n_neurons,
                                     inh=inhib,
                                     dt=dt,
                                     norm=78.4,
                                     theta_plus=theta_plus,
                                     theta_decay=theta_decay,
                                     nu=[0, lr])

    else:
        network = load_network(os.path.join(params_path, model_name + '.pt'))
        network.connections['X', 'Y'].update_rule = NoOp(
            connection=network.connections['X', 'Y'],
            nu=network.connections['X', 'Y'].nu)
        network.layers['Y'].theta_decay = 0
        network.layers['Y'].theta_plus = 0

    # Load EMNIST data.
    dataset = EMNIST(root=data_path,
                     split='letters',
                     train=train,
                     download=True)

    # `.data` / `.targets` replace the deprecated train_data/train_labels
    # and test_data/test_labels attributes; `train` was already passed to EMNIST above.
    images = dataset.data.float()
    labels = dataset.targets.long()

    if gpu:
        images = images.cuda()
        labels = labels.cuda()

    permutation = torch.randperm(images.size(0))
    images = images[permutation]
    labels = labels[permutation]
    images = images.view(-1, 784)
    images *= intensity
    labels -= 1

    # Record spikes during the simulation.
    spike_record = torch.zeros(update_interval, time, n_neurons)

    # Neuron assignments and spike proportions.
    if train:
        assignments = -torch.ones_like(torch.Tensor(n_neurons))
        proportions = torch.zeros_like(torch.Tensor(n_neurons, n_classes))
        rates = torch.zeros_like(torch.Tensor(n_neurons, n_classes))
        ngram_scores = {}
    else:
        path = os.path.join(params_path,
                            '_'.join(['auxiliary', model_name]) + '.pt')
        assignments, proportions, rates, ngram_scores = torch.load(
            open(path, 'rb'))

    # Sequence of accuracy estimates.
    curves = {'all': [], 'proportion': [], 'ngram': []}
    predictions = {scheme: torch.Tensor().long() for scheme in curves.keys()}

    if train:
        best_accuracy = 0

    spikes = {}
    for layer in set(network.layers) - {'X'}:
        spikes[layer] = Monitor(network.layers[layer],
                                state_vars=['s'],
                                time=time)
        network.add_monitor(spikes[layer], name='%s_spikes' % layer)

    # Train the network.
    if train:
        print('\nBegin training.\n')
    else:
        print('\nBegin test.\n')

    inpt_axes = None
    inpt_ims = None
    spike_ims = None
    spike_axes = None
    weights_im = None
    assigns_im = None
    perf_ax = None

    start = t()
    for i in range(n_examples):
        if i % progress_interval == 0:
            print(f'Progress: {i} / {n_examples} ({t() - start:.4f} seconds)')
            start = t()

        if i % update_interval == 0 and i > 0:
            if train:
                network.connections['X', 'Y'].update_rule.nu[1] *= lr_decay

            if i % len(labels) == 0:
                current_labels = labels[-update_interval:]
            else:
                current_labels = labels[i % len(images) - update_interval:i %
                                        len(images)]

            # Update and print accuracy evaluations.
            curves, preds = update_curves(curves,
                                          current_labels,
                                          n_classes,
                                          spike_record=spike_record,
                                          assignments=assignments,
                                          proportions=proportions,
                                          ngram_scores=ngram_scores,
                                          n=2)
            print_results(curves)

            for scheme in preds:
                predictions[scheme] = torch.cat(
                    [predictions[scheme], preds[scheme]], -1)

            # Save accuracy curves to disk.
            to_write = ['train'] + params if train else ['test'] + params
            f = '_'.join([str(x) for x in to_write]) + '.pt'
            torch.save((curves, update_interval, n_examples),
                       open(os.path.join(curves_path, f), 'wb'))

            if train:
                if any([x[-1] > best_accuracy for x in curves.values()]):
                    print(
                        'New best accuracy! Saving network parameters to disk.'
                    )

                    # Save network to disk.
                    network.save(os.path.join(params_path, model_name + '.pt'))
                    path = os.path.join(
                        params_path,
                        '_'.join(['auxiliary', model_name]) + '.pt')
                    torch.save((assignments, proportions, rates, ngram_scores),
                               open(path, 'wb'))
                    best_accuracy = max([x[-1] for x in curves.values()])

                # Assign labels to excitatory layer neurons.
                assignments, proportions, rates = assign_labels(
                    spike_record, current_labels, n_classes, rates)

                # Compute ngram scores.
                ngram_scores = update_ngram_scores(spike_record,
                                                   current_labels, n_classes,
                                                   2, ngram_scores)

            print()

        # Get next input sample.
        image = images[i % len(images)]
        sample = poisson(datum=image, time=time, dt=dt)
        inpts = {'X': sample}

        # Run the network on the input.
        network.run(inpts=inpts, time=time)

        retries = 0
        while spikes['Y'].get('s').sum() < 5 and retries < 3:
            retries += 1
            image *= 2
            sample = poisson(datum=image, time=time, dt=dt)
            inpts = {'X': sample}
            network.run(inpts=inpts, time=time)

        # Add to spikes recording.
        spike_record[i % update_interval] = spikes['Y'].get('s').t()

        # Optionally plot various simulation information.
        if plot:
            # _input = image.view(28, 28)
            # reconstruction = inpts['X'].view(time, 784).sum(0).view(28, 28)
            _spikes = {layer: spikes[layer].get('s') for layer in spikes}
            input_exc_weights = network.connections[('X', 'Y')].w
            square_weights = get_square_weights(
                input_exc_weights.view(784, n_neurons), n_sqrt, 28)
            # square_assignments = get_square_assignments(assignments, n_sqrt)

            # inpt_axes, inpt_ims = plot_input(_input, reconstruction, label=labels[i], axes=inpt_axes, ims=inpt_ims)
            spike_ims, spike_axes = plot_spikes(_spikes,
                                                ims=spike_ims,
                                                axes=spike_axes)
            weights_im = plot_weights(square_weights, im=weights_im)
            # assigns_im = plot_assignments(square_assignments, im=assigns_im)
            # perf_ax = plot_performance(curves, ax=perf_ax)

            plt.pause(1e-8)

        network.reset_()  # Reset state variables.

    print(f'Progress: {n_examples} / {n_examples} ({t() - start:.4f} seconds)')

    i += 1

    if i % len(labels) == 0:
        current_labels = labels[-update_interval:]
    else:
        current_labels = labels[i % len(images) - update_interval:i %
                                len(images)]

    # Update and print accuracy evaluations.
    curves, preds = update_curves(curves,
                                  current_labels,
                                  n_classes,
                                  spike_record=spike_record,
                                  assignments=assignments,
                                  proportions=proportions,
                                  ngram_scores=ngram_scores,
                                  n=2)
    print_results(curves)

    for scheme in preds:
        predictions[scheme] = torch.cat([predictions[scheme], preds[scheme]],
                                        -1)

    if train:
        if any([x[-1] > best_accuracy for x in curves.values()]):
            print('New best accuracy! Saving network parameters to disk.')

            # Save network to disk.
            if train:
                network.save(os.path.join(params_path, model_name + '.pt'))
                path = os.path.join(
                    params_path, '_'.join(['auxiliary', model_name]) + '.pt')
                torch.save((assignments, proportions, rates, ngram_scores),
                           open(path, 'wb'))

    if train:
        print('\nTraining complete.\n')
    else:
        print('\nTest complete.\n')

    print('Average accuracies:\n')
    for scheme in curves.keys():
        print('\t%s: %.2f' % (scheme, float(np.mean(curves[scheme]))))

    # Save accuracy curves to disk.
    to_write = ['train'] + params if train else ['test'] + params
    f = '_'.join([str(x) for x in to_write]) + '.pt'
    torch.save((curves, update_interval, n_examples),
               open(os.path.join(curves_path, f), 'wb'))

    # Save results to disk.
    results = [
        np.mean(curves['all']),
        np.mean(curves['proportion']),
        np.mean(curves['ngram']),
        np.max(curves['all']),
        np.max(curves['proportion']),
        np.max(curves['ngram'])
    ]

    to_write = params + results if train else test_params + results
    to_write = [str(x) for x in to_write]
    name = 'train.csv' if train else 'test.csv'

    if not os.path.isfile(os.path.join(results_path, name)):
        with open(os.path.join(results_path, name), 'w') as f:
            if train:
                f.write(
                    'random_seed,n_neurons,n_train,inhib,lr,lr_decay,time,timestep,theta_plus,theta_decay,intensity,'
                    'progress_interval,update_interval,mean_all_activity,mean_proportion_weighting,'
                    'mean_ngram,max_all_activity,max_proportion_weighting,max_ngram\n'
                )
            else:
                f.write(
                    'random_seed,n_neurons,n_train,n_test,inhib,lr,lr_decay,time,timestep,theta_plus,theta_decay,'
                    'intensity,progress_interval,update_interval,mean_all_activity,mean_proportion_weighting,'
                    'mean_ngram,max_all_activity,max_proportion_weighting,max_ngram\n'
                )

    with open(os.path.join(results_path, name), 'a') as f:
        f.write(','.join(to_write) + '\n')

    if labels.numel() > n_examples:
        labels = labels[:n_examples]
    else:
        while labels.numel() < n_examples:
            if 2 * labels.numel() > n_examples:
                labels = torch.cat(
                    [labels, labels[:n_examples - labels.numel()]])
            else:
                labels = torch.cat([labels, labels])

    # Compute confusion matrices and save them to disk.
    confusions = {}
    for scheme in predictions:
        confusions[scheme] = confusion_matrix(labels, predictions[scheme])

    to_write = ['train'] + params if train else ['test'] + test_params
    f = '_'.join([str(x) for x in to_write]) + '.pt'
    torch.save(confusions, os.path.join(confusion_path, f))
Example #22
def select_dataset(dataset_name, input_dim=2, n_samples=10000):
    """
    :params n_samples: number of points returned. If 0, all datapoints will be returned. For artificial data, it will throw an error.
    """
    if dataset_name == 'fmnist':
        f_mnist = FashionMNIST(root="./datasets", download=True)
        data = f_mnist.data.numpy()
        vec_data = np.reshape(data, (data.shape[0], -1))
        vec_data = np.float32(vec_data)
        labels = f_mnist.targets.numpy()
    elif dataset_name == 'emnist':
        f_mnist = EMNIST(root="./datasets", download=True, split='byclass')
        data = f_mnist.data.numpy()
        vec_data = np.reshape(data, (data.shape[0], -1))
        vec_data = np.float32(vec_data)
        labels = f_mnist.targets.numpy()
    elif dataset_name == 'kmnist':
        f_mnist = KMNIST(root="./datasets", download=True)
        data = f_mnist.data.numpy()
        vec_data = np.reshape(data, (data.shape[0], -1))
        vec_data = np.float32(vec_data)
        labels = f_mnist.targets.numpy()
    elif dataset_name == 'usps':
        f_mnist = USPS(root="./datasets", download=True)
        data = f_mnist.data
        vec_data = np.reshape(data, (data.shape[0], -1))
        vec_data = np.float32(vec_data)
        labels = np.float32(f_mnist.targets)
    elif dataset_name == 'news':
        newsgroups_train = fetch_20newsgroups(data_home='./datasets', subset='train',
                                              remove=('headers', 'footers', 'quotes'))
        vectorizer = TfidfVectorizer()
        vec_data = vectorizer.fit_transform(newsgroups_train.data).toarray()
        vec_data = np.float32(vec_data)
        labels = newsgroups_train.target
        labels = np.float32(labels)
    elif dataset_name == 'cover_type':
        file_name = file_path + "/datasets/covtype.data"
        train_data = np.array(pd.read_csv(file_name, sep=','))
        vec_data = np.float32(train_data[:, :-1])
        labels = np.float32(train_data[:, -1])
    elif dataset_name == 'char':
        digits = datasets.load_digits()
        n_samples = len(digits.images)
        data = digits.images.reshape((n_samples, -1))
        vec_data = np.float32(data)
        labels = digits.target

    elif dataset_name == 'charx':
        file_name = file_path + "/datasets/char_x.npy"
        data = np.load(file_name, allow_pickle=True)
        vec_data, labels = data[0], data[1]

    elif dataset_name == 'kdd_cup':
        cover_train = fetch_kddcup99(data_home='./datasets', download_if_missing=True)
        vec_data = cover_train.data
        string_labels = cover_train.target
        vec_data, labels = feature_tranformers.vectorizer_kdd(data=vec_data, labels=string_labels)
    elif dataset_name == 'aggregation':
        file_name = file_path + "/2d_data/Aggregation.csv"
        a = np.array(pd.read_csv(file_name, sep=';'))
        vec_data = a[:, 0:2]
        labels = a[:, 2]
    elif dataset_name == 'compound':
        file_name = file_path + "/2d_data/Compound.txt"
        a = np.array(pd.read_csv(file_name, sep='\t'))
        vec_data = a[:, 0:2]
        labels = a[:, 2]
    elif dataset_name == 'd31':
        file_name = file_path + "/2d_data/D31.txt"
        a = np.array(pd.read_csv(file_name, sep='\t'))
        vec_data = a[:, 0:2]
        labels = a[:, 2]
    elif dataset_name == 'flame':
        file_name = file_path + "/2d_data/flame.txt"
        a = np.array(pd.read_csv(file_name, sep='\t'))
        vec_data = a[:, 0:2]
        labels = a[:, 2]
    elif dataset_name == 'path_based':
        file_name = file_path + "/2d_data/pathbased.txt"
        a = np.array(pd.read_csv(file_name, sep='\t'))
        vec_data = a[:, 0:2]
        labels = a[:, 2]
    elif dataset_name == 'r15':
        file_name = file_path + "/2d_data/R15.txt"
        a = np.array(pd.read_csv(file_name, sep='\t'))
        vec_data = a[:, 0:2]
        labels = a[:, 2]
    elif dataset_name == 'spiral':
        file_name = file_path + "/2d_data/spiral.txt"
        a = np.array(pd.read_csv(file_name, sep='\t'))
        vec_data = a[:, 0:2]
        labels = a[:, 2]
    elif dataset_name == 'birch1':
        file_name = file_path + "/2d_data/birch1.txt"
        a = np.array(pd.read_csv(file_name, delimiter=r"\s+"))
        vec_data = a[:, :]
        labels = np.ones((vec_data.shape[0]))
    elif dataset_name == 'birch2':
        file_name = file_path + "/2d_data/birch2.txt"
        a = np.array(pd.read_csv(file_name, delimiter=r"\s+"))
        vec_data = a[:, :]
        labels = np.ones((vec_data.shape[0]))
    elif dataset_name == 'birch3':
        file_name = file_path + "/2d_data/birch3.txt"
        a = np.array(pd.read_csv(file_name, delimiter=r"\s+"))
        vec_data = a[:, :]
        labels = np.ones((vec_data.shape[0]))
    elif dataset_name == 'worms':
        file_name = file_path + "/2d_data/worms/worms_2d.txt"
        a = np.array(pd.read_csv(file_name, sep=' '))
        vec_data = a[:, :]
        labels = np.ones((vec_data.shape[0]))
    elif dataset_name == 't48k':
        file_name = file_path + "/2d_data/t4.8k.txt"
        a = np.array(pd.read_csv(file_name, sep=' '))
        vec_data = a[1:, :]
        labels = np.ones((vec_data.shape[0]))
    elif dataset_name == 'moons':
        data, labels = make_moons(n_samples=5000)
        vec_data = np.float32(data)
        labels = np.float32(labels)
    elif dataset_name == 'circles':
        data, labels = make_circles(n_samples=5000)
        vec_data = np.float32(data)
        labels = np.float32(labels)
    elif dataset_name == 'blobs':
        data, labels = make_blobs(n_samples=n_samples, centers=3)
        vec_data = np.float32(data)
        labels = np.float32(labels)
    elif dataset_name == 'gmm':
        mean_1 = np.zeros(input_dim)
        mean_2 = 100 * np.ones(input_dim)
        cov = np.eye(input_dim)
        data_1 = np.random.multivariate_normal(mean_1, cov, int(n_samples / 2))
        labels_1 = np.ones(int(n_samples / 2))
        labels_2 = 2 * np.ones(int(n_samples / 2))
        data_2 = np.random.multivariate_normal(mean_2, cov, int(n_samples / 2))
        vec_data = np.concatenate([data_1, data_2], axis=0)
        labels = np.concatenate([labels_1, labels_2], axis=0)
    elif dataset_name == 'uniform':
        vec_data = np.random.uniform(0, 1, size=(n_samples, input_dim)) * 10
        labels = np.ones(n_samples)
    elif dataset_name == 'mnist_pc':
        d_mnist = MNIST(root="./datasets", download=True)
        mnist = d_mnist.data.numpy()
        data = np.float32(np.reshape(mnist, (mnist.shape[0], -1)))
        pca_data = PCA(n_components=input_dim).fit_transform(data)
        n_indices = np.random.randint(pca_data.shape[0], size=n_samples)
        vec_data = pca_data[n_indices]
        labels = d_mnist.targets.numpy()[n_indices]
    elif dataset_name == 'usps_pc':
        d_mnist = USPS(root="./datasets", download=True)
        mnist = d_mnist.data
        data = np.float32(np.reshape(mnist, (mnist.shape[0], -1)))
        pca_data = PCA(n_components=input_dim).fit_transform(data)
        n_indices = np.random.randint(pca_data.shape[0], size=n_samples)
        vec_data = pca_data[n_indices]
        labels = np.float32(d_mnist.targets)[n_indices]
    elif dataset_name == 'char_pc':
        digits = datasets.load_digits()
        n_samples = len(digits.images)
        data = digits.images.reshape((n_samples, -1))
        data = np.float32(data)
        targets = digits.target
        pca_data = PCA(n_components=input_dim).fit_transform(data)
        n_indices = np.random.randint(pca_data.shape[0], size=n_samples)
        vec_data = pca_data[n_indices]
        labels = targets[n_indices]
    else:
        d_mnist = MNIST(root="./datasets", download=True)
        data = d_mnist.data.numpy()
        vec_data = np.reshape(data, (data.shape[0], -1))
        vec_data = np.float32(vec_data)
        labels = d_mnist.targets.numpy()

    if 0 < n_samples < vec_data.shape[0]:
        rand_indices = np.random.choice(vec_data.shape[0], size=(n_samples,), replace=False)
        return vec_data[rand_indices], labels[rand_indices]
    else:
        return vec_data, labels
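A minimal usage sketch for the loader above. The function name get_dataset and its argument order are hypothetical, since the actual signature appears before this excerpt; only dataset_name, n_samples, input_dim and file_path are known to be its inputs.

# Hypothetical call: the name and argument order of get_dataset are assumptions.
vec_data, labels = get_dataset('moons', n_samples=2000, input_dim=2, file_path='./datasets')
print(vec_data.shape, labels.shape)   # (2000, 2) (2000,)
print(np.unique(labels))              # the two moon classes: [0. 1.]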
Exemplo n.º 23
0
def draw_accuracy_plot(train_accuracy, test_accuracy, epochs):
    plt.plot(epochs, train_accuracy, label="train")
    plt.plot(epochs, test_accuracy, label="test")
    plt.xlabel('epochs')
    plt.ylabel('accuracy')
    plt.title('training / test accuracy')
    plt.legend()
    plt.show()
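A brief usage sketch for draw_accuracy_plot; the accuracy histories below are made-up placeholder values.

# Hypothetical call with placeholder accuracy histories
epochs = list(range(1, 6))
train_acc = [0.62, 0.74, 0.81, 0.86, 0.89]   # dummy values
test_acc = [0.60, 0.71, 0.77, 0.80, 0.82]    # dummy values
draw_accuracy_plot(train_acc, test_acc, epochs)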


if args.dataset == 'MNIST':
    train_data = MNIST('../data/MNIST', train=True, download=True)
    test_data = MNIST('../data/MNIST', train=False, download=True)
elif args.dataset == 'EMNIST':
    train_data = EMNIST('../data/EMNIST',
                        split='balanced',
                        train=True,
                        download=True)  # 47 balanced classes
    test_data = EMNIST('../data/EMNIST',
                       split='balanced',
                       train=False,
                       download=True)

# split the label set into train / test class groups according to hyperparam 'dataset_train_split'
total_classes = list(dict.fromkeys(train_data.targets.numpy()))
train_split = int(len(total_classes) * args.dataset_train_split)
labels_to_combine_train_dataset = random.sample(total_classes, k=train_split)
labels_to_combine_test_dataset = list(
    set(total_classes) - set(labels_to_combine_train_dataset))

random.seed(22)
train_portion = int(len(total_classes) * (1 - args.test_data_portion))
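The snippet stops before the chosen class groups are applied. A speculative sketch of one way they might be used, assuming torch.utils.data.Subset and plain list comprehensions; none of this is shown in the original.

# Sketch (assumption): keep only the samples whose label falls in each class group.
from torch.utils.data import Subset

train_keep = {int(c) for c in labels_to_combine_train_dataset}
test_keep = {int(c) for c in labels_to_combine_test_dataset}
train_subset = Subset(train_data, [i for i, y in enumerate(train_data.targets.tolist()) if y in train_keep])
test_subset = Subset(test_data, [i for i, y in enumerate(test_data.targets.tolist()) if y in test_keep])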
Exemplo n.º 24
0
                    filename = "./src/images/" + str(i) + "_" + str(j-examples +1 ) + ".npy"
                    img = np.load(filename)

                image_i = 1 + i + columns * j
                ax = fig.add_subplot(rows + examples, columns, image_i)
                ax.axis("off")
                ax.set_xticklabels([])
                ax.set_yticklabels([])
                ax.set_aspect('equal')
                plt.imshow(img)
        plt.subplots_adjust(wspace=0, hspace=0.1)
        plt.show()


if __name__ == '__main__':

    dataset = EMNIST('./data', train=True, download=True, split="byclass",
                transform=transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Normalize((0.1307,), (0.3081,))
                ]))

    #viz = NLayerParameterVisualizator(dataset, list(range(2,10)))
    viz = NLayerParameterVisualizator(dataset, list(range(2, 8)))

    viz.sample_images()
    #viz.train()
    #viz.combine_images()


Exemplo n.º 25
0
# Create an instance of the model used for training
model = MLP()

# -----------------------------------------------------------------------------
# Prepare the training data
#
print('---------- Preparing the training data ----------')
data_folder = '~/data'
transform = transforms.Compose([
    # Convert the data to a Tensor
    transforms.ToTensor()
])

# Training data
train_data_with_labels = EMNIST(
    data_folder, train=True, download=True, transform=transform, split='mnist')

train_data_loader = DataLoader(
    train_data_with_labels, batch_size=BATCH_SIZE, shuffle=True)

# Test data
test_data_with_labels = EMNIST(
    data_folder, train=False, download=True, transform=transform, split='mnist')
test_data_loader = DataLoader(
    test_data_with_labels, batch_size=BATCH_SIZE, shuffle=True)

# -----------------------------------------------------------------------------
# Set up the training
# Use cross-entropy as the loss function
lossResult = nn.CrossEntropyLoss()
# SGD
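The excerpt stops at the optimizer comment. A minimal continuation sketch, assuming a plain SGD optimizer and a standard training loop; the learning rate, epoch count and the flattening of the images are assumptions, not taken from the original.

# Sketch (assumption): SGD optimizer and a basic training loop.
import torch.optim as optim  # may already be imported above the excerpt

optimizer = optim.SGD(model.parameters(), lr=0.01)  # placeholder learning rate

for epoch in range(10):  # placeholder epoch count
    total_loss = 0.0
    for images, labels in train_data_loader:
        optimizer.zero_grad()
        # assumes the MLP expects flat 784-dim input vectors
        outputs = model(images.view(images.size(0), -1))
        loss = lossResult(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'epoch {epoch + 1}: mean loss {total_loss / len(train_data_loader):.4f}')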
Exemplo n.º 26
0
def download_emnist() -> None:
    """Download the EMNIST dataset via the PyTorch class."""
    logger.info(f"Data directory is: {DATA_DIRNAME}")
    dataset = EMNIST(root=DATA_DIRNAME, split="byclass", download=True)
    save_emnist_essentials(dataset)
# Deep Dictionary Configurations
input_dim = 784  # the input dimensions to be expected
dd_layer_config = [784//2]  # the layer configuration for the deep dictionary
sparse_cff = 1e-1  # regularization to ensure sparseness in the dictionary representation
epoch_per_level = 15  # the number of epochs to train for each layer of deep dictionary

# MLP Configurations
batch_size_train = 500    # the batch size of the MLP model (optimized via Adam)
batch_size_valid = 500
epoch_mlp = 25              # the number of epochs to train the MLP for
num_classes = 47  # the number of classes for classification (10 for MNIST)
mlp_lr = 5e-3  # the learning rate for the Adam optimizer to optimize the MLP model


# prepare data loaders
mnist_train_data = EMNIST('./data/', split='balanced', train=True, download=True, transform=transforms.Compose([transforms.ToTensor()]))
train_data, valid_data = torch.utils.data.random_split(mnist_train_data, [90240, 22560], generator=torch.Generator().manual_seed(0))

train_loader_dd = torch.utils.data.DataLoader(train_data, batch_size=len(train_data), shuffle=False, pin_memory=True)
train_loader_mlp = torch.utils.data.DataLoader(train_data, batch_size=batch_size_train, shuffle=True, pin_memory=True)

valid_loader_mlp = torch.utils.data.DataLoader(valid_data, batch_size=batch_size_valid, shuffle=True, pin_memory=True)

test_data = EMNIST('./data/', split='balanced', train=False, download=True, transform=transforms.Compose([transforms.ToTensor()]))
test_loader_mlp = torch.utils.data.DataLoader(test_data, batch_size=len(test_data), shuffle=True, pin_memory=True)

# Function Class
class Identity:
    @staticmethod
    def forward(x):
        return x
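For contrast with the Identity function class above, a hedged sketch of a non-linear counterpart with the same static interface; this class is an illustration and is not part of the original configuration.

# Sketch (assumption): a ReLU activation exposing the same interface as Identity.
class Relu:
    @staticmethod
    def forward(x):
        return torch.clamp(x, min=0)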
Exemplo n.º 28
0
def load_emnist(split, root=None, transform=None, target_transform=None, download=True):
    root = root or Path("~/.learner/dataset").expanduser()
    train_ds = EMNIST(root=root, split=split, train=True, download=download, transform=transform, target_transform=target_transform)
    test_ds = EMNIST(root=root, split=split, train=False, download=download, transform=transform, target_transform=target_transform)
    data = Data(train_ds, test_ds=test_ds, auto_split=True)
    return data
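A short usage sketch for load_emnist; the ToTensor transform is an assumption, while the Data wrapper comes from the snippet itself.

# Hypothetical call: download the 'balanced' split and wrap train/test sets in Data.
from torchvision import transforms

emnist_data = load_emnist("balanced", transform=transforms.ToTensor())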
Exemplo n.º 29
0
from torchvision.datasets import EMNIST
from torch.utils.data import TensorDataset
from data.data_helpers import split_dataset, stratified_split_dataset, concat_datasets
import properties as prop
import pwd, os

DATA_PATH = pwd.getpwuid(os.getuid()).pw_dir + '/time_series_data/eMNIST'


def transform_data(data):
    data = data.unsqueeze(1).float().div(255)
    return data
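# (illustrative note, not in the original module) transform_data turns a uint8
# image batch into normalized float inputs, e.g.
#   transform_data(torch.zeros(4, 28, 28, dtype=torch.uint8)).shape
#   -> torch.Size([4, 1, 28, 28]), with pixel values scaled into [0, 1]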


train_dataset = EMNIST(DATA_PATH, split='letters', train=True, download=True) # alternatives: letters, balanced
trainX, trainy = transform_data(train_dataset.data), (train_dataset.targets-1)


train_dataset = TensorDataset(trainX, trainy)


################ test dataset ################################
test_dataset = EMNIST(DATA_PATH, split='letters', train=False, download=True) # alternatives: letters, balanced
testX, testy = transform_data(test_dataset.data), (test_dataset.targets-1)

test_dataset = TensorDataset(testX, testy)
full_dataset = concat_datasets(train_dataset, test_dataset)


def get_data_splits():
Exemplo n.º 30
0
def main():
    ## reproducibility
    np.random.seed(0)
    torch.manual_seed(0)
    torch.cuda.manual_seed_all(0)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    args = _parse_args()
    dist.init_process_group(backend=args.distributed_backend,
                            init_method=args.distributed_init_method,
                            world_size=args.distributed_world_size,
                            rank=args.distributed_rank)

    ## data
    if args.data_name == "CIFAR10":
        transform = Compose([
            ToTensor(),
            Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))
        ])
        train_dataset = CIFAR10(args.data_root, transform=transform)
        test_dataset = CIFAR10(args.data_root, transform=transform, train=False)
        num_classes = 10
        num_features = 32*32*3
    elif args.data_name == "EMNIST":
        # transform = Compose([ToTensor(), Normalize([0.1732], [0.3317])])
        transform = ToTensor()
        train_dataset = EMNIST(args.data_root, transform=transform,
                               split="digits")
        test_dataset = EMNIST(args.data_root, transform=transform,
                              split="digits", train=False)
        num_classes = 10
        num_features = 28*28
    else:
        transform = Compose([ToTensor(), Normalize([0.1732], [0.3317])])
        train_dataset = MNIST(args.data_root, transform=transform)
        test_dataset = MNIST(args.data_root, transform=transform, train=False)
        num_classes = 10
        num_features = 28*28
    train_sampler = DistributedSampler(train_dataset)
    test_sampler = DistributedSampler(test_dataset, shuffle=False)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                              sampler=train_sampler,
                              num_workers=args.dataloader_num_workers)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size,
                             sampler=test_sampler,
                             num_workers=args.dataloader_num_workers)

    ## model
    device = torch.device("cuda" if args.use_cuda else "cpu")
    dtype = torch.float64 if args.use_double_precision else torch.float32
    d = num_features if args.model_name == "NLLS" else num_features*num_classes
    weights = torch.zeros(d, device=device, dtype=dtype)

    ## run
    header = ["iter", "ccr", "loss", "grad", "test", "alpha"]
    print(("{:^16s}"*len(header)).format(*header))
    iterations_list = []
    communication_rounds_list = []
    loss_list = []
    grad_norm_list = []
    test_val_list = []
    step_size_list = []
    communication_rounds = 0
    iteration = 0
    while communication_rounds < args.max_communication_rounds:
        iterations_list.append(iteration)
        communication_rounds_list.append(communication_rounds)
        loss, grad, _ = _obj_fun(args.model_name, train_loader, weights, device,
                                 dtype, comp_hess=False,
                                 use_regularization=args.use_regularization)
        loss_list.append(loss)
        grad_norm_list.append(grad.norm().item())
        test_val = _get_test_val(args.model_name, test_loader, weights, device,
                                 dtype)
        test_val_list.append(test_val.item())
        update_direction = _get_update_direction(args.model_name, train_loader,
                                                 weights, device, dtype, grad,
                                                 args.phi, args.theta,
                                                 args.use_exact_solve,
                                                 args.subproblem_tol,
                                                 args.subproblem_max_iter,
                                                 args.use_regularization)
        step_size = _get_step_size(args.model_name, train_loader, weights,
                                   device, dtype, loss, grad, update_direction,
                                   args.rho, args.use_regularization)
        step_size_list.append(step_size)
        weights.add_(update_direction, alpha=step_size)
        # the first iteration costs 5 communication rounds; each later iteration costs 6
        communication_rounds += (5 if iteration == 0 else 6)
        iteration += 1
        print("{:^16g}{:^16g}{:^16.2e}{:^16.2e}{:^16.2e}{:^16.2e}".format(
              iterations_list[-1], communication_rounds_list[-1], loss_list[-1],
              grad_norm_list[-1], test_val_list[-1], step_size_list[-1]))
    if dist.get_rank() == 0:
        data = zip(iterations_list, communication_rounds_list, loss_list,
                   grad_norm_list, test_val_list, step_size_list)
        np.savetxt('DINO.csv', list(data), delimiter=',', header=",".join(header))
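A hedged post-processing sketch, separate from the original script, for reading DINO.csv back and plotting loss against communication rounds; the column order follows the header written above, and matplotlib is an assumed extra dependency.

# Sketch (assumption): quick plot of the saved run. np.genfromtxt skips the
# '#'-prefixed header line that np.savetxt writes by default.
import numpy as np
import matplotlib.pyplot as plt

log = np.genfromtxt('DINO.csv', delimiter=',')
plt.semilogy(log[:, 1], log[:, 2])   # communication rounds vs. loss
plt.xlabel('communication rounds')
plt.ylabel('loss')
plt.show()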