Exemplo n.º 1
0
def main():
    """Train an MNIST MLP built from ``BayesianLayers.LinearGroupNJ`` and prune it.

    Steps, in order:

    1. build the MNIST train/test loaders;
    2. optionally read initial weights for the three linear layers from the
       ``mnist_nn.pt`` checkpoint (when ``FLAGS.load_pretrained`` is set);
    3. train for ``FLAGS.epochs`` epochs, writing visualisations every epoch
       and a checkpoint every third epoch;
    4. assemble the GIFs, report the compression rate, load the reduced
       precision weights into the layers and run the test set once more.

    ``FLAGS`` and ``N`` (the divisor applied to the KL term) are not defined
    in this function; they are expected to exist at module level.
    """
    # import data
    kwargs = {'num_workers': 2} if FLAGS.cuda else {}

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,)),
                       ])),
        batch_size=FLAGS.batchsize, shuffle=True, **kwargs)

    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,)),
        ])),
        batch_size=FLAGS.batchsize, shuffle=False, **kwargs)

    # for later analysis we take some sample digits
    mask = 255. * (np.ones((1, 28, 28)))
    print(FLAGS.cuda)
    examples = train_loader.sampler.data_source.data[5:10].numpy()
    images = np.vstack([mask, examples])
    print("We will start training")

    if not FLAGS.load_pretrained:
        print('Starting from scratch')
        fc1_w_init = None
        fc1_b_init = None
        fc2_w_init = None
        fc2_b_init = None
        fc3_w_init = None
        fc3_b_init = None
    else:
        print('Starting from a pretrained point')
        # map to CPU so that .numpy() also works for checkpoints saved from GPU
        ckpt_pret = torch.load('mnist_nn.pt', map_location='cpu')
        fc1_w_init = ckpt_pret['fc1.weight'].numpy()
        fc1_b_init = ckpt_pret['fc1.bias'].numpy()
        fc2_w_init = ckpt_pret['fc2.weight'].numpy()
        fc2_b_init = ckpt_pret['fc2.bias'].numpy()
        fc3_w_init = ckpt_pret['fc3.weight'].numpy()
        fc3_b_init = ckpt_pret['fc3.bias'].numpy()

    # build a simple MLP
    class Net(nn.Module):
        """784-300-100-10 MLP whose layers each contribute a KL term."""

        def __init__(self):
            super(Net, self).__init__()
            # activation
            self.relu = nn.ReLU()
            # layers
            self.fc1 = BayesianLayers.LinearGroupNJ(
                28 * 28, 300, clip_var=0.04,
                init_weight=fc1_w_init, init_bias=fc1_b_init, cuda=FLAGS.cuda)
            self.fc2 = BayesianLayers.LinearGroupNJ(
                300, 100,
                init_weight=fc2_w_init, init_bias=fc2_b_init, cuda=FLAGS.cuda)
            self.fc3 = BayesianLayers.LinearGroupNJ(
                100, 10,
                init_weight=fc3_w_init, init_bias=fc3_b_init, cuda=FLAGS.cuda)
            # layers including kl_divergence
            self.kl_list = [self.fc1, self.fc2, self.fc3]

        def forward(self, x):
            x = x.view(-1, 28 * 28)
            x = self.relu(self.fc1(x))
            x = self.relu(self.fc2(x))
            return self.fc3(x)

        def get_masks(self, thresholds):
            """Return one float weight mask per layer in ``kl_list``.

            A layer's mask is the outer product of its own dropout mask
            (``log_alpha < threshold``) and the dropout mask of the layer
            that follows it; the last layer is paired with ``np.ones(10)``.
            """
            weight_masks = []
            mask = None
            for i, (layer, threshold) in enumerate(zip(self.kl_list, thresholds)):
                # compute dropout mask
                if mask is None:
                    log_alpha = layer.get_log_dropout_rates().cpu().data.numpy()
                    mask = log_alpha < threshold
                else:
                    mask = np.copy(next_mask)
                # look ahead via self.kl_list (not a variable of the enclosing
                # function) and test the bounds explicitly instead of
                # swallowing every exception
                if i + 1 < len(self.kl_list) and i + 1 < len(thresholds):
                    log_alpha = self.kl_list[i + 1].get_log_dropout_rates().cpu().data.numpy()
                    next_mask = log_alpha < thresholds[i + 1]
                else:
                    # must be the last mask
                    next_mask = np.ones(10)

                weight_mask = np.expand_dims(mask, axis=0) * np.expand_dims(next_mask, axis=1)
                # np.float was removed in NumPy 1.24; the builtin is equivalent
                weight_masks.append(weight_mask.astype(float))
            return weight_masks

        def kl_divergence(self):
            """Sum the KL terms of every layer in ``kl_list``."""
            KLD = 0
            for layer in self.kl_list:
                KLD += layer.kl_divergence()
            return KLD

    # init model (only moved to the GPU when requested)
    model = Net()
    if FLAGS.cuda:
        model.cuda()
    print('Loaded model')

    # init optimizer
    optimizer = optim.Adam(model.parameters())
    print('Loaded optimizer')

    # we optimize the variational lower bound scaled by the number of data
    # points (so we can keep our intuitions about hyper-params such as the learning rate)
    discrimination_loss = nn.functional.cross_entropy

    def objective(output, target, kl_divergence):
        discrimination_error = discrimination_loss(output, target)
        variational_bound = discrimination_error + kl_divergence / N
        if FLAGS.cuda:
            variational_bound = variational_bound.cuda()
        return variational_bound

    def train(epoch):
        model.train()
        print('Entering training block')
        iter_num = 0
        for data, target in train_loader:
            print(iter_num)
            if FLAGS.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = objective(output, target, model.kl_divergence())
            loss.backward()
            optimizer.step()
            iter_num += 1
            # clip the variances after each step
            for layer in model.kl_list:
                layer.clip_variances()
        print('Epoch: {} \tTrain loss: {:.6f} \t'.format(
            epoch, loss.item()))

    def test():
        model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in test_loader:
                if FLAGS.cuda:
                    data, target = data.cuda(), target.cuda()
                output = model(data)
                # reduction='sum' replaces the deprecated size_average=False
                test_loss += discrimination_loss(output, target, reduction='sum').item()
                pred = output.max(1, keepdim=True)[1]
                # .item() keeps `correct` a Python int, so the accuracy below
                # is a true float division and prints as a plain number
                correct += pred.eq(target.view_as(pred)).sum().item()
        test_loss /= len(test_loader.dataset)
        print('Test loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))

    # train the model and save some visualisations on the way
    ckpt_suffix = "bcdl_pretrained.pt" if FLAGS.load_pretrained else "bcdl_no_pretrained.pt"
    for epoch in range(1, FLAGS.epochs + 1):
        print('Now we will train the epoch:' + str(epoch))
        train(epoch)
        test()
        # visualizations
        weight_mus = [model.fc1.weight_mu, model.fc2.weight_mu]
        log_alphas = [model.fc1.get_log_dropout_rates(), model.fc2.get_log_dropout_rates(),
                      model.fc3.get_log_dropout_rates()]
        visualise_weights(weight_mus, log_alphas, epoch=epoch, FLAGS=FLAGS)
        log_alpha = model.fc1.get_log_dropout_rates().cpu().data.numpy()
        visualize_pixel_importance(images, log_alpha=log_alpha, FLAGS=FLAGS, epoch=str(epoch))
        if epoch % 3 == 0:
            torch.save(model.state_dict(), "epoch" + str(epoch) + ckpt_suffix)

    gif_prefix = 'pretrained_' if FLAGS.load_pretrained else ''
    for gif_name in ('pixel', 'weight0_e', 'weight1_e'):
        generate_gif(save=gif_prefix + gif_name, epochs=FLAGS.epochs)

    # compute compression rate and new model accuracy
    layers = [model.fc1, model.fc2, model.fc3]
    thresholds = FLAGS.thresholds
    compute_compression_rate(layers, model.get_masks(thresholds))

    print("Test error after with reduced bit precision:")

    weights = compute_reduced_weights(layers, model.get_masks(thresholds))
    for layer, weight in zip(layers, weights):
        if FLAGS.cuda:
            layer.post_weight_mu.data = torch.Tensor(weight).cuda()
        else:
            layer.post_weight_mu.data = torch.Tensor(weight)
    for layer in layers:
        layer.deterministic = True
    test()
Exemplo n.º 2
0
def main():
    """Train a small MNIST conv net built from ``BayesianLayers`` and prune it.

    Builds the MNIST loaders (inputs rescaled with ``2 * (x - 0.5)``), trains
    two ``Conv2dGroupNJ`` layers followed by one ``LinearGroupNJ`` layer with
    SGD and element-wise gradient clipping for ``FLAGS.epochs`` epochs, then
    loads the reduced-precision weights into the layers and runs the test
    set once more.

    ``FLAGS`` and ``N`` (the divisor applied to the KL term) are not defined
    in this function; they are expected to exist at module level.
    """
    # import data
    kwargs = {'num_workers': 1, 'pin_memory': True} if FLAGS.cuda else {}

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(), lambda x: 2 * (x - 0.5),
                       ])),
        batch_size=FLAGS.batchsize, shuffle=True, **kwargs)

    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=False, transform=transforms.Compose([
            transforms.ToTensor(), lambda x: 2 * (x - 0.5),
        ])),
        batch_size=FLAGS.batchsize, shuffle=True, **kwargs)

    # build a small conv net
    class Net(nn.Module):
        """Two stride-2 conv layers and a linear classifier, each with a KL term."""

        def __init__(self):
            super(Net, self).__init__()
            # activation
            self.relu = nn.ReLU()
            # layers
            self.conv1 = BayesianLayers.Conv2dGroupNJ(1, 64, 3, stride=2, clip_var=0.04, padding=1, cuda=FLAGS.cuda)
            self.conv2 = BayesianLayers.Conv2dGroupNJ(64, 64, 3, stride=2, clip_var=0.04, padding=1, cuda=FLAGS.cuda)
            self.fc3 = BayesianLayers.LinearGroupNJ(3136, 10, cuda=FLAGS.cuda)
            # layers including kl_divergence
            self.kl_list = [self.conv1, self.conv2, self.fc3]

        def forward(self, x):
            x = self.relu(self.conv1(x))
            x = self.relu(self.conv2(x))
            # flatten everything but the batch dimension; errors now propagate
            # instead of dropping into a debugger and returning None
            n, c, w, h = x.size()
            x = x.view(n, c * w * h)
            return self.fc3(x)

        def get_masks(self, thresholds):
            """Return one float mask per layer, shaped like that layer's ``weight_mu``.

            The first layer's dropout mask is ``log_alpha < threshold``; each
            later layer reuses the mask computed from its own dropout rates
            during the previous iteration. The mask is broadcast over the
            weight as ``(1, -1)`` for 2-D weights and ``(-1, 1, 1, 1)`` for
            4-D weights.

            Raises:
                ValueError: if a layer's ``weight_mu`` is neither 2-D nor 4-D.
            """
            weight_masks = []
            mask = None
            for i, (layer, threshold) in enumerate(zip(self.kl_list, thresholds)):
                # compute dropout mask
                if mask is None:
                    log_alpha = layer.get_log_dropout_rates().cpu().data.numpy()
                    mask = log_alpha < threshold
                else:
                    mask = np.copy(next_mask)
                # look ahead via self.kl_list (not a variable of the enclosing
                # function) and test the bounds explicitly instead of
                # swallowing every exception
                if i + 1 < len(self.kl_list) and i + 1 < len(thresholds):
                    log_alpha = self.kl_list[i + 1].get_log_dropout_rates().cpu().data.numpy()
                    next_mask = log_alpha < thresholds[i + 1]
                else:
                    # must be the last mask
                    next_mask = np.ones(10)

                # mask should be shape of weight in associated layer
                weight_shape = tuple(layer.weight_mu.size())
                if len(weight_shape) == 2:
                    mask_shape = (1, -1)
                elif len(weight_shape) == 4:
                    mask_shape = (-1, 1, 1, 1)
                else:
                    raise ValueError(
                        'unsupported weight rank {} for mask construction'.format(len(weight_shape)))
                weight_mask = np.ones(weight_shape) * mask.reshape(mask_shape)
                # np.float was removed in NumPy 1.24; the builtin is equivalent
                weight_masks.append(weight_mask.astype(float))
            return weight_masks

        def kl_divergence(self):
            """Sum the KL terms of every layer in ``kl_list``."""
            KLD = 0
            for layer in self.kl_list:
                KLD += layer.kl_divergence()
            return KLD

    # init model
    model = Net()
    if FLAGS.cuda:
        model.cuda()

    # init optimizer
    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

    def clip_grads(model, clip=0.2):
        """Clamp every available gradient element-wise to ``[-clip, clip]``."""
        for p in model.parameters():
            # parameters that took no part in the backward pass have no grad
            if p.grad is not None:
                p.grad.data.clamp_(-clip, clip)

    # we optimize the variational lower bound scaled by the number of data
    # points (so we can keep our intuitions about hyper-params such as the learning rate)
    discrimination_loss = nn.functional.cross_entropy

    def objective(output, target, kl_divergence):
        discrimination_error = discrimination_loss(output, target)
        variational_bound = discrimination_error + kl_divergence / N
        if FLAGS.cuda:
            variational_bound = variational_bound.cuda()
        return variational_bound

    def train(epoch):
        model.train()
        for data, target in train_loader:
            if FLAGS.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = objective(output, target, model.kl_divergence())
            loss.backward()
            clip_grads(model)
            optimizer.step()
            # clip the variances after each step
            for layer in model.kl_list:
                layer.clip_variances()
        # loss is a 0-dim tensor: .item() replaces the old loss.data[0]
        print('Epoch: {} \tTrain loss: {:.6f} \t'.format(
            epoch, loss.item()))

    def test():
        model.eval()
        test_loss = 0
        correct = 0
        # torch.no_grad() replaces Variable(..., volatile=True)
        with torch.no_grad():
            for data, target in test_loader:
                if FLAGS.cuda:
                    data, target = data.cuda(), target.cuda()
                output = model(data)
                # reduction='sum' replaces the deprecated size_average=False
                test_loss += discrimination_loss(output, target, reduction='sum').item()
                pred = output.max(1, keepdim=True)[1]
                # .item() keeps `correct` a Python int, so the accuracy below
                # is a true float division and prints as a plain number
                correct += pred.eq(target.view_as(pred)).sum().item()
        test_loss /= len(test_loader.dataset)
        print('Test loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))

    # train the model
    for epoch in range(1, FLAGS.epochs + 1):
        train(epoch)
        test()

    # evaluate the model again with the reduced-precision weights
    layers = [model.conv1, model.conv2, model.fc3]
    thresholds = FLAGS.thresholds

    print("Test error after with reduced bit precision:")

    weights = compute_reduced_weights(layers, model.get_masks(thresholds))
    for layer, weight in zip(layers, weights):
        if FLAGS.cuda:
            layer.post_weight_mu.data = torch.Tensor(weight).cuda()
        else:
            layer.post_weight_mu.data = torch.Tensor(weight)
    for layer in layers:
        layer.deterministic = True
    test()
Exemplo n.º 3
0
def main(FLAGS):
    """Train a ``BayesianModule`` network on the selected dataset and prune it.

    Args:
        FLAGS: configuration object; this function reads ``cuda``,
            ``dataset`` ("cifar10", "cifar100" or "mnist"), ``nettype``
            ("lenet" or "mlp"), ``batchsize``, ``epochs`` and ``thresholds``.

    Raises:
        ValueError: if ``FLAGS.dataset`` or ``FLAGS.nettype`` is not one of
            the supported values.
    """
    # import data
    kwargs = {'num_workers': 1, 'pin_memory': True} if FLAGS.cuda else {}

    dataset_choices = {
        "cifar10": (datasets.CIFAR10, 10),
        "cifar100": (datasets.CIFAR100, 100),
        "mnist": (datasets.MNIST, 10),
    }
    if FLAGS.dataset not in dataset_choices:
        raise ValueError("unsupported dataset: {!r}".format(FLAGS.dataset))
    proj_dst, num_classes = dataset_choices[FLAGS.dataset]

    # build the loaders from the selected dataset class, so that the CIFAR
    # models are not fed MNIST images
    train_loader = torch.utils.data.DataLoader(
        proj_dst('../data', train=True, download=True,
                 transform=transforms.Compose([
                     transforms.ToTensor(),
                     lambda x: 2 * (x - 0.5),
                 ])),
        batch_size=FLAGS.batchsize, shuffle=True, **kwargs)

    test_loader = torch.utils.data.DataLoader(
        proj_dst('../data', train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            lambda x: 2 * (x - 0.5),
        ])),
        batch_size=FLAGS.batchsize, shuffle=True, **kwargs)

    if FLAGS.nettype not in ("lenet", "mlp"):
        raise ValueError("unsupported nettype: {!r}".format(FLAGS.nettype))
    if FLAGS.dataset.startswith("cifar"):
        if FLAGS.nettype == "lenet":
            model = BayesianModule.LeNet_Cifar(num_classes)
        else:
            model = BayesianModule.MLP_Cifar(num_classes)
    else:
        if FLAGS.nettype == "lenet":
            model = BayesianModule.LeNet_MNIST(num_classes)
        else:
            model = BayesianModule.MLP_MNIST(num_classes)

    print(FLAGS.dataset, FLAGS.nettype)
    if FLAGS.cuda:
        model.cuda()

    # init optimizer
    optimizer = optim.Adam(model.parameters())

    # we optimize the variational lower bound scaled by the number of data
    # points (so we can keep our intuitions about hyper-params such as the learning rate)
    class objection(object):
        """Callable loss: cross-entropy plus the KL term divided by ``N``."""

        def __init__(self, N, use_cuda=True):
            self.d_loss = nn.functional.cross_entropy
            self.N = N
            self.use_cuda = use_cuda

        def __call__(self, output, target, kl_divergence):
            d_error = self.d_loss(output, target)
            # cross_entropy averages over the batch by default, so the KL
            # term is divided by the dataset size to put both terms on a
            # per-example scale
            variational_bound = d_error + kl_divergence / self.N
            if self.use_cuda:
                variational_bound = variational_bound.cuda()
            return variational_bound

    # only move the loss to the GPU when CUDA was actually requested
    objective = objection(len(train_loader.dataset), use_cuda=FLAGS.cuda)

    from trainer import Trainer
    trainer = Trainer(model, train_loader, test_loader, optimizer, objective)
    # train the model
    for epoch in range(1, FLAGS.epochs + 1):
        trainer.train(epoch)
        trainer.test()

    # compute compression rate and new model accuracy
    layers = model.layers
    thresholds = FLAGS.thresholds
    compute_compression_rate(layers, model.get_masks(thresholds))

    print("Test error after with reduced bit precision:")

    weights = compute_reduced_weights(layers, model.get_masks(thresholds))
    for layer, weight in zip(layers, weights):
        if FLAGS.cuda:
            layer.post_weight_mu.data = torch.Tensor(weight).cuda()
        else:
            layer.post_weight_mu.data = torch.Tensor(weight)

    for layer in layers:
        layer.deterministic = True
    trainer.test()
def main():
    """Train a CIFAR-10 LeNet built from ``BayesianLayers`` and prune it.

    Builds the CIFAR-10 loaders, trains two ``Conv2dGroupNJ`` and three
    ``LinearGroupNJ`` layers with Adam for ``FLAGS.epochs`` epochs, then
    reports the compression rate, loads the reduced-precision weights into
    the layers and runs the test set once more.

    ``FLAGS``, ``N`` (the divisor applied to the KL term), ``transform_train``
    and ``transform_test`` are not defined in this function; they are
    expected to exist at module level.
    """
    trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
    train_loader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2)

    testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
    test_loader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2)

    # Le-Net
    class Net(nn.Module):
        """LeNet-style network whose five layers each contribute a KL term."""

        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = BayesianLayers.Conv2dGroupNJ(3, 6, 5, cuda=FLAGS.cuda)
            self.conv2 = BayesianLayers.Conv2dGroupNJ(6, 16, 5, cuda=FLAGS.cuda)
            self.fc1 = BayesianLayers.LinearGroupNJ(16 * 5 * 5, 120, clip_var=0.03, cuda=FLAGS.cuda)
            self.fc2 = BayesianLayers.LinearGroupNJ(120, 84, cuda=FLAGS.cuda)
            self.fc3 = BayesianLayers.LinearGroupNJ(84, 10, cuda=FLAGS.cuda)

            self.kl_list = [self.conv1, self.conv2, self.fc1, self.fc2, self.fc3]

        def forward(self, x):
            out = F.relu(self.conv1(x))
            out = F.max_pool2d(out, 2)
            out = F.relu(self.conv2(out))
            out = F.max_pool2d(out, 2)
            out = out.view(out.size(0), -1)
            out = F.relu(self.fc1(out))
            out = F.relu(self.fc2(out))
            out = self.fc3(out)
            return out

        def get_masks(self, thresholds):
            """Return one float weight mask per layer in ``kl_list``.

            A layer's mask is the outer product of its own dropout mask
            (``log_alpha < threshold``) and the dropout mask of the layer
            that follows it; the last layer is paired with ``np.ones(10)``.
            """
            # NOTE(review): the outer product below yields 2-D masks, which
            # presumably do not match the 4-D conv weights of conv1/conv2 —
            # confirm against compute_compression_rate/compute_reduced_weights.
            weight_masks = []
            mask = None
            for i, (layer, threshold) in enumerate(zip(self.kl_list, thresholds)):
                # compute dropout mask
                if mask is None:
                    log_alpha = layer.get_log_dropout_rates().cpu().data.numpy()
                    mask = log_alpha < threshold
                else:
                    mask = np.copy(next_mask)
                # look ahead via self.kl_list (not a variable of the enclosing
                # function) and test the bounds explicitly instead of
                # swallowing every exception
                if i + 1 < len(self.kl_list) and i + 1 < len(thresholds):
                    log_alpha = self.kl_list[i + 1].get_log_dropout_rates().cpu().data.numpy()
                    next_mask = log_alpha < thresholds[i + 1]
                else:
                    # must be the last mask
                    next_mask = np.ones(10)

                weight_mask = np.expand_dims(mask, axis=0) * np.expand_dims(next_mask, axis=1)
                # np.float was removed in NumPy 1.24; the builtin is equivalent
                weight_masks.append(weight_mask.astype(float))
            return weight_masks

        def kl_divergence(self):
            """Sum the KL terms of every layer in ``kl_list``."""
            KLD = 0
            for layer in self.kl_list:
                KLD += layer.kl_divergence()
            return KLD

    # init model
    model = Net()
    if FLAGS.cuda:
        model.cuda()

    # init optimizer
    optimizer = optim.Adam(model.parameters())

    # we optimize the variational lower bound scaled by the number of data
    # points (so we can keep our intuitions about hyper-params such as the learning rate)
    discrimination_loss = nn.functional.cross_entropy

    def objective(output, target, kl_divergence):
        discrimination_error = discrimination_loss(output, target)
        variational_bound = discrimination_error + kl_divergence / N
        if FLAGS.cuda:
            variational_bound = variational_bound.cuda()
        return variational_bound

    def train(epoch):
        model.train()
        for data, target in train_loader:
            if FLAGS.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = objective(output, target, model.kl_divergence())
            loss.backward()
            optimizer.step()
            # clip the variances after each step
            for layer in model.kl_list:
                layer.clip_variances()
        # loss is a 0-dim tensor: .item() replaces the old loss.data[0]
        print('Epoch: {} \tTrain loss: {:.6f} \t'.format(
            epoch, loss.item()))

    def test():
        model.eval()
        test_loss = 0
        correct = 0
        # torch.no_grad() replaces Variable(..., volatile=True)
        with torch.no_grad():
            for data, target in test_loader:
                if FLAGS.cuda:
                    data, target = data.cuda(), target.cuda()
                output = model(data)
                # reduction='sum' replaces the deprecated size_average=False
                test_loss += discrimination_loss(output, target, reduction='sum').item()
                pred = output.max(1, keepdim=True)[1]
                # .item() keeps `correct` a Python int, so the accuracy below
                # is a true float division and prints as a plain number
                correct += pred.eq(target.view_as(pred)).sum().item()
        test_loss /= len(test_loader.dataset)
        print('Test loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))

    # train the model
    for epoch in range(1, FLAGS.epochs + 1):
        train(epoch)
        test()

    # compute compression rate and new model accuracy
    layers = [model.conv1, model.conv2, model.fc1, model.fc2, model.fc3]
    thresholds = FLAGS.thresholds
    compute_compression_rate(layers, model.get_masks(thresholds))

    print("Test error after with reduced bit precision:")

    weights = compute_reduced_weights(layers, model.get_masks(thresholds))
    for layer, weight in zip(layers, weights):
        if FLAGS.cuda:
            layer.post_weight_mu.data = torch.Tensor(weight).cuda()
        else:
            layer.post_weight_mu.data = torch.Tensor(weight)
    for layer in layers:
        layer.deterministic = True
    test()