def main():
    # import data
    kwargs = {'num_workers': 2} if FLAGS.cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=FLAGS.batchsize, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=FLAGS.batchsize, shuffle=False, **kwargs)

    # for later analysis we take some sample digits
    mask = 255. * np.ones((1, 28, 28))
    examples = train_loader.dataset.data[5:10].numpy()
    images = np.vstack([mask, examples])

    # optionally start from a pretrained deterministic network
    if not FLAGS.load_pretrained:
        print('Starting from scratch')
        fc1_w_init = fc1_b_init = None
        fc2_w_init = fc2_b_init = None
        fc3_w_init = fc3_b_init = None
    else:
        print('Starting from a pretrained point')
        ckpt_pret = torch.load('mnist_nn.pt')
        fc1_w_init = ckpt_pret['fc1.weight'].numpy()
        fc1_b_init = ckpt_pret['fc1.bias'].numpy()
        fc2_w_init = ckpt_pret['fc2.weight'].numpy()
        fc2_b_init = ckpt_pret['fc2.bias'].numpy()
        fc3_w_init = ckpt_pret['fc3.weight'].numpy()
        fc3_b_init = ckpt_pret['fc3.bias'].numpy()

    # build a simple MLP
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            # activation
            self.relu = nn.ReLU()
            # layers
            self.fc1 = BayesianLayers.LinearGroupNJ(28 * 28, 300, clip_var=0.04,
                                                    init_weight=fc1_w_init,
                                                    init_bias=fc1_b_init,
                                                    cuda=FLAGS.cuda)
            self.fc2 = BayesianLayers.LinearGroupNJ(300, 100,
                                                    init_weight=fc2_w_init,
                                                    init_bias=fc2_b_init,
                                                    cuda=FLAGS.cuda)
            self.fc3 = BayesianLayers.LinearGroupNJ(100, 10,
                                                    init_weight=fc3_w_init,
                                                    init_bias=fc3_b_init,
                                                    cuda=FLAGS.cuda)
            # layers whose KL divergence enters the objective
            self.kl_list = [self.fc1, self.fc2, self.fc3]

        def forward(self, x):
            x = x.view(-1, 28 * 28)
            x = self.relu(self.fc1(x))
            x = self.relu(self.fc2(x))
            return self.fc3(x)

        def get_masks(self, thresholds):
            weight_masks = []
            mask = None
            for i, (layer, threshold) in enumerate(zip(self.kl_list, thresholds)):
                # compute dropout mask: a unit survives if its log dropout
                # rate stays below the layer's threshold
                if mask is None:
                    log_alpha = layer.get_log_dropout_rates().cpu().data.numpy()
                    mask = log_alpha < threshold
                else:
                    mask = np.copy(next_mask)
                if i + 1 < len(self.kl_list):
                    log_alpha = self.kl_list[i + 1].get_log_dropout_rates().cpu().data.numpy()
                    next_mask = log_alpha < thresholds[i + 1]
                else:
                    # last layer: keep all 10 output units
                    next_mask = np.ones(10)
                weight_mask = np.expand_dims(mask, axis=0) * np.expand_dims(next_mask, axis=1)
                weight_masks.append(weight_mask.astype(float))
            return weight_masks

        def kl_divergence(self):
            KLD = 0
            for layer in self.kl_list:
                KLD += layer.kl_divergence()
            return KLD

    # init model
    model = Net()
    if FLAGS.cuda:
        model.cuda()

    # init optimizer
    optimizer = optim.Adam(model.parameters())

    # we optimize the variational lower bound scaled by the number of data
    # points (so we can keep our intuitions about hyper-parameters such as
    # the learning rate)
    discrimination_loss = nn.functional.cross_entropy

    def objective(output, target, kl_divergence):
        discrimination_error = discrimination_loss(output, target)
        variational_bound = discrimination_error + kl_divergence / N
        if FLAGS.cuda:
            variational_bound = variational_bound.cuda()
        return variational_bound

    def train(epoch):
        model.train()
        for data, target in train_loader:
            if FLAGS.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = objective(output, target, model.kl_divergence())
            loss.backward()
            optimizer.step()
            # clip the variances after each step
            for layer in model.kl_list:
                layer.clip_variances()
        print('Epoch: {} \tTrain loss: {:.6f}'.format(epoch, loss.item()))

    def test():
        model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for data, target in test_loader:
                if FLAGS.cuda:
                    data, target = data.cuda(), target.cuda()
                output = model(data)
                # sum up the batch loss (reduction='sum' replaces the
                # deprecated size_average=False)
                test_loss += discrimination_loss(output, target, reduction='sum').item()
                pred = output.data.max(1, keepdim=True)[1]
                correct += pred.eq(target.data.view_as(pred)).cpu().sum().item()
        test_loss /= len(test_loader.dataset)
        print('Test loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))

    # train the model and save some visualisations on the way
    for epoch in range(1, FLAGS.epochs + 1):
        train(epoch)
        test()
        # visualisations
        weight_mus = [model.fc1.weight_mu, model.fc2.weight_mu]
        log_alphas = [model.fc1.get_log_dropout_rates(),
                      model.fc2.get_log_dropout_rates(),
                      model.fc3.get_log_dropout_rates()]
        visualise_weights(weight_mus, log_alphas, epoch=epoch, FLAGS=FLAGS)
        log_alpha = model.fc1.get_log_dropout_rates().cpu().data.numpy()
        visualize_pixel_importance(images, log_alpha=log_alpha, FLAGS=FLAGS, epoch=str(epoch))
        # checkpoint every third epoch
        if epoch % 3 == 0:
            if not FLAGS.load_pretrained:
                torch.save(model.state_dict(), "epoch" + str(epoch) + "bcdl_no_pretrained.pt")
            else:
                torch.save(model.state_dict(), "epoch" + str(epoch) + "bcdl_pretrained.pt")

    if FLAGS.load_pretrained:
        generate_gif(save='pretrained_pixel', epochs=FLAGS.epochs)
        generate_gif(save='pretrained_weight0_e', epochs=FLAGS.epochs)
        generate_gif(save='pretrained_weight1_e', epochs=FLAGS.epochs)
    else:
        generate_gif(save='pixel', epochs=FLAGS.epochs)
        generate_gif(save='weight0_e', epochs=FLAGS.epochs)
        generate_gif(save='weight1_e', epochs=FLAGS.epochs)

    # compute compression rate and new model accuracy
    layers = [model.fc1, model.fc2, model.fc3]
    thresholds = FLAGS.thresholds
    compute_compression_rate(layers, model.get_masks(thresholds))

    print("Test error with reduced bit precision:")
    weights = compute_reduced_weights(layers, model.get_masks(thresholds))
    for layer, weight in zip(layers, weights):
        if FLAGS.cuda:
            layer.post_weight_mu.data = torch.Tensor(weight).cuda()
        else:
            layer.post_weight_mu.data = torch.Tensor(weight)
    for layer in layers:
        layer.deterministic = True
    test()
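# ---------------------------------------------------------------------------
# A minimal sketch of the FLAGS object the script above assumes. The field
# names (epochs, batchsize, thresholds, cuda, load_pretrained) are read off
# the code; the parser name and the default values are illustrative
# assumptions, not the authors' settings. N is the number of training points
# that the objective divides the KL term by (60,000 for MNIST).
# ---------------------------------------------------------------------------
import argparse

import torch


def parse_flags():
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--batchsize', type=int, default=128)
    parser.add_argument('--thresholds', type=float, nargs='*',
                        default=[-2.8, -3.0, -5.0])  # one per layer in kl_list
    parser.add_argument('--load_pretrained', action='store_true')
    flags = parser.parse_args()
    flags.cuda = torch.cuda.is_available()  # the scripts branch on FLAGS.cuda
    return flags

# Hypothetical usage before calling main():
#   FLAGS = parse_flags()
#   N = 60000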
def main():
    # import data
    kwargs = {'num_workers': 1, 'pin_memory': True} if FLAGS.cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           lambda x: 2 * (x - 0.5),  # rescale to [-1, 1]
                       ])),
        batch_size=FLAGS.batchsize, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           lambda x: 2 * (x - 0.5),
                       ])),
        batch_size=FLAGS.batchsize, shuffle=False, **kwargs)

    # for later analysis we take some sample digits
    mask = 255. * np.ones((1, 28, 28))
    examples = train_loader.dataset.data[0:5].numpy()
    images = np.vstack([mask, examples])

    # build a simple convolutional net
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            # activation
            self.relu = nn.ReLU()
            # layers
            self.conv1 = BayesianLayers.Conv2dGroupNJ(1, 64, 3, stride=2, clip_var=0.04,
                                                      padding=1, cuda=FLAGS.cuda)
            self.conv2 = BayesianLayers.Conv2dGroupNJ(64, 64, 3, stride=2, clip_var=0.04,
                                                      padding=1, cuda=FLAGS.cuda)
            self.fc3 = BayesianLayers.LinearGroupNJ(3136, 10, cuda=FLAGS.cuda)
            # layers whose KL divergence enters the objective
            self.kl_list = [self.conv1, self.conv2, self.fc3]

        def forward(self, x):
            x = self.relu(self.conv1(x))
            x = self.relu(self.conv2(x))
            # flatten the 64 x 7 x 7 feature maps to 3136 features
            x = x.view(x.size(0), -1)
            return self.fc3(x)

        def get_masks(self, thresholds):
            weight_masks = []
            mask = None
            for i, (layer, threshold) in enumerate(zip(self.kl_list, thresholds)):
                # compute dropout mask
                if mask is None:
                    log_alpha = layer.get_log_dropout_rates().cpu().data.numpy()
                    mask = log_alpha < threshold
                else:
                    mask = np.copy(next_mask)
                if i + 1 < len(self.kl_list):
                    log_alpha = self.kl_list[i + 1].get_log_dropout_rates().cpu().data.numpy()
                    next_mask = log_alpha < thresholds[i + 1]
                else:
                    # last layer: keep all 10 output units
                    next_mask = np.ones(10)
                # broadcast the unit mask to the shape of the associated weight
                if len(layer.weight_mu.size()) == 2:    # linear: (out, in)
                    mask_shape = (1, -1)
                elif len(layer.weight_mu.size()) == 4:  # conv: (out, in, kh, kw)
                    mask_shape = (-1, 1, 1, 1)
                weight_mask = np.ones(list(layer.weight_mu.size())) * mask.reshape(mask_shape)
                weight_masks.append(weight_mask.astype(float))
            return weight_masks

        def kl_divergence(self):
            KLD = 0
            for layer in self.kl_list:
                KLD += layer.kl_divergence()
            return KLD

    # init model
    model = Net()
    if FLAGS.cuda:
        model.cuda()

    # init optimizer
    # optimizer = optim.Adam(model.parameters())
    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

    def clip_grads(model, clip=0.2):
        # element-wise gradient clipping keeps SGD with lr=0.1 stable
        for p in model.parameters():
            p.grad.data.clamp_(-clip, clip)

    # we optimize the variational lower bound scaled by the number of data
    # points (so we can keep our intuitions about hyper-parameters such as
    # the learning rate)
    discrimination_loss = nn.functional.cross_entropy

    def objective(output, target, kl_divergence):
        discrimination_error = discrimination_loss(output, target)
        variational_bound = discrimination_error + kl_divergence / N
        if FLAGS.cuda:
            variational_bound = variational_bound.cuda()
        return variational_bound

    def train(epoch):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            if FLAGS.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = objective(output, target, model.kl_divergence())
            loss.backward()
            clip_grads(model)
            optimizer.step()
            # clip the variances after each step
            for layer in model.kl_list:
                layer.clip_variances()
        print('Epoch: {} \tTrain loss: {:.6f}'.format(epoch, loss.item()))

    def test():
        model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():  # replaces the deprecated volatile=True Variables
            for data, target in test_loader:
                if FLAGS.cuda:
                    data, target = data.cuda(), target.cuda()
                output = model(data)
                test_loss += discrimination_loss(output, target, reduction='sum').item()
                pred = output.data.max(1, keepdim=True)[1]
                correct += pred.eq(target.data.view_as(pred)).cpu().sum().item()
        test_loss /= len(test_loader.dataset)
        print('Test loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))

    # train the model
    for epoch in range(1, FLAGS.epochs + 1):
        train(epoch)
        test()

    # visualisations (disabled: these helpers target the MLP's fc1/fc2 weight
    # matrices, which this convolutional model does not have)
    # weight_mus = [model.fc1.weight_mu, model.fc2.weight_mu]
    # log_alphas = [model.fc1.get_log_dropout_rates(), model.fc2.get_log_dropout_rates(),
    #               model.fc3.get_log_dropout_rates()]
    # visualise_weights(weight_mus, log_alphas, epoch=epoch)
    # log_alpha = model.fc1.get_log_dropout_rates().cpu().data.numpy()
    # visualize_pixel_importance(images, log_alpha=log_alpha, epoch=str(epoch))
    # generate_gif(save='pixel', epochs=FLAGS.epochs)
    # generate_gif(save='weight0_e', epochs=FLAGS.epochs)
    # generate_gif(save='weight1_e', epochs=FLAGS.epochs)

    # compute compression rate and new model accuracy
    layers = [model.conv1, model.conv2, model.fc3]
    thresholds = FLAGS.thresholds
    # compute_compression_rate(layers, model.get_masks(thresholds))

    print("Test error with reduced bit precision:")
    weights = compute_reduced_weights(layers, model.get_masks(thresholds))
    for layer, weight in zip(layers, weights):
        if FLAGS.cuda:
            layer.post_weight_mu.data = torch.Tensor(weight).cuda()
        else:
            layer.post_weight_mu.data = torch.Tensor(weight)
    for layer in layers:
        layer.deterministic = True
    test()
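# ---------------------------------------------------------------------------
# A self-contained sketch of the pruning rule that get_masks() applies above:
# a unit is kept while its log dropout rate log(alpha) stays below the layer's
# threshold. The relation alpha = sigma^2 / mu^2 follows the variational
# dropout parameterisation; the numbers below are illustrative, not taken
# from BayesianLayers.
# ---------------------------------------------------------------------------
import numpy as np

mu = np.array([1.2, 0.01, -0.9, 0.03])    # posterior means of four units
sigma = np.array([0.3, 0.5, 0.25, 0.4])   # posterior standard deviations
log_alpha = np.log(sigma ** 2 / mu ** 2)  # large when noise swamps the mean
keep = log_alpha < -2.0                   # threshold plays the role of FLAGS.thresholds[i]
print(keep)                               # -> [ True False  True False]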
def main(FLAGS):
    # import data
    kwargs = {'num_workers': 1, 'pin_memory': True} if FLAGS.cuda else {}
    if FLAGS.dataset == "cifar10":
        proj_dst = datasets.CIFAR10
        num_classes = 10
    elif FLAGS.dataset == "cifar100":
        proj_dst = datasets.CIFAR100
        num_classes = 100
    elif FLAGS.dataset == "mnist":
        proj_dst = datasets.MNIST
        num_classes = 10
    # build the loaders from the selected dataset class (the original
    # hard-coded datasets.MNIST here, leaving proj_dst unused)
    train_loader = torch.utils.data.DataLoader(
        proj_dst('../data', train=True, download=True,
                 transform=transforms.Compose([
                     transforms.ToTensor(),
                     lambda x: 2 * (x - 0.5),  # rescale to [-1, 1]
                 ])),
        batch_size=FLAGS.batchsize, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        proj_dst('../data', train=False,
                 transform=transforms.Compose([
                     transforms.ToTensor(),
                     lambda x: 2 * (x - 0.5),
                 ])),
        batch_size=FLAGS.batchsize, shuffle=False, **kwargs)

    # pick the architecture matching the dataset
    if FLAGS.dataset.startswith("cifar"):
        if FLAGS.nettype == "lenet":
            model = BayesianModule.LeNet_Cifar(num_classes)
        elif FLAGS.nettype == "mlp":
            model = BayesianModule.MLP_Cifar(num_classes)
    elif FLAGS.dataset == "mnist":
        if FLAGS.nettype == "lenet":
            model = BayesianModule.LeNet_MNIST(num_classes)
        elif FLAGS.nettype == "mlp":
            model = BayesianModule.MLP_MNIST(num_classes)
    print(FLAGS.dataset, FLAGS.nettype)
    if FLAGS.cuda:
        model.cuda()

    # init optimizer
    optimizer = optim.Adam(model.parameters())

    # we optimize the variational lower bound scaled by the number of data
    # points (so we can keep our intuitions about hyper-parameters such as
    # the learning rate)
    class objection(object):
        def __init__(self, N, use_cuda=True):
            self.d_loss = nn.functional.cross_entropy
            self.N = N
            self.use_cuda = use_cuda

        def __call__(self, output, target, kl_divergence):
            d_error = self.d_loss(output, target)
            # the discrimination loss is a per-example mean, so the KL term
            # is divided by N to keep the whole bound on a per-datapoint scale
            variational_bound = d_error + kl_divergence / self.N
            if self.use_cuda:
                variational_bound = variational_bound.cuda()
            return variational_bound

    objective = objection(len(train_loader.dataset))

    from trainer import Trainer
    trainer = Trainer(model, train_loader, test_loader, optimizer, objective)

    # train the model
    for epoch in range(1, FLAGS.epochs + 1):
        trainer.train(epoch)
        trainer.test()

    # compute compression rate and new model accuracy
    layers = model.layers
    thresholds = FLAGS.thresholds
    compute_compression_rate(layers, model.get_masks(thresholds))

    print("Test error with reduced bit precision:")
    weights = compute_reduced_weights(layers, model.get_masks(thresholds))
    for layer, weight in zip(layers, weights):
        if FLAGS.cuda:
            layer.post_weight_mu.data = torch.Tensor(weight).cuda()
        else:
            layer.post_weight_mu.data = torch.Tensor(weight)
    for layer in layers:
        layer.deterministic = True
    trainer.test()
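# ---------------------------------------------------------------------------
# `trainer.Trainer` above is imported from a module not shown in this section.
# Below is a minimal sketch of an interface consistent with how it is used
# (train(epoch) / test()) and with the explicit loops in the other scripts;
# it is an assumption, not the repository's actual trainer module. It also
# assumes the BayesianModule models expose kl_divergence() and kl_list, as
# the hand-written Net classes do.
# ---------------------------------------------------------------------------
import torch


class Trainer(object):
    def __init__(self, model, train_loader, test_loader, optimizer, objective):
        self.model = model
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.optimizer = optimizer
        self.objective = objective

    def train(self, epoch):
        self.model.train()
        for data, target in self.train_loader:
            self.optimizer.zero_grad()
            loss = self.objective(self.model(data), target, self.model.kl_divergence())
            loss.backward()
            self.optimizer.step()
            # clip variances after each step, as the other scripts do
            for layer in self.model.kl_list:
                layer.clip_variances()
        print('Epoch: {} \tTrain loss: {:.6f}'.format(epoch, loss.item()))

    def test(self):
        self.model.eval()
        correct = 0
        with torch.no_grad():
            for data, target in self.test_loader:
                pred = self.model(data).argmax(dim=1)
                correct += (pred == target).sum().item()
        print('Accuracy: {}/{}'.format(correct, len(self.test_loader.dataset)))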
def main():
    # import data (transform_train / transform_test are assumed to be defined
    # at module level)
    trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True,
                                            transform=transform_train)
    train_loader = torch.utils.data.DataLoader(trainset, batch_size=128,
                                               shuffle=True, num_workers=2)
    testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True,
                                           transform=transform_test)
    test_loader = torch.utils.data.DataLoader(testset, batch_size=100,
                                              shuffle=False, num_workers=2)

    # LeNet
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = BayesianLayers.Conv2dGroupNJ(3, 6, 5, cuda=FLAGS.cuda)
            self.conv2 = BayesianLayers.Conv2dGroupNJ(6, 16, 5, cuda=FLAGS.cuda)
            self.fc1 = BayesianLayers.LinearGroupNJ(16 * 5 * 5, 120, clip_var=0.03,
                                                    cuda=FLAGS.cuda)
            self.fc2 = BayesianLayers.LinearGroupNJ(120, 84, cuda=FLAGS.cuda)
            self.fc3 = BayesianLayers.LinearGroupNJ(84, 10, cuda=FLAGS.cuda)
            self.kl_list = [self.conv1, self.conv2, self.fc1, self.fc2, self.fc3]

        def forward(self, x):
            out = F.relu(self.conv1(x))
            out = F.max_pool2d(out, 2)
            out = F.relu(self.conv2(out))
            out = F.max_pool2d(out, 2)
            out = out.view(out.size(0), -1)
            out = F.relu(self.fc1(out))
            out = F.relu(self.fc2(out))
            return self.fc3(out)

        def get_masks(self, thresholds):
            weight_masks = []
            mask = None
            for i, (layer, threshold) in enumerate(zip(self.kl_list, thresholds)):
                # compute dropout mask
                if mask is None:
                    log_alpha = layer.get_log_dropout_rates().cpu().data.numpy()
                    mask = log_alpha < threshold
                else:
                    mask = np.copy(next_mask)
                if i + 1 < len(self.kl_list):
                    log_alpha = self.kl_list[i + 1].get_log_dropout_rates().cpu().data.numpy()
                    next_mask = log_alpha < thresholds[i + 1]
                else:
                    # last layer: keep all 10 output units
                    next_mask = np.ones(10)
                weight_mask = np.expand_dims(mask, axis=0) * np.expand_dims(next_mask, axis=1)
                weight_masks.append(weight_mask.astype(float))
            return weight_masks

        def kl_divergence(self):
            KLD = 0
            for layer in self.kl_list:
                KLD += layer.kl_divergence()
            return KLD

    # init model
    model = Net()
    if FLAGS.cuda:
        model.cuda()

    # init optimizer
    optimizer = optim.Adam(model.parameters())

    # we optimize the variational lower bound scaled by the number of data
    # points (so we can keep our intuitions about hyper-parameters such as
    # the learning rate)
    discrimination_loss = nn.functional.cross_entropy

    def objective(output, target, kl_divergence):
        discrimination_error = discrimination_loss(output, target)
        variational_bound = discrimination_error + kl_divergence / N
        if FLAGS.cuda:
            variational_bound = variational_bound.cuda()
        return variational_bound

    def train(epoch):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            if FLAGS.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = objective(output, target, model.kl_divergence())
            loss.backward()
            optimizer.step()
            # clip the variances after each step
            for layer in model.kl_list:
                layer.clip_variances()
        print('Epoch: {} \tTrain loss: {:.6f}'.format(epoch, loss.item()))

    def test():
        model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():  # replaces the deprecated volatile=True Variables
            for data, target in test_loader:
                if FLAGS.cuda:
                    data, target = data.cuda(), target.cuda()
                output = model(data)
                test_loss += discrimination_loss(output, target, reduction='sum').item()
                pred = output.data.max(1, keepdim=True)[1]
                correct += pred.eq(target.data.view_as(pred)).cpu().sum().item()
        test_loss /= len(test_loader.dataset)
        print('Test loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
            test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)))

    # train the model and save some visualisations on the way
    for epoch in range(1, FLAGS.epochs + 1):
        train(epoch)
        test()

    '''
    # visualisations (disabled: `images` is never defined in this script)
    weight_mus = [model.fc1.weight_mu, model.fc2.weight_mu]
    log_alphas = [model.fc1.get_log_dropout_rates(), model.fc2.get_log_dropout_rates(),
                  model.fc3.get_log_dropout_rates()]
    visualise_weights(weight_mus, log_alphas, epoch=epoch)
    log_alpha = model.fc1.get_log_dropout_rates().cpu().data.numpy()
    visualize_pixel_importance(images, log_alpha=log_alpha, epoch=str(epoch))
    generate_gif(save='pixel', epochs=FLAGS.epochs)
    generate_gif(save='weight0_e', epochs=FLAGS.epochs)
    generate_gif(save='weight1_e', epochs=FLAGS.epochs)
    '''

    # compute compression rate and new model accuracy
    layers = [model.conv1, model.conv2, model.fc1, model.fc2, model.fc3]
    thresholds = FLAGS.thresholds
    compute_compression_rate(layers, model.get_masks(thresholds))

    print("Test error with reduced bit precision:")
    weights = compute_reduced_weights(layers, model.get_masks(thresholds))
    for layer, weight in zip(layers, weights):
        if FLAGS.cuda:
            layer.post_weight_mu.data = torch.Tensor(weight).cuda()
        else:
            layer.post_weight_mu.data = torch.Tensor(weight)
    for layer in layers:
        layer.deterministic = True
    test()
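# ---------------------------------------------------------------------------
# A small self-contained sketch of the statistic compute_compression_rate()
# reports in spirit: the fraction of weights kept by the binary masks from
# get_masks(). This illustrates the idea only; the repository's actual
# implementation also accounts for the reduced bit precision of the
# surviving weights.
# ---------------------------------------------------------------------------
import numpy as np


def pruning_ratio(weight_masks):
    """Overall fraction of weights kept by a list of 0/1 masks."""
    kept = sum(float(m.sum()) for m in weight_masks)
    total = sum(float(m.size) for m in weight_masks)
    return kept / total

# hypothetical usage after training:
#   print(pruning_ratio(model.get_masks(FLAGS.thresholds)))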