def _get_primary_net(self):
    nwn = self.num_mlp_params
    t = np.cast['int32'](0)
    k = np.cast['int32'](0)
    if self.dataset == 'mnist':
        p_net = lasagne.layers.InputLayer([None, 1, 28, 28])
    elif self.dataset == 'cifar10':
        p_net = lasagne.layers.InputLayer([None, 3, 32, 32])
    print p_net.output_shape
    inputs = {p_net: self.input_var}

    # convolutional stack: each layer's kernels are built from
    # self.kernel_weights, one row per input channel
    for ws, args in zip(self.weight_shapes, self.args):
        num_param = np.prod(ws)
        num_kernel = ws[1]
        weight = self.kernel_weights[k:k + num_kernel, :]
        weight = weight.dimshuffle(1, 0).reshape(
            self.kernel_shape + (num_kernel,))
        weight = weight.dimshuffle(0, 3, 1, 2)   # -> (out_ch, in_ch, kh, kw)
        num_filters = args[0]
        filter_size = args[1]
        stride = args[2]
        pad = args[3]
        nonl = args[4]
        p_net = stochasticConv2DLayer([p_net, weight],
                                      num_filters, filter_size, stride, pad,
                                      nonlinearity=nonl)
        if args[5] == 'max':
            p_net = lasagne.layers.MaxPool2DLayer(p_net, 2)
        print p_net.output_shape
        t += num_param
        k += num_kernel

    # fully connected hidden layer(s) fed by the remaining hypernet outputs
    assert self.num_mlp_layers == 1
    for layer in range(self.num_mlp_layers):
        w_layer = lasagne.layers.InputLayer((None, self.num_hids))
        weight = self.weights[:, t:t + self.num_hids].reshape(
            (self.wd1, self.num_hids))
        inputs[w_layer] = weight
        p_net = stochasticDenseLayer2([p_net, w_layer], self.num_hids,
                                      nonlinearity=nonlinearities.rectify)
        t += self.num_hids

    # softmax output layer
    w_layer = lasagne.layers.InputLayer((None, self.num_classes))
    weight = self.weights[:, t:t + self.num_classes].reshape(
        (self.wd1, self.num_classes))
    inputs[w_layer] = weight
    p_net = stochasticDenseLayer2([p_net, w_layer], self.num_classes,
                                  nonlinearity=nonlinearities.softmax)

    y = T.clip(get_output(p_net, inputs), 0.001, 0.999)  # stability

    self.p_net = p_net
    self.y = y
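# --- Illustration (not part of the original code) --------------------------
# A minimal NumPy sketch of the dimshuffle/reshape chain used in the loop
# above, assuming self.kernel_shape == (num_filters, kh, kw) and one row of
# num_filters*kh*kw hypernet outputs per input channel.  The function name
# and the concrete sizes are made up for the illustration only.
def _demo_kernel_reshape():
    import numpy as np
    num_filters, kh, kw = 16, 5, 5
    num_kernel = 3                                  # input channels of this layer
    chunk = np.random.randn(num_kernel, num_filters * kh * kw)
    w = chunk.transpose(1, 0).reshape((num_filters, kh, kw, num_kernel))
    w = w.transpose(0, 3, 1, 2)                     # -> (out_ch, in_ch, kh, kw)
    assert w.shape == (num_filters, num_kernel, kh, kw)
    return w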
def _get_primary_net(self):
    nc = self.num_classes
    t = np.cast['int32'](0)
    p_net = lasagne.layers.InputLayer([None, 1, 28, 28])
    inputs = {p_net: self.input_var}

    # convolutional stack: slice the flat hypernet sample into one full
    # kernel tensor per layer
    for ws, args in zip(self.weight_shapes, self.args):
        num_param = np.prod(ws)
        weight = self.weights[:, t:t + num_param].reshape(ws)
        num_filters = args[0]
        filter_size = args[1]
        stride = args[2]
        pad = args[3]
        nonl = args[4]
        p_net = stochasticConv2DLayer([p_net, weight],
                                      num_filters, filter_size, stride, pad,
                                      nonlinearity=nonl)
        print p_net.output_shape
        t += num_param

    # softmax output layer uses the last nc entries of the sample
    w_layer = lasagne.layers.InputLayer((None, nc))
    weight = self.weights[:, t:t + nc].reshape((self.wd1, nc))
    inputs[w_layer] = weight
    p_net = stochasticDenseLayer2([p_net, w_layer], nc,
                                  nonlinearity=nonlinearities.softmax)

    y = T.clip(get_output(p_net, inputs), 0.001, 0.999)  # stability

    self.p_net = p_net
    self.y = y
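# --- Illustration (not part of the original code) --------------------------
# A minimal NumPy sketch of how the loop above carves the flat hypernet
# sample into one conv kernel per layer; the shapes mirror the MNIST
# weight_shapes used in main() below.  The function name is made up for the
# illustration.
def _demo_slice_flat_weights():
    import numpy as np
    weight_shapes = [(16, 1, 5, 5), (16, 16, 5, 5), (16, 16, 5, 5)]
    num_params = sum(np.prod(ws) for ws in weight_shapes) + 10
    weights = np.random.randn(1, num_params)    # stand-in for the flow output
    t = 0
    for ws in weight_shapes:
        num_param = np.prod(ws)
        kernel = weights[:, t:t + num_param].reshape(ws)  # (out_ch, in_ch, kh, kw)
        assert kernel.shape == ws
        t += num_param
    # the remaining 10 entries parameterize the final softmax layer
    return weights[:, t:t + 10]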
def main(): """ MNIST example weight norm reparameterized MLP with prior on rescaling parameters """ import argparse parser = argparse.ArgumentParser() parser.add_argument('--coupling', action='store_true') parser.add_argument('--size', default=10000, type=int) parser.add_argument('--lrdecay', action='store_true') parser.add_argument('--lr0', default=0.1, type=float) parser.add_argument('--lbda', default=0.01, type=float) parser.add_argument('--bs', default=50, type=int) args = parser.parse_args() print args coupling = args.coupling lr0 = args.lr0 lrdecay = args.lrdecay lbda = np.cast[floatX](args.lbda) bs = args.bs size = max(10, min(50000, args.size)) clip_grad = 5 max_norm = 10 # load dataset filename = '/data/lisa/data/mnist.pkl.gz' train_x, train_y, valid_x, valid_y, test_x, test_y = load_mnist(filename) train_x = train_x.reshape(50000, 1, 28, 28) valid_x = valid_x.reshape(10000, 1, 28, 28) test_x = test_x.reshape(10000, 1, 28, 28) input_var = T.tensor4('input_var') target_var = T.matrix('target_var') dataset_size = T.scalar('dataset_size') lr = T.scalar('lr') # 784 -> 20 -> 10 weight_shapes = [ (16, 1, 5, 5), # -> (None, 16, 14, 14) (16, 16, 5, 5), # -> (None, 16, 7, 7) (16, 16, 5, 5) ] # -> (None, 16, 4, 4) num_params = sum(np.prod(ws) for ws in weight_shapes) + 10 wd1 = 1 # stochastic hypernet ep = srng.normal(std=0.01, size=(wd1, num_params), dtype=floatX) logdets_layers = [] h_layer = lasagne.layers.InputLayer([None, num_params]) layer_temp = LinearFlowLayer(h_layer) h_layer = IndexLayer(layer_temp, 0) logdets_layers.append(IndexLayer(layer_temp, 1)) if coupling: layer_temp = CoupledDenseLayer(h_layer, 200) h_layer = IndexLayer(layer_temp, 0) logdets_layers.append(IndexLayer(layer_temp, 1)) h_layer = PermuteLayer(h_layer, num_params) layer_temp = CoupledDenseLayer(h_layer, 200) h_layer = IndexLayer(layer_temp, 0) logdets_layers.append(IndexLayer(layer_temp, 1)) weights = lasagne.layers.get_output(h_layer, ep) # primary net t = np.cast['int32'](0) layer = lasagne.layers.InputLayer([None, 1, 28, 28]) inputs = {layer: input_var} for ws in weight_shapes: num_param = np.prod(ws) weight = weights[:, t:t + num_param].reshape(ws) num_filters = ws[0] filter_size = ws[2] stride = 2 pad = 'same' layer = stochasticConv2DLayer([layer, weight], num_filters, filter_size, stride, pad) print layer.output_shape t += num_param w_layer = lasagne.layers.InputLayer((None, 10)) weight = weights[:, t:t + 10].reshape((wd1, 10)) inputs[w_layer] = weight layer = stochasticDenseLayer2([layer, w_layer], 10, nonlinearity=nonlinearities.softmax) y = T.clip(get_output(layer, inputs), 0.001, 0.999) # loss terms logdets = sum([get_output(logdet, ep) for logdet in logdets_layers]) logqw = -(0.5 * (ep**2).sum(1) + 0.5 * T.log(2 * np.pi) * num_params + logdets) logpw = log_normal(weights, 0., -T.log(lbda)).sum(1) #logpw = log_stdnormal(weights).sum(1) kl = (logqw - logpw).mean() logpyx = -cc(y, target_var).mean() loss = -(logpyx - kl / T.cast(dataset_size, floatX)) params = lasagne.layers.get_all_params([layer])[1:] # excluding rand state grads = T.grad(loss, params) mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm) cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads] updates = lasagne.updates.adam(cgrads, params, learning_rate=lr) train = theano.function([input_var, target_var, dataset_size, lr], loss, updates=updates) predict = theano.function([input_var], y.argmax(1)) records = train_model(train, predict, train_x[:size], train_y[:size], valid_x, valid_y, lr0, lrdecay, bs) 
    # Monte Carlo evaluation: each call re-samples ep, so averaging the
    # predictive probabilities over 100 calls averages over hypernet weights
    output_probs = theano.function([input_var], y)
    MCt = np.zeros((100, 1000, 10))
    MCv = np.zeros((100, 1000, 10))
    for i in range(100):
        MCt[i] = output_probs(train_x[:1000])
        MCv[i] = output_probs(valid_x[:1000])

    tr = np.equal(MCt.mean(0).argmax(-1), train_y[:1000].argmax(-1)).mean()
    va = np.equal(MCv.mean(0).argmax(-1), valid_y[:1000].argmax(-1)).mean()
    print "train perf=", tr
    print "valid perf=", va

    # per-sample probabilities for the first training example (x1000, rounded)
    for ii in range(15):
        print np.round(MCt[ii][0] * 1000)
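# --- Illustration (not part of the original code) --------------------------
# A minimal NumPy check of the change-of-variables computation behind logqw
# in main() above: for an elementwise linear flow w = a*ep + b, the flow
# log-density log N(ep; 0, I) - sum(log|a|) must match the Gaussian density
# of w evaluated directly.  All names and sizes here are made up for the
# illustration.
def _demo_flow_log_density():
    import numpy as np
    rng = np.random.RandomState(0)
    D = 5
    a = rng.rand(D) + 0.5                       # elementwise scales (positive)
    b = rng.randn(D)                            # elementwise shifts
    ep = rng.randn(1, D)                        # base sample, ep ~ N(0, I)
    w = a * ep + b
    logdet = np.sum(np.log(np.abs(a)))          # log|det dw/dep|
    logq_flow = -(0.5 * (ep ** 2).sum(1)
                  + 0.5 * np.log(2 * np.pi) * D
                  + logdet)
    logq_direct = (-0.5 * ((w - b) / a) ** 2
                   - 0.5 * np.log(2 * np.pi)
                   - np.log(a)).sum(1)          # w ~ N(b, a^2) evaluated directly
    assert np.allclose(logq_flow, logq_direct)
    return logq_flow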