# Shared imports/helpers these examples assume (the flow and hypernet layers
# and the data/training utilities are repo-local modules; `cc` is shorthand
# for categorical cross-entropy, as used below):
import numpy as np
import theano
import theano.tensor as T
import lasagne
from lasagne import nonlinearities
from lasagne.layers import get_output
from theano.tensor.shared_randomstreams import RandomStreams

floatX = theano.config.floatX
srng = RandomStreams(seed=427)  # seed value is arbitrary
cc = T.nnet.categorical_crossentropy
# repo-local: load_mnist, train_model, flatten_list, log_stdnormal, log_normal,
# LinearFlowLayer, IndexLayer, PermuteLayer, CoupledDenseLayer,
# CoupledConv1DLayer, stochasticDenseLayer, stochasticDenseLayer2


def main():
    """
    MNIST example
    weight norm reparameterized MLP with prior on rescaling parameters
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--perdatapoint', action='store_true')
    parser.add_argument('--coupling', action='store_true')
    parser.add_argument('--size', default=10000, type=int)
    parser.add_argument('--lrdecay', action='store_true')
    parser.add_argument('--lr0', default=0.1, type=float)
    parser.add_argument('--lbda', default=0.01, type=float)
    parser.add_argument('--bs', default=50, type=int)
    args = parser.parse_args()
    print args

    perdatapoint = args.perdatapoint
    coupling = 1  # args.coupling
    lr0 = args.lr0
    lrdecay = args.lrdecay
    lbda = np.cast[floatX](args.lbda)
    bs = args.bs
    size = max(10, min(50000, args.size))
    clip_grad = 100
    max_norm = 100

    # load dataset
    filename = '/data/lisa/data/mnist.pkl.gz'
    train_x, train_y, valid_x, valid_y, test_x, test_y = load_mnist(filename)

    input_var = T.matrix('input_var')
    target_var = T.matrix('target_var')
    dataset_size = T.scalar('dataset_size')
    lr = T.scalar('lr')

    # 784 -> 200 -> 10
    weight_shapes = [(784, 200), (200, 10)]
    # weight norm: only the per-unit rescaling parameters are sampled
    num_params = sum(ws[1] for ws in weight_shapes)

    if perdatapoint:
        wd1 = input_var.shape[0]
    else:
        wd1 = 1

    # stochastic hypernet
    ep = srng.normal(std=0.01, size=(wd1, num_params), dtype=floatX)
    logdets_layers = []
    h_layer = lasagne.layers.InputLayer([None, num_params])

    layer_temp = LinearFlowLayer(h_layer)
    h_layer = IndexLayer(layer_temp, 0)
    logdets_layers.append(IndexLayer(layer_temp, 1))

    if coupling:
        layer_temp = CoupledDenseLayer(h_layer, 200)
        h_layer = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

        h_layer = PermuteLayer(h_layer, num_params)

        layer_temp = CoupledDenseLayer(h_layer, 200)
        h_layer = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

    weights = lasagne.layers.get_output(h_layer, ep)

    # primary net
    t = np.cast['int32'](0)
    layer = lasagne.layers.InputLayer([None, 784])
    inputs = {layer: input_var}
    for ws in weight_shapes:
        num_param = ws[1]
        w_layer = lasagne.layers.InputLayer((None, ws[1]))
        weight = weights[:, t:t + num_param].reshape((wd1, ws[1]))
        inputs[w_layer] = weight
        layer = stochasticDenseLayer2([layer, w_layer], ws[1])
        print layer.output_shape
        t += num_param

    layer.nonlinearity = nonlinearities.softmax
    y = T.clip(get_output(layer, inputs), 0.001, 0.999)  # stability

    # loss terms
    logdets = sum([get_output(logdet, ep) for logdet in logdets_layers])
    logqw = -(0.5 * (ep**2).sum(1) +
              0.5 * T.log(2 * np.pi) * num_params + logdets)
    #logpw = log_normal(weights, 0., -T.log(lbda)).sum(1)
    logpw = log_stdnormal(weights).sum(1)
    kl = (logqw - logpw).mean()
    logpyx = -cc(y, target_var).mean()
    loss = -(logpyx - kl / T.cast(dataset_size, floatX))

    params = lasagne.layers.get_all_params([h_layer, layer])
    grads = T.grad(loss, params)
    mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm)
    cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    updates = lasagne.updates.adam(cgrads, params, learning_rate=lr)

    train = theano.function([input_var, target_var, dataset_size, lr],
                            loss, updates=updates)
    predict = theano.function([input_var], y.argmax(1))

    records = train_model(train, predict,
                          train_x[:size], train_y[:size],
                          valid_x, valid_y,
                          lr0, lrdecay, bs)
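# ---------------------------------------------------------------------------
# Aside: a minimal numpy sketch (not part of the training script) of the
# change-of-variables identity that `logqw` above implements:
#     log q(w) = log N(eps; 0, I) - sum_k log|det J_k|
# where each J_k is the Jacobian of one flow layer mapping eps -> w.
# The names here (`scale`, `shift`) are illustrative only.
def _flow_density_sketch():
    import numpy as np
    rng = np.random.RandomState(0)
    D = 5
    eps = rng.randn(D)
    # one elementwise affine "flow layer": w = scale * eps + shift
    scale = np.exp(rng.randn(D))            # positive, so log|det| is simple
    shift = rng.randn(D)
    w = scale * eps + shift
    logdet = np.sum(np.log(np.abs(scale)))  # log|det J| of a diagonal map
    log_q_eps = -0.5 * np.sum(eps**2) - 0.5 * D * np.log(2 * np.pi)
    log_q_w = log_q_eps - logdet            # density of w under the flow
    return log_q_w
# ---------------------------------------------------------------------------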
    # variant: primary-net loop sampling full weight matrices, plus per-term
    # gradient-norm monitoring (fragment; setup mirrors main() above)
    for ws in weight_shapes:
        num_param = np.prod(ws)
        w_layer = lasagne.layers.InputLayer((None,) + ws)
        weight = weights[:, t:t + num_param].reshape((wd1,) + ws)
        inputs[w_layer] = weight
        layer = stochasticDenseLayer([layer, w_layer], ws[1])
        t += num_param

    layer.nonlinearity = nonlinearities.softmax
    y = get_output(layer, inputs)
    #y = T.clip(y, 0.00001, 0.99999)  # stability

    ###########################
    # loss and grad
    logdets = sum([get_output(logdet, ep) for logdet in logdets_layers])
    logqw = -(0.5 * (ep**2).sum(1) +
              0.5 * T.log(2 * np.pi) * num_params + logdets)
    logpw = log_stdnormal(weights).sum(1)
    logpyx = -cc(y, target_var).mean()
    kl = (logqw - logpw).mean()
    ds = T.cast(dataset_size, floatX)
    loss = -(logpyx - kl / ds)

    params = lasagne.layers.get_all_params([h_layer, layer])
    grads = T.grad(loss, params)

    ###########################
    # extra monitoring: L2 norm of the gradient of each ELBO term
    nll_grads = flatten_list(
        T.grad(-logpyx, params, disconnected_inputs='warn')).norm(2)
    prior_grads = flatten_list(
        T.grad(-logpw.mean() / ds, params, disconnected_inputs='warn')).norm(2)
    entropy_grads = flatten_list(
        T.grad(logqw.mean() / ds, params, disconnected_inputs='warn')).norm(2)

    outputs = [loss, -logpyx, -logpw / ds, logqw / ds,
               nll_grads, prior_grads, entropy_grads,
               logdets]  # logdets is "legacy"
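# ---------------------------------------------------------------------------
# Aside: `flatten_list` is a repo-local helper; a plausible minimal
# implementation (an assumption, not the repo's actual code) concatenates the
# flattened gradient tensors so a single norm can be taken over all
# parameters at once:
def flatten_list_sketch(tensor_list):
    return T.concatenate([t.flatten() for t in tensor_list])
# e.g. flatten_list_sketch(T.grad(-logpyx, params)).norm(2) is the L2 norm of
# the likelihood term's gradient across every parameter tensor.
# ---------------------------------------------------------------------------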
def main():
    """
    MNIST example
    full weight-matrix hypernet with coupled 1-D convolutional flows
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--perdatapoint', action='store_true')
    parser.add_argument('--coupling', action='store_true')
    parser.add_argument('--size', default=10000, type=int)
    parser.add_argument('--lrdecay', action='store_true')
    parser.add_argument('--lr0', default=0.1, type=float)
    parser.add_argument('--lbda', default=10, type=float)
    parser.add_argument('--bs', default=50, type=int)
    args = parser.parse_args()
    print args

    perdatapoint = args.perdatapoint
    coupling = args.coupling
    size = max(10, min(50000, args.size))
    clip_grad = 10
    max_norm = 1000

    # load dataset
    filename = '/data/lisa/data/mnist.pkl.gz'
    train_x, train_y, valid_x, valid_y, test_x, test_y = load_mnist(filename)

    input_var = T.matrix('input_var')
    target_var = T.matrix('target_var')
    dataset_size = T.scalar('dataset_size')
    lr = T.scalar('lr')

    # 784 -> 20 -> 20 -> 10
    weight_shapes = [(784, 20), (20, 20), (20, 10)]
    num_params = sum(np.prod(ws) for ws in weight_shapes)

    if perdatapoint:
        wd1 = input_var.shape[0]
    else:
        wd1 = 1

    # stochastic hypernet
    ep = srng.normal(size=(wd1, num_params), dtype=floatX)
    logdets_layers = []
    h_layer = lasagne.layers.InputLayer([None, num_params])

    layer_temp = LinearFlowLayer(h_layer)
    h_layer = IndexLayer(layer_temp, 0)
    logdets_layers.append(IndexLayer(layer_temp, 1))

    if coupling:
        layer_temp = CoupledConv1DLayer(h_layer, 16, 5)
        h_layer = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

        h_layer = PermuteLayer(h_layer, num_params)

        layer_temp = CoupledConv1DLayer(h_layer, 16, 5)
        h_layer = IndexLayer(layer_temp, 0)
        logdets_layers.append(IndexLayer(layer_temp, 1))

    weights = lasagne.layers.get_output(h_layer, ep)

    # primary net
    t = np.cast['int32'](0)
    layer = lasagne.layers.InputLayer([None, 784])
    inputs = {layer: input_var}
    for ws in weight_shapes:
        num_param = np.prod(ws)
        print t, t + num_param
        w_layer = lasagne.layers.InputLayer((None,) + ws)
        weight = weights[:, t:t + num_param].reshape((wd1,) + ws)
        inputs[w_layer] = weight
        layer = stochasticDenseLayer([layer, w_layer], ws[1])
        t += num_param

    layer.nonlinearity = nonlinearities.softmax
    y = T.clip(get_output(layer, inputs), 0.001, 0.999)  # stability

    # loss terms
    logdets = sum([get_output(logdet, ep) for logdet in logdets_layers])
    logqw = -(0.5 * (ep**2).sum(1) +
              0.5 * T.log(2 * np.pi) * num_params + logdets)
    logpw = log_stdnormal(weights).sum(1)
    kl = (logqw - logpw).mean()
    logpyx = -cc(y, target_var).mean()
    loss = -(logpyx - kl / T.cast(dataset_size, floatX))

    params = lasagne.layers.get_all_params([h_layer, layer])
    grads = T.grad(loss, params)
    mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm)
    cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    updates = lasagne.updates.nesterov_momentum(cgrads, params,
                                                learning_rate=lr)

    train = theano.function([input_var, target_var, dataset_size, lr],
                            loss, updates=updates)
    predict = theano.function([input_var], y.argmax(1))

    records = train_model(train, predict,
                          train_x[:size], train_y[:size],
                          valid_x, valid_y)

    # MC-averaged predictive accuracy over 100 posterior weight samples
    output_probs = theano.function([input_var], y)
    MCt = np.zeros((100, 1000, 10))
    MCv = np.zeros((100, 1000, 10))
    for i in range(100):
        MCt[i] = output_probs(train_x[:1000])
        MCv[i] = output_probs(valid_x[:1000])

    tr = np.equal(MCt.mean(0).argmax(-1), train_y[:1000].argmax(-1)).mean()
    va = np.equal(MCv.mean(0).argmax(-1), valid_y[:1000].argmax(-1)).mean()
    print "train perf=", tr
    print "valid perf=", va

    for ii in range(15):
        print np.round(MCt[ii][0] * 1000)
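# ---------------------------------------------------------------------------
# Usage sketch (flag names come from the argparse setup above; the script
# filename is a placeholder):
#     python mnist_hypernet.py --coupling --size 10000 --lr0 0.1 --bs 50
# --perdatapoint samples a separate weight draw per example in the batch
# (wd1 = batch size) instead of sharing one draw across the batch (wd1 = 1).
if __name__ == '__main__':
    main()
# ---------------------------------------------------------------------------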