def main():
    """Train a three-stage convolutional network on CIFAR-10 with RMSProp.

    Command-line flags:
        --profile   start the cgt profiler before training; after the first
                    (truncated) epoch print its statistics and return.
        --unittest  stop after the first epoch.
        --epochs    maximum number of epochs (default 10).

    Every epoch is cut short: the batch loop breaks once the batch offset
    exceeds five batch widths.  For each minibatch one line is printed with
    the output of the compiled training function followed by the elapsed
    wall-clock seconds.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--profile", action="store_true")
    cli.add_argument("--unittest", action="store_true")
    cli.add_argument("--epochs", type=int, default=10)
    args = cli.parse_args()

    batchsize = 64
    X = cgt.tensor4("X", fixed_shape=(batchsize, 3, 32, 32))
    y = cgt.vector("y", fixed_shape=(batchsize,), dtype='i4')

    def conv5x5(n_in, n_out, init_std, inp):
        # 5x5 convolution, padding 2, Gaussian weight initialisation.
        layer = nn.SpatialConvolution(
            n_in, n_out, kernelshape=(5, 5), pad=(2, 2),
            weight_init=nn.IIDGaussian(std=init_std))
        return layer(inp)

    def pool3x3(inp):
        # 3x3 max pooling with stride 2.
        return nn.max_pool_2d(inp, kernelshape=(3, 3), stride=(2, 2))

    # Stages 1 and 2 are conv -> relu -> pool; stage 3 pools before the relu.
    stage1 = pool3x3(nn.rectify(conv5x5(3, 32, 1e-4, X)))
    stage2 = pool3x3(nn.rectify(conv5x5(32, 32, 0.01, stage1)))
    stage3 = nn.rectify(pool3x3(conv5x5(32, 64, 0.01, stage2)))

    # Flatten everything but the leading axis and classify into 10 outputs.
    dim0, dim1, dim2, dim3 = stage3.shape
    flat = stage3.reshape([dim0, dim1 * dim2 * dim3])
    n_feats = cgt.infer_shape(flat)[1]
    scores = nn.Affine(n_feats, 10)(flat)
    logprobs = nn.logsoftmax(scores)
    loss = -logprobs[cgt.arange(batchsize), y].mean()

    params = nn.get_parameters(loss)
    step = rmsprop_updates(loss, params, stepsize=1e-3)
    train = cgt.function(inputs=[X, y], outputs=[loss], updates=step)

    if args.profile:
        cgt.profiler.start()

    data = fetch_dataset("http://rll.berkeley.edu/cgt-data/cifar10.npz")
    Xtrain, ytrain = data["X_train"], data["y_train"]

    print(fmt_row(10, ["Epoch", "Train NLL", "Train Err", "Test NLL", "Test Err", "Epoch Time"]))
    for i_epoch in xrange(args.epochs):
        for start in xrange(0, Xtrain.shape[0], batchsize):
            tstart = time.time()
            stop = start + batchsize
            batch_out = train(Xtrain[start:stop], ytrain[start:stop])
            print("%s %s" % (batch_out, time.time() - tstart))
            if start > batchsize * 5:
                break
        # TODO: per-epoch evaluation is not implemented here.  The header row
        # above names epoch-level columns, but filling them needs a
        # `computeloss` function and a test split (Xtest, ytest) that this
        # function never builds; the row would be
        # fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed]).
        if args.profile:
            cgt.profiler.print_stats()
            return
        if args.unittest:
            break
def main():
    """Train a small convolutional network on CIFAR-10 with RMSProp.

    Command-line flags:
        --profile   start the cgt profiler before training; after the first
                    (truncated) epoch print its statistics and return.
        --unittest  stop after the first epoch.
        --epochs    maximum number of epochs (default 10).

    The network is conv/relu/pool, conv/relu/pool, conv/pool/relu, followed
    by an affine layer with 10 outputs and a log-softmax.  Each minibatch
    prints the output of the compiled training function and the elapsed
    wall-clock seconds.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--epochs", type=int, default=10)
    args = parser.parse_args()

    batchsize = 64
    Xshape = (batchsize, 3, 32, 32)
    X = cgt.tensor4("X", fixed_shape=Xshape)
    y = cgt.vector("y", fixed_shape=(batchsize,), dtype='i4')

    conv1 = nn.SpatialConvolution(3, 32, kernelshape=(5, 5), pad=(2, 2),
                                  weight_init=nn.IIDGaussian(std=1e-4))(X)
    relu1 = nn.rectify(conv1)
    pool1 = nn.max_pool_2d(relu1, kernelshape=(3, 3), stride=(2, 2))
    # conv2 must consume pool1; feeding it relu1 would leave the first
    # pooling stage disconnected from the graph.
    conv2 = nn.SpatialConvolution(32, 32, kernelshape=(5, 5), pad=(2, 2),
                                  weight_init=nn.IIDGaussian(std=0.01))(pool1)
    relu2 = nn.rectify(conv2)
    pool2 = nn.max_pool_2d(relu2, kernelshape=(3, 3), stride=(2, 2))
    conv3 = nn.SpatialConvolution(32, 64, kernelshape=(5, 5), pad=(2, 2),
                                  weight_init=nn.IIDGaussian(std=0.01))(pool2)
    pool3 = nn.max_pool_2d(conv3, kernelshape=(3, 3), stride=(2, 2))
    relu3 = nn.rectify(pool3)

    # Flatten all trailing axes; the feature count is inferred from the graph.
    d0, d1, d2, d3 = relu3.shape
    flatlayer = relu3.reshape([d0, d1 * d2 * d3])
    nfeats = cgt.infer_shape(flatlayer)[1]
    ip1 = nn.Affine(nfeats, 10)(flatlayer)
    logprobs = nn.logsoftmax(ip1)
    # Mean negative log-probability of the labelled class.
    loss = -logprobs[cgt.arange(batchsize), y].mean()

    params = nn.get_parameters(loss)
    updates = rmsprop_updates(loss, params, stepsize=1e-3)
    train = cgt.function(inputs=[X, y], outputs=[loss], updates=updates)

    if args.profile:
        cgt.profiler.start()

    # Fetch the dataset by URL rather than from a machine-specific absolute
    # path, so the script runs outside the original author's machine.
    data = fetch_dataset("http://rll.berkeley.edu/cgt-data/cifar10.npz")
    Xtrain = data["X_train"]
    ytrain = data["y_train"]

    print(fmt_row(10, ["Epoch", "Train NLL", "Train Err", "Test NLL", "Test Err", "Epoch Time"]))
    for i_epoch in xrange(args.epochs):
        for start in xrange(0, Xtrain.shape[0], batchsize):
            tstart = time.time()
            end = start + batchsize
            batch_loss = train(Xtrain[start:end], ytrain[start:end])
            print("%s %s" % (batch_loss, time.time() - tstart))
            # Only the first few minibatches of each epoch are run.
            if start > batchsize * 5:
                break
        # TODO: per-epoch train/test evaluation is not implemented; it needs
        # a `computeloss` function and a test split that are not built here.
        if args.profile:
            cgt.profiler.print_stats()
            return
        if args.unittest:
            break
def main():
    """Train an MNIST classifier ("dense" or "conv") with RMSProp and report per-epoch metrics.

    Command-line flags:
        --epochs      number of training epochs (default 10).
        --profile     start the cgt profiler before training and print its
                      statistics after the last epoch.
        --dropout     use dropout rates (0.2, 0.5) while training instead of 0.
        --stepsize    RMSProp step size (default .001).
        --model       "dense" (default) or "conv".
        --unittest    return right after the first training minibatch.
        --grad_check  switch cgt to "quad" precision, run a gradient check on
                      the first parameter using one training example, return.

    The first 60000 rows are shuffled (seed 0) and used for training; rows
    60000-70000 are the test set.  After each epoch one table row is printed
    using the dropout-free model.
    """
    import argparse
    cli = argparse.ArgumentParser()
    cli.add_argument("--epochs", type=int, default=10)
    cli.add_argument("--profile", action="store_true")
    cli.add_argument("--dropout", action="store_true")
    cli.add_argument("--stepsize", type=float, default=.001)
    cli.add_argument("--model", choices=["dense", "conv"], default="dense")
    cli.add_argument("--unittest", action="store_true")
    cli.add_argument("--grad_check", action="store_true")
    args = cli.parse_args()

    if args.grad_check:
        cgt.set_precision("quad")

    # from mldata.org http://mldata.org/repository/data/viewslug/mnist-original/
    # converted to npz
    mnist = fetch_dataset("http://rll.berkeley.edu/cgt-data/mnist.npz")
    Xdata = (mnist["X"] / 255.).astype(cgt.floatX)
    ydata = mnist["y"]

    np.random.seed(0)

    is_conv = args.model == "conv"
    if is_conv:
        Xdata = Xdata.reshape(-1, 1, 28, 28)

    # Shuffle the training split; the test split keeps its stored order.
    shuffle = np.random.permutation(60000)
    Xtrain = Xdata[0:60000][shuffle]
    ytrain = ydata[0:60000][shuffle]
    Xtest = Xdata[60000:70000]
    ytest = ydata[60000:70000]

    if is_conv:
        X = cgt.tensor4("X", fixed_shape=(None, 1, 28, 28))
    else:
        X = cgt.matrix("X", fixed_shape=(None, 28 * 28))
    y = cgt.vector("y", dtype='i8')

    # Both model builders take (X, *weights, first_rate, second_rate); only
    # the weight shapes and the builder differ between the two variants.
    drop_rates = (0.2, 0.5) if args.dropout else (0, 0)
    if args.model == "dense":
        weight_shapes = [(784, 256), (256, 256), (256, 10)]
        build_model = dense_model
    elif args.model == "conv":
        weight_shapes = [(32, 1, 3, 3), (64, 32, 3, 3), (128, 64, 3, 3),
                         (128 * 2 * 2, 625), (625, 10)]
        build_model = convnet_model
    else:
        raise RuntimeError("Unreachable")
    params = [init_weights(*shape) for shape in weight_shapes]
    pofy_drop = build_model(X, *(params + list(drop_rates)))
    pofy_nodrop = build_model(X, *(params + [0., 0.]))

    # Training objective uses the dropout model; evaluation does not.
    cost_drop = -cgt.mean(categorical.loglik(y, pofy_drop))
    updates = rmsprop_updates(cost_drop, params, stepsize=args.stepsize)

    y_nodrop = cgt.argmax(pofy_nodrop, axis=1)
    cost_nodrop = -cgt.mean(categorical.loglik(y, pofy_nodrop))
    err_nodrop = cgt.cast(cgt.not_equal(y_nodrop, y), cgt.floatX).mean()

    train = cgt.function(inputs=[X, y], outputs=[], updates=updates)
    computeloss = cgt.function(inputs=[X, y], outputs=[err_nodrop, cost_nodrop])

    batch_size = 128

    from cgt.tests import gradcheck_model
    if args.grad_check:
        # Substitute a single concrete example for the symbolic inputs.
        cost_nodrop = cgt.core.clone(cost_nodrop, {X: Xtrain[:1], y: ytrain[:1]})
        print("doing gradient check...")
        print("------------------------------------")
        gradcheck_model(cost_nodrop, params[0:1])
        print("success!")
        return

    if args.profile:
        cgt.profiler.start()

    print(fmt_row(10, ["Epoch", "Train NLL", "Train Err", "Test NLL", "Test Err", "Epoch Time"]))
    for i_epoch in xrange(args.epochs):
        tstart = time.time()
        for lo in xrange(0, Xtrain.shape[0], batch_size):
            hi = lo + batch_size
            train(Xtrain[lo:hi], ytrain[lo:hi])
            if args.unittest:
                return
        elapsed = time.time() - tstart
        # Score an equally sized slice of the training set and the test set.
        trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        testerr, testloss = computeloss(Xtest, ytest)
        print(fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed]))

    if args.profile:
        cgt.execution.profiler.print_stats()
def main():
    """Train a neural Turing machine on the copy task, or gradient-check it.

    Command-line flags:
        --grad_check  build a tiny model in "quad" precision, compare the
                      numeric and analytic gradients, then return.
        --n_batches   number of training batches (default 1000000).
        --profile     start the cgt profiler before training and print its
                      statistics when training ends.
        --unittest    shorten the run to sequence length 3 and 3 batches.

    Training prints one table row per batch with columns labelled
    "seq num", "CE (bits)", "accuracy" and "|g|_inf", and stops early if the
    loss becomes non-finite.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--grad_check", action="store_true")
    parser.add_argument("--n_batches", type=int, default=1000000)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    args = parser.parse_args()

    # Floating-point problems raise instead of warning.
    np.seterr("raise")

    cgt.set_precision("quad" if args.grad_check else "double")
    np.random.seed(0)

    # model parameters
    if args.grad_check:
        opt = NTMOpts(
            b=1,  # batch size
            h=1,  # number of heads
            n=2,  # number of memory sites
            m=3,  # dimension at each memory site
            k=4,  # dimension of input
            p=2,  # dimension of output
            ff_hid_sizes=[])
        seq_length = 2
    else:
        opt = NTMOpts(
            b=64,  # batch size
            h=3,  # number of heads
            n=128,  # number of memory sites
            m=20,  # dimension at each memory site
            k=3,  # dimension of input
            p=1,  # dimension of output
            ff_hid_sizes=[128, 128])
        seq_length = 10

    if args.unittest:
        seq_length = 3
        args.n_batches = 3

    tstart = time.time()
    ntm = make_ntm(opt)
    task = CopyTask(opt.b, seq_length, opt.p)
    f_loss, f_loss_and_grad, params = make_funcs(opt, ntm, task.total_time(), task.loss_timesteps())
    print("graph construction and compilation took %g seconds" % (time.time() - tstart))

    pc = ParamCollection(params)
    pc.set_value_flat(nr.uniform(-.1, .1, size=(pc.get_total_size(),)))

    if args.grad_check:
        x, y = task.gen_batch()
        # Flat parameter vector at which the gradient is checked.  It was
        # previously read without ever being assigned (NameError).
        th = pc.get_value_flat()

        def f(thnew):
            # Loss at `thnew`; the collection is restored to `th` afterwards.
            thold = th.copy()
            pc.set_value_flat(thnew)
            loss = f_loss(x, y)
            pc.set_value_flat(thold)
            return loss

        from cgt.numeric_diff import numeric_grad
        g_num = numeric_grad(f, th, eps=1e-8)
        _, _, g_anal = f_loss_and_grad(x, y)
        assert np.allclose(g_num, g_anal, atol=1e-8)
        print("Gradient check succeeded!")
        print("%i/%i elts of grad are nonzero" % ((g_anal != 0).sum(), g_anal.size))
        return

    seq_num = 0
    state = make_rmsprop_state(pc.get_value_flat(), .01, .95)
    print(fmt_row(13, ["seq num", "CE (bits)", "accuracy", "|g|_inf"], header=True))

    if args.profile:
        cgt.profiler.start()

    for i in xrange(args.n_batches):
        x, y = task.gen_batch()
        seq_num += x.shape[1]
        l, l01, g = f_loss_and_grad(x, y)
        print(fmt_row(13, [seq_num, l, l01, np.abs(g).max()]))
        rmsprop_update(g, state)
        pc.set_value_flat(state.theta)
        # Stop once the loss has diverged.
        if not np.isfinite(l):
            break

    if args.profile:
        cgt.profiler.print_stats()
def main():
    """Train an MNIST classifier on a selectable device and print per-epoch metrics.

    Command-line flags:
        --epochs      number of training epochs (default 10).
        --profile     start the cgt profiler before training and print its
                      statistics after the last epoch.
        --dropout     use dropout rates (0.2, 0.5) while training instead of 0.
        --stepsize    RMSProp step size (default .001).
        --model       "dense" (default) or "conv".
        --unittest    return right after the first training minibatch.
        --grad_check  switch cgt to "quad" precision, run a gradient check on
                      the first parameter using one training example, return.
        --devtype     "cpu" (default) or "gpu"; passed to cgt.update_config
                      together with backend="native".
    """
    import argparse
    argp = argparse.ArgumentParser()
    argp.add_argument("--epochs", type=int, default=10)
    argp.add_argument("--profile", action="store_true")
    argp.add_argument("--dropout", action="store_true")
    argp.add_argument("--stepsize", type=float, default=.001)
    argp.add_argument("--model", choices=["dense", "conv"], default="dense")
    argp.add_argument("--unittest", action="store_true")
    argp.add_argument("--grad_check", action="store_true")
    argp.add_argument("--devtype", choices=["cpu", "gpu"], default="cpu")
    args = argp.parse_args()

    if args.grad_check:
        cgt.set_precision("quad")

    # from mldata.org http://mldata.org/repository/data/viewslug/mnist-original/
    # converted to npz
    mnist = fetch_dataset("http://rll.berkeley.edu/cgt-data/mnist.npz")
    Xdata = (mnist["X"] / 255.).astype(cgt.floatX)
    ydata = mnist["y"]

    np.random.seed(0)

    device = cgt.core.Device(devtype=args.devtype)
    cgt.update_config(default_device=device, backend="native")

    if args.model == "conv":
        Xdata = Xdata.reshape(-1, 1, 28, 28)

    n_train, n_all = 60000, 70000
    Xtrain, ytrain = Xdata[0:n_train], ydata[0:n_train]
    Xtest, ytest = Xdata[n_train:n_all], ydata[n_train:n_all]

    # Shuffle the training split only.
    order = np.random.permutation(n_train)
    Xtrain, ytrain = Xtrain[order], ytrain[order]

    if args.model == "conv":
        X = cgt.tensor4("X", fixed_shape=(None, 1, 28, 28))
    else:
        X = cgt.matrix("X", fixed_shape=(None, 28 * 28))
    y = cgt.vector("y", dtype='i8')

    if args.model == "dense":
        rate_in, rate_hid = (0.2, 0.5) if args.dropout else (0, 0)
        params = [init_weights(784, 256),
                  init_weights(256, 256),
                  init_weights(256, 10)]
        w_h, w_h2, w_o = params
        pofy_drop = dense_model(X, w_h, w_h2, w_o, rate_in, rate_hid)
        pofy_nodrop = dense_model(X, w_h, w_h2, w_o, 0., 0.)
    elif args.model == "conv":
        rate_conv, rate_hid = (0.2, 0.5) if args.dropout else (0, 0)
        params = [init_weights(32, 1, 3, 3),
                  init_weights(64, 32, 3, 3),
                  init_weights(128, 64, 3, 3),
                  init_weights(128 * 2 * 2, 625),
                  init_weights(625, 10)]
        w, w2, w3, w4, w_o = params
        pofy_drop = convnet_model(X, w, w2, w3, w4, w_o, rate_conv, rate_hid)
        pofy_nodrop = convnet_model(X, w, w2, w3, w4, w_o, 0., 0.)
    else:
        raise RuntimeError("Unreachable")

    # The dropout graph drives the parameter updates ...
    cost_drop = -cgt.mean(categorical.loglik(y, pofy_drop))
    updates = rmsprop_updates(cost_drop, params, stepsize=args.stepsize)

    # ... while the dropout-free graph is used for reporting.
    y_nodrop = cgt.argmax(pofy_nodrop, axis=1)
    cost_nodrop = -cgt.mean(categorical.loglik(y, pofy_nodrop))
    mismatch = cgt.not_equal(y_nodrop, y)
    err_nodrop = cgt.cast(mismatch, cgt.floatX).mean()

    train = cgt.function(inputs=[X, y], outputs=[], updates=updates)
    computeloss = cgt.function(inputs=[X, y], outputs=[err_nodrop, cost_nodrop])

    batch_size = 128

    from cgt.tests import gradcheck_model
    if args.grad_check:
        # Bind the symbolic inputs to one concrete training example.
        cost_nodrop = cgt.core.clone(cost_nodrop, {X: Xtrain[:1], y: ytrain[:1]})
        print("doing gradient check...")
        print("------------------------------------")
        gradcheck_model(cost_nodrop, params[0:1])
        print("success!")
        return

    if args.profile:
        cgt.profiler.start()

    print(fmt_row(10, ["Epoch", "Train NLL", "Train Err", "Test NLL", "Test Err", "Epoch Time"]))
    for i_epoch in xrange(args.epochs):
        tstart = time.time()
        for first in xrange(0, Xtrain.shape[0], batch_size):
            last = first + batch_size
            train(Xtrain[first:last], ytrain[first:last])
            if args.unittest:
                return
        elapsed = time.time() - tstart
        trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        testerr, testloss = computeloss(Xtest, ytest)
        row = [i_epoch, trainloss, trainerr, testloss, testerr, elapsed]
        print(fmt_row(10, row))

    if args.profile:
        cgt.execution.profiler.print_stats()
def main():
    """Train a neural Turing machine on a copy-style task, or gradient-check it.

    Command-line flags:
        --grad_check  build a tiny model in "quad" precision, compare the
                      numeric and analytic gradients, then return.
        --n_batches   number of training batches (default 1000000).
        --profile     start the cgt profiler before training and print its
                      statistics when training ends.
        --unittest    shorten the run to sequence length 3 and 3 batches.
        --task        "copy" (default), "reverse_copy" or "repeat_copy".

    Training prints one table row per batch with columns labelled
    "seq num", "CE (bits)", "accuracy" and "|g|_inf", and stops early if the
    loss becomes non-finite.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--grad_check", action="store_true")
    parser.add_argument("--n_batches", type=int, default=1000000)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--task", choices=["copy", "reverse_copy", "repeat_copy"], default="copy")
    args = parser.parse_args()

    # Floating-point problems raise instead of warning.
    np.seterr("raise")

    cgt.set_precision("quad" if args.grad_check else "double")
    np.random.seed(0)

    # model parameters
    if args.grad_check:
        opt = NTMOpts(
            b=1,  # batch size
            h=1,  # number of heads
            n=2,  # number of memory sites
            m=3,  # dimension at each memory site
            k=4,  # dimension of input
            p=2,  # dimension of output
            ff_hid_sizes=[]
        )
        seq_length = 2
    else:
        opt = NTMOpts(
            b=64,  # batch size
            h=3,  # number of heads
            n=128,  # number of memory sites
            m=20,  # dimension at each memory site
            k=3,  # dimension of input
            p=1,  # dimension of output
            ff_hid_sizes=[128, 128]
        )
        seq_length = 10

    if args.unittest:
        seq_length = 3
        args.n_batches = 3

    tstart = time.time()
    ntm = make_ntm(opt)
    # argparse `choices` guarantees that one of these branches is taken.
    if args.task == "copy":
        task = CopyTask(opt.b, seq_length, opt.p)
    elif args.task == "reverse_copy":
        task = ReverseCopyTask(opt.b, seq_length, opt.p)
    elif args.task == "repeat_copy":
        n_copies = 4
        task = RepeatCopyTask(opt.b, seq_length, opt.p, n_copies)

    f_loss, f_loss_and_grad, params = make_funcs(opt, ntm, task.total_time(), task.loss_timesteps())
    print("graph construction and compilation took %g seconds" % (time.time() - tstart))

    pc = ParamCollection(params)
    pc.set_value_flat(nr.uniform(-.1, .1, size=(pc.get_total_size(),)))

    if args.grad_check:
        x, y = task.gen_batch()
        # Flat parameter vector at which the gradient is checked.  It was
        # previously read without ever being assigned (NameError).
        th = pc.get_value_flat()

        def f(thnew):
            # Loss at `thnew`; the collection is restored to `th` afterwards.
            thold = th.copy()
            pc.set_value_flat(thnew)
            loss = f_loss(x, y)
            pc.set_value_flat(thold)
            return loss

        from cgt.numeric_diff import numeric_grad
        g_num = numeric_grad(f, th, eps=1e-8)
        _, _, g_anal = f_loss_and_grad(x, y)
        assert np.allclose(g_num, g_anal, atol=1e-8)
        print("Gradient check succeeded!")
        print("%i/%i elts of grad are nonzero" % ((g_anal != 0).sum(), g_anal.size))
        return

    seq_num = 0
    state = make_rmsprop_state(pc.get_value_flat(), .01, .95)
    print(fmt_row(13, ["seq num", "CE (bits)", "accuracy", "|g|_inf"], header=True))

    if args.profile:
        cgt.profiler.start()

    for i in xrange(args.n_batches):
        x, y = task.gen_batch()
        seq_num += x.shape[1]
        l, l01, g = f_loss_and_grad(x, y)
        print(fmt_row(13, [seq_num, l, l01, np.abs(g).max()]))
        rmsprop_update(g, state)
        pc.set_value_flat(state.theta)
        # Stop once the loss has diverged.
        if not np.isfinite(l):
            break

    if args.profile:
        cgt.profiler.print_stats()