def test_shape_err():
    with CaptureStderr():
        with cgt.scoped_update_config(debug=True, backend="python"):
            x = cgt.vector()
            y = cgt.vector()
            f = cgt.function([x, y], x + y)
            f(np.zeros(3), np.zeros(4))
def __init__(self, n_actions):
    Serializable.__init__(self, n_actions)
    cgt.set_precision('double')
    n_in = 128
    o_no = cgt.matrix("o_no", fixed_shape=(None, n_in))
    a_n = cgt.vector("a_n", dtype='i8')
    q_n = cgt.vector("q_n")
    oldpdist_np = cgt.matrix("oldpdists")
    h0 = (o_no - 128.0) / 128.0
    nhid = 64
    h1 = cgt.tanh(nn.Affine(128, nhid, weight_init=nn.IIDGaussian(std=.1))(h0))
    probs_na = nn.softmax(nn.Affine(nhid, n_actions, weight_init=nn.IIDGaussian(std=0.01))(h1))
    logprobs_na = cgt.log(probs_na)
    b = cgt.size(o_no, 0)
    logps_n = logprobs_na[cgt.arange(b), a_n]
    surr = (logps_n * q_n).mean()
    kl = (oldpdist_np * cgt.log(oldpdist_np / probs_na)).sum(axis=1).mean()
    params = nn.get_parameters(surr)
    gradsurr = cgt.grad(surr, params)
    flatgrad = cgt.concatenate([p.flatten() for p in gradsurr])
    lam = cgt.scalar()
    penobj = surr - lam * kl
    self._f_grad_lagrangian = cgt.function(
        [lam, oldpdist_np, o_no, a_n, q_n],
        cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
    self.f_pdist = cgt.function([o_no], probs_na)
    self.f_probs = cgt.function([o_no], probs_na)
    self.f_surr_kl = cgt.function([oldpdist_np, o_no, a_n, q_n], [surr, kl])
    self.f_gradlogp = cgt.function([oldpdist_np, o_no, a_n, q_n], flatgrad)
    self.pc = ParamCollection(params)
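# A minimal reference sketch (not part of the class above): the `kl` term is
# the discrete KL divergence KL(p_old || p_new) = sum_a p_old(a) * log(p_old(a) / p_new(a)),
# summed over actions and averaged over the batch. The equivalent NumPy
# computation, assuming `oldpdists` and `probs` are (batch, n_actions) arrays
# of valid probability distributions:
import numpy as np

def np_discrete_kl(oldpdists, probs):
    # elementwise p_old * log(p_old / p_new), summed over actions, mean over rows
    return (oldpdists * np.log(oldpdists / probs)).sum(axis=1).mean()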
def test_shape_err():
    try:
        with CaptureStderr() as s:
            with cgt.scoped_update_config(debug=True):
                x = cgt.vector()
                y = cgt.vector()
                f = cgt.function([x, y], x + y)
                f(np.zeros(3), np.zeros(4))
    except Exception as e:
        # the captured stderr should point at the offending source line
        assert "f = cgt.function([x,y],x+y)" in s.getvalue()
def test_setting_weights():
    X = cgt.matrix("X", fixed_shape=(None, 28*28))
    model = build_model(X, 0.0)
    nnbuilder.set_all_weights(model, 'mnist.p')
    y = cgt.vector("y", dtype='i8')
    cost = -cgt.mean(categorical.loglik(y, model))
    selected_number = cgt.argmax(model, axis=1)
    err_nodrop = cgt.cast(cgt.not_equal(selected_number, y), cgt.floatX).mean()
    computeloss = cgt.function(inputs=[X, y], outputs=[err_nodrop, cost])
    Xdata, ydata = load_data()
    Xtrain = Xdata[0:60000]
    ytrain = ydata[0:60000]
    Xtest = Xdata[60000:70000]
    ytest = ydata[60000:70000]
    sortinds = np.random.permutation(60000)
    Xtrain = Xtrain[sortinds]
    ytrain = ytrain[sortinds]
    print fmt_row(10, ["Epoch", "Train NLL", "Train Err", "Test NLL", "Test Err", "Epoch Time"])
    for i_epoch in xrange(3):
        tstart = time.time()
        trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        testerr, testloss = computeloss(Xtest, ytest)
        elapsed = time.time() - tstart
        print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--epochs", type=int, default=10)
    args = parser.parse_args()
    batchsize = 64
    Xshape = (batchsize, 3, 32, 32)
    X = cgt.tensor4("X", fixed_shape=Xshape)
    y = cgt.vector("y", fixed_shape=(batchsize,), dtype='i4')
    conv1 = nn.SpatialConvolution(3, 32, kernelshape=(5, 5), pad=(2, 2),
        weight_init=nn.IIDGaussian(std=1e-4))(X)
    relu1 = nn.rectify(conv1)
    pool1 = nn.max_pool_2d(relu1, kernelshape=(3, 3), stride=(2, 2))
    conv2 = nn.SpatialConvolution(32, 32, kernelshape=(5, 5), pad=(2, 2),
        weight_init=nn.IIDGaussian(std=0.01))(pool1)
    relu2 = nn.rectify(conv2)
    pool2 = nn.max_pool_2d(relu2, kernelshape=(3, 3), stride=(2, 2))
    conv3 = nn.SpatialConvolution(32, 64, kernelshape=(5, 5), pad=(2, 2),
        weight_init=nn.IIDGaussian(std=0.01))(pool2)
    pool3 = nn.max_pool_2d(conv3, kernelshape=(3, 3), stride=(2, 2))
    relu3 = nn.rectify(pool3)
    d0, d1, d2, d3 = relu3.shape
    flatlayer = relu3.reshape([d0, d1*d2*d3])
    nfeats = cgt.infer_shape(flatlayer)[1]
    ip1 = nn.Affine(nfeats, 10)(flatlayer)
    logprobs = nn.logsoftmax(ip1)
    loss = -logprobs[cgt.arange(batchsize), y].mean()
    params = nn.get_parameters(loss)
    updates = rmsprop_updates(loss, params, stepsize=1e-3)
    train = cgt.function(inputs=[X, y], outputs=[loss], updates=updates)
    if args.profile: cgt.profiler.start()
    data = fetch_dataset("http://rll.berkeley.edu/cgt-data/cifar10.npz")
    Xtrain = data["X_train"]
    ytrain = data["y_train"]
    print fmt_row(10, ["Epoch", "Train NLL", "Train Err", "Test NLL", "Test Err", "Epoch Time"])
    for i_epoch in xrange(args.epochs):
        for start in xrange(0, Xtrain.shape[0], batchsize):
            tstart = time.time()
            end = start + batchsize
            print train(Xtrain[start:end], ytrain[start:end]), time.time() - tstart
            if start > batchsize*5: break
        # elapsed = time.time() - tstart
        # trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        # testerr, testloss = computeloss(Xtest, ytest)
        # print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
        if args.profile:
            cgt.profiler.print_stats()
            return
        if args.unittest:
            break
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--epochs", type=int, default=10)
    args = parser.parse_args()
    batchsize = 64
    Xshape = (batchsize, 3, 32, 32)
    X = cgt.tensor4("X", fixed_shape=Xshape)
    y = cgt.vector("y", fixed_shape=(batchsize,), dtype='i4')
    conv1 = nn.SpatialConvolution(3, 32, kernelshape=(5, 5), pad=(2, 2),
        weight_init=nn.IIDGaussian(std=1e-4))(X)
    relu1 = nn.rectify(conv1)
    pool1 = nn.max_pool_2d(relu1, kernelshape=(3, 3), stride=(2, 2))
    conv2 = nn.SpatialConvolution(32, 32, kernelshape=(5, 5), pad=(2, 2),
        weight_init=nn.IIDGaussian(std=0.01))(pool1)
    relu2 = nn.rectify(conv2)
    pool2 = nn.max_pool_2d(relu2, kernelshape=(3, 3), stride=(2, 2))
    conv3 = nn.SpatialConvolution(32, 64, kernelshape=(5, 5), pad=(2, 2),
        weight_init=nn.IIDGaussian(std=0.01))(pool2)
    pool3 = nn.max_pool_2d(conv3, kernelshape=(3, 3), stride=(2, 2))
    relu3 = nn.rectify(pool3)
    d0, d1, d2, d3 = relu3.shape
    flatlayer = relu3.reshape([d0, d1*d2*d3])
    nfeats = cgt.infer_shape(flatlayer)[1]
    ip1 = nn.Affine(nfeats, 10)(flatlayer)
    logprobs = nn.logsoftmax(ip1)
    loss = -logprobs[cgt.arange(batchsize), y].mean()
    params = nn.get_parameters(loss)
    updates = rmsprop_updates(loss, params, stepsize=1e-3)
    train = cgt.function(inputs=[X, y], outputs=[loss], updates=updates)
    if args.profile: cgt.profiler.start()
    data = np.load("/Users/joschu/Data/cifar-10-batches-py/cifar10.npz")
    Xtrain = data["X_train"]
    ytrain = data["y_train"]
    print fmt_row(10, ["Epoch", "Train NLL", "Train Err", "Test NLL", "Test Err", "Epoch Time"])
    for i_epoch in xrange(args.epochs):
        for start in xrange(0, Xtrain.shape[0], batchsize):
            tstart = time.time()
            end = start + batchsize
            print train(Xtrain[start:end], ytrain[start:end]), time.time() - tstart
            if start > batchsize*5: break
        # elapsed = time.time() - tstart
        # trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        # testerr, testloss = computeloss(Xtest, ytest)
        # print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
        if args.profile:
            cgt.profiler.print_stats()
            return
        if args.unittest:
            break
def make_updater_fc():
    X = cgt.matrix("X", fixed_shape=(None, 28 * 28))
    y = cgt.vector("y", dtype='i8')
    stepsize = cgt.scalar("stepsize")
    loss = build_fc_return_loss(X, y)
    params = nn.get_parameters(loss)
    gparams = cgt.grad(loss, params)
    updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    return cgt.function([X, y, stepsize], loss, updates=updates)
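# A minimal usage sketch for the updater above (assumes MNIST-like arrays
# Xtrain with shape (N, 784) and ytrain with shape (N,) and dtype int64 are
# already loaded; the batch size of 128 and stepsize of 1e-3 are illustrative).
# Each call runs one SGD step: `updates` are applied in-place to the shared
# parameters, and the current minibatch loss is returned.
updater = make_updater_fc()
for start in xrange(0, Xtrain.shape[0], 128):
    batch_loss = updater(Xtrain[start:start+128], ytrain[start:start+128], 1e-3)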
def test_incsubtensor2():
    W = cgt.shared(np.zeros((5, 3)), name="W")
    i0 = cgt.vector(dtype='i8')
    i1 = cgt.vector(dtype='i8')
    inc = cgt.vector()
    updates2 = {W: cgt.inc_subtensor(W, (i0, i1), inc)}
    f2 = cgt.function([i0, i1, inc], [], updates=updates2)
    f2([0, 1, 2, 2], [0, 1, 2, 2], [1, 2, 3, 4])
    assert np.allclose(
        W.op.get_value(),
        np.array([
            [1., 0., 0.],
            [0., 2., 0.],
            [0., 0., 7.],
            [0., 0., 0.],
            [0., 0., 0.],
        ]))
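# For reference, a NumPy sketch of the same accumulation semantics: repeated
# index pairs add up rather than overwrite, which is why entry (2, 2) above
# ends at 3 + 4 = 7. np.add.at performs exactly this unbuffered accumulation.
Wnp = np.zeros((5, 3))
np.add.at(Wnp, ([0, 1, 2, 2], [0, 1, 2, 2]), [1, 2, 3, 4])
assert Wnp[2, 2] == 7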
def runTest(self):
    cgt.set_precision('double')
    x = cgt.vector()
    y = cgt.square(x)
    eg = cgt.execution.compilation_pipeline([x], [y + y], [])
    pprint.pprint(eg.to_json())
    import cycgt
    interp = cycgt.cInterpreter(eg)
    print interp(np.array([3, 4, 5, 6], 'f8'))
def test_stack():
    x = cgt.scalar()
    y = cgt.scalar()
    z = cgt.scalar()
    s0 = cgt.stack([x, y, z], axis=0)
    assert cgt.numeric_eval(s0, {x: 1, y: 2, z: 3}).shape == (3,)

    x = cgt.vector()
    y = cgt.vector()
    z = cgt.vector()
    v0 = cgt.stack([x, y, z], axis=0)
    assert cgt.numeric_eval(v0, {x: np.zeros(2), y: np.zeros(2), z: np.zeros(2)}).shape == (3, 2)
    v1 = cgt.stack([x, y, z], axis=1)
    assert cgt.numeric_eval(v1, {x: np.zeros(2), y: np.ones(2), z: np.zeros(2)}).shape == (2, 3)

    x = cgt.matrix()
    y = cgt.matrix()
    z = cgt.matrix()
    m0 = cgt.stack([x, y, z], axis=0)
    assert cgt.numeric_eval(m0, {x: np.zeros((2, 4)), y: np.zeros((2, 4)), z: np.zeros((2, 4))}).shape == (3, 2, 4)
    m1 = cgt.stack([x, y, z], axis=1)
    assert cgt.numeric_eval(m1, {x: np.zeros((2, 4)), y: np.zeros((2, 4)), z: np.zeros((2, 4))}).shape == (2, 3, 4)
    m2 = cgt.stack([x, y, z], axis=2)
    assert cgt.numeric_eval(m2, {x: np.zeros((2, 4)), y: np.zeros((2, 4)), z: np.zeros((2, 4))}).shape == (2, 4, 3)
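# The shapes exercised above mirror NumPy's np.stack: stacking k arrays of a
# common shape along `axis` inserts a new dimension of size k at that
# position. A quick standalone check of the same rule:
a = np.zeros((2, 4))
assert np.stack([a, a, a], axis=0).shape == (3, 2, 4)
assert np.stack([a, a, a], axis=1).shape == (2, 3, 4)
assert np.stack([a, a, a], axis=2).shape == (2, 4, 3)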
def make_updater_convnet():
    X = cgt.tensor4("X", fixed_shape=(None, 1, 28, 28))  # so shapes can be inferred
    y = cgt.vector("y", dtype="i8")
    stepsize = cgt.scalar("stepsize")
    loss = build_convnet_return_loss(X, y)
    params = nn.get_parameters(loss)
    gparams = cgt.grad(loss, params)
    updates = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    return cgt.function([X, y, stepsize], loss, updates=updates)
def test_multi_output():
    for x in (cgt.scalar('x'), cgt.vector('x'), cgt.matrix('x')):
        for cls in (SinCos, SinCos2):
            y, z = core.unpack(core.Result(cls(), [x]))
            xnum = np.ones((3,) * x.ndim, cgt.floatX)
            correct = (np.sin(xnum), np.cos(xnum))
            yznum = cgt.numeric_eval([y, z], {x: xnum})
            np.testing.assert_allclose(yznum, correct)
            f = cgt.function([x], [y, z])
            np.testing.assert_allclose(f(xnum), correct)
def runTest(self):
    f1 = cgt.function1([], ())
    assert f1() == ()
    x = cgt.vector()
    xval = np.random.randn(1)
    f2 = cgt.function([x], [(x, x), (x,), ()])
    ytrue = [(xval, xval), (xval,), ()]
    y = f2(xval)
    assert y == ytrue
def test_linreg():
    cgt.reset_config()
    cgt.set_precision('double')
    N = 10
    K = 3
    Xval = np.random.randn(N, K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)
    X_nk = cgt.matrix("X")
    y_n = cgt.vector("y")
    w_k = cgt.vector("w")
    b = cgt.scalar(name="b")
    ypred = cgt.dot(X_nk, w_k) + b
    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])
    g_simple, an, _ = cgt.core.simplify_and_analyze(g)
    print "Loss function:"
    cgt.print_tree([err])
    print "Gradient:"
    cgt.print_tree(g)
    print "Gradient simplified"
    cgt.print_tree(g_simple, nodefn=lambda node, o: o.write(" " + an["node2hash"][node][:5]))
    print "-------"
    d = {X_nk: Xval, w_k: wval, b: bval, y_n: yval}
    np.testing.assert_allclose(cgt.numeric_eval(err, d),
        np.linalg.norm(Xval.dot(wval) + bval - yval)**2)
    np.testing.assert_allclose(cgt.numeric_eval(g[0], d),
        2 * Xval.T.dot(Xval.dot(wval) + bval - yval))
    np.testing.assert_allclose(cgt.numeric_eval(g[1], d),
        2 * np.sum(Xval.dot(wval) + bval - yval, 0))
def __init__(self, obs_dim, ctrl_dim):
    cgt.set_precision('double')
    Serializable.__init__(self, obs_dim, ctrl_dim)
    self.obs_dim = obs_dim
    self.ctrl_dim = ctrl_dim
    o_no = cgt.matrix("o_no", fixed_shape=(None, obs_dim))
    a_na = cgt.matrix("a_na", fixed_shape=(None, ctrl_dim))
    adv_n = cgt.vector("adv_n")
    oldpdist_np = cgt.matrix("oldpdist", fixed_shape=(None, 2*ctrl_dim))
    self.logstd = logstd_1a = nn.parameter(np.zeros((1, self.ctrl_dim)), name="std_1a")
    std_1a = cgt.exp(logstd_1a)
    # Here's where we apply the network
    h0 = o_no
    nhid = 32
    h1 = cgt.tanh(nn.Affine(obs_dim, nhid, weight_init=nn.IIDGaussian(std=0.1))(h0))
    h2 = cgt.tanh(nn.Affine(nhid, nhid, weight_init=nn.IIDGaussian(std=0.1))(h1))
    mean_na = nn.Affine(nhid, ctrl_dim, weight_init=nn.IIDGaussian(std=0.01))(h2)
    b = cgt.size(o_no, 0)
    std_na = cgt.repeat(std_1a, b, axis=0)
    oldmean_na = oldpdist_np[:, 0:self.ctrl_dim]
    oldstd_na = oldpdist_np[:, self.ctrl_dim:2*self.ctrl_dim]
    logp_n = ((-.5) * cgt.square((a_na - mean_na) / std_na).sum(axis=1)) - logstd_1a.sum()
    oldlogp_n = ((-.5) * cgt.square((a_na - oldmean_na) / oldstd_na).sum(axis=1)) - cgt.log(oldstd_na).sum(axis=1)
    ratio_n = cgt.exp(logp_n - oldlogp_n)
    surr = (ratio_n * adv_n).mean()
    pdists_np = cgt.concatenate([mean_na, std_na], axis=1)
    params = nn.get_parameters(surr)
    oldvar_na = cgt.square(oldstd_na)
    var_na = cgt.square(std_na)
    kl = (cgt.log(std_na / oldstd_na)
          + (oldvar_na + cgt.square(oldmean_na - mean_na)) / (2 * var_na)
          - .5).sum(axis=1).mean()
    lam = cgt.scalar()
    penobj = surr - lam * kl
    self._compute_surr_kl = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])
    self._compute_grad_lagrangian = cgt.function(
        [lam, oldpdist_np, o_no, a_na, adv_n],
        cgt.concatenate([p.flatten() for p in cgt.grad(penobj, params)]))
    self.f_pdist = cgt.function([o_no], pdists_np)
    self.f_objs = cgt.function([oldpdist_np, o_no, a_na, adv_n], [surr, kl])
    self.pc = ParamCollection(params)
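# Reference sketch (not part of the class above): the `kl` expression is the
# closed-form KL between diagonal Gaussians, computed per action dimension as
#   KL(N(mu0, s0^2) || N(mu1, s1^2)) = log(s1/s0) + (s0^2 + (mu0 - mu1)^2) / (2 s1^2) - 1/2,
# with (mu0, s0) the old distribution and (mu1, s1) the current one, then
# summed over dimensions and averaged over the batch. In NumPy, for (batch,
# ctrl_dim) arrays:
def np_gauss_kl(oldmean, oldstd, mean, std):
    return (np.log(std / oldstd)
            + (oldstd**2 + (oldmean - mean)**2) / (2 * std**2)
            - .5).sum(axis=1).mean()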
def test_cycgt(self):
    x = cgt.vector('x')
    y = cgt.vector('y')
    z = y / x
    cs = cycgt.CallSequence([x, y], [z], list(cgt.topsorted([z])))
    xshp = (4,)
    yshp = (4,)
    zshp = (4,)
    xval = np.random.randn(*xshp).astype('float32')
    yval = np.random.randn(*yshp).astype('float32')
    zval = np.random.randn(*zshp).astype('float32')
    cs.set_shapes([xshp, yshp, zshp])
    cs.set_inputs([xval, yval])
    cs.execute()
    print xval, yval
    print xval * yval
    np.testing.assert_allclose(yval / xval, cs.get_outputs_numpy()[0])
def CGT_dvLJ(x):
    N = len(x)
    xt = cgt.vector('xt')
    vLJt = 0
    for j in range(1, N):
        for i in range(j):
            rho = ((xt[i*D:i*D+D] - xt[j*D:j*D+D])**2).sum()
            vLJt += rho**(-6.0) - (rho**(-3.0))
    dvLJc = cgt.grad(4*vLJt, xt)
    df = cgt.function([xt], dvLJc)
    return df(np.ravel(x))
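# A cheap sanity test for the analytic gradient above is a finite-difference
# comparison. Sketch only: `CGT_vLJ` is a hypothetical companion returning the
# scalar potential 4*vLJ for the same (N, D) layout (compare the `f` built in
# CGT_vLJ_Optimize below); scipy.optimize.check_grad returns the norm of the
# discrepancy between the two.
from scipy import optimize

def check_lj_grad(x):
    shape = x.shape  # (N, D)
    return optimize.check_grad(
        lambda flat: CGT_vLJ(flat.reshape(shape)),
        lambda flat: CGT_dvLJ(flat.reshape(shape)),
        np.ravel(x))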
def test_linreg():
    N = 10
    K = 3
    Xval = np.random.randn(N, K)
    wval = np.random.randn(K)
    bval = np.random.randn()
    yval = np.random.randn(N)
    X_nk = cgt.matrix("X")
    y_n = cgt.vector("y")
    w_k = cgt.vector("w")
    b = cgt.scalar(name="b")
    ypred = cgt.dot(X_nk, w_k) + b
    err = cgt.sum(cgt.square(ypred - y_n))
    g = cgt.grad(err, [w_k, b])
    g_simple, an, _ = cgt.core.simplify_and_analyze(g)
    print "Loss function:"
    cgt.print_tree([err])
    print "Gradient:"
    cgt.print_tree(g)
    print "Gradient simplified"
    cgt.print_tree(g_simple, nodefn=lambda node, o: o.write(" " + an["node2hash"][node][:5]))
    print "-------"
    d = {X_nk: Xval, w_k: wval, b: bval, y_n: yval}
    atol = {"single": 1e-3, "double": 1e-6}[cgt.get_precision()]
    np.testing.assert_allclose(cgt.numeric_eval(err, d),
        np.linalg.norm(Xval.dot(wval) + bval - yval)**2, atol=atol)
    np.testing.assert_allclose(cgt.numeric_eval(g[0], d),
        2 * Xval.T.dot(Xval.dot(wval) + bval - yval), atol=atol)
    np.testing.assert_allclose(cgt.numeric_eval(g[1], d),
        2 * np.sum(Xval.dot(wval) + bval - yval, 0), atol=atol)
def make_updater_convnet_parallel():
    X = cgt.tensor4("X", fixed_shape=(None, 1, 28, 28))  # so shapes can be inferred
    y = cgt.vector("y", dtype="i8")
    stepsize = cgt.scalar("stepsize")
    loss = build_convnet_return_loss(X, y)
    m = nn.Module([X, y], [loss])
    split_loss = 0
    for start in xrange(0, batch_size, batch_size // 4):
        sli = slice(start, start + batch_size // 4)
        split_loss += m([X[sli], y[sli]])[0]
    split_loss /= 4
    params = nn.get_parameters(loss)
    gparams = cgt.grad(split_loss, params)
    updates2 = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    return cgt.function([X, y, stepsize], split_loss, updates=updates2)
def test_incsubtensor0():
    # First let's test fancy slice along zeroth dimension
    W = cgt.shared(np.zeros((5, 3)), name="W")
    inc = cgt.matrix()  # we'll increment W by this matrix
    incval = np.arange(9).reshape(3, 3)
    inds = cgt.vector(dtype='i8')
    updates = {W: cgt.inc_subtensor(W, inds, inc)}
    f = cgt.function([inds, inc], [], updates=updates)
    f([1, 2, 4], incval)
    assert np.allclose(
        W.op.get_value(),
        np.array([[0., 0., 0.],
                  [0., 1., 2.],
                  [3., 4., 5.],
                  [0., 0., 0.],
                  [6., 7., 8.]]))
def make_updater_fc_parallel(): X = cgt.matrix("X", fixed_shape=(None, 28 * 28)) y = cgt.vector("y", dtype='i8') stepsize = cgt.scalar("stepsize") loss = build_fc_return_loss(X, y) params = nn.get_parameters(loss) m = nn.Module([X, y], [loss]) split_loss = 0 for start in xrange(0, batch_size, batch_size // 4): sli = slice(start, start + batch_size // 4) split_loss += m([X[sli], y[sli]])[0] split_loss /= 4 gparams = cgt.grad(split_loss, params) updates2 = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)] return cgt.function([X, y, stepsize], split_loss, updates=updates2)
def make_updater_fc_parallel(): X = cgt.matrix("X", fixed_shape=(None,28*28)) y = cgt.vector("y",dtype='i8') stepsize = cgt.scalar("stepsize") loss = build_fc_return_loss(X,y) params = nn.get_parameters(loss) m = nn.Module([X,y], [loss]) split_loss = 0 for start in xrange(0, batch_size, batch_size//4): sli = slice(start, start+batch_size//4) split_loss += m([X[sli], y[sli]])[0] split_loss /= 4 gparams = cgt.grad(split_loss, params) updates2 = [(p, p-stepsize*gp) for (p, gp) in zip(params, gparams)] return cgt.function([X,y, stepsize], split_loss, updates=updates2)
def main(num_epochs=NUM_EPOCHS):
    #cgt.set_precision('half')
    print("Building network ...")
    # Recurrent layers expect input of shape
    # (batch size, max sequence length, number of features)
    X = cgt.tensor3(name='X', fixed_shape=(N_BATCH, MAX_LENGTH, 2))
    l_forward = nnbuilder.recurrentLayer(nn_input=X, num_units=N_HIDDEN)
    l_backward = nnbuilder.recurrentLayer(nn_input=X, num_units=N_HIDDEN, backwards=True)
    #l_forward = nnbuilder.LSTMLayer(nn_input=X, num_units=N_HIDDEN, activation=cgt.sigmoid)
    #l_backward = nnbuilder.LSTMLayer(nn_input=X, num_units=N_HIDDEN, activation=cgt.sigmoid, backwards=True)
    #l_forward = nnbuilder.GRULayer(nn_input=X, num_units=N_HIDDEN, activation=nn.rectify)
    #l_backward = nnbuilder.GRULayer(nn_input=X, num_units=N_HIDDEN, activation=nn.rectify, backwards=True)
    # Take the last element in the forward slice time dimension
    l_forward_slice = l_forward[:, MAX_LENGTH-1, :]
    # And the first element in the backward slice time dimension
    l_backward_slice = l_backward[:, 0, :]
    l_sum = cgt.concatenate([l_forward_slice, l_backward_slice], axis=1)
    l_out = nnbuilder.denseLayer(l_sum, num_units=1, activation=cgt.tanh)
    target_values = cgt.vector('target_output')
    predicted_values = l_out[:, 0]  # For this task we only need the last value
    cost = cgt.mean((predicted_values - target_values)**2)
    # Compute SGD updates for training
    print("Computing updates ...")
    updates = nn.rmsprop(cost, nn.get_parameters(l_out), LEARNING_RATE)
    #updates = nn.nesterov_momentum(cost, nn.get_parameters(l_out), 0.05)
    # cgt functions for training and computing cost
    print("Compiling functions ...")
    train = cgt.function([X, target_values], cost, updates=updates)
    compute_cost = cgt.function([X, target_values], cost)
    # We'll use this "validation set" to periodically check progress
    X_val, y_val, mask_val = gen_data()
    print("Training ...")
    time_start = time.time()
    try:
        for epoch in range(num_epochs):
            for _ in range(EPOCH_SIZE):
                X, y, m = gen_data()
                train(X, y)
            cost_val = compute_cost(X_val, y_val)
            print("Epoch {} validation cost = {}".format(epoch+1, cost_val))
            print('Epoch took ' + str(time.time() - time_start))
            time_start = time.time()
    except KeyboardInterrupt:
        pass
def main(): print("Loading data...") X = cgt.matrix("X", fixed_shape=(None, 28*28)) y = cgt.vector("y", dtype='i8') model = build_model(X, 0.0) loss = -cgt.mean(categorical.loglik(y, model)) updates = nn.rmsprop(loss, nn.get_parameters(loss), 0.01) train = cgt.function(inputs=[X, y], outputs=[], updates=updates) y_nodrop = cgt.argmax(model, axis=1) cost_nodrop = -cgt.mean(categorical.loglik(y, model)) err_nodrop = cgt.cast(cgt.not_equal(y_nodrop, y), cgt.floatX).mean() computeloss = cgt.function(inputs=[X, y], outputs=[err_nodrop, cost_nodrop]) batch_size=128 Xdata, ydata = load_data() Xtrain = Xdata[0:60000] ytrain = ydata[0:60000] Xtest = Xdata[60000:70000] ytest = ydata[60000:70000] sortinds = np.random.permutation(60000) Xtrain = Xtrain[sortinds] ytrain = ytrain[sortinds] print fmt_row(10, ["Epoch","Train NLL","Train Err","Test NLL","Test Err","Epoch Time"]) for i_epoch in xrange(3): tstart = time.time() for start in xrange(0, Xtrain.shape[0], batch_size): end = start+batch_size train(Xtrain[start:end], ytrain[start:end]) elapsed = time.time() - tstart trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)]) testerr, testloss = computeloss(Xtest, ytest) print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed]) nnbuilder.save_weights(model, 'mnist')
def CGT_vLJ_Optimize(x):
    N = len(x)
    #cgt.set_precision('double')
    xt = cgt.vector('xt')
    vLJt = 0
    for j in range(1, N):
        for i in range(j):
            rho = ((xt[i*D:i*D+D] - xt[j*D:j*D+D])**2).sum()
            vLJt += rho**(-6.0) - (rho**(-3.0))
    f = cgt.function([xt], 4*vLJt)
    dvLJc = cgt.grad(4*vLJt, xt)
    df = cgt.function([xt], dvLJc)
    CGT_BFGSres = optimize.minimize(f, np.ravel(x),
                                    method='L-BFGS-B',
                                    jac=df,
                                    options={'disp': False})
    return np.reshape(CGT_BFGSres.x, (N, D))
def main():
    X = cgt.matrix(name='data', dtype=cgt.floatX, fixed_shape=(None, 2212))
    y = cgt.vector("y", dtype='i8')
    model = build_nn(X)
    loss = -cgt.mean(categorical.loglik(y, model))
    updates = nn.adagrad(loss, nn.get_parameters(loss), 0.01)
    y_nodrop = cgt.argmax(model, axis=1)
    cost_nodrop = -cgt.mean(categorical.loglik(y, model))
    err_nodrop = cgt.cast(cgt.not_equal(y_nodrop, y), cgt.floatX).mean()
    train = cgt.function(inputs=[X, y], outputs=[], updates=updates)
    computeloss = cgt.function(inputs=[X, y], outputs=[err_nodrop, cost_nodrop])
    batch_size = 20
    Xdata, ydata = load_data()
    Xtrain = Xdata[0:5200]
    ytrain = ydata[0:5200]
    Xtest = Xdata[5200:5573]
    ytest = ydata[5200:5573]
    sortinds = np.random.permutation(5200)
    Xtrain = Xtrain[sortinds]
    ytrain = ytrain[sortinds]
    print fmt_row(10, ["Epoch", "Train NLL", "Train Err", "Test NLL", "Test Err", "Epoch Time"])
    for i_epoch in xrange(20):
        tstart = time.time()
        for start in xrange(0, Xtrain.shape[0], batch_size):
            end = start + batch_size
            train(Xtrain[start:end], ytrain[start:end])
        elapsed = time.time() - tstart
        trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        testerr, testloss = computeloss(Xtest, ytest)
        print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--dropout", action="store_true")
    parser.add_argument("--stepsize", type=float, default=.001)
    parser.add_argument("--model", choices=["dense", "conv"], default="dense")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--grad_check", action="store_true")
    args = parser.parse_args()

    if args.grad_check: cgt.set_precision("quad")

    # from mldata.org http://mldata.org/repository/data/viewslug/mnist-original/
    # converted to npz
    mnist = fetch_dataset("http://rll.berkeley.edu/cgt-data/mnist.npz")
    Xdata = (mnist["X"] / 255.).astype(cgt.floatX)
    ydata = mnist["y"]

    np.random.seed(0)

    if args.model == "conv":
        Xdata = Xdata.reshape(-1, 1, 28, 28)

    Xtrain = Xdata[0:60000]
    ytrain = ydata[0:60000]
    Xtest = Xdata[60000:70000]
    ytest = ydata[60000:70000]
    sortinds = np.random.permutation(60000)
    Xtrain = Xtrain[sortinds]
    ytrain = ytrain[sortinds]

    X = cgt.tensor4("X", fixed_shape=(None, 1, 28, 28)) if args.model == "conv" \
        else cgt.matrix("X", fixed_shape=(None, 28 * 28))
    y = cgt.vector("y", dtype='i8')

    if args.model == "dense":
        p_drop_input, p_drop_hidden = (0.2, 0.5) if args.dropout else (0, 0)
        w_h = init_weights(784, 256)
        w_h2 = init_weights(256, 256)
        w_o = init_weights(256, 10)
        pofy_drop = dense_model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden)
        pofy_nodrop = dense_model(X, w_h, w_h2, w_o, 0., 0.)
        params = [w_h, w_h2, w_o]
    elif args.model == "conv":
        p_drop_conv, p_drop_hidden = (0.2, 0.5) if args.dropout else (0, 0)
        w = init_weights(32, 1, 3, 3)
        w2 = init_weights(64, 32, 3, 3)
        w3 = init_weights(128, 64, 3, 3)
        w4 = init_weights(128 * 2 * 2, 625)
        w_o = init_weights(625, 10)
        pofy_drop = convnet_model(X, w, w2, w3, w4, w_o, p_drop_conv, p_drop_hidden)
        pofy_nodrop = convnet_model(X, w, w2, w3, w4, w_o, 0., 0.)
        params = [w, w2, w3, w4, w_o]
    else:
        raise RuntimeError("Unreachable")

    cost_drop = -cgt.mean(categorical.loglik(y, pofy_drop))
    updates = rmsprop_updates(cost_drop, params, stepsize=args.stepsize)

    y_nodrop = cgt.argmax(pofy_nodrop, axis=1)
    cost_nodrop = -cgt.mean(categorical.loglik(y, pofy_nodrop))
    err_nodrop = cgt.cast(cgt.not_equal(y_nodrop, y), cgt.floatX).mean()

    train = cgt.function(inputs=[X, y], outputs=[], updates=updates)
    computeloss = cgt.function(inputs=[X, y], outputs=[err_nodrop, cost_nodrop])

    batch_size = 128

    from cgt.tests import gradcheck_model
    if args.grad_check:
        cost_nodrop = cgt.core.clone(cost_nodrop, {X: Xtrain[:1], y: ytrain[:1]})
        print "doing gradient check..."
        print "------------------------------------"
        gradcheck_model(cost_nodrop, params[0:1])
        print "success!"
        return

    if args.profile: cgt.profiler.start()

    print fmt_row(10, ["Epoch", "Train NLL", "Train Err", "Test NLL", "Test Err", "Epoch Time"])
    for i_epoch in xrange(args.epochs):
        tstart = time.time()
        for start in xrange(0, Xtrain.shape[0], batch_size):
            end = start + batch_size
            train(Xtrain[start:end], ytrain[start:end])
            if args.unittest: return
        elapsed = time.time() - tstart
        trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        testerr, testloss = computeloss(Xtest, ytest)
        print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
    if args.profile: cgt.execution.profiler.print_stats()
def vector(name, dtype=None, fixed_shape=None):
    return cgt.vector(name, dtype, fixed_shape)
np.random.seed(42)
sortinds = np.random.permutation(Xtrain.shape[0])
Xtrain = Xtrain[sortinds]
ytrain = ytrain[sortinds]

# reshape for convnet
Xtrainimg = Xtrain.reshape(-1, 1, 28, 28)
Xtestimg = Xtest.reshape(-1, 1, 28, 28)

# Model:
# Make it VGG-like.
# VGG nets use 3x3 kernels with padding 1, and 2x2 max-pooling with stride 2.
# VGG is a large model, so here we'll just build a small part of it.
X = cgt.tensor4('X', fixed_shape=(None, 1, 28, 28))
y = cgt.vector('y', dtype='i8')

conv1 = nn.rectify(
    nn.SpatialConvolution(1, 32, kernelshape=(3, 3), stride=(1, 1), pad=(1, 1),
                          weight_init=nn.IIDGaussian(std=.1))(X))
pool1 = nn.max_pool_2d(conv1, kernelshape=(2, 2), stride=(2, 2))
conv2 = nn.rectify(
    nn.SpatialConvolution(32, 32, kernelshape=(3, 3), stride=(1, 1), pad=(1, 1),
                          weight_init=nn.IIDGaussian(std=.1))(pool1))
pool2 = nn.max_pool_2d(conv2, kernelshape=(2, 2), stride=(2, 2))
d0, d1, d2, d3 = pool2.shape
flat = pool2.reshape([d0, d1*d2*d3])
nfeats = cgt.infer_shape(flat)[1]
probs = nn.softmax(nn.Affine(nfeats, 10)(flat))
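# A plausible continuation of the snippet above (a sketch, not part of the
# original excerpt): turn the class probabilities into a negative
# log-likelihood loss and a training function, following the same pattern as
# the other training scripts in this collection. `rmsprop_updates` is the
# helper those scripts use; the stepsize is illustrative.
logprobs = cgt.log(probs)
loss = -logprobs[cgt.arange(cgt.size(X, 0)), y].mean()
params = nn.get_parameters(loss)
train = cgt.function([X, y], loss,
                     updates=rmsprop_updates(loss, params, stepsize=1e-3))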
def __init__(self, model="dense", im_size=[28, 28], dropout=True, devtype="cpu", grad_check=True, reg=0): if grad_check: cgt.set_precision("quad") self.model = model self.reg = reg np.random.seed(0) cgt.update_config(default_device=cgt.core.Device(devtype=devtype), backend="native") print(model) # MLP with 1 hidden layer if model == "dense1": self.Xsize = 2 * im_size[0] * im_size[1] + im_size[0] + im_size[1] self.X = cgt.matrix("X", fixed_shape=(None, self.Xsize)) self.y = cgt.vector("y", dtype='i8') self.p_drop_input, self.p_drop_hidden = (0.2, 0.5) if dropout else (0, 0) self.w_h = init_weights(self.Xsize, 256) self.w_o = init_weights(256, 8) self.pofy_drop = dense_model1(self.X, self.w_h, self.w_o, self.p_drop_input, self.p_drop_hidden) self.pofy_nodrop = dense_model1(self.X, self.w_h, self.w_o, 0., 0.) self.params = [self.w_h, self.w_o] self.l1 = cgt.abs(self.w_h).sum() + cgt.abs(self.w_o).sum() self.cost_drop = -cgt.mean( categorical.loglik(self.y, self.pofy_drop)) + self.reg * self.l1 # MLP with 2 hidden layers elif model == "dense2": self.Xsize = 2 * im_size[0] * im_size[1] + im_size[0] + im_size[1] self.X = cgt.matrix("X", fixed_shape=(None, self.Xsize)) self.y = cgt.vector("y", dtype='i8') self.p_drop_input, self.p_drop_hidden = (0.2, 0.5) if dropout else (0, 0) self.w_h = init_weights(self.Xsize, 256) self.w_h2 = init_weights(256, 256) self.w_o = init_weights(256, 8) self.pofy_drop = dense_model2(self.X, self.w_h, self.w_h2, self.w_o, self.p_drop_input, self.p_drop_hidden) self.pofy_nodrop = dense_model2(self.X, self.w_h, self.w_h2, self.w_o, 0., 0.) self.params = [self.w_h, self.w_h2, self.w_o] self.l1 = cgt.abs(self.w_h).sum() + cgt.abs( self.w_h2).sum() + cgt.abs(self.w_o).sum() self.cost_drop = -cgt.mean( categorical.loglik(self.y, self.pofy_drop)) + self.reg * self.l1 # MLP with 3 hidden layers elif model == "dense3": self.Xsize = 2 * im_size[0] * im_size[1] + im_size[0] + im_size[1] self.X = cgt.matrix("X", fixed_shape=(None, self.Xsize)) self.y = cgt.vector("y", dtype='i8') self.p_drop_input, self.p_drop_hidden = ( 0.0, [0.5, 0.5, 0.5]) if dropout else (0, [0, 0, 0]) self.w_h = init_weights(self.Xsize, 256) self.w_h2 = init_weights(256, 256) self.w_h3 = init_weights(256, 256) self.w_o = init_weights(256, 8) self.pofy_drop = dense_model3(self.X, self.w_h, self.w_h2, self.w_h3, self.w_o, self.p_drop_input, self.p_drop_hidden) self.pofy_nodrop = dense_model3(self.X, self.w_h, self.w_h2, self.w_h3, self.w_o, 0., [0., 0., 0.]) self.params = [self.w_h, self.w_h2, self.w_h3, self.w_o] self.l1 = cgt.abs(self.w_h).sum() + cgt.abs(self.w_h2).sum() + cgt.abs(self.w_h3).sum() + \ cgt.abs(self.w_o).sum() self.cost_drop = -cgt.mean( categorical.loglik(self.y, self.pofy_drop)) + self.reg * self.l1 else: raise RuntimeError("Unknown Model") self.y_nodrop = cgt.argmax(self.pofy_nodrop, axis=1) self.cost_nodrop = -cgt.mean( categorical.loglik(self.y, self.pofy_nodrop)) self.err_nodrop = cgt.cast(cgt.not_equal(self.y_nodrop, self.y), cgt.floatX).mean() self.computeloss = cgt.function( inputs=[self.X, self.y], outputs=[self.err_nodrop, self.cost_nodrop]) self.y_out = cgt.function(inputs=[self.X], outputs=[self.y_nodrop]) self.updates = rmsprop_updates(self.cost_drop, self.params) self.train = cgt.function(inputs=[self.X, self.y], outputs=[], updates=self.updates)
import cgt

# X = cgt.matrix(fixed_shape=(10,3))
y = cgt.vector(fixed_shape=(3,))
w = cgt.vector(fixed_shape=(5,))
# z = X.dot(y)
y + w
# cgt.print_tree(cgt.core.simplify(cgt.shape(z)))
def ivector(name):
    return cgt.vector(name, dtype='int32')
# scaled_data = scaler.transform(data, targets)

# split data
X_train, X_test, Y_train, Y_test = train_test_split(data, targets, test_size=0.2, random_state=0)

# hyperparams
# Be careful when setting alpha! If it's too large, the cost will blow up.
alpha = 1e-7
epochs = 100

# Linear regression model
np.random.seed(0)
X = cgt.matrix("X", fixed_shape=(None, nfeats))
Y = cgt.vector("Y")
w = cgt.shared(np.random.randn(nfeats) * 0.01)

# prediction
ypred = cgt.dot(X, w)

# cost
cost = cgt.square(Y - ypred).mean()

# derivative with respect to w
dw = cgt.grad(cost=cost, wrt=w)
updates = [(w, w - dw * alpha)]

# training function
trainf = cgt.function(inputs=[X, Y], outputs=[], updates=updates)

# cost function, no updates
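# Completing the trailing comment above with the natural next line (a sketch,
# not original code): compile the same cost graph without updates, so calling
# it evaluates the loss but never modifies w.
costf = cgt.function(inputs=[X, Y], outputs=cost)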
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--dropout", action="store_true")
    parser.add_argument("--stepsize", type=float, default=.001)
    parser.add_argument("--model", choices=["dense", "conv"], default="dense")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--grad_check", action="store_true")
    parser.add_argument("--devtype", choices=["cpu", "gpu"], default="cpu")
    args = parser.parse_args()

    if args.grad_check: cgt.set_precision("quad")

    # from mldata.org http://mldata.org/repository/data/viewslug/mnist-original/
    # converted to npz
    mnist = fetch_dataset("http://rll.berkeley.edu/cgt-data/mnist.npz")
    Xdata = (mnist["X"] / 255.).astype(cgt.floatX)
    ydata = mnist["y"]

    np.random.seed(0)

    cgt.update_config(default_device=cgt.core.Device(devtype=args.devtype), backend="native")

    if args.model == "conv":
        Xdata = Xdata.reshape(-1, 1, 28, 28)

    Xtrain = Xdata[0:60000]
    ytrain = ydata[0:60000]
    Xtest = Xdata[60000:70000]
    ytest = ydata[60000:70000]
    sortinds = np.random.permutation(60000)
    Xtrain = Xtrain[sortinds]
    ytrain = ytrain[sortinds]

    X = cgt.tensor4("X", fixed_shape=(None, 1, 28, 28)) if args.model == "conv" \
        else cgt.matrix("X", fixed_shape=(None, 28 * 28))
    y = cgt.vector("y", dtype='i8')

    if args.model == "dense":
        p_drop_input, p_drop_hidden = (0.2, 0.5) if args.dropout else (0, 0)
        w_h = init_weights(784, 256)
        w_h2 = init_weights(256, 256)
        w_o = init_weights(256, 10)
        pofy_drop = dense_model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden)
        pofy_nodrop = dense_model(X, w_h, w_h2, w_o, 0., 0.)
        params = [w_h, w_h2, w_o]
    elif args.model == "conv":
        p_drop_conv, p_drop_hidden = (0.2, 0.5) if args.dropout else (0, 0)
        w = init_weights(32, 1, 3, 3)
        w2 = init_weights(64, 32, 3, 3)
        w3 = init_weights(128, 64, 3, 3)
        w4 = init_weights(128 * 2 * 2, 625)
        w_o = init_weights(625, 10)
        pofy_drop = convnet_model(X, w, w2, w3, w4, w_o, p_drop_conv, p_drop_hidden)
        pofy_nodrop = convnet_model(X, w, w2, w3, w4, w_o, 0., 0.)
        params = [w, w2, w3, w4, w_o]
    else:
        raise RuntimeError("Unreachable")

    cost_drop = -cgt.mean(categorical.loglik(y, pofy_drop))
    updates = rmsprop_updates(cost_drop, params, stepsize=args.stepsize)

    y_nodrop = cgt.argmax(pofy_nodrop, axis=1)
    cost_nodrop = -cgt.mean(categorical.loglik(y, pofy_nodrop))
    err_nodrop = cgt.cast(cgt.not_equal(y_nodrop, y), cgt.floatX).mean()

    train = cgt.function(inputs=[X, y], outputs=[], updates=updates)
    computeloss = cgt.function(inputs=[X, y], outputs=[err_nodrop, cost_nodrop])

    batch_size = 128

    from cgt.tests import gradcheck_model
    if args.grad_check:
        cost_nodrop = cgt.core.clone(cost_nodrop, {X: Xtrain[:1], y: ytrain[:1]})
        print "doing gradient check..."
        print "------------------------------------"
        gradcheck_model(cost_nodrop, params[0:1])
        print "success!"
        return

    if args.profile: cgt.profiler.start()

    print fmt_row(10, ["Epoch", "Train NLL", "Train Err", "Test NLL", "Test Err", "Epoch Time"])
    for i_epoch in xrange(args.epochs):
        tstart = time.time()
        for start in xrange(0, Xtrain.shape[0], batch_size):
            end = start + batch_size
            train(Xtrain[start:end], ytrain[start:end])
            if args.unittest: return
        elapsed = time.time() - tstart
        trainerr, trainloss = computeloss(Xtrain[:len(Xtest)], ytrain[:len(Xtest)])
        testerr, testloss = computeloss(Xtest, ytest)
        print fmt_row(10, [i_epoch, trainloss, trainerr, testloss, testerr, elapsed])
    if args.profile: cgt.execution.profiler.print_stats()