def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
                   subsample=(1, 1, 1)):
    inputs_val = numpy.random.random(inputs_shape).astype('float32')
    dCdH_val = numpy.random.random(dCdH_shape).astype('float32')

    inputs = shared(inputs_val)
    dCdH = shared(dCdH_val)

    conv = theano.tensor.nnet.convGrad3D(V=inputs, dCdH=dCdH,
                                         WShape=filters_shape,
                                         d=subsample)
    img = gpu_contiguous(inputs.dimshuffle(0, 4, 1, 2, 3))
    topgrad = gpu_contiguous(dCdH.dimshuffle(0, 4, 1, 2, 3))
    if subsample == (1, 1, 1):
        conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(img, topgrad)
    else:
        conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
            img, topgrad, shape=filters_shape[1:4])
    conv_gemm = conv_gemm.dimshuffle(0, 2, 3, 4, 1)
    f_ref = theano.function([], conv)
    f = theano.function([], conv_gemm, mode=mode_with_gpu)

    res_ref = f_ref()
    res = f()
    utt.assert_allclose(res_ref, res)
def test_compare_1D_and_2D_upsampling_values(self):
    """Compare 1D and 2D upsampling.

    This test verifies that bilinear upsampling done with 1D kernels
    and with 2D kernels generates the same result.
    """
    # checking upsampling with ratio 5
    input_x = np.random.rand(5, 4, 6, 7).astype(theano.config.floatX)
    mat_1D = bilinear_upsampling(input=input_x, ratio=5,
                                 batch_size=5, num_input_channels=4,
                                 use_1D_kernel=True)
    mat_2D = bilinear_upsampling(input=input_x, ratio=5,
                                 batch_size=5, num_input_channels=4,
                                 use_1D_kernel=False)
    f_1D = theano.function([], mat_1D, mode=self.compile_mode)
    f_2D = theano.function([], mat_2D, mode=self.compile_mode)
    utt.assert_allclose(f_1D(), f_2D(), rtol=1e-06)

    # checking upsampling with ratio 8
    input_x = np.random.rand(12, 11, 10, 7).astype(theano.config.floatX)
    mat_1D = bilinear_upsampling(input=input_x, ratio=8,
                                 batch_size=12, num_input_channels=11,
                                 use_1D_kernel=True)
    mat_2D = bilinear_upsampling(input=input_x, ratio=8,
                                 batch_size=12, num_input_channels=11,
                                 use_1D_kernel=False)
    f_1D = theano.function([], mat_1D, mode=self.compile_mode)
    f_2D = theano.function([], mat_2D, mode=self.compile_mode)
    utt.assert_allclose(f_1D(), f_2D(), rtol=1e-06)
def visualize_states(hidden_states, updates, train_stream, valid_stream, args):
    # Get all the hidden_states
    filter_states = VariableFilter(theano_name_regex="hidden_state_.*")
    all_states = filter_states(hidden_states)
    all_states = sorted(all_states, key=lambda var: var.name[-1])

    # Get all the hidden_cells
    filter_cells = VariableFilter(theano_name_regex="hidden_cells_.*")
    all_cells = filter_cells(hidden_states)
    all_cells = sorted(all_cells, key=lambda var: var.name[-1])

    # Handle the theano shared variables that allow carrying the hidden
    # state
    givens, f_updates = carry_hidden_state(updates, 1,
                                           not has_indices(args.dataset))

    # Compile the function
    logger.info("The compilation of the function has started")
    if args.rnn_type == "lstm" and args.visualize_cells:
        compiled = theano.function(inputs=ComputationGraph(all_cells).inputs,
                                   outputs=all_cells,
                                   givens=givens, updates=f_updates,
                                   mode=Mode(optimizer='fast_compile'))
    else:
        compiled = theano.function(inputs=ComputationGraph(all_states).inputs,
                                   outputs=all_states,
                                   givens=givens, updates=f_updates,
                                   mode=Mode(optimizer='fast_compile'))

    # Plot the function
    plot("hidden_state", train_stream, compiled, args)
def test_in_transit():
    t = np.linspace(-20, 20, 1000)
    m_planet = np.array([0.3, 0.5])
    m_star = 1.45
    r_star = 1.5
    orbit = KeplerianOrbit(
        m_star=m_star,
        r_star=r_star,
        t0=np.array([0.5, 17.4]),
        period=np.array([10.0, 5.3]),
        ecc=np.array([0.1, 0.8]),
        omega=np.array([0.5, 1.3]),
        m_planet=m_planet,
    )

    r_pl = np.array([0.1, 0.03])
    coords = theano.function([], orbit.get_relative_position(t))()
    r2 = coords[0]**2 + coords[1]**2
    inds = theano.function([], orbit.in_transit(t, r=r_pl))()

    m = np.isin(np.arange(len(t)), inds)
    in_ = r2[inds] <= ((r_star + r_pl)**2)[None, :]
    in_ &= coords[2][inds] > 0
    assert np.all(np.any(in_, axis=1))

    out = r2[~m] > ((r_star + r_pl)**2)[None, :]
    out |= coords[2][~m] <= 0
    assert np.all(out)
def test_bilinear_kernel_1D(self):
    """Test 1D kernels used in bilinear upsampling.

    This method tests the correctness of the 1D kernel values used
    in bilinear upsampling for some upsampling ratios.
    """
    rat = tensor.iscalar()
    kernel_ten = bilinear_kernel_1D(ratio=rat, normalize=False)
    f_ten = theano.function([rat], kernel_ten)

    kernel_ten_norm = bilinear_kernel_1D(ratio=rat, normalize=True)
    f_ten_norm = theano.function([rat], kernel_ten_norm)

    for ratio in [2, 3, 4, 5, 6, 7, 8, 9]:
        # getting the un-normalized kernel
        kernel = bilinear_kernel_1D(ratio=ratio, normalize=False)
        f = theano.function([], kernel)
        kernel_1D = self.numerical_kernel_1D(ratio)
        utt.assert_allclose(kernel_1D, f())
        utt.assert_allclose(kernel_1D, f_ten(ratio))

        # getting the normalized kernel
        kernel = bilinear_kernel_1D(ratio=ratio, normalize=True)
        f = theano.function([], kernel)
        kernel_1D = kernel_1D / float(ratio)
        utt.assert_allclose(kernel_1D, f())
        utt.assert_allclose(kernel_1D, f_ten_norm(ratio))
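# Note: the helper `self.numerical_kernel_1D` referenced above is not shown
# in this snippet. Below is a minimal NumPy sketch of the un-normalized
# triangular 1D bilinear kernel the test appears to expect -- an assumption
# inferred from the test dividing it by float(ratio) for the normalized case.
import numpy as np

def numerical_kernel_1D(ratio):
    # Triangle [1, 2, ..., ratio, ..., 2, 1] of length 2 * ratio - 1.
    half = np.arange(1, ratio + 1)
    return np.concatenate([half, half[-2::-1]]).astype('float64')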
def _compile_models(self):
    tn_x, _ = self.datasets[0]
    v_x, _ = self.datasets[1]
    tt_x, _ = self.datasets[2]

    tn_model = theano.function(
        inputs=[self.index],
        outputs=self.cost,
        updates=self.updates,
        givens={
            self.x: tn_x[self.index * self.batch_size:
                         (self.index + 1) * self.batch_size]
        }
    )
    v_model = theano.function(
        inputs=[self.index],
        outputs=self.learner.error,
        givens={
            self.x: v_x[self.index * self.batch_size:
                        (self.index + 1) * self.batch_size],
        }
    )
    tt_model = theano.function(
        inputs=[self.index],
        outputs=self.learner.error,
        givens={
            self.x: tt_x[self.index * self.batch_size:
                         (self.index + 1) * self.batch_size],
        }
    )
    return [tn_model, v_model, tt_model]
def define_train_test_funcs(self):
    activation = self.layers[-1].activation
    self.Y = T.matrix("Y")
    pYs = T.reshape(activation,
                    (self.maskY.shape[0] * self.batch_size, self.out_size))
    tYs = T.reshape(self.Y,
                    (self.maskY.shape[0] * self.batch_size, self.out_size))
    cost = self.categorical_crossentropy(pYs, tYs)

    gparams = []
    for param in self.params:
        # Clip gradients to [-10, 10] to avoid exploding gradients.
        #gparam = T.grad(cost, param)
        gparam = T.clip(T.grad(cost, param), -10, 10)
        gparams.append(gparam)

    lr = T.scalar("lr")
    # eval() turns the optimizer's name (a string) into the function itself.
    optimizer = eval(self.optimizer)
    updates = optimizer(self.params, gparams, lr)
    # Alternatives:
    #updates = sgd(self.params, gparams, lr)
    #updates = momentum(self.params, gparams, lr)
    #updates = rmsprop(self.params, gparams, lr)
    #updates = adagrad(self.params, gparams, lr)
    #updates = dadelta(self.params, gparams, lr)
    #updates = adam(self.params, gparams, lr)

    self.train = theano.function(
        inputs=[self.X, self.maskX, self.Y, self.maskY, lr, self.batch_size],
        givens={self.is_train: np.cast['int32'](1)},
        outputs=cost,
        updates=updates)
    self.predict = theano.function(
        inputs=[self.X, self.maskX, self.batch_size],
        givens={self.is_train: np.cast['int32'](0)},
        outputs=activation)
def test_examples_8(self):
    from theano import shared
    # Force the dtype to int64 to work correctly on 32 bit computers.
    # Otherwise it creates an int32 by default on a 32 bit computer.
    state = shared(0)
    inc = T.iscalar('inc')
    accumulator = function([inc], state, updates=[(state, state + inc)])

    # The function returns the *old* state; the update is applied after.
    assert state.get_value() == array(0)
    assert accumulator(1) == array(0)
    assert state.get_value() == array(1)
    assert accumulator(300) == array(1)
    assert state.get_value() == array(301)

    state.set_value(-1)
    assert accumulator(3) == array(-1)
    assert state.get_value() == array(2)

    decrementor = function([inc], state, updates=[(state, state - inc)])
    assert decrementor(2) == array(2)
    assert state.get_value() == array(0)

    fn_of_state = state * 2 + inc
    # The type of foo must match the shared variable we are replacing
    # with the ``givens``
    foo = T.scalar(dtype=state.dtype)
    skip_shared = function([inc, foo], fn_of_state, givens=[(state, foo)])
    assert skip_shared(1, 3) == array(7)
    assert state.get_value() == array(0)
def test_copy_random_state(self):
    class Graph():
        def __init__(self, seed=123):
            self.rng = RandomStreams(seed)
            self.y = self.rng.uniform(size=(1,))

    g1 = Graph(seed=123)
    f1 = theano.function([], g1.y)
    g2 = Graph(seed=987)
    f2 = theano.function([], g2.y)

    #print 'By default, the two functions are out of sync.'
    v1 = f1()
    v2 = f2()

    def copy_random_state(g1, g2):
        if isinstance(g1.rng, MRG_RandomStreams):
            g2.rng.rstate = g1.rng.rstate
        for (su1, su2) in zip(g1.rng.state_updates, g2.rng.state_updates):
            su2[0].set_value(su1[0].get_value())

    #print 'We now copy the state of the theano random number generators.'
    copy_random_state(g1, g2)
    v3 = f1()
    v4 = f2()
    assert numpy.allclose(v1, 0.72803009)
    assert numpy.allclose(v2, 0.55056769)
    assert numpy.allclose(v3, 0.59044123)
    assert numpy.allclose(v4, 0.59044123)
def sgd(lr, tparams, grads, x, mask, y, cost):
    """ Stochastic Gradient Descent

    :note: A more complicated version of sgd than needed. It is done
        this way to match the interface of adadelta and rmsprop.
    """
    # New set of shared variables that will contain the gradient
    # for a mini-batch.
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.items()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    # Function that computes gradients for a mini-batch, but does not
    # update the weights.
    f_grad_shared = theano.function([x, mask, y], cost, updates=gsup,
                                    name='sgd_f_grad_shared')

    pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)]

    # Function that updates the weights from the previously computed
    # gradient.
    f_update = theano.function([lr], [], updates=pup,
                               name='sgd_f_update')

    return f_grad_shared, f_update
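# Usage sketch (variable names here are illustrative, not from the snippet):
# `f_grad_shared` evaluates the cost and stores the mini-batch gradients in
# `gshared`; `f_update` then applies p <- p - lr * g using those gradients.
# Splitting the two steps mirrors the adadelta/rmsprop interface, which is
# why the docstring calls this more complicated than plain SGD needs.
cost = f_grad_shared(x_batch, mask_batch, y_batch)
f_update(learning_rate)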
def adam(lr, tparams, grads, inp, cost):
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function(inp, cost, updates=gsup)

    lr0 = 0.0002
    b1 = 0.1
    b2 = 0.001
    e = 1e-8

    updates = []

    i = theano.shared(numpy.float32(0.))
    i_t = i + 1.
    fix1 = 1. - b1**(i_t)
    fix2 = 1. - b2**(i_t)
    lr_t = lr0 * (tensor.sqrt(fix2) / fix1)

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))

    f_update = theano.function([lr], [], updates=updates,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
def test_dnn_conv_merge():
    # This tests that multiple dnn_conv ops are merged correctly.
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    img_shp = [2, 5, 6, 8]
    kern_shp = [3, 5, 5, 6]
    img = T.ftensor4('img')
    kern = T.ftensor4('kern')
    out = T.ftensor4('out')
    desc = dnn.GpuDnnConvDesc(
        border_mode='valid')(kern.shape)

    # Test forward op
    o1 = dnn.dnn_conv(img, kern)
    o2 = dnn.dnn_conv(img, kern)
    f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
    d1, d2 = f(numpy.random.rand(*img_shp).astype('float32'),
               numpy.random.rand(*kern_shp).astype('float32'))
    topo = f.maker.fgraph.toposort()
    assert len([n for n in topo if isinstance(n.op, dnn.GpuDnnConv)]) == 1

    # Test grad w op
    o1 = dnn.GpuDnnConvGradW()(img, kern, out, desc)
    o2 = dnn.GpuDnnConvGradW()(img, kern, out, desc)
    f = theano.function([img, kern, out], [o1, o2], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len([n for n in topo
                if isinstance(n.op, dnn.GpuDnnConvGradW)]) == 1

    # Test grad i op
    o1 = dnn.GpuDnnConvGradI()(img, kern, out, desc)
    o2 = dnn.GpuDnnConvGradI()(img, kern, out, desc)
    f = theano.function([img, kern, out], [o1, o2], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len([n for n in topo
                if isinstance(n.op, dnn.GpuDnnConvGradI)]) == 1
def test_flatten():
    m = theano.tensor.fmatrix()
    f = theano.function([m], m.flatten(), mode=mode_with_gpu)
    val = numpy.random.rand(10, 11).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val.flatten())
    assert res.shape == val.flatten().shape
    assert GpuReshape in [type(node.op)
                          for node in f.maker.fgraph.toposort()]
    # Run the same function a second time on fresh data.
    val = numpy.random.rand(10, 11).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val.flatten())
    assert res.shape == val.flatten().shape
    assert GpuReshape in [type(node.op)
                          for node in f.maker.fgraph.toposort()]

    f = theano.function([m], m.flatten(ndim=2), mode=mode_with_gpu)
    val = numpy.random.rand(10, 11).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val)
    assert res.shape == val.shape
    assert GpuReshape in [type(node.op)
                          for node in f.maker.fgraph.toposort()]

    m = theano.tensor.tensor3()
    f = theano.function([m], m.flatten(ndim=2), mode=mode_with_gpu)
    val = numpy.random.rand(10, 11, 12).astype("float32")
    res = f(val)
    utt.assert_allclose(res, val.reshape(10, -1))
    assert res.shape == val.reshape(10, -1).shape
    assert GpuReshape in [type(node.op)
                          for node in f.maker.fgraph.toposort()]
def test_local_gpualloc_memset_0():
    i = theano.tensor.iscalar()
    z = numpy.zeros((1,), dtype='float32')
    o = numpy.ones((1,), dtype='float32')
    ones = numpy.ones((2,), dtype='float32')

    # Test with 0
    a = gpu_alloc(z, i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, GpuAlloc) and topo[0].op.memset_0
    assert (numpy.asarray(f(6)) == 0).all()

    # Test with 1
    a = gpu_alloc(o, i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, GpuAlloc)
    assert not topo[0].op.memset_0
    assert (numpy.asarray(f(6)) == 1).all()

    # Test with 1, 1
    a = gpu_alloc(ones, i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, GpuAlloc)
    assert not topo[0].op.memset_0
    assert (numpy.asarray(f(2)) == 1).all()
def test_pooling_opt():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)

    x = T.fmatrix()

    f = theano.function(
        [x],
        pool_2d(x, ds=(2, 2), mode='average_inc_pad', ignore_border=True),
        mode=mode_with_gpu)

    assert any([isinstance(n.op, dnn.GpuDnnPool)
                for n in f.maker.fgraph.toposort()])

    f(numpy.zeros((10, 10), dtype='float32'))

    f = theano.function(
        [x],
        T.grad(pool_2d(x, ds=(2, 2), mode='average_inc_pad',
                       ignore_border=True).sum(), x),
        mode=mode_with_gpu.including("cudnn"))

    assert any([isinstance(n.op, dnn.GpuDnnPoolGrad)
                for n in f.maker.fgraph.toposort()])

    f(numpy.zeros((10, 10), dtype='float32'))
def sdg(lr, params, grads, x, mask, y, cost):
    '''Stochastic gradient descent.

    Parameters:
        lr: learning rate
        params: parameters of the network
        grads: gradients
        x, y: input data
        cost: loss

    Returns:
        two theano functions:
        1. f_grad_shared computes the gradients and outputs the error
        2. f_update updates the weights
    '''
    # New shared variables holding the gradients of the mini-batch
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in params.items()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    # Function computing the mini-batch gradients without updating
    # the weights
    f_grad_shared = theano.function([x, mask, y], cost, updates=gsup,
                                    name='sgd_f_grad_shared')

    pup = [(p, p - lr * g) for p, g in zip(params.values(), gshared)]

    # Function that applies the weight updates
    f_update = theano.function([lr], [], updates=pup, name='sgd_f_update')

    return f_grad_shared, f_update
def adadelta(lr, tparams, grads, x, mask, y, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x, mask, y], cost,
                                    updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    # Parameter update pairs
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
def build_model(self, data, batch_size=256):
    # create a neural network
    rng = np.random.RandomState(7919)
    t_X = T.matrix(dtype=theano.config.floatX)
    t_y = T.vector(dtype=theano.config.floatX)
    t_learning_rate = T.scalar(dtype=theano.config.floatX)

    layer1 = NeuralNetworkLayer(rng, t_X, 8, 256, T.nnet.relu)
    layer2 = NeuralNetworkLayer(rng, layer1.output, 256, 256, T.nnet.relu)
    layer3 = NeuralNetworkLayer(rng, layer2.output, 256, 1, T.nnet.relu)

    output = T.cast((layer3.output + 0.5), 'int32')
    cost = T.sum((layer3.output - t_y.reshape((batch_size, 1))) ** 2)
    params = layer3.params + layer2.params + layer1.params
    grads = T.grad(cost, params)
    updates = [
        (param_i, param_i - t_learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]
    self.train_model = theano.function(
        [t_learning_rate],
        cost,
        updates=updates,
        givens={
            t_X: data.data_X,
            t_y: data.data_y,
        }
    )
    # evaluation function
    self.forward = theano.function([t_X], output)
def build_train_valid(l_out):
    params = nn.layers.get_all_params(l_out, regularizable=True)
    wc_term = 0.5 * sum(T.sum(param ** 2) for param in params)

    x_batch = T.tensor4('x', theano.config.floatX)
    y_batch = T.matrix('y', 'int32')

    train_output = nn.layers.get_output(l_out, x_batch)
    train_loss = nn.objectives.binary_crossentropy(train_output, y_batch)
    train_loss = nn.objectives.aggregate(train_loss, mode='mean')
    train_loss += wc * wc_term

    params = nn.layers.get_all_params(l_out, trainable=True)

    valid_output = nn.layers.get_output(l_out, x_batch, deterministic=True)

    lr = theano.shared(np.float32(lr_schedule(0)))
    updates = nn.updates.nesterov_momentum(train_loss, params, lr, momentum)

    x_shared = nn.utils.shared_empty(dim=len(input_dims))
    y_shared = nn.utils.shared_empty(dim=2, dtype='int32')

    idx = T.scalar('idx', 'int32')
    givens = {x_batch: x_shared[idx * batch_size:(idx + 1) * batch_size],
              y_batch: y_shared[idx * batch_size:(idx + 1) * batch_size]}
    iter_train = theano.function([idx], [train_loss, train_output],
                                 givens=givens, updates=updates)

    givens = {x_batch: x_shared[idx * batch_size:(idx + 1) * batch_size]}
    iter_valid = theano.function([idx], valid_output, givens=givens)

    return x_shared, y_shared, idx, lr, iter_train, iter_valid
def initialise_model(self, X_train, y_train):
    print 'Initialising model...'
    self.input_shape = X_train.shape[1]
    input_var = T.matrix('inputs')
    target_var = T.matrix('targets')

    if self.normalise:
        y_train = self.normalise_y(y_train, reset=True)
        X_train = self.normalise_X(X_train, reset=True)

    # Create neural network model
    self.network = self.build_custom_mlp(input_var)
    prediction = lasagne.layers.get_output(self.network)
    loss = lasagne.objectives.squared_error(prediction, target_var)
    loss = loss.mean()

    params = lasagne.layers.get_all_params(self.network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=self.learning_rate,
        momentum=self.momentum)

    test_prediction = lasagne.layers.get_output(self.network,
                                                deterministic=True)
    test_loss = lasagne.objectives.squared_error(test_prediction, target_var)
    test_loss = test_loss.mean()

    self.train_fn = theano.function([input_var, target_var], loss,
                                    updates=updates,
                                    allow_input_downcast=True)
    self.predict_output = theano.function([input_var],
                                          outputs=test_prediction,
                                          allow_input_downcast=True)
    self.initialised = True
def __init__(self, embedding_dim=100, num_hidden_layers=2, hidden_dim=200,
             in_dropout_p=0.2, hidden_dropout_p=0.5,
             update_hyperparams={'learning_rate': 0.01}):
    self.embedding_dim = embedding_dim
    self.num_hidden_layers = num_hidden_layers
    self.hidden_dim = hidden_dim
    self.in_dropout_p = in_dropout_p
    self.hidden_dropout_p = hidden_dropout_p

    print >> sys.stderr, 'Building computation graph for discriminator...'
    self.input_var = T.matrix('input')
    self.target_var = T.matrix('target')

    self.l_in = lasagne.layers.InputLayer(
        shape=(None, self.embedding_dim),
        input_var=T.tanh(self.input_var), name='l_in')
    self.l_in_dr = lasagne.layers.DropoutLayer(self.l_in, self.in_dropout_p)
    self.layers = [self.l_in, self.l_in_dr]
    for i in xrange(self.num_hidden_layers):
        l_hid = lasagne.layers.batch_norm(lasagne.layers.DenseLayer(
            self.layers[-1], num_units=self.hidden_dim,
            nonlinearity=lasagne.nonlinearities.leaky_rectify,
            W=lasagne.init.GlorotUniform(gain=leaky_relu_gain),
            name=('l_hid_%s' % i)))
        l_hid_dr = lasagne.layers.DropoutLayer(l_hid, self.hidden_dropout_p)
        self.layers.append(l_hid)
        self.layers.append(l_hid_dr)
    self.l_preout = lasagne.layers.batch_norm(lasagne.layers.DenseLayer(
        self.layers[-1], num_units=1, nonlinearity=None, name='l_preout'))
    self.l_out = lasagne.layers.NonlinearityLayer(
        self.l_preout, nonlinearity=lasagne.nonlinearities.sigmoid,
        name='l_out')

    self.prediction = lasagne.layers.get_output(self.l_out)
    self.loss = lasagne.objectives.binary_crossentropy(
        self.prediction, self.target_var).mean()
    self.accuracy = T.eq(T.ge(self.prediction, 0.5),
                         self.target_var).mean()

    self.params = lasagne.layers.get_all_params(self.l_out, trainable=True)
    self.updates = lasagne.updates.adam(self.loss, self.params,
                                        **update_hyperparams)

    print >> sys.stderr, 'Compiling discriminator...'
    self.train_fn = theano.function([self.input_var, self.target_var],
                                    [self.loss, self.accuracy],
                                    updates=self.updates)
    self.eval_fn = theano.function([self.input_var, self.target_var],
                                   [self.loss, self.accuracy])
def adadelta(lr, tparams, grads, inp, cost, extra_ups=[], extra_outs=[],
             exclude_params=set([])):
    '''Adadelta'''
    zipped_grads = [theano.shared(p.get_value() * np.float32(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * np.float32(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * np.float32(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(
        inp, [cost] + extra_outs, updates=zgup + rg2up + extra_ups,
        profile=profile)

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud)
                for p, ud in zip(tools.itemlist(tparams), updir)
                if p.name not in exclude_params]

    if not isinstance(lr, list):
        lr = [lr]
    f_update = theano.function(lr, [], updates=ru2up + param_up,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
def time_linker(name, linker):
    steps_a = 5
    steps_b = 100
    x = tensor.vector()
    a = build_graph(x, steps_a)
    b = build_graph(x, steps_b)

    f_a = function([x], a,
                   mode=Mode(optimizer=None, linker=linker()),
                   #profile='f_a speed test %s' % name,
                   )
    f_b = function([x], b,
                   mode=Mode(optimizer=None, linker=linker()),
                   #profile='f_b speed test %s' % name,
                   )

    # The first call includes compilation overhead; time the second call.
    print f_a([2.0, 3.0])
    t0 = time.time()
    print f_a([2.0, 3.0])
    t1 = time.time()

    print f_b([2.0, 3.0])
    t2 = time.time()
    print f_b([2.0, 3.0])
    t3 = time.time()

    t_a = t1 - t0
    t_b = t3 - t2

    print "%s takes %f s/Kop" % (
        name, (1000 * (t_b - t_a) / (steps_b - steps_a)))
def test_opt_gpujoin_onlyajoin():
    # from a bug in normal sampling
    _a = numpy.asarray([[1, 2], [3, 4]], dtype='float32')
    _b = numpy.asarray([[5, 6, 7], [8, 9, 10]], dtype='float32')
    a = cuda.shared_constructor(_a)
    b = cuda.shared_constructor(_b)

    c = tensor.join(1, a, b)
    f = theano.function([], c, mode=mode_with_gpu)
    f()

    graph_nodes = f.maker.fgraph.toposort()
    assert isinstance(graph_nodes[-1].op, cuda.HostFromGpu)
    assert isinstance(graph_nodes[-2].op, cuda.GpuJoin)
    assert numpy.all(f() == numpy.concatenate([_a, _b], axis=1))

    # test mixed dtype
    _b = numpy.asarray([[5, 6, 7], [8, 9, 10]], dtype='float64')
    b = theano.tensor.constant(_b)

    c = tensor.join(1, a, b)
    f = theano.function([], c, mode=mode_with_gpu)
    f()

    graph_nodes = f.maker.fgraph.toposort()
    assert isinstance(graph_nodes[-1].op, theano.tensor.Join)
    assert numpy.all(f() == numpy.concatenate([_a, _b], axis=1))
def architecture(self, cons, code_layer):
    """Build up the architecture with theano"""
    for i in range(len(self.layers) - 1):
        # Initialize shared variables
        init_w = cons * np.random.randn(self.layers[i], self.layers[i + 1])
        self.weights.append(th.shared(init_w))
        init_bias = cons * np.random.randn(self.layers[i + 1])
        self.biases.append(th.shared(init_bias))

        # Building architecture
        a_before = T.dot(self.a_n[i], self.weights[i]) + \
            self.biases[i].dimshuffle('x', 0)
        a_next = self.activ(a_before)
        self.a_n.append(a_next)

    # Auxiliary shared variables used by the update rule
    for param in (self.weights + self.biases):
        self.auxiliary.append(th.shared(np.zeros(param.get_value().shape)))

    self.encode = th.function([self.x], self.a_n[code_layer])
    self.decode = th.function([self.a_n[code_layer]], self.a_n[-1])

    # Calculate the cost and gradients
    Cost = (T.sum((self.a_n[-1] - self.y_hat) ** 2)) / self.batch
    params = self.weights + self.biases
    grads = T.grad(Cost, params, disconnected_inputs='ignore')

    # Update parameters
    update_query = self.update(params, grads, self.auxiliary)
    self.gradient_2 = th.function(inputs=[self.x, self.y_hat],
                                  updates=update_query, outputs=Cost)
def test_local_assert_no_cpu_op():
    numpy.random.seed(1)
    m = numpy.random.uniform(-1, 1, (10, 10)).astype("float32")
    ms = cuda.shared_constructor(m, name="m_shared")
    out = theano.tensor.tanh(ms).dot(ms.T)

    mode_local_assert = mode_with_gpu.including("assert_no_cpu_op")
    mode_local_assert = mode_local_assert.excluding("local_gpu_elemwise_0")
    mode_local_assert = mode_local_assert.excluding("local_gpu_elemwise_1")

    old = config.assert_no_cpu_op
    old2 = config.on_opt_error

    # If the flag is raise
    try:
        config.assert_no_cpu_op = 'raise'
        config.on_opt_error = 'ignore'

        assert_raises(AssertionError, theano.function,
                      [], out, mode=mode_local_assert)
    finally:
        config.assert_no_cpu_op = old
        config.on_opt_error = old2

    # If the flag is ignore
    try:
        config.assert_no_cpu_op = 'ignore'
        theano.function([], out, mode=mode_local_assert)
    finally:
        config.assert_no_cpu_op = old
def test_alloc_memset_0():
    i = tensor.iscalar()
    z = numpy.zeros((1,), dtype='float32')
    o = numpy.ones((1,), dtype='float32')
    ones = numpy.ones((2,), dtype='float32')

    # Test with 0
    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(z)), i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, basic_ops.GpuAlloc) and topo[0].op.memset_0
    assert (numpy.asarray(f(6)) == 0).all()

    # Test with 1
    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(o)), i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, basic_ops.GpuAlloc)
    assert not topo[0].op.memset_0
    assert (numpy.asarray(f(6)) == 1).all()

    # Test with 1, 1
    a = basic_ops.gpu_alloc(cuda.gpu_from_host(tensor.constant(ones)), i)
    f = theano.function([i], a, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, basic_ops.GpuAlloc)
    assert not topo[0].op.memset_0
    assert (numpy.asarray(f(2)) == 1).all()
def test_pattern_output(self):
    #print (self.phi.W.get_value(borrow=True))
    assert (self.phi.W.get_value(borrow=True).shape == (self.n, self.d))
    self.phi.W.set_value(np.array([[1, 0]]).T)
    assert (self.phi.W.get_value(borrow=True).shape == (self.n, self.d))

    assert (self.psi.W.get_value(borrow=True).shape
            == (self.d, self.num_classes))
    self.psi.W.set_value(np.array([[1, ]]))
    assert (self.psi.W.get_value(borrow=True).shape
            == (self.d, self.num_classes))

    # assert (self.beta.W.get_value(borrow=True).shape == (self.d, self.m))
    # # [1,1] means that we will project the intermediate representation
    # # onto both dimensions of the output representation
    # self.beta.W.set_value ( np.array([[1,1]]) )
    # assert (self.beta.W.get_value(borrow=True).shape == (self.d, self.m))

    test_prediction = lasagne.layers.get_output(self.pattern,
                                                deterministic=True)
    test_fn = theano.function([self.input_var], test_prediction)
    X_hat = test_fn(self.X)
    assert (np.all(X_hat == self.S))

    # self.phi1 = test_prediction
    # self.phi2 = lasagne.layers.get_output(self.pattern, self.side_var,
    #                                       deterministic=True)

    beta_prediction = self.pattern.get_beta_output_for(
        self.input_var, self.side_var, deterministic=True)
    beta_fn = theano.function([self.input_var, self.side_var],
                              beta_prediction)
    C_hat = beta_fn(self.X, self.CX)
    assert (np.all(C_hat == self.Cy))
def test_in_transit_circ():
    t = np.linspace(-20, 20, 1000)
    m_planet = np.array([0.3, 0.5])
    m_star = 1.45
    r_star = 1.5
    orbit = KeplerianOrbit(
        m_star=m_star,
        r_star=r_star,
        t0=np.array([0.5, 17.4]),
        period=np.array([10.0, 5.3]),
        ecc=np.array([0.0, 0.0]),
        omega=np.array([0.0, 0.0]),
        m_planet=m_planet,
    )
    orbit_circ = KeplerianOrbit(
        m_star=m_star,
        r_star=r_star,
        t0=np.array([0.5, 17.4]),
        period=np.array([10.0, 5.3]),
        m_planet=m_planet,
    )

    r_pl = np.array([0.1, 0.03])
    inds = theano.function([], orbit.in_transit(t, r=r_pl))()
    inds_circ = theano.function([], orbit_circ.in_transit(t, r=r_pl))()
    assert np.all(inds == inds_circ)
def __init__(self, kernel, max_iter=10, max_diff=None):
    """
    :param kernel: a function with signature (expected, observed) ->
      similarity measure that accepts symbolic theano expressions and
      returns them accordingly.
      See `crayimage.hotornot.em.kernels` for examples.
    :param max_iter: maximal number of iterations.
    :param max_diff: stop iterating if the maximal difference in weights
      relative to the previous iteration is smaller than `max_diff`.
      If None, the check is not performed.
    """
    self.original_shape = None

    self.kernel = kernel
    self.max_iter = max_iter
    self.max_diff = max_diff

    self.X = theano.shared(
        np.zeros(shape=(0, 0), dtype='float32')
    )
    self.weights = theano.shared(
        np.ones(shape=(0, ), dtype='float32')
    )

    canonical = T.sum(self.weights[:, None] * self.X, axis=0) / T.sum(self.weights)

    weights_updates = self.kernel(canonical, self.X)
    weights_diff = T.max(abs(weights_updates - self.weights))

    upd = {
        self.weights: weights_updates
    }

    self.iteration = theano.function(
        [], weights_diff if max_diff is not None else [],
        updates=upd)

    self.get_canonical = theano.function([], canonical)
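# A minimal sketch of a fit loop driving the compiled `iteration` function,
# following the `max_iter`/`max_diff` semantics described in the docstring
# above (the method name `fit` is hypothetical, not from the snippet):
def fit(self, X):
    # Load the data into the shared variable and reset the weights.
    self.X.set_value(X.astype('float32'))
    self.weights.set_value(np.ones(X.shape[0], dtype='float32'))
    for _ in range(self.max_iter):
        diff = self.iteration()  # one EM re-weighting step
        # `iteration` only returns the weight change when max_diff is set.
        if self.max_diff is not None and diff < self.max_diff:
            break
    return self.get_canonical()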
def test_doctorAI(
    modelFile='model.txt',
    seqFile='seq.txt',
    inputDimSize=20000,
    labelFile='label.txt',
    numClass=500,
    timeFile='',
    predictTime=False,
    useLogTime=True,
    hiddenDimSize=[200, 200],
    batchSize=100,
    logEps=1e-8,
    mean_duration=20.0,
    verbose=False
):
    options = locals().copy()

    if len(timeFile) > 0:
        useTime = True
    else:
        useTime = False
    options['useTime'] = useTime

    models = np.load(modelFile)
    tparams = init_tparams(models)

    print('build model ... ',)
    if predictTime:
        x, t, mask, codePred, timePred = build_model(tparams, options)
        predict_code = theano.function(inputs=[x, t, mask],
                                       outputs=codePred, name='predict_code')
        predict_time = theano.function(inputs=[x, t, mask],
                                       outputs=timePred, name='predict_time')
    elif useTime:
        x, t, mask, codePred = build_model(tparams, options)
        predict_code = theano.function(inputs=[x, t, mask],
                                       outputs=codePred, name='predict_code')
    else:
        x, mask, codePred = build_model(tparams, options)
        predict_code = theano.function(inputs=[x, mask],
                                       outputs=codePred, name='predict_code')

    options['inputDimSize'] = models['W_emb'].shape[0]
    options['numClass'] = models['b_output'].shape[0]

    print('load data ... ', )
    testSet = load_data(seqFile, labelFile, timeFile)
    n_batches = int(np.ceil(float(len(testSet[0])) / float(batchSize)))
    print('done')

    predVec = []
    trueVec = []
    predTimeVec = []
    trueTimeVec = []
    iteration = 0
    for batchIndex in range(n_batches):
        tempX = testSet[0][batchIndex * batchSize: (batchIndex + 1) * batchSize]
        tempY = testSet[1][batchIndex * batchSize: (batchIndex + 1) * batchSize]
        if predictTime:
            tempT = testSet[2][batchIndex * batchSize: (batchIndex + 1) * batchSize]
            x, t, mask, lengths = padMatrixWithTime(tempX, tempT, options)
            codeResults = predict_code(x, t, mask)
            timeResults = predict_time(x, t, mask)
        elif useTime:
            tempT = testSet[2][batchIndex * batchSize: (batchIndex + 1) * batchSize]
            x, t, mask, lengths = padMatrixWithTime(tempX, tempT, options)
            codeResults = predict_code(x, t, mask)
        else:
            x, mask, lengths = padMatrixWithoutTime(tempX, options)
            codeResults = predict_code(x, mask)

        for i in range(codeResults.shape[1]):
            tensorMatrix = codeResults[:, i, :]
            thisY = tempY[i][1:]
            for timeIndex in range(lengths[i]):
                if len(thisY[timeIndex]) == 0:
                    continue
                trueVec.append(thisY[timeIndex])
                output = tensorMatrix[timeIndex]
                predVec.append(list(zip(*heapq.nlargest(
                    30, enumerate(output),
                    key=operator.itemgetter(1))))[0])

        if predictTime:
            for i in range(timeResults.shape[1]):
                timeVec = timeResults[:, i]
                trueTimeVec.extend(tempT[i][1:])
                for timeIndex in range(lengths[i]):
                    predTimeVec.append(timeVec[timeIndex])

        if (iteration % 10 == 0) and verbose:
            print('iteration:%d/%d' % (iteration, n_batches))
        iteration += 1
        if iteration == 10:
            break

    recall = recallTop(trueVec, predVec)
    print('recall@10:%f, recall@20:%f, recall@30:%f' %
          (recall[0], recall[1], recall[2]))

    if predictTime:
        r_squared = calculate_r_squared(trueTimeVec, predTimeVec, options)
        print('R2:%f' % r_squared)
l_action_formed = lasagne.layers.ReshapeLayer(
    input_layer=l_action, shape=(N_BATCH, N_TIME_STEPS, N_ACTIONS))

# Cost function is mean squared error
input = T.tensor3('input')
target_output = T.tensor3('target_output')

# create environment
env = CartPoleEnvironment()
# create task
task = BalanceTask(env, 200, desiredValue=None)

# action_prediction = theano.function([input], l_action_formed.get_output(input))
all_params = lasagne.layers.get_all_params(l_action_formed)

records = []
for time in xrange(50):
    records.append([])
    _all_params = lasagne.layers.get_all_params(l_action_formed)
    _all_params[0].set_value(theano_form(uniform(-0.1, 0.1, 4), shape=(4, 1)))
    baseline = None
    num_parameters = 4  # four parameters
    epsilon = 3  # initial sigma
    sigma_list = ones(num_parameters) * epsilon
def compile(self, optimizer, loss, class_mode="categorical",
            theano_mode=None):
    self.optimizer = optimizers.get(optimizer)

    self.loss = objectives.get(loss)
    weighted_loss = weighted_objective(objectives.get(loss))

    # input of model
    self.X_train = self.get_input(train=True)
    self.X_test = self.get_input(train=False)

    self.y_train = self.get_output(train=True)
    self.y_test = self.get_output(train=False)

    # target of model
    self.y = T.zeros_like(self.y_train)

    self.weights = T.ones_like(self.y_train)

    if hasattr(self.layers[-1], "get_output_mask"):
        mask = self.layers[-1].get_output_mask()
    else:
        mask = None
    train_loss = weighted_loss(self.y, self.y_train, self.weights, mask)
    test_loss = weighted_loss(self.y, self.y_test, self.weights, mask)

    train_loss.name = 'train_loss'
    test_loss.name = 'test_loss'
    self.y.name = 'y'

    if class_mode == "categorical":
        train_accuracy = T.mean(T.eq(T.argmax(self.y, axis=-1),
                                     T.argmax(self.y_train, axis=-1)))
        test_accuracy = T.mean(T.eq(T.argmax(self.y, axis=-1),
                                    T.argmax(self.y_test, axis=-1)))
    elif class_mode == "binary":
        train_accuracy = T.mean(T.eq(self.y, T.round(self.y_train)))
        test_accuracy = T.mean(T.eq(self.y, T.round(self.y_test)))
    else:
        raise Exception("Invalid class mode:" + str(class_mode))
    self.class_mode = class_mode
    self.theano_mode = theano_mode

    for r in self.regularizers:
        train_loss = r(train_loss)
    updates = self.optimizer.get_updates(self.trainable_params,
                                         self.constraints, train_loss)
    updates += self.updates

    if type(self.X_train) == list:
        train_ins = self.X_train + [self.y, self.weights]
        test_ins = self.X_test + [self.y, self.weights]
        predict_ins = self.X_test
    else:
        train_ins = [self.X_train, self.y, self.weights]
        test_ins = [self.X_test, self.y, self.weights]
        predict_ins = [self.X_test]

    self._train = theano.function(train_ins, train_loss,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  mode=theano_mode)
    self._train_with_acc = theano.function(train_ins,
                                           [train_loss, train_accuracy],
                                           updates=updates,
                                           allow_input_downcast=True,
                                           mode=theano_mode)
    self._predict = theano.function(predict_ins, self.y_test,
                                    allow_input_downcast=True,
                                    mode=theano_mode)
    self._test = theano.function(test_ins, test_loss,
                                 allow_input_downcast=True,
                                 mode=theano_mode)
    self._test_with_acc = theano.function(test_ins,
                                          [test_loss, test_accuracy],
                                          allow_input_downcast=True,
                                          mode=theano_mode)
def make_grad_func(X):
    Z = theano.tensor.dot(X, W) + b
    H = theano.tensor.nnet.sigmoid(Z)
    cost = H.sum()
    g = gradient.grad(cost, X)
    return theano.function([X, W, b], g, on_unused_input="ignore")
def test_undefined_grad_func(self):
    # tests that function compilation catches undefined grads in the graph
    a = theano.tensor.vector()
    b = theano.gradient.grad_undefined(theano.tensor.add, 0, a)
    with pytest.raises(TypeError):
        theano.function([a], b, on_unused_input="ignore")
def __init__(self, babi_train_raw, babi_test_raw, word2vec,
             word_vector_size, sent_vector_size, dim, mode, answer_module,
             input_mask_mode, memory_hops, l2, normalize_attention,
             batch_norm, dropout, dropout_in, **kwargs):

    print "==> not used params in DMN class:", kwargs.keys()

    self.vocab = {None: 0}
    self.ivocab = {0: None}

    self.word2vec = word2vec
    self.word_vector_size = word_vector_size
    self.sent_vector_size = sent_vector_size
    self.dim = dim
    self.mode = mode
    self.answer_module = answer_module
    self.input_mask_mode = input_mask_mode
    self.memory_hops = memory_hops
    self.l2 = l2
    self.normalize_attention = normalize_attention
    self.batch_norm = batch_norm
    self.dropout = dropout
    self.dropout_in = dropout_in
    self.max_inp_sent_len = 0
    self.max_q_len = 0

    """
    #To Use All Vocab
    self.vocab = {None: 0, 'jason': 134.0, 'office': 14.0, 'yellow': 78.0, 'bedroom': 24.0, 'go': 108.0, 'yes': 15.0, 'antoine': 138.0, 'milk': 139.0, 'before': 46.0, 'grabbed': 128.0, 'fit': 100.0, 'how': 105.0, 'swan': 73.0, 'than': 96.0, 'to': 13.0, 'does': 99.0, 's,e': 110.0, 'east': 102.0, 'rectangle': 82.0, 'gave': 149.0, 'then': 39.0, 'evening': 48.0, 'triangle': 79.0, 'garden': 37.0, 'get': 131.0, 'football,apple,milk': 179.0, 'they': 41.0, 'not': 178.0, 'bigger': 95.0, 'gray': 77.0, 'school': 6.0, 'apple': 142.0, 'did': 127.0, 'morning': 44.0, 'discarded': 146.0, 'julius': 72.0, 'she': 29.0, 'went': 11.0, 'where': 30.0, 'jeff': 152.0, 'square': 84.0, 'who': 153.0, 'tired': 124.0, 'there': 130.0, 'back': 12.0, 'lion': 70.0, 'are': 50.0, 'picked': 143.0, 'e,e': 119.0, 'pajamas': 129.0, 'Mary': 157.0, 'blue': 83.0, 'what': 63.0, 'container': 98.0, 'rhino': 76.0, 'daniel': 31.0, 'bernhard': 67.0, 'milk,football': 172.0, 'above': 80.0, 'got': 136.0, 'emily': 60.0, 'red': 88.0, 'either': 3.0, 'sheep': 58.0, 'football': 137.0, 'jessica': 61.0, 'do': 106.0, 'Bill': 155.0, 'football,apple': 168.0, 'fred': 1.0, 'winona': 59.0, 'objects': 161.0, 'put': 147.0, 'kitchen': 17.0, 'box': 90.0, 'received': 154.0, 'journeyed': 25.0, 'of': 52.0, 'wolf': 62.0, 'afternoon': 47.0, 'or': 7.0, 'south': 112.0, 's,w': 114.0, 'afterwards': 32.0, 'sumit': 123.0, 'color': 75.0, 'julie': 23.0, 'one': 163.0, 'down': 148.0, 'nothing': 167.0, 'n,n': 113.0, 'right': 86.0, 's,s': 116.0, 'gertrude': 54.0, 'bathroom': 26.0, 'from': 109.0, 'west': 104.0, 'chocolates': 91.0, 'two': 165.0, 'frog': 66.0, '.': 9.0, 'cats': 57.0, 'apple,milk,football': 175.0, 'passed': 158.0, 'apple,football,milk': 176.0, 'white': 71.0, 'john': 35.0, 'was': 45.0, 'mary': 10.0, 'apple,football': 170.0, 'north': 103.0, 'n,w': 111.0, 'that': 28.0, 'park': 8.0, 'took': 141.0, 'chocolate': 101.0, 'carrying': 162.0, 'n,e': 120.0, 'mice': 49.0, 'travelled': 22.0, 'he': 33.0, 'none': 164.0, 'bored': 133.0, 'e,n': 117.0, 'Jeff': 159.0, 'this': 43.0, 'inside': 93.0, 'bill': 16.0, 'up': 144.0, 'cat': 64.0, 'will': 125.0, 'below': 87.0, 'greg': 74.0, 'three': 166.0, 'suitcase': 97.0, 'following': 36.0, 'e,s': 115.0, 'and': 40.0, 'thirsty': 135.0, 'cinema': 19.0, 'is': 2.0, 'moved': 18.0, 'yann': 132.0, 'sphere': 89.0, 'dropped': 145.0, 'in': 4.0, 'mouse': 56.0, 'football,milk': 171.0, 'pink': 81.0, 'afraid': 51.0, 'no': 20.0, 'Fred': 156.0, 'w,s': 121.0, 'handed': 151.0, 'w,w': 118.0, 'brian': 69.0, 'chest': 94.0, 'w,n': 122.0, 'you': 107.0, 'many': 160.0, 'lily': 65.0, 'hallway': 34.0, 'why': 126.0, 'after': 27.0, 'yesterday': 42.0, 'sandra': 38.0, 'fits': 92.0, 'milk,football,apple': 173.0, 'the': 5.0, 'milk,apple': 169.0, 'a': 55.0, 'give': 150.0, 'longer': 177.0, 'maybe': 21.0, 'hungry': 140.0, 'apple,milk': 174.0, 'green': 68.0, 'wolves': 53.0, 'left': 85.0}
    self.ivocab = {0: None, 1: 'fred', 2: 'is', 3: 'either', 4: 'in', 5: 'the', 6: 'school', 7: 'or', 8: 'park', 9: '.', 10: 'mary', 11: 'went', 12: 'back', 13: 'to', 14: 'office', 15: 'yes', 16: 'bill', 17: 'kitchen', 18: 'moved', 19: 'cinema', 20: 'no', 21: 'maybe', 22: 'travelled', 23: 'julie', 24: 'bedroom', 25: 'journeyed', 26: 'bathroom', 27: 'after', 28: 'that', 29: 'she', 30: 'where', 31: 'daniel', 32: 'afterwards', 33: 'he', 34: 'hallway', 35: 'john', 36: 'following', 37: 'garden', 38: 'sandra', 39: 'then', 40: 'and', 41: 'they', 42: 'yesterday', 43: 'this', 44: 'morning', 45: 'was', 46: 'before', 47: 'afternoon', 48: 'evening', 49: 'mice', 50: 'are', 51: 'afraid', 52: 'of', 53: 'wolves', 54: 'gertrude', 55: 'a', 56: 'mouse', 57: 'cats', 58: 'sheep', 59: 'winona', 60: 'emily', 61: 'jessica', 62: 'wolf', 63: 'what', 64: 'cat', 65: 'lily', 66: 'frog', 67: 'bernhard', 68: 'green', 69: 'brian', 70: 'lion', 71: 'white', 72: 'julius', 73: 'swan', 74: 'greg', 75: 'color', 76: 'rhino', 77: 'gray', 78: 'yellow', 79: 'triangle', 80: 'above', 81: 'pink', 82: 'rectangle', 83: 'blue', 84: 'square', 85: 'left', 86: 'right', 87: 'below', 88: 'red', 89: 'sphere', 90: 'box', 91: 'chocolates', 92: 'fits', 93: 'inside', 94: 'chest', 95: 'bigger', 96: 'than', 97: 'suitcase', 98: 'container', 99: 'does', 100: 'fit', 101: 'chocolate', 102: 'east', 103: 'north', 104: 'west', 105: 'how', 106: 'do', 107: 'you', 108: 'go', 109: 'from', 110: 's,e', 111: 'n,w', 112: 'south', 113: 'n,n', 114: 's,w', 115: 'e,s', 116: 's,s', 117: 'e,n', 118: 'w,w', 119: 'e,e', 120: 'n,e', 121: 'w,s', 122: 'w,n', 123: 'sumit', 124: 'tired', 125: 'will', 126: 'why', 127: 'did', 128: 'grabbed', 129: 'pajamas', 130: 'there', 131: 'get', 132: 'yann', 133: 'bored', 134: 'jason', 135: 'thirsty', 136: 'got', 137: 'football', 138: 'antoine', 139: 'milk', 140: 'hungry', 141: 'took', 142: 'apple', 143: 'picked', 144: 'up', 145: 'dropped', 146: 'discarded', 147: 'put', 148: 'down', 149: 'gave', 150: 'give', 151: 'handed', 152: 'jeff', 153: 'who', 154: 'received', 155: 'Bill', 156: 'Fred', 157: 'Mary', 158: 'passed', 159: 'Jeff', 160: 'many', 161: 'objects', 162: 'carrying', 163: 'one', 164: 'none', 165: 'two', 166: 'three', 167: 'nothing', 168: 'football,apple', 169: 'milk,apple', 170: 'apple,football', 171: 'football,milk', 172: 'milk,football', 173: 'milk,football,apple', 174: 'apple,milk', 175: 'apple,milk,football', 176: 'apple,football,milk', 177: 'longer', 178: 'not', 179: 'football,apple,milk'}
    #"""

    self.train_input, self.train_q, self.train_answer, \
        self.train_input_mask = self._process_input(babi_train_raw)
    self.test_input, self.test_q, self.test_answer, \
        self.test_input_mask = self._process_input(babi_test_raw)
    self.vocab_size = len(self.vocab)

    self.input_var = T.imatrix('input_var')
    self.q_var = T.ivector('question_var')
    self.answer_var = T.iscalar('answer_var')
    self.input_mask_var = T.ivector('input_mask_var')
    self.attentions = []

    self.pe_matrix_in = self.pe_matrix(self.max_inp_sent_len)
    self.pe_matrix_q = self.pe_matrix(self.max_q_len)

    print "==> building input module"

    # positional encoder weights
    self.W_pe = nn_utils.normal_param(std=0.1,
                                      shape=(self.vocab_size, self.dim))

    # biGRU input fusion weights
    self.W_inp_res_in_fwd = nn_utils.normal_param(
        std=0.1, shape=(self.dim, self.sent_vector_size))
    self.W_inp_res_hid_fwd = nn_utils.normal_param(
        std=0.1, shape=(self.dim, self.dim))
    self.b_inp_res_fwd = nn_utils.constant_param(value=0.0,
                                                 shape=(self.dim, ))

    self.W_inp_upd_in_fwd = nn_utils.normal_param(
        std=0.1, shape=(self.dim, self.sent_vector_size))
    self.W_inp_upd_hid_fwd = nn_utils.normal_param(
        std=0.1, shape=(self.dim, self.dim))
    self.b_inp_upd_fwd = nn_utils.constant_param(value=0.0,
                                                 shape=(self.dim, ))

    self.W_inp_hid_in_fwd = nn_utils.normal_param(
        std=0.1, shape=(self.dim, self.sent_vector_size))
    self.W_inp_hid_hid_fwd = nn_utils.normal_param(
        std=0.1, shape=(self.dim, self.dim))
    self.b_inp_hid_fwd = nn_utils.constant_param(value=0.0,
                                                 shape=(self.dim, ))

    self.W_inp_res_in_bwd = nn_utils.normal_param(
        std=0.1, shape=(self.dim, self.sent_vector_size))
    self.W_inp_res_hid_bwd = nn_utils.normal_param(
        std=0.1, shape=(self.dim, self.dim))
    self.b_inp_res_bwd = nn_utils.constant_param(value=0.0,
                                                 shape=(self.dim, ))

    self.W_inp_upd_in_bwd = nn_utils.normal_param(
        std=0.1, shape=(self.dim, self.sent_vector_size))
    self.W_inp_upd_hid_bwd = nn_utils.normal_param(
        std=0.1, shape=(self.dim, self.dim))
    self.b_inp_upd_bwd = nn_utils.constant_param(value=0.0,
                                                 shape=(self.dim, ))

    self.W_inp_hid_in_bwd = nn_utils.normal_param(
        std=0.1, shape=(self.dim, self.sent_vector_size))
    self.W_inp_hid_hid_bwd = nn_utils.normal_param(
        std=0.1, shape=(self.dim, self.dim))
    self.b_inp_hid_bwd = nn_utils.constant_param(value=0.0,
                                                 shape=(self.dim, ))

    self.inp_sent_reps, _ = theano.scan(fn=self.sum_pos_encodings_in,
                                        sequences=self.input_var)
    self.inp_sent_reps_stacked = T.stacklists(self.inp_sent_reps)
    self.inp_c = self.input_module_full(self.inp_sent_reps)

    self.q_q = self.sum_pos_encodings_q(self.q_var)

    print "==> creating parameters for memory module"
    self.W_mem_res_in = nn_utils.normal_param(
        std=0.1, shape=(self.memory_hops, self.dim, self.dim))
    self.W_mem_res_hid = nn_utils.normal_param(
        std=0.1, shape=(self.memory_hops, self.dim, self.dim))
    self.b_mem_res = nn_utils.constant_param(
        value=0.0, shape=(self.memory_hops, self.dim, ))
    self.W_mem_upd_in = nn_utils.normal_param(
        std=0.1, shape=(self.memory_hops, self.dim, self.dim))
    self.W_mem_upd_hid = nn_utils.normal_param(
        std=0.1, shape=(self.memory_hops, self.dim, self.dim))
    self.b_mem_upd = nn_utils.constant_param(
        value=0.0, shape=(self.memory_hops, self.dim, ))
    self.W_mem_hid_in = nn_utils.normal_param(
        std=0.1, shape=(self.memory_hops, self.dim, self.dim))
    self.W_mem_hid_hid = nn_utils.normal_param(
        std=0.1, shape=(self.memory_hops, self.dim, self.dim))
    self.b_mem_hid = nn_utils.constant_param(
        value=0.0, shape=(self.memory_hops, self.dim, ))

    #self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    #self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0))
    self.W_1 = nn_utils.normal_param(
        std=0.1, shape=(self.memory_hops, self.dim, 4 * self.dim + 0))
    self.W_2 = nn_utils.normal_param(
        std=0.1, shape=(self.memory_hops, 1, self.dim))
    self.b_1 = nn_utils.constant_param(
        value=0.0, shape=(self.memory_hops, self.dim, ))
    self.b_2 = nn_utils.constant_param(
        value=0.0, shape=(self.memory_hops, 1, ))

    print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
    memory = [self.q_q.copy()]
    for iter in range(1, self.memory_hops + 1):
        self.mem_weight_num = int(iter - 1)
        current_episode = self.new_episode(memory[iter - 1])
        memory.append(self.GRU_update(
            memory[iter - 1], current_episode,
            self.W_mem_res_in[self.mem_weight_num],
            self.W_mem_res_hid[self.mem_weight_num],
            self.b_mem_res[self.mem_weight_num],
            self.W_mem_upd_in[self.mem_weight_num],
            self.W_mem_upd_hid[self.mem_weight_num],
            self.b_mem_upd[self.mem_weight_num],
            self.W_mem_hid_in[self.mem_weight_num],
            self.W_mem_hid_hid[self.mem_weight_num],
            self.b_mem_hid[self.mem_weight_num]))

    last_mem_raw = memory[-1].dimshuffle(('x', 0))

    net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw)
    if self.dropout > 0 and self.mode == 'train':
        net = layers.DropoutLayer(net, p=self.dropout)
    last_mem = layers.get_output(net)[0]

    print "==> building answer module"
    self.W_a = nn_utils.normal_param(std=0.1,
                                     shape=(self.vocab_size, self.dim))

    if self.answer_module == 'feedforward':
        self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem))

    elif self.answer_module == 'recurrent':
        self.W_ans_res_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.dim + self.vocab_size))
        self.W_ans_res_hid = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.dim))
        self.b_ans_res = nn_utils.constant_param(value=0.0,
                                                 shape=(self.dim, ))

        self.W_ans_upd_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.dim + self.vocab_size))
        self.W_ans_upd_hid = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.dim))
        self.b_ans_upd = nn_utils.constant_param(value=0.0,
                                                 shape=(self.dim, ))

        self.W_ans_hid_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.dim + self.vocab_size))
        self.W_ans_hid_hid = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.dim))
        self.b_ans_hid = nn_utils.constant_param(value=0.0,
                                                 shape=(self.dim, ))

        def answer_step(prev_a, prev_y):
            a = self.GRU_update(prev_a,
                                T.concatenate([prev_y, self.q_q]),
                                self.W_ans_res_in, self.W_ans_res_hid,
                                self.b_ans_res,
                                self.W_ans_upd_in, self.W_ans_upd_hid,
                                self.b_ans_upd,
                                self.W_ans_hid_in, self.W_ans_hid_hid,
                                self.b_ans_hid)
            y = nn_utils.softmax(T.dot(self.W_a, a))
            return [a, y]

        # add conditional ending?
        dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX))

        results, updates = theano.scan(
            fn=answer_step,
            outputs_info=[last_mem, T.zeros_like(dummy)],
            n_steps=1)
        self.prediction = results[1][-1]

    else:
        raise Exception("invalid answer_module")

    print "==> collecting all parameters"
    self.params = [
        self.W_pe,
        self.W_inp_res_in_fwd, self.W_inp_res_hid_fwd, self.b_inp_res_fwd,
        self.W_inp_upd_in_fwd, self.W_inp_upd_hid_fwd, self.b_inp_upd_fwd,
        self.W_inp_hid_in_fwd, self.W_inp_hid_hid_fwd, self.b_inp_hid_fwd,
        self.W_inp_res_in_bwd, self.W_inp_res_hid_bwd, self.b_inp_res_bwd,
        self.W_inp_upd_in_bwd, self.W_inp_upd_hid_bwd, self.b_inp_upd_bwd,
        self.W_inp_hid_in_bwd, self.W_inp_hid_hid_bwd, self.b_inp_hid_bwd,
        self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res,
        self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
        self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid,
        #self.W_b
        self.W_1, self.W_2, self.b_1, self.b_2,
        self.W_a
    ]

    if self.answer_module == 'recurrent':
        self.params = self.params + [
            self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
            self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
            self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid
        ]

    print "==> building loss layer and computing updates"
    self.loss_ce = T.nnet.categorical_crossentropy(
        self.prediction.dimshuffle('x', 0),
        T.stack([self.answer_var]))[0]
    if self.l2 > 0:
        self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
    else:
        self.loss_l2 = 0

    self.loss = self.loss_ce + self.loss_l2

    updates = lasagne.updates.adam(self.loss, self.params,
                                   learning_rate=0.0001,
                                   beta1=0.5)  # from DCGAN paper

    self.attentions = T.stack(self.attentions)

    if self.mode == 'train':
        print "==> compiling train_fn"
        self.train_fn = theano.function(
            inputs=[self.input_var, self.q_var, self.answer_var,
                    self.input_mask_var],
            outputs=[self.prediction, self.loss, self.attentions],
            updates=updates,
            on_unused_input='warn',
            allow_input_downcast=True)

    print "==> compiling test_fn"
    self.test_fn = theano.function(
        inputs=[self.input_var, self.q_var, self.answer_var,
                self.input_mask_var],
        outputs=[self.prediction, self.loss, self.attentions],
        on_unused_input='warn',
        allow_input_downcast=True)
def build_finetune_functions(self, datasets, batch_size, learning_rate):
    '''Generates a function `train` that implements one step of
    finetuning, a function `validate` that computes the error on a
    batch from the validation set, and a function `test` that computes
    the error on a batch from the testing set

    :type datasets: list of pairs of theano.tensor.TensorType
    :param datasets: It is a list that contains all the datasets;
                     it has to contain three pairs, `train`, `valid`,
                     `test` in this order, where each pair is formed
                     of two Theano variables, one for the datapoints,
                     the other for the labels
    :type batch_size: int
    :param batch_size: size of a minibatch
    :type learning_rate: float
    :param learning_rate: learning rate used during finetune stage
    '''

    (train_set_x, train_set_y) = datasets[0]
    (valid_set_x, valid_set_y) = datasets[1]
    (test_set_x, test_set_y) = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_valid_batches /= batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_test_batches /= batch_size

    index = T.lscalar('index')  # index to a [mini]batch

    # compute the gradients with respect to the model parameters
    gparams = T.grad(self.finetune_cost, self.params)

    # compute list of fine-tuning updates
    updates = {}
    for param, gparam in zip(self.params, gparams):
        updates[param] = param - gparam * learning_rate

    train_fn = theano.function(
        inputs=[index],
        outputs=self.finetune_cost,
        updates=updates,
        givens={
            self.x: train_set_x[index * batch_size:
                                (index + 1) * batch_size],
            self.y: train_set_y[index * batch_size:
                                (index + 1) * batch_size]
        })

    test_score_i = theano.function(
        [index],
        self.errors,
        givens={
            self.x: test_set_x[index * batch_size:
                               (index + 1) * batch_size],
            self.y: test_set_y[index * batch_size:
                               (index + 1) * batch_size]
        })

    valid_score_i = theano.function(
        [index],
        self.errors,
        givens={
            self.x: valid_set_x[index * batch_size:
                                (index + 1) * batch_size],
            self.y: valid_set_y[index * batch_size:
                                (index + 1) * batch_size]
        })

    # Create a function that scans the entire validation set
    def valid_score():
        return [valid_score_i(i) for i in xrange(n_valid_batches)]

    # Create a function that scans the entire test set
    def test_score():
        return [test_score_i(i) for i in xrange(n_test_batches)]

    return train_fn, valid_score, test_score
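# Usage sketch in the style of the deeplearning.net tutorials this function
# comes from (`model`, `n_epochs`, and `n_train_batches` are illustrative
# names, not from the snippet above):
train_fn, valid_score, test_score = model.build_finetune_functions(
    datasets=datasets, batch_size=batch_size, learning_rate=0.1)
for epoch in xrange(n_epochs):
    for minibatch_index in xrange(n_train_batches):
        minibatch_cost = train_fn(minibatch_index)
    # mean error over the whole validation set
    validation_loss = numpy.mean(valid_score())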
def test_machine_translation(self):
    """
    This test case comes from https://github.com/rizar/scan-grad-speed and
    is an example of actual computation done with scan in the context of
    machine translation

    'dim' has been reduced from 1000 to 5 to make the test run faster
    """

    # Parameters from an actual machine translation run
    batch_size = 80
    seq_len = 50
    n_words = 80 * 50
    dim = 5

    # Weight matrices
    U = theano.shared(
        numpy.random.normal(size=(dim, dim),
                            scale=0.0001).astype(config.floatX))
    U.name = 'U'
    V = theano.shared(U.get_value())
    V.name = 'V'
    W = theano.shared(U.get_value())
    W.name = 'W'

    # Variables and their values
    x = T.tensor3('x')
    x_value = numpy.random.normal(size=(seq_len, batch_size, dim),
                                  scale=0.0001).astype(config.floatX)

    ri = T.tensor3('ri')
    ri_value = x_value

    zi = T.tensor3('zi')
    zi_value = x_value

    init = T.alloc(numpy.cast[config.floatX](0), batch_size, dim)

    def rnn_step1(
            # sequences
            x, ri, zi,
            # outputs_info
            h):
        pre_r = ri + h.dot(U)
        pre_z = zi + h.dot(V)
        r = T.nnet.sigmoid(pre_r)
        z = T.nnet.sigmoid(pre_z)

        after_r = r * h
        pre_h = x + after_r.dot(W)
        new_h = T.tanh(pre_h)

        res_h = z * new_h + (1 - z) * h
        return res_h

    # Compile the function twice, once with the optimization and once
    # without
    opt_mode = mode.including("scan")
    h, _ = theano.scan(rnn_step1,
                       sequences=[x, ri, zi],
                       n_steps=seq_len,
                       outputs_info=init,
                       name='fpass1',
                       mode=opt_mode)
    cost = h[-1].sum()
    grad1 = T.grad(cost, [U, V, W])
    f_opt = theano.function(inputs=[x, ri, zi], outputs=grad1,
                            mode=opt_mode)

    no_opt_mode = mode.excluding("scanOp_pushout_output")
    h, _ = theano.scan(rnn_step1,
                       sequences=[x, ri, zi],
                       n_steps=seq_len,
                       outputs_info=init,
                       name='fpass1',
                       mode=no_opt_mode)
    cost = h[-1].sum()
    grad1 = T.grad(cost, [U, V, W])
    f_no_opt = theano.function(inputs=[x, ri, zi], outputs=grad1,
                               mode=no_opt_mode)

    # Validate that the optimization has been applied
    scan_node_grad = [
        node for node in f_opt.maker.fgraph.toposort()
        if isinstance(node.op, Scan)
    ][1]

    for output in scan_node_grad.op.outputs:
        assert not (
            isinstance(output.owner.op, T.elemwise.Elemwise)
            and any([isinstance(i, T.Dot) for i in output.owner.inputs]))

    # Compare the outputs of the two functions on the same input data.
    f_opt_output = f_opt(x_value, ri_value, zi_value)
    f_no_opt_output = f_no_opt(x_value, ri_value, zi_value)
    utt.assert_allclose(f_opt_output, f_no_opt_output)
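# A minimal sketch (not from the original test) of the graph-inspection
# idiom used above: enumerate the Scan nodes in the optimized graph of any
# compiled function `f` (placeholder name).
from theano.scan_module.scan_op import Scan

scan_nodes = [node for node in f.maker.fgraph.toposort()
              if isinstance(node.op, Scan)]
print('found %d scan node(s)' % len(scan_nodes))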
# updates from ADAM
updates = Adam(cost, params)

###########################################################
###########################################################
############ THEANO FUNC. FOR TRAINING, VAL., ETC. #######
###########################################################

print '....compiling training and testing functions'
train_model = theano.function([sent, phonemes],
                              outputs=[cost, encoder_cost,
                                       cross_entropy_cost],
                              updates=updates,
                              givens={
                                  x: sent[:-1],
                                  ahead: sent[1:],
                                  y: phonemes[:-1]
                              })
probe_model = theano.function([sent, phonemes],
                              outputs=[cost, encoder_cost,
                                       cross_entropy_cost],
                              givens={
                                  x: sent[:-1],
                                  ahead: sent[1:],
                                  y: phonemes[:-1]
                              })
validate_model = theano.function(
    inputs=[sent, phonemes],
    outputs=[cost, encoder_cost, cross_entropy_cost],
    # assumed completion: the original snippet was truncated here; the same
    # givens as train_model/probe_model are a reasonable guess
    givens={
        x: sent[:-1],
        ahead: sent[1:],
        y: phonemes[:-1]
    })
def train_doctorAI( seqFile='seqFile.txt', inputDimSize=20000, labelFile='labelFile.txt', numClass=500, outFile='outFile.txt', timeFile='timeFile.txt', predictTime=False, tradeoff=1.0, useLogTime=True, embFile='embFile.txt', embSize=200, embFineTune=True, hiddenDimSize=[200,200], batchSize=100, max_epochs=10, L2_output=0.001, L2_time=0.001, dropout_rate=0.5, logEps=1e-8, verbose=False ): options = locals().copy() if len(timeFile) > 0: useTime = True else: useTime = False options['useTime'] = useTime print 'Initializing the parameters ... ', params = init_params(options) tparams = init_tparams(params, options) print 'Building the model ... ', f_grad_shared = None f_update = None if predictTime and embFineTune: print 'predicting duration, fine-tuning code representations' use_noise, x, y, t, t_label, mask, lengths, cost = build_model(tparams, options) grads = T.grad(cost, wrt=tparams.values()) f_grad_shared, f_update = adadelta(tparams, grads, x, y, mask, lengths, cost, options, t, t_label) elif predictTime and not embFineTune: print 'predicting duration, not fine-tuning code representations' W_emb = theano.shared(params['W_emb'], name='W_emb') use_noise, x, y, t, t_label, mask, lengths, cost = build_model(tparams, options, W_emb) grads = T.grad(cost, wrt=tparams.values()) f_grad_shared, f_update = adadelta(tparams, grads, x, y, mask, lengths, cost, options, t, t_label) elif useTime and embFineTune: print 'using duration information, fine-tuning code representations' use_noise, x, y, t, mask, lengths, cost = build_model(tparams, options) grads = T.grad(cost, wrt=tparams.values()) f_grad_shared, f_update = adadelta(tparams, grads, x, y, mask, lengths, cost, options, t) elif useTime and not embFineTune: print 'using duration information, not fine-tuning code representations' W_emb = theano.shared(params['W_emb'], name='W_emb') use_noise, x, y, t, mask, lengths, cost = build_model(tparams, options, W_emb) grads = T.grad(cost, wrt=tparams.values()) f_grad_shared, f_update = adadelta(tparams, grads, x, y, mask, lengths, cost, options, t) elif not useTime and embFineTune: print 'not using duration information, fine-tuning code representations' use_noise, x, y, mask, lengths, cost = build_model(tparams, options) grads = T.grad(cost, wrt=tparams.values()) f_grad_shared, f_update = adadelta(tparams, grads, x, y, mask, lengths, cost, options) elif not useTime and not embFineTune: print 'not using duration information, not fine-tuning code representations' W_emb = theano.shared(params['W_emb'], name='W_emb') use_noise, x, y, mask, lengths, cost = build_model(tparams, options, W_emb) grads = T.grad(cost, wrt=tparams.values()) f_grad_shared, f_update = adadelta(tparams, grads, x, y, mask, lengths, cost, options) print 'Loading data ... ', trainSet, validSet, testSet = load_data(seqFile, labelFile, timeFile) n_batches = int(np.ceil(float(len(trainSet[0])) / float(batchSize))) print 'done' if predictTime: test_model = theano.function(inputs=[x, y, t, t_label, mask, lengths], outputs=cost, name='test_model') elif useTime: test_model = theano.function(inputs=[x, y, t, mask, lengths], outputs=cost, name='test_model') else: test_model = theano.function(inputs=[x, y, mask, lengths], outputs=cost, name='test_model') bestValidCrossEntropy = 1e20 bestValidEpoch = 0 testCrossEntropy = 0.0 print 'Optimization start !!' for epoch in xrange(max_epochs): iteration = 0 costVector = [] for index in random.sample(range(n_batches), n_batches): use_noise.set_value(1.) 
batchX = trainSet[0][index*batchSize:(index+1)*batchSize] batchY = trainSet[1][index*batchSize:(index+1)*batchSize] if predictTime: batchT = trainSet[2][index*batchSize:(index+1)*batchSize] x, y, t, t_label, mask, lengths = padMatrixWithTimePrediction(batchX, batchY, batchT, options) cost = f_grad_shared(x, y, t, t_label, mask, lengths) elif useTime: batchT = trainSet[2][index*batchSize:(index+1)*batchSize] x, y, t, mask, lengths = padMatrixWithTime(batchX, batchY, batchT, options) cost = f_grad_shared(x, y, t, mask, lengths) else: x, y, mask, lengths = padMatrixWithoutTime(batchX, batchY, options) cost = f_grad_shared(x, y, mask, lengths) costVector.append(cost) f_update() if (iteration % 10 == 0) and verbose: print 'epoch:%d, iteration:%d/%d, cost:%f' % (epoch, iteration, n_batches, cost) iteration += 1 print 'epoch:%d, mean_cost:%f' % (epoch, np.mean(costVector)) use_noise.set_value(0.) validAuc = calculate_auc(test_model, validSet, options) print 'Validation cross entropy:%f at epoch:%d' % (validAuc, epoch) if validAuc < bestValidCrossEntropy: bestValidCrossEntropy = validAuc bestValidEpoch = epoch bestParams = unzip(tparams) testCrossEntropy = calculate_auc(test_model, testSet, options) print 'Test cross entropy:%f at epoch:%d' % (testCrossEntropy, epoch) tempParams = unzip(tparams) np.savez_compressed(outFile + '.' + str(epoch), **tempParams) print 'The best valid cross entropy:%f at epoch:%d' % (bestValidCrossEntropy, bestValidEpoch) print 'The test cross entropy: %f' % testCrossEntropy
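# Hypothetical invocation of train_doctorAI (not from the original source);
# every file path here is a placeholder, and timeFile='' selects the
# no-duration branch.
train_doctorAI(seqFile='seqs.pkl',
               labelFile='labels.pkl',
               timeFile='',            # empty string disables duration input
               outFile='models/doctorai',
               embFineTune=True,
               hiddenDimSize=[200, 200],
               batchSize=100,
               max_epochs=10,
               verbose=True)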
def evaluate_lenet5( learning_rate=0.1, n_epochs=200, dataset="mnist.pkl.gz", nkerns=[20, 50], batch_size=500, ): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(23455) datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # display some chars: display_some(train_set_x, train_set_y.eval(), n=5, title="label=") # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches //= batch_size n_valid_batches //= batch_size n_test_batches //= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix("x") # the data is presented as rasterized images y = T.ivector("y") # the labels are presented as 1D vector of [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print("... building the model") # Reshape matrix of rasterized images of shape (batch_size, 28 * 28) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (28, 28) is the size of MNIST images. layer0_input = x.reshape((batch_size, 1, 28, 28)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) # maxpooling reduces this further to (24/2, 24/2) = (12, 12) # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12) layer0 = LeNetConvPoolLayer( rng, input=layer0_input, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2), ) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) layer1 = LeNetConvPoolLayer( rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2), ) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4), # or (500, 50 * 4 * 4) = (500, 800) with the default values. 
layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], [layer3.errors(y), layer3.y_pred], givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size], }, ) validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size], }, ) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)] train_model = theano.function( inputs=[index], outputs=[cost, layer3.errors(y)], updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size], }, ) ############### # TRAIN MODEL # ############### print("... 
training") # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience // 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0.0 start_time = timeit.default_timer() epoch = 0 done_looping = False # for error_curve plot cost_train = [] # observe likelihood cost while training err_train = [] # observe train err while training err_valid = [] # observe valid err while training err_test = [] # observe test err while training while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in range(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print("training @ iter = ", iter) train_outputs = train_model(minibatch_index) cost_ij = train_outputs[0] err_train.append(train_outputs[1]) # add error_train if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in range(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) err_valid.append(this_validation_loss) print("epoch %i, minibatch %i/%i, validation error %f %%" % ( epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.0, )) # if we got the best validation score until now if this_validation_loss < best_validation_loss: # improve patience if loss improvement is good enough if (this_validation_loss < best_validation_loss * improvement_threshold): patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i)[0] for i in range(n_test_batches) ] test_score = numpy.mean(test_losses) err_test.append(test_score) print((" epoch %i, minibatch %i/%i, test error of " "best model %f %%") % ( epoch, minibatch_index + 1, n_train_batches, test_score * 100.0, )) """ # save the best model with open('../doc/data/best_model.pkl', 'wb') as f: pickle.dump(layer0, layer1, layer2, layer3, f) """ if patience <= iter: done_looping = True break end_time = timeit.default_timer() print("Optimization complete.") print("Best validation score of %f %% obtained at iteration %i, " "with test performance %f %%" % (best_validation_loss * 100.0, best_iter + 1, test_score * 100.0)) print( ("The code for file " + os.path.split(__file__)[1] + " ran for %.2fm" % ((end_time - start_time) / 60.0)), file=sys.stderr, ) model = [layer0, layer1, layer2, layer3] # save the best model with open("../doc/data/best_model.pkl", "wb") as f: pickle.dump(model, f) test_pred_y = test_model(0)[1] # predict on first batch_size sampless # display some chars using predict display_some(test_set_x, test_pred_y, n=5, title="pred=") # n < batch_size return err_train, err_valid, err_test
def _run(self, num_features, num_timesteps, batch_size, mode):
    # determine shapes of inputs and targets depending on the batch size
    if batch_size == 1:
        inputs_size = (num_timesteps, num_features)
        targets_size = (num_timesteps, 1)
    else:
        inputs_size = (num_timesteps, batch_size, num_features)
        targets_size = (num_timesteps, batch_size, 1)

    # make inputs and targets shared variables
    inputs = theano.shared(self.rng.uniform(size=inputs_size).astype(
        config.floatX), borrow=True)
    targets = theano.shared(self.rng.uniform(size=targets_size).astype(
        config.floatX), borrow=True)

    # create symbolic inputs and targets variables
    if batch_size == 1:
        x = T.matrix('inputs')
        t = T.matrix('targets')
    else:
        x = T.tensor3('inputs')
        t = T.tensor3('targets')
    x.tag.test_value = inputs.get_value(borrow=True)
    t.tag.test_value = targets.get_value(borrow=True)

    # create a set of parameters for a simple RNN
    W_xh = theano.shared(
        (0.01 * self.rng.uniform(size=(num_features, 10))).astype(
            config.floatX), borrow=True)
    W_hh = theano.shared(
        (0.01 * self.rng.uniform(size=(10, 10))).astype(config.floatX),
        borrow=True)
    W_hy = theano.shared(
        (0.01 * self.rng.uniform(size=(10, 1))).astype(config.floatX),
        borrow=True)
    b_h = theano.shared(numpy.zeros(10).astype(config.floatX), borrow=True)
    b_y = theano.shared(numpy.zeros(1).astype(config.floatX), borrow=True)

    params = [W_xh, W_hh, W_hy, b_h, b_y]

    # recurrent function
    def step(x_t, h_tm1):
        h = T.tanh(T.dot(h_tm1, W_hh) + T.dot(x_t, W_xh) + b_h)
        return h

    # build recurrent graph
    if batch_size == 1:
        h_0 = T.alloc(0.0, 10).astype(config.floatX)
    else:
        h_0 = T.alloc(0.0, batch_size, 10).astype(config.floatX)
    h, updates = theano.scan(step, sequences=[x], outputs_info=[h_0])

    # network output
    y = T.dot(h, W_hy) + b_y

    # Create Gauss-Newton-Matrix object. Not really of any use here, but I
    # need it for Hessian-Free optimization.
    gn = GaussNewtonMatrix(y)

    # compute MSE
    cost = ((t - y)**2).sum(axis=1).mean()

    # Compute the cost at some other point in the parameter
    # space. Not really of any use here, but this is how I do it
    # during certain iterations of CG in the HF algorithm. There,
    # it's in fact `pi + current update proposal`. For simplicity,
    # I just multiply by 2 here.
    cost_ = theano.clone(cost,
                         replace=dict([(pi, 2 * pi) for pi in params]))

    # Compute Gauss-Newton-Matrix times some vector `v` which is `p` in CG,
    # but for simplicity, I just take the parameters vector because it's
    # already there.
    Gv = gn(v=params, cost=cost, parameters=params, damp=T.constant(1.0))

    # compile Theano function
    f = theano.function([], [cost_] + Gv,
                        givens={
                            x: inputs,
                            t: targets
                        }, mode=mode)

    # execute
    f()
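# Standalone sketch (not from the original test) of the theano.clone trick
# used above: re-evaluate the same symbolic cost with the parameters
# replaced, without rebuilding the graph. All names are local to this sketch.
import numpy
import theano
import theano.tensor as T

p = theano.shared(numpy.array([1.0, 2.0]))
cost = (p ** 2).sum()                                 # cost(p)  = 5 at p=[1,2]
cost_at_2p = theano.clone(cost, replace={p: 2 * p})   # cost(2p) = 20

f = theano.function([], [cost, cost_at_2p])
print(f())  # expected: [array(5.0), array(20.0)]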
params = layers.get_all_params(NET, trainable=True)

# The dynamic learning rate is applied during the training process
lr_dynamic = T.scalar(name='learning_rate')

# The adam update method is used to update the params based on the loss
# function & the learning rate
param_updates = updates.adam(loss, params, learning_rate=lr_dynamic)

#################### TRAIN FUNCTION ######################
# The theano train function takes images and class targets as input
# It updates the parameters of the net and returns the current loss as a
# float value

# Compiling theano functions
#print "COMPILING THEANO TRAIN FUNCTION...",
train_net = theano.function(
    [layers.get_all_layers(NET)[0].input_var, targets, lr_dynamic],
    loss,
    updates=param_updates)

################# PREDICTION FUNCTION ####################
# The prediction function is used to calculate the validation accuracy
# First the CNN's output is retrieved
net_output = layers.get_output(NET)

# Compiling theano test function
print "COMPILING THEANO TEST FUNCTION...",
test_net = theano.function(
    [layers.get_all_layers(NET)[0].input_var, targets],
    [net_output, loss, accuracy])

##################### STAT PLOT #########################
plt.ion()
def main(): # step 1: load the data, transform as needed train = loadmat('../large_files/train_32x32.mat') test = loadmat('../large_files/test_32x32.mat') # Need to scale! don't leave as 0..255 # Y is a N x 1 matrix with values 1..10 (MATLAB indexes by 1) # So flatten it and make it 0..9 # Also need indicator matrix for cost calculation Xtrain = rearrange(train['X']) Ytrain = train['y'].flatten() - 1 del train Xtrain, Ytrain = shuffle(Xtrain, Ytrain) Ytrain_ind = y2indicator(Ytrain) Xtest = rearrange(test['X']) Ytest = test['y'].flatten() - 1 del test Ytest_ind = y2indicator(Ytest) max_iter = 8 print_period = 10 lr = np.float32(0.00001) reg = np.float32(0.01) mu = np.float32(0.99) N = Xtrain.shape[0] batch_sz = 500 n_batches = N / batch_sz M = 500 K = 10 poolsz = (2, 2) # after conv will be of dimension 32 - 5 + 1 = 28 # after downsample 28 / 2 = 14 W1_shape = ( 20, 3, 5, 5 ) # (num_feature_maps, num_color_channels, filter_width, filter_height) W1_init = init_filter(W1_shape, poolsz) b1_init = np.zeros(W1_shape[0], dtype=np.float32) # one bias per output feature map # after conv will be of dimension 14 - 5 + 1 = 10 # after downsample 10 / 2 = 5 W2_shape = ( 50, 20, 5, 5 ) # (num_feature_maps, old_num_feature_maps, filter_width, filter_height) W2_init = init_filter(W2_shape, poolsz) b2_init = np.zeros(W2_shape[0], dtype=np.float32) # vanilla ANN weights W3_init = np.random.randn(W2_shape[0] * 5 * 5, M) / np.sqrt(W2_shape[0] * 5 * 5 + M) b3_init = np.zeros(M, dtype=np.float32) W4_init = np.random.randn(M, K) / np.sqrt(M + K) b4_init = np.zeros(K, dtype=np.float32) # step 2: define theano variables and expressions X = T.tensor4('X', dtype='float32') Y = T.matrix('T') W1 = theano.shared(W1_init, 'W1') b1 = theano.shared(b1_init, 'b1') W2 = theano.shared(W2_init, 'W2') b2 = theano.shared(b2_init, 'b2') W3 = theano.shared(W3_init.astype(np.float32), 'W3') b3 = theano.shared(b3_init, 'b3') W4 = theano.shared(W4_init.astype(np.float32), 'W4') b4 = theano.shared(b4_init, 'b4') # momentum changes dW1 = theano.shared(np.zeros(W1_init.shape, dtype=np.float32), 'dW1') db1 = theano.shared(np.zeros(b1_init.shape, dtype=np.float32), 'db1') dW2 = theano.shared(np.zeros(W2_init.shape, dtype=np.float32), 'dW2') db2 = theano.shared(np.zeros(b2_init.shape, dtype=np.float32), 'db2') dW3 = theano.shared(np.zeros(W3_init.shape, dtype=np.float32), 'dW3') db3 = theano.shared(np.zeros(b3_init.shape, dtype=np.float32), 'db3') dW4 = theano.shared(np.zeros(W4_init.shape, dtype=np.float32), 'dW4') db4 = theano.shared(np.zeros(b4_init.shape, dtype=np.float32), 'db4') # forward pass Z1 = convpool(X, W1, b1) Z2 = convpool(Z1, W2, b2) Z3 = relu(Z2.flatten(ndim=2).dot(W3) + b3) pY = T.nnet.softmax(Z3.dot(W4) + b4) # define the cost function and prediction params = (W1, b1, W2, b2, W3, b3, W4, b4) reg_cost = reg * np.sum((param * param).sum() for param in params) cost = -(Y * T.log(pY)).sum() + reg_cost prediction = T.argmax(pY, axis=1) # step 3: training expressions and functions update_W1 = W1 + mu * dW1 - lr * T.grad(cost, W1) update_b1 = b1 + mu * db1 - lr * T.grad(cost, b1) update_W2 = W2 + mu * dW2 - lr * T.grad(cost, W2) update_b2 = b2 + mu * db2 - lr * T.grad(cost, b2) update_W3 = W3 + mu * dW3 - lr * T.grad(cost, W3) update_b3 = b3 + mu * db3 - lr * T.grad(cost, b3) update_W4 = W4 + mu * dW4 - lr * T.grad(cost, W4) update_b4 = b4 + mu * db4 - lr * T.grad(cost, b4) # update weight changes update_dW1 = mu * dW1 - lr * T.grad(cost, W1) update_db1 = mu * db1 - lr * T.grad(cost, b1) update_dW2 = mu * dW2 - lr * 
T.grad(cost, W2) update_db2 = mu * db2 - lr * T.grad(cost, b2) update_dW3 = mu * dW3 - lr * T.grad(cost, W3) update_db3 = mu * db3 - lr * T.grad(cost, b3) update_dW4 = mu * dW4 - lr * T.grad(cost, W4) update_db4 = mu * db4 - lr * T.grad(cost, b4) train = theano.function( inputs=[X, Y], updates=[ (W1, update_W1), (b1, update_b1), (W2, update_W2), (b2, update_b2), (W3, update_W3), (b3, update_b3), (W4, update_W4), (b4, update_b4), (dW1, update_dW1), (db1, update_db1), (dW2, update_dW2), (db2, update_db2), (dW3, update_dW3), (db3, update_db3), (dW4, update_dW4), (db4, update_db4), ], ) # create another function for this because we want it over the whole dataset get_prediction = theano.function( inputs=[X, Y], outputs=[cost, prediction], ) t0 = datetime.now() LL = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] train(Xbatch, Ybatch) if j % print_period == 0: cost_val, prediction_val = get_prediction(Xtest, Ytest_ind) err = error_rate(prediction_val, Ytest) print "Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % ( i, j, cost_val, err) LL.append(cost_val) print "Elapsed time:", (datetime.now() - t0) plt.plot(LL) plt.show()
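# A compact equivalent (a sketch, not from the original script) of the
# sixteen hand-written momentum update pairs above; `params`, `cost`, `mu`
# and `lr` are the names already defined in main().
dparams = (dW1, db1, dW2, db2, dW3, db3, dW4, db4)
momentum_updates = []
for p, dp in zip(params, dparams):
    g = T.grad(cost, p)
    momentum_updates.append((p, p + mu * dp - lr * g))  # apply velocity
    momentum_updates.append((dp, mu * dp - lr * g))     # refresh velocity
train = theano.function(inputs=[X, Y], updates=momentum_updates)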
w3, b3 = init_weights_bias2((num_filters2 * 3 * 3, 100), X.dtype) w4, b4 = init_weights_bias2((100, 10), X.dtype) y1, o1, y2, o2, py_x = model(X, w1, b1, w2, b2, w3, b3, w4, b4) y_x = T.argmax(py_x, axis=1) cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y)) params = [w1, b1, w2, b2, w3, b3, w4, b4] updates = sgd(cost, params, learningrate, decayparameter) updates2 = sgd_momentum(cost, params, learningrate, decayparameter, momentum) updates3 = RMSprop(cost, params, learningrateRMS, decayparameterRMS, p, ebs) train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True) train2 = theano.function(inputs=[X, Y], outputs=cost, updates=updates2, allow_input_downcast=True) train3 = theano.function(inputs=[X, Y], outputs=cost, updates=updates3, allow_input_downcast=True) predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True) test = theano.function(inputs=[X], outputs=[y1, o1, y2, o2], allow_input_downcast=True)
def train(data, layers, negative_importance, negative_threshold,
          entropy_importance, updates_function, batch_size=10,
          epoch_size=200, initial_patience=1000,
          improvement_threshold=0.99, patience_increase=5,
          max_iter=100000):
    '''
    Utility function for training a siamese net for (potentially
    cross-modal) embedding of sequences.  Assumes data['X']['train'][n]
    should be mapped close to data['Y']['train'][m] only when n == m

    The networks for hashing sequences from each modality should be given
    in the ``layers`` dictionary (see below).

    Parameters
    ----------
    data : dict of dict of list of np.ndarray
        Dict with keys ``'X'`` and ``'Y'``, corresponding to each modality,
        with each key mapping to a dict with keys ``'train'`` and
        ``'validate'``, each of which contains a list of np.ndarrays of
        shape ``(n_filters, n_time_steps, n_features)``.
    layers : dict of list of lasagne.layers.Layer
        This should be a dict with two keys, ``'X'`` and ``'Y'``, with each
        key mapping to a list of ``lasagne.layers.Layer`` instances
        corresponding to the layers in each network.  The only constraints
        are that the input shape should match the shape produced by
        ``sample_sequences`` when it's called with the provided data arrays
        (``data['X']['train']``, etc.) and that the output dimensionality
        of both networks should be the same.
    negative_importance : float
        Scaling parameter for cross-modality negative example cost
    negative_threshold : int
        Cross-modality negative example threshold
    entropy_importance : float
        Scaling parameter for hash entropy encouraging term
    updates_function : function
        Function for computing updates, probably from ``lasagne.updates``.
        Should take two arguments, a Theano tensor variable and a list of
        shared variables, and should return a dictionary of updates for
        those parameters (all other arguments, such as learning rate,
        should be factored out).
    batch_size : int
        Mini-batch size
    epoch_size : int
        Number of mini-batches per epoch
    initial_patience : int
        Always train on at least this many batches
    improvement_threshold : float
        Validation cost must decrease by this factor to increase patience
    patience_increase : int
        How many more epochs should we wait when we increase patience
    max_iter : int
        Maximum number of batches to train on

    Returns
    -------
    epoch : iterator
        Results for each epoch are yielded
    '''
    # First neural net, for X modality
    X_p_input = T.tensor4('X_p_input')
    X_n_input = T.tensor4('X_n_input')
    # For eval
    X_input = T.tensor4('X_input')
    # Second neural net, for Y modality
    Y_p_input = T.tensor4('Y_p_input')
    Y_n_input = T.tensor4('Y_n_input')
    Y_input = T.tensor4('Y_input')

    # Compute mean(max(0, m - ||a - b||_2)^2)
    def hinge_cost(m, a, b):
        dist = m - T.sqrt(T.sum((a - b)**2, axis=1))
        return T.mean((dist * (dist > 0))**2)

    def hasher_cost(deterministic):
        X_p_output = lasagne.layers.get_output(layers['X'][-1],
                                               X_p_input,
                                               deterministic=deterministic)
        X_n_output = lasagne.layers.get_output(layers['X'][-1],
                                               X_n_input,
                                               deterministic=deterministic)
        Y_p_output = lasagne.layers.get_output(layers['Y'][-1],
                                               Y_p_input,
                                               deterministic=deterministic)
        Y_n_output = lasagne.layers.get_output(layers['Y'][-1],
                                               Y_n_input,
                                               deterministic=deterministic)
        # Unthresholded, unscaled cost of positive examples across modalities
        cost_p = T.mean(T.sum((X_p_output - Y_p_output)**2, axis=1))
        # Thresholded, scaled cost of cross-modality negative examples
        cost_n = negative_importance * hinge_cost(negative_threshold,
                                                  X_n_output, Y_n_output)
        # Cost to encourage each output unit to vary
        cost_e = entropy_importance * (T.mean(X_p_output**2) +
                                       T.mean(Y_p_output**2))
        # Sum positive and negative costs for overall cost
        cost = cost_p + cost_n + cost_e
        return cost

    # Combine all parameters from both networks
    params = (lasagne.layers.get_all_params(layers['X'][-1], trainable=True)
              + lasagne.layers.get_all_params(layers['Y'][-1],
                                              trainable=True))
    # Compute gradient descent updates
    updates = updates_function(hasher_cost(False), params)
    # Function for training the network
    train = theano.function([X_p_input, X_n_input, Y_p_input, Y_n_input],
                            hasher_cost(False), updates=updates)
    # Compute cost without training
    cost = theano.function([X_p_input, X_n_input, Y_p_input, Y_n_input],
                           hasher_cost(True))

    # Start with infinite validate cost; we will always increase patience once
    current_validate_cost = np.inf
    patience = initial_patience

    # Functions for computing the neural net output on the train and val sets
    X_output = theano.function(
        [X_input],
        lasagne.layers.get_output(layers['X'][-1], X_input,
                                  deterministic=True))
    Y_output = theano.function(
        [Y_input],
        lasagne.layers.get_output(layers['Y'][-1], Y_input,
                                  deterministic=True))

    # Create sampled sequences for validation
    X_validate = utils.sample_sequences(data['X']['validate'], batch_size)
    Y_validate = utils.sample_sequences(data['Y']['validate'], batch_size)
    # Create fixed negative example validation set
    X_validate_shuffle = np.random.permutation(len(data['X']['validate']))
    Y_validate_shuffle = X_validate_shuffle[utils.random_derangement(
        len(data['Y']['validate']))]
    X_validate_n = utils.sample_sequences(
        [data['X']['validate'][n] for n in X_validate_shuffle], batch_size)
    Y_validate_n = utils.sample_sequences(
        [data['Y']['validate'][n] for n in Y_validate_shuffle], batch_size)
    # Create iterator to sample sequences from training data
    data_iterator = utils.get_next_batch(data['X']['train'],
                                         data['Y']['train'], batch_size,
                                         max_iter)
    # We will accumulate the mean train cost over each epoch
    train_cost = 0

    for n, (X_p, Y_p, X_n, Y_n) in enumerate(data_iterator):
        # Occasionally Theano was raising a MemoryError; this fails
        # gracefully
        try:
            train_cost += train(X_p, X_n, Y_p, Y_n)
        except MemoryError as e:
            print "MemoryError: {}".format(e)
            return
        # Stop training if a NaN is encountered
        if not np.isfinite(train_cost):
            print 'Bad training cost {} at iteration {}'.format(train_cost, n)
            break
        # Validate the net after each epoch
        if n and (not n % epoch_size):
            epoch_result = collections.OrderedDict()
            epoch_result['iteration'] = n
            # Compute average training cost over the epoch
            epoch_result['train_cost'] = train_cost / float(epoch_size)
            # Reset training cost mean accumulation
            train_cost = 0
            # We need to accumulate the validation cost and network output
            # over batches to avoid MemoryErrors
            epoch_result['validate_cost'] = 0
            validate_batches = 0
            X_val_output = []
            Y_val_output = []
            for batch_idx in range(len(X_validate)):
                # Compute and accumulate cost
                epoch_result['validate_cost'] += cost(
                    X_validate[batch_idx], X_validate_n[batch_idx],
                    Y_validate[batch_idx], Y_validate_n[batch_idx])
                # Keep track of # of batches for normalization
                validate_batches += 1
                # Compute network output and accumulate result
                X_val_output.append(X_output(X_validate[batch_idx]))
                Y_val_output.append(Y_output(Y_validate[batch_idx]))
            # Normalize cost by number of batches and store
            epoch_result['validate_cost'] /= float(validate_batches)
            # Concatenate per-batch output to tensors
            X_val_output = np.concatenate(X_val_output, axis=0)
            Y_val_output = np.concatenate(Y_val_output, axis=0)
            # Compute in-class and out-of-class distances
            in_dists = np.mean((X_val_output - Y_val_output)**2, axis=1)
            out_dists = np.mean((X_val_output[X_validate_shuffle] -
                                 Y_val_output[Y_validate_shuffle])**2,
                                axis=1)
            # Objective is Bhattacharyya coefficient of in-class and
            # out-of-class distances
            epoch_result['validate_objective'] = utils.bhatt_coeff(
                in_dists, out_dists)
            # Test whether this validate cost is the new smallest
            if epoch_result['validate_cost'] < current_validate_cost:
                # To update patience, we must be smaller than
                # improvement_threshold*(previous lowest validation cost)
                patience_cost = improvement_threshold * current_validate_cost
                if epoch_result['validate_cost'] < patience_cost:
                    # Increase patience by the supplied amount
                    patience += epoch_size * patience_increase
                # Even if we didn't increase patience, update lowest valid
                # cost
                current_validate_cost = epoch_result['validate_cost']
            # Store patience after this epoch
            epoch_result['patience'] = patience
            yield epoch_result
        if n > patience:
            break
    return
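# Hypothetical driver (not from the original source) for the generator
# above; `data` and `layers` are placeholders built elsewhere, and
# lasagne.updates.rmsprop stands in for any updates_function.
import lasagne

for epoch in train(data, layers,
                   negative_importance=1.0,
                   negative_threshold=4,
                   entropy_importance=0.1,
                   updates_function=lasagne.updates.rmsprop):
    print('iteration {iteration}: validate_cost {validate_cost}'.format(
        **epoch))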
def run(only_forward=False): logger = afs_safe_logger.Logger( os.path.join(FLAGS.log_path, FLAGS.experiment_name) + ".log") if FLAGS.data_type == "bl": data_manager = load_boolean_data elif FLAGS.data_type == "sst": data_manager = load_sst_data elif FLAGS.data_type == "snli": data_manager = load_snli_data else: logger.Log("Bad data type.") return pp = pprint.PrettyPrinter(indent=4) logger.Log("Flag values:\n" + pp.pformat(FLAGS.FlagValuesDict())) # Load the data. raw_training_data, vocabulary = data_manager.load_data( FLAGS.training_data_path) # Load the eval data. raw_eval_sets = [] if FLAGS.eval_data_path: for eval_filename in FLAGS.eval_data_path.split(":"): eval_data, _ = data_manager.load_data(eval_filename) raw_eval_sets.append((eval_filename, eval_data)) # Prepare the vocabulary. if not vocabulary: logger.Log( "In open vocabulary mode. Using loaded embeddings without fine-tuning." ) train_embeddings = False vocabulary = util.BuildVocabulary( raw_training_data, raw_eval_sets, FLAGS.embedding_data_path, logger=logger, sentence_pair_data=data_manager.SENTENCE_PAIR_DATA) else: logger.Log("In fixed vocabulary mode. Training embeddings.") train_embeddings = True # Load pretrained embeddings. if FLAGS.embedding_data_path: logger.Log("Loading vocabulary with " + str(len(vocabulary)) + " words from " + FLAGS.embedding_data_path) initial_embeddings = util.LoadEmbeddingsFromASCII( vocabulary, FLAGS.word_embedding_dim, FLAGS.embedding_data_path) else: initial_embeddings = None # Trim dataset, convert token sequences to integer sequences, crop, and # pad. logger.Log("Preprocessing training data.") training_data = util.PreprocessDataset( raw_training_data, vocabulary, FLAGS.seq_length, data_manager, eval_mode=False, logger=logger, sentence_pair_data=data_manager.SENTENCE_PAIR_DATA, for_rnn=FLAGS.model_type == "RNN" or FLAGS.model_type == "CBOW") training_data_iter = util.MakeTrainingIterator(training_data, FLAGS.batch_size) eval_iterators = [] for filename, raw_eval_set in raw_eval_sets: logger.Log("Preprocessing eval data: " + filename) e_X, e_transitions, e_y, e_num_transitions = util.PreprocessDataset( raw_eval_set, vocabulary, FLAGS.seq_length, data_manager, eval_mode=True, logger=logger, sentence_pair_data=data_manager.SENTENCE_PAIR_DATA, for_rnn=FLAGS.model_type == "RNN" or FLAGS.model_type == "CBOW") eval_iterators.append( (filename, util.MakeEvalIterator( (e_X, e_transitions, e_y, e_num_transitions), FLAGS.batch_size))) # Set up the placeholders. 
y = T.vector("y", dtype="int32") lr = T.scalar("lr") training_mode = T.scalar( "training_mode") # 1: Training with dropout, 0: Eval ground_truth_transitions_visible = T.scalar( "ground_truth_transitions_visible", dtype="int32") logger.Log("Building model.") vs = util.VariableStore(default_initializer=util.UniformInitializer( FLAGS.init_range), logger=logger) if FLAGS.model_type == "CBOW": model_cls = spinn.cbow.CBOW elif FLAGS.model_type == "RNN": model_cls = spinn.plain_rnn.RNN else: model_cls = getattr(spinn.fat_stack, FLAGS.model_type) # Generator of mask for scheduled sampling numpy_random = np.random.RandomState(1234) ss_mask_gen = T.shared_randomstreams.RandomStreams( numpy_random.randint(999999)) # Training step number ss_prob = T.scalar("ss_prob") if data_manager.SENTENCE_PAIR_DATA: X = T.itensor3("X") transitions = T.itensor3("transitions") num_transitions = T.imatrix("num_transitions") predicted_premise_transitions, predicted_hypothesis_transitions, logits = build_sentence_pair_model( model_cls, len(vocabulary), FLAGS.seq_length, X, transitions, len(data_manager.LABEL_MAP), training_mode, ground_truth_transitions_visible, vs, initial_embeddings=initial_embeddings, project_embeddings=(not train_embeddings), ss_mask_gen=ss_mask_gen, ss_prob=ss_prob) else: X = T.matrix("X", dtype="int32") transitions = T.imatrix("transitions") num_transitions = T.vector("num_transitions", dtype="int32") predicted_transitions, logits = build_sentence_model( model_cls, len(vocabulary), FLAGS.seq_length, X, transitions, len(data_manager.LABEL_MAP), training_mode, ground_truth_transitions_visible, vs, initial_embeddings=initial_embeddings, project_embeddings=(not train_embeddings), ss_mask_gen=ss_mask_gen, ss_prob=ss_prob) xent_cost, acc = build_cost(logits, y) # Set up L2 regularization. l2_cost = 0.0 for var in vs.trainable_vars: l2_cost += FLAGS.l2_lambda * T.sum(T.sqr(vs.vars[var])) # Compute cross-entropy cost on action predictions. if (not data_manager.SENTENCE_PAIR_DATA) and FLAGS.model_type not in [ "Model0", "RNN", "CBOW" ]: transition_cost, action_acc = build_transition_cost( predicted_transitions, transitions, num_transitions) elif data_manager.SENTENCE_PAIR_DATA and FLAGS.model_type not in [ "Model0", "RNN", "CBOW" ]: p_transition_cost, p_action_acc = build_transition_cost( predicted_premise_transitions, transitions[:, :, 0], num_transitions[:, 0]) h_transition_cost, h_action_acc = build_transition_cost( predicted_hypothesis_transitions, transitions[:, :, 1], num_transitions[:, 1]) transition_cost = p_transition_cost + h_transition_cost action_acc = (p_action_acc + h_action_acc ) / 2.0 # TODO(SB): Average over transitions, not words. else: transition_cost = T.constant(0.0) action_acc = T.constant(0.0) transition_cost = transition_cost * FLAGS.transition_cost_scale total_cost = xent_cost + l2_cost + transition_cost if ".ckpt" in FLAGS.ckpt_path: checkpoint_path = FLAGS.ckpt_path else: checkpoint_path = os.path.join(FLAGS.ckpt_path, FLAGS.experiment_name + ".ckpt") if os.path.isfile(checkpoint_path): logger.Log("Found checkpoint, restoring.") step, best_dev_error = vs.load_checkpoint( checkpoint_path, num_extra_vars=2, skip_saved_unsavables=FLAGS.skip_saved_unsavables) else: assert not only_forward, "Can't run an eval-only run without a checkpoint. Supply a checkpoint." step = 0 best_dev_error = 1.0 # Do an evaluation-only run. 
    if only_forward:
        if FLAGS.eval_output_paths:
            eval_output_paths = FLAGS.eval_output_paths.strip().split(":")
            assert len(eval_output_paths) == len(
                eval_iterators), "Invalid no. of output paths."
        else:
            eval_output_paths = [
                FLAGS.experiment_name + "-" + os.path.split(eval_set[0])[1] +
                "-parse" for eval_set in eval_iterators
            ]

        # Load model from checkpoint.
        logger.Log("Checkpointed model was trained for %d steps." % (step, ))

        # Generate function for forward pass.
        logger.Log("Building forward pass.")
        if data_manager.SENTENCE_PAIR_DATA:
            eval_fn = theano.function(
                [
                    X, transitions, y, num_transitions, training_mode,
                    ground_truth_transitions_visible, ss_prob
                ], [
                    acc, action_acc, logits,
                    predicted_hypothesis_transitions,
                    predicted_premise_transitions
                ],
                on_unused_input='ignore',
                allow_input_downcast=True)
        else:
            eval_fn = theano.function(
                [
                    X, transitions, y, num_transitions, training_mode,
                    ground_truth_transitions_visible, ss_prob
                ], [acc, action_acc, logits, predicted_transitions],
                on_unused_input='ignore',
                allow_input_downcast=True)

        # Generate the inverse vocabulary lookup table.
        ind_to_word = {v: k for k, v in vocabulary.iteritems()}

        # Do a forward pass and write the output to disk.
        for eval_set, eval_out_path in zip(eval_iterators,
                                           eval_output_paths):
            logger.Log("Writing eval output for %s." % (eval_set[0], ))
            evaluate_expanded(
                eval_fn, eval_set, eval_out_path, logger, step,
                data_manager.SENTENCE_PAIR_DATA, ind_to_word,
                FLAGS.model_type not in ["Model0", "RNN", "CBOW"])
    else:
        # Train
        new_values = util.RMSprop(total_cost, vs.trainable_vars.values(),
                                  lr)
        new_values += [(key, vs.nongradient_updates[key])
                       for key in vs.nongradient_updates]
        # Training open-vocabulary embeddings is a questionable idea right
        # now. Disabled:
        # new_values.append(
        #     util.embedding_SGD(total_cost, embedding_params, embedding_lr))

        # Create training and eval functions.
        # Unused variable warnings are suppressed so that num_transitions
        # can be passed in when training Model 0, which ignores it. This
        # yields more readable code that is very slightly slower.
        logger.Log("Building update function.")
        update_fn = theano.function(
            [
                X, transitions, y, num_transitions, lr, training_mode,
                ground_truth_transitions_visible, ss_prob
            ],
            [total_cost, xent_cost, transition_cost, action_acc, l2_cost,
             acc],
            updates=new_values,
            on_unused_input='ignore',
            allow_input_downcast=True)
        logger.Log("Building eval function.")
        eval_fn = theano.function(
            [
                X, transitions, y, num_transitions, training_mode,
                ground_truth_transitions_visible, ss_prob
            ], [acc, action_acc],
            on_unused_input='ignore',
            allow_input_downcast=True)
        logger.Log("Training.")

        # Main training loop.
for step in range(step, FLAGS.training_steps): if step % FLAGS.eval_interval_steps == 0: for index, eval_set in enumerate(eval_iterators): acc = evaluate(eval_fn, eval_set, logger, step) if FLAGS.ckpt_on_best_dev_error and index == 0 and ( 1 - acc) < 0.99 * best_dev_error and step > 1000: best_dev_error = 1 - acc logger.Log( "Checkpointing with new best dev accuracy of %f" % acc) vs.save_checkpoint(checkpoint_path + "_best", extra_vars=[step, best_dev_error]) X_batch, transitions_batch, y_batch, num_transitions_batch = training_data_iter.next( ) learning_rate = FLAGS.learning_rate * ( FLAGS.learning_rate_decay_per_10k_steps**(step / 10000.0)) ret = update_fn( X_batch, transitions_batch, y_batch, num_transitions_batch, learning_rate, 1.0, 1.0, np.exp(step * np.log(FLAGS.scheduled_sampling_exponent_base))) total_cost_val, xent_cost_val, transition_cost_val, action_acc_val, l2_cost_val, acc_val = ret if step % FLAGS.statistics_interval_steps == 0: logger.Log("Step: %i\tAcc: %f\t%f\tCost: %5f %5f %5f %5f" % (step, acc_val, action_acc_val, total_cost_val, xent_cost_val, transition_cost_val, l2_cost_val)) if step % FLAGS.ckpt_interval_steps == 0 and step > 0: vs.save_checkpoint(checkpoint_path, extra_vars=[step, best_dev_error])
def _get_fprop(large_network=False, output_layers=[-1], detailed=False): arch = _get_architecture(large_network, detailed=detailed) expressions, input_var = fuse(arch, output_expressions=output_layers, input_dtype='float32') fprop = theano.function([input_var], expressions) return fprop
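# Minimal usage sketch (not from the original source); the input shape is
# an assumption, since it depends on what _get_architecture returns.
import numpy as np

fprop = _get_fprop(large_network=False, output_layers=[-1])
batch = np.zeros((1, 3, 64, 64), dtype='float32')  # placeholder shape
features = fprop(batch)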
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=5, n_hidden=10):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
    gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
    regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
    regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """
    #print(trainO)
    Xo = theano.shared(value=np.asarray(trainD, dtype='float64'), name='Xo')
    yo = theano.shared(value=np.asarray(trainO, dtype='int32'), name='yo')

    Xot = theano.shared(value=np.asarray(testD, dtype='float64'),
                        name='Xot')
    yot = theano.shared(value=np.asarray(testO, dtype='int32'), name='yot')

    Xov = theano.shared(value=np.asarray(validD, dtype='float64'),
                        name='Xov')
    yov = theano.shared(value=np.asarray(validO, dtype='int32'),
                        name='yov')

    #print(y)
    # sys.exit()
    train_set_x, train_set_y = (Xo, yo)
    valid_set_x, valid_set_y = (Xov, yov)
    test_set_x, test_set_y = (Xot, yot)

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    #print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    rng = np.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(rng=rng, input=x, n_in=20285, n_hidden=n_hidden,
                     n_out=2)

    # start-snippet-4
    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = (classifier.negative_log_likelihood(y) +
            L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr)
    # end-snippet-4

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # start-snippet-5
    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = [T.grad(cost, param) for param in classifier.params]

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs
    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of same size, where each
    # element is a pair formed from the two lists :
    # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in
zip(classifier.params, gparams)]

    # compiling a Theano function `train_model` that returns the cost, but
    # at the same time updates the parameters of the model based on the
    # rules defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-5

    ###############
    # TRAIN MODEL #
    ###############
    #print('... training')

    # early-stopping parameters
    patience = 900  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = np.mean(validation_losses)

                #print(
                #    'epoch %i, minibatch %i/%i, validation error %f %%' %
                #    (
                #        epoch,
                #        minibatch_index + 1,
                #        n_train_batches,
                #        this_validation_loss * 100.
                #    )
                #)

                # if we got the best validation score until now
                if (1 < 2):  #if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = np.mean(test_losses)

                    print((' epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            #if patience <= iter:
            #    done_looping = True
            #    break

    end_time = timeit.default_timer()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(
        ('The code for file ' + os.path.split(__file__)[1] +
         ' ran for %.2fm' % ((end_time - start_time) / 60.)),
        file=sys.stderr)
    print("\n")
def train(
        dataname='5r',
        dataset='5Label_300_40000_glove.6B',
        n_words=40000,
        decay_c=1e-4,
        optimizer=adagrad,
        clip_c=4.,
        valid_batch_size=64,
        batch_size=32,
        disp_frq=1000,
        valid_freq=100,
        save_freq=1000,
        max_epochs=100,
        # lrate=0.05,
        lrate=0.05,
        lrate_embed=0.1,
        use_dropout=True,
        noise_std=0.5,
        patience=15,
        saveto='model.npz',
        encoder='lstm',
        dim_proj=300,
        end=True,
        dim_hidden=100):
    # Model options
    model_options = locals().copy()
    print(model_options)

    print 'Loading data'
    path = os.path.join('..', '..', '..', 'Data', 'TC', dataname,
                        dataset + '.pkl')
    # path = os.path.join('..', '..', 'Data', 'TC', dataname, dataset + '.pkl')
    data = pkl.load(open(path, 'rb'))
    train, valid, test, emb = data
    print(emb.shape)
    ydim = numpy.max(train[1]) - numpy.min(train[1]) + 1

    if numpy.min(train[1]) != 0:
        bias = numpy.min(train[1])
        print 'Min of class is ', bias

        def min_y_to_zero(set):
            X, Y = set[0], set[1]
            new_Y = []
            for y in Y:
                new_Y.append(y - bias)
            return [X, new_Y]

        train = min_y_to_zero(train)
        valid = min_y_to_zero(valid)
        test = min_y_to_zero(test)

    model_options['ydim'] = ydim

    print 'Building model'
    params = init_params(model_options)
    params['Wemb'] = emb.astype(config.floatX)
    tparams = init_tparams(params)
    (use_noise, x, mask, y, f_pred_prob, f_pred,
     cost) = build_model(tparams, model_options)

    if decay_c > 0.:
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            # weight_decay+=(theano.ifelse(kk is 'Wemb'), ((vv ** 2).sum() / 5.), ((vv ** 2).sum()))
            # if kk is 'Wemb':
            #     weight_decay += (vv ** 2).sum() / 5.
            # else:
            #     weight_decay += (vv ** 2).sum()
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost_decay = weight_decay + cost

    f_cost = theano.function([x, mask, y], cost_decay, name='f_cost')

    grads = tensor.grad(cost_decay, wrt=tparams.values())

    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2),
                              g / tensor.sqrt(g2) * clip_c, g))
        grads = new_grads

    f_grad = theano.function([x, mask, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    # lrate_embed = tensor.scalar(name='lrate_embed')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, mask, y,
                                        cost, cost_decay)

    print 'Optimization'

    # kf_train4valid = get_minibatches_idx(len(train4valid[0]), valid_batch_size)
    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    history_errs = []
    best_p = None
    bad_counter = 0

    if valid_freq == -1:
        valid_freq = len(train[0]) / batch_size
    if save_freq == -1:
        save_freq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    start_time = time.time()
    try:
        for eidx in xrange(max_epochs):
            n_samples = 0
            kf = get_minibatches_idx(len(train[0]), batch_size,
                                     shuffle=True)
            for _, train_index in kf:
                uidx += 1
                if use_dropout is True:
                    use_noise.set_value(1.)
                else:
                    use_noise.set_value(0.)

                # Select the random examples for this minibatch
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]

                # Get the data in numpy.ndarray format
                # This swaps the axis!
                # Return something of shape (minibatch maxlen, n samples)
                x, mask, y = prepare_data(x, y)
                n_samples += x.shape[1]

                cost, cost_decay = f_grad_shared(x, mask, y)
                f_update(lrate)

                if numpy.isnan(cost) or numpy.isinf(cost):
                    print 'NaN detected'
                    return 1., 1., 1.
                if numpy.mod(uidx, disp_frq) == 0:
                    print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'Cost_decay', cost_decay

                if numpy.mod(uidx, save_freq) == 0:
                    print 'Saving...',
                    # import ipdb; ipdb.set_trace()
                    if best_p != None:
                        params = best_p
                    else:
                        params = unzip(tparams)

                    numpy.savez(saveto, history_errs=history_errs, **params)
                    pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                    print 'Done'

                if numpy.mod(uidx, valid_freq) == 0:
                    use_noise.set_value(0.)
                    valid_err = pred_error(f_pred, prepare_data, valid,
                                           kf_valid)
                    history_errs.append(valid_err)
                    if (uidx == 0
                            or valid_err <= numpy.array(history_errs).min()):
                        best_p = unzip(tparams)
                        bad_counter = 0
                    if len(history_errs) > patience and valid_err >= numpy.array(
                            history_errs)[:-patience].min():
                        bad_counter += 1
                        if bad_counter > patience:
                            print 'Early Stop!'
                            estop = True
                            break
                    print 'Valid ', valid_err

            print 'Seen %d samples' % n_samples

            if estop:
                break
        if best_p is not None:
            zipp(best_p, tparams)

    except KeyboardInterrupt:
        print "Training interrupted"

    end_time = time.time()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    test_err = pred_error(f_pred, prepare_data, test, kf_test)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
    print 'Valid ', valid_err, 'Test ', test_err
    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
    # print 'Train ', train_err,'Train4Valid ',train4valid_err, 'Valid ', valid_err, 'Test ', test_err
    print 'Train ', train_err
    print 'Dataset', dataname, 'Test Acc', (1. - test_err)
    print(model_options)

    if saveto:
        numpy.savez(saveto,
                    train_err=train_err,
                    valid_err=valid_err,
                    test_err=test_err,
                    history_errs=history_errs,
                    **best_p)
    print 'The code ran for %d epochs, with %f sec/epochs' % (
        (eidx + 1), (end_time - start_time) / (1. * (eidx + 1)))
    print >> sys.stderr, ('Training took %.1fs' % (end_time - start_time))
    return train_err, valid_err, test_err
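# Hypothetical invocation (not from the original source), relying on the
# defaults above; the data pickle must already exist under
# ../../../Data/TC/5r/.
train_err, valid_err, test_err = train(max_epochs=100, patience=15,
                                       saveto='model.npz')
print('test accuracy: %f' % (1. - test_err))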
import theano
import theano.tensor as T

N = T.iscalar('n')


def fibonacci(n, x, x_prev):
    # x and x_prev are the previous values of the two output sequences;
    # each step emits (x + x_prev, x), i.e. the next Fibonacci number and
    # the current one.
    return x + x_prev, x


outputs, updates = theano.scan(
    fn=fibonacci,
    sequences=T.arange(N),
    n_steps=N,
    outputs_info=[1, 1],  # both output sequences start at 1
)

fib_op = theano.function(inputs=[N], outputs=outputs)
print(fib_op(5))
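# A small sanity check of the recurrence above (values derived by hand from
# the step function; an expectation, not output captured from a run).
next_vals, cur_vals = fib_op(8)
assert list(cur_vals) == [1, 2, 3, 5, 8, 13, 21, 34]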
def SGD(self, training_data, epochs, mini_batch_size, eta, validation_data, test_data, lmbda=0.0): """Train the network using mini-batch stochastic gradient descent.""" training_x, training_y = training_data validation_x, validation_y = validation_data test_x, test_y = test_data # compute number of minibatches for training, validation and testing num_training_batches = size(training_data) / mini_batch_size num_validation_batches = size(validation_data) / mini_batch_size num_test_batches = size(test_data) / mini_batch_size # define the (regularized) cost function, symbolic gradients, and updates l2_norm_squared = sum([(layer.w**2).sum() for layer in self.layers]) cost = self.layers[-1].cost(self)+\ 0.5*lmbda*l2_norm_squared/num_training_batches grads = T.grad(cost, self.params) updates = [(param, param - eta * grad) for param, grad in zip(self.params, grads)] # define functions to train a mini-batch, and to compute the # accuracy in validation and test mini-batches. i = T.lscalar() # mini-batch index train_mb = theano.function( [i], cost, updates=updates, givens={ self.x: training_x[i * self.mini_batch_size:(i + 1) * self.mini_batch_size], self.y: training_y[i * self.mini_batch_size:(i + 1) * self.mini_batch_size] }) validate_mb_accuracy = theano.function( [i], self.layers[-1].accuracy(self.y), givens={ self.x: validation_x[i * self.mini_batch_size:(i + 1) * self.mini_batch_size], self.y: validation_y[i * self.mini_batch_size:(i + 1) * self.mini_batch_size] }) test_mb_accuracy = theano.function( [i], self.layers[-1].accuracy(self.y), givens={ self.x: test_x[i * self.mini_batch_size:(i + 1) * self.mini_batch_size], self.y: test_y[i * self.mini_batch_size:(i + 1) * self.mini_batch_size] }) self.test_mb_predictions = theano.function( [i], self.layers[-1].y_out, givens={ self.x: test_x[i * self.mini_batch_size:(i + 1) * self.mini_batch_size] }) # Do the actual training best_validation_accuracy = 0.0 for epoch in xrange(epochs): for minibatch_index in xrange(num_training_batches): iteration = num_training_batches * epoch + minibatch_index if iteration % 1000 == 0: print("Training mini-batch number {0}".format(iteration)) cost_ij = train_mb(minibatch_index) if (iteration + 1) % num_training_batches == 0: validation_accuracy = np.mean([ validate_mb_accuracy(j) for j in xrange(num_validation_batches) ]) print("Epoch {0}: validation accuracy {1:.2%}".format( epoch, validation_accuracy)) if validation_accuracy >= best_validation_accuracy: print("This is the best validation accuracy to date.") best_validation_accuracy = validation_accuracy best_iteration = iteration if test_data: test_accuracy = np.mean([ test_mb_accuracy(j) for j in xrange(num_test_batches) ]) print('The corresponding test accuracy is {0:.2%}'. format(test_accuracy)) print("Finished training network.") print("Best validation accuracy of {0:.2%} obtained at iteration {1}". format(best_validation_accuracy, best_iteration)) print("Corresponding test accuracy of {0:.2%}".format(test_accuracy))
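# Hypothetical usage, assuming the surrounding module follows the usual
# network3.py layout; Network, FullyConnectedLayer, SoftmaxLayer and
# load_data_shared are assumptions here, not shown in this snippet.
training_data, validation_data, test_data = load_data_shared()
net = Network([FullyConnectedLayer(n_in=784, n_out=100),
               SoftmaxLayer(n_in=100, n_out=10)],
              mini_batch_size=10)
net.SGD(training_data, 60, 10, 0.1,
        validation_data, test_data, lmbda=5.0)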
def __init__(self, model, trn_data, trn_loss, trn_target=None, val_data=None, val_loss=None, val_target=None, step=ss.Adam()): """ Constructs and configures the trainer. :param model: the model to be trained :param trn_data: train inputs and (possibly) train targets :param trn_loss: theano variable representing the train loss to minimize :param trn_target: theano variable representing the train target :param val_data: validation inputs and (possibly) validation targets :param val_loss: theano variable representing the validation loss :param val_target: theano variable representing the validation target :param step: step size strategy object :return: None """ # parse input # TODO: it would be good to type check the other inputs too assert isinstance(step, ss.StepStrategy), 'Step must be a step strategy object.' # prepare train data n_trn_data_list = set([x.shape[0] for x in trn_data]) assert len(n_trn_data_list) == 1, 'Number of train data is not consistent.' self.n_trn_data = list(n_trn_data_list)[0] trn_data = [theano.shared(x.astype(dtype), borrow=True) for x in trn_data] # compile theano function for a single training update grads = tt.grad(trn_loss, model.parms) idx = tt.ivector('idx') trn_inputs = [model.input] if trn_target is None else [model.input, trn_target] self.make_update = theano.function( inputs=[idx], outputs=trn_loss, givens=zip(trn_inputs, [x[idx] for x in trn_data]), updates=step.updates(model.parms, grads) ) # if model uses batch norm, compile a theano function for setting up stats if getattr(model, 'batch_norm', False): batch_norm_givens = [(bn.m, bn.bm) for bn in model.bns] + [(bn.v, bn.bv) for bn in model.bns] self.set_batch_norm_stats = theano.function( inputs=[], givens=zip(trn_inputs, trn_data), updates=[(bn.bm, bn.m) for bn in model.bns] + [(bn.bv, bn.v) for bn in model.bns] ) else: self.set_batch_norm_stats = None batch_norm_givens = [] # if validation data is given, then set up validation too self.do_validation = val_data is not None if self.do_validation: # prepare validation data n_val_data_list = set([x.shape[0] for x in val_data]) assert len(n_val_data_list) == 1, 'Number of validation data is not consistent.' self.n_val_data = list(n_val_data_list)[0] val_data = [theano.shared(x.astype(dtype), borrow=True) for x in val_data] # compile theano function for validation val_inputs = [model.input] if val_target is None else [model.input, val_target] self.validate = theano.function( inputs=[], outputs=val_loss, givens=zip(val_inputs, val_data) + batch_norm_givens ) # create checkpointer to store best model self.checkpointer = ModelCheckpointer(model) self.best_val_loss = float('inf') # initialize some variables self.trn_loss = float('inf') self.idx_stream = ds.IndexSubSampler(self.n_trn_data)
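# Hedged sketch of the idx-vector variant used by make_update above: instead
# of a contiguous slice, an int vector input gathers arbitrary rows of the
# shared training data inside givens. The loss here is a stand-in.
import numpy as np
import theano
import theano.tensor as tt

data = theano.shared(np.random.randn(50, 4).astype(theano.config.floatX),
                     borrow=True)
x = tt.matrix('x')
idx = tt.ivector('idx')

loss = ((x - x.mean(axis=0)) ** 2).sum()
f = theano.function([idx], loss, givens={x: data[idx]})

minibatch = np.random.randint(0, 50, size=8).astype('int32')
print(f(minibatch))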
def work(mode, data_name, test_dataname, pooling_mode="average_exc_pad"):
    print "mode: ", mode
    print "data_name: ", data_name
    print "pooling_mode: ", pooling_mode
    print "Started!"

    data_names = data_name.split(":")
    data_count = len(data_names)
    print "Train dataset:"
    for i in xrange(data_count):
        print "%d: %s" % (i, data_names[i])

    print "Test dataset:"
    test_data_names = test_dataname.split(":")
    test_data_count = len(test_data_names)
    for i in xrange(test_data_count):
        print "%d: %s" % (i, test_data_names[i])

    if test_data_count != data_count:
        raise Exception("The number of train and test datasets must be the same.")

    rng = numpy.random.RandomState(23455)

    docSentenceCount = T.ivector("docSentenceCount")
    sentenceWordCount = T.ivector("sentenceWordCount")
    corpus = T.matrix("corpus")
    docLabel = T.ivector('docLabel')

    sentenceW = None
    sentenceB = None
    docW = None
    docB = None
    hidden_layer_w = None
    hidden_layer_b = None
    logistic_layer_w = None
    logistic_layer_b = None
    layer0 = list()
    layer1 = list()
    layer2 = list()
    local_params = list()
    # for list-type data
    for i in xrange(data_count):
        layer0.append(DocEmbeddingNN(corpus, docSentenceCount, sentenceWordCount, rng,
                                     wordEmbeddingDim=249,
                                     sentenceLayerNodesNum=50,
                                     sentenceLayerNodesSize=[5, 249],
                                     docLayerNodesNum=10,
                                     docLayerNodesSize=[3, 50],
                                     sentenceW=sentenceW,
                                     sentenceB=sentenceB,
                                     docW=docW,
                                     docB=docB,
                                     pooling_mode=pooling_mode))
        sentenceW = layer0[i].sentenceW
        sentenceB = layer0[i].sentenceB
        docW = layer0[i].docW
        docB = layer0[i].docB

        layer1.append(HiddenLayer(rng,
                                  input=layer0[i].output,
                                  n_in=layer0[i].outputDimension,
                                  n_out=10,
                                  activation=T.tanh,
                                  W=hidden_layer_w,
                                  b=hidden_layer_b))
        hidden_layer_w = layer1[i].W
        hidden_layer_b = layer1[i].b

        layer2.append(LogisticRegression(input=layer1[i].output, n_in=10, n_out=2,
                                         W=logistic_layer_w, b=logistic_layer_b))

        local_params.append(layer2[i].params)

    share_params = list(layer0[0].params + layer1[0].params)
    # construct the parameter array.
    params = list(layer0[0].params) + layer1[0].params

    for i in xrange(data_count):
        params += layer2[i].params

    para_path = "data/" + data_name + "/share_hidden_low_model_multiinput/" + pooling_mode + ".model"
    traintext = ["data/" + data_names[i] + "/train/text" for i in xrange(data_count)]
    trainlabel = ["data/" + data_names[i] + "/train/label" for i in xrange(data_count)]
    testtext = ["data/" + test_data_names[i] + "/test/text" for i in xrange(data_count)]
    testlabel = ["data/" + test_data_names[i] + "/test/label" for i in xrange(data_count)]

    # Optionally load the parameters saved last time.
    loadParamsVal(para_path, params)

    if mode == "train" or mode == "test":
        train_model = list()
        valid_model = list()
        print "Loading train data."
        batchSize = 10
        share_learning_rate = 0.1
        local_learning_rate = 0.1
        n_batches = list()
        print "Loading test data."
all_pred_label = list()
all_real_label = list()
all_pred_prob = list()

for i in xrange(data_count):
    cr_train = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5,
                            dataset=traintext[i], labelset=trainlabel[i])
    docMatrixes, docSentenceNums, sentenceWordNums, ids, labels, _, posList = \
        cr_train.getCorpus([0, 100000])

    docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
    docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
    sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
    labels = transToTensor(labels, numpy.int32)

    index = T.lscalar("index")
    n_batches.append((len(docSentenceNums.get_value()) - 1 - 1) / batchSize + 1)
    print "Dataname: %s" % data_names[i]
    print "Train set size is ", len(docMatrixes.get_value())
    print "Batch size is ", batchSize
    print "Number of training batches is ", n_batches[i]

    error = layer2[i].errors(docLabel)
    cost = layer2[i].negative_log_likelihood(docLabel)
    share_grads = T.grad(cost, share_params)
    share_updates = [(param_i, param_i - share_learning_rate * grad_i)
                     for param_i, grad_i in zip(share_params, share_grads)]
    grads = T.grad(cost, local_params[i])
    local_updates = [(param_i, param_i - local_learning_rate * grad_i)
                     for param_i, grad_i in zip(local_params[i], grads)]
    updates = share_updates + local_updates
    print "Compiling train computing graph."
    if mode == "train":
        train_model.append(
            theano.function(
                [index],
                [cost, error, layer2[i].y_pred, docLabel],
                updates=updates,
                givens={
                    corpus: docMatrixes,
                    docSentenceCount: docSentenceNums[index * batchSize:(index + 1) * batchSize + 1],
                    sentenceWordCount: sentenceWordNums,
                    docLabel: labels[index * batchSize:(index + 1) * batchSize]
                }))
    print "Compiled."

    print "Load test dataname: %s" % test_data_names[i]
    cr_test = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5,
                           dataset=testtext[i], labelset=testlabel[i])
    validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels, _, validPosList = \
        cr_test.getCorpus([0, 1000])

    validDocMatrixes = transToTensor(validDocMatrixes, theano.config.floatX)
    validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32)
    validSentenceWordNums = transToTensor(validSentenceWordNums, numpy.int32)
    validLabels = transToTensor(validLabels, numpy.int32)
    print "Validation set size is ", len(validDocMatrixes.get_value())
    print "Data loaded."

    print "Compiling test computing graph."
    valid_model.append(
        theano.function(
            [],
            [cost, error, layer2[i].y_pred, docLabel,
             T.transpose(layer2[i].p_y_given_x)[1]],
            givens={
                corpus: validDocMatrixes,
                docSentenceCount: validDocSentenceNums,
                sentenceWordCount: validSentenceWordNums,
                docLabel: validLabels
            }))
    print "Compiled."
costNum, errorNum, pred_label, real_label, pred_prob = valid_model[i]()
all_pred_label.extend(pred_label)
all_real_label.extend(real_label)
all_pred_prob.extend(pred_prob)

print "Valid current model: ", data_names[i]
print "Cost: ", costNum
print "Error: ", errorNum

fpr, tpr, _ = roc_curve(real_label, pred_prob)
roc_auc = auc(fpr, tpr)
print "data_name: ", data_name
print "ROC: ", roc_auc

fpr, tpr, threshold = roc_curve(real_label, pred_label)
if 1 in threshold:
    index_of_one = list(threshold).index(1)
    print "TPR: ", tpr[index_of_one]
    print "FPR: ", fpr[index_of_one]
    print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
    print "threshold: ", threshold[index_of_one]

print "Valid current model: ", data_names
errorNum = 1 - accuracy_score(all_real_label, all_pred_label)
print "Error: ", errorNum

fpr, tpr, _ = roc_curve(all_real_label, all_pred_prob)
if mode == "test":
    print "tpr_all: ", tpr
    print "fpr_all: ", fpr
roc_auc = auc(fpr, tpr)
print "data_name: ", data_name
print "ROC: ", roc_auc

fpr, tpr, threshold = roc_curve(all_real_label, all_pred_label)
if 1 in threshold:
    index_of_one = list(threshold).index(1)
    print "TPR: ", tpr[index_of_one]
    print "FPR: ", fpr[index_of_one]
    print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
    print "threshold: ", threshold[index_of_one]

if mode == "test":
    return

print "Start to train."
epoch = 0
n_epochs = 10
ite = 0

while epoch < n_epochs:
    epoch = epoch + 1
    #######################
    for i in range(max(n_batches)):
        for dataset_index in xrange(data_count):
            if i >= n_batches[dataset_index]:
                continue
            # for list-type data
            print "dataset_index: %d, i: %d" % (dataset_index, i)
            costNum, errorNum, pred_label, real_label = train_model[dataset_index](i)
            ite = ite + 1
            # for padding data
            if ite % 1 == 0:
                print
                print "Dataset name: ", data_names[dataset_index]
                print "@iter: ", ite
                print "Cost: ", costNum
                print "Error: ", errorNum

    # Validate the model
    all_pred_label = list()
    all_real_label = list()
    all_pred_prob = list()
    for dataset_index in xrange(data_count):
        costNum, errorNum, pred_label, real_label, pred_prob = valid_model[dataset_index]()
        all_pred_label.extend(pred_label)
        all_real_label.extend(real_label)
        all_pred_prob.extend(pred_prob)

        print "Valid current model: ", data_names[dataset_index]
        print "Cost: ", costNum
        print "Error: ", errorNum

        fpr, tpr, _ = roc_curve(real_label, pred_prob)
        roc_auc = auc(fpr, tpr)
        print "data_name: ", data_name
        print "ROC: ", roc_auc

        fpr, tpr, threshold = roc_curve(real_label, pred_label)
        # guard: a threshold of exactly 1 may be absent
        if 1 in threshold:
            index_of_one = list(threshold).index(1)
            print "TPR: ", tpr[index_of_one]
            print "FPR: ", fpr[index_of_one]
            print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
            print "threshold: ", threshold[index_of_one]

    print "Valid current model: ", data_names
    errorNum = 1 - accuracy_score(all_real_label, all_pred_label)
    print "Error: ", errorNum

    fpr, tpr, _ = roc_curve(all_real_label, all_pred_prob)
    roc_auc = auc(fpr, tpr)
    print "data_name: ", data_name
    print "ROC: ", roc_auc

    fpr, tpr, threshold = roc_curve(all_real_label, all_pred_label)
    if 1 in threshold:
        index_of_one = list(threshold).index(1)
        print "TPR: ", tpr[index_of_one]
        print "FPR: ", fpr[index_of_one]
        print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
        print "threshold: ", threshold[index_of_one]

    # Save model
    print "Saving parameters."
    saveParamsVal(para_path, params)
    print "Saved."
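# Self-contained sketch of the roc_curve/auc bookkeeping above, on toy
# labels. Calling roc_curve on hard 0/1 predictions yields a threshold of 1,
# which is why the code reads TPR/FPR off at that threshold (with a guard,
# since sklearn does not promise it is present).
import numpy as np
from sklearn.metrics import roc_curve, auc

real = np.array([0, 0, 1, 1])
prob = np.array([0.1, 0.4, 0.35, 0.8])
fpr, tpr, _ = roc_curve(real, prob)
print("ROC AUC:", auc(fpr, tpr))

pred = (prob > 0.5).astype(int)
fpr, tpr, threshold = roc_curve(real, pred)
if 1 in threshold:
    k = list(threshold).index(1)
    print("TPR:", tpr[k], "FPR:", fpr[k], "AR:", (tpr[k] + 1 - fpr[k]) / 2)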
def train_lstm(
    dim_proj=128,  # word embedding dimension and LSTM number of hidden units.
    patience=10,  # Number of epochs to wait before early stop if no progress
    max_epochs=5000,  # The maximum number of epochs to run
    dispFreq=10,  # Display the training progress on stdout every N updates
    decay_c=0.,  # Weight decay for the classifier applied to the U weights.
    lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop)
    n_words=10000,  # Vocabulary size
    optimizer=adadelta,  # sgd, adadelta and rmsprop available; sgd is very hard to use and not recommended (it probably needs momentum and a decaying learning rate).
    encoder='lstm',  # TODO: can be removed, must be lstm.
    saveto='lstm_model.npz',  # The best model will be saved there
    validFreq=370,  # Compute the validation error after this number of updates.
    saveFreq=1110,  # Save the parameters after every saveFreq updates
    maxlen=100,  # Sequences longer than this get ignored
    batch_size=16,  # The batch size during training.
    valid_batch_size=64,  # The batch size used for the validation/test set.
    dataset='imdb',

    # Parameters for extra options
    noise_std=0.,
    use_dropout=True,  # if False slightly faster, but worse test error
                       # This frequently needs a bigger model.
    reload_model=None,  # Path to a saved model we want to start from.
    test_size=-1,  # If >0, we keep only this number of test examples.
):
    # Model options
    model_options = locals().copy()
    print "model options", model_options

    load_data, prepare_data = get_dataset(dataset)

    print 'Loading data'
    train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
                                   maxlen=maxlen)
    if test_size > 0:
        # The test set is sorted by size, but we want to keep random
        # size examples. So we must select a random subset of the
        # examples.
        idx = numpy.arange(len(test[0]))
        numpy.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

    ydim = numpy.max(train[1]) + 1

    model_options['ydim'] = ydim

    print 'Building model'
    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        load_params('lstm_model.npz', params)

    # This creates Theano Shared Variables from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams hold different copies of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, mask,
     y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)

    if decay_c > 0.:
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, mask, y], cost, name='f_cost')

    grads = tensor.grad(cost, wrt=tparams.values())
    f_grad = theano.function([x, mask, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads,
                                        x, mask, y, cost)

    print 'Optimization'

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    print "%d train examples" % len(train[0])
    print "%d valid examples" % len(valid[0])
    print "%d test examples" % len(test[0])

    history_errs = []
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    uidx = 0  # the number of updates done
    estop = False  # early stop
    start_time = time.clock()
    try:
        for eidx in xrange(max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

for _, train_index in kf:
    uidx += 1
    use_noise.set_value(1.)

    # Select the random examples for this minibatch
    y = [train[1][t] for t in train_index]
    x = [train[0][t] for t in train_index]

    # Get the data in numpy.ndarray format
    # This swaps the axes!
    # Returns something of shape (minibatch maxlen, n samples)
    x, mask, y = prepare_data(x, y)
    n_samples += x.shape[1]

    cost = f_grad_shared(x, mask, y)
    f_update(lrate)

    if numpy.isnan(cost) or numpy.isinf(cost):
        print 'NaN detected'
        return 1., 1., 1.

    if numpy.mod(uidx, dispFreq) == 0:
        print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost

    if saveto and numpy.mod(uidx, saveFreq) == 0:
        print 'Saving...',

        if best_p is not None:
            params = best_p
        else:
            params = unzip(tparams)
        numpy.savez(saveto, history_errs=history_errs, **params)
        pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
        print 'Done'

    if numpy.mod(uidx, validFreq) == 0:
        use_noise.set_value(0.)
        train_err = pred_error(f_pred, prepare_data, train, kf)
        valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
        test_err = pred_error(f_pred, prepare_data, test, kf_test)

        history_errs.append([valid_err, test_err])

        if (uidx == 0 or
                valid_err <= numpy.array(history_errs)[:, 0].min()):
            best_p = unzip(tparams)
            bad_counter = 0

        print ('Train ', train_err, 'Valid ', valid_err,
               'Test ', test_err)

        if (len(history_errs) > patience and
                valid_err >= numpy.array(history_errs)[:-patience, 0].min()):
            bad_counter += 1
            if bad_counter > patience:
                print 'Early Stop!'
                estop = True
                break

print 'Seen %d samples' % n_samples

if estop:
    break

except KeyboardInterrupt:
    print "Training interrupted"

end_time = time.clock()
if best_p is not None:
    zipp(best_p, tparams)
else:
    best_p = unzip(tparams)

use_noise.set_value(0.)
kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
test_err = pred_error(f_pred, prepare_data, test, kf_test)

print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

if saveto:
    numpy.savez(saveto, train_err=train_err,
                valid_err=valid_err, test_err=test_err,
                history_errs=history_errs, **best_p)

print 'The code ran for %d epochs, with %f sec/epochs' % (
    (eidx + 1), (end_time - start_time) / (1. * (eidx + 1)))
print >> sys.stderr, ('Training took %.1fs' % (end_time - start_time))
return train_err, valid_err, test_err
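# Hedged sketch of what get_minibatches_idx is assumed to return: a list of
# (minibatch index, array of example indices) pairs, reshuffled per epoch
# when shuffle=True, with a final smaller batch for the remainder.
import numpy

def get_minibatches_idx_sketch(n, minibatch_size, shuffle=False):
    idx_list = numpy.arange(n, dtype="int32")
    if shuffle:
        numpy.random.shuffle(idx_list)
    minibatches = [idx_list[start:start + minibatch_size]
                   for start in range(0, n, minibatch_size)]
    return zip(range(len(minibatches)), minibatches)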
def setup(self): """ Set up the model to train. """ # input_words: shape (n_batch, n_sentence, sentence_len) input_words = T.itensor3() n_batch, n_sentences, sentence_len = input_words.shape # query_words: shape (n_batch, query_len) query_words = T.imatrix() # correct_output: shape (n_batch, ?, num_output_words) correct_output = T.ftensor3() # graph_num_new_nodes: shape(n_batch, n_sentence) graph_num_new_nodes = T.imatrix() # graph_new_node_strengths: shape(n_batch, n_sentence, new_nodes_per_iter) graph_new_node_strengths = T.ftensor3() # graph_new_node_ids: shape(n_batch, n_sentence, new_nodes_per_iter, num_node_ids) graph_new_node_ids = T.ftensor4() # graph_new_edges: shape(n_batch, n_sentence, pad_graph_size, pad_graph_size, num_edge_types) graph_new_edges = T.TensorType('floatX', (False,)*5)() def _build(with_correct_graph, snap_to_best, using_dropout, evaluate_accuracy): info = {} # Process each sentence, flattened to (?, sentence_len) flat_input_words = input_words.reshape([-1, sentence_len]) flat_input_reprs, flat_ref_matrices = self.input_transformer.process(flat_input_words) # flat_input_reprs of shape (?, input_repr_size) # flat_ref_matrices of shape (?, num_node_ids, input_repr_size) input_reprs = flat_input_reprs.reshape([n_batch, n_sentences, self.input_repr_size]) ref_matrices = flat_ref_matrices.reshape([n_batch, n_sentences, self.num_node_ids, self.input_repr_size]) query_repr, query_ref_matrix = self.input_transformer.process(query_words) if using_dropout: iter_dropouts = [] states_mask = util.make_dropout_mask((self.node_state_size,), self.dropout_keep, self.srng) if self.nodes_mutable: iter_dropouts.extend(self.node_state_updater.dropout_masks(self.srng, states_mask)) if len(self.word_node_mapping) > 0: iter_dropouts.extend(self.direct_reference_updater.dropout_masks(self.srng, states_mask)) if self.intermediate_propagate != 0: iter_dropouts.extend(self.intermediate_propagator.dropout_masks(self.srng, states_mask)) if self.dynamic_nodes: iter_dropouts.extend(self.new_node_adder.dropout_masks(self.srng)) iter_dropouts.extend(self.edge_state_updater.dropout_masks(self.srng)) else: iter_dropouts = [] states_mask = None def _iter_fn(input_repr, ref_matrix, gstate, correct_num_new_nodes=None, correct_new_strengths=None, correct_new_node_ids=None, correct_edges=None, dropout_masks=None): # If necessary, update node state if self.nodes_mutable: gstate, dropout_masks = self.node_state_updater.process(gstate, input_repr, dropout_masks) if len(self.word_node_mapping) > 0: gstate, dropout_masks = self.direct_reference_updater.process(gstate, ref_matrix, dropout_masks) # If necessary, propagate node state if self.intermediate_propagate != 0: gstate, dropout_masks = self.intermediate_propagator.process_multiple(gstate, self.intermediate_propagate, dropout_masks) node_loss = None node_accuracy = None # Propose and vote on new nodes if self.dynamic_nodes: new_strengths, new_ids, dropout_masks = self.new_node_adder.get_candidates(gstate, input_repr, self.new_nodes_per_iter, dropout_masks) # new_strengths and correct_new_strengths are of shape (n_batch, new_nodes_per_iter) # new_ids and correct_new_node_ids are of shape (n_batch, new_nodes_per_iter, num_node_ids) if with_correct_graph: perm_idxs = np.array(list(itertools.permutations(range(self.new_nodes_per_iter)))) permuted_correct_str = correct_new_strengths[:,perm_idxs] permuted_correct_ids = correct_new_node_ids[:,perm_idxs] # due to advanced indexing, we should have shape (n_batch, permutation, new_nodes_per_iter, num_node_ids) 
ext_new_str = T.shape_padaxis(new_strengths,1) ext_new_ids = T.shape_padaxis(new_ids,1) strength_ll = permuted_correct_str * T.log(ext_new_str + util.EPSILON) + (1-permuted_correct_str) * T.log(1-ext_new_str + util.EPSILON) ids_ll = permuted_correct_ids * T.log(ext_new_ids + util.EPSILON) reduced_perm_lls = T.sum(strength_ll, axis=2) + T.sum(ids_ll, axis=[2,3]) if self.best_node_match_only: node_loss = -T.max(reduced_perm_lls, 1) else: full_ll = util.reduce_log_sum(reduced_perm_lls, 1) # Note that some of these permutations are identical, since we likely did not add the maximum # amount of nodes. Thus we will have added repeated elements here. # We have log(x+x+...+x) = log(kx), where k is the repetition factor and x is the probability we want # log(kx) = log(k) + log(x) # Our repetition factor k is given by (new_nodes_per_iter - correct_num_new_nodes)! # Recall that n! = gamma(n+1) # so log(x) = log(kx) - log(gamma(k+1)) log_rep_factor = T.gammaln(T.cast(self.new_nodes_per_iter - correct_num_new_nodes + 1, 'floatX')) scaled_ll = full_ll - log_rep_factor node_loss = -scaled_ll if evaluate_accuracy: best_match_idx = T.argmax(reduced_perm_lls, 1) # should be of shape (n_batch), indexing the best permutation best_correct_str = permuted_correct_str[T.arange(n_batch), best_match_idx] best_correct_ids = permuted_correct_ids[T.arange(n_batch), best_match_idx] snapped_strengths = util.independent_best(new_strengths) snapped_ids = util.categorical_best(new_ids) * T.shape_padright(snapped_strengths) close_strengths = T.all(T.isclose(best_correct_str, snapped_strengths), (1)) close_ids = T.all(T.isclose(best_correct_ids, snapped_ids), (1,2)) node_accuracy = T.and_(close_strengths, close_ids) # now substitute in the correct nodes gstate = gstate.with_additional_nodes(correct_new_strengths, correct_new_node_ids) elif snap_to_best: snapped_strengths = util.independent_best(new_strengths) snapped_ids = util.categorical_best(new_ids) gstate = gstate.with_additional_nodes(snapped_strengths, snapped_ids) else: gstate = gstate.with_additional_nodes(new_strengths, new_ids) # Update edge state gstate, dropout_masks = self.edge_state_updater.process(gstate, input_repr, dropout_masks) if with_correct_graph: cropped_correct_edges = correct_edges[:,:gstate.n_nodes,:gstate.n_nodes,:] edge_lls = cropped_correct_edges * T.log(gstate.edge_strengths + util.EPSILON) + (1-cropped_correct_edges) * T.log(1-gstate.edge_strengths + util.EPSILON) # edge_lls currently penalizes for edges connected to nodes that do not exist # we do not want it to do this, so we mask it with node strengths mask_src = util.shape_padaxes(gstate.node_strengths,[2,3]) mask_dest = util.shape_padaxes(gstate.node_strengths,[1,3]) masked_edge_lls = edge_lls * mask_src * mask_dest edge_loss = -T.sum(masked_edge_lls, axis=[1,2,3]) if evaluate_accuracy: snapped_edges = util.independent_best(gstate.edge_strengths) close_edges = T.isclose(cropped_correct_edges, snapped_edges) ok_mask = T.invert(T.cast(mask_src * mask_dest,'bool')) # its OK for things not to match if node strengths are NOT both 1 edge_accuracy = T.all(T.or_(close_edges, ok_mask), (1,2,3)) overall_accuracy = edge_accuracy if node_accuracy is None else T.and_(node_accuracy, edge_accuracy) else: overall_accuracy = None gstate = gstate.with_updates(edge_strengths=cropped_correct_edges) return gstate, node_loss, edge_loss, overall_accuracy elif snap_to_best: snapped_edges = util.independent_best(gstate.edge_strengths) gstate = gstate.with_updates(edge_strengths=snapped_edges) return gstate else: 
return gstate # Scan over each sentence def _scan_fn(input_repr, *stuff): # (input_repr, [ref_matrix?], [*correct_graph_stuff?], [dropout_masks?], *flat_graph_state, pad_graph_size) stuff = list(stuff) if len(self.word_node_mapping) > 0: ref_matrix = stuff[0] stuff = stuff[1:] else: ref_matrix = None if with_correct_graph: c_num_new_nodes, c_new_strengths, c_new_node_ids, c_edges = stuff[:4] stuff = stuff[4:] if using_dropout: dropout_masks = stuff[:len(iter_dropouts)] stuff = stuff[len(iter_dropouts):] else: dropout_masks = None flat_graph_state = stuff[:-1] pad_graph_size = stuff[-1] gstate = GraphState.unflatten_from_const_size(flat_graph_state) if with_correct_graph: gstate, node_loss, edge_loss, overall_accuracy = _iter_fn(input_repr, ref_matrix, gstate, c_num_new_nodes, c_new_strengths, c_new_node_ids, c_edges, dropout_masks=dropout_masks) else: gstate = _iter_fn(input_repr, ref_matrix, gstate, dropout_masks=dropout_masks) retvals = gstate.flatten_to_const_size(pad_graph_size) if with_correct_graph: if self.dynamic_nodes: retvals.append(node_loss) retvals.append(edge_loss) if evaluate_accuracy: retvals.append(overall_accuracy) return retvals if self.dynamic_nodes: initial_gstate = GraphState.create_empty(n_batch, self.num_node_ids, self.node_state_size, self.num_edge_types) else: initial_gstate = GraphState.create_full_unique(n_batch, self.num_node_ids, self.node_state_size, self.num_edge_types) # Account for all nodes, plus the extra padding node to prevent GPU unpleasantness if self.dynamic_nodes: pad_graph_size = n_sentences * self.new_nodes_per_iter + 1 else: pad_graph_size = self.num_node_ids outputs_info = initial_gstate.flatten_to_const_size(pad_graph_size) prepped_input = input_reprs.dimshuffle([1,0,2]) sequences = [prepped_input] if len(self.word_node_mapping) > 0: sequences.append(ref_matrices.dimshuffle([1,0,2,3])) if with_correct_graph: sequences.append(graph_num_new_nodes.swapaxes(0,1)) sequences.append(graph_new_node_strengths.swapaxes(0,1)) sequences.append(graph_new_node_ids.swapaxes(0,1)) sequences.append(graph_new_edges.swapaxes(0,1)) if self.dynamic_nodes: outputs_info.extend([None]) if evaluate_accuracy: outputs_info.extend([None]) outputs_info.extend([None]) if using_dropout: sequences.extend(iter_dropouts) all_scan_out, _ = theano.scan(_scan_fn, sequences=sequences, outputs_info=outputs_info, non_sequences=[pad_graph_size]) graph_accurate_list = None if with_correct_graph: if evaluate_accuracy: full_graph_accuracy = all_scan_out[-1] all_scan_out = all_scan_out[:-1] graph_accurate_list = T.all(full_graph_accuracy, 0) info["graph_accuracy"]=T.sum(graph_accurate_list, dtype='floatX')/T.cast(n_batch, 'floatX') if self.dynamic_nodes: all_flat_gstates = all_scan_out[:-2] node_loss, edge_loss = all_scan_out[-2:] reduced_node_loss = T.sum(node_loss)/T.cast(n_batch, 'floatX') reduced_edge_loss = T.sum(edge_loss)/T.cast(n_batch, 'floatX') avg_graph_loss = (reduced_node_loss + reduced_edge_loss)/T.cast(input_words.shape[1], 'floatX') info["node_loss"]=reduced_node_loss info["edge_loss"]=reduced_edge_loss else: all_flat_gstates = all_scan_out[:-1] edge_loss = all_scan_out[-1] reduced_edge_loss = T.sum(edge_loss)/T.cast(n_batch, 'floatX') avg_graph_loss = reduced_edge_loss/T.cast(input_words.shape[1], 'floatX') info["edge_loss"]=reduced_edge_loss else: all_flat_gstates = all_scan_out if self.sequence_representation: # Each part of all_flat_gstates is of shape (n_sentences, n_batch, ...) 
# except for the last one, which we handle separately # Swap to (n_batch, n_sentences, ...) # Then flatten to (n_batch*n_sentences, ...) for further processing final_flat_gstate = [x.swapaxes(0,1).reshape(T.concatenate([[-1], x.shape[2:]]), ndim=(x.ndim-1)) for x in all_flat_gstates[:-1]] # As for the last one, we need to get a single scalar value. The last one will be the biggest # so we will take that. Note that this will introduce a bunch of zero-nodes, but thats # OK and we can process that later. (We REQUIRE that padding in graph_state makes zero strength # nodes here!) final_flat_gstate.append(all_flat_gstates[-1][-1]) # We also need to repeat query_repr and query_ref_matrix so that they broadcast together query_repr = T.extra_ops.repeat(query_repr, n_sentences, 0) query_ref_matrix = T.extra_ops.repeat(query_ref_matrix, n_sentences, 0) else: # Extract last timestep final_flat_gstate = [x[-1] for x in all_flat_gstates] final_gstate = GraphState.unflatten_from_const_size(final_flat_gstate) if self.train_with_query: if self.wipe_node_state: final_gstate = final_gstate.with_updates(node_states=T.zeros_like(final_gstate.node_states)) qnsu_dropout_masks = self.query_node_state_updater.dropout_masks(self.srng, states_mask) query_gstate, _ = self.query_node_state_updater.process(final_gstate, query_repr, qnsu_dropout_masks) if len(self.word_node_mapping) > 0: qdru_dropout_masks = self.query_direct_reference_updater.dropout_masks(self.srng, states_mask) query_gstate, _ = self.query_direct_reference_updater.process(query_gstate, query_ref_matrix, qdru_dropout_masks) fp_dropout_masks = self.final_propagator.dropout_masks(self.srng, states_mask) propagated_gstate, _ = self.final_propagator.process_multiple(query_gstate, self.final_propagate, fp_dropout_masks) agg_dropout_masks = self.aggregator.dropout_masks(self.srng) aggregated_repr, _ = self.aggregator.process(propagated_gstate, agg_dropout_masks) # shape (n_batch, output_repr_size) if self.sequence_representation: # aggregated_repr is of shape (n_batch*n_sentences, repr_width) # We want to split back to timesteps: (n_batch, n_sentences, repr_width) agg_repr_seq = aggregated_repr.reshape([n_batch, n_sentences, -1]) # Now collapse it to a summary representation aggsum_dropout_masks = self.aggregate_summarizer.dropout_masks(self.srng) aggregated_repr, _ = self.aggregate_summarizer.process(agg_repr_seq, aggsum_dropout_masks) # At this point aggregated_repr is (n_batch, repr_width) as desired max_seq_len = correct_output.shape[1] if self.output_format == ModelOutputFormat.sequence: final_output = self.output_processor.process(aggregated_repr, max_seq_len) # shape (n_batch, ?, num_output_words) else: final_output = self.output_processor.process(aggregated_repr) if snap_to_best: final_output = self.output_processor.snap_to_best(final_output) if self.output_format == ModelOutputFormat.subset: elemwise_loss = T.nnet.binary_crossentropy(final_output, correct_output) query_loss = T.sum(elemwise_loss) else: flat_final_output = final_output.reshape([-1, self.num_output_words]) flat_correct_output = correct_output.reshape([-1, self.num_output_words]) timewise_loss = T.nnet.categorical_crossentropy(flat_final_output, flat_correct_output) query_loss = T.sum(timewise_loss) query_loss = query_loss/T.cast(n_batch, 'floatX') info["query_loss"] = query_loss else: final_output = T.zeros([]) full_loss = np.array(0.0,np.float32) if with_correct_graph: full_loss = full_loss + avg_graph_loss if self.train_with_query: full_loss = full_loss + query_loss if 
self.train_with_query: adjusted_query_gstates = [ x.reshape(T.concatenate([[n_batch, n_sentences], x.shape[1:]]), ndim=(x.ndim+1)) if self.sequence_representation else T.shape_padaxis(x,1) for x in query_gstate.flatten()] adjusted_prop_gstates = [ x.reshape(T.concatenate([[n_batch, n_sentences], x.shape[1:]]), ndim=(x.ndim+1)) if self.sequence_representation else T.shape_padaxis(x,1) for x in propagated_gstate.flatten()] full_flat_gstates = [T.concatenate([a.swapaxes(0,1),b,c],1) for a,b,c in zip(all_flat_gstates[:-1], adjusted_query_gstates, adjusted_prop_gstates)] else: full_flat_gstates = [a.swapaxes(0,1) for a in all_flat_gstates[:-1]] max_seq_len = T.iscalar() return full_loss, final_output, full_flat_gstates, graph_accurate_list, max_seq_len, info train_loss, _, _, _, _, train_info = _build(self.train_with_graph, False, True, False) adam_updates = Adam(train_loss, self.params, lr=self.learning_rate_var) self.info_keys = list(train_info.keys()) print("Compiling...") optimizer = theano.compile.predefined_optimizers['fast_run' if self.check_mode == 'debug' else theano.config.optimizer] optimizer = optimizer.excluding("scanOp_pushout_output","remove_constants_and_unused_inputs_scan") if self.check_mode == 'nan': mode = NanGuardMode(optimizer=optimizer, nan_is_error=True, inf_is_error=True, big_is_error=True) elif self.check_mode == 'debug': mode = DebugMode(optimizer=optimizer, check_isfinite=False, check_py_code=False, stability_patience=1) theano.tensor.TensorType.filter_checks_isfinite = False else: mode = theano.Mode(optimizer=optimizer) self.train_fn = theano.function([input_words, query_words, correct_output, graph_num_new_nodes, graph_new_node_strengths, graph_new_node_ids, graph_new_edges], [train_loss]+list(train_info.values()), updates=adam_updates, allow_input_downcast=True, on_unused_input='ignore', mode=mode) eval_loss, _, full_flat_gstates, graph_accurate_list, _, eval_info = _build(self.train_with_graph, False, False, True) self.eval_info_keys = list(eval_info.keys()) self.eval_fn = theano.function( [input_words, query_words, correct_output, graph_num_new_nodes, graph_new_node_strengths, graph_new_node_ids, graph_new_edges], [eval_loss, graph_accurate_list]+list(eval_info.values()), allow_input_downcast=True, on_unused_input='ignore', mode=mode) self.debug_test_fn = theano.function( [input_words, query_words, correct_output, graph_num_new_nodes, graph_new_node_strengths, graph_new_node_ids, graph_new_edges], full_flat_gstates, allow_input_downcast=True, on_unused_input='ignore', mode=mode) test_loss, final_output, full_flat_gstates, _, max_seq_len, _ = _build(False, False, False, False) self.fuzzy_test_fn = theano.function( [input_words, query_words] + ([max_seq_len] if self.output_format == ModelOutputFormat.sequence else []), [final_output] + full_flat_gstates, allow_input_downcast=True, on_unused_input='ignore', mode=mode) test_loss, final_output, full_flat_gstates, _, max_seq_len, _ = _build(False, True, False, False) self.snap_test_fn = theano.function( [input_words, query_words] + ([max_seq_len] if self.output_format == ModelOutputFormat.sequence else []), [final_output] + full_flat_gstates, allow_input_downcast=True, on_unused_input='ignore', mode=mode)
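# Minimal sketch of compiling under NanGuardMode, which setup() above selects
# when check_mode == 'nan', shown on a toy graph rather than the real model.
import numpy as np
import theano
import theano.tensor as T
from theano.compile.nanguardmode import NanGuardMode

x = T.vector('x')
f = theano.function([x], T.log(x),
                    mode=NanGuardMode(nan_is_error=True, inf_is_error=True,
                                      big_is_error=True))
print(f(np.array([1.0, 2.0], dtype=theano.config.floatX)))
# f(np.array([-1.0], ...)) would abort here with a NaN error instead of
# silently propagating NaNs through the graph.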
X = T.matrix()
state = model.fprop(X)
target = T.matrix()
wrong_target = T.matrix()

right_cost = model.layers[-1].kl(Y=target, Y_hat=state)
wrong_cost = model.layers[-1].kl(Y=wrong_target, Y_hat=state)

from theano.printing import Print
right_cost = Print('right_cost')(right_cost)

acc = (wrong_cost > right_cost).mean()

from theano import function
f = function([X, target, wrong_target], acc)

# For each example, pick the nearest other label vector (without reuse)
# as the "wrong" target.
wrong_target = dataset.y.copy()
used = np.zeros((dataset.y.shape[0],), dtype='bool')
for i in xrange(wrong_target.shape[0]):
    dists = np.square(dataset.y - dataset.y[i, :]).sum(axis=1)
    dists[i] = np.inf
    dists[used] = np.inf
    idx = np.argmin(dists)
    used[idx] = 1
    wrong_target[i, :] = dataset.y[idx, :].copy()

acc = f(dataset.X, dataset.y, wrong_target)

print dataset.y.sum()
print wrong_target.sum()
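# Tiny sketch of theano.printing.Print as used for right_cost above: it is
# an identity op whose side effect is printing its input each time the
# compiled function runs, which is handy for debugging compiled graphs.
import numpy as np
import theano
import theano.tensor as T
from theano.printing import Print

v = T.vector('v')
v_printed = Print('v value')(v)
g = theano.function([v], v_printed * 2)
g(np.array([1., 2.], dtype=theano.config.floatX))  # prints "v value __str__ = [ 1.  2.]"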
shared_x = theano.shared(numpy.asarray(Feature_normalized,
                                       dtype=theano.config.floatX),
                         borrow=True)
numpy_rng = numpy.random.RandomState(123)

##########################
### model 1: the first network architecture pattern
##########################
dbn = GRBM_DBN(numpy_rng=numpy_rng, n_ins=528,
               hidden_layers_sizes=[1000, 1000, 500], n_outs=201)
dbn.load('dbn_2014-05-23-20-07-28.npy')  # a previously trained architecture

# this is Theano's quirky function-compilation idiom
validate_model = theano.function(inputs=[],
                                 outputs=dbn.logLayer.p_y_given_x,  # the output of the logistic regression layer
                                 givens={dbn.x: shared_x})
observ_likelihood_1 = validate_model()  # call the function to get the result
del dbn
"""
##########################
### model 2
##########################
dbn = GRBM_DBN(numpy_rng=numpy_rng, n_ins=528,
               hidden_layers_sizes=[1000, 1000, 500], n_outs=201)
dbn.load('dbn_2014-05-24-05-53-17.npy')
X = T.matrix('X') M = T.imatrix('M') X_complete = T.where(M, X, X_shared) ll = model.get_log_likelihood(X_complete) grad = T.grad(ll.mean(), X_shared, disconnected_inputs='warn') updates = OrderedDict() lr = T.scalar('lr') is_noise = sharedX(0., 'is_noise') updates[X_shared] = X_shared + lr * (grad + model.prior.theano_rng.normal(size=X_shared.shape)) updates[X_shared] = T.where(M, X, updates[X_shared]) updates[X_shared] = T.clip(updates[X_shared], 0, 1) f = theano.function([X, M, lr], [ll.mean()], updates=updates, allow_input_downcast=True) print 'Compiled training function' # Setup for training and display dataset_yaml_src = model.dataset_yaml_src train_set = yaml_parse.load(dataset_yaml_src) test_set = yaml_parse.load(dataset_yaml_src.replace("unlabeled", "test")) dataset = train_set num_samples = n_examples vis_batch = dataset.get_batch_topo(num_samples) rval = tuple(vis_batch.shape[dataset.X_topo_space.axes.index(axis)] for axis in ('b', 0, 1, 'c')) _, patch_rows, patch_cols, channels = rval mapback = hasattr(dataset, 'mapback_for_viewer')
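# Hedged usage sketch for the compiled step above: repeatedly calling f
# performs noisy gradient ascent on the log-likelihood of the unobserved
# entries of X_shared; X_batch and M_batch are illustrative stand-ins.
import numpy as np

X_batch = np.random.rand(10, 784).astype('float32')            # images
M_batch = (np.random.rand(10, 784) > 0.5).astype('float32')    # 1 = observed

for step in range(100):
    ll = f(X_batch, M_batch, 0.01)  # f returns [ll.mean()]
    if step % 10 == 0:
        print('step %d, mean log-likelihood %f' % (step, ll[0]))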
def visualize_gates_lstm(gate_values, hidden_states, updates, train_stream, valid_stream, args): in_gates = gate_values["in_gates"] out_gates = gate_values["out_gates"] forget_gates = gate_values["forget_gates"] # Handle the theano shared variables that allow carrying the hidden state givens, f_updates = carry_hidden_state(updates, 1, not(has_indices(args.dataset))) generate_in = theano.function(inputs=ComputationGraph(in_gates).inputs, outputs=in_gates, givens=givens, updates=f_updates, mode=Mode(optimizer='fast_compile')) generate_out = theano.function(inputs=ComputationGraph(out_gates).inputs, outputs=out_gates, givens=givens, updates=f_updates, mode=Mode(optimizer='fast_compile')) generate_forget = theano.function(inputs=ComputationGraph(forget_gates).inputs, outputs=forget_gates, givens=givens, updates=f_updates, mode=Mode(optimizer='fast_compile')) # Generate epoch_iterator = valid_stream.get_epoch_iterator() for num in range(10): init_ = next(epoch_iterator)[0][0: args.visualize_length, 0:1] last_output_in = generate_in(init_) last_output_out = generate_out(init_) last_output_forget = generate_forget(init_) layers = len(last_output_in) time = last_output_in[0].shape[0] if has_indices(args.dataset): ticks = tuple(conv_into_char(init_[:, 0], args.dataset)) else: ticks = tuple(np.arange(time)) for i in range(layers): plt.subplot(3, layers, 1 + i) plt.plot(np.arange(time), np.mean( np.abs(last_output_in[i][:, 0, :]), axis=1)) plt.xticks(range(args.visualize_length), ticks) plt.grid(True) plt.title("in_gate of layer " + str(i)) plt.subplot(3, layers, layers + 1 + i) plt.plot(np.arange(time), np.mean( np.abs(last_output_out[i][:, 0, :]), axis=1)) plt.xticks(range(args.visualize_length), ticks) plt.grid(True) plt.title("out_gate of layer " + str(i)) plt.subplot(3, layers, 2 * layers + 1 + i) plt.plot(np.arange(time), np.mean( np.abs(last_output_forget[i][:, 0, :]), axis=1)) plt.xticks(range(args.visualize_length), ticks) plt.grid(True) plt.title("forget_gate of layer " + str(i)) plt.tight_layout() if args.local: plt.show() else: plt.savefig( args.save_path + "/visualize_gates_" + str(num) + ".png") logger.info("Figure \"visualize_gates_" + str(num) + ".png\" saved at directory: " + args.save_path)