def test_lop_override(self, cls_ofg):
    """Check that ``lop_overrides`` replaces the gradient of the wrapped op.

    The wrapped graph is a hand-written logistic function.  Both overrides
    (a Python callable and an op built with ``cls_ofg``) return twice the
    logistic derivative times the output gradient, so the gradient taken
    through the wrapped op must equal ``2 * grad(sum(sigmoid(xx)), xx)``.
    """
    x = T.vector()
    y = 1. / (1. + T.exp(-x))

    def lop_ov(inps, outs, grads):
        # Callable override: only the single output and its gradient are used.
        (out,) = outs
        (out_grad,) = grads
        return [2. * out * (1. - out) * out_grad]

    # The same override expressed as an op over (input, output, output grad).
    y_, dedy = T.vector(), T.vector()
    op_lop_ov = cls_ofg([x, y_, dedy], [2. * y_ * (1. - y_) * dedy])

    # Reference gradient built from the stock sigmoid.
    xx = T.vector()
    expected_grad = 2. * T.grad(T.sum(T.nnet.sigmoid(xx)), xx)

    for override in (lop_ov, op_lop_ov):
        wrapped = cls_ofg([x], [y], lop_overrides=override)
        actual_grad = T.grad(T.sum(wrapped(xx)), xx)
        fn = function([xx], [expected_grad, actual_grad])
        xval = np.random.rand(32).astype(config.floatX)
        expected_val, actual_val = fn(xval)
        assert np.allclose(expected_val, actual_val)
def _compile_func():
    """Compile the Theano functions of an L2-penalised logistic model.

    Returns:
        tuple: ``(cost_grad, log_predict)``.  ``cost_grad(beta, b, X, y, C)``
        returns ``[cost, d cost / d beta, d cost / d b]`` and
        ``log_predict(beta, b, X)`` returns ``sigmoid(b + dot(X, beta))``.
    """
    beta = T.vector('beta')
    b = T.scalar('b')
    X = T.matrix('X')
    y = T.vector('y')
    C = T.scalar('C')
    params = [beta, b, X, y, C]

    # Scaling the margins element-wise by ``y`` gives the same vector as the
    # former ``dot(diag(y), margins)`` without materialising an n-by-n matrix.
    margins = T.dot(X, beta) + b
    cost = 0.5 * (T.dot(beta, beta) + b * b) + C * T.sum(
        T.nnet.softplus(-(y * margins))
    )

    # Function computing in one go the cost, its gradient
    # with regard to beta and with regard to the bias.
    cost_grad = theano.function(
        params,
        [cost, T.grad(cost, beta), T.grad(cost, b)],
    )

    # Function for computing element-wise sigmoid, used for prediction.
    log_predict = theano.function(
        [beta, b, X],
        T.nnet.sigmoid(b + T.dot(X, beta)),
        on_unused_input='warn',
    )
    return (cost_grad, log_predict)
def _build_hidden_layers(self, input, add_cost, Y, updates, external_grad=None):
    """Build this layer's symbolic output and cost, recursing into later layers.

    Args:
        input: symbolic input of this layer.
        add_cost: additive cost term accumulated so far; this layer's
            regularisation penalty (``'L1'`` or ``'L2'``) is subtracted
            from it before it is passed on.
        Y: target the last layer's output is compared against.
        updates: list that is extended in place with the updates adding this
            layer's gradients to ``W_grad_theano`` and ``b_grad_theano``.
        external_grad: when given, the last layer's cost is
            ``(external_grad * output).sum()`` instead of the scaled
            negative squared error.

    Returns:
        tuple: ``(cost, Y_out)`` as produced by the last layer of the chain.

    Raises:
        ValueError: if ``self.activation`` is not ``'tanh'``, ``'softplus'``
            or ``None``.
    """
    lin_output = tensor.dot(input, self.W_theano.T) + self.b_theano[None, :]
    if self.activation == 'tanh':
        output = tensor.tanh(lin_output)
    elif self.activation == 'softplus':
        output = tensor.nnet.softplus(lin_output)
    elif self.activation is None:
        output = lin_output
    else:
        # Raising a plain string is itself a TypeError; use a real exception.
        raise ValueError('Unsupported activation function!')

    if self.regularization == 'L1':
        # The builtin abs() dispatches to the variable's __abs__; Theano
        # exposes the op as ``abs_``, not ``tensor.abs``.
        add_cost = add_cost - self.reg_weight * abs(self.W_theano).sum()
    elif self.regularization == 'L2':
        add_cost = add_cost - self.reg_weight * (self.W_theano ** 2).sum()

    # Compute the cost function
    if self.layer_forward is None:
        if external_grad is None:
            cost = -((output - Y) ** 2).sum() / self.sigma2_theano[0] + add_cost
        else:
            cost = (external_grad * output).sum()
        Y_out = output
    else:
        cost, Y_out = self.layer_forward._build_hidden_layers(
            output, add_cost, Y, updates, external_grad=external_grad)

    # Update parameter gradients
    W_grad = tensor.grad(cost, self.W_theano)
    b_grad = tensor.grad(cost, self.b_theano)
    updates.extend([
        (self.W_grad_theano, self.W_grad_theano + W_grad),
        (self.b_grad_theano, self.b_grad_theano + b_grad),
    ])
    return cost, Y_out
def fit(self, X, learning_rate=0.5, mu=0.99, epochs=1, batch_sz=100, show_fig=False):
    """Train the autoencoder on ``X`` with momentum gradient descent.

    Creates the shared parameters (``W``, ``bh``, ``bo``) and their momentum
    buffers, compiles ``self.hidden_op``, and runs ``epochs`` passes over
    shuffled mini-batches of ``batch_sz`` rows.  Trailing rows that do not
    fill a whole batch are skipped.

    Args:
        X: 2-D array; its shape is unpacked as ``(N, D)``.
        learning_rate: step size applied to the gradients.
        mu: momentum coefficient.
        epochs: number of passes over the data.
        batch_sz: number of rows per mini-batch.
        show_fig: when true, plot the recorded costs with matplotlib.
    """
    N, D = X.shape
    # Floor division keeps the batch count an integer on Python 2 and 3.
    n_batches = N // batch_sz

    W0 = init_weights((D, self.M))
    self.W = theano.shared(W0, 'W_%s' % self.id)
    self.bh = theano.shared(np.zeros(self.M), 'bh_%s' % self.id)
    self.bo = theano.shared(np.zeros(D), 'bo_%s' % self.id)
    self.params = [self.W, self.bh, self.bo]
    self.forward_params = [self.W, self.bh]

    # TODO: technically these should be reset before doing backprop
    self.dW = theano.shared(np.zeros(W0.shape), 'dW_%s' % self.id)
    self.dbh = theano.shared(np.zeros(self.M), 'dbh_%s' % self.id)
    self.dbo = theano.shared(np.zeros(D), 'dbo_%s' % self.id)
    self.dparams = [self.dW, self.dbh, self.dbo]
    self.forward_dparams = [self.dW, self.dbh]

    X_in = T.matrix('X_%s' % self.id)
    X_hat = self.forward_output(X_in)

    # attach it to the object so it can be used later
    # must be sigmoidal because the output is also a sigmoid
    H = T.nnet.sigmoid(X_in.dot(self.W) + self.bh)
    self.hidden_op = theano.function(
        inputs=[X_in],
        outputs=H,
    )

    # Cross-entropy reconstruction cost, scaled by batch_sz * D.
    cost = -(X_in * T.log(X_hat) + (1 - X_in) * T.log(1 - X_hat)).sum() / (batch_sz * D)
    cost_op = theano.function(
        inputs=[X_in],
        outputs=cost,
    )

    # Build each gradient once and reuse it for the parameter and the
    # momentum update of the same parameter.
    grads = [T.grad(cost, p) for p in self.params]
    updates = [
        (p, p + mu*dp - learning_rate*g)
        for p, dp, g in zip(self.params, self.dparams, grads)
    ] + [
        (dp, mu*dp - learning_rate*g)
        for dp, g in zip(self.dparams, grads)
    ]
    train_op = theano.function(
        inputs=[X_in],
        updates=updates,
    )

    costs = []
    print("training autoencoder: %s" % self.id)
    for i in range(epochs):
        print("epoch: %s" % i)
        X = shuffle(X)
        for j in range(n_batches):
            batch = X[j*batch_sz:(j*batch_sz + batch_sz)]
            train_op(batch)
            # The cost is evaluated on the full training set after each batch.
            the_cost = cost_op(X)
            print("j / n_batches: %s / %s cost: %s" % (j, n_batches, the_cost))
            costs.append(the_cost)
    if show_fig:
        plt.plot(costs)
        plt.show()
def __init__(self, input=tensor.dvector('input'), target=tensor.dvector('target'), n_input=1, n_hidden=1, n_output=1, lr=1e-3, **kw):
    """Build a one-hidden-layer network and compile its helper functions.

    The weights ``w1`` (``n_hidden x n_input``) and ``w2``
    (``n_output x n_hidden``) start at zero.  The hidden layer is
    ``sigmoid(dot(w1, input))``, the output is ``dot(w2, hidden)`` and the
    cost is the summed squared difference to ``target``.  Remaining keyword
    arguments are forwarded to the parent initialiser.
    """
    super(NNet, self).__init__(**kw)

    self.input = input
    self.target = target
    self.lr = shared(lr, 'learning_rate')
    self.w1 = shared(numpy.zeros((n_hidden, n_input)), 'w1')
    self.w2 = shared(numpy.zeros((n_output, n_hidden)), 'w2')

    # Forward pass and squared-error cost.
    self.hidden = sigmoid(tensor.dot(self.w1, self.input))
    self.output = tensor.dot(self.w2, self.hidden)
    residual = self.output - self.target
    self.cost = tensor.sum(residual ** 2)

    # Plain gradient-descent step for each weight matrix.
    self.sgd_updates = {
        weight: weight - self.lr * tensor.grad(self.cost, weight)
        for weight in (self.w1, self.w2)
    }

    self.sgd_step = pfunc(
        params=[self.input, self.target],
        outputs=[self.output, self.cost],
        updates=self.sgd_updates,
    )
    self.compute_output = pfunc([self.input], self.output)
    self.output_from_hidden = pfunc([self.hidden], self.output)
def test_batch_normalization_test():
    """Compare ``bn.batch_normalization_test`` with a hand-written reference.

    For every combination of axes specification and tensor type, the
    forward output and the gradients with respect to x, scale, bias, mean
    and var are compared with
    ``(x - mean) * (scale / sqrt(var + eps)) + bias``.  The compiled graph
    must not contain any abstract batch-norm op.
    """
    names = ('x', 'scale', 'bias', 'mean', 'var')
    n_wrt = len(names)
    abstract_ops = (bn.AbstractBatchNormTrain,
                    bn.AbstractBatchNormInference,
                    bn.AbstractBatchNormTrainGrad)
    # Tolerances for the gradient checks, in the order of ``names``.
    grad_tolerances = (
        {'atol': 4e-5},                # dx
        {'atol': 4e-5},                # dscale
        {},                            # dbias
        {},                            # dmean
        {'rtol': 2e-3, 'atol': 4e-5},  # dvar
    )
    floatX = theano.config.floatX

    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x, scale, bias, mean, var = [vartype(name) for name in names]
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # remove non-existing axes (the narrowed tuple is kept for the
            # following, lower-rank tensor types)
            if isinstance(axes, tuple):
                axes = tuple(ax for ax in axes if ax < ndim)
            if not axes:
                continue

            # forward pass
            out = bn.batch_normalization_test(x, scale, bias, mean, var, axes, eps)

            # reference forward pass
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            scale2, bias2, mean2, var2 = [
                T.addbroadcast(param, *axes2)
                for param in (scale, bias, mean, var)
            ]
            out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2

            # backward pass and reference backward pass
            dy = vartype('dy')
            wrt = [x, scale, bias, mean, var]
            grads = T.grad(None, wrt=wrt, known_grads={out: dy})
            grads2 = T.grad(None, wrt=wrt, known_grads={out2: dy})

            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out, out2] + grads + grads2)

            # check if the abstract Ops have been replaced
            assert not any([isinstance(node.op, abstract_ops)
                            for node in f.maker.fgraph.toposort()])

            # run
            for full_shape in ((10, 20, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                data_shape = full_shape[:ndim]
                param_shape = tuple(1 if dim in axes2 else size
                                    for dim, size in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype(floatX)
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(floatX)
                Scale = numpy.random.randn(*param_shape).astype(floatX)
                Bias = numpy.random.randn(*param_shape).astype(floatX)
                Mean = numpy.random.randn(*param_shape).astype(floatX)
                Var = numpy.random.rand(*param_shape).astype(floatX)
                outputs = f(X, Scale, Bias, Mean, Var, Dy)

                # compare outputs
                utt.assert_allclose(outputs[0], outputs[1])  # out
                # compare gradients
                for k, tolerance in enumerate(grad_tolerances):
                    utt.assert_allclose(outputs[2 + k],
                                        outputs[2 + k + n_wrt],
                                        **tolerance)
def sgd_optimization(learning_rate=0.13, n_epochs=1000, batch_size=100):
    """Train a logistic regression with mini-batch gradient descent.

    The data come from ``generate_data()``, whose first three entries are
    read as the train, validation and test ``(x, y)`` pairs of shared
    variables.  After every epoch the mean training error over all
    mini-batches is measured and the best value seen so far is printed.

    Args:
        learning_rate: step size applied to both gradients.
        n_epochs: number of passes over the training set.
        batch_size: rows per mini-batch; a trailing partial batch is skipped.
    """
    dataset = generate_data()
    train_x, train_y = dataset[0]
    print('%s %s' % (train_x.type, train_y.type))
    validate_x = dataset[1][0]
    test_x = dataset[2][0]

    # Read the training shape once instead of copying the data repeatedly.
    train_shape = train_x.get_value(borrow=True).shape
    print('train set size %d' % (train_shape[0]))
    print('validate set size %d' % (validate_x.get_value(borrow=True).shape[0]))
    print('test set size %d' % (test_x.get_value(borrow=True).shape[0]))

    # Floor division keeps the batch count an integer on Python 2 and 3.
    n_batches = train_shape[0] // batch_size

    index = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')
    lr = LogisticRegression(x, train_shape[1])
    cost = lr.negative_log_likelihood(y)

    batch = slice(index * batch_size, (index + 1) * batch_size)

    print('compile function test_model...')
    # NOTE: despite its name this function measures errors on the training set.
    test_model = theano.function(
        inputs=[index],
        outputs=lr.errors(y),
        givens={x: train_x[batch], y: train_y[batch]})

    g_w = T.grad(cost=cost, wrt=lr.w)
    g_b = T.grad(cost=cost, wrt=lr.b)
    updates = [(lr.w, lr.w - learning_rate * g_w),
               (lr.b, lr.b - learning_rate * g_b)]

    print('complie function train_model...')
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={x: train_x[batch], y: train_y[batch]})

    best_train_error = numpy.inf
    # time.clock() was removed in Python 3.8; time.time() exists everywhere.
    start_time = time.time()
    for epoch in range(n_epochs):
        for minibatch_index in range(n_batches):
            train_model(minibatch_index)
        train_error = numpy.mean([test_model(i) for i in range(n_batches)])
        if best_train_error > train_error:
            best_train_error = train_error
        print('epoch %d, best_train_error %lf, train_error %lf'
              % (epoch, best_train_error, train_error))
    end_time = time.time()
    print('cost %d' % (end_time - start_time))
def __gradients(self, mini_batch):
    """Return the objective's gradients for the three parameter sets.

    Returns:
        tuple: gradients of ``self.__objective(mini_batch)`` with respect to
        ``self.Entity``, ``self.Relation`` and ``self.RelationNormal``, in
        that order.
    """
    objective = self.__objective(mini_batch)
    targets = (self.Entity, self.Relation, self.RelationNormal)
    return tuple(T.grad(objective, wrt=target) for target in targets)
def build(self):
    """Create the local and global metrics, their errors and their updates.

    For each of the ``self.M`` stages a float32 identity matrix ``M`` is
    created as a shared variable.  When the corresponding flag
    (``self.localM`` / ``self.globalM``) is off, only the matrix and zero
    errors are recorded.  Otherwise the pull and push errors are weighted
    by ``1 - mu`` and ``mu`` and a gradient-descent update of ``M`` on
    their sum is recorded.

    Results are stored on ``self`` as ``lM``, ``lpullerror``,
    ``lpusherror``, ``lupdate`` and the ``g``-prefixed counterparts.
    """
    self.debug = []

    def new_metric():
        # Every metric starts out as the identity.
        return theano.shared(value=np.eye(self.dim, dtype='float32'),
                             name='M', borrow=True)

    pull_weight = 1 - self.mu
    push_weight = self.mu

    lM, lpullerror, lpusherror, lupdate = [], [], [], []
    for i in range(self.M):
        M = new_metric()
        lM.append(M)
        if not self.localM:
            lpullerror.append(0.0)
            lpusherror.append(0.0)
            continue
        pullerror, pusherror = self._local_error(M, i)
        # Weight each term exactly once.  The errors used to be scaled in
        # place and then scaled again when recorded, so the reported values
        # disagreed with the global branch below.
        weighted_pull = pull_weight * pullerror
        weighted_push = push_weight * pusherror
        error = weighted_pull + weighted_push
        lupdate.append((M, M - self._lr[i] * T.grad(error, M)))
        lpullerror.append(weighted_pull)
        lpusherror.append(weighted_push)
    self.lM = lM
    self.lpusherror = lpusherror
    self.lpullerror = lpullerror
    self.lupdate = lupdate

    gM, gpullerror, gpusherror, gupdate = [], [], [], []
    for i in range(self.M):
        M = new_metric()
        if not self.globalM:
            gM.append(M)
            gpullerror.append(0.0)
            gpusherror.append(0.0)
            continue
        # Every global metric but the first is handed its predecessor.
        previous = None if i == 0 else gM[-1]
        pullerror, pusherror = self._global_error(M, i, previous)
        weighted_pull = pull_weight * pullerror
        weighted_push = push_weight * pusherror
        error = weighted_pull + weighted_push
        # Global learning rates follow the local ones in self._lr.
        gupdate.append((M, M - self._lr[i + self.M] * T.grad(error, M)))
        gM.append(M)
        gpullerror.append(weighted_pull)
        gpusherror.append(weighted_push)
    self.gM = gM
    self.gpusherror = gpusherror
    self.gpullerror = gpullerror
    self.gupdate = gupdate
def create_TrainFunc_tranPES(simfn, embeddings, marge=0.5, alpha=1., beta=1.):
    """Compile the training function of the tranPES model.

    Args:
        simfn: similarity function forwarded to ``tranPES3``.
        embeddings: sequence whose first item is the entity embedding and
            whose second item is the relation embedding; both expose their
            matrix as ``.E``.
        marge: margin passed to ``margeCost``.
        alpha: weight passed to ``regEmb`` for the entity regulariser.
        beta: weight passed to ``regLink`` for the relation regulariser.

    Returns:
        A Theano function taking ``(lemb, lremb, hp, rp, tp, hn, tn,
        subtensorE, subtensorR)`` and returning ``[cost, mean(out),
        mean(outc), embreg[0], lembreg]`` while applying one
        gradient-descent step to both embedding matrices.
    """
    # parse the embedding data
    embedding = embeddings[0]   # D x N matrix
    lembedding = embeddings[1]

    # declare the symbolic variables for training triples
    hp = S.csr_matrix('head positive')  # N x batchsize matrix
    rp = S.csr_matrix('relation')
    tp = S.csr_matrix('tail positive')
    hn = S.csr_matrix('head negative')
    tn = S.csr_matrix('tail negative')

    lemb = T.scalar('embedding learning rate')
    lremb = T.scalar('relation learning rate')
    subtensorE = T.ivector('batch entities set')
    subtensorR = T.ivector('batch link set')

    # Generate the training positive and negative triples
    hpmat = S.dot(embedding.E, hp).T    # batchsize x D dense matrix
    rpmat = S.dot(lembedding.E, rp).T
    tpmat = S.dot(embedding.E, tp).T
    hnmat = S.dot(embedding.E, hn).T
    tnmat = S.dot(embedding.E, tn).T

    def stack_pair(head, tail):
        # Concatenate head and tail along axis 1, view the result as
        # (rows, 2, columns) and swap the last two axes.
        joined = T.concatenate([head, tail], axis=1)
        return joined.reshape((head.shape[0], 2, head.shape[1])).dimshuffle(0, 2, 1)

    # calculate the score
    pos = tranPES3(simfn, stack_pair(hpmat, tpmat), hpmat, rpmat, tpmat)
    negh = tranPES3(simfn, stack_pair(hnmat, tpmat), hnmat, rpmat, tpmat)
    negt = tranPES3(simfn, stack_pair(hpmat, tnmat), hpmat, rpmat, tnmat)

    costh, outh = margeCost(pos, negh, marge)
    costt, outt = margeCost(pos, negt, marge)

    embreg = regEmb(embedding, subtensorE, alpha)
    lembreg = regLink(lembedding, subtensorR, beta)

    cost = costh + costt + embreg[0] + lembreg
    out = T.concatenate([outh, outt])
    outc = embreg[1]

    # list of inputs to the function
    list_in = [lemb, lremb, hp, rp, tp, hn, tn, subtensorE, subtensorR]

    # updating the embeddings using gradient descent
    emb_grad = T.grad(cost, embedding.E)
    New_embedding = embedding.E - lemb * emb_grad
    remb_grad = T.grad(cost, lembedding.E)
    New_rembedding = lembedding.E - lremb * remb_grad

    # Build from a list of pairs: a dict literal would drop the ordering
    # before OrderedDict ever sees it.
    updates = OrderedDict([
        (embedding.E, New_embedding),
        (lembedding.E, New_rembedding),
    ])

    return theano.function(
        list_in,
        [cost, T.mean(out), T.mean(outc), embreg[0], lembreg],
        updates=updates,
        on_unused_input='ignore')
def get_gradients(self, X, Y, weights=1.0):
    """Return gradients of the variance-scaled Gaussian cost.

    The cost is ``-log_sigma - 0.5 * (X - mean)**2 / exp(2 * log_sigma)``
    with ``mean`` and ``log_sigma`` taken from ``self.sample_expected(Y)``.
    Parameters of ``self.mlp`` are differentiated against the cost scaled
    by the row-wise sum of ``sigma**2``; the four parameters unpacked from
    ``self.parameters`` against the cost scaled element-wise by
    ``sigma**2``.  ``X`` and ``Y`` are treated as constants.

    Args:
        X: symbolic observed values.
        Y: symbolic conditioning values.
        weights: the default ``1.0`` leaves the cost untouched; any other
            value must offer ``dimshuffle`` and the cost is multiplied by
            ``-weights.dimshuffle(0, "x")``.

    Returns:
        OrderedDict: maps each parameter to its gradient expression.
    """
    W_mean, W_ls, b_mean, b_ls = self.parameters

    mean, log_sigma = self.sample_expected(Y)
    sigma = tensor.exp(log_sigma)

    cost = -log_sigma - 0.5 * (X - mean) ** 2 / tensor.exp(2 * log_sigma)
    # NOTE(review): the weighted branch also flips the sign of the cost;
    # kept as found - confirm this is intended.
    if weights != 1.0:
        cost = -weights.dimshuffle(0, "x") * cost

    variance = sigma ** 2
    cost_scaled = variance * cost
    cost_gscale = variance.sum(axis=1).dimshuffle([0, "x"]) * cost

    # Build each scalar objective once and reuse it for every gradient.
    total_gscale = cost_gscale.sum()
    total_scaled = cost_scaled.sum()

    gradients = OrderedDict()

    # .values() works on Python 2 and 3 and the keys were never used.
    # Assumes get_parameters() returns a dict-like object - TODO confirm.
    for param in Selector(self.mlp).get_parameters().values():
        gradients[param] = tensor.grad(total_gscale, param,
                                       consider_constant=[X, Y])

    for param in (W_mean, b_mean, W_ls, b_ls):
        gradients[param] = tensor.grad(total_scaled, param,
                                       consider_constant=[X, Y])

    return gradients
def theano_setup(self):
    """Compile the Theano functions of the autoencoder.

    ``self.act_func[0]`` selects the hidden activation and
    ``self.act_func[1]`` the output activation; each must be ``'tanh'``,
    ``'sigmoid'`` or ``'id'``.  The compiled functions are stored as
    ``theano_encode_decode``, ``theano_loss``, ``theano_gradients``,
    ``theano_encode`` and ``theano_decode``.

    Raises:
        ValueError: if either activation name is not recognised.
    """
    # The matrices Wb and Wc were originally tied.
    # Because of that, Wb and Wc are kept with the same shape
    # (instead of being transposed) to disturb the code as
    # little as possible.
    Wb = T.dmatrix('Wb')
    Wc = T.dmatrix('Wc')
    b = T.dvector('b')
    c = T.dvector('c')
    s = T.dscalar('s')
    x = T.dmatrix('x')

    activations = {
        'tanh': T.tanh,
        'sigmoid': T.nnet.sigmoid,
        'id': lambda pre_activation: pre_activation,  # bad idea for the hidden layer
    }

    def activation(position):
        # Raising a bare string is itself a TypeError; raise a real
        # exception that names the offending value.
        name = self.act_func[position]
        if name not in activations:
            raise ValueError("Invalid act_func[%d]: %r" % (position, name))
        return activations[name]

    h_act = T.dot(x, Wc) + c
    h = activation(0)(h_act)

    r_act = T.dot(h, Wb.T) + b
    r = s * activation(1)(r_act)

    # Another variable to be able to call a function
    # with a noisy x and compare it to a reference x.
    y = T.dmatrix('y')
    loss = ((r - y) ** 2)
    sum_loss = T.sum(loss)

    # theano_encode_decode : vectorial function in argument X.
    # theano_loss          : vectorial function in argument X.
    # theano_gradients     : returns the five gradients (Wb, Wc, b, c, s),
    #                        each of which involves all the data X summed,
    #                        so it is not a "vectorial" function.
    self.theano_encode_decode = function([Wb, Wc, b, c, s, x], r)
    self.theano_loss = function([Wb, Wc, b, c, s, x, y], loss)
    self.theano_gradients = function(
        [Wb, Wc, b, c, s, x, y],
        [T.grad(sum_loss, param) for param in (Wb, Wc, b, c, s)])

    # other useful theano functions for the experiments that involve
    # adding noise to the hidden states
    self.theano_encode = function([Wc, c, x], h)
    self.theano_decode = function([Wb, b, s, h], r)
def test_gradient_batch_normalization_op():
    """Check that the op standardises the gradient flowing back through it.

    The gradient taken through the op must equal the plain gradient after
    subtracting its per-column mean and dividing by its per-column standard
    deviation plus epsilon, and must itself have zero mean and unit
    standard deviation along axis 0.
    """
    epsilon = 1e-8
    op = gn.GradientBatchNormalizationOp(subtract_mean=True,
                                         keep_mean=False,
                                         epsilon=epsilon)

    X = np.random.randn(3, 4).astype(fX)
    W = np.random.randn(2, 3).astype(fX)
    x = T.matrix("x")
    w = T.matrix("w")
    feed = {x: X, w: W}

    orig_grad = T.grad(w.dot(x).sum(), x).eval(feed)
    new_grad = T.grad(w.dot(op(x)).sum(), x).eval(feed)

    # Reference standardisation computed with NumPy.
    centered = orig_grad - orig_grad.mean(axis=0, keepdims=True)
    ans = centered / (orig_grad.std(axis=0, keepdims=True) + epsilon)

    np.testing.assert_allclose(ans, new_grad, rtol=1e-5)
    np.testing.assert_allclose(np.zeros(4), new_grad.mean(axis=0), atol=1e-5)
    np.testing.assert_allclose(np.ones(4), new_grad.std(axis=0), rtol=1e-5)
def __init__(self, sizes, input_dim, output_dim):
    """Stack ``HiddenLayer`` objects and compile predict/train functions.

    Args:
        sizes: sequence of hidden-layer widths (any iterable; it is copied
            into a list).
        input_dim: width of the input vector.
        output_dim: width of the output vector.

    ``self.predict(x)`` returns ``[yhat]``.  ``self.train_step(x, y,
    learning_rate)`` returns ``[squared error]`` and applies one
    gradient-descent step to every layer's ``W`` and ``b``.
    """
    # Copy into a list so tuples and other iterables are accepted too.
    sizes = list(sizes)
    self.layers = len(sizes) + 1
    in_dim = [input_dim] + sizes
    out_dim = sizes + [output_dim]

    x = T.dvector('x')
    y = T.dvector('y')

    # Chain the layers: each one consumes the previous layer's output.
    self.hyp_params = []
    layer_input = x
    for n_in, n_out in zip(in_dim, out_dim):
        layer = HiddenLayer(layer_input, n_in, n_out)
        self.hyp_params.append(layer.params)
        layer_input = layer.output
    yhat = layer_input

    self.predict = theano.function([x], [yhat])

    o_error = T.sum(T.sqr(yhat - y))

    learning_rate = T.scalar('learning_rate')
    updates = []
    for param in self.hyp_params:
        for key in ('W', 'b'):
            updates.append(
                (param[key],
                 param[key] - learning_rate * T.grad(o_error, param[key])))

    self.train_step = theano.function([x, y, learning_rate], [o_error],
                                      updates=updates)
def train(self, epochs = 1000, learning_rate = 0.1):
    """Run full-batch gradient descent on ``self.regression``.

    A single Theano function is compiled with ``self.X`` and ``self.Y``
    substituted for the symbolic inputs.  Each call performs one update of
    ``regression.W`` and ``regression.b`` and returns the error, which is
    printed per epoch.

    Args:
        epochs: number of update steps.
        learning_rate: step size applied to both gradients.
    """
    model = self.regression
    data_x, data_y = self.X, self.Y

    x = T.matrix('x')
    y = T.vector('y')

    error = model.error(x, y)

    # One (variable, update expression) pair per parameter.
    updates = [
        (param, param - learning_rate * T.grad(cost=error, wrt=param))
        for param in (model.W, model.b)
    ]

    # Returns the error and, as a side effect, applies ``updates``.
    train_model = tn.function(
        inputs=[],
        outputs=error,
        updates=updates,
        givens={x: data_x, y: data_y},
    )

    print('training start:')
    started = timeit.default_timer()

    epoch = 0
    while epoch < epochs:
        avg_error = train_model()
        print('epoch {0}, error {1}'.format(epoch, avg_error), end='\r')
        epoch += 1

    # The error is evaluated before the elapsed time is read, as the
    # format arguments are evaluated left to right.
    final_error = model.error(data_x, data_y).eval()
    elapsed = timeit.default_timer() - started
    print('training finish (start: {0}) took {1} seconds.'.format(final_error, elapsed))
def test_relu_grad(self):
    """Compare gradients through the MKL and reference concatenate.

    NOTE(review): despite the name, this test concatenates two images
    along axis 1 and compares the gradients of the summed result; nothing
    here involves relu.
    """
    seed = utt.fetch_seed()
    rng = numpy.random.RandomState(seed)

    imgsize_list = ((5, 5), (6, 6), (6, 6), (8, 8))
    n, c = 4, 2
    axis = 1

    image = T.dtensor4('image')
    image1 = T.dtensor4('image1')

    # The graphs do not depend on the image size, so both functions are
    # compiled once instead of once per size.
    gx_ref = T.grad(T.sum(T.concatenate([image, image1], axis)),
                    [image, image1])
    f_ref = theano.function([image, image1], outputs=gx_ref,
                            mode=mode_without_mkl)

    gx_mkl = T.grad(T.sum(self.mkl_concatenate_func(axis, image, image1)),
                    [image, image1])
    f_mkl = theano.function([image, image1], outputs=gx_mkl)

    for height, width in imgsize_list:
        imval = rng.rand(n, c, height, width)
        output_ref = f_ref(imval, imval)
        output_mkl = f_mkl(imval, imval)
        utt.assert_allclose(output_mkl, output_ref)
def test_reduce_custom_dtype(self):
    """Test the ability to provide your own output dtype for a reduce."""
    # We try multiple axis combinations even though axis should not matter.
    idx = 0
    for method in self.methods:
        for input_dtype in self.dtypes:
            x = tensor.matrix(dtype=input_dtype)
            for output_dtype in self.dtypes:
                # If the output is a complex, the gradient of the reduce will
                # cast the complex to the input dtype. We can't call the normal
                # cast on a complex to a not complex as this is ambiguous.
                if (not input_dtype.startswith('complex') and
                        output_dtype.startswith('complex')):
                    continue

                axis = self.axes[idx % len(self.axes)]
                var = getattr(x, method)(dtype=output_dtype, axis=axis)
                assert var.dtype == output_dtype

                f = theano.function([x], var, mode=self.mode)
                topo = f.maker.fgraph.toposort()
                # The failure message used to reference an undefined name
                # (``dtype``), which would mask the assertion with a
                # NameError.
                assert [n for n in topo if isinstance(n.op, self.op)], \
                    (topo, output_dtype)

                data = numpy.random.rand(3, 4) * 10
                data = data.astype(input_dtype)
                f(data)

                # Complex inputs stop here; the axis index is then not
                # advanced either.
                if "complex" in input_dtype:
                    continue

                # Check that we can take the gradient
                tensor.grad(var.sum(), x, disconnected_inputs='ignore')
                idx += 1
def _training_updates(self, **kwargs):
    """Build the gradient-descent update expressions for ``W`` and ``b``.

    .. note:: This method should only be called from the ``setup()``
              class method.

    :type learning_rate: theano.config.floatX
    :param learning_rate: Coefficient by which each gradient is scaled in
                          one update step.

    :type cost: theano.tensor.TensorType
    :param cost: The cost expression to differentiate.

    :returns: A list of ``(param, update_expr)`` tuples that can be passed
              directly to ``theano.function`` as the ``updates`` field.
    """
    utils.check_kwargs(kwargs, ['learning_rate', 'cost'])

    step_size = kwargs['learning_rate']
    objective = kwargs['cost']

    # Problem: need symbolic 'y' for self.negative_log_likelihood(y)
    # TODO: test behavior with dummy TT.ivector symbolic variable
    return [
        (param, param - step_size * TT.grad(cost=objective, wrt=param))
        for param in (self.W, self.b)
    ]
def __init__(self, numvis, numhid, vistype, init_features, selectionthreshold=1.0, weightcost=0.0):
    """Build a tied-weight autoencoder with thresholded linear hiddens.

    Args:
        numvis: number of visible units (length of the visible bias).
        numhid: number of hidden units (stored only).
        vistype: ``'binary'`` (sigmoid outputs, cross-entropy cost) or
            ``'real'`` (linear outputs, half squared error).
        init_features: initial weight matrix; cast to ``floatX``.
        selectionthreshold: hidden pre-activations at or below this value
            are zeroed.
        weightcost: coefficient of the ``sum(W**2)`` penalty.

    Raises:
        ValueError: if ``vistype`` is neither ``'binary'`` nor ``'real'``.
    """
    self.numvis = numvis
    self.numhid = numhid
    self.vistype = vistype
    self.weightcost = weightcost
    self.selectionthreshold = theano.shared(value=selectionthreshold,
                                            name='selectionthreshold')
    self.W_init = init_features.astype(theano.config.floatX)
    self.W = theano.shared(value=self.W_init, name='W')
    self.bvis = theano.shared(
        value=numpy.zeros(numvis, dtype=theano.config.floatX), name='bvis')
    self.inputs = T.matrix(name='inputs')
    self.params = [self.W, self.bvis]

    self._prehiddens = T.dot(self.inputs, self.W)
    # Keep only the pre-activations that exceed the threshold.
    self._hiddens = (self._prehiddens > self.selectionthreshold) * self._prehiddens

    visible_act = T.dot(self._hiddens, self.W.T) + self.bvis
    if self.vistype == 'binary':
        self._outputs = T.nnet.sigmoid(visible_act)
        costpercase = -T.sum(
            self.inputs * T.log(self._outputs)
            + (1 - self.inputs) * T.log(1 - self._outputs),
            axis=1)
    elif self.vistype == 'real':
        self._outputs = visible_act
        costpercase = T.sum(0.5 * ((self.inputs - self._outputs) ** 2), axis=1)
    else:
        # An unknown type used to surface later as a NameError on
        # ``costpercase``; fail with an explicit message instead.
        raise ValueError("vistype must be 'binary' or 'real', got %r"
                         % (self.vistype,))

    self._cost = T.mean(costpercase)
    self._cost += self.weightcost * T.sum(self.W ** 2)
    self._grads = T.grad(self._cost, self.params)

    self.cost = theano.function([self.inputs], self._cost)
    # Reuse the gradient expressions instead of building them a second time.
    self.grad = theano.function([self.inputs], self._grads)
    self.prehiddens = theano.function([self.inputs], self._prehiddens)
    self.hiddens = theano.function([self.inputs], self._hiddens)
    self.recons_from_prehiddens = theano.function([self._prehiddens], self._outputs)
    self.recons_from_inputs = theano.function([self.inputs], self._outputs)
def mcmc(ll, *frvs):
    """Perform one leapfrog proposal with an accept/reject switch.

    Intended as a ``theano.scan`` step.  ``observations``, ``free_RVs``,
    ``free_RVs_prop``, ``full_log_likelihood``, ``epsilon`` and ``U`` are
    read from the enclosing scope.

    Args:
        ll: log-likelihood carried over from the previous step.
        *frvs: current values of the free random variables, in the order
            of ``free_RVs``.

    Returns:
        tuple: ``(outputs, updates, condition)`` where ``outputs`` is the
        new log-likelihood followed by the new value of every free
        variable (previous values are kept on rejection), ``updates`` is
        an empty dict and ``condition`` ends the scan once a proposal is
        accepted.
    """
    def negative_loglik(values):
        # Merge the fixed observations with the given free-variable values.
        full_observations = dict(observations)
        full_observations.update(zip(free_RVs, values))
        return -full_log_likelihood(full_observations)

    def kinetic(momenta):
        return tensor.add(*[tensor.sum(tensor.sqr(p)) for p in momenta]) / 2.

    loglik = negative_loglik(frvs)
    proposals = free_RVs_prop
    H = kinetic(proposals) + loglik

    # -- this should be an inner loop
    # Pair every momentum with the gradient of its own variable.  The
    # gradients used to be wrapped in an extra list, so zip() stopped after
    # the first variable and only grad[0] was ever used; with a single free
    # variable the result is unchanged.
    grads = tensor.grad(loglik, list(frvs))
    proposals = [p - epsilon * g / 2. for p, g in zip(proposals, grads)]

    rvsp = [rv + epsilon * p for rv, p in zip(frvs, proposals)]
    new_loglik = negative_loglik(rvsp)

    new_grads = tensor.grad(new_loglik, rvsp)
    proposals = [p - epsilon * g / 2. for p, g in zip(proposals, new_grads)]
    # --

    Hnew = kinetic(proposals) + new_loglik
    dH = Hnew - H
    accept = tensor.or_(dH < 0., U < tensor.exp(-dH))

    outputs = [tensor.switch(accept, -new_loglik, ll)]
    outputs += [tensor.switch(accept, new, old) for new, old in zip(rvsp, frvs)]
    return outputs, {}, theano.scan_module.until(accept)
def test_conv_no_bias(self):
    """Compare gradients of the MKL convolution with the reference conv2d.

    Both graphs take the mean of a bias-free convolution and differentiate
    it with respect to the images and the weights.  The compiled graphs
    must differ in size and the image gradients must agree and be float64.
    """
    image_shape = (12, 3, 256, 256)
    kernel_shape = (12, 3, 3, 3)

    images = T.dtensor4('input_conv')
    weights = T.dtensor4('weights')

    # MKL path: user layout -> internal layout -> convolution -> user layout.
    images_internal = U2IConv(imshp=image_shape, kshp=kernel_shape)(images)
    conv_mkl = Conv2D(imshp=image_shape, kshp=kernel_shape,
                      filter_flip=False)(images_internal, weights)
    loss_mkl = T.mean(I2U()(conv_mkl))
    grads_mkl = [T.grad(loss_mkl, images), T.grad(loss_mkl, weights)]

    ival = numpy.random.rand(*image_shape).astype(numpy.float64)
    wval = numpy.random.rand(*kernel_shape).astype(numpy.float64)

    fopt = theano.function(inputs=[images, weights], outputs=grads_mkl,
                           mode=mode_with_mkl)
    new_out = fopt(ival, wval)

    # Reference path.
    conv_ref = conv2d(images, weights, input_shape=image_shape,
                      filter_shape=kernel_shape, filter_flip=False)
    loss_ref = T.mean(conv_ref)
    grads_ref = [T.grad(loss_ref, images), T.grad(loss_ref, weights)]

    fori = theano.function(inputs=[images, weights], outputs=grads_ref,
                           mode=mode_without_mkl)
    old_out = fori(ival, wval)

    assert len(fopt.maker.fgraph.toposort()) != len(fori.maker.fgraph.toposort())
    assert numpy.allclose(old_out[0], new_out[0])
    assert new_out[0].dtype == 'float64'
def check_mat_rop_lop(self, y, out_shape):
    """Check ``Rop`` and ``Lop`` of ``y`` with respect to the matrix input.

    ``Rop`` is compared with a scan that differentiates every ``y[i]``
    separately; ``Lop`` is compared with the gradient of
    ``(self.v * y).sum()``.  The non-differentiable case is checked with
    ``self.mx`` replaced by ``break_op(self.mx)``.

    Args:
        y: symbolic expression built from ``self.mx``.
        out_shape: shape of the random vector fed as ``self.v``.
    """
    floatX = theano.config.floatX
    vx = numpy.asarray(self.rng.uniform(size=self.mat_in_shape), floatX)
    vv = numpy.asarray(self.rng.uniform(size=self.mat_in_shape), floatX)

    # --- R-operator -------------------------------------------------------
    rop_f = function([self.mx, self.mv], tensor.Rop(y, self.mx, self.mv))

    def directional_derivative(i, y, x, v):
        return (tensor.grad(y[i], x) * v).sum()

    scanned, _ = theano.scan(
        directional_derivative,
        sequences=tensor.arange(y.shape[0]),
        non_sequences=[y, self.mx, self.mv])
    scan_f = function([self.mx, self.mv], scanned)

    v1 = rop_f(vx, vv)
    v2 = scan_f(vx, vv)
    assert numpy.allclose(v1, v2), ('ROP mismatch: %s %s' % (v1, v2))

    self.check_nondiff_rop(
        theano.clone(y, replace={self.mx: break_op(self.mx)}))

    # --- L-operator -------------------------------------------------------
    vv = numpy.asarray(self.rng.uniform(size=out_shape), floatX)
    lop_f = function([self.mx, self.v], tensor.Lop(y, self.mx, self.v))
    grad_f = function([self.mx, self.v],
                      tensor.grad((self.v * y).sum(), self.mx))

    v1 = lop_f(vx, vv)
    v2 = grad_f(vx, vv)
    assert numpy.allclose(v1, v2), ('LOP mismatch: %s %s' % (v1, v2))
def get_net(net_cfg, args=None):
    """Build the energy-based network and compile its Theano functions.

    Args:
        net_cfg: callable taking ``args`` and returning the output layer.
        args: configuration dict.  ``"lambda"`` weights the quadratic term
            of the energy; ``"learning_rate"`` is optional and defaults to
            ``1.0`` (assumed to match the optimiser's own default - TODO
            confirm).  When omitted, ``{"lambda": 0.5}`` is used.

    Returns:
        dict: ``train_fn`` (returns ``[loss, energy]`` and applies the
        adadelta updates), ``energy_fn``, ``out_fn``, the shared learning
        rate ``lr``, the shared bias ``b_prime`` and the output layer
        ``l_out``.
    """
    # A dict default would be shared between calls; create it per call.
    if args is None:
        args = {"lambda": 0.5}

    l_out = net_cfg(args)
    X = T.tensor4('X')
    X_noise = X + srng.normal(X.shape, std=1.)
    b_prime = theano.shared(np.zeros((1, 28, 28)))

    net_out = get_output(l_out, X)
    net_out_noise = get_output(l_out, X_noise)

    energy = args["lambda"] * ((X - b_prime) ** 2).sum() - net_out.sum()
    energy_noise = (args["lambda"] * ((X_noise - b_prime) ** 2).sum()
                    - net_out_noise.sum())

    # reconstruction: one gradient step on the energy
    fx = X - T.grad(energy, X)
    fx_noise = X_noise - T.grad(energy_noise, X_noise)

    # The clean input is compared with the reconstruction of the noisy one.
    loss = ((X - fx_noise) ** 2).sum(axis=[1, 2, 3]).mean()

    params = get_all_params(l_out, trainable=True)
    params += [b_prime]

    # The former default dict had no "learning_rate" key, so calling
    # without ``args`` raised KeyError here.
    lr = theano.shared(floatX(args.get("learning_rate", 1.0)))
    updates = adadelta(loss, params, learning_rate=lr)

    train_fn = theano.function([X], [loss, energy], updates=updates)
    energy_fn = theano.function([X], energy)
    out_fn = theano.function([X], fx)

    return {
        "train_fn": train_fn,
        "energy_fn": energy_fn,
        "out_fn": out_fn,
        "lr": lr,
        "b_prime": b_prime,
        "l_out": l_out,
    }
def calculate_Rl(v_input):
    """Return the per-parameter terms ``Rl`` followed by the log-probability.

    ``Rl = log p(v|h) * grad(log p(h|v)) + grad(log p(v|h))`` with one
    entry per element of ``self.params``.  The summed log of the
    reconstructed visible mean is appended as the last element.
    """
    # Sample the hidden units for this input, then propagate back down.
    _, hl_mean, hl_sample = self.sample_h_given_v(v_input)
    _, vn_mean = self.propdown(hl_sample)

    log_visible = T.log(vn_mean).sum()

    # Part 1: each gradient of the hidden log-probability, scaled by the
    # visible log-probability.
    hidden_grads = T.grad(T.log(hl_mean).sum(),
                          self.params,
                          disconnected_inputs='warn')
    part1 = [grad * log_visible for grad in hidden_grads]

    # Part 2: gradient of the visible log-probability with the hidden
    # sample held constant.
    part2 = T.grad(T.log(self.propdown(hl_sample)[1]).sum(),
                   self.params,
                   consider_constant=[hl_sample],
                   disconnected_inputs='warn')

    # Add the two parts element by element, then append the scalar.
    Rl = [first + second for first, second in zip(part1, part2)]
    Rl.append(log_visible)
    return Rl
def test_downsample():
    """Compare GPU and CPU ``DownsampleFactorMax`` and its gradient.

    For every shape, pooling size and ``ignore_border`` setting, the
    function compiled in GPU mode must contain the GPU op, the function
    compiled in CPU mode the CPU op, and both must give matching values;
    the same is checked for the gradient of the summed output.
    """
    shps = [
        (1, 1, 1, 12),
        (1, 1, 2, 2),
        (1, 1, 1, 1),
        (1, 1, 4, 4),
        (1, 1, 10, 11),
        (1, 2, 2, 2),
        (3, 5, 4, 4),
        (25, 1, 7, 7),
        (1, 1, 12, 12),
        (1, 1, 2, 14),
        (1, 1, 12, 14),
        (1, 1, 14, 14),
        (1, 1, 16, 16),
        (1, 1, 18, 18),
        (1, 1, 24, 24),
        (1, 6, 24, 24),
        (10, 1, 24, 24),
        (10, 6, 24, 24),
        (30, 6, 12, 12),
        (30, 2, 24, 24),
        (30, 6, 24, 24),
        (10, 10, 10, 11),
        (1, 1, 10, 1025),
        (1, 1, 10, 1023),
        (1, 1, 1025, 10),
        (1, 1, 1023, 10),
    ]
    numpy.random.RandomState(unittest_tools.fetch_seed()).shuffle(shps)

    def has_op(fn, op_class):
        return any([isinstance(node.op, op_class)
                    for node in fn.maker.env.toposort()])

    for shp in shps:
        for ds in (2, 2), (3, 2), (1, 1):
            # Skip pooling windows larger than the image, and outputs with
            # more than 512 columns, which GpuDownsampleFactorMax doesn't
            # like.
            if (ds[0] > shp[2] or ds[1] > shp[3]
                    or float(shp[3]) / ds[1] > 512):
                continue
            for ignore_border in (True, False):
                print("test_downsample %s %s %s" % (shp, ds, ignore_border))
                ds_op = DownsampleFactorMax(ds, ignore_border=ignore_border)

                a = tcn.shared_constructor(my_rand(*shp), "a")
                pooled = ds_op(tensor.as_tensor_variable(a))

                # Forward pass.
                f = pfunc([], pooled, mode=mode_with_gpu)
                f2 = pfunc([], pooled, mode=mode_without_gpu)
                assert has_op(f, tcn.blas.GpuDownsampleFactorMax)
                assert has_op(f2, DownsampleFactorMax)
                assert numpy.allclose(f(), f2())

                # Gradient of the summed output.
                pooled_grad = tensor.grad(pooled.sum(), a)
                g = pfunc([], pooled_grad, mode=mode_with_gpu)
                g2 = pfunc([], pooled_grad, mode=mode_without_gpu)
                assert has_op(g, tcn.blas.GpuDownsampleFactorMaxGrad)
                assert has_op(g2, DownsampleFactorMaxGrad)
                assert numpy.allclose(g(), g2())
def get_params_and_grads(graph, cost, verbose=False):
    """Collect the graph's parameters and the gradients of ``cost``.

    Entries stored under ``DATASETS_ID`` and ``RANDOM_ID`` are skipped.

    Args:
        graph: mapping from names to variables.
        cost: expression to differentiate.
        verbose: when true, differentiate one parameter at a time and
            print its name first; otherwise a single ``tensor.grad`` call
            handles the whole list.

    Returns:
        tuple: ``(params, grads)``.
    """
    skipped = (DATASETS_ID, RANDOM_ID)
    named_params = [(name, var) for name, var in graph.items()
                    if name not in skipped]
    params = [var for _, var in named_params]

    if not verbose:
        return params, tensor.grad(cost, params)

    grads = []
    for name, var in named_params:
        print("Computing grad w.r.t %s" % name)
        grads.append(tensor.grad(cost, var))
    return params, grads
def fit(self, data_x, data_y):
    """Train a single softmax layer with mini-batch gradient descent.

    Args:
        data_x: shared variable holding the inputs; ``get_value`` and
            symbolic slicing are used on it.
        data_y: shared variable holding the integer labels.

    The layer is stored as ``self.single_layer``.  ``self.iters`` passes
    are made over the whole batches of ``self.batch_size`` rows; a trailing
    partial batch is skipped.
    """
    print("Training")
    # time.clock() was removed in Python 3.8; time.time() exists everywhere.
    start = time.time()

    # Floor division keeps the batch count an integer on Python 2 and 3.
    n_batches = data_x.get_value(borrow=True).shape[0] // self.batch_size

    tensor_x = T.matrix('x')
    tensor_y = T.ivector('y')
    index = T.lscalar('index')

    self.single_layer = Layer(self.n_in, self.n_out, T.nnet.softmax)
    cost = self.single_layer.negative_log_likelihood(tensor_x, tensor_y)

    g_W = T.grad(cost, self.single_layer.W)
    g_b = T.grad(cost, self.single_layer.b)
    updates = [
        (self.single_layer.W, self.single_layer.W - g_W * self.learning_rate),
        (self.single_layer.b, self.single_layer.b - g_b * self.learning_rate),
    ]

    batch = slice(index * self.batch_size, (index + 1) * self.batch_size)
    train_batch = theano.function(
        [index], [cost],
        updates=updates,
        givens={tensor_x: data_x[batch], tensor_y: data_y[batch]})

    train_batch_costs = [0] * n_batches
    # ``iteration`` instead of ``iter`` so the builtin is not shadowed.
    for iteration in range(self.iters):
        for minibatch_index in range(n_batches):
            train_batch_costs[minibatch_index] = train_batch(minibatch_index)
        if self.verbose == 1:
            print("Iter %d --> %f" % (iteration, np.mean(train_batch_costs)))

    end = time.time()
    print("Finished Training Logistic Regression Model\n"
          "Iterations %d\n"
          "Time Taken : %d secs" % (self.iters, end - start))
def __build_theano__(self):
    """Compile the Theano functions of the recurrent network.

    A ``scan`` runs the recurrence over the input index vector.  From its
    outputs the summed categorical cross-entropy and its gradients with
    respect to ``U``, ``V`` and ``W`` are derived, and the compiled
    callables are stored as ``forward``, ``predict``, ``c_error``, ``bptt``
    and ``sgd_step``.
    """
    words = ivector(name="x")
    targets = ivector(name="y")
    weights = [self.U, self.V, self.W]

    def step(word, prev_state, U, V, W):
        # new hidden state from the selected column of U and the old state
        state = T.tanh(U[:, word] + V.dot(prev_state))
        # only row 0 of the softmax result is passed on
        return [T.nnet.softmax(W.dot(state))[0], state]

    start_state = dict(initial=T.zeros(self.hidden_dim))
    (outputs, _states), _updates = theano.scan(
        step,
        sequences=words,
        outputs_info=[None, start_state],
        non_sequences=weights,
        truncate_gradient=4,
        strict=True)

    predicted = T.argmax(outputs, axis=1)
    loss = T.sum(T.nnet.categorical_crossentropy(outputs, targets))
    grads = [T.grad(loss, weight) for weight in weights]

    self.forward = theano.function([words], outputs)
    self.predict = theano.function([words], predicted)
    self.c_error = theano.function([words, targets], loss)
    self.bptt = theano.function([words, targets], grads)

    rate = scalar(name="learning_rate")
    self.sgd_step = theano.function(
        [words, targets, rate], [],
        updates=[(weight, weight - rate * grad)
                 for weight, grad in zip(weights, grads)])
def get_mean_square_norm_gradients_variance_method_00(D_by_layer, cost, accum = 0):
    """Add each layer's per-example gradient-variance terms to `accum`.

    Per the original author's note, the result is a theano variable of shape
    (minibatch_size, ) holding, for each training example, the associated
    mean of the variance wrt the gradient of that minibatch.

    Parameters
    ----------
    D_by_layer : dict
        Maps a layer name to a dict with 'input' and 'output' entries and,
        optionally, 'weight' and/or 'bias' entries.
    cost : symbolic expression
        Expression whose gradients are taken with ``tensor.grad``.
    accum : optional
        Starting value of the accumulation (default 0).  It is never
        modified in place.

    Returns
    -------
    `accum` plus the contribution of every layer.  An empty `D_by_layer`
    returns `accum` unchanged.
    """
    for D in D_by_layer.values():

        layer_input = D['input']
        backprop_output = tensor.grad(cost, D['output'])

        # The original author expected theano to compute the redundant
        # gradient nodes built below only once.

        if 'weight' in D:
            # Only look the weight up when the layer has one; reading it
            # unconditionally raised KeyError for weight-less layers.
            grad_weight = tensor.grad(cost, D['weight'])
            input_square_norms = tensor.sqr(layer_input).sum(axis=1)
            backprop_output_square_norms = tensor.sqr(backprop_output).sum(axis=1)

            A = input_square_norms * backprop_output_square_norms
            C = tensor.sqr(grad_weight).sum()
            # all the terms get this "middle" expression added to them
            B = (backprop_output.dot(grad_weight.T) * layer_input).sum(axis=1)

            accum = accum + (A - 2*B + C)

        if 'bias' in D:
            grad_bias = tensor.grad(cost, D['bias'])
            # this last `sum` could be a component-wise `max` if we wanted
            # to carry the maximum of the variances instead of the sum of squares
            accum = accum + tensor.sqr(backprop_output - grad_bias.reshape((1,-1))).sum(axis=1)

    return accum
def __theano_build__(self):
    """Build the symbolic graph of the RNN and compile its functions.

    Stores ``forward_propagation``, ``predict``, ``ce_error``, ``bptt`` and
    ``sgd_step`` on the instance.  Backpropagation through time is cut off
    after ``self.bptt_truncate`` steps.
    """
    params = (self.U, self.V, self.W)
    x = T.ivector('x')
    y = T.ivector('y')

    def recur(x_t, previous, U, V, W):
        # hidden state: input column of U plus recurrent contribution via W
        hidden = T.tanh(U[:, x_t] + W.dot(previous))
        # only row 0 of the softmax result is kept
        probs = T.nnet.softmax(V.dot(hidden))[0]
        return [probs, hidden]

    scan_outputs, _ = theano.scan(
        recur,
        sequences=x,
        outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))],
        non_sequences=list(params),
        truncate_gradient=self.bptt_truncate,
        strict=True)
    o = scan_outputs[0]

    prediction = T.argmax(o, axis=1)
    o_error = T.sum(T.nnet.categorical_crossentropy(o, y))

    # Gradients, in the order U, V, W
    gradients = [T.grad(o_error, param) for param in params]

    # Assign functions
    self.forward_propagation = theano.function([x], o)
    self.predict = theano.function([x], prediction)
    self.ce_error = theano.function([x, y], o_error)
    self.bptt = theano.function([x, y], gradients)

    # SGD
    learning_rate = T.scalar('learning_rate')
    sgd_updates = []
    for param, gradient in zip(params, gradients):
        sgd_updates.append((param, param - learning_rate * gradient))
    self.sgd_step = theano.function([x, y, learning_rate], [],
                                    updates=sgd_updates)
def fit(self, X, Y, learning_rate=1e-4, mu=0.9, decay=0.9, epochs=8, batch_sz=100, show_fig=False): # make a validation set X, Y = shuffle(X, Y) X = X.astype(np.float32) Y = Y.astype(np.int32) Xvalid, Yvalid = X[-1000:], Y[-1000:] X, Y = X[:-1000], Y[:-1000] self.rng = RandomStreams() # initialize hidden layers N, D = X.shape K = len(set(Y)) self.hidden_layers = [] M1 = D count = 0 for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 count += 1 W = np.random.randn(M1, K) / np.sqrt(M1) b = np.zeros(K) self.W = theano.shared(W, 'W_logreg') self.b = theano.shared(b, 'b_logreg') # collect params for later use self.params = [self.W, self.b] for h in self.hidden_layers: self.params += h.params # set up theano functions and variables thX = T.matrix('X') thY = T.ivector('Y') pY_train = self.forward_train(thX) # this cost is for training cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY])) # gradients wrt each param grads = T.grad(cost, self.params) # for momentum dparams = [ theano.shared(np.zeros_like(p.get_value())) for p in self.params ] # for rmsprop cache = [ theano.shared(np.ones_like(p.get_value())) for p in self.params ] new_cache = [ decay * c + (1 - decay) * g * g for p, c, g in zip(self.params, cache, grads) ] new_dparams = [ mu * dp - learning_rate * g / T.sqrt(new_c + 1e-10) for p, new_c, dp, g in zip(self.params, new_cache, dparams, grads) ] updates = [(c, new_c) for c, new_c in zip(cache, new_cache)] + [ (dp, new_dp) for dp, new_dp in zip(dparams, new_dparams) ] + [(p, p + new_dp) for p, new_dp in zip(self.params, new_dparams)] # momentum only # updates = [ # (p, p + mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, dparams) # ] + [ # (dp, mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, dparams) # ] train_op = theano.function(inputs=[thX, thY], updates=updates) # for evaluation and prediction pY_predict = self.forward_predict(thX) cost_predict = 
-T.mean(T.log(pY_predict[T.arange(thY.shape[0]), thY])) prediction = self.predict(thX) cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost_predict, prediction]) n_batches = N / batch_sz costs = [] for i in xrange(epochs): X, Y = shuffle(X, Y) for j in xrange(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)] train_op(Xbatch, Ybatch) if j % 20 == 0: c, p = cost_predict_op(Xvalid, Yvalid) costs.append(c) e = error_rate(Yvalid, p) print "i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e if show_fig: plt.plot(costs) plt.show()
input3_var: x3 }, deterministic=False) #Evaluate eval_out = lasagne.layers.get_output(output, { input_var: x1, input2_var: x2, input2_var: x3 }, deterministic=True) all_params = lasagne.layers.get_all_params(output, trainable=True) cost = T.nnet.binary_crossentropy(train_out, target_var).mean() all_grads = T.grad(cost, all_params) # Set the update function for parameters # you might wan't to experiment with more advanded update schemes like rmsprob, adadelta etc. updates = lasagne.updates.nesterov_momentum(all_grads, all_params, learning_rate=0.01, momentum=0.75) f_eval = theano.function([input_var, input2_var, input3_var], eval_out) f_train = theano.function([input_var, input2_var, input3_var, target_var], [cost], updates=updates) from confusionmatrix import ConfusionMatrix
def test_scan_debugprint5():
    """Check the ``debugprint`` text of the gradient of a scan result.

    A scan multiplies its previous result by ``A`` for ``k`` steps.  The
    gradient of the summed last result with respect to ``A`` is printed to a
    string and compared, line by line, with the expected rendering below.
    """
    k = tensor.iscalar("k")
    A = tensor.dvector("A")
    # Symbolic description of the result
    result, updates = theano.scan(fn=lambda prior_result, A: prior_result * A,
                                  outputs_info=tensor.ones_like(A),
                                  non_sequences=A,
                                  n_steps=k)
    final_result = tensor.grad(result[-1].sum(), A)
    # file='str' makes debugprint return the text instead of printing it
    output_str = theano.printing.debugprint(final_result, file='str')
    lines = output_str.split('\n')
    # NOTE(review): the expected text is reproduced exactly as it appears in
    # this file -- confirm its line breaks match real debugprint output.
    expected_output = """Subtensor{int64} [id A] '' |for{cpu,grad_of_scan_fn}.1 [id B] '' | |Elemwise{sub,no_inplace} [id C] '' | | |Subtensor{int64} [id D] '' | | | |Shape [id E] '' | | | | |for{cpu,scan_fn} [id F] '' | | | | |k [id G] | | | | |IncSubtensor{Set;:int64:} [id H] '' | | | | | |AllocEmpty{dtype='float64'} [id I] '' | | | | | | |Elemwise{add,no_inplace} [id J] '' | | | | | | | |k [id G] | | | | | | | |Subtensor{int64} [id K] '' | | | | | | | |Shape [id L] '' | | | | | | | | |Rebroadcast{0} [id M] '' | | | | | | | | |InplaceDimShuffle{x,0} [id N] '' | | | | | | | | |Elemwise{second,no_inplace} [id O] '' | | | | | | | | |A [id P] | | | | | | | | |InplaceDimShuffle{x} [id Q] '' | | | | | | | | |TensorConstant{1.0} [id R] | | | | | | | |Constant{0} [id S] | | | | | | |Subtensor{int64} [id T] '' | | | | | | |Shape [id U] '' | | | | | | | |Rebroadcast{0} [id M] '' | | | | | | |Constant{1} [id V] | | | | | |Rebroadcast{0} [id M] '' | | | | | |ScalarFromTensor [id W] '' | | | | | |Subtensor{int64} [id K] '' | | | | |A [id P] | | | |Constant{0} [id X] | | |TensorConstant{1} [id Y] | |Subtensor{:int64:} [id Z] '' | | |Subtensor{::int64} [id BA] '' | | | |Subtensor{:int64:} [id BB] '' | | | | |for{cpu,scan_fn} [id F] '' | | | | |Constant{-1} [id BC] | | | |Constant{-1} [id BD] | | |ScalarFromTensor [id BE] '' | | |Elemwise{sub,no_inplace} [id C] '' | |Subtensor{:int64:} [id BF] '' | | |Subtensor{:int64:} [id BG] '' | | | |Subtensor{::int64} [id BH] '' | | | | |for{cpu,scan_fn} [id F] '' | | | | |Constant{-1} [id BI] | | | |Constant{-1} [id BJ] | | |ScalarFromTensor [id BK] '' | 
| |Elemwise{sub,no_inplace} [id C] '' | |Subtensor{::int64} [id BL] '' | | |IncSubtensor{Inc;int64::} [id BM] '' | | | |Elemwise{second,no_inplace} [id BN] '' | | | | |for{cpu,scan_fn} [id F] '' | | | | |InplaceDimShuffle{x,x} [id BO] '' | | | | |TensorConstant{0.0} [id BP] | | | |IncSubtensor{Inc;int64} [id BQ] '' | | | | |Elemwise{second,no_inplace} [id BR] '' | | | | | |Subtensor{int64::} [id BS] '' | | | | | | |for{cpu,scan_fn} [id F] '' | | | | | | |Constant{1} [id BT] | | | | | |InplaceDimShuffle{x,x} [id BU] '' | | | | | |TensorConstant{0.0} [id BP] | | | | |Elemwise{second} [id BV] '' | | | | | |Subtensor{int64} [id BW] '' | | | | | | |Subtensor{int64::} [id BS] '' | | | | | | |Constant{-1} [id BX] | | | | | |InplaceDimShuffle{x} [id BY] '' | | | | | |Elemwise{second,no_inplace} [id BZ] '' | | | | | |Sum{acc_dtype=float64} [id CA] '' | | | | | | |Subtensor{int64} [id BW] '' | | | | | |TensorConstant{1.0} [id R] | | | | |Constant{-1} [id BX] | | | |Constant{1} [id BT] | | |Constant{-1} [id CB] | |Alloc [id CC] '' | | |TensorConstant{0.0} [id BP] | | |Elemwise{add,no_inplace} [id CD] '' | | | |Elemwise{sub,no_inplace} [id C] '' | | | |TensorConstant{1} [id Y] | | |Subtensor{int64} [id CE] '' | | |Shape [id CF] '' | | | |A [id P] | | |Constant{0} [id CG] | |A [id P] |Constant{-1} [id CH] Inner graphs of the scan ops: for{cpu,grad_of_scan_fn}.1 [id B] '' >Elemwise{add,no_inplace} [id CI] '' > |Elemwise{mul} [id CJ] '' > | |<TensorType(float64, vector)> [id CK] -> [id BL] > | |A_copy [id CL] -> [id P] > |<TensorType(float64, vector)> [id CM] -> [id BL] >Elemwise{add,no_inplace} [id CN] '' > |Elemwise{mul} [id CO] '' > | |<TensorType(float64, vector)> [id CK] -> [id BL] > | |<TensorType(float64, vector)> [id CP] -> [id Z] > |<TensorType(float64, vector)> [id CQ] -> [id CC] for{cpu,scan_fn} [id F] '' >Elemwise{mul,no_inplace} [id CR] '' > |<TensorType(float64, vector)> [id CP] -> [id H] > |A_copy [id CL] -> [id P] for{cpu,scan_fn} [id F] '' 
>Elemwise{mul,no_inplace} [id CR] '' for{cpu,scan_fn} [id F] '' >Elemwise{mul,no_inplace} [id CR] '' for{cpu,scan_fn} [id F] '' >Elemwise{mul,no_inplace} [id CR] '' for{cpu,scan_fn} [id F] '' >Elemwise{mul,no_inplace} [id CR] ''"""
    # zip stops at the shorter sequence, so surplus lines on either side are
    # not compared; surrounding whitespace is ignored on both sides.
    for truth, out in zip(expected_output.split("\n"), lines):
        assert truth.strip() == out.strip()
def __theano_build__(self):
    """Build the two-layer GRU graph and compile the model's functions.

    The input is a typed list of int vectors.  A first scan pads every
    entry with zeros up to length ``max_x``; a second scan runs the GRU
    recurrence over the padded result.  Stores ``predict``,
    ``predict_class``, ``ce_error``, ``bptt`` and ``sgd_step`` on the
    instance; ``sgd_step`` also updates the ``self.m*`` caches.
    """
    E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c
    # NOTE(review): max_x is used by the padding scan below but is not listed
    # among the inputs of any compiled function here -- confirm how it is
    # meant to be supplied.
    max_x = T.iscalar('max_x')
    x = tlist.TypedListType(T.ivector)()
    l = tlist.length(x)

    def batch_padding(index, x_t, max_x):
        # Pad entry `index` of the typed list with zeros to length max_x.
        #f = func([wl_t], word_length, updates = {(num_zeros, 10-word_length[0])})
        #f(wl_t)
        shape_ex = T.shape(x_t[index])
        # only the length of this arange matters; its values are replaced by
        # zeros via zeros_like below
        zero_vec = T.arange(max_x - shape_ex[0], dtype='int64')
        padded_x_t = T.concatenate(
            [x_t[index], T.zeros_like(zero_vec)], axis=0)
        return padded_x_t

    # one scan step per list entry
    x_padded, updates = theano.scan(fn=batch_padding,
                                    outputs_info=None,
                                    non_sequences=[x, max_x],
                                    sequences=[T.arange(l, dtype='int64')])
    #x = T.imatrix('x')
    #y = T.imatrix('y')
    # NOTE(review): `y` is used further down (cross-entropy and compiled
    # function inputs) but its only declaration in this block is the
    # commented-out line above -- confirm where it is defined.

    def forward_prop_step(x_t_padded, s_t1_prev, s_t2_prev):
        # This is how we calculated the hidden state in a simple RNN. No longer!
        # s_t = T.tanh(U[:,x_t] + W.dot(s_t1_prev))

        # Word embedding layer
        x_e = E[:, x_t_padded]

        # GRU Layer 1: update gate z, reset gate r, candidate state c
        z_t1 = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t1_prev) + b[0])
        r_t1 = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t1_prev) + b[1])
        c_t1 = T.tanh(U[2].dot(x_e) + W[2].dot(s_t1_prev * r_t1) + b[2])
        s_t1 = (T.ones_like(z_t1) - z_t1) * c_t1 + z_t1 * s_t1_prev

        # GRU Layer 2: same structure, fed by the layer-1 state
        z_t2 = T.nnet.hard_sigmoid(U[3].dot(s_t1) + W[3].dot(s_t2_prev) + b[3])
        r_t2 = T.nnet.hard_sigmoid(U[4].dot(s_t1) + W[4].dot(s_t2_prev) + b[4])
        c_t2 = T.tanh(U[5].dot(s_t1) + W[5].dot(s_t2_prev * r_t2) + b[5])
        s_t2 = (T.ones_like(z_t2) - z_t2) * c_t2 + z_t2 * s_t2_prev

        # Final output calculation
        # Theano's softmax returns a matrix with one row, we only need the row
        o_t = T.nnet.softmax(V.dot(s_t2) + c)[0]

        return [o_t, s_t1, s_t2]

    # both hidden states start at zero; the output has no recurrence
    [o, s, s2], updates = theano.scan(forward_prop_step,
                                      sequences=x_padded,
                                      truncate_gradient=self.bptt_truncate,
                                      outputs_info=[
                                          None,
                                          dict(initial=T.zeros(self.hidden_dim)),
                                          dict(initial=T.zeros(self.hidden_dim))
                                      ])

    prediction = T.argmax(o, axis=1)
    # the 1e-6 offset is added to the outputs before the cross-entropy
    o_error = T.sum(T.nnet.categorical_crossentropy(o + 1e-6, y))

    # Total cost (could add regularization here)
    cost = o_error

    # Gradients
    dE = T.grad(cost, E)
    dU = T.grad(cost, U)
    dW = T.grad(cost, W)
    db = T.grad(cost, b)
    dV = T.grad(cost, V)
    dc = T.grad(cost, c)

    # Assign functions
    self.predict = theano.function([x], o)
    self.predict_class = theano.function([x], prediction)
    self.ce_error = theano.function([x, y], cost)
    self.bptt = theano.function([x, y], [dE, dU, dW, db, dV, dc])

    # SGD parameters
    learning_rate = T.scalar('learning_rate')
    decay = T.scalar('decay')

    # rmsprop cache updates: decayed running average of squared gradients
    mE = decay * self.mE + (1 - decay) * dE**2
    mU = decay * self.mU + (1 - decay) * dU**2
    mW = decay * self.mW + (1 - decay) * dW**2
    mV = decay * self.mV + (1 - decay) * dV**2
    mb = decay * self.mb + (1 - decay) * db**2
    mc = decay * self.mc + (1 - decay) * dc**2

    # each parameter step is scaled by the root of its cache; `decay`
    # defaults to 0.9 when the caller omits it
    self.sgd_step = theano.function(
        [x, y, learning_rate, theano.Param(decay, default=0.9)],
        [],
        updates=[(E, E - learning_rate * dE / T.sqrt(mE + 1e-6)),
                 (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
                 (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
                 (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
                 (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
                 (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
                 (self.mE, mE),
                 (self.mU, mU),
                 (self.mW, mW),
                 (self.mV, mV),
                 (self.mb, mb),
                 (self.mc, mc)])
def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, dataset='mnist.pkl.gz', batch_size=600):
    """Train a logistic-regression classifier with minibatch SGD.

    Training uses early stopping driven by the validation error.  Each time
    the validation error improves, the test error is measured and the
    classifier is pickled to ``best_model.pkl``.

    :type learning_rate: float
    :param learning_rate: factor applied to the gradient in every update

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path of the dataset file, handed to ``load_data``

    :type batch_size: int
    :param batch_size: number of rows in one minibatch
    """
    splits = load_data(dataset)
    train_set_x, train_set_y = splits[0]
    valid_set_x, valid_set_y = splits[1]
    test_set_x, test_set_y = splits[2]

    def count_batches(shared_x):
        # whole minibatches available in one split
        return shared_x.get_value(borrow=True).shape[0] // batch_size

    n_train_batches = count_batches(train_set_x)
    n_valid_batches = count_batches(valid_set_x)
    n_test_batches = count_batches(test_set_x)

    # ---- build the model ----
    print('... building the model')

    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # data, presented as rasterized images
    y = T.ivector('y')   # labels, presented as 1D vector of [int] labels

    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the training objective: negative log likelihood of the model
    cost = classifier.negative_log_likelihood(y)

    # rows of the minibatch selected by `index`
    rows = slice(index * batch_size, (index + 1) * batch_size)

    # functions counting the mistakes made on one minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={x: test_set_x[rows], y: test_set_y[rows]})
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={x: valid_set_x[rows], y: valid_set_y[rows]})

    # gradient step on theta = (W, b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # returns the cost and applies `updates` on every call
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={x: train_set_x[rows], y: train_set_y[rows]})

    # ---- train the model ----
    print('... training the model')

    # early-stopping parameters
    patience = 5000                # look at this many examples regardless
    patience_increase = 2          # wait this much longer on a new best
    improvement_threshold = 0.995  # relative improvement counted as significant
    # number of minibatches between two checks on the validation set
    validation_frequency = min(n_train_batches, patience // 2)

    def mean_error(model, n_batches):
        # average zero-one loss of `model` over all of its minibatches
        return numpy.mean([model(i) for i in range(n_batches)])

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while epoch < n_epochs and not done_looping:
        epoch += 1
        for minibatch_index in range(n_train_batches):
            train_model(minibatch_index)
            # iteration number
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                this_validation_loss = mean_error(validate_model,
                                                  n_valid_batches)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                if this_validation_loss < best_validation_loss:
                    # extend patience only for a large enough improvement
                    significant = (best_validation_loss *
                                   improvement_threshold)
                    if this_validation_loss < significant:
                        patience = max(patience,
                                       iteration * patience_increase)
                    best_validation_loss = this_validation_loss

                    # score the new best model on the test set
                    test_score = mean_error(test_model, n_test_batches)
                    print((' epoch %i, minibatch %i/%i, test error of'
                           ' best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

                    # save the best model
                    with open('best_model.pkl', 'wb') as f:
                        pickle.dump(classifier, f)

            if patience <= iteration:
                done_looping = True
                break

    end_time = timeit.default_timer()
    elapsed = end_time - start_time
    print(('Optimization complete with best validation score of %f %%,'
           'with test performance %f %%') %
          (best_validation_loss * 100., test_score * 100.))
    print('The code run for %d epochs, with %f epochs/sec' %
          (epoch, 1. * epoch / elapsed))
    print(('The code for file ' + os.path.split(__file__)[1] +
           ' ran for %.1fs' % elapsed), file=sys.stderr)
def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, dataset='mnist.pkl.gz', batch_size=600):
    """Train a logistic-regression classifier with minibatch SGD.

    Training uses early stopping driven by the validation error.  Each time
    the validation error improves, the test error is measured and the
    classifier is pickled to ``best_model.pkl``.

    :param learning_rate: factor applied to the gradient in every update
    :param n_epochs: maximal number of epochs to run
    :param dataset: path of the dataset file, handed to ``load_data``
    :param batch_size: number of rows in one minibatch
    """
    # datasets is indexed as three (inputs, labels) pairs:
    # training, validation and test
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # .shape[0] is the number of rows; only whole minibatches are counted
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    # Build model
    print('...building the model')

    index = T.lscalar()  # unnamed scalar variable: the minibatch index
    x = T.matrix('x')    # 'x' is the name of the matrix variable x
    y = T.ivector('y')

    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)
    cost = classifier.negative_log_likelihood(y)

    # functions returning the classification errors on one minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # gradient-descent step on W and b
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # returns the cost and applies `updates` on every call
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # Train the model
    print('... training the model')

    # early-stopping state: `patience` is the iteration count after which
    # training stops unless a significant improvement extends it
    patience = 5000
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        # for each epoch, all minibatches are used for training
        for minibatch_index in range(n_train_batches):
            train_model(minibatch_index)
            # global iteration counter (the name avoids shadowing `iter`)
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                if this_validation_loss < best_validation_loss:
                    # extend patience only for a large enough improvement
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience,
                                       iteration * patience_increase)
                    best_validation_loss = this_validation_loss

                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    # The second literal starts with a space; without it the
                    # joined message read "test error ofbest model".
                    print((' epoch %i, minibatch %i/%i, test error of'
                           ' best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

                    with open('best_model.pkl', 'wb') as f:
                        pickle.dump(classifier, f)

            if patience < iteration:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('Optimization complete with best validation score of %f %%,'
           'with test performance %f %%') %
          (best_validation_loss * 100., test_score * 100.))
    print('The code run for %d epochs, with %f epochs/sec' %
          (epoch, 1. * epoch / (end_time - start_time)))
    print(('The code for file ' + os.path.split(__file__)[1] +
           ' ran for %.1fs' % ((end_time - start_time))),
          file=sys.stderr)
shape=(None, None)) print(patch_op.shape[0]) ffn = get_model(inp, patch_op) output = LL.get_output(ffn) pred = LL.get_output(ffn, deterministic=True) target = T.ivector('idxs') cla = utils_lasagne.categorical_crossentropy_logdomain(output, target, nclasses).mean() acc = LO.categorical_accuracy(pred, target).mean() regL2 = L.regularization.regularize_network_params(ffn, L.regularization.l2) cost = cla + l2_weight * regL2 params = LL.get_all_params(ffn, trainable=True) grads = T.grad(cost, params) grads_norm = T.nlinalg.norm(T.concatenate([g.flatten() for g in grads]), 2) updates = L.updates.adam(grads, params, learning_rate=0.001) funcs = dict() funcs['train'] = theano.function( [inp.input_var, patch_op.input_var, target], [cost, cla, l2_weight * regL2, grads_norm, acc], updates=updates, on_unused_input='warn') funcs['acc_loss'] = theano.function( [inp.input_var, patch_op.input_var, target], [acc, cost], on_unused_input='warn') funcs['predict'] = theano.function([inp.input_var, patch_op.input_var], [pred], on_unused_input='warn')
def __init__(self, d, V, r, nc, nf, pairwise_constraint=False, embeddings=None, fix_embeddings=False):
    """Create the model parameters and compile its Theano functions.

    Parameters (descriptions taken from the original inline notes):
      d  -- dimensionality of embeddings
      V  -- size of vocabulary
      r  -- number of dependency relations
      nc -- number of classes for classification
      nf -- not used by the code visible in this method
      pairwise_constraint -- when true, four events are scanned instead of
          two and the cost adds similarity terms weighted by ``lambda_e``
      embeddings -- initial value for the embedding matrix; random when None
      fix_embeddings -- when true, ``self.We`` is left out of ``self.params``

    Sets ``self.states``, ``self.classify`` and ``self.cost_and_grad``.
    """
    #d = dimensionality of embeddings
    #V = size of vocabulary
    #r = number of dependency relations
    #nc = number of classes for classification

    # NOTE(review): `.astype` is applied to the result of `theano.shared`
    # here and for Wr, Wv and the `a` vectors below -- confirm the result is
    # still a shared variable usable by Adagrad when floatX is not float64.
    #|V| x d embedding matrix
    if embeddings is None:
        self.We = theano.shared(
            name='embeddings',
            value=0.2 * np.random.uniform(-1.0, 1.0, (V, d))).astype(
                theano.config.floatX)
    else:
        self.We = theano.shared(name='embeddings', value=embeddings).astype(
            theano.config.floatX)

    #r x d x d tensor (matrix for each dependency relation)
    self.Wr = theano.shared(
        name='dependencies',
        value=0.2 * np.random.uniform(-1.0, 1.0, (r, d, d))).astype(
            theano.config.floatX)

    #d x d map from embedding to hidden vector
    self.Wv = theano.shared(
        name='Wv',
        value=0.2 * np.random.uniform(-1.0, 1.0, (d, d))).astype(theano.config.floatX)

    #d long bias vector
    self.b = theano.shared(name='b', value=np.zeros(d, dtype=theano.config.floatX))

    #weights for fine grained features plus bias
    #self.beta = theano.shared(name='beta',
    #                          value=0.2 * np.random.uniform(-1.0, 1.0, (nc, nf))
    #                          ).astype(theano.config.floatX)

    #low dimension approximation to classification parameters
    # three length-d vectors per class; combined into one d x d matrix per
    # class further down (outer product of the first two plus a diagonal
    # built from the third)
    self.a = []
    for i in range(nc):
        a = []
        for j in range(3):
            a.append(
                theano.shared(name='a_{}_{}'.format(i, j),
                              value=0.2 * np.random.uniform(-1.0, 1.0, d)).astype(
                                  theano.config.floatX))
            #value=np.zeros(d, dtype=theano.config.floatX)))
        self.a.append(a)

    self.pairwise_constraint = pairwise_constraint

    # flat list of everything that is trained; the `a` vectors are flattened
    if fix_embeddings:
        self.params = [self.Wr, self.Wv, self.b
                       ] + [j for i in self.a for j in i]  # + [self.beta]
    else:
        self.params = [self.We, self.Wr, self.Wv, self.b
                       ] + [j for i in self.a for j in i]  # + [self.beta]

    self.descender = Adagrad(self.params)

    #self.f = T.tanh
    self.f = normalized_tanh

    def recurrence(n, hidden_states, hidden_sums, x, r, p):
        #at each node n in the tree, calculate Wr(p,n) \dot f(W_v \dot We_word(n) + b + sum_n) and add to sum_p
        h_n = self.f(T.dot(self.Wv, x[n]) + self.b + hidden_sums[n])
        sum_n = T.dot(r[n], h_n)

        # store the node's state and add its message to the parent's sum
        return T.set_subtensor(hidden_states[n], h_n), T.inc_subtensor(
            hidden_sums[p[n]], sum_n)

    # per-event symbolic inputs and scan results, indexed by event number
    idxs = []
    x = []
    rel_idxs = []
    r = []
    p = []
    hidden_sums = []
    hidden_states = []
    h = []
    s = []

    if pairwise_constraint:
        num_events = 4
    else:
        num_events = 2

    for i in range(num_events):
        idxs.append(T.ivector('idxs'))
        x.append(self.We[idxs[i]])

        rel_idxs.append(T.ivector('rel_idxs'))
        r.append(self.Wr[rel_idxs[i]])

        p.append(T.ivector('parents'))

        hidden_states.append(
            T.zeros((idxs[i].shape[0], d), dtype=theano.config.floatX))
        #needs to be sent_length + 1 to store final sum
        hidden_sums.append(
            T.zeros((idxs[i].shape[0] + 1, d), dtype=theano.config.floatX))

        h.append(None)
        s.append(None)
        # one scan step per node of event i
        [h[i], s[i]], updates = theano.scan(
            fn=recurrence,
            sequences=T.arange(x[i].shape[0]),
            outputs_info=[hidden_states[i], hidden_sums[i]],
            non_sequences=[x[i], r[i], p[i]])

    #A = T.dot(self.a_1, self.a_2.reshape((1, d))) + T.nlinalg.diag(self.a_3)
    #cost = T.dot(T.dot(h[0][-1, -1], A), h[1][-1, -1])
    #cost = T.dot(h[0][-1, -1], h[1][-1, -1])
    #grad = T.grad(cost, self.params)
    #self.cost_and_grad = theano.function(inputs=[idxs[0], rel_idxs[0], p[0], idxs[1], rel_idxs[1], p[1]],
    #                                     outputs=[cost] + grad)

    # one matrix per class, stacked vertically and then reshaped
    # NOTE(review): the stack is reshaped to (d, d, nc) -- confirm this
    # yields the intended per-class layout.
    A_stack = []
    for i in range(len(self.a)):
        A_stack.append(
            T.dot(self.a[i][0].reshape((d, 1)), self.a[i][1].reshape(
                (1, d))) + T.nlinalg.diag(self.a[i][2]))

    A = T.vertical_stack(*A_stack).reshape((d, d, nc))

    self.states = theano.function(
        inputs=[idxs[0], rel_idxs[0], p[0], idxs[1], rel_idxs[1], p[1]],
        outputs=[h[0], h[1]])

    #add fine-grained features
    #phi = T.vector('phi')

    # class scores from the final states of events 0 and 1
    p_y_given_x = T.nnet.softmax(
        T.dot(h[0][-1, -1], A).T.dot(h[1][-1, -1]))  # + T.dot(self.beta, phi))
    y_pred = T.argmax(p_y_given_x, axis=1)

    self.classify = theano.function(
        inputs=[idxs[0], rel_idxs[0], p[0], idxs[1], rel_idxs[1],
                p[1]],  # , phi],
        outputs=y_pred)

    y = T.iscalar('y')

    if not pairwise_constraint:
        # negative log-likelihood of class y
        sentence_nll = -(T.log(p_y_given_x)[0, y])

        grad = T.grad(sentence_nll, self.params)

        self.cost_and_grad = theano.function(
            inputs=[
                idxs[0], rel_idxs[0], p[0], idxs[1], rel_idxs[1], p[1], y
            ],  #, phi, y],
            outputs=[sentence_nll] + grad)
    else:
        # NOTE(review): this branch uses `self.beta` and `phi`, whose only
        # definitions in this method are commented out -- confirm they are
        # provided elsewhere before enabling pairwise_constraint.
        lambda_e = T.scalar('lambda_e')
        phi2 = T.vector('phi2')

        p_y_given_x1 = T.nnet.softmax(
            T.dot(h[0][-1, -1], A).T.dot(h[1][-1, -1]) +
            T.dot(self.beta, phi))
        p_y_given_x2 = T.nnet.softmax(
            T.dot(h[2][-1, -1], A).T.dot(h[3][-1, -1]) +
            T.dot(self.beta, phi2))

        sentence_nll = -(T.log(p_y_given_x1)[0, y]) - (
            T.log(p_y_given_x2)[0, y])

        #add constraint that events should be maximally similar
        cost = sentence_nll - lambda_e * T.dot(h[0][-1, -1], h[2][
            -1, -1]) - lambda_e * T.dot(h[1][-1, -1], h[3][-1, -1])

        #grad = T.grad(sentence_nll, self.params[:4] + [A])
        grad = T.grad(cost, self.params)

        # lambda_e defaults to 1 when the caller omits it
        self.cost_and_grad = theano.function(inputs=[
            idxs[0], rel_idxs[0], p[0], idxs[1], rel_idxs[1], p[1], phi,
            idxs[2], rel_idxs[2], p[2], idxs[3], rel_idxs[3], p[3], phi2, y,
            theano.In(lambda_e, value=1)
        ],
                                             outputs=[cost] + grad)
def run(subBatchSize=500,
        maxEpochNum=100,
        eta=0.1,
        trainErrPeriod=5,
        testErrPeriod=10,
        logfile='./log.txt',
        saveWeightFile=None,
        saveWeightsFor='train',
        loadWeightFile=None):
    """Build, train and evaluate the convolutional network on the pet dataset.

    The network is three convolution / max-pooling / ReLU stages followed by
    a fully connected layer and a softmax layer.  Training is plain gradient
    descent over sub-batches; error counts are printed and appended to
    ``logfile``.

    :param subBatchSize: number of images per sub-batch; must divide both the
        number of training images and the number of test images.
    :param maxEpochNum: number of passes over the training set.
    :param eta: step size of the update ``w - eta * gradient``.
    :param trainErrPeriod: the training error is measured at epoch 1 and at
        every epoch that is a multiple of this value.
    :param testErrPeriod: same as ``trainErrPeriod`` but for the test error.
    :param logfile: path of the text log, opened in append mode.
    :param saveWeightFile: path the weights are pickled to whenever the
        monitored error count drops below the best one seen so far;
        ``None`` disables saving.
    :param saveWeightsFor: ``'train'`` or ``'test'`` -- which error count is
        monitored for saving.  Any other value prints a hint and nothing is
        saved.
    :param loadWeightFile: optional pickle holding the ten weight/bias arrays
        in the order they are saved by this function.
    """
    my = myCNN()

    # Read dataset
    base = './datasets/10class'
    trainSet_dir = base + '/'
    train_filename = ('subset1.pkl', 'subset2.pkl', 'subset3.pkl',
                      'subset4.pkl', 'subset5.pkl', 'subset6.pkl',
                      'subset7.pkl', 'subset8.pkl')
    (trainImages, trainLabels) = load_pet_dataset(trainSet_dir, train_filename)
    testSet_dir = base + '/'
    test_filename = ('subset9.pkl', 'subset10.pkl')
    (testImages, testLabels) = load_pet_dataset(testSet_dir, test_filename)

    # Get the number of images in the training set
    numOfTrainImages = trainImages.get_value().shape[0]
    # Get the number of images in the test set
    numOfTestImages = testImages.get_value().shape[0]

    # Get the number of sub batches for the training set.  Floor division
    # keeps the count an integer (it is passed to range() below).
    assert (
        numOfTrainImages % subBatchSize == 0
    ), "The subbatch size must be a divisor of the number of train images"
    numOfTrainSubBatches = numOfTrainImages // subBatchSize

    # Get the number of sub batches for the test set
    assert (
        numOfTestImages % subBatchSize == 0
    ), "The subbatch size must be a divisor of the number of test images"
    numOfTestSubBatches = numOfTestImages // subBatchSize

    x = T.matrix('x')  # data input symbolic variable
    y = T.ivector('y')  # labels symbolic variable

    # -----< Construction of Network Model >-----
    layer0 = x.reshape((subBatchSize, 64, 64, 3)).transpose(0, 3, 1, 2)

    [layer1, layer1_w, layer1_b] = my.convolutionLayer(
        featureMaps=layer0,
        featureMapShape=(subBatchSize, 3, 64, 64),
        kernelShape=(16, 3, 7, 7),
        bias=0.1)
    layer2 = my.maxPoolingLayer(featureMaps=layer1,
                                poolingShape=(2, 2),
                                stride=2)
    layer3 = my.reLuLayer(featureMaps=layer2)

    # NOTE(review): layer1 produces 16 feature maps and the kernels below
    # expect 16 input channels, yet featureMapShape declares 32 -- confirm
    # how convolutionLayer uses this shape before changing it.
    [layer4, layer4_w, layer4_b] = my.convolutionLayer(
        featureMaps=layer3,
        featureMapShape=(subBatchSize, 32, 29, 29),
        kernelShape=(32, 16, 4, 4))
    layer5 = my.maxPoolingLayer(featureMaps=layer4,
                                poolingShape=(2, 2),
                                stride=2)
    layer6 = my.reLuLayer(featureMaps=layer5)

    [layer7, layer7_w, layer7_b] = my.convolutionLayer(
        featureMaps=layer6,
        featureMapShape=(subBatchSize, 32, 13, 13),
        kernelShape=(64, 32, 4, 4))
    layer8 = my.maxPoolingLayer(featureMaps=layer7,
                                poolingShape=(2, 2),
                                stride=2)
    layer9 = my.reLuLayer(featureMaps=layer8)
    layer9 = layer9.flatten(2)

    [layer10, layer10_w, layer10_b] = my.fullyConnectedLayer(
        inputUnits=layer9,
        inputDim=64 * 5 * 5,
        outputDim=64)
    layer10 = layer10.reshape((subBatchSize, 64))

    [error, numOfWrongClass, layer11_w, layer11_b] = my.softmaxLayer(
        inputVect=layer10,
        labels=y,
        inputDim=64,
        numOfClasses=10)

    # --------------------< Construction of Training Function >--------------------
    # Define parameters (this order is also the on-disk order of the weights)
    params = [
        layer1_w, layer1_b, layer4_w, layer4_b, layer7_w, layer7_b,
        layer10_w, layer10_b, layer11_w, layer11_b
    ]

    # Load weights if it is desired
    if loadWeightFile is not None:
        # NOTE: unpickling runs arbitrary code; only load weight files that
        # come from a trusted source.
        with open(loadWeightFile, 'rb') as w:
            weights = pickle.load(w)
        # Unpacking enforces that exactly ten arrays were stored.
        (param1, param2, param3, param4, param5, param6, param7, param8,
         param9, param10) = weights
        loaded = (param1, param2, param3, param4, param5, param6, param7,
                  param8, param9, param10)
        for sharedVar, value in zip(params, loaded):
            sharedVar.set_value(value)
        print("Pretrained weights were loaded!")

    # Define symbolic index variable
    index = T.iscalar('index')

    # Take the derivative of error function with respect to parameters
    grads = T.grad(cost=error, wrt=params)

    # Define updates
    updates = [(w, w - eta * delta) for w, delta in zip(params, grads)]

    def subBatchGivens(images, labels):
        # Substitute the index-th sub batch of a shared dataset for x and y.
        start = index * subBatchSize
        stop = (index + 1) * subBatchSize
        return {x: images[start:stop], y: labels[start:stop]}

    # Definition of symbolic training function
    training = function(
        [index],
        error,
        givens=subBatchGivens(trainImages, trainLabels),
        updates=updates,
    )

    # Definition of the symbolic function computing the training error
    computeTrainingError = function(
        [index],
        numOfWrongClass,
        givens=subBatchGivens(trainImages, trainLabels))

    # Definition of the symbolic testing function
    testing = function(
        [index],
        numOfWrongClass,
        givens=subBatchGivens(testImages, testLabels))

    print("The total number of training images in the dataset : " +
          str(numOfTrainImages))
    print("The total number of test images in the dataset : " +
          str(numOfTestImages))

    # Log file
    with open(logfile, "a") as logf:
        logf.write('The total number of training images in the dataset : ' +
                   str(numOfTrainImages) + '\n')
        logf.write('The total number of test images in the dataset : ' +
                   str(numOfTestImages) + '\n')

    # Upper bound on any error count, so the first measured error improves it.
    minErr = numOfTrainImages + numOfTestImages

    for epoch in range(1, maxEpochNum + 1):
        for subBatchIndex in range(numOfTrainSubBatches):
            training(subBatchIndex)

        if (epoch % trainErrPeriod == 0) or (epoch == 1):
            # Compute the training error
            trainingError = [
                computeTrainingError(inx)
                for inx in range(numOfTrainSubBatches)
            ]
            # Get the total wrong classified number of elements in the training set
            totalWrongClass = np.sum(trainingError)
            print("Epoch : " + str(epoch) + " Training error : %" +
                  str(totalWrongClass * 100.0 / numOfTrainImages) + " " +
                  str(totalWrongClass))

            # Write log file
            with open(logfile, "a") as logf:
                logf.write('Epoch : ' + str(epoch) + '\n')
                logf.write('Training : ' +
                           str(totalWrongClass * 100.0 / numOfTrainImages) +
                           ' ' + str(totalWrongClass) + '\n')

        if (epoch % testErrPeriod == 0) or (epoch == 1):
            # Compute the testing error
            testingError = [testing(inx) for inx in range(numOfTestSubBatches)]
            # Get the total wrong classified number of elements in the test set
            totalTestWrongClass = np.sum(testingError)
            print("\t\t Testing error : %" +
                  str(totalTestWrongClass * 100.0 / numOfTestImages) + " " +
                  str(totalTestWrongClass))

            # Write log file
            with open(logfile, "a") as logf:
                logf.write('Testing : ' +
                           str(totalTestWrongClass * 100.0 / numOfTestImages) +
                           ' ' + str(totalTestWrongClass) + '\n')

        # Save weights.  Both error counts exist from epoch 1 on; between
        # measurements they keep their last value, so nothing is re-saved.
        if saveWeightsFor == 'train':
            currentErr = totalWrongClass
        elif saveWeightsFor == 'test':
            currentErr = totalTestWrongClass
        else:
            print("Please enter the option name to save weights for training or test!")
            # No error count to monitor: skip saving instead of failing on an
            # undefined value.
            currentErr = None

        if (currentErr is not None and minErr > currentErr
                and saveWeightFile is not None):
            print("Weights are saved!")
            minErr = currentErr
            with open(saveWeightFile, 'wb') as w:
                pickle.dump(tuple(p.get_value() for p in params),
                            w,
                            protocol=pickle.HIGHEST_PROTOCOL)
def train( dim_word=100, # word vector dimensionality dim=1000, # the number of LSTM units encoder='gru', decoder='gru_cond', patience=10, # early stopping patience max_epochs=5000, finish_after=10000000, # finish after this many updates dispFreq=100, decay_c=0., # L2 regularization penalty alpha_c=0., # alignment regularization clip_c=-1., # gradient clipping threshold lrate=0.01, # learning rate n_words_src=100000, # source vocabulary size n_words=100000, # target vocabulary size maxlen=100, # maximum length of the description optimizer='rmsprop', batch_size=16, valid_batch_size=16, saveto='model.npz', validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=100, # generate some samples after every sampleFreq datasets=[ '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok', '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok' ], valid_datasets=[ '../data/dev/newstest2011.en.tok', '../data/dev/newstest2011.fr.tok' ], dictionaries=[ '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl' ], use_dropout=False, reload_=False): # Model options model_options = locals().copy() # load dictionaries and invert them worddicts = [None] * len(dictionaries) worddicts_r = [None] * len(dictionaries) for ii, dd in enumerate(dictionaries): with open(dd, 'rb') as f: worddicts[ii] = pkl.load(f) worddicts_r[ii] = dict() for kk, vv in worddicts[ii].iteritems(): worddicts_r[ii][vv] = kk # reload options if reload_ and os.path.exists(saveto): with open('%s.pkl' % saveto, 'rb') as f: models_options = pkl.load(f) print 'Loading data' train = TextIterator(datasets[0], datasets[1], dictionaries[0], dictionaries[1], n_words_source=n_words_src, n_words_target=n_words, batch_size=batch_size, maxlen=maxlen) valid = TextIterator(valid_datasets[0], valid_datasets[1], dictionaries[0], dictionaries[1], n_words_source=n_words_src, n_words_target=n_words, batch_size=valid_batch_size, 
maxlen=maxlen) print 'Building model' params = init_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): params = load_params(saveto, params) tparams = init_tparams(params) trng, use_noise, \ x, x_mask, y, y_mask, \ opt_ret, \ cost = \ build_model(tparams, model_options) inps = [x, x_mask, y, y_mask] print 'Buliding sampler' f_init, f_next = build_sampler(tparams, model_options, trng) # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=profile) print 'Done' cost = cost.mean() # apply L2 regularization on weights if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay # regularize the alpha weights if alpha_c > 0. and not model_options['decoder'].endswith('simple'): alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ( (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] - opt_ret['dec_alphas'].sum(0))**2).sum(1).mean() cost += alpha_reg # after all regularizers - compile the computational graph for cost print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=profile) print 'Done' print 'Computing gradient...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Done' # apply gradient clipping here if clip_c > 0.: g2 = 0. 
for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append( tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c, g)) grads = new_grads # compile the optimizer, the actual computational graph is compiled here lr = tensor.scalar(name='lr') print 'Building optimizers...', f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Done' print 'Optimization' history_errs = [] # reload history if reload_ and os.path.exists(saveto): history_errs = list(numpy.load(saveto)['history_errs']) best_p = None bad_count = 0 if validFreq == -1: validFreq = len(train[0]) / batch_size if saveFreq == -1: saveFreq = len(train[0]) / batch_size if sampleFreq == -1: sampleFreq = len(train[0]) / batch_size uidx = 0 estop = False for eidx in xrange(max_epochs): n_samples = 0 for x, y in train: n_samples += len(x) uidx += 1 use_noise.set_value(1.) x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen, n_words_src=n_words_src, n_words=n_words) if x is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 continue ud_start = time.time() # compute cost, grads and copy grads to shared variables cost = f_grad_shared(x, x_mask, y, y_mask) # do the update on parameters f_update(lrate) ud = time.time() - ud_start # check for bad numbers, usually we remove non-finite elements # and continue training - but not done here if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. # verbose if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud # save the best model so far if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p is not None: params = best_p else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) print 'Done' # generate some samples with the model and display them if numpy.mod(uidx, sampleFreq) == 0: # FIXME: random selection? 
for jj in xrange(numpy.minimum(5, x.shape[1])): stochastic = True sample, score = gen_sample(tparams, f_init, f_next, x[:, jj][:, None], model_options, trng=trng, k=1, maxlen=30, stochastic=stochastic, argmax=False) print 'Source ', jj, ': ', for vv in x[:, jj]: if vv == 0: break if vv in worddicts_r[0]: print worddicts_r[0][vv], else: print 'UNK', print print 'Truth ', jj, ' : ', for vv in y[:, jj]: if vv == 0: break if vv in worddicts_r[1]: print worddicts_r[1][vv], else: print 'UNK', print print 'Sample ', jj, ': ', if stochastic: ss = sample else: score = score / numpy.array([len(s) for s in sample]) ss = sample[score.argmin()] for vv in ss: if vv == 0: break if vv in worddicts_r[1]: print worddicts_r[1][vv], else: print 'UNK', print # validate model on validation set and early stop if necessary if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid) valid_err = valid_errs.mean() history_errs.append(valid_err) if uidx == 0 or valid_err <= numpy.array(history_errs).min(): best_p = unzip(tparams) bad_counter = 0 if len(history_errs) > patience and valid_err >= \ numpy.array(history_errs)[:-patience].min(): bad_counter += 1 if bad_counter > patience: print 'Early Stop!' estop = True break if numpy.isnan(valid_err): ipdb.set_trace() print 'Valid ', valid_err # finish after this many updates if uidx >= finish_after: print 'Finishing after %d iterations!' % uidx estop = True break print 'Seen %d samples' % n_samples if estop: break if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) valid_err = pred_probs(f_log_probs, prepare_data, model_options, valid).mean() print 'Valid ', valid_err params = copy.copy(best_p) numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs, **params) return valid_err
word_embeddings=id_vec, batch_size=params['batch_size'], max_sequence_len=params['max_sequence_len'], embedding_size=params['embedding_size'], filter_sizes=params["filter_size"], num_filters=params["num_filters"]) dbg_x1 = model.dbg_x1 # = que_x dbg_outputs_que = model.dbg_outputs_que # = que_vec[0].shape #在类中只是将计算图定义完了,计算图的真正的启动--定义函数输入输出以触发图的计算,还有梯度反传的定部分还没有定义, #梯度反传和计算图之间是要通过function的定义联系到一起的。 cost, cos_sim = model.cost, model.cos_sim graph_params = model.params grads = T.grad(cost, graph_params) learning_rate = T.dscalar("learning_rate") updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(graph_params, grads)] qt, at, lt = T.matrix("q1"), T.matrix("a1"), T.vector("l1") prob = T.fscalar("prob") train_model = theano.function(inputs=[qt, at, lt, prob, learning_rate], outputs=[cost, dbg_x1, dbg_outputs_que], updates=updates, givens={ que: qt, ans: at, label: lt, keep_prob: prob })
def train_model(batch_size=100, n_h=50, n_epochs=40): # Load the datasets with Fuel dictionary = pkl.load(open(DICT_FILE, 'r')) dictionary['~'] = len(dictionary) reverse_mapping = dict((j, i) for i, j in dictionary.items()) print("Loading the data") train = TextFile(files=[TRAIN_FILE], dictionary=dictionary, unk_token='~', level='character', preprocess=str.lower, bos_token=None, eos_token=None) train_stream = DataStream.default_stream(train) # organize data in batches and pad shorter sequences with zeros train_stream = Batch(train_stream, iteration_scheme=ConstantScheme(batch_size)) train_stream = Padding(train_stream) # idem dito for the validation text val = TextFile(files=[VAL_FILE], dictionary=dictionary, unk_token='~', level='character', preprocess=str.lower, bos_token=None, eos_token=None) val_stream = DataStream.default_stream(val) # organize data in batches and pad shorter sequences with zeros val_stream = Batch(val_stream, iteration_scheme=ConstantScheme(batch_size)) val_stream = Padding(val_stream) print('Building model') # Set the random number generator' seeds for consistency rng = numpy.random.RandomState(12345) x = T.lmatrix('x') mask = T.matrix('mask') # Construct the LSTM layer recurrent_layer = LstmLayer(rng=rng, input=x, mask=mask, n_in=111, n_h=n_h) logreg_layer = LogisticRegression(input=recurrent_layer.output[:-1], n_in=n_h, n_out=111) cost = sequence_categorical_crossentropy(logreg_layer.p_y_given_x, x[1:], mask[1:]) / batch_size # create a list of all model parameters to be fit by gradient descent params = logreg_layer.params + recurrent_layer.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # update_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. 
learning_rate = 0.1 updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)] update_model = theano.function([x, mask], cost, updates=updates) evaluate_model = theano.function([x, mask], cost) # Define and compile a function for generating a sequence step by step. x_t = T.iscalar() h_p = T.vector() c_p = T.vector() h_t, c_t = recurrent_layer._step(T.ones(1), x_t, h_p, c_p) energy = T.dot(h_t, logreg_layer.W) + logreg_layer.b energy_exp = T.exp(energy - T.max(energy, 1)[:, None]) output = energy_exp / energy_exp.sum(1)[:, None] single_step = theano.function([x_t, h_p, c_p], [output, h_t, c_t]) start_time = time.clock() iteration = 0 for epoch in range(n_epochs): print 'epoch:', epoch for x_, mask_ in train_stream.get_epoch_iterator(): iteration += 1 cross_entropy = update_model(x_.T, mask_.T) # Generate some text after each 20 minibatches if iteration % 40 == 0: try: prediction = numpy.ones(111, dtype=config.floatX) / 111.0 h_p = numpy.zeros((n_h, ), dtype=config.floatX) c_p = numpy.zeros((n_h, ), dtype=config.floatX) initial = 'the meaning of life is ' sentence = initial for char in initial: x_t = dictionary[char] prediction, h_p, c_p = single_step( x_t, h_p.flatten(), c_p.flatten()) sample = numpy.random.multinomial(1, prediction.flatten()) for i in range(450): x_t = numpy.argmax(sample) prediction, h_p, c_p = single_step( x_t, h_p.flatten(), c_p.flatten()) sentence += reverse_mapping[x_t] sample = numpy.random.multinomial( 1, prediction.flatten()) print 'LSTM: "' + sentence + '"' except ValueError: print 'Something went wrong during sentence generation.' 
if iteration % 40 == 0: print 'epoch:', epoch, ' minibatch:', iteration val_scores = [] for x_val, mask_val in val_stream.get_epoch_iterator(): val_scores.append(evaluate_model(x_val.T, mask_val.T)) print 'Average validation CE per sentence:', numpy.mean( val_scores) end_time = time.clock() print('Optimization complete.') print('The code ran for %.2fm' % ((end_time - start_time) / 60.))
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=20, n_hidden=300,
             momentum_coeff=0.):
    """
    Train an MLP with momentum SGD and early stopping, then save an image of
    the best hidden-layer weights ('repflds.png') and a plot of the training,
    validation and test error curves ('error.png').

    :param learning_rate: learning rate used for the parameters
    :param L1_reg: lambda for the L1 regularization
    :param L2_reg: lambda for the L2-squared regularization
    :param n_epochs: number of epochs on which to train the data.
    :param dataset: pickled mnist data file
    :param batch_size: size of the mini-batch to be used with sgd
    :param n_hidden: number of hidden units
    :param momentum_coeff: Controls the amount of damping of the velocity
        as a result of previous gradients in sgd
    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # Compute the number of minibatches for training, validation and testing
    # (floor division: the counts are used as loop bounds)
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Allocate symbolic variables for the data
    index = T.lscalar()  # index to minibatch
    x = T.matrix('x')
    y = T.ivector('y')
    is_train = T.iscalar('is_train')  # pseudo boolean for switching between training and prediction

    rng = numpy.random.RandomState(1234)
    theano_rng = T.shared_randomstreams.RandomStreams(rng.randint(999999))

    # construct the MLP class
    classifier = MLP(
        rng=rng,
        theano_rng=theano_rng,
        input=x,
        n_in=28 * 28,
        n_hidden=n_hidden,
        n_out=10,
        is_train=is_train
    )

    # The cost that we minimize during training is the negative log likelihood
    # of the model plus the regularization terms
    cost = (
        classifier.negative_log_likelihood(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr
    )

    def batch_givens(set_x, set_y, train_flag):
        # Substitute minibatch `index` of a shared dataset for x and y and
        # fix the train/predict switch to `train_flag` (0 or 1).
        return {
            x: set_x[index * batch_size: (index + 1) * batch_size],
            y: set_y[index * batch_size: (index + 1) * batch_size],
            is_train: numpy.asarray([train_flag], dtype='int32')[0]
        }

    # We compile Theano functions that compute the mistakes that are
    # made by the model on a minibatch of each dataset
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens=batch_givens(test_set_x, test_set_y, 0)
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens=batch_givens(valid_set_x, valid_set_y, 0)
    )

    train_loss = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens=batch_givens(train_set_x, train_set_y, 0)
    )

    # Compute the gradient of the cost w.r.t. every parameter
    gparams = [T.grad(cost, param) for param in classifier.params]

    # List of updates for every set of parameters, as
    # (variable, update expression) pairs
    updates = []
    for param, gparam in zip(classifier.params, gparams):
        # Each parameter is updated by taking a step in the direction of the gradient.
        # However, we also "mix in" previous gradients i.e. when the previous momenta
        # have the same direction, this contributes to the velocity of the gradient descent
        # and therefore, we take larger steps. One shared `velocity` variable per
        # parameter tracks the old gradients.
        velocity = theano.shared(theano._asarray(param.get_value()*0.,
                                                 dtype=theano.config.floatX))
        updated_velocity = momentum_coeff * velocity - learning_rate * gparam
        updates.append((velocity, updated_velocity))
        updates.append((param, param + updated_velocity))

    # compiling a Theano function which returns the cost, but at the
    # same time updates the parameters of the model
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens=batch_givens(train_set_x, train_set_y, 1)
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # early-stopping parameters
    patience = 10000  # The number of iterations to execute regardless of the validation error
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    # Test score measured when the best validation loss was reached; the
    # running `test_score` is overwritten at every evaluation.
    best_test_score = 0.
    best_W = None
    # Position of the best evaluation on the (fractional) epoch axis.
    best_eval_epoch = 0
    start_time = time.clock()

    epoch = 0
    done_looping = False

    # Keeping track of training, testing and validation errors, one entry
    # per evaluation, together with the epoch position of each evaluation
    eval_epochs = []
    validations = []
    tests = []
    trainings = []

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            train_model(minibatch_index)

            # A fancy way of keeping track of the current iteration
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            # Check the validation error every validation frequency
            # (in this case, we check every epoch)
            if (iteration + 1) % validation_frequency == 0:
                # Compute the validation error i.e. the zero-one
                # loss on the validation set
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                # The validation error is the mean over all the minibatches
                # of the validation set
                this_validation_loss = numpy.mean(validation_losses)

                # test the current model using the test set,
                # averaging over the test scores obtained by
                # all minibatches
                test_losses = [test_model(i) for i in xrange(n_test_batches)]
                test_score = numpy.mean(test_losses)

                # The error achieved by the current model
                # on the training dataset
                train_losses = [train_loss(i) for i in xrange(n_train_batches)]
                train_score = numpy.mean(train_losses)

                # For plotting error curve.  The x position is the number of
                # epochs completed so far; it is a whole number when the
                # evaluation happens once per epoch.
                eval_epochs.append((iteration + 1) / float(n_train_batches))
                validations.append(this_validation_loss * 100)
                tests.append(test_score * 100)
                trainings.append(train_score * 100)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # Maintain global best for validation loss
                if this_validation_loss < best_validation_loss:
                    # If the improvement in the validation loss surpasses
                    # the improvement threshold, we allow an increase in
                    # patience
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience, iteration * patience_increase)

                    # Update the global best
                    best_validation_loss = this_validation_loss
                    best_iter = iteration
                    best_test_score = test_score
                    best_W = classifier.hiddenLayer.W.get_value(borrow=False)
                    best_eval_epoch = eval_epochs[-1]

                    print((' epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iteration:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, best_test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    # best_W stays None when no evaluation ever improved on the initial
    # loss (e.g. n_epochs == 0); there is nothing to draw in that case.
    if best_W is not None:
        image = Image.fromarray(
            tile_raster_images(best_W.T,
                               img_shape=(28, 28),
                               tile_shape=(3, 10),
                               tile_spacing=(1, 1)))
        image.save('repflds.png')

    # Plot the errors against the epochs.  The x values are the recorded
    # evaluation positions, so the curves always have matching lengths even
    # when training stops early.
    plt.plot(eval_epochs, trainings, 'b',
             eval_epochs, validations, 'g',
             eval_epochs, tests, 'r')
    green_circle, = plt.plot(best_eval_epoch, best_validation_loss * 100., 'o',
                             mec='g', ms=15, mew=1, mfc='none',
                             label="Best Validation Error")

    # Create plot legend
    blue_patch = mpatches.Patch(color='blue', label='Train')
    green_patch = mpatches.Patch(color='green', label='Validation')
    red_patch = mpatches.Patch(color='red', label='Test')

    plt.legend(handles=[blue_patch, green_patch, red_patch, green_circle],
               numpoints = 1)
    plt.savefig('error.png')
def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset='mnist.pkl.gz', nkerns=[20, 50], batch_size=500): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(23455) datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # start-snippet-1 x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size, 28 * 28) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (28, 28) is the size of MNIST images. 
layer0_input = x.reshape((batch_size, 1, 28, 28)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) # maxpooling reduces this further to (24/2, 24/2) = (12, 12) # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12) layer0 = LeNetConvPoolLayer( rng, input=layer0_input, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2) ) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (nkerns[0], nkerns[1], 4, 4) layer1 = LeNetConvPoolLayer( rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2) ) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4), # or (500, 50 * 4 * 4) = (500, 800) with the default values. 
layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer( rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh ) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size] } ) validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size: (index + 1) * batch_size] } ) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. updates = [ (param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads) ] train_model = theano.function( [index], cost, updates=updates, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size], y: train_set_y[index * batch_size: (index + 1) * batch_size] } ) # end-snippet-1 ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print 'training @ iter = ', iter cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i) for i in xrange(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i, ' 
'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def sgd(cost, params, lr=0.01):
    """Build vanilla gradient-descent update pairs.

    :param cost: symbolic scalar to differentiate.
    :param params: list of variables to differentiate ``cost`` with
        respect to.
    :param lr: step size applied to every gradient.
    :return: list of ``[param, param - grad * lr]`` pairs, one per entry
        of ``params`` and in the same order.
    """
    gradients = T.grad(cost=cost, wrt=params)
    return [[param, param - gradient * lr]
            for param, gradient in zip(params, gradients)]
def build_model(self):
    """Assemble the symbolic graph for the sentence-pair classifier.

    Populates ``self.use_noise``, ``self.tparams``, ``self.lr``, the input
    variables returned by the embedding layer (``self.x``, ``self.mask_x``,
    ``self.y``, ``self.mask_y``, ``self.z``), the outputs
    (``self.pred_prob``, ``self.pred``), the losses (``self.log_cost``,
    ``self.cost``) and the gradients (``self.grads``, ``self.grad_norm``).
    """
    noise_stream = RandomStreams(self.seed)
    # Shared switch handed to every dropout layer.
    self.use_noise = theano.shared(numpy_floatX(0.))

    if self.reload_model:
        self.load_params()
    self.tparams = self.init_tparams()
    self.lr = tensor.scalar(dtype=config.floatX)

    def apply_dropout(value):
        # Every dropout call shares the same switch, stream and rate.
        return dropout_layer(value, self.use_noise, noise_stream,
                             self.dropout_rate)

    def run_bidirectional(forward_layer, backward_layer, sequence, mask):
        # Forward pass first, then the backward pass on the reversed inputs
        # (re-reversed afterwards), concatenated on the feature axis.
        forward = forward_layer.build(self.tparams, sequence, mask)
        backward = reverse(
            backward_layer.build(self.tparams, reverse(sequence),
                                 reverse(mask)))
        return tensor.concatenate([forward, backward], axis=-1)

    def compare(original, attended):
        # Original, aligned, difference and element-wise product features.
        return tensor.concatenate(
            [original, attended, original - attended, original * attended],
            axis=-1)

    def pool(sequence, mask):
        # Masked mean and masked max over the leading axis.
        masked = sequence * mask[:, :, None]
        mean = masked.sum(axis=0) / mask.sum(axis=0)[:, None]
        return mean, masked.max(axis=0)

    (self.x, self.mask_x, embedded_x,
     self.y, self.mask_y, embedded_y,
     self.z) = self.emb_layer.build(self.tparams)
    embedded_x = apply_dropout(embedded_x)
    embedded_y = apply_dropout(embedded_y)

    # Encode both sequences with the shared bidirectional encoder.
    proj_x = run_bidirectional(
        self.encoder_lstm_fw_layer, self.encoder_lstm_bw_layer,
        embedded_x, self.mask_x) * self.mask_x[:, :, None]
    proj_y = run_bidirectional(
        self.encoder_lstm_fw_layer, self.encoder_lstm_bw_layer,
        embedded_y, self.mask_y) * self.mask_y[:, :, None]

    # Pairwise scores between the two sequences, then a masked softmax along
    # each of the first two axes.
    scores = tensor.batched_dot(
        proj_x.dimshuffle(1, 0, 2),
        proj_y.dimshuffle(1, 2, 0)).dimshuffle(1, 2, 0)
    exp_over_x = tensor.exp(scores - scores.max(axis=0, keepdims=True))
    exp_over_y = tensor.exp(scores - scores.max(axis=1, keepdims=True))
    exp_over_x = exp_over_x * self.mask_x[:, None, :]
    exp_over_y = exp_over_y * self.mask_y[None, :, :]
    alpha = exp_over_x / exp_over_x.sum(axis=0, keepdims=True)
    beta = exp_over_y / exp_over_y.sum(axis=1, keepdims=True)

    proj_y_att = (proj_x.dimshuffle(0, 'x', 1, 2) *
                  alpha.dimshuffle(0, 1, 2, 'x')).sum(axis=0)
    proj_x_att = (proj_y.dimshuffle('x', 0, 1, 2) *
                  beta.dimshuffle(0, 1, 2, 'x')).sum(axis=1)

    proj_x_cat = compare(proj_x, proj_x_att)
    proj_y_cat = compare(proj_y, proj_y_att)

    # Fusion MLP (both builds first, then both dropouts, as before).
    fused_x = ReLU(self.fusion_mlp_layer.build(self.tparams, proj_x_cat))
    fused_y = ReLU(self.fusion_mlp_layer.build(self.tparams, proj_y_cat))
    fused_x = apply_dropout(fused_x)
    fused_y = apply_dropout(fused_y)

    fusion_lstm_x = run_bidirectional(
        self.fusion_lstm_fw_layer, self.fusion_lstm_bw_layer,
        fused_x, self.mask_x)
    fusion_lstm_y = run_bidirectional(
        self.fusion_lstm_fw_layer, self.fusion_lstm_bw_layer,
        fused_y, self.mask_y)

    logit_x_mean, logit_x_max = pool(fusion_lstm_x, self.mask_x)
    logit_y_mean, logit_y_max = pool(fusion_lstm_y, self.mask_y)

    # Classifier head.
    features = tensor.concatenate(
        [logit_x_mean, logit_x_max, logit_y_mean, logit_y_max], axis=-1)
    features = apply_dropout(features)
    features = tensor.tanh(
        self.dense_mlp_layer.build(self.tparams, features))
    features = apply_dropout(features)
    self.pred_prob = tensor.nnet.nnet.softmax(
        self.class_mlp_layer.build(self.tparams, features))
    self.pred = self.pred_prob.argmax(axis=-1)

    # Small offset keeps log() away from zero; larger for half precision.
    epsilon = 1e-6 if self.pred_prob.dtype == 'float16' else 1e-8
    picked = self.pred_prob[tensor.arange(self.x.shape[1]), self.z]
    self.log_cost = -tensor.log(picked + epsilon).mean()
    self.cost = self.log_cost

    if self.decay_c > 0.:
        decay_coeff = theano.shared(numpy.float32(self.decay_c),
                                    name='decay_c')
        squared_weights = sum(
            ((value**2).sum() for value in self.tparams.itervalues()), 0.)
        self.cost = self.cost + squared_weights * decay_coeff

    self.grads = tensor.grad(self.cost, wrt=self.tparams.values())
    squared_norm = sum(((grad**2).sum() for grad in self.grads), 0.)
    self.grad_norm = tensor.sqrt(squared_norm)

    if self.clip_c > 0.:
        # Rescale every gradient when the global norm exceeds clip_c.
        self.grads = [
            tensor.switch(squared_norm > self.clip_c**2,
                          grad * self.clip_c / tensor.sqrt(squared_norm),
                          grad)
            for grad in self.grads
        ]
def fit(self, X, Y, V=None, K=None, D=50, lr=10e-1, mu=0.99, batch_sz=100, epochs=6): if V is None: V = len(set(X)) if K is None: K = len(set(Y)) N = len(X) W = np.random.randn(V, K) / np.sqrt(V + K) b = np.zeros(K) self.W = theano.shared(W) self.b = theano.shared(b) self.params = [self.W, self.b] thX = T.ivector('X') thY = T.ivector('Y') py_x = T.nnet.softmax(self.W[thX] + self.b) prediction = T.argmax(py_x, axis=1) cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value() * 0) for p in self.params] self.cost_predict_op = theano.function( inputs=[thX, thY], outputs=[cost, prediction], allow_input_downcast=True, ) updates = [(p, p + mu * dp - lr * g) for p, dp, g in zip(self.params, dparams, grads)] + [ (dp, mu * dp - lr * g) for dp, g in zip(dparams, grads) ] train_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction], updates=updates, allow_input_downcast=True) costs = [] n_batches = N / batch_sz for i in xrange(epochs): X, Y = shuffle(X, Y) print "epoch:", i for j in xrange(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)] c, p = train_op(Xbatch, Ybatch) costs.append(c) if j % 200 == 0: print "i:", i, "j:", j, "n_batches:", n_batches, "cost:", c, "error:", np.mean( p != Ybatch) plt.plot(costs) plt.show()
# Draw one class label per row of lP: each row is exponentiated and used as the
# probability vector of a single multinomial draw; nonzero() then gives the
# column index of the drawn class for every row.
# NOTE(review): presumably lP holds log-probabilities (so exp(row) sums to 1)
# -- confirm where lP is built.
_, y = apply_along_axis(lambda row: random.multinomial(1, exp(row)), axis=1, arr=lP).nonzero()
# Cast to the dtypes of the symbolic inputs declared below.
y = y.astype(int32)
W = W.astype(float32)
X = X.astype(float32)

#setup theano
tW = T.matrix('W')
tX = T.matrix('X')
ty = T.ivector('y')
tlambda = T.scalar('lambda')

#symbolic representation
tEta = T.dot(tX, tW)
tP = T.nnet.softmax(tEta)
# Mean cross-entropy plus a squared L2 penalty on the weights, scaled by lambda.
terror = T.nnet.categorical_crossentropy(tP, ty).mean() + tlambda * tW.norm(2)**2  # we could add some Tikhonov regularization
tgrad = T.grad(terror, tW)
# f evaluates the objective, g its gradient with respect to the weights.
f = theano.function([tW, tX, ty, tlambda], terror)
g = theano.function([tW, tX, ty, tlambda], tgrad)

# Random starting point for the optimisation.
W0 = random.randn(D, K).astype(float32)

#gradient descent
# 500 fixed-size steps (step 0.1) with lambda = 0.1; the objective and the
# max-abs entry of the gradient are printed every iteration.
for it in xrange(500):
    ft = f(W0, X, y, 0.1)
    gt = g(W0, X, y, 0.1)
    W0 -= 0.1 * gt
    print it, "objective:", ft, "gradnorm:", linalg.norm(gt, ord=inf)
def build_chain_trainer(self):
    """Compile the Theano training functions for the chain model.

    Always sets ``self.train_step`` (inputs ``wi``, ``nwi``, ``lr``; outputs
    the cost and the sense weights). Unless ``self.no_alt`` is set, also sets
    ``self.dwe_train_step`` (inputs ``wi``, ``nwi``, ``lam``), which applies
    the update of the ``dwe`` embedding matrix.

    The cost is a hinge loss on cosine similarities when ``self.hinge_cost``
    is true, otherwise a squared regression error with an L1 penalty on
    ``L2``. The optimiser is plain SGD when ``self.do_sgd`` is true and
    ``adadelta`` otherwise.
    """
    bs = self.bs
    td = self.td
    wi = T.ivector('wi')  # bs (disamb. word indices)
    nwi = T.ivector('nwi')  # negative samples
    lr = T.dscalar('lr').astype(theano.config.floatX)  # learning rate
    lam = T.dscalar('lam').astype(theano.config.floatX)
    L = self.params['L']
    L1 = self.params['L1']  # hd x td
    #Wt = self.params['Wt']
    # L2, B and B2 are only read by the regression cost below.
    if not self.hinge_cost:
        L2 = self.params['L2']
        B = self.params['B']  # td
        B2 = self.params['B2']
    dwe = self.params['dwe']
    df = self.dat[wi, :]  #T.itensor3('df')# bs x mw x ms
    pr = self.sense_priors[wi, :]  # bs x mw x ms
    mk = self.dmask[wi, :]  #T.itensor3('mk')# bs x mw x ms
    pd = self.pd[wi, :]  #T.imatrix('pd') # bs x mdw (plain definition sentence)
    pe = self.ex[wi, :]  # plain example sentences bs x mew
    dw = dwe[wi, :]  # bs x td
    msk = self.wmask[wi, :].dimshuffle(0, 1, 'x')  # bs x mw x 1
    ndw = dwe[nwi, :]  # negative words

    # NOTE(review): to_vect is not called anywhere in this method (its scan
    # is commented out below) and it reads `L0`, which is never assigned
    # here -- confirm it is dead code or that L0 is a module-level name.
    def to_vect(d, m, p):
        hid_inp = dwe[d, :]  # mw x ms x hd
        logit = T.exp(T.dot(hid_inp, L0)[:, :, p])  # (mw x ms) x mw
        mk = T.switch(T.lt(p, 0), 0, 1)  # mw: word-level mask (different mask from m)
        mask = mk.dimshuffle(0, 'x', 'x')
        l2 = logit * mask  # mw x ms x mw
        l2 = T.sum(l2 * mk.dimshuffle('x', 'x', 0), axis=2) * m  # mw x ms
        w0 = l2 / T.sum(l2, axis=1).dimshuffle(0, 'x')
        w1 = T.switch(T.isnan(w0), 0, w0)
        w = w1.dimshuffle(0, 1, 'x')  # mw x ms x 1
        res = T.sum(w * hid_inp, axis=1)  # mw x hd
        return res  #, logit, weights

    # Scan step: per-sense weights for one definition. Scores come from dot
    # products between sense embeddings, are averaged over the senses of each
    # other word, exponentiated, multiplied across words and by the prior,
    # then normalised over senses. `p` is accepted but not used.
    def to_weight(d, m, p, prior):
        logit = T.tensordot(dwe[d, :], dwe.T, axes=1)[:, :, d]  # mw x ms x mw x ms
        cnt = T.sum(m, axis=1).dimshuffle('x', 'x', 0)  # 1 x 1 x mw
        logit = T.sum(logit * m.dimshuffle('x', 'x', 0, 1), axis=3) / cnt  # mw x ms x mw
        logit = T.exp(10 * T.switch(T.isnan(logit), 0, logit))  # mw x ms x mw
        logit = T.prod(logit, axis=2) * prior  # mw x ms
        sm = T.sum(logit * m, axis=1, keepdims=True)  # mw x 1
        #mask = T.switch(T.lt(p, 0), 0, 1).dimshuffle(0, 'x') #
        logit = (logit * m) / sm  # mw x ms
        # A zero normaliser gives nan/inf; map those entries to 0.
        return T.switch(T.or_(T.isnan(logit), T.isinf(logit)), 0, logit)

    '''def to_weight(d, m, p, prior):
        A = dwe[d, :] # mw x ms x td
        #tmp = T.tensordot(T.dot(A, Wt), A.T, axes=1) # mw x ms x ms x mw
        #B = A * Wt.dimshuffle('x', 'x', 0) # 'diag' setting
        #tmp = T.tensordot(B, B.T, axes = 1)
        tmp = T.tensordot(A, A.T, axes = 1) # 'iden' setting
        tmp = T.exp(1000 * tmp.dimshuffle(0, 1, 3, 2)) # mw x ms x mw x ms
        tmp = tmp * m.dimshuffle('x', 'x', 0, 1)
        nrm = T.sum(tmp, axis=3)
        tmp = tmp / nrm.dimshuffle(0, 1, 2, 'x')
        tmp = T.switch(T.isnan(tmp), 0, tmp)
        mk = T.switch(T.lt(p, 0), 0, 1) # mw: word-level mask (different mask from m)
        tmp = T.max(tmp, axis=3) * mk.dimshuffle('x', 'x', 0) # mw x ms x mw
        tmp = T.exp(T.sum(T.log(T.switch(T.eq(tmp, 0), 1, tmp)), axis=2)) * m # mw x ms
        tmp = tmp * prior
        tmp = tmp / T.sum(tmp, axis=1).dimshuffle(0, 'x')
        return T.switch(T.isnan(tmp), 0, tmp)'''

    # Mean cosine similarity between corresponding rows of x and y.
    def cosim(x, y):
        return T.mean(
            T.sum(x * y, axis=1) / (x.norm(2, axis=1) * y.norm(2, axis=1)))

    #dat, _ = theano.scan(fn=to_vect, sequences=[df, mk, pd]) # bs x mw x td
    #ndat, _ = theano.scan(fn=to_vect_tmp, sequences=[ndf, nmk, npd]) # bs x mw x td
    weights, _ = theano.scan(fn=to_weight, sequences=[df, mk, pd, pr])  # bs x mw x ms
    hid_inp = dwe[df, :]  # bs x mw x ms x td
    # Sense-weighted average embedding of every definition word.
    dat = T.sum(weights.dimshuffle(0, 1, 2, 'x') * hid_inp, axis=2)  # bs x mw x td

    inp = dat.astype(theano.config.floatX)
    # Definition embedding: masked sum over words of the projected inputs.
    def_emb = T.sum(T.dot(inp, L) * msk, axis=1)  # bs x hd
    #neg_inp = ndat.astype(theano.config.floatX)
    #def_emb = get_sentence(inp, msk) # bs x hd
    #neg_def_emb = get_sentence(neg_inp, neg_msk)
    #w_cost = T.sum((def_emb - dw) ** 2)
    #w_neg_cost = T.sum((def_emb - ndw) ** 2)
    if self.hinge_cost:
        def_emb = T.dot(def_emb, L1)
        w_cost = -cosim(def_emb, dw)
        rep = nwi.shape[0] / wi.shape[0]  # b/c there are more negative samples than pos.
        de = T.extra_ops.repeat(def_emb, rep, axis=0)
        w_neg_cost = -cosim(de, ndw)
        cost = T.mean(T.maximum(0, 0.01 + w_cost - w_neg_cost))  # hingeloss
    else:
        regress = T.dot(T.nnet.sigmoid(T.dot(def_emb, L1) + B), L2) + B2  # bs x td
        cost = T.mean(
            (regress - dw)**2) + 0.01 * T.sum(abs(L2))  # only regularize the last
    # Optional L1 penalty on the sense weights.
    if self.reg_alpha:
        cost += 0.1 * T.sum(abs(weights))
    #w_cost = get_word_probs(def_emb, wi, L1) #dwe.T) # dwe instead of L1
    #w_neg_cost = get_word_probs(def_emb, nwi, L1) #dwe.T) # dwe instead of L1
    #c_cost = -get_context_probs(def_emb, pe, L0) # negative of the likelihood
    #c_neg_cost = -get_context_probs(def_emb, npe, L0)
    #all_params = [self.params[k] for k in self.params if k != 'dwe' and not k.startswith('L')]
    # Every parameter except the embedding matrix, which is updated separately.
    all_params = [self.params[k] for k in self.params if k != 'dwe']
    #L_params = [L0]

    '''Copy of the same function in Lasagne (with minor changes)'''
    # Works on a copy of `ups` and returns that copy; the mapping passed in
    # is left unmodified.
    def apply_nesterov_momentum(ups, mom, shape=None):
        params = ups.keys()
        ups = OrderedDict(ups)
        if shape is None:
            shape = [p.get_value(borrow=True).shape for p in params]
        for (param, shp) in zip(params, shape):
            velocity = theano.shared(np.zeros(shp, dtype=theano.config.floatX),
                                     broadcastable=param.broadcastable)
            x = mom * velocity + ups[param] - param
            ups[velocity] = x
            ups[param] = mom * x + ups[param]
        return ups

    dwe_params = [dw, ndw]
    if self.do_sgd:
        grads = T.grad(cost, all_params)
        updates = OrderedDict()
        for (p, g) in zip(all_params, grads):
            updates[p] = p - lr * g
        # NOTE(review): the return value is discarded and the helper only
        # changes its own copy, so this call appears to leave `updates` as
        # plain SGD -- confirm whether `updates = ...` was intended.
        apply_nesterov_momentum(updates, mom=0.9)
        if self.no_alt or not self.do_fixedpoint:
            dgrads = T.grad(cost, dwe_params)
            dwe_update = OrderedDict()
            for (p, g) in zip(dwe_params, dgrads):
                dwe_update[p] = p - lr * g
                foo = lr * g  # NOTE(review): never read afterwards
            # NOTE(review): same as above, the result is not assigned.
            apply_nesterov_momentum(dwe_update,
                                    mom=0.9,
                                    shape=[(bs, td), (bs, td)])
    else:
        updates = adadelta(cost, all_params, learning_rate=lr)
        #L_update = adadelta(cost, L_params, learning_rate = lr)
        if self.no_alt or not self.do_fixedpoint:
            dwe_update = adadelta(cost, dwe_params, learning_rate=lr)
    if not self.no_alt and self.do_fixedpoint:  # because no alternating training means optimization
        # Fixed-point branch: the new embedding rows are computed in closed
        # form instead of by a gradient step.
        if self.do_rw:
            #posword = self.base[wi] + 0.3 * def_emb #0.3 * ((1 - self.lam) * def_emb + self.lam * dw)
            idf = self.idf[wi].dimshuffle(
                0, 1, 'x')  # bs x mw x 1 (dat is bs x mw x hd)
            rw_term = T.sum(dat * idf, axis=1)  # bs x hd
            disc_fact = 0.9
            if self.init_dwe:
                #posword = disc_fact * rw_term # + self.base[wi] # truerw
                posword = (
                    1 - lam
                ) * dw + lam * disc_fact * rw_term  # + self.base[wi] # truerw
            else:
                base = self.lam * def_emb + (1 - self.lam) * dw
                posword = base + disc_fact * rw_term
            word_update = T.set_subtensor(
                dw, posword.astype(theano.config.floatX))
            dwe_update = {dwe: word_update}
            dwe_ret = T.max(T.abs_(posword - dw))  # max-norm of the increment
        else:
            posword = (1 - self.lam) * def_emb + self.lam * dw
            word_update = T.set_subtensor(dw, posword - self.lam * ndw)
            dwe_update = {dwe: word_update}
            dwe_ret = word_update
    else:  #elif not self.do_fixedpoint or self.no_alt:
        # Gradient branch: write the updated positive rows, then the updated
        # negative rows, into one new value for the whole dwe matrix.
        word_update = dwe_update[dw]
        word_update = T.set_subtensor(dw, word_update)
        nword_update = dwe_update[ndw]
        word_update = T.set_subtensor(word_update[nwi, :], nword_update)
        dwe_update = {dwe: word_update}  #T.set_subtensor(dw, word_update)
        # Without alternating training the embedding update is merged into
        # the main update set.
        if self.no_alt:
            updates.update({dwe: word_update})
        dwe_ret = word_update
        #updates.update({dwe: dwe_update[dwe]}) #word_update})
        #updates.update({dwe: word_update})
    self.train_step = theano.function([wi, nwi, lr], [cost, weights],
                                      updates=updates)
    if not self.no_alt:
        self.dwe_train_step = theano.function([wi, nwi, lam],
                                              [cost, dwe_ret, weights],
                                              updates=dwe_update)
def evaluate_mnist_1(learning_rate=0.1, n_epochs=100, nkerns=[4, 6], batch_size=2): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(3) xs = [] ys = [] # f = open('temp_value', 'r+') # f = open('out_10', 'r+') f = open('out_10_10', 'r+') while (1): line = f.readline() line2 = f.readline() if not line: break line = line.replace("\n", "") values = [float(i) for i in line.split()] value = float(line2) xs.append(values) ys.append(value) print(len(xs)) print(len(xs[0])) print(len(ys)) # print(ys) # print(xs) test_set_x, test_set_y = shared_dataset([xs, ys]) valid_set_x, valid_set_y = shared_dataset([xs, ys]) train_set_x, train_set_y = shared_dataset([xs, ys]) # train_set_x, train_set_y = datasets[0] # valid_set_x, valid_set_y = datasets[1] # test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing batch_size = len(ys) # batch_size=1 n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size # n_train_batches = 1 # n_valid_batches = 1 # n_test_batches = 1 # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ishape = (28, 28) # this is the size of MNIST images ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer layer0_input = x.reshape((batch_size, 1, 28, 28)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). 
# This will generate a matrix of shape (20,32*4*4) = (20,512) layer2_input = layer1.output.flatten(2) # myprint=theano.function([x],x) # myprint([layer2_input]) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=20, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=20, n_out=10) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) prob = layer3.prob_y_given_x(y) f1 = open('weights', 'w+') print "layer 0 weights" for w in layer0.W.get_value(): for r in w: for s in r: for d in s: f1.write(str(d) + '\n') # print layer0.W.get_value() # print layer0.b.get_value() print "layer 1 weights" # print layer1.W.get_value() # print layer1.b.get_value() for w in layer1.W.get_value(): for r in w: for s in r: for d in s: f1.write(str(d) + '\n') print "layer 2 weights" # print layer2.W.get_value() w = layer2.W.get_value() # for d in w: # print d for i in range(len(w[0])): for j in range(len(w)): f1.write(str(w[j][i]) + '\n') # print layer2.b.get_value() # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) prob_model = theano.function( [index], prob, givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) conv_model0 = theano.function( [index], layer0.output, givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]}) conv_model0_conv = theano.function( [index], layer0.conv_out, givens={x: 
valid_set_x[index * batch_size:(index + 1) * batch_size]}) conv_model1 = theano.function( [index], layer1.output, givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]}) conv_model1_conv = theano.function( [index], layer1.conv_out, givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]}) conv_model2 = theano.function( [index], layer2.output, givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]}) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params # params = layer0.params + layer1.params + layer2.params + layer3.params # x_printed = theano.printing.Print('this is a very important value')(x) # f_with_print = theano.function([x], x_printed) # f_with_print(layer3.params) # create a list of gradients for all model parameters grads = T.grad(cost, params) val_grads = T.grad(cost, layer3.p_y_given_x) # print "AAAA" # theano.printing.debugprint(temp_grads) # print "AAAA" grad_model = theano.function( [index], grads, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) val_grad_model = theano.function( [index], val_grads, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i],grads[i]) pairs. 
updates = [] for param_i, grad_i in zip(params, grads): updates.append((param_i, param_i - learning_rate * grad_i)) train_model = theano.function( [index], cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() epoch = 0 done_looping = False bestConvW = layer0.W.get_value() while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index val_grads_ij = val_grad_model(minibatch_index) grads_ij = grad_model(minibatch_index) conv0_ij = conv_model0(minibatch_index) conv1_ij = conv_model1(minibatch_index) conv2_ij = conv_model2(minibatch_index) conv0_conv_ij = conv_model0_conv(minibatch_index) conv1_conv_ij = conv_model1_conv(minibatch_index) print 'training @ iter = ', iter print "last layer var grads" print val_grads_ij[0] # print "Layer 0 convolution" # for c in conv0_conv_ij[0]: # print c # print "" # print "" # print "Layer 1 convolution" # for c in conv1_conv_ij[0]: # print c # print "" # print "" probs = prob_model(minibatch_index) print "Probs" print probs # print "layer 0 grads" # print grads_ij[6] # print grads_ij[7] # print "layer 1 grads" # print grads_ij[4] # print grads_ij[5] # print "layer 2 grads" # print grads_ij[2] # print grads_ij[3] print "log 
reg layer grads" print grads_ij[0] print grads_ij[1] print "Layer 0 output" # for c in conv0_ij: # for d in c: # print d # print conv0_ij[0][0] print "Layer 1 output" # print conv1_ij[0][0] # for c in conv1_ij: # for d in c: # print d print "Layer 2 output" # for c in conv2_ij: # print c cost_ij = train_model(minibatch_index) # for c in conv0_conv_ij[1]: # print c # print "" print "learning_rate" print learning_rate print "layer 0 weights" # print layer0.W.get_value() # print layer0.b.get_value() print "layer 1 weights" # print layer1.W.get_value() # print layer1.b.get_value() print "layer 2 weights" w = layer2.W.get_value() # print w[0] # print w[1] # for c in layer2.W.get_value(): # print c # print layer2.b.get_value() print "log reg layer weights" print layer3.W.get_value() print layer3.b.get_value() print "COST" print cost_ij if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in xrange(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % \ (epoch, minibatch_index + 1, n_train_batches, \ this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: bestConvW = layer0.W.get_value() #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i) for i in xrange(n_test_batches) ] test_score = numpy.mean(test_losses) print( (' epoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best 
validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def get_cost_updates(self, lr=0.1, persistent=None, k=1):
    """This functions implements one step of CD-k or PCD-k

    :param lr: learning rate used to train the RBM

    :param persistent: None for CD. For PCD, shared variable
        containing old state of Gibbs chain. This must be a shared
        variable of size (batch size, number of hidden units).

    :param k: number of Gibbs steps to do in CD-k/PCD-k

    Returns a proxy for the cost and the updates dictionary. The
    dictionary contains the update rules for weights and biases but
    also an update of the shared variable used to store the persistent
    chain, if one is used.

    """

    # compute positive phase
    pre_sigmoid_ph, ph_mean, ph_sample = self.sample_h_given_v(self.input)

    # decide how to initialize persistent chain:
    # for CD, we use the newly generate hidden sample
    # for PCD, we initialize from the old state of the chain
    use_pcd = persistent is not None
    chain_start = persistent if use_pcd else ph_sample
    # end-snippet-2
    # perform actual negative phase
    # in order to implement CD-k/PCD-k we need to scan over the
    # function that implements one gibbs step k times.
    # Read Theano tutorial on scan for more information :
    # http://deeplearning.net/software/theano/library/scan.html
    # the scan will return the entire Gibbs chain
    (
        [
            pre_sigmoid_nvs,
            nv_means,
            nv_samples,
            pre_sigmoid_nhs,
            nh_means,
            nh_samples
        ],
        updates
    ) = theano.scan(
        self.gibbs_hvh,
        # the None are place holders, saying that
        # chain_start is the initial state corresponding to the
        # 6th output
        outputs_info=[None, None, None, None, None, chain_start],
        n_steps=k)
    # start-snippet-3
    # determine gradients on RBM parameters
    # note that we only need the sample at the end of the chain
    chain_end = nv_samples[-1]

    cost = T.mean(self.free_energy(self.input)) - T.mean(
        self.free_energy(chain_end))
    # We must not compute the gradient through the gibbs sampling
    gparams = T.grad(cost, self.params, consider_constant=[chain_end])
    # end-snippet-3 start-snippet-4
    # constructs the update dictionary
    for gparam, param in zip(gparams, self.params):
        # make sure that the learning rate is of the right dtype
        updates[param] = param - gparam * T.cast(
            lr, dtype=theano.config.floatX)
    # Same explicit None test as above, so the branch does not depend on
    # how a symbolic variable evaluates in a boolean context.
    if use_pcd:
        # Note that this works only if persistent is a shared variable
        updates[persistent] = nh_samples[-1]
        # pseudo-likelihood is a better proxy for PCD
        monitoring_cost = self.get_pseudo_likelihood_cost(updates)
    else:
        # reconstruction cross-entropy is a better proxy for CD
        monitoring_cost = self.get_reconstruction_cost(
            updates, pre_sigmoid_nvs[-1])

    return monitoring_cost, updates
x = theano.shared(D[0], name="x") y = theano.shared(D[1], name="y") w = theano.shared(rng.randn(feats).astype(theano.config.floatX), name="w") b = theano.shared(np.asarray(0., dtype=theano.config.floatX), name="b") x.tag.test_value = D[0] y.tag.test_value = D[1] #print "Initial model:" #print w.get_value(), b.get_value() # Construct Theano expression graph p_1 = 1 / (1 + tt.exp(-tt.dot(x, w) - b)) # Probability of having a one prediction = p_1 > 0.5 # The prediction that is done: 0 or 1 xent = -y * tt.log(p_1) - (1 - y) * tt.log(1 - p_1) # Cross-entropy cost = tt.cast(xent.mean(), 'float32') + \ 0.01 * (w ** 2).sum() # The cost to optimize gw, gb = tt.grad(cost, [w, b]) # Compile expressions to functions train = theano.function(inputs=[], outputs=[prediction, xent], updates=[(w, w - 0.01 * gw), (b, b - 0.01 * gb)], name="train") predict = theano.function(inputs=[], outputs=prediction, name="predict") if any([ n.op.__class__.__name__ in ['Gemv', 'CGemv', 'Gemm', 'CGemm'] for n in train.maker.fgraph.toposort() ]): print('Used the cpu') elif any([ n.op.__class__.__name__ in ['GpuGemm', 'GpuGemv']
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    """
    This is basic test for GpuCrossentropySoftmaxArgmax1HotWithBias

    We check that we loop when there are too many threads

    The same graph (loss, argmax prediction and gradient with respect to
    the pre-softmax activations) is compiled with and without the GPU and
    the three outputs are compared.
    """

    n_in = 1000
    batch_size = 4097
    n_out = 1250

    # Larger shapes are used unless the GPU mode is DebugMode.
    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
        n_in = 4098
        n_out = 4099

    y = T.lvector('y')

    b = T.fvector('b')

    # we precompute the dot with big shape before to allow the test of
    # GpuCrossentropySoftmax1HotWithBiasDx to don't fail with the error
    # (the launch timed out and was terminated) on GPU card not
    # powerful enough. We need the big shape to check for corner
    # case.
    dot_result = T.fmatrix('dot_result')

    # Seed numpy.random with config.unittests.rseed
    utt.seed_rng()

    xx = numpy.asarray(numpy.random.rand(batch_size, n_in),
                       dtype=numpy.float32)
    yy = numpy.ones((batch_size,), dtype='int32')
    b_values = numpy.zeros((n_out,), dtype='float32')
    W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32')

    dot_value = numpy.asarray(numpy.dot(xx, W_values), dtype='float32')
    del W_values
    p_y_given_x = T.nnet.softmax(dot_result + b)
    y_pred = T.argmax(p_y_given_x, axis=-1)
    loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
    dW = T.grad(loss, dot_result)
    classify = theano.function(inputs=[y, b, dot_result],
                               outputs=[loss, y_pred, dW],
                               mode=mode_without_gpu)
    classify_gpu = theano.function(inputs=[y, b, dot_result],
                                   outputs=[loss, y_pred, dW],
                                   mode=mode_with_gpu)

    # The fused op must be present in both compiled graphs.
    assert any([isinstance(node.op,
                           T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
                for node in classify.maker.fgraph.toposort()])
    assert any([isinstance(node.op,
                           cuda.nnet.GpuCrossentropySoftmaxArgmax1HotWithBias)
                for node in classify_gpu.maker.fgraph.toposort()])

    out = classify(yy, b_values, dot_value)
    gout = classify_gpu(yy, b_values, dot_value)

    assert len(out) == len(gout) == 3
    assert numpy.allclose(out[0], gout[0])
    # out and gout are lists of outputs, so the failure message has to
    # subtract the compared arrays themselves, not the lists.
    assert numpy.allclose(out[2], gout[2], atol=3e-6), numpy.absolute(
        gout[2] - out[2]).max()
    # On failure, report every index where the predictions differ.
    assert numpy.allclose(out[1], gout[1]), [
        (idx, out[1][idx], gout[1][idx], val)
        for idx, val in enumerate(out[1] - gout[1])
        if val != 0
    ]
def __init__(self, n_words=20, n_embedding=100, lr=0.01, momentum=0.9,
             word_to_id=None, null_word_id=-1, load_from_file=None):
    """Initialise the model state and compile the Theano functions.

    If ``load_from_file`` is truthy the state is restored with
    ``self.load_model``; otherwise the hyper-parameters are taken from
    the arguments and the shared parameters are freshly initialised.

    :param n_words: size passed to the embedding initialisers.
    :param n_embedding: embedding dimension passed to the initialisers.
    :param lr: default value of the ``l_rate`` training input.
    :param momentum: forwarded to ``get_param_updates``.
    :param word_to_id: mapping word -> id; may be ``None``, in which
        case ``id_to_word`` is an empty dict.
    :param null_word_id: id forwarded to ``self._constrain_embedding``.
    :param load_from_file: path handed to ``self.load_model``.

    Sets ``self.train_function`` (returns the regularised batch cost
    and applies the parameter updates) and ``self.predict_function``
    (returns ``self.memnn_cost`` for a single example).
    """
    if load_from_file:
        self.load_model(load_from_file)
    else:
        self.regularization = 0.01
        self.n_embedding = n_embedding
        self.lr = lr
        self.momentum = momentum
        self.n_words = n_words
        self.batch_size = 4

        self.word_to_id = word_to_id
        # word_to_id defaults to None: build an empty reverse mapping
        # instead of failing with AttributeError on None.
        self.id_to_word = dict(
            (v, k) for k, v in (word_to_id or {}).items())
        self.null_word_id = null_word_id

        # Question embedding
        # self.B = init_shared_normal(self.n_words, self.n_embedding, 0.1)

        # Statement input, output embeddings
        self.weights = init_shared_normal_tensor(
            4, self.n_words, self.n_embedding, 0.1)

        # Linear mapping between layers
        self.H = init_shared_normal(self.n_embedding, self.n_embedding, 0.1)

        # Final output weight matrix
        # self.W = init_shared_normal(self.n_embedding, self.n_words, 0.1)

        # Answer embedding matrix
        self.A = init_shared_normal(self.n_words, self.n_embedding, 0.1)

        # Final scoring matrix
        self.U = init_shared_normal(self.n_embedding, self.n_embedding, 0.1)

    # NOTE(review): the graph construction below is assumed to run for
    # both the freshly initialised and the loaded model -- confirm that
    # load_model restores every attribute read here.
    zero_vector = T.vector('zv', dtype=theano.config.floatX)

    # Statement
    x = T.imatrix('x')
    xbatch = T.tensor3('xb', dtype='int32')

    # Positional encoding matrix
    pe = T.tensor3('pe')

    # Question
    q = T.ivector('q')
    qbatch = T.imatrix('qb')

    # True word
    r = T.iscalar('r')
    rbatch = T.ivector('rb')

    # Stacked answer vectors
    a = T.imatrix('a')
    abatch = T.tensor3('ab', dtype='int32')

    memory_cost = self.memnn_cost(x, q, a, pe)
    # memory_loss = -T.log(memory_cost[r])  # cross entropy on softmax
    memory_loss = self.memnn_batch_cost(xbatch, qbatch, rbatch, abatch, pe)

    params = [
        self.weights,
        # self.B,
        # self.W,
        self.H,
        self.A,
        self.U,
    ]

    # Squared-norm penalty of every parameter, added left to right.
    penalties = [self.regularization * T.sum(p ** 2) for p in params]
    regularization_cost = sum(penalties[1:], penalties[0])

    cost = memory_loss + regularization_cost
    grads = T.grad(cost, params)

    l_rate = T.scalar('l_rate')

    # Parameter updates; the momentum argument is forwarded rather than
    # a hard-coded 0.9 so that the constructor parameter takes effect.
    updates = get_param_updates(
        params, grads, lr=l_rate, method='adagrad', momentum=momentum,
        constraint=self._constrain_embedding(self.null_word_id,
                                             zero_vector))

    self.train_function = theano.function(
        inputs=[
            xbatch, qbatch, rbatch, abatch, pe,
            theano.Param(l_rate, default=self.lr),
            theano.Param(zero_vector,
                         default=np.zeros((self.n_embedding, ),
                                          theano.config.floatX))
        ],
        outputs=cost,
        updates=updates,
        allow_input_downcast=True,
        # mode='FAST_COMPILE',
        # mode='DebugMode'
        # mode=theano.compile.MonitorMode(pre_func=inspect_inputs,post_func=inspect_outputs)
        on_unused_input='warn')

    self.predict_function = theano.function(
        inputs=[x, q, a, pe],
        outputs=memory_cost,
        allow_input_downcast=True,
        # mode='FAST_COMPILE',
        on_unused_input='warn')
# Unpack the three values returned by _slice_outs for the phase outputs
# (presumably mixture means, widths and weights -- confirm in _slice_outs).
mu_phase, sigma_phase, coeff_phase = _slice_outs(phase_outs)

# The last axis of `target` is cut at n_out // 2: the first part is scored
# against the magnitude parameters, the remainder against the phase ones.
target_split = n_out // 2
mag_target = target[:, :, :target_split]
phase_target = target[:, :, target_split:]

mag_cost = single_dimensional_gmms(mag_target, mu_mag, sigma_mag, coeff_mag)
phase_cost = single_dimensional_phase_gmms(phase_target, mu_phase,
                                           sigma_phase, coeff_phase)

# Total cost: sum of both terms, multiplied by `mask`, summed and then
# divided by cut_len.
cost = mag_cost + phase_cost
cost = cost * mask
cost = cost.sum() / cut_len

grads = tensor.grad(cost, params)
# Second argument (10.) is the clipping threshold handed to the helper.
grads = gradient_clipping(grads, 10.)

learning_rate = 1E-4
opt = adam(params, learning_rate)
updates = opt.updates(params, grads)

# Training step: returns the cost plus h1, h2, h3, kappa and w, and applies
# the optimiser updates.
train_function = theano.function([
    X_sym, X_mask_sym, c_sym, c_mask_sym, init_h1, init_h2, init_h3,
    init_kappa, init_w, bias_sym
], [cost, h1, h2, h3, kappa, w], updates=updates)

# NOTE(review): the call below is cut off after `bias_sym`; the rest of its
# argument list is not part of this excerpt.
cost_function = theano.function([
    X_sym, X_mask_sym, c_sym, c_mask_sym, init_h1, init_h2, init_h3,
    init_kappa, init_w, bias_sym
# output = T.nnet.sigmoid(T.dot(output, W2) + b2) print lasagne.layers.get_output(l_decoder, inputs={ l_in: x_sym }).eval({ x_sym: ftest_x }).shape loss_all_target = lasagne.objectives.squared_error(output, t_sym).sum() loss_mean_target = loss_all_target / n_batch # print loss_mean_target.eval({x_sym:test_x,mask_x_sym:mask_test_x, t_sym: target_train, mask_t_sym: mask_target_train}) all_params_target = lasagne.layers.get_all_params([l_decoder]) all_grads_target = [ T.clip(g, -10, 10) for g in T.grad(loss_mean_target, all_params_target) ] all_grads_target = lasagne.updates.total_norm_constraint(all_grads_target, 10) updates_target = adam(all_grads_target, all_params_target) train_model = theano.function([x_sym, t_sym], [loss_mean_target, output], updates=updates_target) test_model = theano.function([x_sym, t_sym], [loss_mean_target, output]) num_min_batches = 100 n_batch = 100 epochs = 100 for i in range(epochs): start_time = time.time()
def trainCompile(self):
    """Assemble the training graph and compile ``self.train``.

    Steps, in order: let every layer build its activation, then its
    sparsity penalty (if enabled), then its weight-decay penalty (if
    enabled); form the cost as the half squared error divided by the
    minibatch size plus every term in ``self.regularize``; take the
    gradients with respect to all layer weights; and register either
    plain gradient steps or RMSProp-scaled steps as updates.

    Returns ``self`` so calls can be chained.
    """
    n_layers = self.lastArrayNum

    # Activation
    for idx in xrange(n_layers):
        self.architecture[idx].compileActivation(self, idx)

    # Sparse penalty
    for idx in xrange(n_layers):
        layer = self.architecture[idx]
        if layer.sparsity:
            layer.compileSparsity(self, idx, self.options.minibatch_size)

    # Weight decay penalty
    for idx in xrange(n_layers):
        layer = self.architecture[idx]
        if layer.weightDecay:
            layer.compileWeightDecayPenalty(self, idx)

    # Error term: half squared error, scaled by 1 / minibatch size.
    prediction = self.varArrayA[-1]
    XENT = 1.0 / self.options.minibatch_size * T.sum(
        (self.y - prediction) ** 2 * 0.5)
    total_cost = XENT
    for penalty in self.regularize:
        total_cost += penalty
    self.cost = total_cost

    # Outputs of the compiled function: total cost, error term, prediction.
    self.outputArray.extend([self.cost, XENT, self.varArrayA[-1]])

    # Collect every weight variable so Theano knows what to differentiate.
    gradArray = []
    for idx in xrange(n_layers):
        layer_weights = self.varWeights[idx]
        gradArray.extend(layer_weights[key] for key in layer_weights.keys())
    self.derivativesArray = T.grad(self.cost, gradArray)

    # RMSProp: one running mean of squared gradients per weight variable.
    if self.options.rmsProp:
        for idx, gradient in enumerate(self.derivativesArray):
            previous_ms = theano.shared(
                np.zeros(gradArray[idx].get_value().shape).astype(
                    theano.config.floatX),
                name="mmsp%s" % (idx + 1))
            self.MMSprev.append(previous_ms)
            decay = self.options.rmsProp
            current_ms = decay * previous_ms + (1 - decay) * gradient ** 2
            # Keep the mean square inside [mmsmin, float32 max]; it is used
            # as a divisor below, so this guards against NaN.
            current_ms = T.clip(current_ms, self.options.mmsmin,
                                np.finfo(np.float32).max)
            self.MMSnew.append(current_ms)

    # Register the parameter (and running-mean) updates.
    for idx, gradient in enumerate(self.derivativesArray):
        if self.options.rmsProp:
            step = (self.options.learnStep * gradient
                    / self.MMSnew[idx] ** 0.5)
            self.updatesArray.append((self.MMSprev[idx], self.MMSnew[idx]))
        else:
            step = self.options.learnStep * gradient
        self.updatesArray.append((gradArray[idx], gradArray[idx] - step))

    self.train = theano.function(
        inputs=[self.x, self.y],
        outputs=self.outputArray,
        updates=self.updatesArray,
        allow_input_downcast=True)
    return self
def train_med2vec(seqFile='seqFile.txt', demoFile='demoFile.txt',
                  labelFile='labelFile.txt', outFile='outFile.txt',
                  modelFile='modelFile.txt', L2_reg=0.001, numXcodes=20000,
                  numYcodes=20000, embDimSize=1000, hiddenDimSize=2000,
                  batchSize=100, demoSize=2, logEps=1e-8, windowSize=1,
                  verbose=False, maxEpochs=1000):
    """Build the model, then train it for ``maxEpochs`` epochs.

    Every keyword argument is collected into an ``options`` dict that
    is handed to the helpers (``init_params``, ``build_model``,
    ``adadelta``, ``padMatrix``). Which inputs the model takes depends
    on ``demoSize`` and ``numYcodes``: demographics are fed when
    ``demoSize > 0`` and labels when ``numYcodes > 0``. After each
    epoch the mean cost is printed and the parameters are written to
    ``outFile + '.' + str(epoch)`` with ``np.savez_compressed``.
    """
    # Snapshot of the arguments only -- must run before any other local
    # name is bound.
    options = locals().copy()

    # The input layout is decided once and reused for graph building and
    # for feeding minibatches.
    with_demo_and_labels = demoSize > 0 and numYcodes > 0
    with_labels_only = demoSize == 0 and numYcodes > 0
    with_demo_only = demoSize > 0 and numYcodes == 0

    print('initializing parameters')
    params = init_params(options)
    # params = load_params(options)
    tparams = init_tparams(params)

    print('building models')
    model_vars = build_model(tparams, options)
    if with_demo_and_labels:
        x, d, y, mask, iVector, jVector, cost = model_vars
        optional_inputs = {'d': d, 'y': y}
    elif with_labels_only:
        x, y, mask, iVector, jVector, cost = model_vars
        optional_inputs = {'y': y}
    elif with_demo_only:
        x, d, mask, iVector, jVector, cost = model_vars
        optional_inputs = {'d': d}
    else:
        x, mask, iVector, jVector, cost = model_vars
        optional_inputs = {}
    grads = T.grad(cost, wrt=list(tparams.values()))
    f_grad_shared, f_update = adadelta(tparams, grads, x, mask, iVector,
                                       jVector, cost, options,
                                       **optional_inputs)

    print('loading data')
    seqs, demos, labels = load_data(seqFile, demoFile, labelFile)
    n_batches = int(np.ceil(float(len(seqs)) / float(batchSize)))

    print('training start')
    for epoch in range(maxEpochs):
        iteration = 0
        costVector = []
        # Visit every minibatch exactly once, in random order.
        shuffled_batches = random.sample(list(range(n_batches)), n_batches)
        for index in shuffled_batches:
            start = batchSize * index
            stop = batchSize * (index + 1)
            batchX = seqs[start:stop]
            batchY = []
            if with_demo_and_labels:
                batchY = labels[start:stop]
                xs, ys, batch_mask, i_vec, j_vec = padMatrix(
                    batchX, batchY, options)
                batchD = demos[start:stop]
                cost = f_grad_shared(xs, batchD, ys, batch_mask, i_vec, j_vec)
            elif with_labels_only:
                batchY = labels[start:stop]
                xs, ys, batch_mask, i_vec, j_vec = padMatrix(
                    batchX, batchY, options)
                cost = f_grad_shared(xs, ys, batch_mask, i_vec, j_vec)
            elif with_demo_only:
                xs, batch_mask, i_vec, j_vec = padMatrix(
                    batchX, batchY, options)
                batchD = demos[start:stop]
                cost = f_grad_shared(xs, batchD, batch_mask, i_vec, j_vec)
            else:
                xs, batch_mask, i_vec, j_vec = padMatrix(
                    batchX, batchY, options)
                cost = f_grad_shared(xs, batch_mask, i_vec, j_vec)
            costVector.append(cost)
            f_update()

            if verbose and (iteration % 10 == 0):
                print('epoch:%d, iteration:%d/%d, cost:%f' %
                      (epoch, iteration, n_batches, cost))
            iteration += 1

        print('epoch:%d, mean_cost:%f' % (epoch, np.mean(costVector)))
        # Checkpoint the current parameters for this epoch.
        tempParams = unzip(tparams)
        np.savez_compressed(outFile + '.' + str(epoch), **tempParams)
#-------------------- #declare theano variables #-------------------- X = theano.tensor.matrix('X', dtype='floatX') Y = theano.tensor.matrix('Y', dtype='floatX') theta = theano.shared(numpy.zeros((n, 1)), name="theta") #-------------------- #declare theano expressions for logistic regression regression #-------------------- #hypothesis h = T.nnet.sigmoid(T.dot(X, theta)) #cost function cost = 1.0 / m * T.sum(-Y * T.log(h) - (1 - Y) * T.log(1 - h)) #grad function gtheta = T.grad(cost, theta) #train function train = theano.function(inputs=[X, Y], outputs=[ cost, ], updates=((theta, theta - alpha * gtheta), )) #predict function predict = theano.function(inputs=[ X, ], outputs=[ h > 0.5, ]) #----------------- #train the logistic regression
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
    gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
    regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
    regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    :type batch_size: int
    :param batch_size: number of examples per minibatch

    :type n_hidden: int
    :param n_hidden: value passed to the MLP as ``n_hidden``
    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing.
    # Explicit floor division keeps these counts integral (they feed
    # xrange) regardless of the division semantics in effect.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=n_hidden,
                     n_out=10)

    # start-snippet-4
    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = (classifier.negative_log_likelihood(y)
            + L1_reg * classifier.L1
            + L2_reg * classifier.L2_sqr)
    # end-snippet-4

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # start-snippet-5
    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = [T.grad(cost, param) for param in classifier.params]

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs: plain gradient descent with a
    # fixed learning rate.
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(classifier.params, gparams)]

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-5

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            # one SGD step; the returned minibatch cost is not used
            train_model(minibatch_index)
            # global iteration number (0-based, counted over all epochs)
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience,
                                       iteration * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            # early stopping: out of patience
            if patience <= iteration:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')