def getProb(bestModel, dataset, probFilename, P):
    print "...getting probability"
    setX, setY, setName = dataset
    sharedSetX, sharedSetY, castSharedSetY = dnnUtils.sharedDataXY(setX, setY)

    idx = T.ivector('i')
    sX = T.matrix(dtype=theano.config.floatX)
    sY = T.ivector()

    # build best DNN model
    predicter = DNN(input=dnnUtils.splicedX(sX, idx, P.spliceWidth), P=P, params=bestModel)

    # Validation model
    Model = theano.function(
        inputs=[idx],
        outputs=predicter.p_y_given_x,
        givens={sX: sharedSetX, sY: castSharedSetY},
        on_unused_input='ignore')

    # Center index
    centerIdx = dnnUtils.findCenterIdxList(setY)
    # Total center index
    totalCenterIdxSize = len(centerIdx)
    # Make mini-batch
    batchIdx = dnnUtils.makeBatch(totalCenterIdxSize, 16384)

    # Write probability
    dnnUtils.writeProb(Model, batchIdx, centerIdx, setName, probFilename)
    dnnUtils.clearSharedDataXY(sharedSetX, sharedSetY)
def __init__(self, vocab_size, dim, lr=0.5): W = np.asarray(np.random.rand(vocab_size, dim), dtype=theano.config.floatX) / float(dim) W1 = np.asarray((np.random.rand(vocab_size, dim)), dtype=theano.config.floatX) / float(dim) self.W = theano.shared(W, name='W', borrow=True) self.W1 = theano.shared(W1, name='W1', borrow=True) gW = np.asarray(np.ones((vocab_size, dim)), dtype=theano.config.floatX) gW1 = np.asarray( np.ones((vocab_size, dim)), dtype=theano.config.floatX) self.gW = theano.shared(gW, name='gW', borrow=True) self.gW1 = theano.shared(gW1, name='gW1', borrow=True) X = T.vector() fX = T.vector() ind_W = T.ivector() ind_W1 = T.ivector() w = self.W[ind_W, :] w1 = self.W1[ind_W1, :] cost = T.sum(fX * ((T.sum(w * w1, axis=1) - X) ** 2)) grad = T.clip(T.grad(cost, [w, w1]), -5.0, 5.0) updates1 = [(self.gW, T.inc_subtensor(self.gW[ind_W, :], grad[0] ** 2))] updates2 = [(self.gW1, T.inc_subtensor(self.gW1[ind_W1, :], grad[1] ** 2))] updates3 = [(self.W, T.inc_subtensor(self.W[ind_W, :], - (lr / T.sqrt(self.gW[ind_W, :])) * grad[0]))] updates4 = [(self.W1, T.inc_subtensor(self.W1[ind_W1, :], - (lr / T.sqrt(self.gW1[ind_W1, :])) * grad[1]))] updates = updates1 + updates2 + updates3 + updates4 self.cost_fn = theano.function( inputs=[ind_W, ind_W1, X, fX], outputs=cost, updates=updates)
def test_CSMGrad(self): imshp = (3, 3) nkern = 1 # per output pixel kshp = (2, 2) # ssizes = ((1,1),(2,2)) ssizes = ((1, 1),) # convmodes = ('full','valid',) convmodes = ("full",) kerns = tensor.dvector() indices = tensor.ivector() indptr = tensor.ivector() spmat_shape = tensor.ivector() for mode in ["FAST_COMPILE", "FAST_RUN"]: for conv_mode in convmodes: for ss in ssizes: indvals, indptrvals, spshapevals, sptype, outshp, kmap = sp.convolution_indices.sparse_eval( imshp, kshp, nkern, ss, conv_mode ) kvals = numpy.random.random(nkern * numpy.prod(kshp) * numpy.prod(outshp)).flatten() def d(kerns): return theano.sparse.dense_from_sparse( theano.sparse.CSM(sptype, kmap)(kerns, indvals, indptrvals, spshapevals) ) # symbolic stuff utt.verify_grad(d, [kvals])
def predict_next_batch(self, session_ids, input_item_ids, predict_for_item_ids=None, batch=100):
    '''
    Gives prediction scores for a selected set of items. Can be used in batch mode to predict for multiple independent events (i.e. events of different sessions) at once and thus speed up evaluation.

    If the session ID at a given coordinate of the session_ids parameter remains the same during subsequent calls of the function, the corresponding hidden state of the network will be kept intact (i.e. that's how one can predict an item to a session).
    If it changes, the hidden state of the network is reset to zeros.

    Parameters
    --------
    session_ids : 1D array
        Contains the session IDs of the events of the batch. Its length must be equal to the prediction batch size (batch param).
    input_item_ids : 1D array
        Contains the item IDs of the events of the batch. Every item ID must be in the training data of the network. Its length must be equal to the prediction batch size (batch param).
    predict_for_item_ids : 1D array (optional)
        IDs of items for which the network should give prediction scores. Every ID must be in the training set. The default value is None, which means that the network gives prediction on its every output (i.e. for all items in the training set).
    batch : int
        Prediction batch size.

    Returns
    --------
    out : pandas.DataFrame
        Prediction scores for selected items for every event of the batch.
        Columns: events of the batch; rows: items. Rows are indexed by the item IDs.

    '''
    if self.error_during_train:
        raise Exception
    if self.predict is None or self.predict_batch != batch:
        X = T.ivector()
        Y = T.ivector()
        for i in range(len(self.layers)):
            self.H[i].set_value(np.zeros((batch, self.layers[i]), dtype=theano.config.floatX), borrow=True)
        if predict_for_item_ids is not None:
            H_new, yhat, _ = self.model(X, self.H, Y, 0)
        else:
            H_new, yhat = self.model_test(X, self.H)
        updatesH = OrderedDict()
        for i in range(len(self.H)):
            updatesH[self.H[i]] = H_new[i]
        if predict_for_item_ids is not None:
            self.predict = function(inputs=[X, Y], outputs=yhat, updates=updatesH, allow_input_downcast=True)
        else:
            self.predict = function(inputs=[X], outputs=yhat, updates=updatesH, allow_input_downcast=True)
        self.current_session = np.ones(batch) * -1
        self.predict_batch = batch
    session_change = np.arange(batch)[session_ids != self.current_session]
    if len(session_change) > 0:
        for i in range(len(self.H)):
            tmp = self.H[i].get_value(borrow=True)
            tmp[session_change] = 0
            self.H[i].set_value(tmp, borrow=True)
        self.current_session = session_ids.copy()
    in_idxs = self.itemidmap[input_item_ids]
    if predict_for_item_ids is not None:
        iIdxs = self.itemidmap[predict_for_item_ids]
        preds = np.asarray(self.predict(in_idxs, iIdxs)).T
        return pd.DataFrame(data=preds, index=predict_for_item_ids)
    else:
        in_idxs.values[np.isnan(in_idxs.values)] = 0
        preds = np.asarray(self.predict(in_idxs)).T
        return pd.DataFrame(data=preds, index=self.itemidmap.index)
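# --- Hedged usage sketch for predict_next_batch (not part of the original code). ---
# Assumes a trained GRU4Rec-style model object `gru` exposing the method above and an
# item-ID-to-index mapping `gru.itemidmap` built from the training data; the session
# and item arrays below are made-up illustrations.
import numpy as np

batch = 2
session_ids = np.array([101, 102])                 # one session per batch coordinate
input_item_ids = np.array(['item_a', 'item_b'])    # current event's item for each session

# Scores for every training item; one column per event in the batch, rows indexed by item ID.
scores = gru.predict_next_batch(session_ids, input_item_ids, batch=batch)
top5_event0 = scores[0].nlargest(5)                # top-5 next-item candidates for the first event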
def bsgd1(nn, data, name='sgd', lr=0.022, alpha=0.3, batch_size=500, epochs=10):
    train_set_x, train_set_y = data[0]
    valid_set_x, valid_set_y = data[1]
    test_set_x, test_set_y = data[2]

    # valid_y_numpy = y_numpy[0]
    # test_y_numpy = y_numpy[1]
    # NOTE: the next three lines reference test_y_numpy / valid_y_numpy, which are only
    # produced by the commented-out assignments above, so they are disabled as well.
    # test_y_numpy = map_48_to_39(test_y_numpy)
    # valid_y_numpy = map_48_to_39(valid_y_numpy)
    # print test_y_numpy

    num_samples = train_set_x.get_value(borrow=True).shape[0]
    num_batches = num_samples / batch_size

    layers = nn.layers
    x = T.matrix('x')
    y = T.ivector('y')
    y_eval = T.ivector('y_eval')

    cost = nn.cost(x, y)
    accuracy = nn.calcAccuracy(x, y)
    params = nn.params
    delta_params = nn.delta_params
    print theano.pp(cost)
    # theano.pp(accuracy)

    p_grads = [T.grad(cost=cost, wrt=p) for p in params]
    # implementing gradient descent with momentum
    print p_grads
    updates = OrderedDict()
    for dp, gp in zip(delta_params, p_grads):
        updates[dp] = dp * alpha - gp * lr
    for p, dp in zip(params, delta_params):
        updates[p] = p + updates[dp]
    # updates = [(p, p - lr*gp) for p, gp in zip(params, p_grads)]

    index = T.ivector('index')
    batch_sgd_train = theano.function(inputs=[index], outputs=[cost, accuracy], updates=updates,
                                      givens={x: train_set_x[index], y: train_set_y[index]})

    batch_sgd_valid = theano.function(inputs=[],
                                      outputs=[nn.calcAccuracy(x, y), nn.calcAccuracyTimit(x, y)],
                                      givens={x: valid_set_x, y: valid_set_y})

    batch_sgd_test = theano.function(inputs=[], outputs=nn.calcAccuracy(x, y),
                                     givens={x: test_set_x, y: test_set_y})

    indices = np.arange(num_samples, dtype=np.dtype('int32'))
    np.random.shuffle(indices)

    for n in xrange(epochs):
        np.random.shuffle(indices)
        for i in xrange(num_batches):
            batch = indices[i * batch_size: (i + 1) * batch_size]
            batch_sgd_train(batch)
        # y_np = y.get_value()
        # print y.eval()
        print "epoch:", n, " validation accuracy:", batch_sgd_valid()

    print batch_sgd_test()
def create_TrainFunc_tranPES(simfn, embeddings, marge=0.5, alpha=1., beta=1.): # parse the embedding data embedding = embeddings[0] # D x N matrix lembedding = embeddings[1] # declare the symbolic variables for training triples hp = S.csr_matrix('head positive') # N x batchsize matrix rp = S.csr_matrix('relation') tp = S.csr_matrix('tail positive') hn = S.csr_matrix('head negative') tn = S.csr_matrix('tail negative') lemb = T.scalar('embedding learning rate') lremb = T.scalar('relation learning rate') subtensorE = T.ivector('batch entities set') subtensorR = T.ivector('batch link set') # Generate the training positive and negative triples hpmat = S.dot(embedding.E, hp).T # batchsize x D dense matrix rpmat = S.dot(lembedding.E, rp).T tpmat = S.dot(embedding.E, tp).T hnmat = S.dot(embedding.E, hn).T tnmat = S.dot(embedding.E, tn).T # calculate the score pos = tranPES3(simfn, T.concatenate([hpmat, tpmat], axis=1).reshape((hpmat.shape[0], 2, hpmat.shape[1])).dimshuffle(0, 2, 1), hpmat, rpmat, tpmat) negh = tranPES3(simfn, T.concatenate([hnmat, tpmat], axis=1).reshape((hnmat.shape[0], 2, hnmat.shape[1])).dimshuffle(0, 2, 1), hnmat, rpmat, tpmat) negt = tranPES3(simfn, T.concatenate([hpmat, tnmat], axis=1).reshape((hpmat.shape[0], 2, hpmat.shape[1])).dimshuffle(0, 2, 1), hpmat, rpmat, tnmat) costh, outh = margeCost(pos, negh, marge) costt, outt = margeCost(pos, negt, marge) embreg = regEmb(embedding, subtensorE, alpha) lembreg = regLink(lembedding, subtensorR, beta) cost = costh + costt + embreg[0] + lembreg out = T.concatenate([outh, outt]) outc = embreg[1] # list of inputs to the function list_in = [lemb, lremb, hp, rp, tp, hn, tn, subtensorE, subtensorR] # updating the embeddings using gradient descend emb_grad = T.grad(cost, embedding.E) New_embedding = embedding.E - lemb*emb_grad remb_grad = T.grad(cost, lembedding.E) New_rembedding = lembedding.E - lremb * remb_grad updates = OrderedDict({embedding.E: New_embedding, lembedding.E: New_rembedding}) return theano.function(list_in, [cost, T.mean(out), T.mean(outc), embreg[0], lembreg], updates=updates, on_unused_input='ignore')
def test_multMatVect():
    A1 = tensor.lmatrix('A1')
    s1 = tensor.ivector('s1')
    m1 = tensor.iscalar('m1')
    A2 = tensor.lmatrix('A2')
    s2 = tensor.ivector('s2')
    m2 = tensor.iscalar('m2')

    g0 = rng_mrg.DotModulo()(A1, s1, m1, A2, s2, m2)
    f0 = theano.function([A1, s1, m1, A2, s2, m2], g0)

    i32max = numpy.iinfo(numpy.int32).max

    A1 = numpy.random.randint(0, i32max, (3, 3)).astype('int64')
    s1 = numpy.random.randint(0, i32max, 3).astype('int32')
    m1 = numpy.asarray(numpy.random.randint(i32max), dtype="int32")
    A2 = numpy.random.randint(0, i32max, (3, 3)).astype('int64')
    s2 = numpy.random.randint(0, i32max, 3).astype('int32')
    m2 = numpy.asarray(numpy.random.randint(i32max), dtype="int32")

    f0.input_storage[0].storage[0] = A1
    f0.input_storage[1].storage[0] = s1
    f0.input_storage[2].storage[0] = m1
    f0.input_storage[3].storage[0] = A2
    f0.input_storage[4].storage[0] = s2
    f0.input_storage[5].storage[0] = m2

    r_a1 = rng_mrg.matVecModM(A1, s1, m1)
    r_a2 = rng_mrg.matVecModM(A2, s2, m2)
    f0.fn()
    r_b = f0.output_storage[0].value

    assert numpy.allclose(r_a1, r_b[:3])
    assert numpy.allclose(r_a2, r_b[3:])
def directRNN(): ####################### NumPy x0=0.5 s=0.5 times=[1,10,20,30,40,50] yhat=direct(x0, s, times) ############################### Symbolic x0_ = T.scalar("x0") c_= T.log((1-x0_)/x0_) times_ = T.ivector("times") S__=theano.shared(np.asarray(s, dtype = theano.config.floatX), 'S') yhat_= T.nnet.sigmoid(S__*times_/2-c_) Predict_ = theano.function(inputs=[x0_,times_], outputs=yhat_) ############################### Symbolic Recursive x0_ = T.scalar("x0") times_ = T.ivector("times") S__=theano.shared(np.asarray(s, dtype = theano.config.floatX), 'S') # predall_, updatesRecurrence_ = theano.scan(lambda x_prev, s: (s*x_prev*x_prev+s*x_prev +2*x_prev)/(2*s*x_prev+2), outputs_info=x0_,non_sequences=S__,n_steps=times_[-1]) predall_, updatesRecurrence_ = theano.scan(lambda x_prev, s: x_prev+(s*x_prev*(1-x_prev))/(2*s*x_prev+2), outputs_info=x0_,non_sequences=S__,n_steps=times_[-1]) pred_=predall_[times_-1] #we only have target at some generations e.g. 10,20,... Feedforward_ = theano.function(inputs=[x0_,times_], outputs=pred_, updates=updatesRecurrence_) ############################# Comparison x_0=0.5 x_1=x_0+(s*x_0*(1-x_0))/(2*s*x_0+2) print '{:20s}{}'.format('NumPy', yhat) print '{:20s}{}'.format('Symbolic Direct', Predict_(x0,list(times))) print '{:20s}{}'.format('Symbolic Recursive', Feedforward_(x0,list(times))) print '{:20s}[ {} ]'.format('x_1', x_1)
def multMatVect(v, A, m1, B, m2): """ multiply the first half of v by A with a modulo of m1 and the second half by B with a modulo of m2 Note: The parameters of dot_modulo are passed implicitly because passing them explicitly takes more time then running the function's C-code. """ if multMatVect.dot_modulo is None: A_sym = tensor.lmatrix("A") s_sym = tensor.ivector("s") m_sym = tensor.iscalar("m") A2_sym = tensor.lmatrix("A2") s2_sym = tensor.ivector("s2") m2_sym = tensor.iscalar("m2") o = DotModulo()(A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym) multMatVect.dot_modulo = function([A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym], o) # This way of calling the Theano fct is done to bypass Theano overhead. f = multMatVect.dot_modulo f.input_storage[0].storage[0] = A f.input_storage[1].storage[0] = v[:3] f.input_storage[2].storage[0] = m1 f.input_storage[3].storage[0] = B f.input_storage[4].storage[0] = v[3:] f.input_storage[5].storage[0] = m2 f.fn() r = f.output_storage[0].storage[0] return r
def __init__(self, numpy_rng, theano_rng=None, y=None, alpha=0.9, sample_rate=0.1, n_ins=784,
             hidden_layers_sizes=[500, 500], n_outs=10, corruption_levels=[0.1, 0.1],
             allX=None, allY=None, srng=None):
    self.sigmoid_layers = []
    self.sugar_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)
    self.allXs = []
    if y == None:
        self.y = tensor.ivector(name='y')
    else:
        self.y = y
    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    self.x = tensor.matrix('x')
    # (the original repeated the x/y declarations here; the duplicates are dropped so
    #  the `y` argument handled above is not silently overwritten)

    for i in xrange(self.n_layers):
        if i == 0:
            input_size = n_ins
        else:
            input_size = hidden_layers_sizes[i - 1]
        if i == 0:
            layer_input = self.x
        else:
            layer_input = self.sigmoid_layers[-1].output
        if i == 0:
            self.allXs.append(allX)
        else:
            self.allXs.append(tensor.dot(self.allXs[i - 1], self.sigmoid_layers[-1].W) + self.sigmoid_layers[-1].b)

        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i],
                                    activation=tensor.nnet.sigmoid)
        self.sigmoid_layers.append(sigmoid_layer)
        self.params.extend(sigmoid_layer.params)

        sugar_layer = sugar(numpy_rng=numpy_rng,
                            alpha=alpha,
                            sample_rate=sample_rate,
                            x=layer_input,
                            y=self.y,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            bhid=sigmoid_layer.b,
                            allX=self.allXs[i],
                            allY=allY,
                            srng=srng)
        self.sugar_layers.append(sugar_layer)

    self.logLayer = LogisticRegression(
        input=self.sigmoid_layers[-1].output,
        n_in=hidden_layers_sizes[-1],
        n_out=n_outs)
    self.params.extend(self.logLayer.params)
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
    self.errors = self.logLayer.errors(self.y)
def __init__(self, dnodex,inputdim,dim): X=T.ivector() Y=T.ivector() Z=T.lscalar() eta = T.scalar() temperature=T.scalar() self.dnodex=dnodex num_input = inputdim dnodex.umatrix=theano.shared(floatX(np.random.randn(*(self.dnodex.nuser,inputdim, inputdim)))) dnodex.pmatrix=theano.shared(floatX(np.random.randn(*(self.dnodex.npoi,inputdim)))) dnodex.p_l2_norm=(dnodex.pmatrix**2).sum() dnodex.u_l2_norm=(dnodex.umatrix**2).sum() num_hidden = dim num_output = inputdim inputs = InputPLayer(dnodex.pmatrix[X,:], dnodex.umatrix[Z,:,:], name="inputs") lstm1 = LSTMLayer(num_input, num_hidden, input_layer=inputs, name="lstm1") lstm2 = LSTMLayer(num_hidden, num_hidden, input_layer=lstm1, name="lstm2") lstm3 = LSTMLayer(num_hidden, num_hidden, input_layer=lstm2, name="lstm3") softmax = SoftmaxPLayer(num_hidden, num_output, dnodex.umatrix[Z,:,:], input_layer=lstm3, name="yhat", temperature=temperature) Y_hat = softmax.output() self.layers = inputs, lstm1,lstm2,lstm3,softmax params = get_params(self.layers) #caches = make_caches(params) cost = T.mean(T.nnet.categorical_crossentropy(Y_hat, T.dot(dnodex.pmatrix[Y,:],dnodex.umatrix[Z,:,:])))+eta*dnodex.p_l2_norm+eta*dnodex.u_l2_norm updates = PerSGD(cost,params,eta,X,Z,dnodex)#momentum(cost, params, caches, eta) self.train = theano.function([X,Y,Z, eta, temperature], cost, updates=updates, allow_input_downcast=True) predict_updates = one_step_updates(self.layers) self.predict_char = theano.function([X, Z, temperature], Y_hat, updates=predict_updates, allow_input_downcast=True)
def __theano_build__(self): params = self.params param_names = self.param_names hidden_dim = self.hidden_dim x1 = T.imatrix('x1') # first sentence x2 = T.imatrix('x2') # second sentence x1_mask = T.fmatrix('x1_mask') #mask x2_mask = T.fmatrix('x2_mask') y = T.ivector('y') # label y_c = T.ivector('y_c') # class weights # Embdding words _E1 = params["E"].dot(params["W"][0]) + params["B"][0] _E2 = params["E"].dot(params["W"][1]) + params["B"][1] statex1 = _E1[x1.flatten(), :].reshape([x1.shape[0], x1.shape[1], hidden_dim]) statex2 = _E2[x2.flatten(), :].reshape([x2.shape[0], x2.shape[1], hidden_dim]) def rnn_cell(x, mx, ph, Wh): h = T.tanh(ph.dot(Wh) + x) h = mx[:, None] * h + (1-mx[:, None]) * ph return [h] [h1], updates = theano.scan( fn=rnn_cell, sequences=[statex1, x1_mask], truncate_gradient=self.truncate, outputs_info=[dict(initial=T.zeros([self.batch_size, self.hidden_dim]))], non_sequences=params["W"][2]) [h2], updates = theano.scan( fn=rnn_cell, sequences=[statex2, x2_mask], truncate_gradient=self.truncate, outputs_info=[dict(initial=h1[-1])], non_sequences=params["W"][3]) #predict _s = T.nnet.softmax(h1[-1].dot(params["lrW"][0]) + h2[-1].dot(params["lrW"][1]) + params["lrb"]) _p = T.argmax(_s, axis=1) _c = T.nnet.categorical_crossentropy(_s, y) _c = T.sum(_c * y_c) _l = T.sum(params["lrW"]**2) _cost = _c + 0.01 * _l # SGD parameters learning_rate = T.scalar('learning_rate') decay = T.scalar('decay') # Gradients and updates _grads, _updates = rms_prop(_cost, param_names, params, learning_rate, decay) # Assign functions self.bptt = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _grads) self.loss = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _c) self.weights = theano.function([x1, x2, x1_mask, x2_mask], _s) self.predictions = theano.function([x1, x2, x1_mask, x2_mask], _p) self.sgd_step = theano.function( [x1, x2, x1_mask, x2_mask, y, y_c, learning_rate, decay], updates=_updates)
def main(num_epochs=NUM_EPOCHS):
    print("Building network ...")
    # First, we build the network, starting with an input layer.
    # Recurrent layers expect input of shape (batch size, SEQ_LENGTH, num_inputs).

    # The network model
    l_in = lasagne.layers.InputLayer(shape=(BATCH_SIZE, SEQ_LENGTH, num_inputs))
    l_forward_1 = lasagne.layers.LSTMLayer(l_in, N_HIDDEN, grad_clipping=GRAD_CLIP,
                                           nonlinearity=lasagne.nonlinearities.tanh)
    l_forward_2 = lasagne.layers.LSTMLayer(l_forward_1, N_HIDDEN, grad_clipping=GRAD_CLIP,
                                           nonlinearity=lasagne.nonlinearities.tanh)
    l_shp = lasagne.layers.ReshapeLayer(l_forward_2, (-1, N_HIDDEN))
    l_dense = lasagne.layers.DenseLayer(l_shp, num_units=num_inputs,
                                        nonlinearity=lasagne.nonlinearities.linear)
    l_out = lasagne.layers.ReshapeLayer(l_dense, (-1, SEQ_LENGTH, num_inputs))

    # create output out of input in order to save memory?
    network_output = lasagne.layers.get_output(l_out)

    # symbolic target variable, defined before it is used in the cost
    # (note: `input_values` is unused, and its name string looks like a copy-paste of
    #  'target_output'; for the squared-error cost the targets would normally have to
    #  match the shape and dtype of the real-valued network output)
    input_values = T.ivector('target_output')
    target_values = T.ivector('target_output')

    cost = lasagne.objectives.squared_error(network_output, target_values).mean()
    all_params = lasagne.layers.get_all_params(l_out, trainable=True)
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function([l_in.input_var, target_values], cost, updates=updates,
                            allow_input_downcast=True)
    compute_cost = theano.function([l_in.input_var, target_values], cost,
                                   allow_input_downcast=True)
def __init__(self, input_params=None, sentenceLayerNodesNum=[150, 120], sentenceLayerNodesSize=[(2, 200), (3, 1)], negativeLambda=1, poolingSize=[(2, 1)], mode="max"): """ mode is in {'max', 'average_inc_pad', 'average_exc_pad', 'sum'} """ rng = numpy.random.RandomState(23455) self._corpusWithEmbeddings = T.matrix("wordIndeices") self._dialogSentenceCount = T.ivector("dialogSentenceCount") self._sentenceWordCount = T.ivector("sentenceWordCount") # for list-type data self._layer0 = layer0 = SentenceEmbeddingMultiNN(self._corpusWithEmbeddings, self._dialogSentenceCount, self._sentenceWordCount, rng, wordEmbeddingDim=200, \ sentenceLayerNodesNum=sentenceLayerNodesNum, \ sentenceLayerNodesSize=sentenceLayerNodesSize, poolingSize=poolingSize, mode=mode) layer1 = HiddenLayer( rng, input=layer0.output, n_in=layer0.outputDimension, n_out=layer0.outputDimension, activation=T.tanh ) self._nextSentence = layer1.output self._params = layer1.params + layer0.params self._setParameters(input_params) self.negativeLambda = negativeLambda zero_count = 1 for sentence, pooling in zip(sentenceLayerNodesSize[-1::-1], [(1, 1)] + poolingSize[-1::-1]): zero_count *= pooling[0] zero_count += sentence[0] - 1 self.zero_count = zero_count - 1
def __theano_build__(self): U, V, W = self.U, self.V, self.W x = T.ivector('x') y = T.ivector('y') def forward_prop_step(x_t, s_t_prev, U, V, W): s_t = T.tanh(U[:,x_t] + W.dot(s_t_prev)) o_t = T.nnet.softmax(V.dot(s_t)) return [o_t[0], s_t] [o,s], updates = theano.scan( forward_prop_step, sequences=x, outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))], non_sequences=[U, V, W], truncate_gradient=self.bptt_truncate, strict=True) prediction = T.argmax(o, axis=1) o_error = T.sum(T.nnet.categorical_crossentropy(o, y)) # Gradients dU = T.grad(o_error, U) dV = T.grad(o_error, V) dW = T.grad(o_error, W) # Assign functions self.forward_propagation = theano.function([x], o) self.predict = theano.function([x], prediction) self.ce_error = theano.function([x, y], o_error) self.bptt = theano.function([x, y], [dU, dV, dW]) # SGD learning_rate = T.scalar('learning_rate') self.sgd_step = theano.function([x,y,learning_rate], [], updates=[(self.U, self.U - learning_rate * dU), (self.V, self.V - learning_rate * dV), (self.W, self.W - learning_rate * dW)])
def build_finetune_functions(self, learning_rate):
    is_train = T.iscalar('is_train')
    X = T.matrix('X')
    AtRisk = T.ivector('AtRisk')
    Observed = T.ivector('Observed')

    # call the optimization function
    opt = Opt()

    forward = theano.function(
        on_unused_input='ignore',
        inputs=[X, Observed, AtRisk, is_train],
        outputs=[self.riskLayer.cost(self.o, self.AtRisk), self.riskLayer.output, self.riskLayer.input],
        givens={
            self.x: X,
            self.o: Observed,
            self.AtRisk: AtRisk,
            self.is_train: is_train
        },
        name='forward'
    )
    backward = theano.function(
        on_unused_input='ignore',
        inputs=[X, Observed, AtRisk, is_train],
        updates=opt.SGD(self.riskLayer.cost(self.o, self.AtRisk), self.params, learning_rate),
        outputs=T.grad(self.riskLayer.cost(self.o, self.AtRisk), self.params),
        givens={
            self.x: X,
            self.o: Observed,
            self.AtRisk: AtRisk,
            self.is_train: is_train
        },
        name='backward'  # was labelled 'forward' as well, apparently a copy-paste slip
    )
    return forward, backward
def test_unwrapper(): emb_size = 5 y_time = tt.ivector() y_seq_id = tt.ivector() x = tt.tensor3() emb = IdentityInput(x, size=5) sequn = SeqUnwrapper(20) sequn.connect(emb, y_time, y_seq_id) rng = np.random.RandomState(23455) conv = LeNetConvPoolLayer() conv.connect(sequn, rng, (3, 1, 5, emb_size), (1, 1, )) #prev_layer = conv f = theano.function([x, y_time, y_seq_id], conv.output()) xx = np.random.randn(20, 4, emb_size) y_time = [3, 7, 10, 12] y_seq_id = [0, 0, 0, 0] res = f(xx, y_time, y_seq_id) print res.shape print res import ipdb; ipdb.set_trace()
def get_training_functions(self, x_lab_np=None, y_lab_np=None, x_unlab_np=None):
    # store the arrays passed in, since the rest of the method reads them from self
    if x_lab_np is not None:
        self.x_lab_np = x_lab_np
    if y_lab_np is not None:
        self.y_lab_np = y_lab_np
    if x_unlab_np is not None:
        self.x_unlab_np = x_unlab_np

    # assert xlab.shape[0] == len(y_lab)
    assert self.x_lab_np.shape[0] == len(self.y_lab_np)

    self.x_lab = self._shared_dataset(self.x_lab_np)
    self.y_lab = self._shared_dataset(self.y_lab_np)
    self.x_unlab = self._shared_dataset(self.x_unlab_np)
    # ratio of labelled to unlabelled samples (float division, unlike the original)
    self.alpha = float(self.x_lab_np.shape[0]) / self.x_unlab_np.shape[0]

    index_unlab = T.ivector('index_unlab')
    index_lab = T.ivector('index_lab')
    momentum = T.scalar('momentum')
    learning_rate = T.scalar('learning_rate')
    # cost, updates = self.get_cost_updates(self.x_lab, self.x_unlab, self.y_lab)

    self.batch_size_lab = self.batch_size * self.alpha
    self.batch_size_unlab = self.batch_size * (1 - self.alpha)

    x_lab = T.matrix('x_lab')
    x_unlab = T.matrix('x_unlab')
    y_lab = T.ivector('y_lab')

    self.num_labels = self.x_lab_np.shape[0]
    self.num_unlabels = self.x_unlab_np.shape[0]
    self.num_samples = self.num_labels + self.num_unlabels
    num_batches = self.num_samples / float(self.batch_size)

    pretraining_fns = []
    for i in xrange(len(self.layers)):  # the original iterated over an undefined `hidden_layers`
        ssda = self.layers[i]
        # exit()  # stray debugging call, disabled
        cost, updates = ssda.get_cost_updates(self.x_lab, self.x_unlab, self.y_lab)
        train_fn = theano.function(inputs=[index_lab, index_unlab],
                                   updates=updates,
                                   outputs=[cost],
                                   givens={self.x_lab: self.x_lab[index_lab],
                                           self.x_unlab: self.x_unlab[index_unlab],
                                           self.y_lab: self.y_lab[index_lab]})
        pretraining_fns.append(train_fn)
    return pretraining_fns
def EnergyVecFn(fnsim, embeddings, leftop, rightop):
    embedding, relationl, relationr = parse_embeddings(embeddings)

    idxl, idxo, idxr = T.ivector('idxl'), T.ivector('idxo'), T.ivector('idxr')

    lhs, rhs = embedding.E[:, idxl].T, embedding.E[:, idxr].T
    rell, relr = relationl.E[:, idxo].T, relationr.E[:, idxo].T

    energy = - fnsim(leftop(lhs, rell), rightop(rhs, relr))
    return theano.function([idxl, idxr, idxo], [energy], on_unused_input='ignore')
def train(self, word_emb):
    X_local = T.ivector(name="X_local")
    X = T.iscalar(name="X")
    X_neg = T.ivector(name="X_neg")
    X_g = T.dvector(name="X_g")

    [o_error], updates = theano.scan(self.target_function,
                                     sequences=X_neg,
                                     non_sequences=[word_emb, X_local, X, X_g])
    error_sum = T.sum(o_error)
    self.c_error = theano.function([X_local, X, X_neg, X_g], error_sum)

    d_word_emb = T.grad(error_sum, word_emb)
    d_W1 = T.grad(error_sum, self.W1)
    d_b1 = T.grad(error_sum, self.b1)
    d_W2 = T.grad(error_sum, self.W2)
    d_b2 = T.grad(error_sum, self.b2)
    d_Wg1 = T.grad(error_sum, self.Wg1)
    d_bg1 = T.grad(error_sum, self.bg1)
    d_Wg2 = T.grad(error_sum, self.Wg2)
    d_bg2 = T.grad(error_sum, self.bg2)

    # updates must be (shared_variable, new_value) pairs; the original listed only the
    # "param - grad" expressions, which is not a valid updates list
    self.train_step = theano.function(
        [X_local, X, X_neg, X_g], [],
        updates=[(word_emb, word_emb - d_word_emb),
                 (self.W1, self.W1 - d_W1),
                 (self.b1, self.b1 - d_b1),
                 (self.W2, self.W2 - d_W2),
                 (self.b2, self.b2 - d_b2),
                 (self.Wg1, self.Wg1 - d_Wg1),
                 (self.bg1, self.bg1 - d_bg1),
                 (self.Wg2, self.Wg2 - d_Wg2),
                 (self.bg2, self.bg2 - d_bg2)])
def multMatVect(v, A, m1, B, m2): # TODO : need description for parameter and return """ Multiply the first half of v by A with a modulo of m1 and the second half by B with a modulo of m2. Notes ----- The parameters of dot_modulo are passed implicitly because passing them explicitly takes more time than running the function's C-code. """ if multMatVect.dot_modulo is None: A_sym = tensor.lmatrix('A') s_sym = tensor.ivector('s') m_sym = tensor.iscalar('m') A2_sym = tensor.lmatrix('A2') s2_sym = tensor.ivector('s2') m2_sym = tensor.iscalar('m2') o = DotModulo()(A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym) multMatVect.dot_modulo = function( [A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym], o, profile=False) # This way of calling the Theano fct is done to bypass Theano overhead. f = multMatVect.dot_modulo f.input_storage[0].storage[0] = A f.input_storage[1].storage[0] = v[:3] f.input_storage[2].storage[0] = m1 f.input_storage[3].storage[0] = B f.input_storage[4].storage[0] = v[3:] f.input_storage[5].storage[0] = m2 f.fn() r = f.output_storage[0].storage[0] return r
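# --- Hedged usage sketch for multMatVect (not part of the original code). ---
# Assumes the surrounding module defines DotModulo, function and tensor as used above and,
# like theano.sandbox.rng_mrg, initialises the cached Theano function attribute with
# `multMatVect.dot_modulo = None` after the definition; the concrete values are illustrative.
import numpy

multMatVect.dot_modulo = None            # compiled lazily on the first call (may already be set)

m1, m2 = numpy.int32(2147483647), numpy.int32(2147462579)   # two moduli
A = numpy.random.randint(0, m1, (3, 3)).astype('int64')
B = numpy.random.randint(0, m2, (3, 3)).astype('int64')
v = numpy.random.randint(0, m1, 6).astype('int32')           # length-6 state vector

r = multMatVect(v, A, m1, B, m2)
# r[:3] holds A.v[:3] mod m1 and r[3:] holds B.v[3:] mod m2 (cf. matVecModM in the test above)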
def compile(self): ''' Forward pass and Gradients ''' # Get nicer names for parameters W1, W2, W3 = [self.W1] + self.params # FORWARD PASS # Embedding layer subspace self.z0 = T.ivector() # tweet in one hot # Use an intermediate sigmoid z1 = W1[:, self.z0] # embedding z2 = T.nnet.sigmoid(T.dot(W2, z1)) # subspace # Hidden layer z3 = T.dot(W3, z2) z4 = T.sum(z3, 1) # Bag of words self.hat_y = T.nnet.softmax(z4.T).T self.fwd = theano.function([self.z0], self.hat_y) # TRAINING COST AND GRADIENTS # Train cost minus log probability self.y = T.ivector() # reference out self.F = -T.mean(T.log(self.hat_y)[self.y]) # For softmax out # Update only last three parameters self.nablas = [] # Symbolic gradients self.grads = [] # gradients for W in self.params: self.nablas.append(T.grad(self.F, W)) self.grads.append(theano.function([self.z0, self.y], T.grad(self.F, W))) self.cost = theano.function([self.z0, self.y], self.F)
def initialize(self): users = T.ivector() items = T.ivector() ratings = T.vector() self.U = theano.shared( np.array( np.random.normal(scale=0.001, size=(self.n_users, self.n_factors)), dtype=theano.config.floatX ) ) self.I = theano.shared( np.array( np.random.normal(scale=0.001, size=(self.n_items, self.n_factors)), dtype=theano.config.floatX ) ) predictions = (self.U[users] * self.I[items]).sum(axis=1) train_error = ( ((predictions - ratings) ** 2).mean() + self.regularization * ( T.sum(self.U ** 2) + T.sum(self.I ** 2) ) ) test_error = ((predictions - ratings) ** 2).mean() params = [self.U, self.I] learning_rate = theano.shared(np.array(self.learning_rate, dtype=theano.config.floatX)) updates = self.optimizer(train_error, params, learning_rate=learning_rate) self.train_theano = theano.function([users, items, ratings], train_error, updates=updates) self.test_theano = theano.function([users, items, ratings], test_error) self.predict_theano = theano.function([users, items], predictions)
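# --- Hedged usage sketch for the compiled factorization functions (not part of the original code). ---
# Assumes a matrix-factorization model object `mf` whose initialize() above has already run
# (so n_users, n_items, n_factors, regularization, learning_rate and optimizer exist);
# the index and rating arrays below are illustrative only.
import numpy as np
import theano

users = np.asarray([0, 0, 1], dtype=np.int32)                     # user index per observed rating
items = np.asarray([5, 9, 5], dtype=np.int32)                     # item index per observed rating
ratings = np.asarray([4.0, 2.5, 3.0], dtype=theano.config.floatX)

train_loss = mf.train_theano(users, items, ratings)   # one optimizer update step (MSE + L2 penalty)
test_mse = mf.test_theano(users, items, ratings)       # plain MSE, no regularization term
predicted = mf.predict_theano(users, items)             # predicted rating per (user, item) pair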
def __init__(self, vocabulary_size, hidden_size, output_size): X = tensor.ivector() Y = tensor.ivector() keep_prob = tensor.fscalar() learning_rate = tensor.fscalar() emb_layer = Embedding(vocabulary_size, hidden_size) lstm_layer = BiLSTM(hidden_size, hidden_size) dropout_layer = Dropout(keep_prob) fc_layer = FullConnect(2*hidden_size, output_size) crf = CRF(output_size) # graph defination X_emb = emb_layer(X) scores = fc_layer(tensor.tanh(lstm_layer(dropout_layer(X_emb)))) loss, predict = crf(scores, Y, isTraining=True) # loss, predict and accuracy accuracy = tensor.sum(tensor.eq(predict, Y)) * 1.0 / Y.shape[0] params = emb_layer.params + lstm_layer.params + fc_layer.params + crf.params updates = MomentumSGD(loss, params, lr=learning_rate) print("Compiling train function: ") train = theano.function(inputs=[X, Y, keep_prob, learning_rate], outputs=[predict, accuracy, loss], updates=updates, allow_input_downcast=True) print("Compiling evaluate function: ") evaluate = theano.function(inputs=[X_emb, Y, keep_prob], outputs=[predict, accuracy, loss], allow_input_downcast=True) self.embedding_tensor = emb_layer.params[0] self.train = train self.evaluate = evaluate self.params = params
def set_model(args, init_w_emb, w_emb_dim, vocab_word, vocab_char, vocab_tag): print '\nBuilding a neural model: %s\n' % args.model """ neural architecture parameters """ c_emb_dim = args.c_emb_dim w_hidden_dim = args.w_hidden_dim c_hidden_dim = args.c_hidden_dim output_dim = vocab_tag.size() window = args.window opt = args.opt """ symbol definition """ x = T.ivector() c = T.ivector() b = T.ivector() y = T.ivector() lr = T.fscalar('lr') if args.model == 'char': return nn_char.Model(name=args.model, w=x, c=c, b=b, y=y, lr=lr, init_w_emb=init_w_emb, vocab_w_size=vocab_word.size(), vocab_c_size=vocab_char.size(), w_emb_dim=w_emb_dim, c_emb_dim=c_emb_dim, w_hidden_dim=w_hidden_dim, c_hidden_dim=c_hidden_dim, output_dim=output_dim, window=window, opt=opt) else: return nn_word.Model(name=args.model, x=x, y=y, lr=lr, init_emb=init_w_emb, vocab_size=vocab_word.size(), emb_dim=w_emb_dim, hidden_dim=w_hidden_dim, output_dim=output_dim, window=window, opt=opt)
def create_iter_funcs_valid(l_out, bs=None, N=50, mc_dropout=False): X = T.tensor4('X') y = T.ivector('y') X_batch = T.tensor4('X_batch') y_batch = T.ivector('y_batch') if not mc_dropout: y_hat = layers.get_output(l_out, X, deterministic=True) else: if bs is None: raise ValueError('a fixed batch size is required for mc dropout') X_repeat = T.extra_ops.repeat(X, N, axis=0) y_sample = layers.get_output( l_out, X_repeat, deterministic=False) sizes = [X_repeat.shape[0] / X.shape[0]] * bs y_sample_split = T.as_tensor_variable( T.split(y_sample, sizes, bs, axis=0)) y_hat = T.mean(y_sample_split, axis=1) valid_loss = T.mean( T.nnet.categorical_crossentropy(y_hat, y)) valid_acc = T.mean( T.eq(y_hat.argmax(axis=1), y)) valid_iter = theano.function( inputs=[theano.Param(X_batch), theano.Param(y_batch)], outputs=[valid_loss, valid_acc], givens={ X: X_batch, y: y_batch, }, ) return valid_iter
def create_iter_funcs_train(l_out, lr, mntm, wd): X = T.tensor4('X') y = T.ivector('y') X_batch = T.tensor4('X_batch') y_batch = T.ivector('y_batch') y_hat = layers.get_output(l_out, X, deterministic=False) # softmax loss train_loss = T.mean( T.nnet.categorical_crossentropy(y_hat, y)) # L2 regularization train_loss += wd * regularize_network_params(l_out, l2) train_acc = T.mean( T.eq(y_hat.argmax(axis=1), y)) all_params = layers.get_all_params(l_out, trainable=True) updates = lasagne.updates.nesterov_momentum( train_loss, all_params, lr, mntm) train_iter = theano.function( inputs=[theano.Param(X_batch), theano.Param(y_batch)], outputs=[train_loss, train_acc], updates=updates, givens={ X: X_batch, y: y_batch, }, ) return train_iter
def evaluate_ready(self, ispro = True): var_x = T.ivector() var_y = T.ivector() print "adopt mention level evaluate ???????????????????? "+str(self.ismention) if self.model_type == "softmax" or self.model_type == "softmax_reg": if self.istransition: output = self.structure1(var_x, ispro = False) self.evafunc = theano.function([var_x], output) else: output = self.structure1(var_x, ispro) self.evafunc = theano.function([var_x], output) elif self.model_type == "maxneg": out1, out2 = self.structure2(var_x,ispro) self.evafunc = theano.function([var_x], [out1,out2]) elif self.model_type == "maxout": out1, out2 = self.structure2(var_x,False) self.evafunc = theano.function([var_x], [out1,out2]) else: raise Exception
def __init__(self, input_params=None): rng = numpy.random.RandomState(23455) self._corpusWithEmbeddings = T.matrix("wordIndeices") self._dialogSentenceCount = T.ivector("dialogSentenceCount") self._sentenceWordCount = T.ivector("sentenceWordCount") # for list-type data self._layer0 = SentenceEmbeddingNN(self._corpusWithEmbeddings, self._dialogSentenceCount, self._sentenceWordCount, rng, wordEmbeddingDim=200, \ sentenceLayerNodesNum=1000, \ sentenceLayerNodesSize=[5, 200]) self._average_layer = sentenceEmbeddingAverage(self._corpusWithEmbeddings, self._dialogSentenceCount, self._sentenceWordCount, rng, wordEmbeddingDim=200) # Get sentence layer W semanicTransformW = theano.shared( numpy.asarray( rng.uniform(low=-0.2, high=0.2, size=(self._layer0.outputDimension, 200)), dtype=config.globalFloatType() ), borrow=True ) self._nextSentence = T.dot(self._layer0.output, semanicTransformW) # construct the parameter array. self._params = [semanicTransformW] + self._layer0.params self._setParameters(input_params)
def build(self):
    x = T.ivector('x')
    y = T.ivector('y')
    lr = T.scalar('learning_rate')

    def _recurrence(x_t, s_tm1):
        s_t = T.tanh(self.U[:, x_t] + T.dot(s_tm1, self.W))
        o_t = T.nnet.softmax(T.dot(s_t, self.V))
        return [o_t[0], s_t]

    # self.U, self.V and self.W are taken from the enclosing scope, so strict=True
    # (which forbids implicit inputs) is dropped here
    [o, s], updates = theano.scan(fn=_recurrence,
                                  sequences=x,
                                  outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))],
                                  truncate_gradient=self.bptt_truncate)

    prediction = T.argmax(o, axis=1)
    o_error = T.sum(T.nnet.categorical_crossentropy(o, y))

    # Gradients
    gparams = T.grad(o_error, self.params)
    updates = [(param, param - lr * gparam) for param, gparam in zip(self.params, gparams)]

    # Assign functions
    self.forward_propagation = theano.function([x], o)
    self.predict = theano.function([x], prediction)
    self.train = theano.function(inputs=[x, y, lr],
                                 outputs=o_error,
                                 updates=updates)
def main(model='mlp', num_epochs=500): # Load the dataset print("Loading data...") X_train, y_train, X_val, y_val, X_test, y_test = load_dataset() # Prepare Theano variables for inputs and targets input_var = T.tensor4('inputs') target_var = T.ivector('targets') # Create neural network model (depending on first command line parameter) print("Building model and compiling functions...") if model == 'mlp': network = build_mlp(input_var) elif model.startswith('custom_mlp:'): depth, width, drop_in, drop_hid = model.split(':', 1)[1].split(',') network = build_custom_mlp(input_var, int(depth), int(width), float(drop_in), float(drop_hid)) elif model == 'cnn': network = build_cnn(input_var) else: print("Unrecognized model type %r." % model) return # Create a loss expression for training, i.e., a scalar objective we want # to minimize (for our multi-class problem, it is the cross-entropy loss): prediction = lasagne.layers.get_output(network) loss = lasagne.objectives.categorical_crossentropy(prediction, target_var) loss = loss.mean() # We could add some weight decay as well here, see lasagne.regularization. # Create update expressions for training, i.e., how to modify the # parameters at each training step. Here, we'll use Stochastic Gradient # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more. params = lasagne.layers.get_all_params(network, trainable=True) updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.01, momentum=0.9) # Create a loss expression for validation/testing. The crucial difference # here is that we do a deterministic forward pass through the network, # disabling dropout layers. test_prediction = lasagne.layers.get_output(network, deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy( test_prediction, target_var) test_loss = test_loss.mean() # As a bonus, also create an expression for the classification accuracy: test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX) # Compile a function performing a training step on a mini-batch (by giving # the updates dictionary) and returning the corresponding training loss: train_fn = theano.function([input_var, target_var], loss, updates=updates) # Compile a second function computing the validation loss and accuracy: val_fn = theano.function([input_var, target_var], [test_loss, test_acc]) # Finally, launch the training loop. 
print("Starting training...") # We iterate over epochs: for epoch in range(num_epochs): # In each epoch, we do a full pass over the training data: train_err = 0 train_batches = 0 start_time = time.time() for batch in iterate_minibatches(X_train, y_train, 500, shuffle=True): inputs, targets = batch train_err += train_fn(inputs, targets) train_batches += 1 # And a full pass over the validation data: val_err = 0 val_acc = 0 val_batches = 0 for batch in iterate_minibatches(X_val, y_val, 500, shuffle=False): inputs, targets = batch err, acc = val_fn(inputs, targets) val_err += err val_acc += acc val_batches += 1 # Then we print the results for this epoch: print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs, time.time() - start_time)) print(" training loss:\t\t{:.6f}".format(train_err / train_batches)) print(" validation loss:\t\t{:.6f}".format(val_err / val_batches)) print(" validation accuracy:\t\t{:.2f} %".format(val_acc / val_batches * 100)) # After training, we compute and print the test error: test_err = 0 test_acc = 0 test_batches = 0 for batch in iterate_minibatches(X_test, y_test, 500, shuffle=False): inputs, targets = batch err, acc = val_fn(inputs, targets) test_err += err test_acc += acc test_batches += 1 print("Final results:") print(" test loss:\t\t\t{:.6f}".format(test_err / test_batches)) print(" test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))
def __init__(self, numpy_rng, theano_rng=None, n_ins=None, hidden_layers_sizes=[50], iBNhl = -1, n_outs=None): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the DBN :type n_layers_sizes: list of ints :param n_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network """ self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector # of [int] labels # The DBN is an MLP, for which all weights of intermediate # layers are shared with a different RBM. We will first # construct the DBN as a deep multilayer perceptron, and when # constructing each sigmoidal layer we also construct an RBM # that shares weights with that layer. During pretraining we # will train these RBMs (which will lead to chainging the # weights of the MLP as well) During finetuning we will finish # training the DBN by doing stochastic gradient descent on the # MLP. for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden # units of the layer below or the input size if we are on # the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the # hidden layer below or the input of the DBN if you are on # the first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], #activation=T.nnet.sigmoid) activation=T.tanh) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... but we are # going to only declare that the parameters of the # sigmoid_layers are parameters of the DBN. The visible # biases in the RBM are parameters of those RBMs, but not # of the DBN. 
self.params.extend(sigmoid_layer.params) # Construct an RBM that shared weights with this layer rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) # compute the cost for second phase of training, defined as the # negative log likelihood of the logistic regression (output) layer self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y) self.y_pred = self.logLayer.y_pred self.p_y = self.logLayer.p_y_given_x #print(len(self.sigmoid_layers)) #for l in self.sigmoid_layers: # print('0 ',l.output.shape) self.BN_f = self.sigmoid_layers[iBNhl].output
def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, dataset='../data/mnist.pkl.gz', batch_size=600): """ Demonstrate stochastic gradient descent optimization of a log-linear model This is demonstrated on MNIST. :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: the path of the MNIST dataset file from http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz """ datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels # construct the logistic regression class # Each MNIST image has size 28*28 classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10) # the cost we minimize during training is the negative log likelihood of # the model in symbolic format cost = classifier.negative_log_likelihood(y) # compiling a Theano function that computes the mistakes that are made by # the model on a minibatch test_model = theano.function( inputs=[index], outputs=classifier.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( inputs=[index], outputs=classifier.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # compute the gradient of cost with respect to theta = (W,b) g_W = T.grad(cost=cost, wrt=classifier.W) g_b = T.grad(cost=cost, wrt=classifier.b) # specify how to update the parameters of the model as a list of # (variable, update expression) pairs. updates = [(classifier.W, classifier.W - learning_rate * g_W), (classifier.b, classifier.b - learning_rate * g_b)] # compiling a Theano function `train_model` that returns the cost, but in # the same time updates the parameter of the model based on the rules # defined in `updates` train_model = theano.function( inputs=[index], outputs=cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) ############### # TRAIN MODEL # ############### print '... training the model' # early-stopping parameters patience = 5000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf test_score = 0. 
start_time = time.clock() done_looping = False epoch = 0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): minibatch_avg_cost = train_model(minibatch_index) # iteration number iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in xrange(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % \ (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) best_validation_loss = this_validation_loss # test it on the test set test_losses = [ test_model(i) for i in xrange(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of best' ' model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = time.clock() print(('Optimization complete with best validation score of %f %%,' 'with test performance %f %%') % (best_validation_loss * 100., test_score * 100.)) print 'The code run for %d epochs, with %f epochs/sec' % ( epoch, 1. * epoch / (end_time - start_time)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.1fs' % ((end_time - start_time)))
def train_cnn_model(self, X_train, y_train): print 'Training CNN model....' print X_train.shape num_classes = len(set(y_train)) ######### Build CNN ######################################### #self.data_width = int(X_train.shape[2]/10) #X_train = X_train.reshape(X_train.shape[0], 3, -1) print X_train.shape l_in = lasagne.layers.InputLayer(shape=(None, X_train.shape[1], X_train.shape[2])) conv_network = l_in #Build Convolution layers for l in range(self.params['conv_layers']): conv_network = lasagne.layers.Conv1DLayer(conv_network, num_filters=self.params['conv_filter_num'][l], filter_size=self.params['conv_filter_dims'], nonlinearity=lasagne.nonlinearities.rectify) print 'l_conv%d output: '%l+str(lasagne.layers.get_output_shape(conv_network)) conv_network = lasagne.layers.MaxPool1DLayer(conv_network, pool_size=self.params['pool_size']) print 'l_pool%d output: '%l+str(lasagne.layers.get_output_shape(conv_network)) conv_output = lasagne.layers.get_output(conv_network) network = conv_network #Build fully connected hidden layers for i in range(self.params['hid_layers']): units = self.params['hid_units'][i] network = lasagne.layers.DenseLayer(network, num_units=units, nonlinearity=lasagne.nonlinearities.tanh) network = lasagne.layers.DropoutLayer(network, p=0.5) #Build output layer network = lasagne.layers.DenseLayer(network, num_units=num_classes, nonlinearity=lasagne.nonlinearities.softmax) input_var = T.tensor4('inputs') target_var = T.ivector('targets') predictions = lasagne.layers.get_output(network) conv_weights = lasagne.layers.get_output(conv_network) self.classifier = theano.function([l_in.input_var],predictions) self.cnn_weights = theano.function([l_in.input_var], conv_output) loss = lasagne.objectives.categorical_crossentropy(predictions, target_var) loss = loss.mean() params = lasagne.layers.get_all_params(network, trainable=True) updates = lasagne.updates.nesterov_momentum(loss,params, learning_rate=0.01, momentum=0.9) test_prediction = lasagne.layers.get_output(network, deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var) test_loss = test_loss.mean() # As a bonus, also create an expression for the classification accuracy: test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX) # Compile a function performing a training step on a mini-batch (by giving # the updates dictionary) and returning the corresponding training loss: train_fn = theano.function([l_in.input_var, target_var], loss, updates=updates) # Compile a second function computing the validation loss and accuracy: self.val_fn = theano.function([l_in.input_var, target_var], [test_loss, test_acc]) num_epochs = self.params['epochs'] for epoch in range(num_epochs): start_time = time.time() train_err = 0 train_batches = 0 for batch in self.iterate_batches(X_train, y_train): inputs, targets = batch train_err += train_fn(inputs, targets.astype(np.int32)) train_batches += 1 #val_err = 0 #val_acc = 0 #val_batches = 0 #for batch in self.iterate_batches(X_val, y_val): #inputs, targets = batch #err, acc = val_fn(X_val, y_val) #val_err += err #val_acc += acc #val_batches += 1 # Then we print the results for this epoch: print("Epoch {} of {} took {:.3f}s".format( epoch + 1, num_epochs, time.time() - start_time)) print(" training loss:\t\t{:.6f}".format(train_err / train_batches)) #print(" validation loss:\t\t{:.6f}".format(val_err / val_batches)) #print(" validation accuracy:\t\t{:.2f} %".format( #val_acc / val_batches * 100)) cnn_X_train = 
self.cnn_weights(X_train) cnn_X_train = cnn_X_train.reshape([cnn_X_train.shape[0], -1]) self.svm = SVC() self.svm = self.svm.fit(cnn_X_train, y_train)
def mlp_run(train_set, valid_set, test_set, learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000, batch_size=20, n_hidden=500): """ Demonstrate stochastic gradient descent optimization of a log-linear model """ print 'loading ', train_set, ' for train' train_set_x, train_set_y = load_data(train_set) print 'loading ', valid_set, ' for valid' valid_set_x, valid_set_y = load_data(valid_set) print 'loading ', test_set, ' for test' if test_set != valid_set: test_set_x, test_set_y = load_data(test_set) else: test_set_x, test_set_y = valid_set_x, valid_set_y # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size print "train_set_x size:", train_set_x.get_value(borrow=True).shape[0] print "batch_size:", batch_size print "n_train_batches:", n_train_batches ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # generate symbolic variables for input (x and y represent a # minibatch) x = T.matrix('x') # data, presented as rasterized images y = T.ivector('y') # labels, presented as 1D vector of [int] labels # construct the logistic regression class # Each MNIST image has size 28*28 total_dim = train_set_x.get_value(borrow=True).shape[1] rng = numpy.random.RandomState(1234) # construct the MLP class classifier = MLP(rng=rng, input=x, n_in=total_dim, n_hidden=n_hidden, n_out=2) # start-snippet-4 # the cost we minimize during training is the negative log likelihood of # the model plus the regularization terms (L1 and L2); cost is expressed # here symbolically cost = (classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr) # end-snippet-4 # start-snippet-5 # compute the gradient of cost with respect to theta (sotred in params) # the resulting gradients will be stored in a list gparams gparams = [T.grad(cost, param) for param in classifier.params] # specify how to update the parameters of the model as a list of # (variable, update expression) pairs # given two lists of the same length, A = [a1, a2, a3, a4] and # B = [b1, b2, b3, b4], zip generates a list C of same size, where each # element is a pair formed from the two lists : # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)] updates = [(param, param - learning_rate * gparam) for param, gparam in zip(classifier.params, gparams)] # compiling a Theano function that computes the mistakes that are made by # the model on a minibatch test_model = theano.function( inputs=[index], outputs=classifier.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( inputs=[index], outputs=classifier.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) validate_auc = theano.function(inputs=[], outputs=classifier.auc(y), givens={ x: valid_set_x, y: valid_set_y }) test_auc = theano.function(inputs=[], outputs=classifier.auc(y), givens={ x: test_set_x, y: test_set_y }) # compiling a Theano function `train_model` that returns the cost, but in # the same time updates the parameter of the model based on the rules # defined in `updates` train_model = theano.function( 
inputs=[index], outputs=cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) # end-snippet-3 ############### # TRAIN MODEL # ############### print '... training the model' # early-stopping parameters patience = FLAGS.iter # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch print "n_train_batches:", n_train_batches print "validation_frequency:", validation_frequency best_validation_loss = numpy.inf best_auc = 0 test_score = 0. start_time = timeit.default_timer() done_looping = False epoch = 0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 if epoch == 10: learning_rate *= 0.8 if epoch == 20: learning_rate *= 0.5 if epoch == 30: learning_rate = 0.01 for minibatch_index in xrange(n_train_batches): minibatch_avg_cost = train_model(minibatch_index) # iteration number iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in xrange(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) auc_values = [validate_auc()] auc = numpy.mean(auc_values) print "current valid auc: ", auc, " best auc: ", best_auc, " imporve: ", auc - best_auc, " significant?: ", auc - best_auc > FLAGS.min_improvement #print validate_auc(0) if auc > best_auc: if auc - best_auc > FLAGS.min_improvement: print 'before patience:', patience, ' iter:', iter patience = max(patience, iter * patience_increase) print 'after patience:', patience best_auc = auc auc_values = [test_auc()] testauc = numpy.mean(auc_values) print "test auc: ", testauc #cPickle.dump(classifier, open('best_model.pkl', 'wb')) if patience <= iter: done_looping = True print "patience:", patience, "iter:", iter, "done_looping:", done_looping break end_time = timeit.default_timer() print(('Optimization complete with best validation score of %f %%,' 'with test performance %f %%') % (best_validation_loss * 100., test_score * 100.)) print 'The code run for %d epochs, with %f epochs/sec' % ( epoch, 1. * epoch / (end_time - start_time)) print 'best valid auc is ', best_auc print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.1fs' % ((end_time - start_time)))
def __init__( self, numpy_rng, theano_rng=None, cfg=None, # the network configuration dnn_shared=None, shared_layers=[], input=None): self.layers = [] self.params = [] self.delta_params = [] self.rnn_layerX = 2 print "Use DRN" self.cfg = cfg self.n_ins = cfg.n_ins self.n_outs = cfg.n_outs self.hidden_layers_sizes = cfg.hidden_layers_sizes self.hidden_layers_number = len(self.hidden_layers_sizes) self.activation = cfg.activation self.do_maxout = cfg.do_maxout self.pool_size = cfg.pool_size self.max_col_norm = cfg.max_col_norm self.l1_reg = cfg.l1_reg self.l2_reg = cfg.l2_reg if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2**30)) # allocate symbolic variables for the data if input == None: self.x = T.matrix('x') else: self.x = input self.y = T.ivector('y') for i in xrange(self.hidden_layers_number): # construct the hidden layer if i == 0: input_size = self.n_ins layer_input = self.x else: input_size = self.hidden_layers_sizes[i - 1] layer_input = self.layers[-1].output W = None b = None if (i in shared_layers): W = dnn_shared.layers[i].W b = dnn_shared.layers[i].b if i == self.rnn_layerX: hidden_layer = RnnLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=self.hidden_layers_sizes[i], W=W, b=b, activation=self.activation) else: if self.do_maxout == True: hidden_layer = HiddenLayer( rng=numpy_rng, input=layer_input, n_in=input_size, n_out=self.hidden_layers_sizes[i] * self.pool_size, W=W, b=b, activation=(lambda x: 1.0 * x), do_maxout=True, pool_size=self.pool_size) else: hidden_layer = HiddenLayer( rng=numpy_rng, input=layer_input, n_in=input_size, n_out=self.hidden_layers_sizes[i], W=W, b=b, activation=self.activation) # add the layer to our list of layers self.layers.append(hidden_layer) self.params.extend(hidden_layer.params) self.delta_params.extend(hidden_layer.delta_params) # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression(input=self.layers[-1].output, n_in=self.hidden_layers_sizes[-1], n_out=self.n_outs) if self.n_outs > 0: self.layers.append(self.logLayer) self.params.extend(self.logLayer.params) self.delta_params.extend(self.logLayer.delta_params) # compute the cost for second phase of training, # defined as the negative log likelihood self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) self.errors = self.logLayer.errors(self.y) if self.l1_reg is not None: for i in xrange(self.hidden_layers_number): W = self.layers[i].W self.finetune_cost += self.l1_reg * (abs(W).sum()) if self.l2_reg is not None: for i in xrange(self.hidden_layers_number): W = self.layers[i].W self.finetune_cost += self.l2_reg * T.sqr(W).sum()
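The `dnn_shared`/`shared_layers` mechanism above amounts to reusing the same `theano.shared` parameters in two graphs; a minimal sketch with toy shapes and no layer classes:

import numpy
import theano
import theano.tensor as T

x = T.matrix('x')
W = theano.shared(numpy.zeros((4, 3), dtype=theano.config.floatX), name='W')
b = theano.shared(numpy.zeros(3, dtype=theano.config.floatX), name='b')

h_net_a = T.tanh(T.dot(x, W) + b)   # hidden layer of one network
h_net_b = T.tanh(T.dot(x, W) + b)   # a second network reusing the same W and b
# any update applied to W or b through either graph is seen by both models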
def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset=DataSet, nkerns=[cls1, cls2], batch_size=100): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(23455) datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] print type(train_set_x) #train_set_x.set_value(train_set_x.get_value(borrow=True)[:,:540]) #valid_set_x.set_value(valid_set_x.get_value(borrow=True)[:,:540]) #test_set_x.set_value(test_set_x.get_value(borrow=True)[:,:540]) #train_set_x = train_set_x / 100 #valid_set_x = valid_set_x / 100 #test_set_x = test_set_x / 100 # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size #n_test_batches = (n_test_batches/batch_size) + (n_test_batches % batch_size > 0) print (n_test_batches) # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch Alr = T.scalar('Alr') x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ishape = (nFB, nFs) # this is the size of MNIST images ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer dFeatureV = 2*nFB*nFs xinp = x[:,:dFeatureV] # print (x.shahpe) layer0_input = xinp.reshape((batch_size, 2, nFB, nFs)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 2, nFB, nFs), filter_shape=(nkerns[0], 2, fsx, fsy), poolsize=(p1, 1)) cl2x = (nFB - fsx + 1)/p1 cl2y = (nFs - fsy + 1) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], cl2x, cl2y), filter_shape=(nkerns[1], nkerns[0], fsx, 1), poolsize=(p2, 1)) hl1 = (cl2x - fsx + 1)/p2 # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). 
# This will generate a matrix of shape (20,32*4*4) = (20,512) layer2_input = layer1.output.flatten(2) layer2_inputT = T.concatenate([layer2_input,x[:,dFeatureV:]],axis = 1) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_inputT, n_in=(nkerns[1] * hl1 * 1)+12, n_out=nhu1, activation=T.tanh) layer22 = HiddenLayer(rng, input=layer2.output, n_in=nhu1, n_out=nhu1, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer22.output, n_in=nhu1, n_out=n_out) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) #yPred = layer3.ypred(layer2.output) # create a function to compute the mistakes that are made by the model test_model = theano.function([index], [layer3.errors(y), layer3.y_pred], givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size]}) validate_model = theano.function([index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size: (index + 1) * batch_size]}) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer22.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i],grads[i]) pairs. updates = [] for param_i, grad_i in zip(params, grads): #updates.append((param_i, param_i - learning_rate * grad_i)) updates.append((param_i, param_i - Alr * grad_i)) train_model = theano.function([index, Alr], cost, updates=updates, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size][:], y: train_set_y[index * batch_size: (index + 1) * batch_size][:]}) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch #best_params = None best_params = [] best_validation_loss = numpy.inf prev_validation_loss = 200 best_iter = 0 test_score = 0. 
start_time = time.clock() Alrc = 0.1 AlrE = 0.00001 epochC = 0 epoch = 0 done_looping = False for param in params: best_params.append(param.get_value()) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 epochC = epochC + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print 'training @ iter = ', iter cost_ij = train_model(minibatch_index, Alrc) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) lossratio = (this_validation_loss - prev_validation_loss)/(prev_validation_loss+1) print (lossratio) print('epoch %i, minibatch %i/%i, validation error %f, lr %f %%' % \ (epoch, minibatch_index + 1, n_train_batches, \ this_validation_loss * 100., Alrc)) # if we got the best validation score until now #if this_validation_loss < best_validation_loss: if lossratio <= 0.0: for i in range(len(params)): best_params[i] = params[i].get_value() #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss prev_validation_loss = this_validation_loss best_iter = iter # test it on the test set #tm = test_model(0) yP = numpy.asarray([]) test_losses = [test_model(i)[0] for i in xrange(n_test_batches)] for i in xrange(n_test_batches): yP = numpy.concatenate((yP,test_model(i)[1])) print yP.shape test_score = numpy.mean(test_losses) #yP = yPred#yPred(layer2.output.owner.inputs[0].get_value()) y = test_set_y.owner.inputs[0].get_value()[:3000] print (yP.shape) print (y.shape) I1 = numpy.nonzero(y==0.0) I2 = numpy.nonzero(y==1.0) I3 = numpy.nonzero(y==2.0) I4 = numpy.nonzero(y==3.0) print (I1[0].shape) print (I2[0].shape) print (I3[0].shape) print (I4[0].shape) I11 = numpy.nonzero(yP[I1[0]]==0) I12 = numpy.nonzero(yP[I1[0]]==1) I13 = numpy.nonzero(yP[I1[0]]==2) I14 = numpy.nonzero(yP[I1[0]]==3) I21 = numpy.nonzero(yP[I2[0]]==0) I22 = numpy.nonzero(yP[I2[0]]==1) I23 = numpy.nonzero(yP[I2[0]]==2) I24 = numpy.nonzero(yP[I2[0]]==3) I31 = numpy.nonzero(yP[I3[0]]==0) I32 = numpy.nonzero(yP[I3[0]]==1) I33 = numpy.nonzero(yP[I3[0]]==2) I34 = numpy.nonzero(yP[I3[0]]==3) I41 = numpy.nonzero(yP[I4[0]]==0) I42 = numpy.nonzero(yP[I4[0]]==1) I43 = numpy.nonzero(yP[I4[0]]==2) I44 = numpy.nonzero(yP[I4[0]]==3) acc1 = float(float(I11[0].size)/float(I1[0].size)) acc2 = float(float(I22[0].size)/float(I2[0].size)) if n_out == 3: acc3 = float(float(I33[0].size)/float(I3[0].size)) elif n_out == 4: acc3 = float(float(I33[0].size)/float(I3[0].size)) acc4 = float(float(I44[0].size)/float(I4[0].size)) else: acc3 = 0 acc4 = 0 print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f, acc1 = %f, acc2 = %f, acc3 = %f, acc4 = %f, I11 = %i, I12 = %i, I13 = %i, I14 = %i, I21 = %i, I22 = %i, I23 = %i, I24 = %i, I31 = %i, I32 = %i, I33 = %i, I34 = %i, I41 = %i, I42 = %i, I43 = %i, I44 = %i %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100., acc1 * 100., acc2 * 100., acc3 * 100, acc4 * 100, I11[0].size, I12[0].size, I13[0].size, I14[0].size, I21[0].size, I22[0].size, I23[0].size, I24[0].size, I31[0].size, I32[0].size, I33[0].size, I34[0].size, I41[0].size, I42[0].size, I43[0].size, I44[0].size)) #print((' epoch %i, minibatch %i/%i, test error of best ' # 'model %f %%') 
% # (epoch, minibatch_index + 1, n_train_batches, # test_score * 100.)) else: if Alrc <= AlrE: done_looping = True break elif epochC > 40: Alrc = Alrc/2 for param, best_param in zip(params,best_params): param.set_value(best_param) epochC = 0 #if patience <= iter: # done_looping = True # break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
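The `I11`..`I44` bookkeeping above is a 4x4 confusion matrix computed by hand; a compact numpy equivalent over the same (hypothetical) true/predicted label arrays:

import numpy

def confusion_matrix(y_true, y_pred, n_classes=4):
    """Rows: true class, columns: predicted class."""
    cm = numpy.zeros((n_classes, n_classes), dtype=int)
    for t, p in zip(y_true.astype(int), y_pred.astype(int)):
        cm[t, p] += 1
    return cm

# per-class accuracies (acc1..acc4 above) are the diagonal over the row sums:
# acc = cm.diagonal() / cm.sum(axis=1).astype(float)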
def fit(self, X, Y, learning_rate=10e-5, mu=0.9, decay=0.99, epochs=10, batch_sz=100, eps=10e-10, display_cost=False): #learning_rate=10e-7, mu=0.99, decay=0.999, epochs=100, batch_sz=30, l2=0.0, eps=10e-10 learning_rate = np.float32(learning_rate) mu = np.float32(mu) decay = np.float32(decay) eps = np.float32(eps) ''' In Theano we can't actually 'drop' the nodes; that would result in a different computational graph, we are instead to multiply nodes by 1 and 0; for each layer we then need to create a 'mask' - array of 0s and 1s; Theano graph nodes don't have values, so we can't multiply them by numpy array 'mask'; instead we want Theano to generate random values every time it's called; thus we create an instance of RandomStreams object: ''' self.rng = RandomStreams() # first, make a validation set: X, Y = shuffle(X, Y) X = X.astype(np.float32) Y = Y.astype(np.int32) Xvalid, Yvalid = X[-1000:, :], Y[-1000:] X, Y = X[:-1000, :], Y[:-1000] #initialize the hidden layers: N, D = X.shape K = len(set(Y)) self.hidden_layers = [] # the size of the first dimension of the first matrix: M1 = D count = 0 # for the id of the weigts/biases for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 # update the first dimension size fir the next iteration count += 1 # for the last weight/bias matrix (vector): W, b = init_weight_and_bias(M1, K) self.W = theano.shared(W, 'W%s' % count) self.b = theano.shared(b, 'b%s' % count) # collect all the parameters we are going to use during Gradient Descent: self.parameters = [self.W, self.b] for h in self.hidden_layers[::-1]: self.parameters += h.params # in order to use Momentum, # we are to keep track of all the changes (dW's and db's): dparams = [ theano.shared(np.zeros_like(p.get_value(), dtype=np.float32)) for p in self.parameters ] # for RMSProp, # we are to keep track of caches (cache_W's and cache_b's) as well: caches = [ theano.shared(np.ones_like(p.get_value(), dtype=np.float32)) for p in self.parameters ] # define theano variables and functions: thX = T.matrix('X') thY = T.ivector('Y') # a vector of integers # since we do dropout, we drop the nodes only on training step, # when evaluating we just scale them; # so we need to define two expressions for the output and cost calculations: pY_train = self.forward_train(thX) pY_predict = self.forward_predict(thX) cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY])) cost_predict = -T.mean(T.log(pY_predict[T.arange(thY.shape[0]), thY])) prediction = self.predict(thX) # will do sort of T.argmax(pY, axis=1) cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost_predict, prediction]) # the updates for the train function: updates = [ (cache, decay * cache + (np.float32(1.0) - decay) * T.grad(cost, p)**2) for p, cache in zip(self.parameters, caches) ] + [(dp, mu * dp - learning_rate * T.grad(cost, p) / T.sqrt(cache + eps)) for dp, p, cache in zip(dparams, self.parameters, caches) ] + [(p, p + dp) for p, dp in zip(self.parameters, dparams)] #updates = rmsprop(cost, self.parameters, learning_rate, mu, decay, eps) train_op = theano.function(inputs=[thX, thY], updates=updates) # batch SGD: n_batches = N // batch_sz costs = [] for i in range(epochs): X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j + 1) * batch_sz, :] Ybatch = Y[j * batch_sz:(j + 1) * batch_sz] train_op(Xbatch, Ybatch) if j % 20 == 0: c, p = cost_predict_op(Xvalid, Yvalid) costs.append(c) e = error_rate(Yvalid, p) print('\ni: %d, j: %d, cost: %.6f, \nerror: %.6f' % (i, 
j, c, e)) if display_cost: plt.plot(costs) plt.show()
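A minimal sketch of the dropout mask described in the comment block above, built with `RandomStreams` so a fresh 0/1 mask is sampled on every call (the layer value `Z` and the keep probability are illustrative):

import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

srng = RandomStreams()
Z = T.matrix('Z')        # activations of some hidden layer (illustrative)
p_keep = 0.8             # illustrative keep probability

mask = srng.binomial(n=1, p=p_keep, size=Z.shape)   # new 0/1 mask each call
Z_train = Z * T.cast(mask, theano.config.floatX)    # training: drop nodes
Z_predict = Z * p_keep                              # prediction: scale instead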
mini_updates = []
micro_updates = []
last_upd = []
update = []

# shared variables
learning_rate = shared(float32(lr.init))
if use.mom:
    momentum = shared(float32(mom.momentum))
drop.p_vid = shared(float32(drop.p_vid_val))
drop.p_hidden = shared(float32(drop.p_hidden_val))

idx_mini = T.lscalar(name="idx_mini")    # minibatch index
idx_micro = T.lscalar(name="idx_micro")  # microbatch index

x = ndtensor(len(tr.in_shape))(name='x')  # video input
y = T.ivector(name='y')                   # labels
x_ = _shared(empty(tr.in_shape))
y_ = _shared(empty(tr.batch_size))
y_int32 = T.cast(y_, 'int32')

# in shape: #frames * gray/depth * body/hand * 4 maps
import cPickle
f = open(os.path.join(load_path, 'SK_normalization.pkl'), 'rb')
SK_normalization = cPickle.load(f)
Mean1 = SK_normalization['Mean1']
Std1 = SK_normalization['Std1']

f = open('CNN_normalization.pkl', 'rb')
CNN_normalization = cPickle.load(f)
Mean_CNN = CNN_normalization['Mean_CNN']
Std_CNN = CNN_normalization['Std_CNN']
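The `y_`/`y_int32` pair above follows the usual Theano idiom: store labels in a floatX shared variable (GPU-friendly) and expose an int32 cast to the graph. A minimal sketch with a toy array:

import numpy
import theano
import theano.tensor as T

# storage stays floatX so it can live on the GPU...
y_shared = theano.shared(numpy.zeros(8, dtype=theano.config.floatX), borrow=True)
# ...while the graph consumes int32 labels, matching T.ivector targets
y_int32 = T.cast(y_shared, 'int32')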
def __init__(self, number_samples): # set up weights and biases d = 1 # depth of x n = number_samples # init Wx = np.asarray( rng.uniform(low=-np.sqrt(2. / (d + n_hidden)), high=np.sqrt(2. / (d + n_hidden)), size=(d, n_hidden))) self.Wx = theano.shared(Wx, name='Wx', borrow=True) Wh = np.asarray( rng.uniform(low=-np.sqrt(2. / (d + n_hidden)), high=np.sqrt(2 / (d + n_hidden)), size=(n_hidden, n_hidden))) self.Wh = theano.shared(Wh, name='Wh', borrow=True) bh = np.zeros(n_hidden) self.bh = theano.shared(bh, name='bh', borrow=True) ho = np.zeros(n_hidden) self.ho = theano.shared(ho, name='ho', borrow=True) Wo = np.asarray( rng.uniform(low=-np.sqrt(2. / (n_hidden + n_out)), high=np.sqrt(2. / (n_hidden + n_out)), size=(n_hidden, n_out))) self.Wo = theano.shared(Wo, name='Wo', borrow=True) bo = np.zeros(n_out) self.bo = theano.shared(bo, name='bo', borrow=True) # values to adjust with back propagation self.parameters = [ self.Wx, self.Wh, self.bh, self.ho, self.Wo, self.bo ] # recurrence functions thX = T.fmatrix('x') thY = T.ivector('y') # feed forward equations def recurrence(x_t, h_t1): h_t = T.nnet.relu( T.dot(x_t, self.Wx) + T.dot(h_t1, self.Wh) + self.bh) y_t = T.nnet.softmax(T.dot(h_t, self.Wo) + self.bo) return h_t, y_t # loop over feed forward equations once for each bit in the sequence # send previous hidden output back through and collect prediction [h, y_predicted], _ = theano.scan( fn=recurrence, outputs_info=[self.ho, None], sequences=thX, n_steps=thX.shape[0], ) # probability of x given y py_x = y_predicted[:, 0, :] prediction = T.argmax(py_x, axis=1) # fetch most likely prediction # cost functions for gradients and tracking progress cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) # cross entropy gradients = T.grad(cost, self.parameters) # derivatives updates = [(p, p - learning_rate * g) for p, g in zip(self.parameters, gradients)] # training and prediction functions self.predict_op = theano.function(inputs=[thX], outputs=prediction) self.train_op = theano.function(inputs=[thX, thY], outputs=cost, updates=updates)
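A minimal sketch of the `theano.scan` recurrence used above, stripped down to a plain tanh RNN with toy sizes (Wx, Wh and the initial state are hypothetical shared parameters):

import numpy
import theano
import theano.tensor as T

X = T.matrix('X')   # one sequence, shape (time, features)
h0 = theano.shared(numpy.zeros(4, dtype=theano.config.floatX), name='h0')
Wx = theano.shared(numpy.random.randn(3, 4).astype(theano.config.floatX), name='Wx')
Wh = theano.shared(numpy.random.randn(4, 4).astype(theano.config.floatX), name='Wh')

def step(x_t, h_tm1):
    # one time step: the previous hidden state is fed back in
    return T.tanh(T.dot(x_t, Wx) + T.dot(h_tm1, Wh))

h_seq, _ = theano.scan(step, sequences=X, outputs_info=[h0])
hidden_states = theano.function([X], h_seq)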
def build_model(shared_params, options): trng = RandomStreams(1234) drop_ratio = options['drop_ratio'] batch_size = options['batch_size'] n_dim = options['n_dim'] w_emb = shared_params['w_emb'] dropout = theano.shared(numpy.float32(0.)) image_feat = T.ftensor3('image_feat') # batch_size x T input_idx = T.imatrix('input_idx') input_mask = T.matrix('input_mask') # label is the TRUE label label = T.ivector('label') empty_word = theano.shared(value=np.zeros((1, options['n_emb']), dtype='float32'), name='empty_word') w_emb_extend = T.concatenate([empty_word, shared_params['w_emb']], axis=0) input_emb = w_emb_extend[input_idx] # a trick here, set the maxpool_h/w to be large # maxpool_shape = (options['maxpool_h'], options['maxpool_w']) # turn those appending words into zeros # batch_size x T x n_emb input_emb = input_emb * input_mask[:, :, None] if options['sent_drop']: input_emb = dropout_layer(input_emb, dropout, trng, drop_ratio) if options['use_unigram_conv']: unigram_conv_feat = fflayer(shared_params, input_emb, options, prefix='conv_unigram', act_func=options.get('sent_conv_act', 'tanh')) unigram_pool_feat = unigram_conv_feat.max(axis=1) if options['use_bigram_conv']: idx = T.concatenate([T.arange(input_emb.shape[1])[:-1], T.arange(input_emb.shape[1])[1:]]).reshape((2, input_emb.shape[1] - 1)).transpose().flatten() bigram_emb = T.reshape(input_emb[:, idx, :], (input_emb.shape[0], input_emb.shape[1] - 1, 2 * input_emb.shape[2])) bigram_conv_feat = fflayer(shared_params, bigram_emb, options, prefix='conv_bigram', act_func=options.get('sent_conv_act', 'tanh')) bigram_pool_feat = bigram_conv_feat.max(axis=1) if options['use_trigram_conv']: idx = T.concatenate([T.arange(input_emb.shape[1])[:-2], T.arange(input_emb.shape[1])[1:-1], T.arange(input_emb.shape[1])[2:]]).reshape((3, input_emb.shape[1] - 2)).transpose().flatten() trigram_emb = T.reshape(input_emb[:, idx, :], (input_emb.shape[0], input_emb.shape[1] - 2, 3 * input_emb.shape[2])) trigram_conv_feat = fflayer(shared_params, trigram_emb, options, prefix='conv_trigram', act_func=options.get('sent_conv_act', 'tanh')) trigram_pool_feat = trigram_conv_feat.max(axis=1) # pool_feat = T.concatenate([unigram_pool_feat, bigram_pool_feat, trigram_pool_feat], axis=1) image_feat_down = fflayer(shared_params, image_feat, options, prefix='image_mlp', act_func=options.get('image_mlp_act', 'tanh')) if options.get('use_before_attention_drop', False): image_feat_down = dropout_layer(image_feat_down, dropout, trng, drop_ratio) pool_feat = dropout_layer(pool_feat, dropout, trng, drop_ratio) # attention model begins here # first layer attention model image_feat_attention_1 = fflayer(shared_params, image_feat_down, options, prefix='image_att_mlp_1', act_func=options.get('image_att_mlp_act', 'tanh')) pool_feat_attention_1 = fflayer(shared_params, pool_feat, options, prefix='sent_att_mlp_1', act_func=options.get('sent_att_mlp_act', 'tanh')) combined_feat_attention_1 = image_feat_attention_1 + \ pool_feat_attention_1[:, None, :] if options['use_attention_drop']: combined_feat_attention_1 = dropout_layer(combined_feat_attention_1, dropout, trng, drop_ratio) combined_feat_attention_1 = fflayer(shared_params, combined_feat_attention_1, options, prefix='combined_att_mlp_1', act_func=options.get( 'combined_att_mlp_act', 'tanh')) prob_attention_1 = T.nnet.softmax(combined_feat_attention_1[:, :, 0]) image_feat_ave_1 = (prob_attention_1[:, :, None] * image_feat_down).sum(axis=1) combined_hidden_1 = image_feat_ave_1 + pool_feat # second layer attention model 
image_feat_attention_2 = fflayer(shared_params, image_feat_down, options, prefix='image_att_mlp_2', act_func=options.get('image_att_mlp_act', 'tanh')) pool_feat_attention_2 = fflayer(shared_params, combined_hidden_1, options, prefix='sent_att_mlp_2', act_func=options.get('sent_att_mlp_act', 'tanh')) combined_feat_attention_2 = image_feat_attention_2 + \ pool_feat_attention_2[:, None, :] if options['use_attention_drop']: combined_feat_attention_2 = dropout_layer(combined_feat_attention_2, dropout, trng, drop_ratio) combined_feat_attention_2 = fflayer(shared_params, combined_feat_attention_2, options, prefix='combined_att_mlp_2', act_func=options.get( 'combined_att_mlp_act', 'tanh')) prob_attention_2 = T.nnet.softmax(combined_feat_attention_2[:, :, 0]) image_feat_ave_2 = (prob_attention_2[:, :, None] * image_feat_down).sum(axis=1) if options.get('use_final_image_feat_only', False): combined_hidden = image_feat_ave_2 + pool_feat else: combined_hidden = image_feat_ave_2 + combined_hidden_1 for i in range(options['combined_num_mlp']): if options.get('combined_mlp_drop_%d'%(i), False): combined_hidden = dropout_layer(combined_hidden, dropout, trng, drop_ratio) if i == options['combined_num_mlp'] - 1: combined_hidden = fflayer(shared_params, combined_hidden, options, prefix='combined_mlp_%d'%(i), act_func='linear') else: combined_hidden = fflayer(shared_params, combined_hidden, options, prefix='combined_mlp_%d'%(i), act_func=options.get('combined_mlp_act_%d'%(i), 'tanh')) # drop the image output prob = T.nnet.softmax(combined_hidden) prob_y = prob[T.arange(prob.shape[0]), label] pred_label = T.argmax(prob, axis=1) # sum or mean? cost = -T.mean(T.log(prob_y)) accu = T.mean(T.eq(pred_label, label)) # return image_feat, input_idx, input_mask, \ # label, dropout, cost, accu return image_feat, input_idx, input_mask, \ label, dropout, cost, accu, pred_label, \ prob_attention_1, prob_attention_2
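Both attention layers above reduce to the same pooling step: softmax the per-region scores, then take the weighted average of the region features. A minimal sketch with hypothetical inputs:

import theano
import theano.tensor as T

region_feat = T.ftensor3('region_feat')   # (batch, n_regions, n_dim)
scores = T.fmatrix('scores')              # (batch, n_regions), unnormalised

prob = T.nnet.softmax(scores)                              # attention weights
attended = (prob[:, :, None] * region_feat).sum(axis=1)    # (batch, n_dim)
attend = theano.function([region_feat, scores], attended)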
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=200, dataset='mnist.pkl.gz', batch_size=20, n_hidden=500): """ Demonstrate stochastic gradient descent optimization for a multilayer perceptron This is demonstrated on MNIST. :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient :type L1_reg: float :param L1_reg: L1-norm's weight when added to the cost (see regularization) :type L2_reg: float :param L2_reg: L2-norm's weight when added to the cost (see regularization) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: the path of the MNIST dataset file from http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz """ datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size ###################### # BUILD ACTUAL MODEL # ###################### print('... building the model') # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels rng = numpy.random.RandomState(1234) # construct the MLP class classifier = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=n_hidden, n_out=10) classifier.hiddenLayer.printWts() classifier.hiddenLayer2.printWts() # start-snippet-4 # the cost we minimize during training is the negative log likelihood of # the model plus the regularization terms (L1 and L2); cost is expressed # here symbolically cost = (classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr) # end-snippet-4 # compiling a Theano function that computes the mistakes that are made # by the model on a minibatch test_model = theano.function( inputs=[index], outputs=classifier.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( inputs=[index], outputs=classifier.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # start-snippet-5 # compute the gradient of cost with respect to theta (sorted in params) # the resulting gradients will be stored in a list gparams gparams = [T.grad(cost, param) for param in classifier.params] # specify how to update the parameters of the model as a list of # (variable, update expression) pairs # given two lists of the same length, A = [a1, a2, a3, a4] and # B = [b1, b2, b3, b4], zip generates a list C of same size, where each # element is a pair formed from the two lists : # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)] updates = [(param, param - learning_rate * gparam) for param, gparam in zip(classifier.params, gparams)] # compiling a Theano function `train_model` that returns the cost, but # in the same time updates the parameter of the model based on the rules # defined in `updates` train_model = theano.function( inputs=[index], outputs=cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: 
train_set_y[index * batch_size:(index + 1) * batch_size] }) # end-snippet-5 ############### # TRAIN MODEL # ############### print('... training') # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience // 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = timeit.default_timer() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in range(n_train_batches): minibatch_avg_cost = train_model(minibatch_index) # iteration number iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in range(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if (this_validation_loss < best_validation_loss * improvement_threshold): patience = max(patience, iter * patience_increase) best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i) for i in range(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = timeit.default_timer() print(('Optimization complete. Best validation score of %f %% ' 'obtained at iteration %i, with test performance %f %%') % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print( ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr) classifier.hiddenLayer.printWts() classifier.hiddenLayer2.printWts()
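The patience-based early stopping used in the loop above, isolated as a plain-Python helper (names mirror the loop variables; this is an illustration, not part of the original):

def update_patience(patience, iteration, this_loss, best_loss,
                    patience_increase=2, improvement_threshold=0.995):
    """Return the new patience, the new best loss and whether to stop."""
    if this_loss < best_loss:
        # only a sufficiently large relative improvement extends patience
        if this_loss < best_loss * improvement_threshold:
            patience = max(patience, iteration * patience_increase)
        best_loss = this_loss
    return patience, best_loss, patience <= iteration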
def __init__(self, num_emb, emb_dim, hidden_dim, output_dim, degree=2, dep_types=3, learning_rate=0.01, momentum=0.9, trainable_embeddings=True, labels_on_nonroot_nodes=False, eval_on_entities=True, num_entities=2): assert emb_dim > 1 and hidden_dim > 1 self.num_emb = num_emb self.emb_dim = emb_dim self.hidden_dim = hidden_dim self.output_dim = output_dim self.degree = degree self.learning_rate = learning_rate self.momentum = momentum self.num_entities = num_entities self.arc_type = dep_types self.params = [] self.embeddings = theano.shared( self.init_matrix([self.num_emb, self.emb_dim])) if trainable_embeddings: self.params.append(self.embeddings) self.x = T.ivector(name='x') # word indices if labels_on_nonroot_nodes: print 'matrix!!!' self.y = T.fmatrix( name='y') # output shape [None, self.output_dim] self.y_exists = T.fvector(name='y_exists') # shape [None] else: #print 'vector!!!' # Modifying this part too for the -log_prob loss print 'scalar!!!' self.y = T.iscalar(name='y') #self.y = T.fvector(name='y') # output shape [self.output_dim] self.num_words = self.x.shape[ 0] # total number of nodes (leaves + internal) in tree emb_x = self.embeddings[self.x] #emb_x = emb_x * T.neq(self.x, -1).dimshuffle(0, 'x') # zero-out non-existent embeddings if labels_on_nonroot_nodes: self.tree = T.imatrix(name='tree') # shape [None, self.degree] self.tree_states = self.compute_tree(emb_x, self.tree) self.output_fn = self.create_output_fn_multi() self.pred_y = self.output_fn(self.tree_states) self.loss = self.loss_fn_multi(self.y, self.pred_y, self.y_exists) elif eval_on_entities: #self.tree = T.tensor3(name='tree') self.tree = T.matrix(name='tree') self.tree_states = self.compute_tree(emb_x, self.tree) self.output_fn = self.create_entity_output_fn() self.entities = [ T.ivector(name='entt' + str(i)) for i in range(self.num_entities) ] self.entity_tv = T.sum(self.tree_states[self.entities[0]], axis=0) for enidx in self.entities[1:]: self.entity_tv = T.concatenate( [self.entity_tv, T.sum(self.tree_states[enidx], axis=0)]) self.pred_y = self.output_fn(self.entity_tv) self.loss = self.loss_fn(self.y, self.pred_y) else: self.tree = T.imatrix(name='tree') # shape [None, self.degree] self.tree_states = self.compute_tree(emb_x, self.tree) self.final_state = self.tree_states[-1] self.output_fn = self.create_output_fn() self.pred_y = self.output_fn(self.final_state) self.loss = self.loss_fn(self.y, self.pred_y) self.tree_states = None updates = self.gradient_descent(self.loss) grads = T.grad(self.loss, self.params) train_inputs = [self.x, self.tree, self.y] pred_inputs = [self.x, self.tree] if labels_on_nonroot_nodes: train_inputs.append(self.y_exists) if eval_on_entities: train_inputs.extend(self.entities) pred_inputs.extend(self.entities) print 'train_inputs:', train_inputs print 'pred_inputs:', pred_inputs self._train = theano.function(train_inputs, [self.loss, self.pred_y], updates=updates) #, #allow_input_downcast=True) self._predict = theano.function(pred_inputs, self.pred_y) #,
def __init__(self, n_dim, n_out, n_chan=1, n_superbatch=12800, opt_alg='adam', opt_params={ 'lr': 1e-3, 'b1': 0.9, 'b2': 0.99 }): self.numpy_rng = np.random.RandomState(1234) self.theano_rng = RandomStreams(self.numpy_rng.randint(2**30)) self.n_dim = n_dim self.n_out = n_out self.n_superbatch = n_superbatch self.alg = opt_alg self.n_class = 10 lr = opt_params.get('lr') n_batch = opt_params.get('nb') train_set_x = theano.shared( np.empty((n_superbatch, n_chan, n_dim, n_dim), dtype=theano.config.floatX), borrow=False, ) val_set_x = theano.shared( np.empty((n_superbatch, n_chan, n_dim, n_dim), dtype=theano.config.floatX), borrow=False, ) train_set_y = theano.shared( np.empty((n_superbatch, ), dtype=theano.config.floatX), borrow=False, ) val_set_y = theano.shared( np.empty((n_superbatch, ), dtype=theano.config.floatX), borrow=False, ) train_set_y_int = T.cast(train_set_y, 'int32') val_set_y_int = T.cast(val_set_y, 'int32') train_rbm_px_mu = theano.shared( np.empty((n_superbatch, self.n_aux), dtype=theano.config.floatX), borrow=False, ) X = T.tensor4(dtype=theano.config.floatX) S = T.tensor3(dtype=theano.config.floatX) Y = T.ivector() px_mu = T.lscalar(dtype=config.floatX) idx1, idx2 = T.lscalar(), T.lscalar() alpha = T.scalar(dtype=theano.config.floatX) # learning rate self.inputs = (X, Y, idx1, idx2, S, px_mu) # ---------------------------- # Begin RBM-only self.rbm_network = self.create_rbm_model(n_dim, n_out, n_chan) persistent_chain = theano.shared( np.zeros((n_batch, self.n_hidden), dtype=theano.config.floatX), borrow=True, ) rbm_cost, rbm_acc, rbm_updates = self.get_rbm_objective_and_updates( alpha, lr=lr, persistent=persistent_chain, ) self.rbm_objectives = (rbm_cost, rbm_acc) self.rbm_train = theano.function( [idx1, idx2, alpha], [rbm_cost, rbm_acc], updates=rbm_updates, givens={ X: train_set_x[idx1:idx2], Y: train_set_y_int[idx1:idx2] }, on_unused_input='warn', ) # End RBM-only # ---------------------------- # Begin DADGM-only tau = theano.shared( np.float32(5.0), name='temperature', allow_downcast=True, borrow=False, ) self.tau = tau self.dadgm_network = self.create_dadgm_model( X, Y, n_dim, n_out, n_chan, ) dadgm_loss, dadgm_acc = self.create_dadgm_objectives(False) self.dadgm_objectives = (dadgm_loss, dadgm_acc) dadgm_params = self.get_dadgm_params() dadgm_grads = self.create_dadgm_gradients(dadgm_loss, False) dadgm_updates = self.create_dadgm_updates( dadgm_grads, dadgm_params, alpha, opt_alg, opt_params, ) self.dadgm_train = theano.function( [idx1, idx2, alpha], [dadgm_loss, dadgm_acc], updates=dadgm_updates, givens={ X: train_set_x[idx1:idx2], Y: train_set_y_int[idx1:idx2], px_mu: train_rbm_px_mu, }, on_unused_input='warn', ) self.dadgm_loss = theano.function( [X, Y], [dadgm_loss, dadgm_acc], on_unused_input='warn', ) # End DADGM-only # ---------------------------- self.n_batch = n_batch # parameters for sampling self.n_chain = 100 # save data variables self.train_set_x = train_set_x self.train_set_y = train_set_y self.val_set_x = val_set_x self.val_set_y = val_set_y self.train_rbm_px_mu = train_rbm_px_mu self.data_loaded = False
from lasagne.layers import InputLayer, DenseLayer
import lasagne
from lasagne.updates import sgd, total_norm_constraint
import theano.tensor as T

x = T.matrix()
y = T.ivector()
l_in = InputLayer((5, 10))
l1 = DenseLayer(l_in, num_units=7, nonlinearity=T.nnet.softmax)
output = lasagne.layers.get_output(l1, x)
cost = T.mean(T.nnet.categorical_crossentropy(output, y))
all_params = lasagne.layers.get_all_params(l1)
all_grads = T.grad(cost, all_params)
# rescale the gradient list jointly so its total norm does not exceed 5
scaled_grads = total_norm_constraint(all_grads, 5)
updates = sgd(scaled_grads, all_params, learning_rate=0.1)
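A possible completion of the snippet above (not part of the original): compiling the resulting training function and calling it on toy data shaped to match the (5, 10) InputLayer.

import numpy as np
import theano

train_fn = theano.function([x, y], cost, updates=updates)
toy_x = np.random.randn(5, 10).astype(theano.config.floatX)   # toy inputs
toy_y = np.random.randint(0, 7, size=5).astype('int32')       # toy targets
print(train_fn(toy_x, toy_y))  # one SGD step with norm-clipped gradients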
def build_and_train(self, X_train, y_train, X_val=None, y_val=None, display=False, save_model=True, aug_params=None): """ Builds the model and runs the training loop. Parameters ---------- X_train : numpy array Training data y_train : numpy array Training targets. X_val : numpy array, None, optional Validation data y_val : numpy array, None, optional Validation targets Display : bool, optional Display on-fly plots of training and validation results. Save_model : bool, optional Save model weights. aug_params : dict, None, optional Dict containing the data augmentation parameters. Returns ------- Test function of the net. """ # ====================================================================== # Model compilation # ====================================================================== print("Building model and compiling functions...") # Create Theano variables for input and target minibatch input_var = T.tensor4( 'X', dtype=theano.config.floatX) # shape (batchsize,3,224,224) target_var = T.ivector('y') # shape (batchsize,) # Load model weights and metadata d = pickle.load( open( os.path.join(homedir, 'data', 'pretrained_weights', 'resnet50.pkl'))) # Build the network and fill with pretrained weights except for the last fc layer net = build_model(input_var, self.output_dim) lasagne.layers.set_all_param_values(net['pool5'], d['values'][:-2]) # create loss function and accuracy prediction = lasagne.layers.get_output(net['prob']) loss = lasagne.objectives.categorical_crossentropy( prediction, target_var) loss = loss.mean( ) + self.reg * lasagne.regularization.regularize_network_params( net['prob'], lasagne.regularization.l2) train_acc = T.mean(T.eq(T.argmax(prediction, axis=1), target_var), dtype=theano.config.floatX) # Create parameter update expressions with fine tuning updates = {} for name, layer in net.items(): layer_params = layer.get_params(trainable=True) if name == 'fc1000' or name == 'prob': layer_lr = self.lr else: layer_lr = self.lr * self.finetuning layer_updates = lasagne.updates.adam(loss, layer_params, learning_rate=layer_lr) updates.update(layer_updates) updates = collections.OrderedDict(updates) # Create a loss expression for validation/testing. 
test_prediction = lasagne.layers.get_output(net['prob'], deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy( test_prediction, target_var) test_loss = test_loss.mean() test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX) # Compile training and validation functions train_fn = theano.function([input_var, target_var], [loss, train_acc], updates=updates) val_fn = theano.function([input_var, target_var], [test_loss, test_acc]) test_fn = theano.function([input_var], test_prediction) # ====================================================================== # Training routine # ====================================================================== print("Starting training...") track = { 'train_err': [], 'train_acc': [], 'val_err': [], 'val_acc': [] } if display: fig, (ax1, ax2) = plt.subplots(1, 2) line1, = ax1.plot([], [], 'r-') line2, = ax2.plot([], [], 'r-') ax1.set_xlabel('Epochs') ax1.set_ylabel('Training loss') ax1.set_yscale('log') ax1.set_title('Training loss') ax2.set_xlabel('Epochs') ax2.set_ylabel('Validation loss') ax2.set_yscale('log') ax2.set_title('Validation loss') # Batchsize and augmentation parameters if aug_params is None: aug_params = {} train_batchsize = min(len(y_train), self.batchsize) train_aug_params = aug_params.copy() train_aug_params.update({'mode': 'standard'}) if X_val is not None: val_batchsize = min(len(y_val), self.batchsize) val_aug_params = aug_params.copy() val_aug_params.update({'mode': 'minimal', 'tags': None}) for epoch in range(self.num_epochs): start_time = time.time() # Learning rate schedule decay if epoch in self.lr_decay_schedule: self.lr.set_value(self.lr.get_value() * self.lr_decay) print('############# Leaning rate: {} ####################' ).format(self.lr.get_value()) # Full pass over training data train_err, train_batches = 0, 0 for batch in iterate_minibatches(X_train, y_train, train_batchsize, shuffle=True, **train_aug_params): inputs, targets = batch[0], batch[1] tmp_train_err, tmp_train_acc = train_fn(inputs, targets) track['train_err'].append(tmp_train_err) track['train_acc'].append(tmp_train_acc) train_err += tmp_train_err train_batches += 1 print 'Training epoch {} - {:.1f}% completed | Loss: {:.4f} ; Accuracy: {:.1f}%'.format( epoch, train_batches * self.batchsize * 100. / len(y_train), float(tmp_train_err), float(tmp_train_acc) * 100) if np.isnan(train_err): print( 'Your net exploded, try decreasing the learning rate.') return None # Full pass over the validation data (if any) if X_val is not None: val_err, val_batches = 0, 0 for batch in iterate_minibatches(X_val, y_val, val_batchsize, shuffle=False, **val_aug_params): inputs, targets = batch[0], batch[1] tmp_val_err, tmp_val_acc = val_fn(inputs, targets) track['val_err'].append(tmp_val_err) track['val_acc'].append(tmp_val_acc) val_err += tmp_val_err val_batches += 1 print 'Validation epoch {} - {:.1f}% completed | Loss: {:.4f} ; Accuracy: {:.1f}%'.format( epoch, val_batches * self.batchsize * 100. 
/ len(y_val), float(tmp_val_err), float(tmp_val_acc) * 100) # Print the results for this epoch print("Epoch {} of {} took {:.3f}s".format( epoch + 1, self.num_epochs, time.time() - start_time)) print(" training loss:\t\t{:.6f}".format(train_err / train_batches)) if X_val is not None: print(" validation loss:\t\t{:.6f}".format(val_err / val_batches)) # Display training and validation accuracy in plot if display: line1.set_xdata(np.append(line1.get_xdata(), epoch)) line1.set_ydata( np.append(line1.get_ydata(), train_err / train_batches)) ax1.relim(), ax1.autoscale_view() if X_val is not None: line2.set_xdata(np.append(line2.get_xdata(), epoch)) line2.set_ydata( np.append(line2.get_ydata(), val_err / val_batches)) ax2.relim(), ax2.autoscale_view() fig.canvas.draw() # Save training information and net parameters print("Saving the model parameters and training information ...") train_info = { 'training_params': { 'output_dim': self.output_dim, 'lr_init': self.lr_init, 'lr_decay': float(self.lr_decay), 'lr_schedule': self.lr_decay_schedule.tolist(), 'reg': self.reg, 'num_epochs': self.num_epochs, 'batchsize': self.batchsize, 'finetuning': self.finetuning } } a = inspect.getargspec(data_augmentation) augmentation_params = dict( zip(a.args[-len(a.defaults):], a.defaults)) # default augmentation params augmentation_params.update(aug_params) # update with user's choice for k, v in augmentation_params.items(): if type(v) == np.ndarray: augmentation_params[k] = np.array(v).tolist() train_info.update({'augmentation_params': augmentation_params}) for k, v in track.items(): track[k] = np.array(v).tolist() train_info.update(track) if save_model: filename = 'resnet50_' + str(self.output_dim) + 'classes_' + str( self.num_epochs) + 'epochs' with open( os.path.join(homedir, 'conus_classification', 'training_info', filename + '.json'), 'w') as outfile: json.dump(train_info, outfile) np.savez( os.path.join(homedir, 'conus_classification', 'training_weights', filename + '.npz'), *lasagne.layers.get_all_param_values(net['prob'])) return test_fn
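The per-layer learning-rate logic above (the full rate for the new fc1000/prob layers, a scaled-down rate for the pretrained ones) can be factored into a small helper; a sketch assuming a `net` dict of Lasagne layers and a symbolic `loss`:

import collections
import lasagne

def finetune_updates(loss, net, base_lr, finetune_factor,
                     new_layers=('fc1000', 'prob')):
    """One Adam update dict per layer, with a reduced rate for pretrained layers."""
    updates = collections.OrderedDict()
    for name, layer in net.items():
        params = layer.get_params(trainable=True)
        if not params:
            continue
        layer_lr = base_lr if name in new_layers else base_lr * finetune_factor
        updates.update(lasagne.updates.adam(loss, params, learning_rate=layer_lr))
    return updates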
def __init__(self, *args, **kwargs):
    self.k = TT.ivector('k')
    super(Classifier, self).__init__(*args, **kwargs)
    self.y = self.softmax(self.y)
def single_layer_lstm(n_in, n_out): Wxb = theano.shared(np.random.randn(n_in, n_out), ) Whb = theano.shared(np.random.randn(n_out, n_out), ) bb = theano.shared(np.random.randn(n_out)) Wxi = theano.shared(np.random.randn(n_in, n_out), ) Whi = theano.shared(np.random.randn(n_out, n_out), ) bi = theano.shared(np.random.randn(n_out)) Wxf = theano.shared(np.random.randn(n_in, n_out), ) Whf = theano.shared(np.random.randn(n_out, n_out), ) bf = theano.shared(np.random.randn(n_out)) Wxo = theano.shared(np.random.randn(n_in, n_out), ) Who = theano.shared(np.random.randn(n_out, n_out), ) bo = theano.shared(np.random.randn(n_out)) Wo = theano.shared(np.random.randn(n_out, n_out)) bout = theano.shared(np.random.randn(n_out)) params = [Wxb, Whb, bb, Wxi, Whi, bi, Wxf, Whf, bf, Wxo, Who, bo, Wo, bout] def step(x,htm1,ctm1,Wxb,Whb,bb,\ Wxi,Whi,bi,\ Wxf,Whf,bf,\ Wxo,Who,bo,Wo,bout): z = T.tanh(T.dot(x, Wxb) + T.dot(htm1, Whb) + bb) i = T.nnet.sigmoid(T.dot(x, Wxi) + T.dot(htm1, Whi) + bi) f = T.nnet.sigmoid(T.dot(x, Wxf) + T.dot(htm1, Whf) + bf) c = i * z + f * ctm1 o = T.nnet.sigmoid(T.dot(x, Wxo) + T.dot(htm1, Who) + bo) h = o * T.tanh(c) y = T.dot(h, Wo) + bout return [h, c, y] X = T.matrix() h0 = T.vector() c0 = T.vector() yt = T.ivector() lr = T.scalar() mom = T.scalar() [h, c, y], _ = theano.scan(step, sequences=X, outputs_info=[h0, c0, None], non_sequences=[ Wxb, Whb, bb, Wxi, Whi, bi, Wxf, Whf, bf, Wxo, Who, bo, Wo, bout ]) yout = T.nnet.softmax(y) L2 = T.scalar() L2 = 0 for param in params: L2 += (param**2).sum() L2 = 0.001 * L2 def loss(y_pred, y_true): return -T.mean(T.log(y_pred)[T.arange(y_true.shape[0]), y_true]) #oloss = loss(yout,yt) #cost = theano.function( [X,h0,c0,yt], oloss ) funch = theano.function([X, h0, c0], c) funcy = theano.function([X, h0, c0], y) oloss = loss(yout, yt) + L2 cost = loss(yout, yt) gparams = [] for param in params: gparams.append(T.grad(oloss, param)) # zip just concatenate two lists updates_t = {} for param in params: updates_t[param] = theano.shared(value=np.zeros( param.get_value(borrow=True).shape, dtype=theano.config.floatX), name='updates') updates = {} for param, gparam in zip(params, gparams): weight_update = updates_t[param] upd = mom * weight_update - lr * gparam updates[weight_update] = upd updates[param] = param + upd """ for param, gparam in zip(params, gparams): #mparam = theano.shared(param.get_value()*0.) upd = -lr*gparam# + mom*mparam# - 0.01*param# + #updates[mparam] = upd updates[param] = param + upd """ """ weight_update = updates[param] upd = -lr * gparam - 0.01*param updates[weight_update] = upd updates[param] = param + upd """ #gWxo = T.grad(oloss,Wxo) #fgradwxo = theano.function( [X,h0,c0,yt], gWxo ) trainer = theano.function([X, h0, c0, yt, lr, mom], [cost], updates=updates) return funcy, trainer
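A minimal sketch of the classical-momentum update pair built in the loop above: one extra shared "velocity" per parameter, computed first and then applied to the parameter (toy parameter and cost):

import numpy
import theano
import theano.tensor as T

lr = T.scalar('lr')
mom = T.scalar('mom')
param = theano.shared(numpy.ones(3, dtype=theano.config.floatX), name='param')
velocity = theano.shared(numpy.zeros(3, dtype=theano.config.floatX), name='velocity')

cost = T.sum(param ** 2)        # toy cost, stands in for the LSTM loss
grad = T.grad(cost, param)

step = mom * velocity - lr * grad            # uses the previous velocity
updates = [(velocity, step), (param, param + step)]
momentum_step = theano.function([lr, mom], cost, updates=updates)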
def trainConvNet(data_xy, inp_dim =10, n_epochs = 3, nkerns=[5, 10], batch_size=500, learning_rate=0.1): with open("metrics.txt", "a") as f: f.write("**********\n") f.write("Learning rate: {0}\n".format(learning_rate)) train_x, train_y, test_x, test_y, valid_x, valid_y = data_xy n_train_batches = train_x.get_value(borrow=True).shape[0] / batch_size n_valid_batches = valid_x.get_value(borrow=True).shape[0] / batch_size n_test_batches = test_x.get_value(borrow=True).shape[0] / batch_size print '...building the model' kern0_dim = 3 kern1_dim = 2 pool0_dim = 2 pool1_dim = 1 if inp_dim==20: kern0_dim = 3 kern1_dim = 2 pool0_dim = 2 pool1_dim = 1 if inp_dim==24: kern0_dim = 5 kern1_dim = 3 pool0_dim = 2 pool1_dim = 1 if inp_dim==30: kern0_dim = 7 kern1_dim = 5 pool0_dim = 2 pool1_dim = 1 index = T.lscalar() x = T.tensor4('x') y = T.ivector('y') rng = numpy.random.RandomState(23455) layer0_input = x.reshape((batch_size, THREE, inp_dim, inp_dim)) layer0 = LeNetConvPoolLayer( rng, input = layer0_input, image_shape=(batch_size, THREE, inp_dim, inp_dim), filter_shape=(nkerns[0], 3, kern0_dim, kern0_dim), poolsize=(pool0_dim, pool0_dim) ) inp1_dim = (inp_dim-kern0_dim+1)/pool0_dim layer1 = LeNetConvPoolLayer( rng, input = layer0.output, image_shape=(batch_size, nkerns[0], inp1_dim, inp1_dim), filter_shape=(nkerns[1], nkerns[0], kern1_dim, kern1_dim), poolsize=(pool1_dim, pool1_dim) ) layer2_input = layer1.output.flatten(2) inp2_dim = (inp1_dim-kern1_dim+1)/pool1_dim layer2 = HiddenLayer( rng, input=layer2_input, n_in=nkerns[1]*inp2_dim*inp2_dim, n_out=300, activation=T.tanh ) layer3 = LogisticRegression(input=layer2.output, n_in=300, n_out=10) cost = layer3.negative_log_likelihood(y) test_model = theano.function([index], layer3.errors(y), givens={ x: test_x[index*batch_size: (index+1)*batch_size], y: test_y[index*batch_size: (index+1)*batch_size] }) validate_model = theano.function([index], layer3.errors(y), givens={ x: valid_x[index*batch_size: (index+1)*batch_size], y: valid_y[index*batch_size: (index+1)*batch_size] }) params = layer3.params + layer2.params + layer1.params + layer0.params grads = T.grad(cost, params) updates = [ (param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads) ] train_model = theano.function([index], cost, updates=updates, givens={ x: train_x[index*batch_size: (index+1)*batch_size], y: train_y[index*batch_size: (index+1)*batch_size] }) print 'training... ' patience = 10000 patience_increase = 2 improvement_threshold = 0.995 validation_frequency = min(n_train_batches, patience / 2) best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = timeit.default_timer() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print 'training @ iter = ', iter cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0: validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%\n' %(epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) f.write("Epoch: {0}\n".format(epoch)) f.write("Validation loss: {0}\n".format(this_validation_loss*100)) f.write("Cost: {0}\n".format(cost_ij)) if this_validation_loss < best_validation_loss: if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) best_validation_loss = this_validation_loss best_iter = iter test_losses = [ test_model(i) for i in xrange(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') %(epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience<=iter: done_looping=True break end_time = timeit.default_timer() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i, ' 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) print ('saving params for patch width: %i...' %(inp_dim)) save_file = open('param'+str(inp_dim)+'.pkl', 'wb') W0 = layer0.params[0]; b0 = layer0.params[1] W1 = layer1.params[0]; b1 = layer1.params[1] cPickle.dump(W0.get_value(borrow=True), save_file, -1) cPickle.dump(b0.get_value(borrow=True), save_file, -1) cPickle.dump(W1.get_value(borrow=True), save_file, -1) cPickle.dump(b1.get_value(borrow=True), save_file, -1) save_file.close()
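For completeness, a small sketch of reading back the parameters pickled at the end of the function above (the file holds W0, b0, W1, b1 in that dump order; the helper name is illustrative):

import cPickle

def load_conv_params(path):
    """Read back the four arrays in the order they were dumped."""
    with open(path, 'rb') as f:
        W0 = cPickle.load(f)
        b0 = cPickle.load(f)
        W1 = cPickle.load(f)
        b1 = cPickle.load(f)
    return W0, b0, W1, b1

# e.g. W0, b0, W1, b1 = load_conv_params('param10.pkl')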
import theano
from confusionmatrix import ConfusionMatrix
from lasagne.objectives import *
from lasagne.updates import *
import theano.tensor as T
from theano.tensor import *
from theano.tensor.signal import downsample
import lasagne
import numpy as np
import try_DP as DP
from theano.tensor import nnet
import lasagne.layers.dnn

dtensor5 = TensorType('float32', (False,) * 5)

input_var = T.ftensor4('XY')
target_var = T.ivector('Y_train')
x1 = T.matrix('x1')
PS = 29

# Build Neural Network:
# Conv Net XY Plane
input = lasagne.layers.InputLayer((None, 1, PS, PS), input_var=input_var)
l_conv_1 = lasagne.layers.dnn.Conv2DDNNLayer(input, 20, (9, 9))
l_maxpool_1 = lasagne.layers.dnn.Pool2DDNNLayer(l_conv_1, (3, 3))
l_conv_2 = lasagne.layers.dnn.Conv2DDNNLayer(l_maxpool_1, 20, (5, 5))
l_conv_3 = lasagne.layers.dnn.Conv2DDNNLayer(l_conv_2, 20, (3, 3))
def train_rep(learning_rate=0.002, L1_reg=0.0002, L2_reg=0.005, n_epochs=200, nkerns=[20, 50], batch_size=25): rng = numpy.random.RandomState(23455) train_dir = '../out/h5/' valid_dir = '../out/h5/' weights_dir = './weights/' print '... load input data' filename = train_dir + 'rep_train_data_1.gzip.h5' datasets = load_initial_data(filename) train_set_x, train_set_y, shared_train_set_y = datasets filename = valid_dir + 'rep_valid_data_1.gzip.h5' datasets = load_initial_data(filename) valid_set_x, valid_set_y, shared_valid_set_y = datasets mydatasets = load_initial_test_data() test_set_x, test_set_y, shared_test_set_y, valid_ds = mydatasets # compute number of minibatches for training, validation and testing n_all_train_batches = 30000 n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_all_train_batches /= batch_size n_train_batches /= batch_size n_valid_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels # image size layer0_w = 50 layer0_h = 50 layer1_w = (layer0_w - 4) / 2 layer1_h = (layer0_h - 4) / 2 layer2_w = (layer1_w - 2) / 2 layer2_h = (layer1_h - 2) / 2 layer3_w = (layer2_w - 2) / 2 layer3_h = (layer2_h - 2) / 2 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # image sizes batchsize = batch_size in_channels = 20 in_width = 50 in_height = 50 #filter sizes flt_channels = 40 flt_time = 20 flt_width = 5 flt_height = 5 signals_shape = (batchsize, in_channels, in_height, in_width) filters_shape = (flt_channels, in_channels, flt_height, flt_width) layer0_input = x.reshape(signals_shape) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=signals_shape, filter_shape=filters_shape, poolsize=(2, 2)) # TODO: incase of flt_time < in_time the output dimension will be different layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, flt_channels, layer1_w, layer1_h), filter_shape=(60, flt_channels, 3, 3), poolsize=(2, 2)) layer2 = LeNetConvPoolLayer(rng, input=layer1.output, image_shape=(batch_size, 60, layer2_w, layer2_h), filter_shape=(90, 60, 3, 3), poolsize=(2, 2)) layer3_input = layer2.output.flatten(2) layer3 = HiddenLayer(rng, input=layer3_input, n_in=90 * layer3_w * layer3_h, n_out=500, activation=T.tanh) layer4 = LogisticRegression(input=layer3.output, n_in=500, n_out=8) classify = theano.function( [index], outputs=layer4.get_output_labels(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( [index], layer4.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # create a list of all model parameters to be fit by gradient descent params = layer4.params + layer3.params + layer2.params + layer1.params + layer0.params # symbolic Theano variable that represents the L1 regularization term L1 = T.sum(abs(layer4.params[0])) + T.sum(abs(layer3.params[0])) + T.sum( abs(layer2.params[0])) + T.sum(abs(layer1.params[0])) + T.sum( abs(layer0.params[0])) # symbolic Theano variable that represents the squared L2 term L2_sqr = T.sum(layer4.params[0]**2) + T.sum(layer3.params[0]**2) + T.sum( layer2.params[0]**2) + T.sum(layer1.params[0]**2) + 
T.sum( layer0.params[0]**2) # the loss cost = layer4.negative_log_likelihood(y) + L1_reg * L1 + L2_reg * L2_sqr # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i in zip(params, grads): updates.append((param_i, param_i - learning_rate * grad_i)) train_model = theano.function( [index], cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) ############### # TRAIN MODEL # ############### print '... training' start_time = time.clock() epoch = 0 done_looping = False cost_ij = 0 train_files_num = 600 val_files_num = 100 startc = time.clock() while (epoch < n_epochs) and (not done_looping): endc = time.clock() print('epoch %i, took %.2f minutes' % \ (epoch, (endc - startc) / 60.)) startc = time.clock() epoch = epoch + 1 for nTrainSet in xrange(1, train_files_num + 1): # load next train data if nTrainSet % 50 == 0: print 'training @ nTrainSet = ', nTrainSet, ', cost = ', cost_ij filename = train_dir + 'rep_train_data_' + str( nTrainSet) + '.gzip.h5' datasets = load_next_data(filename) ns_train_set_x, ns_train_set_y = datasets train_set_x.set_value(ns_train_set_x, borrow=True) shared_train_set_y.set_value(numpy.asarray( ns_train_set_y, dtype=theano.config.floatX), borrow=True) n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size # train for minibatch_index in xrange(n_train_batches): # training itself # -------------------------------------- cost_ij = train_model(minibatch_index) # ------------------------- # at the end of each epoch run validation this_validation_loss = 0 for nValSet in xrange(1, val_files_num + 1): filename = valid_dir + 'rep_valid_data_' + str( nValSet) + '.gzip.h5' datasets = load_next_data(filename) ns_valid_set_x, ns_valid_set_y = datasets valid_set_x.set_value(ns_valid_set_x, borrow=True) shared_valid_set_y.set_value(numpy.asarray( ns_valid_set_y, dtype=theano.config.floatX), borrow=True) n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_valid_batches /= batch_size # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in xrange(n_valid_batches) ] this_validation_loss += numpy.mean(validation_losses) this_validation_loss /= (val_files_num) print('epoch %i, minibatch %i/%i, validation error %f %%' % \ (epoch, minibatch_index + 1, n_train_batches, \ this_validation_loss * 100.)) # save snapshots print 'saving weights state, epoch = ', epoch f = file(weights_dir + 'weights_epoch' + str(epoch) + '.save', 'wb') state_L0 = layer0.__getstate__() cPickle.dump(state_L0, f, protocol=cPickle.HIGHEST_PROTOCOL) state_L1 = layer1.__getstate__() cPickle.dump(state_L1, f, protocol=cPickle.HIGHEST_PROTOCOL) state_L2 = layer2.__getstate__() cPickle.dump(state_L2, f, protocol=cPickle.HIGHEST_PROTOCOL) state_L3 = layer3.__getstate__() cPickle.dump(state_L3, f, protocol=cPickle.HIGHEST_PROTOCOL) state_L4 = layer4.__getstate__() cPickle.dump(state_L4, f, protocol=cPickle.HIGHEST_PROTOCOL) f.close() end_time = time.clock() print('Optimization complete.') print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
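The chunk-by-chunk training above hinges on refreshing one shared variable with `set_value`, so every compiled function reading it through `givens` sees the new file's data; a minimal sketch with toy shapes:

import numpy
import theano

# one shared buffer, sized like a single data chunk (toy shape)
train_set_x = theano.shared(
    numpy.zeros((25, 20 * 50 * 50), dtype=theano.config.floatX), borrow=True)

def swap_in_chunk(new_array):
    # compiled functions that read train_set_x via `givens` now see this data
    train_set_x.set_value(new_array.astype(theano.config.floatX), borrow=True)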
def __init__(self, d_v, d_e, d_t, optimizer, optimizer_args, np_rng, th_rng, n_classes=0, encoder_layers=1, generator_layers=0, generator_transform=None, use_interactions=False, clip_gradients=False, init_bias=None, train_bias=False, scale=6.0, encode_labels=False, l1_inter_factor=1.0, time_penalty=False, encoder_shortcut=False, generator_shortcut=False): self.d_v = d_v # vocabulary size self.d_e = d_e # dimensionality of encoder self.d_t = d_t # number of topics self.n_classes = n_classes # number of classes assert encoder_layers == 1 or encoder_layers == 2 self.n_encoder_layers = encoder_layers assert generator_layers == 0 or generator_layers == 1 or generator_layers == 2 or generator_layers == 4 self.n_generator_layers = generator_layers # set various options self.generator_transform = generator_transform # transform to apply after the generator self.use_interactions = use_interactions # use interactions between topics and labels self.encode_labels = encode_labels # feed labels into the encoder self.l1_inter_factor = l1_inter_factor # factor by which to multiply L1 penalty on interactions self.encoder_shortcut = encoder_shortcut self.generator_shortcut = generator_shortcut # create parameter matrices and biases self.W_encoder_1 = common_theano.init_param('W_encoder_1', (d_e, d_v), np_rng, scale=scale) self.b_encoder_1 = common_theano.init_param('b_encoder_1', (d_e, ), np_rng, scale=0.0) if n_classes > 1: self.W_encoder_label = common_theano.init_param('W_encoder_label', (d_e, n_classes), np_rng, scale=scale) else: self.W_encoder_label = common_theano.init_param( 'W_encoder_label', (d_e, n_classes), np_rng, values=np.zeros((d_e, n_classes), dtype=np.float32)) self.W_encoder_2 = common_theano.init_param('W_encoder_2', (d_e, d_e), np_rng, scale=scale) self.b_encoder_2 = common_theano.init_param('b_encoder_2', (d_e, ), np_rng, scale=0.0) self.W_encoder_shortcut = common_theano.init_param( 'W_encoder_shortcut', (d_e, d_v), np_rng, scale=scale) self.W_mu = common_theano.init_param('W_mu', (d_t, d_e), np_rng, scale=scale) self.b_mu = common_theano.init_param('b_mu', (d_t, ), np_rng, scale=0.0) self.W_sigma = common_theano.init_param('W_sigma', (d_t, d_e), np_rng, scale=scale, values=np.zeros((d_t, d_e))) self.b_sigma = common_theano.init_param('b_sigma', (d_t, ), np_rng, scale=0.0, values=np.array([-4] * d_t)) self.W_generator_1 = common_theano.init_param('W_generator_1', (d_t, d_t), np_rng, scale=scale) self.b_generator_1 = common_theano.init_param('b_generator_1', (d_t, ), np_rng, scale=0.0) self.W_generator_2 = common_theano.init_param('W_generator_2', (d_t, d_t), np_rng, scale=scale) self.b_generator_2 = common_theano.init_param('b_generator_2', (d_t, ), np_rng, scale=0.0) self.W_generator_3 = common_theano.init_param('W_generator_3', (d_t, d_t), np_rng, scale=scale) self.b_generator_3 = common_theano.init_param('b_generator_3', (d_t, ), np_rng, scale=0.0) self.W_generator_4 = common_theano.init_param('W_generator_4', (d_t, d_t), np_rng, scale=scale) self.b_generator_4 = common_theano.init_param('b_generator_4', (d_t, ), np_rng, scale=0.0) self.W_decoder = common_theano.init_param('W_decoder', (d_v, d_t), np_rng, scale=scale) self.b_decoder = common_theano.init_param('b_decoder', (d_v, ), np_rng, scale=0.0) self.W_decoder_label = common_theano.init_param('W_decoder_label', (d_v, n_classes), np_rng, scale=scale) self.W_decoder_inter = common_theano.init_param('W_decoder_inter', (d_v, d_t * n_classes), np_rng, scale=scale) # set the decoder bias to the background frequency if init_bias is 
not None: self.b_decoder = common_theano.init_param('b_decoder', (d_v, ), np_rng, values=init_bias) # create basic sets of parameters which we will use to tell the model what to update self.params = [ self.W_encoder_1, self.b_encoder_1, self.W_mu, self.b_mu, self.W_sigma, self.b_sigma, self.W_decoder ] self.param_shapes = [(d_e, d_v), (d_e, ), (d_t, d_e), (d_t, ), (d_t, d_e), (d_t, ), (d_v, d_t)] self.encoder_params = [ self.W_encoder_1, self.b_encoder_1, self.W_mu, self.b_mu, self.W_sigma, self.b_sigma ] self.encoder_param_shapes = [(d_e, d_v), (d_e, ), (d_t, d_e), (d_t, ), (d_t, d_e), (d_t, )] self.generator_params = [] self.generator_param_shapes = [] # add additional parameters to sets, depending on configuration if train_bias: self.params.append(self.b_decoder) self.param_shapes.append((d_v, )) self.decoder_params = [self.W_decoder, self.b_decoder] self.decoder_param_shapes = [(d_v, d_t), (d_v, )] else: self.decoder_params = [self.W_decoder] self.decoder_param_shapes = [(d_v, d_t)] # add parameters for labels (covariates) if self.n_classes > 1: self.params.append(self.W_decoder_label) self.param_shapes.append((d_v, n_classes)) self.decoder_params.extend([self.W_decoder_label]) self.decoder_param_shapes.extend([(d_v, n_classes)]) if use_interactions: self.params.append(self.W_decoder_inter) self.param_shapes.append((d_v, d_t * n_classes)) self.decoder_params.extend([self.W_decoder_inter]) self.decoder_param_shapes.extend([(d_v, d_t * n_classes)]) if encode_labels: self.params.append(self.W_encoder_label) self.param_shapes.append((d_e, n_classes)) self.encoder_params.extend([self.W_encoder_label]) self.encoder_param_shapes.extend([(d_e, n_classes)]) self.label_only_params = [self.W_decoder_label] self.label_only_param_shapes = [(d_v, n_classes)] # add encoder parameters depending on number of layers if self.n_encoder_layers > 1: self.params.extend([self.W_encoder_2, self.b_encoder_2]) self.param_shapes.extend([(d_e, d_e), (d_e, )]) self.encoder_params.extend([self.W_encoder_2, self.b_encoder_2]) self.encoder_param_shapes.extend([(d_e, d_e), (d_e, )]) if self.encoder_shortcut: self.params.extend([self.W_encoder_shortcut]) self.param_shapes.extend([(d_e, d_v)]) self.encoder_params.extend([self.W_encoder_shortcut]) self.encoder_param_shapes.extend([(d_e, d_v)]) # add generator parameters depending on number of layers if self.n_generator_layers > 0: self.params.extend([self.W_generator_1, self.b_generator_1]) self.param_shapes.extend([(d_t, d_t), (d_t, )]) self.generator_params.extend( [self.W_generator_1, self.b_generator_1]) self.generator_param_shapes.extend([(d_t, d_t), (d_t, )]) if self.n_generator_layers > 1: self.params.extend([self.W_generator_2, self.b_generator_2]) self.param_shapes.extend([(d_t, d_t), (d_t, )]) self.generator_params.extend( [self.W_generator_2, self.b_generator_2]) self.generator_param_shapes.extend([(d_t, d_t), (d_t, )]) if self.n_generator_layers > 2: self.params.extend([ self.W_generator_3, self.b_generator_3, self.W_generator_4, self.b_generator_4 ]) self.param_shapes.extend([(d_t, d_t), (d_t, ), (d_t, d_t), (d_t, )]) self.generator_params.extend([ self.W_generator_3, self.b_generator_3, self.W_generator_4, self.b_generator_4 ]) self.generator_param_shapes.extend([(d_t, d_t), (d_t, ), (d_t, d_t), (d_t, )]) # declare variables that will be given as inputs to functions to be declared below x = T.vector('x', dtype=theano.config.floatX ) # normalized vector of counts for one item y = T.vector( 'y', dtype=theano.config.floatX) # vector of labels for one item 
indices = T.ivector( 'x') # vector of vocab indices (easier to evaluate log prob) lr = T.fscalar('lr') # learning rate l1_strength = T.fscalar('l1_strength') # l1_strength kl_strength = T.fscalar('kl_strength') # l1_strength n_words = T.shape(indices) # the two variables below are just for debugging n_words_print = theano.printing.Print('n_words')( T.shape(indices)[0]) # for debugging x_sum = theano.printing.Print('x_sum')(T.sum(x)) # for debugging # encode one item to mean and variance vectors mu, log_sigma_sq = self.encoder(x, y) # take a random sample from the corresponding multivariate normal h = self.sampler(mu, log_sigma_sq, th_rng) # compute the KL divergence from the prior KLD = -0.5 * T.sum(1 + log_sigma_sq - T.square(mu) - T.exp(log_sigma_sq)) # generate a document representation of dimensionality == n_topics r = self.generator(h) # decode back into a distribution over the vocabulary p_x_given_h = self.decoder(r, y) # evaluate the likelihood nll_term = -T.sum( T.log(p_x_given_h[T.zeros(n_words, dtype='int32'), indices]) + 1e-32) # compute the loss loss = nll_term + KLD * kl_strength # add an L1 penalty to the decoder terms if time_penalty and n_classes > 1: penalty = common_theano.col_diff_L1(l1_strength, self.W_decoder_label, n_classes) else: penalty = common_theano.L1(l1_strength, self.W_decoder) if n_classes > 1: penalty += common_theano.L1(l1_strength, self.W_decoder_label) if use_interactions: penalty += common_theano.L1( l1_strength * self.l1_inter_factor, self.W_decoder_inter) # declare some alternate function for decoding from the mean r_mu = self.generator(mu) p_x_given_x = self.decoder(r_mu, y) nll_term_mu = -T.sum( T.log(p_x_given_x[T.zeros(n_words, dtype='int32'), indices]) + 1e-32) # declare some alternate functions for pretraining from a fixed document representation (r) pretrain_r = T.vector('pretrain_r', dtype=theano.config.floatX) p_x_given_pretrain_h = self.decoder(pretrain_r, y) pretrain_loss = -T.sum( T.log(p_x_given_pretrain_h[T.zeros(n_words, dtype='int32'), indices]) + 1e-32) # declare some alternate functions for only using labels p_x_given_y_only = self.decoder_label_only(y) nll_term_y_only = -T.sum( T.log(p_x_given_y_only[T.zeros(n_words, dtype='int32'), indices]) + 1e-32) # compute gradients gradients = [ T.cast(T.grad(loss + penalty, param, disconnected_inputs='warn'), dtype=theano.config.floatX) for param in self.params ] encoder_gradients = [ T.cast(T.grad(loss, param, disconnected_inputs='warn'), dtype=theano.config.floatX) for param in self.encoder_params ] generator_gradients = [ T.cast(T.grad(loss, param, disconnected_inputs='warn'), dtype=theano.config.floatX) for param in self.generator_params ] decoder_gradients = [ T.cast(T.grad(loss + penalty, param, disconnected_inputs='warn'), dtype=theano.config.floatX) for param in self.decoder_params ] pretrain_gradients = [ T.cast(T.grad(pretrain_loss + penalty, param, disconnected_inputs='warn'), dtype=theano.config.floatX) for param in self.decoder_params ] label_only_gradients = [ T.cast(T.grad(nll_term_y_only + penalty, param, disconnected_inputs='warn'), dtype=theano.config.floatX) for param in self.label_only_params ] # optionally clip gradients if clip_gradients: gradients = common_theano.clip_gradients(gradients, 5) encoder_gradients = common_theano.clip_gradients( encoder_gradients, 5) generator_gradients = common_theano.clip_gradients( generator_gradients, 5) decoder_gradients = common_theano.clip_gradients( decoder_gradients, 5) pretrain_gradients = common_theano.clip_gradients( 
pretrain_gradients, 5) label_only_gradients = common_theano.clip_gradients( label_only_gradients, 5) # create the updates for various sets of parameters updates = optimizer(self.params, self.param_shapes, gradients, lr, optimizer_args) encoder_updates = optimizer(self.encoder_params, self.encoder_param_shapes, encoder_gradients, lr, optimizer_args) generator_updates = optimizer(self.generator_params, self.generator_param_shapes, generator_gradients, lr, optimizer_args) decoder_updates = optimizer(self.decoder_params, self.decoder_param_shapes, decoder_gradients, lr, optimizer_args) other_updates = optimizer( self.encoder_params + self.generator_params, self.encoder_param_shapes + self.generator_param_shapes, encoder_gradients + generator_gradients, lr, optimizer_args) pretrain_updates = optimizer(self.decoder_params, self.decoder_param_shapes, pretrain_gradients, lr, optimizer_args) label_only_updates = optimizer(self.label_only_params, self.label_only_param_shapes, label_only_gradients, lr, optimizer_args) # declare the available methods for this class self.test_input = theano.function(inputs=[x, indices], outputs=[n_words_print, x_sum]) self.train = theano.function( inputs=[x, indices, y, lr, l1_strength, kl_strength], outputs=[nll_term, KLD, penalty], updates=updates, on_unused_input='ignore') self.train_encoder = theano.function( inputs=[x, indices, y, lr, l1_strength, kl_strength], outputs=[nll_term, KLD, penalty], updates=encoder_updates, on_unused_input='ignore') self.train_generator = theano.function( inputs=[x, indices, y, lr, l1_strength, kl_strength], outputs=[nll_term, KLD, penalty], updates=generator_updates, on_unused_input='ignore') self.train_decoder = theano.function( inputs=[x, indices, y, lr, l1_strength, kl_strength], outputs=[nll_term, KLD, penalty], updates=decoder_updates, on_unused_input='ignore') self.train_not_decoder = theano.function( inputs=[x, indices, y, lr, l1_strength, kl_strength], outputs=[nll_term, KLD, penalty], updates=other_updates, on_unused_input='ignore') self.pretrain_decoder = theano.function( inputs=[indices, y, pretrain_r, lr, l1_strength, kl_strength], outputs=[pretrain_loss], updates=pretrain_updates, on_unused_input='ignore') self.encode = theano.function(inputs=[x, y], outputs=[mu, log_sigma_sq], on_unused_input='ignore') self.decode = theano.function(inputs=[pretrain_r, y], outputs=[p_x_given_pretrain_h], on_unused_input='ignore') self.sample = theano.function(inputs=[x, y], outputs=h, on_unused_input='ignore') self.get_mean_doc_rep = theano.function(inputs=[x, y], outputs=r_mu, on_unused_input='ignore') self.encode_and_decode = theano.function(inputs=[x, y], outputs=p_x_given_x, on_unused_input='ignore') self.neg_log_likelihood = theano.function(inputs=[x, indices, y], outputs=[nll_term, KLD], on_unused_input='ignore') self.neg_log_likelihood_mu = theano.function( inputs=[x, indices, y], outputs=[nll_term_mu, KLD], on_unused_input='ignore') self.train_label_only = theano.function( inputs=[indices, y, lr, l1_strength], outputs=[nll_term_y_only, penalty], updates=label_only_updates) self.neg_log_likelihood_label_only = theano.function( inputs=[indices, y], outputs=nll_term_y_only)
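# --- Sketch: reparameterization and KL term used by the model above ---
# The encoder produces mu and log_sigma_sq, self.sampler draws a reparameterized sample,
# and KLD is the closed-form KL divergence from a standard normal prior (the KLD formula
# is taken verbatim from the code above). The linear "encoder" and the h = mu + sigma*eps
# form of the sampler are assumptions made only to keep this snippet self-contained.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

d_v, d_t = 50, 10                     # toy vocabulary size and number of topics
np_rng = np.random.RandomState(0)
th_rng = RandomStreams(1234)

x = T.vector('x', dtype=theano.config.floatX)
W_mu = theano.shared(np_rng.randn(d_t, d_v).astype(theano.config.floatX))
W_sigma = theano.shared(np.zeros((d_t, d_v), dtype=theano.config.floatX))
mu = T.dot(W_mu, x)
log_sigma_sq = T.dot(W_sigma, x) - 4.0          # -4 bias, as in b_sigma above

eps = th_rng.normal(size=mu.shape, dtype=theano.config.floatX)
h = mu + T.exp(0.5 * log_sigma_sq) * eps        # reparameterized sample

KLD = -0.5 * T.sum(1 + log_sigma_sq - T.square(mu) - T.exp(log_sigma_sq))

sample_fn = theano.function([x], [h, KLD])
print(sample_fn(np_rng.rand(d_v).astype(theano.config.floatX)))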
def fit(self, X, Y, Xvalid, Yvalid, learning_rate=1e-2, mu=0.9, decay=0.9, epochs=10, batch_sz=100, show_fig=False): X = X.astype(np.float32) Y = Y.astype(np.int32) Xvalid = Xvalid.astype(np.float32) Yvalid = Yvalid.astype(np.int32) self.rng = RandomStreams() # initialize hidden layers N, D = X.shape K = len(set(Y)) self.hidden_layers = [] M1 = D count = 0 for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 count += 1 W = np.random.randn(M1, K) * np.sqrt(2.0 / M1) b = np.zeros(K) self.W = theano.shared(W, 'W_logreg') self.b = theano.shared(b, 'b_logreg') # collect params for later use self.params = [self.W, self.b] for h in self.hidden_layers: self.params += h.params # set up theano functions and variables thX = T.matrix('X') thY = T.ivector('Y') pY_train = self.forward_train(thX) # this cost is for training cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY])) updates = momentum_updates(cost, self.params, learning_rate, mu) train_op = theano.function(inputs=[thX, thY], updates=updates) # for evaluation and prediction pY_predict = self.forward_predict(thX) cost_predict = -T.mean(T.log(pY_predict[T.arange(thY.shape[0]), thY])) prediction = self.predict(thX) cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost_predict, prediction]) n_batches = N // batch_sz costs = [] for i in range(epochs): X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)] train_op(Xbatch, Ybatch) if j % 50 == 0: c, p = cost_predict_op(Xvalid, Yvalid) costs.append(c) e = error_rate(Yvalid, p) print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e) if show_fig: plt.plot(costs) plt.show()
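# --- Sketch: a plausible momentum_updates helper for the fit method above ---
# fit calls momentum_updates(cost, params, learning_rate, mu), which is not defined in
# this file. The version below is an assumption about its behaviour (classical momentum
# with one velocity shared variable per parameter), not the original implementation.
import numpy as np
import theano
import theano.tensor as T

def momentum_updates(cost, params, learning_rate, mu):
    grads = T.grad(cost, params)
    updates = []
    for p, g in zip(params, grads):
        # velocity accumulator, same shape and dtype as the parameter
        v = theano.shared(np.zeros_like(p.get_value()))
        updates.append((v, mu * v - learning_rate * g))
        updates.append((p, p + mu * v - learning_rate * g))
    return updates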
def train_conv_net(datasets, U, lr_decay=0.95, img_w=300, filter_hs=[3, 4, 5], conv_non_linear="relu", hidden_units=[100, 3], shuffle_batch=True, n_epochs=25, sqr_norm_lim=9, non_static=True, batch_size=50, activations=[Iden], dropout_rate=[0.5]): """ Train a simple conv net img_h = sentence length (padded where necessary) img_w = word vector length (300 for word2vec) filter_hs = filter window sizes hidden_units = [x,y] x is the number of feature maps (per filter window), and y is the penultimate layer sqr_norm_lim = s^2 in the paper lr_decay = adadelta decay parameter """ rng = np.random.RandomState(3435) img_h = len(datasets[0][0]) - 1 filter_w = img_w feature_maps = hidden_units[0] filter_shapes = [] pool_sizes = [] for filter_h in filter_hs: filter_shapes.append((feature_maps, 1, filter_h, filter_w)) pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1)) parameters = [("image shape", img_h, img_w), ("filter shape", filter_shapes), ("hidden_units", hidden_units), ("dropout", dropout_rate), ("batch_size", batch_size), ("non_static", non_static), ("learn_decay", lr_decay), ("conv_non_linear", conv_non_linear), ("non_static", non_static), ("sqr_norm_lim", sqr_norm_lim), ("shuffle_batch", shuffle_batch)] print parameters #define model architecture index = T.lscalar() x = T.matrix('x') y = T.ivector('y') Words = theano.shared(value=U, name="Words") zero_vec_tensor = T.vector() zero_vec = np.zeros(img_w) set_zero = theano.function([zero_vec_tensor], updates=[ (Words, T.set_subtensor(Words[0, :], zero_vec_tensor)) ], allow_input_downcast=True) layer0_input = Words[T.cast(x.flatten(), dtype="int32")].reshape( (x.shape[0], 1, x.shape[1], Words.shape[1])) conv_layers = [] layer1_inputs = [] print 'starting loop' for i in xrange(len(filter_hs)): filter_shape = filter_shapes[i] pool_size = pool_sizes[i] conv_layer = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, img_h, img_w), filter_shape=filter_shape, poolsize=pool_size, non_linear=conv_non_linear) layer1_input = conv_layer.output.flatten(2) conv_layers.append(conv_layer) layer1_inputs.append(layer1_input) layer1_input = T.concatenate(layer1_inputs, 1) hidden_units[0] = feature_maps * len(filter_hs) classifier = MLPDropout(rng, input=layer1_input, layer_sizes=hidden_units, activations=activations, dropout_rates=dropout_rate) print 'defining params' #define parameters of the model and update functions using adadelta params = classifier.params for conv_layer in conv_layers: params += conv_layer.params if non_static: #if word vectors are allowed to change, add them as model parameters params += [Words] cost = classifier.negative_log_likelihood(y) dropout_cost = classifier.dropout_negative_log_likelihood(y) grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6, sqr_norm_lim) #shuffle dataset and assign to mini batches. 
if dataset size is not a multiple of mini batches, replicate #extra data (at random) np.random.seed(3435) if datasets[0].shape[0] % batch_size > 0: extra_data_num = batch_size - datasets[0].shape[0] % batch_size train_set = np.random.permutation(datasets[0]) extra_data = train_set[:extra_data_num] new_data = np.append(datasets[0], extra_data, axis=0) else: new_data = datasets[0] new_data = np.random.permutation(new_data) n_batches = new_data.shape[0] / batch_size n_train_batches = int(np.round(n_batches * 0.9)) #divide train set into train/val sets test_set_x = datasets[1][:, :img_h] test_set_y = np.asarray(datasets[1][:, -1], "int32") train_set = new_data[:n_train_batches * batch_size, :] val_set = new_data[n_train_batches * batch_size:, :] train_set_x, train_set_y = shared_dataset( (train_set[:, :img_h], train_set[:, -1])) val_set_x, val_set_y = shared_dataset((val_set[:, :img_h], val_set[:, -1])) n_val_batches = n_batches - n_train_batches val_model = theano.function( [index], classifier.errors(y), givens={ x: val_set_x[index * batch_size:(index + 1) * batch_size], y: val_set_y[index * batch_size:(index + 1) * batch_size] }, allow_input_downcast=True) #compile theano functions to get train/val/test errors test_model = theano.function( [index], classifier.errors(y), givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }, allow_input_downcast=True) train_model = theano.function( [index], cost, updates=grad_updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }, allow_input_downcast=True) test_pred_layers = [] test_size = test_set_x.shape[0] test_layer0_input = Words[T.cast(x.flatten(), dtype="int32")].reshape( (test_size, 1, img_h, Words.shape[1])) for conv_layer in conv_layers: test_layer0_output = conv_layer.predict(test_layer0_input, test_size) test_pred_layers.append(test_layer0_output.flatten(2)) test_layer1_input = T.concatenate(test_pred_layers, 1) test_y_pred = classifier.predict(test_layer1_input) test_error = T.mean(T.neq(test_y_pred, y)) test_model_all = theano.function([x, y], test_error, allow_input_downcast=True) #start training over mini-batches print 'sizes: ' print 'test: ' print test_size print '... training' print 'n_train_batches: ' + str(n_train_batches) epoch = 0 best_val_perf = 0 val_perf = 0 test_perf = 0 cost_epoch = 0 while (epoch < n_epochs): print 'epoch: ' + str(epoch) start_time = time.time() epoch = epoch + 1 if shuffle_batch: for minibatch_index in np.random.permutation( range(n_train_batches)): if minibatch_index >= n_train_batches: minibatch_index -= 1 print 'if: minibatch_index: ' + str(minibatch_index) cost_epoch = train_model(minibatch_index) set_zero(zero_vec) else: for minibatch_index in xrange(n_train_batches): if minibatch_index >= n_train_batches: minibatch_index -= 1 print 'else: minibatch_index: ' + str(minibatch_index) cost_epoch = train_model(minibatch_index) set_zero(zero_vec) train_losses = [test_model(i) for i in xrange(n_train_batches)] train_perf = 1 - np.mean(train_losses) val_losses = [val_model(i) for i in xrange(n_val_batches)] val_perf = 1 - np.mean(val_losses) print( 'epoch: %i, training time: %.2f secs, train perf: %.2f %%, val perf: %.2f %%' % (epoch, time.time() - start_time, train_perf * 100., val_perf * 100.)) if val_perf >= best_val_perf: best_val_perf = val_perf test_loss = test_model_all(test_set_x, test_set_y) test_perf = 1 - test_loss return test_perf
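# --- Sketch: keeping the padding embedding at zero (the set_zero trick above) ---
# Row 0 of Words is reserved for the padding token; after every train_model call the
# training loop calls set_zero(zero_vec) so gradient updates never give padding a real
# embedding. The toy matrix below stands in for the word2vec matrix U.
import numpy as np
import theano
import theano.tensor as T

img_w = 4
U = np.random.randn(10, img_w).astype(theano.config.floatX)
U[0] = 0                                        # padding row

Words = theano.shared(value=U, name='Words')
zero_vec_tensor = T.vector()
zero_vec = np.zeros(img_w, dtype=theano.config.floatX)

set_zero = theano.function(
    [zero_vec_tensor],
    updates=[(Words, T.set_subtensor(Words[0, :], zero_vec_tensor))],
    allow_input_downcast=True)

# ... cost_epoch = train_model(minibatch_index) ...
set_zero(zero_vec)
print(Words.get_value()[0])                     # remains all zeros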
def __init__(self, numpy_rng, theano_rng=None, n_ins=183, hidden_layers_sizes=[250, 250], n_outs=1, corruption_levels=[0.1, 0.1], dropout_rate=0.1, lambda1=0, lambda2=0, non_lin=None): """ :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the Model :type hidden_layers_sizes: list of ints :param hidden_layers_sizes: sizes of intermediate layers. :type n_outs: int :param n_outs: dimension of the output of the network. Always 1 for a regression problem. :type corruption_levels: list of float :param corruption_levels: amount of corruption to use for each layer :type dropout_rate: float :param dropout_rate: probability of dropping a hidden unit :type non_lin: function :param non_lin: nonlinear activation function used in all layers """ # Initializes parameters. self.hidden_layers = [] self.dA_layers = [] self.params = [] self.dropout_masks = [] self.n_layers = len(hidden_layers_sizes) self.L1 = 0 self.L2_sqr = 0 self.n_hidden = hidden_layers_sizes[0] if not theano_rng: theano_rng = RandomStreams(numpy_rng.randint(2**30)) # Allocates symbolic variables for the data. self.x = T.matrix('x', dtype='float32') self.o = T.ivector('o') self.at_risk = T.ivector('at_risk') self.is_train = T.iscalar('is_train') self.masks = [ T.lmatrix('mask_' + str(i)) for i in range(self.n_layers) ] # Linear cox regression with no hidden layers. if self.n_layers == 0: self.risk_layer = RiskLayer(input=self.x, n_in=n_ins, n_out=n_outs, rng=numpy_rng) else: # Constructs the intermediate layers. for i in xrange(self.n_layers): if i == 0: input_size = n_ins layer_input = self.x else: input_size = hidden_layers_sizes[i - 1] layer_input = self.hidden_layers[-1].output if dropout_rate > 0: hidden_layer = DropoutHiddenLayer( rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=non_lin, dropout_rate=dropout_rate, is_train=self.is_train, mask=self.masks[i]) else: hidden_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=non_lin) # Adds the layer to the stack of layers. self.hidden_layers.append(hidden_layer) self.params.extend(hidden_layer.params) # Constructs an autoencoder that shares weights with this layer. dA_layer = dA(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=hidden_layer.W, bhid=hidden_layer.b, non_lin=non_lin) self.dA_layers.append(dA_layer) self.L1 += abs(hidden_layer.W).sum() self.L2_sqr += (hidden_layer.W**2).sum() # Adds a risk prediction layer on top of the stack. self.risk_layer = RiskLayer(input=self.hidden_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs, rng=numpy_rng) self.L1 += abs(self.risk_layer.W).sum() self.L2_sqr += (self.risk_layer.W**2).sum() self.params.extend(self.risk_layer.params) self.regularizers = lambda1 * self.L1 + lambda2 * self.L2_sqr
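# --- Sketch: the denoising autoencoder attached to each hidden layer above ---
# Each hidden layer gets a dA that shares its W and b so the stack can be pre-trained
# layer-wise before fitting the risk layer. The dA class itself is not shown in this
# file; the corruption / reconstruction / cross-entropy pieces below are an assumption
# about its internals (the standard tied-weight denoising autoencoder), not the original.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

numpy_rng = np.random.RandomState(0)
theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
n_visible, n_hidden, corruption_level = 183, 250, 0.1

x = T.matrix('x', dtype='float32')
W = theano.shared(np.asarray(numpy_rng.uniform(-0.1, 0.1, (n_visible, n_hidden)),
                             dtype='float32'), name='W')
bhid = theano.shared(np.zeros(n_hidden, dtype='float32'), name='bhid')
bvis = theano.shared(np.zeros(n_visible, dtype='float32'), name='bvis')

# randomly zero a fraction of the inputs, encode, then decode with the tied weights
tilde_x = theano_rng.binomial(size=x.shape, n=1, p=1 - corruption_level,
                              dtype='float32') * x
h = T.nnet.sigmoid(T.dot(tilde_x, W) + bhid)
z = T.nnet.sigmoid(T.dot(h, W.T) + bvis)

# cross-entropy reconstruction cost against the uncorrupted input
cost = T.mean(-T.sum(x * T.log(z) + (1 - x) * T.log(1 - z), axis=1))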
def __init__(self, batch_size, kernels, input_dimensions, convolution_dimensions, pool_sizes, stride_sizes, layer_pattern, relu_pattern, dropout_rate, rng_seed=None, base_learning_rate=0.05, momentum=0.8, learning_decay_per_epoch=0.91, l2_norm=0, name="default", param_index=0, address='', n_epochs=200, batch_normalization_pattern=None, batch_norm_learning_rate=0.1, batch_norm_decay_per_epoch=0.95, batchnorm_vals_filename=None, batchnorm_slide_percent=0.): """ batch_size - int - size of each batch kernels - int array - number of general units each layer (incl. input/output) input_dimensions - int array[2] - dimensions of input convolution_dimensions - int array[2] array - dimensions of each convolution pool_sizes - int array[2] array - dimensions of pooling for each convolution stride_sizes - int array - length of strides for each convolutional layer (this overrides aspects of pooling behavior) layer_pattern - ['I','C',...,'C','F',...,'F','O'] - indicates pattern of layers relu_pattern - boolean array that describes if convolutional layers should be rectified; doesn't do anything for other types of layers (including input) dropout_rate - float - rate of dropout for network weights rng_seed - int - seed for random number generator; None defaults to random base_learning_rate - floatX - initial learning rate momentum - floatX - amount that learning rate carries over through iterations learning_decay_per_epoch - floatX - factor for decreasing learning rate over epochs name - string that describes the beginning of the filenames of the network pickle param_index - integer determined a priori to index the param configurations and show it in the filename batchnorm_vals_filename - has to be constructed by separate file; pre-defines mean and sd of each layer for a nn...might be preferred to use sliding instead, as batchnorm_slide_percent - sort of like momentum, but for calculations of batch-normalization means and standard deviations """ #initialize arrays containing basic information and hyperparameters self.layers = [] self.uses_batch_normalization = bool(batch_normalization_pattern) self.batch_norm_pattern = batch_normalization_pattern self.batchnorm_vals_filename = batchnorm_vals_filename self.batchnorm_slide_percent = batchnorm_slide_percent if not self.uses_batch_normalization: self.batch_norm_pattern = [False for _ in relu_pattern] self.address = address #replace future instances of self.kernel self.kernels = kernels self.input_dimensions = input_dimensions self.output_size = kernels[-1:][0] self.inputs = [] self.batch_size = batch_size self.x = x = T.ftensor4('x') self.y = y = T.ivector('y') self.rng = np.random.RandomState(rng_seed) self.name = name self.n_epochs = n_epochs self.shapes = [(input_dimensions[0], input_dimensions[1])] print "input shape: " + str(self.shapes) self.convolution_dimensions = convolution_dimensions self.rng_seed = rng_seed self.layer_pattern = layer_pattern self.current_batch_index = 0 self.batch_size = batch_size self.pool_sizes = pool_sizes self.stride_sizes = stride_sizes self.relu_pattern = relu_pattern #if the rate is a float, each layer has the same rate if type(dropout_rate) == type(1.1): dropout_rate = [dropout_rate for _ in layer_pattern] self.dropout_rate = dropout_rate self.learning_decay_per_epoch = learning_decay_per_epoch self.l2_norm = l2_norm #get some info from prepare_image_data.py #files_list, outputs, y_dim = prepare_image_data.get_data() #self.files_list = files_list #self.y_dim = y_dim #self.outputs=outputs self.fetcher = 
prepare_image_data.fetcher(self.batch_size) #indexing information self.ratios = np.asarray([0.6, 0.2, 0.2]) self.index = index = T.lscalar() #temporarily hardcoded self.n_train_batches = 400 self.n_valid_batches = 120 self.n_test_batches = 120 self.cat_labels = self.fetcher.valid_names self.y_dim = len(self.cat_labels) self.momentum = theano.shared(np.float32(momentum)) self.base_learning_rate = np.float32(base_learning_rate) self.learning_rate = theano.shared( np.float32(base_learning_rate * (1 - momentum))) self.index = index = T.lscalar() self.momentum_raw = momentum self.learning_rate_raw = self.learning_rate.get_value() if self.uses_batch_normalization: self.batch_norm_learning_rate_raw = batch_norm_learning_rate self.batch_norm_learning_rate = theano.shared( np.float32(self.batch_norm_learning_rate_raw)) self.epoch = 0 #initialize basic file shapes #recent change: changed kernel_sizes to self.kernels self.training_x = theano.shared(np.zeros( shape=(batch_size, self.kernels[0], input_dimensions[0], input_dimensions[1]), dtype=theano.config.floatX), borrow=True) self.input = self.x.reshape((self.batch_size, self.kernels[0], self.shapes[0][0], self.shapes[0][1])) #updated database-based retrieval self.training_y = theano.shared(np.zeros(shape=self.batch_size, dtype=np.int32), borrow=True) self.testing_x = theano.shared(np.zeros( shape=(self.batch_size, kernels[0], input_dimensions[0], input_dimensions[1]), dtype=theano.config.floatX), borrow=True) self.testing_y = theano.shared(np.zeros(shape=self.batch_size, dtype=np.int32), borrow=True) self.validation_x = theano.shared(np.zeros( shape=(self.batch_size, kernels[0], input_dimensions[0], input_dimensions[1]), dtype=theano.config.floatX), borrow=True) self.validation_y = theano.shared(np.zeros(shape=self.batch_size, dtype=np.int32), borrow=True) #load fixed mean and sd values if file exists if self.batchnorm_vals_filename <> None: self.batchnorm_fixed_values = pickle.load( self.batchnorm_vals_filename) else: self.batchnorm_fixed_values = [ None for _ in range(len(layer_pattern)) ] ###begin creation of layers #I = "input";C = "Convolutional"; F = "Fully-Connected", O = "Output" for i, pattern in enumerate(layer_pattern): if pattern == "I": self.inputs.append(self.input) print 'inserted input' elif pattern == "C": self.layers.append( NetConvPoolLayer( self.rng, input = self.inputs[i-1], image_shape=( batch_size,kernels[i-1], self.shapes[i-1][0], self.shapes[i-1][1] ), filter_shape=( kernels[i], kernels[i-1], self.convolution_dimensions[i-1][0], self.convolution_dimensions[i-1][1]), poolsize = pool_sizes[i-1], stride = stride_sizes[i-1], dropout_percent = self.dropout_rate[i], batch_norm = self.batch_norm_pattern[i], batchnorm_slide_percent = self.batchnorm_slide_percent, precalculated_batchnorm_values = self.\ batchnorm_fixed_values[i-1]) ) x_new = ( self.shapes[i-1][0] - self.convolution_dimensions[i-1][0] + \ 1 - (pool_sizes[i-1][0] - stride_sizes[i-1][0]))/\ (stride_sizes[i-1][0] ) y_new = ( self.shapes[i-1][1] - self.convolution_dimensions[i-1][1] + 1 -\ (pool_sizes[i-1][1] - stride_sizes[i-1][1]))/\ (stride_sizes[i-1][1] ) self.inputs.append(self.layers[i - 1].output) self.shapes.append((x_new, y_new)) print "self.shapes: " + str(self.shapes) print 'added convolution layer' elif pattern == "F": if layer_pattern[i - 1] == "C": next_input = self.inputs[i - 1].flatten(2) else: next_input = self.inputs[i - 1] self.layers.append( HiddenLayer(self.rng, input=next_input, n_in=kernels[i - 1] * self.shapes[i - 1][0] * self.shapes[i - 1][1], 
n_out=kernels[i], activation=T.tanh, dropout_rate=self.dropout_rate[i])) self.inputs.append(self.layers[i - 1].output) #the shape is only used to determine dimensions of the next layer self.shapes.append((1, 1)) #see if this fixes issue print 'added fully-connected hidden layer, shape=%s' %\ str(self.shapes[-1]) else: if layer_pattern[i - 1] == "C": next_input = self.inputs[i - 1].flatten(2) else: next_input = self.inputs[i - 1] self.layers.append( LogisticRegression(input=next_input, n_in=kernels[i - 1], n_out=self.output_size, rng=self.rng, dropout_rate=self.dropout_rate[i])) last_index = i - 1 print 'added logistic layer' zero = np.float32(0.) self.L2_penalty = theano.shared(np.float32(l2_norm)) self.params = params = [param for layer in self.layers \ for param in layer.params] self.cost = self.layers[last_index].negative_log_likelihood(self.y) +\ self.L2_penalty * ( T.sum([T.sum(self.layers[q].W * self.layers[q].W)\ for q in range(len(self.layers))])) #updating functions (incl. momentum) #update 1 (only used for derivation in update #4) self.old_updates = [theano.shared(zero * param_i.get_value())\ for param_i in params] self.current_delta = [theano.shared(np.float32(zero * param_i.get_value()))\ for param_i in params] self.grads = T.grad(self.cost, params) #update 2 self.current_change_update = [ (current_delta_i, self.learning_rate * grad_i +\ self.momentum * old_updates_i)\ for current_delta_i,grad_i, old_updates_i in\ zip(self.current_delta,self.grads,self.old_updates) ] #update 3 updates = [ ( param_i,param_i - current_delta_i) for param_i, current_delta_i in\ zip(params,self.current_delta)] #self.updates = [] #update 4 (derived from update #1) momentum_updates = [(old_updates_i, current_delta_i)\ for old_updates_i, current_delta_i in\ zip(self.old_updates,self.current_delta)] #self.momentum_updates = [] #now batch-normalization updates when needed batchnorm_sliding_updates = [] for layer in self.layers: if not isinstance(layer, NetConvPoolLayer): continue if layer.batchnorm_slide_percent <> 0.: batchnorm_sliding_updates += [ (layer.sd_input_old, layer.sd_input), (layer.means_old, layer.sd_input) ] #combined updates self.all_updates = self.current_change_update + updates +\ momentum_updates + batchnorm_sliding_updates #test model function self.test_model = theano.function([], self.layers[last_index].errors( self.y), givens={ x: self.testing_x, y: self.testing_y }) #validation model function self.validate_model = theano.function([], self.layers[last_index].errors( self.y), givens={ x: self.validation_x, y: self.validation_y }) #training function self.train_model = theano.function([], self.cost, updates=self.all_updates, givens={ x: self.training_x, y: self.training_y }) self.patience = 20000 self.patience_increase = 3 self.improvement_threshold = 0.995 self.validation_frequency = min(self.n_train_batches, self.patience // 2) self.best_validation_loss = np.inf self.best_iter = 0 #DEPRECATED self.itermode = 'train' self.test_score = 0. self.start_time = timeit.default_timer() self.epoch = 0 self.iter_i = 0 # renamed bc `iter` is reserved self.done_looping = False self.param_index = param_index #constant-defined stuff self.improvement_threshold = 0.995 self.validation_frequency = min(self.n_train_batches, self.patience // 2) self.done_looping = False print 'initialized neural network object'
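# --- Sketch: the conv/pool output-size arithmetic used when building self.shapes ---
# Each "C" layer derives x_new and y_new with (in - filt + 1 - (pool - stride)) / stride.
# Pulling the formula into a small helper (hypothetical, same arithmetic) makes the
# shapes printed during construction easy to check by hand.
def conv_pool_out_dim(in_dim, filt_dim, pool_dim, stride_dim):
    return (in_dim - filt_dim + 1 - (pool_dim - stride_dim)) // stride_dim

# e.g. a 50-pixel edge, 5-wide filter, 2x2 pooling with stride 2 -> 23
print(conv_pool_out_dim(50, 5, 2, 2))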
def solving_logistic_regression(datapath, learning_rate = 0.54,batch = 500,n_epoch = 30): ##for MNIST DATA LOADING PROCESS print "loading data...." mnist_data = upload_data(datapath) train, valid, test = mnist_data ##creating theano buffer for python data print 'moving to data to shared conversion' train_x, train_y = to_shared(train) valid_x, valid_y = to_shared(valid) test_x, test_y = to_shared(test) n_train_batch = train[0].shape[0] // batch n_valid_batch = valid[0].shape[0] // batch n_test_batch = test[0].shape[0] // batch x = T.matrix('x') y = T.ivector('y') index = T.iscalar('index') logistic = LogisticRegression(input = x, n_in = 784, n_out = 10) fun_valid = function(inputs = [index], outputs = logistic.error(y), givens = [(x,valid_x[index*batch:(index+1)*batch,:]), (y,valid_y[index*batch:(index+1)*batch])] ) fun_test = function(inputs = [index], outputs = logistic.y_pred, givens = [(x,test_x[index*batch:(index+1)*batch,:])], ) print "calaculating cost function" cost = logistic.negative_log_likelihood(y) g_W = T.grad(cost = cost,wrt = logistic.W) g_b = T.grad(cost = cost,wrt = logistic.b) updates = [(logistic.W, logistic.W - g_W*learning_rate), (logistic.b, logistic.b - g_b*learning_rate)] fun_train = function(inputs =[index], outputs = logistic.params, updates = updates, givens = [(x,train_x[index*batch:(index+1)*batch,:]), (y,train_y[index*batch:(index+1)*batch])] ) ################ #TRAINING MODEL# ################.......................................... print 'training starts now -->' patience = 5000 patience_increase = 2 improvement = 0.96 validation_frequency = min(n_train_batch, patience//2) least_error = np.Inf epoch = 0 done_looping = False print 'EPOCH counting .....' start_time = timeit.default_timer() while epoch < n_epoch and (not done_looping): for current_batch in range(n_train_batch): total_batches = (epoch*n_train_batch) + current_batch fun_train(current_batch) if (total_batches+1) % validation_frequency == 0: this_error = [fun_valid(n) for n in range(n_valid_batch)] this_error = np.mean(this_error) if this_error < least_error*improvement: least_error = this_error patience = max(patience,total_batches * patience_increase) with open('/home/sameer/best_model.pkl', 'wb') as f: pickle.dump(logistic, f) if total_batches > patience: done_looping = True epoch += 1 if total_batches != 0: print least_error print 'the convergence ratio is %f' %(patience/float(total_batches)) end_time = timeit.default_timer() net_time = end_time - start_time print 'total time %f' %net_time print 'time per epoch %f' %(net_time/epoch) print 'the error is %f' %least_error print 'the total number of epoch %d' %epoch
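# --- Sketch: the patience-based early stopping used in the training loop above ---
# Whenever validation error beats least_error * improvement, patience is extended to
# total_batches * patience_increase; training stops once total_batches exceeds patience.
# The stand-in below replaces fun_train / fun_valid with dummies to show the control
# flow only.
import numpy as np

patience, patience_increase, improvement = 5000, 2, 0.96
n_train_batch, n_epoch = 100, 30
validation_frequency = min(n_train_batch, patience // 2)

least_error = np.inf
done_looping = False
epoch = 0
rng = np.random.RandomState(0)

while epoch < n_epoch and not done_looping:
    for current_batch in range(n_train_batch):
        total_batches = epoch * n_train_batch + current_batch
        # fun_train(current_batch) would run here
        if (total_batches + 1) % validation_frequency == 0:
            this_error = rng.rand()            # stand-in for np.mean of fun_valid(...)
            if this_error < least_error * improvement:
                least_error = this_error
                patience = max(patience, total_batches * patience_increase)
        if total_batches > patience:
            done_looping = True
    epoch += 1
print(epoch, least_error)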
import gzip
import cPickle

import numpy
import theano
import theano.tensor as T

# load the pickled MNIST splits
f = gzip.open('C:/nnets/mnist.pkl.gz', 'rb')
train_set, valid_set, test_set = cPickle.load(f)
f.close()

n_train, n_test = map(lambda x: len(x[0]), [train_set, test_set])
dims = train_set[0].shape[1]
n_classes = len(set(train_set[1]))

X = T.dmatrix()
y = T.ivector()

# move each split onto shared variables
prepare_data = lambda x: (theano.shared(x[0].astype('float64')),
                          theano.shared(x[1].astype('int32')))
(training_x, training_y), (test_x, test_y), (validation_x, validation_y) = map(
    prepare_data, [train_set, test_set, valid_set])

# softmax regression parameters and symbolic graph
W = theano.shared(numpy.zeros([dims, n_classes]))
b = theano.shared(numpy.zeros(n_classes))
y_hat = T.nnet.softmax(T.dot(X, W) + b)
y_pred = T.argmax(y_hat, axis=1)
test_error = T.mean(T.neq(y_pred, y))
training_error = -T.mean(T.log(y_hat)[T.arange(y.shape[0]), y])
learning_rate = 0.2
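# --- Sketch: one way to finish the fragment above (not in the original) ---
# The snippet stops after defining training_error and learning_rate; a natural
# continuation takes gradients of training_error w.r.t. W and b and compiles an SGD
# step that reads minibatches from the shared training data defined above. batch_size
# and the epoch count are arbitrary choices here, not taken from this file.
g_W, g_b = T.grad(training_error, [W, b])

index = T.lscalar()
batch_size = 500

train = theano.function(
    [index],
    training_error,
    updates=[(W, W - learning_rate * g_W),
             (b, b - learning_rate * g_b)],
    givens={X: training_x[index * batch_size:(index + 1) * batch_size],
            y: training_y[index * batch_size:(index + 1) * batch_size]})

test = theano.function([], test_error, givens={X: test_x, y: test_y})

for epoch in range(5):
    for i in range(n_train // batch_size):
        train(i)
    print(epoch, test())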
def fit(self, X, Y, learning_rate=1e-2, mu=0.99, reg=1e-12, epochs=400, batch_sz=20, print_period=1, show_fig=False): # X = X.astype(np.float32) Y = Y.astype(np.int32) # initialize hidden layers N, D = X.shape K = len(set(Y)) self.hidden_layers = [] M1 = D count = 0 for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 count += 1 W = init_weight(M1, K) b = np.zeros(K) self.W = theano.shared(W, 'W_logreg') self.b = theano.shared(b, 'b_logreg') # collect params for later use self.params = [self.W, self.b] for h in self.hidden_layers: self.params += h.params # for momentum dparams = [ theano.shared(np.zeros(p.get_value().shape)) for p in self.params ] # for rmsprop cache = [ theano.shared(np.zeros(p.get_value().shape)) for p in self.params ] # set up theano functions and variables thX = T.matrix('X') thY = T.ivector('Y') pY = self.forward(thX) rcost = reg * T.sum([(p * p).sum() for p in self.params]) cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost prediction = self.predict(thX) grads = T.grad(cost, self.params) # momentum only updates = [(p, p + mu * dp - learning_rate * g) for p, dp, g in zip(self.params, dparams, grads) ] + [(dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)] train_op = theano.function( inputs=[thX, thY], outputs=[cost, prediction], updates=updates, ) n_batches = N // batch_sz costs = [] for i in range(epochs): X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)] c, p = train_op(Xbatch, Ybatch) if j % print_period == 0: costs.append(c) e = np.mean(Ybatch != p) print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e) if show_fig: plt.plot(costs) plt.show()
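# --- Sketch: wiring the unused RMSProp cache into the updates ---
# fit allocates a cache shared variable per parameter "for rmsprop", but the update list
# above only applies momentum, so the cache never changes. If RMSProp were added, the
# updates could look roughly like this helper (a sketch, not part of the original code);
# decay and eps are typical values, not taken from this file.
import numpy as np
import theano
import theano.tensor as T

def rmsprop_momentum_updates(cost, params, learning_rate, mu, decay=0.999, eps=1e-10):
    grads = T.grad(cost, params)
    caches = [theano.shared(np.ones_like(p.get_value())) for p in params]
    dparams = [theano.shared(np.zeros_like(p.get_value())) for p in params]
    updates = []
    for p, g, c, dp in zip(params, grads, caches, dparams):
        new_c = decay * c + (1 - decay) * g * g          # running average of g^2
        new_dp = mu * dp - learning_rate * g / T.sqrt(new_c + eps)
        updates += [(c, new_c), (dp, new_dp), (p, p + new_dp)]
    return updates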
def __init__(self, embeddings, height, filter_hs, conv_activation, feature_maps, output_units, batch_size, dropout_rates, activations=[Iden]): """ :param embeddings: word embeddings :param height: sentence length (padded as necessary) :param filter_hs: filter window sizes :param conv_activation: activation functin for the convolutional layer :param feature_maps: the size of feature maps (per filter window) :param output_units: number of output variables """ rng = np.random.RandomState(3435) self.batch_size = batch_size # define model architecture self.index = T.lscalar() # minibatch number self.x = T.imatrix('x') # a minibatch of words self.y = T.ivector('y') # corresponding outputs width = embeddings.shape[1] self.emb_layer = EmbeddingLayer(embeddings, name='Words') # inputs to the ConvNet go to all convolutional filters: image_shape = (batch_size, 1, height, width) # e.g. (50, 1, 66, 300) layer0_input = self.emb_layer.output(self.x).reshape(image_shape) #(self.x.shape[0], 1, self.x.shape[1], width)) self.conv_layers = [] # outputs of the convolutional filters layer1_inputs = [] filter_w = width for filter_h in filter_hs: filter_shape = (feature_maps, 1, filter_h, filter_w ) # e.g. (100, 1, 7, 300) pool_size = (height - filter_h + 1, 1) # e.g. (60, 1) conv_layer = LeNetConvPoolLayer(rng, image_shape=image_shape, filter_shape=filter_shape, poolsize=pool_size, non_linear=conv_activation) layer1_input = conv_layer.output(layer0_input).flatten(2) self.conv_layers.append(conv_layer) layer1_inputs.append(layer1_input) # inputs to the MLP layer1_input = T.concatenate(layer1_inputs, 1) layer_sizes = [feature_maps * len(filter_hs), output_units] # initiailze MLPDropout MLPDropout.__init__(self, rng, input=layer1_input, layer_sizes=layer_sizes, activations=activations, dropout_rates=dropout_rates) # add embeddings self.params += self.emb_layer.params # add parameters from convolutional layers for conv_layer in self.conv_layers: self.params += conv_layer.params
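# --- Sketch: a minimal EmbeddingLayer consistent with how it is used above ---
# EmbeddingLayer is not defined in this file; the constructor above only needs a shared
# embedding matrix, a params list, and an output(x) lookup that maps an int matrix of
# word indices to their vectors. The class below is an assumption about that interface,
# not the original implementation.
import numpy as np
import theano
import theano.tensor as T

class EmbeddingLayer(object):
    def __init__(self, embeddings, name='Words'):
        self.W = theano.shared(value=embeddings.astype(theano.config.floatX),
                               name=name, borrow=True)
        self.params = [self.W]

    def output(self, x):
        # x: int matrix (batch_size, sentence_length)
        # returns: (batch_size, sentence_length, embedding_width)
        return self.W[x]

# usage: two padded sentences of length 5 over a 10-word vocabulary, width 3
emb = np.random.randn(10, 3).astype('float32')
layer = EmbeddingLayer(emb)
x = T.imatrix('x')
f = theano.function([x], layer.output(x))
print(f(np.zeros((2, 5), dtype='int32')).shape)   # (2, 5, 3)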