def __init__(self, params, data):
    self.get_pos_map(data)
    self.cap = params.cap
    self.lowercase = params.lowercase
    self.featuretype = params.featuretype
    chardim = params.chardim  # dimension of the character network layer
    worddim = params.worddim  # dimension of the character embedding and word LSTM layer

    if not params.nntype == "charagram":
        self.chars = self.get_character_dict(data)
        Ce = lasagne.init.Uniform(range=0.5 / len(self.chars))
        Ce_np = Ce.sample((len(self.chars), params.worddim))
        Ce = theano.shared(np.asarray(Ce_np, dtype=config.floatX))

    # symbolic inputs
    char = T.imatrix()
    charmask = T.matrix()
    word = T.imatrix()
    wordmask = T.matrix()
    idxs = T.ivector()
    Y = T.matrix()

    l_in_char = lasagne.layers.InputLayer((None, None))

    if params.nntype == "charlstm":
        l_mask_char = lasagne.layers.InputLayer(shape=(None, None))
        l_emb_char = lasagne.layers.EmbeddingLayer(
            l_in_char, input_size=Ce.get_value().shape[0],
            output_size=Ce.get_value().shape[1], W=Ce)
        if params.outgate:
            l_lstm_char = lasagne.layers.LSTMLayer(
                l_emb_char, chardim, peepholes=True, learn_init=False,
                mask_input=l_mask_char)
        else:
            l_lstm_char = lasagne_lstm_nooutput(
                l_emb_char, chardim, peepholes=True, learn_init=False,
                mask_input=l_mask_char)
        # the last hidden state of the character LSTM is the word representation
        l_We = lasagne.layers.SliceLayer(l_lstm_char, -1, 1)
        We = lasagne.layers.get_output(l_We, {l_in_char: char,
                                              l_mask_char: charmask})
    elif params.nntype == "charagram":
        char = T.matrix()
        self.featuremap = self.get_feature_map(data, params.featuretype,
                                               params.cutoff, params.lowercase)
        print "Number of features: ", len(self.featuremap)
        if self.cap:
            l_in_char = lasagne.layers.InputLayer((None, len(self.featuremap) + 2))
        else:
            l_in_char = lasagne.layers.InputLayer((None, len(self.featuremap) + 1))
        if params.numlayers == 1:
            l_We = lasagne.layers.DenseLayer(l_in_char, chardim,
                                             nonlinearity=params.act)
        elif params.numlayers == 2:
            l_1 = lasagne.layers.DenseLayer(l_in_char, chardim,
                                            nonlinearity=params.act)
            l_We = lasagne.layers.DenseLayer(l_1, chardim, nonlinearity=params.act)
        else:
            raise ValueError('Only 1-2 layers are supported currently.')
        We = lasagne.layers.get_output(l_We, {l_in_char: char})
    elif params.nntype == "charcnn":
        l_emb_char = lasagne.layers.EmbeddingLayer(
            l_in_char, input_size=Ce.get_value().shape[0],
            output_size=Ce.get_value().shape[1], W=Ce)
        emb = lasagne.layers.DimshuffleLayer(l_emb_char, (0, 2, 1))
        if params.conv_type == 1:
            conv_params = [(175, 2), (175, 3), (175, 4)]
        else:
            conv_params = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]
        layers = []
        for num_filters, filter_size in conv_params:
            conv = lasagne.layers.Conv1DLayer(emb, num_filters, filter_size,
                                              nonlinearity=params.act)
            pl = lasagne.layers.GlobalPoolLayer(conv, theano.tensor.max)
            pl = lasagne.layers.FlattenLayer(pl)
            layers.append(pl)
        concat = lasagne.layers.ConcatLayer(layers)
        l_We = lasagne.layers.DenseLayer(concat, num_units=chardim,
                                         nonlinearity=params.act)
        We = lasagne.layers.get_output(l_We, {l_in_char: char})
    else:
        l_We = None
        We = None

    # word-level bidirectional LSTM over the character-derived embeddings
    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))
    l_emb_word = lasagne_embedding_layer_2(l_in_word, chardim, We)
    l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, worddim, peepholes=True,
                                            learn_init=False,
                                            mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, worddim, peepholes=True,
                                            learn_init=False,
                                            mask_input=l_mask_word,
                                            backwards=True)
    l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf, (-1, worddim))
    l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb, (-1, worddim))
    concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb])
    l_emb = lasagne.layers.DenseLayer(concat2, num_units=worddim,
                                      nonlinearity=lasagne.nonlinearities.tanh)
    l_out = lasagne.layers.DenseLayer(l_emb, num_units=len(self.tags),
                                      nonlinearity=lasagne.nonlinearities.softmax)

    embg = lasagne.layers.get_output(l_out, {l_in_word: word,
                                             l_mask_word: wordmask})
    embg = embg[idxs]  # select rows given by idxs from the flattened (batch*len, tags) output
    prediction = T.argmax(embg, axis=1)

    self.all_params = lasagne.layers.get_all_params(l_out, trainable=True) \
        + lasagne.layers.get_all_params(l_We, trainable=True)
    reg = 0.5 * params.LC * sum(lasagne.regularization.l2(x)
                                for x in self.all_params)
    cost = T.nnet.categorical_crossentropy(embg, Y)
    cost = T.mean(cost) + reg

    self.feedforward_function = None
    self.scoring_function = None
    self.cost_function = None
    self.train_function = None

    if params.nntype == "charlstm":
        self.feedforward_function = theano.function(
            [char, charmask, word, wordmask, idxs], embg)
        self.scoring_function = theano.function(
            [char, charmask, word, wordmask, idxs], prediction)
        self.cost_function = theano.function(
            [char, charmask, word, wordmask, idxs, Y], cost)
        grads = theano.gradient.grad(cost, self.all_params)
        updates = lasagne.updates.momentum(grads, self.all_params, 0.2,
                                           momentum=0.95)  # same as Ling et al.
        self.train_function = theano.function(
            [char, charmask, word, wordmask, idxs, Y], cost, updates=updates)
    elif params.nntype == "charcnn" or params.nntype == "charagram":
        self.feedforward_function = theano.function(
            [char, word, wordmask, idxs], embg)
        self.scoring_function = theano.function(
            [char, word, wordmask, idxs], prediction)
        self.cost_function = theano.function(
            [char, word, wordmask, idxs, Y], cost)
        grads = theano.gradient.grad(cost, self.all_params)
        updates = lasagne.updates.momentum(grads, self.all_params, 0.2,
                                           momentum=0.95)  # same as Ling et al.
        self.train_function = theano.function(
            [char, word, wordmask, idxs, Y], cost, updates=updates)
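# --- Batching sketch (assumption, not part of the original code) --------------
# All of the models here consume padded int32 index matrices plus float32
# masks. A minimal, hypothetical helper along those lines:

import numpy as np

def pad_batch(seqs, pad=0):
    """Right-pad variable-length index sequences and build a 0/1 float mask."""
    maxlen = max(len(s) for s in seqs)
    idx = np.full((len(seqs), maxlen), pad, dtype='int32')
    mask = np.zeros((len(seqs), maxlen), dtype='float32')
    for i, s in enumerate(seqs):
        idx[i, :len(s)] = s
        mask[i, :len(s)] = 1.0
    return idx, mask

# e.g. word, wordmask = pad_batch([[3, 1, 4], [1, 5]]) gives a 2x3 index
# matrix and a mask whose second row is [1, 1, 0].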
def __init__(self, We_initial, We_pos_initial, params):
    if params.maxval:
        self.nout = params.maxval - params.minval + 1

    p = None
    if params.traintype == "reg" or params.traintype == "rep":
        p = cPickle.load(file(params.regfile, 'rb'))
        print p
        # contains [<TensorType(float64, matrix)>,
        #   W_in_to_ingate, W_hid_to_ingate, b_ingate, W_in_to_forgetgate,
        #   W_hid_to_forgetgate, b_forgetgate, W_in_to_cell, W_hid_to_cell,
        #   b_cell, W_in_to_outgate, W_hid_to_outgate, b_outgate, W_cell_to_ingate,
        #   W_cell_to_forgetgate, W_cell_to_outgate]
    if params.traintype == "reg":
        print "regularizing to parameters"
    if params.traintype == "rep":
        print "not updating embeddings"

    # params
    initial_We = theano.shared(np.asarray(We_initial, dtype=config.floatX))
    We = theano.shared(np.asarray(We_initial, dtype=config.floatX))
    We_pos = theano.shared(np.asarray(We_pos_initial, dtype=config.floatX))
    if params.traintype == "reg":
        initial_We = theano.shared(np.asarray(p[0].get_value(), dtype=config.floatX))
        We = theano.shared(np.asarray(p[0].get_value(), dtype=config.floatX))
        updatewords = True
    if params.traintype == "rep":
        We = theano.shared(np.asarray(p[0].get_value(), dtype=config.floatX))
        updatewords = False

    # symbolic params
    g1batchindices = T.imatrix()
    g2batchindices = T.imatrix()
    g1mask = T.matrix()
    g2mask = T.matrix()
    g1posbatchindices = T.imatrix()
    g2posbatchindices = T.imatrix()
    scores = T.matrix()

    l_in = lasagne.layers.InputLayer((None, None, 1))
    l_mask = lasagne.layers.InputLayer(shape=(None, None))
    l_pos = lasagne.layers.InputLayer((None, None, 1))
    l_emb = lasagne.layers.EmbeddingLayer(
        l_in, input_size=We.get_value().shape[0],
        output_size=We.get_value().shape[1], W=We)
    l_pos_emb = lasagne.layers.EmbeddingLayer(
        l_pos, input_size=We_pos.get_value().shape[0],
        output_size=We_pos.get_value().shape[1], W=We_pos)

    # attention
    llGate = gateLayer([l_in, l_emb], name='llGate')            # 25*50*300
    # attention vector
    llDot = DotSumLayer([llGate, l_pos_emb], name='llDot')      # 25*50
    llSoftMax = softMaxLayer2([l_in, llDot], name='llSoftMax')  # 25*30 mask
    #llSoftMax_out = lasagne.layers.get_output(llSoftMax, {l_in: g1batchindices, l_pos: g1posbatchindices})
    #self.look = theano.function([g1batchindices, g1posbatchindices], llSoftMax_out)
    llAttend = MulLayer([llSoftMax, llGate], name='llAttend')

    #--------------------------
    if params.useoutgate:
        l_lstm = lasagne.layers.LSTMLayer(llAttend, params.layersize,
                                          peepholes=params.usepeep,
                                          learn_init=False, mask_input=l_mask)
    else:
        l_lstm = lasagne_lstm_nooutput(llAttend, params.layersize,
                                       peepholes=params.usepeep,
                                       learn_init=False, mask_input=l_mask)
    l_out = lasagne.layers.SliceLayer(l_lstm, -1, 1)

    embg1 = lasagne.layers.get_output(l_out, {l_in: g1batchindices,
                                              l_pos: g1posbatchindices,
                                              l_mask: g1mask})
    embg2 = lasagne.layers.get_output(l_out, {l_in: g2batchindices,
                                              l_pos: g2posbatchindices,
                                              l_mask: g2mask})

    g1_dot_g2 = embg1 * embg2
    g1_abs_g2 = abs(embg1 - embg2)

    lin_dot = lasagne.layers.InputLayer((None, params.layersize))
    lin_abs = lasagne.layers.InputLayer((None, params.layersize))
    l_sum = lasagne.layers.ConcatLayer([lin_dot, lin_abs])
    l_sigmoid = lasagne.layers.DenseLayer(
        l_sum, params.memsize, nonlinearity=lasagne.nonlinearities.sigmoid)
    l_softmax = lasagne.layers.DenseLayer(l_sigmoid, self.nout,
                                          nonlinearity=T.nnet.softmax)
    X = lasagne.layers.get_output(l_softmax, {lin_dot: g1_dot_g2,
                                              lin_abs: g1_abs_g2})
    Y = T.log(X)

    # KL divergence between the gold score distribution and the model's
    cost = scores * (T.log(scores) - Y)
    cost = cost.sum(axis=1) / float(self.nout)

    # prediction is the expected rating under X
    prediction = 0.
    i = params.minval
    while i <= params.maxval:
        prediction = prediction + i * X[:, i - 1]
        i += 1

    self.network_params = lasagne.layers.get_all_params(l_out, trainable=True) \
        + lasagne.layers.get_all_params(l_softmax, trainable=True)
    self.network_params.pop(0)  # the first parameter is the word embedding matrix We
    self.all_params = lasagne.layers.get_all_params(l_out, trainable=True) \
        + lasagne.layers.get_all_params(l_softmax, trainable=True)

    reg = self.getRegTerm(params, We, initial_We, l_out, l_softmax, p)
    self.trainable = self.getTrainableParams(params)
    cost = T.mean(cost) + reg

    self.feedforward_function = theano.function(
        [g1batchindices, g1posbatchindices, g1mask], embg1)
    self.scoring_function = theano.function(
        [g1batchindices, g1posbatchindices, g1mask,
         g2batchindices, g2posbatchindices, g2mask], prediction)
    self.cost_function = theano.function(
        [scores, g1batchindices, g1posbatchindices, g1mask,
         g2batchindices, g2posbatchindices, g2mask], cost)

    grads = theano.gradient.grad(cost, self.trainable)
    if params.clip:
        grads = [lasagne.updates.norm_constraint(grad, params.clip,
                                                 range(grad.ndim))
                 for grad in grads]
    updates = params.learner(grads, self.trainable, params.eta)
    self.train_function = theano.function(
        [scores, g1batchindices, g1posbatchindices, g1mask,
         g2batchindices, g2posbatchindices, g2mask], cost, updates=updates)
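# --- Target-distribution sketch (assumption, not part of the original code) ---
# The KL cost above expects `scores` to be a distribution over the self.nout
# rating bins. One common construction spreads a real-valued gold score y over
# its two nearest integer bins so that its expectation equals y; the small
# epsilon is an added assumption that keeps T.log(scores) finite for empty bins.

import numpy as np

def score_to_distribution(y, minval, maxval, eps=1e-6):
    """Hypothetical helper: turn a gold score y into a row of `scores`."""
    nout = maxval - minval + 1
    d = np.full((nout,), eps, dtype='float32')
    lo = int(np.floor(y))
    if lo >= maxval:
        d[nout - 1] += 1.0
    else:
        d[lo - minval] += lo + 1 - y   # weight on the lower bin
        d[lo - minval + 1] += y - lo   # weight on the upper bin
    return d / d.sum()  # renormalize after smoothing

# e.g. score_to_distribution(3.4, 1, 5) is ~[0, 0, 0.6, 0.4, 0],
# whose expectation under bin values 1..5 is 3.4.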
def __init__(self, params):
    self.chars = utils.get_character_dict(params.character_file)
    Ce = lasagne.init.Uniform(range=0.5 / len(self.chars))
    Ce = Ce.sample((len(self.chars), params.chardim))
    Ce = theano.shared(np.asarray(Ce, dtype=config.floatX))

    g1batchindices = T.imatrix()
    g2batchindices = T.imatrix()
    p1batchindices = T.imatrix()
    p2batchindices = T.imatrix()
    g1mask = T.matrix()
    g2mask = T.matrix()
    p1mask = T.matrix()
    p2mask = T.matrix()

    l_in = lasagne.layers.InputLayer((None, None))
    l_mask = lasagne.layers.InputLayer(shape=(None, None))
    l_emb = lasagne.layers.EmbeddingLayer(
        l_in, input_size=Ce.get_value().shape[0],
        output_size=Ce.get_value().shape[1], W=Ce)
    if params.outgate:
        l_lstm = lasagne.layers.LSTMLayer(l_emb, params.worddim,
                                          peepholes=params.peepholes,
                                          learn_init=False, mask_input=l_mask)
    else:
        l_lstm = lasagne_lstm_nooutput(l_emb, params.worddim,
                                       peepholes=params.peepholes,
                                       learn_init=False, mask_input=l_mask)
    l_out = lasagne.layers.SliceLayer(l_lstm, -1, 1)

    embg1 = lasagne.layers.get_output(l_out, {l_in: g1batchindices, l_mask: g1mask})
    embg2 = lasagne.layers.get_output(l_out, {l_in: g2batchindices, l_mask: g2mask})
    embp1 = lasagne.layers.get_output(l_out, {l_in: p1batchindices, l_mask: p1mask})
    embp2 = lasagne.layers.get_output(l_out, {l_in: p2batchindices, l_mask: p2mask})

    # cosine similarities (the 1E-6 guards against division by zero)
    g1g2 = (embg1 * embg2).sum(axis=1)
    g1g2norm = T.sqrt(T.sum(embg1**2, axis=1)) * T.sqrt(T.sum(embg2**2, axis=1)) + 1E-6
    g1g2 = g1g2 / g1g2norm
    p1g1 = (embp1 * embg1).sum(axis=1)
    p1g1norm = T.sqrt(T.sum(embp1**2, axis=1)) * T.sqrt(T.sum(embg1**2, axis=1)) + 1E-6
    p1g1 = p1g1 / p1g1norm
    p2g2 = (embp2 * embg2).sum(axis=1)
    p2g2norm = T.sqrt(T.sum(embp2**2, axis=1)) * T.sqrt(T.sum(embg2**2, axis=1)) + 1E-6
    p2g2 = p2g2 / p2g2norm

    # max-margin hinge against the two negative examples
    costp1g1 = params.margin - g1g2 + p1g1
    costp1g1 = costp1g1 * (costp1g1 > 0)
    costp2g2 = params.margin - g1g2 + p2g2
    costp2g2 = costp2g2 * (costp2g2 > 0)
    cost = costp1g1 + costp2g2

    self.all_params = lasagne.layers.get_all_params(l_out, trainable=True)
    l2 = 0.5 * params.LC * sum(lasagne.regularization.l2(x)
                               for x in self.all_params)
    cost = T.mean(cost) + l2

    self.feedforward_function = theano.function([g1batchindices, g1mask], embg1)
    self.cost_function = theano.function(
        [g1batchindices, g2batchindices, p1batchindices, p2batchindices,
         g1mask, g2mask, p1mask, p2mask], cost)
    prediction = g1g2
    self.scoring_function = theano.function(
        [g1batchindices, g2batchindices, g1mask, g2mask], prediction)

    grads = theano.gradient.grad(cost, self.all_params)
    if params.clip:
        grads = [lasagne.updates.norm_constraint(grad, params.clip,
                                                 range(grad.ndim))
                 for grad in grads]
    updates = params.learner(grads, self.all_params, params.eta)
    self.train_function = theano.function(
        [g1batchindices, g2batchindices, p1batchindices, p2batchindices,
         g1mask, g2mask, p1mask, p2mask], cost, updates=updates)
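# --- Scoring sketch (assumption, not part of the original code) ---------------
# Hypothetical use of the character-level model above: map two strings through
# self.chars (assumed to map a character to an int index), pad them, and read
# off the cosine similarity from scoring_function. `model` is assumed to be a
# constructed instance of the class above.

import numpy as np

def score_pair(model, s1, s2):
    seq1 = [model.chars[c] for c in s1 if c in model.chars]
    seq2 = [model.chars[c] for c in s2 if c in model.chars]
    maxlen = max(len(seq1), len(seq2))
    idx = np.zeros((2, maxlen), dtype='int32')
    mask = np.zeros((2, maxlen), dtype='float32')
    for i, s in enumerate((seq1, seq2)):
        idx[i, :len(s)] = s
        mask[i, :len(s)] = 1.0
    # scoring_function([g1], [g2], [g1mask], [g2mask]) -> cosine per pair
    return model.scoring_function(idx[:1], idx[1:], mask[:1], mask[1:])[0]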
def __init__(self, We_initial, params):
    p = None
    if params.traintype == "reg" or params.traintype == "rep":
        p = cPickle.load(file(params.regfile, 'rb'))
        print p
        # contains [<TensorType(float64, matrix)>,
        #   W_in_to_ingate, W_hid_to_ingate, b_ingate, W_in_to_forgetgate,
        #   W_hid_to_forgetgate, b_forgetgate, W_in_to_cell, W_hid_to_cell,
        #   b_cell, W_in_to_outgate, W_hid_to_outgate, b_outgate, W_cell_to_ingate,
        #   W_cell_to_forgetgate, W_cell_to_outgate]
    if params.traintype == "reg":
        print "regularizing to parameters"
    if params.traintype == "rep":
        print "not updating embeddings"

    # params
    initial_We = theano.shared(np.asarray(We_initial, dtype=config.floatX))
    We = theano.shared(np.asarray(We_initial, dtype=config.floatX))
    if params.traintype == "reg":
        initial_We = theano.shared(np.asarray(p[0].get_value(), dtype=config.floatX))
        We = theano.shared(np.asarray(p[0].get_value(), dtype=config.floatX))
        updatewords = True
    if params.traintype == "rep":
        We = theano.shared(np.asarray(p[0].get_value(), dtype=config.floatX))
        updatewords = False

    g1batchindices = T.imatrix()
    g1mask = T.matrix()
    scores = T.matrix()

    l_in = lasagne.layers.InputLayer((None, None, 1))
    l_mask = lasagne.layers.InputLayer(shape=(None, None))
    l_emb = lasagne.layers.EmbeddingLayer(
        l_in, input_size=We.get_value().shape[0],
        output_size=We.get_value().shape[1], W=We)
    if params.useoutgate:
        l_lstm = lasagne.layers.LSTMLayer(l_emb, params.layersize,
                                          peepholes=params.usepeep,
                                          learn_init=False, mask_input=l_mask)
    else:
        l_lstm = lasagne_lstm_nooutput(l_emb, params.layersize,
                                       peepholes=params.usepeep,
                                       learn_init=False, mask_input=l_mask)

    if params.traintype == "reg" or params.traintype == "rep":
        # rebuild the LSTM with gates initialized from the pickled model
        if params.useoutgate:
            W_in_to_ingate = np.asarray(p[1].get_value(), dtype=config.floatX)
            W_hid_to_ingate = np.asarray(p[2].get_value(), dtype=config.floatX)
            b_ingate = np.asarray(p[3].get_value(), dtype=config.floatX)
            W_in_to_forgetgate = np.asarray(p[4].get_value(), dtype=config.floatX)
            W_hid_to_forgetgate = np.asarray(p[5].get_value(), dtype=config.floatX)
            b_forgetgate = np.asarray(p[6].get_value(), dtype=config.floatX)
            W_in_to_cell = np.asarray(p[7].get_value(), dtype=config.floatX)
            W_hid_to_cell = np.asarray(p[8].get_value(), dtype=config.floatX)
            b_cell = np.asarray(p[9].get_value(), dtype=config.floatX)
            W_in_to_outgate = np.asarray(p[10].get_value(), dtype=config.floatX)
            W_hid_to_outgate = np.asarray(p[11].get_value(), dtype=config.floatX)
            b_outgate = np.asarray(p[12].get_value(), dtype=config.floatX)
            W_cell_to_ingate = np.asarray(p[13].get_value(), dtype=config.floatX)
            W_cell_to_forgetgate = np.asarray(p[14].get_value(), dtype=config.floatX)
            W_cell_to_outgate = np.asarray(p[15].get_value(), dtype=config.floatX)
            ingate = lasagne.layers.Gate(W_in=W_in_to_ingate,
                                         W_hid=W_hid_to_ingate,
                                         W_cell=W_cell_to_ingate, b=b_ingate)
            forgetgate = lasagne.layers.Gate(W_in=W_in_to_forgetgate,
                                             W_hid=W_hid_to_forgetgate,
                                             W_cell=W_cell_to_forgetgate,
                                             b=b_forgetgate)
            outgate = lasagne.layers.Gate(W_in=W_in_to_outgate,
                                          W_hid=W_hid_to_outgate,
                                          W_cell=W_cell_to_outgate, b=b_outgate)
            cell = lasagne.layers.Gate(W_in=W_in_to_cell, W_hid=W_hid_to_cell,
                                       W_cell=None, b=b_cell,
                                       nonlinearity=lasagne.nonlinearities.tanh)
            l_lstm = lasagne.layers.LSTMLayer(l_emb, params.layersize,
                                              ingate=ingate,
                                              forgetgate=forgetgate,
                                              outgate=outgate, cell=cell,
                                              peepholes=params.usepeep,
                                              learn_init=False,
                                              mask_input=l_mask)
        else:
            W_in_to_ingate = np.asarray(p[1].get_value(), dtype=config.floatX)
            W_hid_to_ingate = np.asarray(p[2].get_value(), dtype=config.floatX)
            b_ingate = np.asarray(p[3].get_value(), dtype=config.floatX)
            W_in_to_forgetgate = np.asarray(p[4].get_value(), dtype=config.floatX)
            W_hid_to_forgetgate = np.asarray(p[5].get_value(), dtype=config.floatX)
            b_forgetgate = np.asarray(p[6].get_value(), dtype=config.floatX)
            W_in_to_cell = np.asarray(p[7].get_value(), dtype=config.floatX)
            W_hid_to_cell = np.asarray(p[8].get_value(), dtype=config.floatX)
            b_cell = np.asarray(p[9].get_value(), dtype=config.floatX)
            W_cell_to_ingate = np.asarray(p[10].get_value(), dtype=config.floatX)
            W_cell_to_forgetgate = np.asarray(p[11].get_value(), dtype=config.floatX)
            ingate = lasagne.layers.Gate(W_in=W_in_to_ingate,
                                         W_hid=W_hid_to_ingate,
                                         W_cell=W_cell_to_ingate, b=b_ingate)
            forgetgate = lasagne.layers.Gate(W_in=W_in_to_forgetgate,
                                             W_hid=W_hid_to_forgetgate,
                                             W_cell=W_cell_to_forgetgate,
                                             b=b_forgetgate)
            cell = lasagne.layers.Gate(W_in=W_in_to_cell, W_hid=W_hid_to_cell,
                                       W_cell=None, b=b_cell,
                                       nonlinearity=lasagne.nonlinearities.tanh)
            l_lstm = lasagne_lstm_nooutput(l_emb, params.layersize,
                                           ingate=ingate,
                                           forgetgate=forgetgate, cell=cell,
                                           peepholes=params.usepeep,
                                           learn_init=False, mask_input=l_mask)

    l_out = lasagne.layers.SliceLayer(l_lstm, -1, 1)
    embg = lasagne.layers.get_output(l_out, {l_in: g1batchindices, l_mask: g1mask})

    l_in2 = lasagne.layers.InputLayer((None, We.get_value().shape[1]))
    l_sigmoid = lasagne.layers.DenseLayer(
        l_in2, params.memsize, nonlinearity=lasagne.nonlinearities.sigmoid)
    l_softmax = lasagne.layers.DenseLayer(l_sigmoid, 2, nonlinearity=T.nnet.softmax)
    X = lasagne.layers.get_output(l_softmax, {l_in2: embg})

    cost = T.nnet.categorical_crossentropy(X, scores)
    prediction = T.argmax(X, axis=1)

    self.network_params = lasagne.layers.get_all_params(l_out, trainable=True) \
        + lasagne.layers.get_all_params(l_softmax, trainable=True)
    self.network_params.pop(0)  # the first parameter is the word embedding matrix We
    self.all_params = lasagne.layers.get_all_params(l_out, trainable=True) \
        + lasagne.layers.get_all_params(l_softmax, trainable=True)

    reg = self.getRegTerm(params, We, initial_We, l_out, l_softmax, p)
    self.trainable = self.getTrainableParams(params)
    cost = T.mean(cost) + reg

    self.feedforward_function = theano.function([g1batchindices, g1mask], embg)
    self.scoring_function = theano.function([g1batchindices, g1mask], prediction)
    self.cost_function = theano.function([scores, g1batchindices, g1mask], cost)

    grads = theano.gradient.grad(cost, self.trainable)
    if params.clip:
        grads = [lasagne.updates.norm_constraint(grad, params.clip,
                                                 range(grad.ndim))
                 for grad in grads]
    updates = params.learner(grads, self.trainable, params.eta)
    self.train_function = theano.function([scores, g1batchindices, g1mask],
                                          cost, updates=updates)
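# --- Training-step sketch (assumption, not part of the original code) ---------
# The classifier above is trained with categorical cross-entropy against a
# matrix `scores`, so each row should be a (here: one-hot) distribution over
# the two classes. A hypothetical helper and usage line:

import numpy as np

def one_hot_scores(labels, nclasses=2):
    """Hypothetical helper: one-hot (n, nclasses) float32 target rows."""
    labels = np.asarray(labels, dtype='int32')
    scores = np.zeros((len(labels), nclasses), dtype='float32')
    scores[np.arange(len(labels)), labels] = 1.0
    return scores

# e.g., with `model`, `g1`, and `g1mask` assumed to exist:
# cost = model.train_function(one_hot_scores([0, 1, 1]), g1, g1mask)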
def __init__(self, We_initial, params):
    initial_We = theano.shared(np.asarray(We_initial, dtype=config.floatX))
    We = theano.shared(np.asarray(We_initial, dtype=config.floatX))

    g1batchindices = T.imatrix()
    g2batchindices = T.imatrix()
    p1batchindices = T.imatrix()
    p2batchindices = T.imatrix()
    g1mask = T.matrix()
    g2mask = T.matrix()
    p1mask = T.matrix()
    p2mask = T.matrix()

    l_in = lasagne.layers.InputLayer((None, None, 1))
    l_mask = lasagne.layers.InputLayer(shape=(None, None))
    l_emb = lasagne.layers.EmbeddingLayer(
        l_in, input_size=We.get_value().shape[0],
        output_size=We.get_value().shape[1], W=We)
    if params.outgate:
        l_lstm = lasagne.layers.LSTMLayer(l_emb, params.layersize,
                                          peepholes=params.peephole,
                                          learn_init=False, mask_input=l_mask)
    else:
        l_lstm = lasagne_lstm_nooutput(l_emb, params.layersize,
                                       peepholes=params.peephole,
                                       learn_init=False, mask_input=l_mask)
    l_out = lasagne.layers.SliceLayer(l_lstm, -1, 1)

    embg1 = lasagne.layers.get_output(l_out, {l_in: g1batchindices, l_mask: g1mask})
    embg2 = lasagne.layers.get_output(l_out, {l_in: g2batchindices, l_mask: g2mask})
    embp1 = lasagne.layers.get_output(l_out, {l_in: p1batchindices, l_mask: p1mask})
    embp2 = lasagne.layers.get_output(l_out, {l_in: p2batchindices, l_mask: p2mask})

    # cosine similarities
    g1g2 = (embg1 * embg2).sum(axis=1)
    g1g2norm = T.sqrt(T.sum(embg1**2, axis=1)) * T.sqrt(T.sum(embg2**2, axis=1))
    g1g2 = g1g2 / g1g2norm
    p1g1 = (embp1 * embg1).sum(axis=1)
    p1g1norm = T.sqrt(T.sum(embp1**2, axis=1)) * T.sqrt(T.sum(embg1**2, axis=1))
    p1g1 = p1g1 / p1g1norm
    p2g2 = (embp2 * embg2).sum(axis=1)
    p2g2norm = T.sqrt(T.sum(embp2**2, axis=1)) * T.sqrt(T.sum(embg2**2, axis=1))
    p2g2 = p2g2 / p2g2norm

    # max-margin hinge against the two negative examples
    costp1g1 = params.margin - g1g2 + p1g1
    costp1g1 = costp1g1 * (costp1g1 > 0)
    costp2g2 = params.margin - g1g2 + p2g2
    costp2g2 = costp2g2 * (costp2g2 > 0)
    cost = costp1g1 + costp2g2

    network_params = lasagne.layers.get_all_params(l_lstm, trainable=True)
    network_params.pop(0)  # the first parameter is the word embedding matrix We
    self.all_params = lasagne.layers.get_all_params(l_lstm, trainable=True)

    l2 = 0.5 * params.LC * sum(lasagne.regularization.l2(x)
                               for x in network_params)
    if params.updatewords:
        word_reg = 0.5 * params.LW * lasagne.regularization.l2(We - initial_We)
        cost = T.mean(cost) + l2 + word_reg
    else:
        cost = T.mean(cost) + l2

    self.feedforward_function = theano.function([g1batchindices, g1mask], embg1)
    self.cost_function = theano.function(
        [g1batchindices, g2batchindices, p1batchindices, p2batchindices,
         g1mask, g2mask, p1mask, p2mask], cost)
    prediction = g1g2
    self.scoring_function = theano.function(
        [g1batchindices, g2batchindices, g1mask, g2mask], prediction)

    # if the word embeddings are frozen, exclude them from the updated set;
    # everything else about the training function is identical in both cases
    if not params.updatewords:
        self.all_params = network_params
    grads = theano.gradient.grad(cost, self.all_params)
    if params.clip:
        grads = [lasagne.updates.norm_constraint(grad, params.clip,
                                                 range(grad.ndim))
                 for grad in grads]
    updates = params.learner(grads, self.all_params, params.eta)
    self.train_function = theano.function(
        [g1batchindices, g2batchindices, p1batchindices, p2batchindices,
         g1mask, g2mask, p1mask, p2mask], cost, updates=updates)
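# --- Cost-check sketch (assumption, not part of the original code) ------------
# A numpy mirror of the symbolic hinge cost above, handy for unit tests on
# small arrays. Note that this variant of the model omits the 1E-6 guard on
# the cosine denominators, so all-zero embeddings would divide by zero here too.

import numpy as np

def hinge_cost_np(eg1, eg2, ep1, ep2, margin):
    def cos(a, b):
        return (a * b).sum(axis=1) / (
            np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))
    g1g2 = cos(eg1, eg2)
    c1 = np.maximum(0.0, margin - g1g2 + cos(ep1, eg1))
    c2 = np.maximum(0.0, margin - g1g2 + cos(ep2, eg2))
    return (c1 + c2).mean()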
def __init__(self, We_initial, params):
    if params.maxval:
        self.nout = params.maxval - params.minval + 1

    p = None
    if params.traintype == "reg" or params.traintype == "rep":
        p = cPickle.load(file(params.regfile, 'rb'))
        print p
        # contains [<TensorType(float64, matrix)>,
        #   W_in_to_ingate, W_hid_to_ingate, b_ingate, W_in_to_forgetgate,
        #   W_hid_to_forgetgate, b_forgetgate, W_in_to_cell, W_hid_to_cell,
        #   b_cell, W_in_to_outgate, W_hid_to_outgate, b_outgate, W_cell_to_ingate,
        #   W_cell_to_forgetgate, W_cell_to_outgate]
    if params.traintype == "reg":
        print "regularizing to parameters"
    if params.traintype == "rep":
        print "not updating embeddings"

    # params
    initial_We = theano.shared(np.asarray(We_initial, dtype=config.floatX))
    We = theano.shared(np.asarray(We_initial, dtype=config.floatX))
    if params.traintype == "reg":
        initial_We = theano.shared(np.asarray(p[0].get_value(), dtype=config.floatX))
        We = theano.shared(np.asarray(p[0].get_value(), dtype=config.floatX))
        updatewords = True
    if params.traintype == "rep":
        We = theano.shared(np.asarray(p[0].get_value(), dtype=config.floatX))
        updatewords = False

    # symbolic params
    g1batchindices = T.imatrix()
    g2batchindices = T.imatrix()
    g1mask = T.matrix()
    g2mask = T.matrix()
    scores = T.matrix()

    l_in = lasagne.layers.InputLayer((None, None, 1))
    l_mask = lasagne.layers.InputLayer(shape=(None, None))
    l_emb = lasagne.layers.EmbeddingLayer(
        l_in, input_size=We.get_value().shape[0],
        output_size=We.get_value().shape[1], W=We)
    if params.useoutgate:
        l_lstm = lasagne.layers.LSTMLayer(l_emb, params.layersize,
                                          peepholes=params.usepeep,
                                          learn_init=False, mask_input=l_mask)
    else:
        l_lstm = lasagne_lstm_nooutput(l_emb, params.layersize,
                                       peepholes=params.usepeep,
                                       learn_init=False, mask_input=l_mask)

    if params.traintype == "reg" or params.traintype == "rep":
        # rebuild the LSTM with gates initialized from the pickled model
        if params.useoutgate:
            W_in_to_ingate = np.asarray(p[1].get_value(), dtype=config.floatX)
            W_hid_to_ingate = np.asarray(p[2].get_value(), dtype=config.floatX)
            b_ingate = np.asarray(p[3].get_value(), dtype=config.floatX)
            W_in_to_forgetgate = np.asarray(p[4].get_value(), dtype=config.floatX)
            W_hid_to_forgetgate = np.asarray(p[5].get_value(), dtype=config.floatX)
            b_forgetgate = np.asarray(p[6].get_value(), dtype=config.floatX)
            W_in_to_cell = np.asarray(p[7].get_value(), dtype=config.floatX)
            W_hid_to_cell = np.asarray(p[8].get_value(), dtype=config.floatX)
            b_cell = np.asarray(p[9].get_value(), dtype=config.floatX)
            W_in_to_outgate = np.asarray(p[10].get_value(), dtype=config.floatX)
            W_hid_to_outgate = np.asarray(p[11].get_value(), dtype=config.floatX)
            b_outgate = np.asarray(p[12].get_value(), dtype=config.floatX)
            W_cell_to_ingate = np.asarray(p[13].get_value(), dtype=config.floatX)
            W_cell_to_forgetgate = np.asarray(p[14].get_value(), dtype=config.floatX)
            W_cell_to_outgate = np.asarray(p[15].get_value(), dtype=config.floatX)
            ingate = lasagne.layers.Gate(W_in=W_in_to_ingate,
                                         W_hid=W_hid_to_ingate,
                                         W_cell=W_cell_to_ingate, b=b_ingate)
            forgetgate = lasagne.layers.Gate(W_in=W_in_to_forgetgate,
                                             W_hid=W_hid_to_forgetgate,
                                             W_cell=W_cell_to_forgetgate,
                                             b=b_forgetgate)
            outgate = lasagne.layers.Gate(W_in=W_in_to_outgate,
                                          W_hid=W_hid_to_outgate,
                                          W_cell=W_cell_to_outgate, b=b_outgate)
            cell = lasagne.layers.Gate(W_in=W_in_to_cell, W_hid=W_hid_to_cell,
                                       W_cell=None, b=b_cell,
                                       nonlinearity=lasagne.nonlinearities.tanh)
            l_lstm = lasagne.layers.LSTMLayer(l_emb, params.layersize,
                                              ingate=ingate,
                                              forgetgate=forgetgate,
                                              outgate=outgate, cell=cell,
                                              peepholes=params.usepeep,
                                              learn_init=False,
                                              mask_input=l_mask)
        else:
            W_in_to_ingate = np.asarray(p[1].get_value(), dtype=config.floatX)
            W_hid_to_ingate = np.asarray(p[2].get_value(), dtype=config.floatX)
            b_ingate = np.asarray(p[3].get_value(), dtype=config.floatX)
            W_in_to_forgetgate = np.asarray(p[4].get_value(), dtype=config.floatX)
            W_hid_to_forgetgate = np.asarray(p[5].get_value(), dtype=config.floatX)
            b_forgetgate = np.asarray(p[6].get_value(), dtype=config.floatX)
            W_in_to_cell = np.asarray(p[7].get_value(), dtype=config.floatX)
            W_hid_to_cell = np.asarray(p[8].get_value(), dtype=config.floatX)
            b_cell = np.asarray(p[9].get_value(), dtype=config.floatX)
            W_cell_to_ingate = np.asarray(p[10].get_value(), dtype=config.floatX)
            W_cell_to_forgetgate = np.asarray(p[11].get_value(), dtype=config.floatX)
            ingate = lasagne.layers.Gate(W_in=W_in_to_ingate,
                                         W_hid=W_hid_to_ingate,
                                         W_cell=W_cell_to_ingate, b=b_ingate)
            forgetgate = lasagne.layers.Gate(W_in=W_in_to_forgetgate,
                                             W_hid=W_hid_to_forgetgate,
                                             W_cell=W_cell_to_forgetgate,
                                             b=b_forgetgate)
            cell = lasagne.layers.Gate(W_in=W_in_to_cell, W_hid=W_hid_to_cell,
                                       W_cell=None, b=b_cell,
                                       nonlinearity=lasagne.nonlinearities.tanh)
            l_lstm = lasagne_lstm_nooutput(l_emb, params.layersize,
                                           ingate=ingate,
                                           forgetgate=forgetgate, cell=cell,
                                           peepholes=params.usepeep,
                                           learn_init=False, mask_input=l_mask)

    l_out = lasagne.layers.SliceLayer(l_lstm, -1, 1)
    embg1 = lasagne.layers.get_output(l_out, {l_in: g1batchindices, l_mask: g1mask})
    embg2 = lasagne.layers.get_output(l_out, {l_in: g2batchindices, l_mask: g2mask})

    g1_dot_g2 = embg1 * embg2
    g1_abs_g2 = abs(embg1 - embg2)

    lin_dot = lasagne.layers.InputLayer((None, params.layersize))
    lin_abs = lasagne.layers.InputLayer((None, params.layersize))
    l_sum = lasagne.layers.ConcatLayer([lin_dot, lin_abs])
    l_sigmoid = lasagne.layers.DenseLayer(
        l_sum, params.memsize, nonlinearity=lasagne.nonlinearities.sigmoid)

    if params.task == "sim":
        l_softmax = lasagne.layers.DenseLayer(l_sigmoid, self.nout,
                                              nonlinearity=T.nnet.softmax)
        X = lasagne.layers.get_output(l_softmax, {lin_dot: g1_dot_g2,
                                                  lin_abs: g1_abs_g2})
        Y = T.log(X)
        # KL divergence between the gold score distribution and the model's
        cost = scores * (T.log(scores) - Y)
        cost = cost.sum(axis=1) / float(self.nout)
        # prediction is the expected rating under X
        prediction = 0.
        i = params.minval
        while i <= params.maxval:
            prediction = prediction + i * X[:, i - 1]
            i += 1
    elif params.task == "ent":
        l_softmax = lasagne.layers.DenseLayer(l_sigmoid, 3,
                                              nonlinearity=T.nnet.softmax)
        X = lasagne.layers.get_output(l_softmax, {lin_dot: g1_dot_g2,
                                                  lin_abs: g1_abs_g2})
        cost = theano.tensor.nnet.categorical_crossentropy(X, scores)
        prediction = T.argmax(X, axis=1)
    else:
        raise ValueError('Params.task not set correctly.')

    self.network_params = lasagne.layers.get_all_params(l_out, trainable=True) \
        + lasagne.layers.get_all_params(l_softmax, trainable=True)
    self.network_params.pop(0)  # the first parameter is the word embedding matrix We
    self.all_params = lasagne.layers.get_all_params(l_out, trainable=True) \
        + lasagne.layers.get_all_params(l_softmax, trainable=True)

    reg = self.getRegTerm(params, We, initial_We, l_out, l_softmax, p)
    self.trainable = self.getTrainableParams(params)
    cost = T.mean(cost) + reg

    self.feedforward_function = theano.function([g1batchindices, g1mask], embg1)
    self.scoring_function = theano.function(
        [g1batchindices, g2batchindices, g1mask, g2mask], prediction)
    self.cost_function = theano.function(
        [scores, g1batchindices, g2batchindices, g1mask, g2mask], cost)

    grads = theano.gradient.grad(cost, self.trainable)
    if params.clip:
        grads = [lasagne.updates.norm_constraint(grad, params.clip,
                                                 range(grad.ndim))
                 for grad in grads]
    updates = params.learner(grads, self.trainable, params.eta)
    self.train_function = theano.function(
        [scores, g1batchindices, g2batchindices, g1mask, g2mask],
        cost, updates=updates)
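# --- Prediction sketch (assumption, not part of the original code) ------------
# For the "sim" task the prediction above is the expected rating under X.
# The column index i - 1 in the loop implicitly assumes params.minval == 1
# (otherwise it would index past the self.nout output columns). A vectorized
# numpy mirror of that loop under the same assumption:

import numpy as np

def expected_rating(X, minval, maxval):
    # assumes minval == 1, matching the i - 1 column indexing above
    vals = np.arange(minval, maxval + 1, dtype='float32')
    return (X * vals).sum(axis=1)

# e.g. expected_rating(np.array([[0., 0., 0.6, 0.4, 0.]]), 1, 5) -> [3.4]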