def build_rnn(x_sym, hid_init_sym, hid2_init_sym, seq_length, vocab_size, rnn_size):
    l_input = L.InputLayer(input_var=x_sym, shape=(None, seq_length))
    l_input_hid = L.InputLayer(input_var=hid_init_sym, shape=(None, rnn_size))
    l_input_hid2 = L.InputLayer(input_var=hid2_init_sym, shape=(None, rnn_size))
    l_input = L.EmbeddingLayer(l_input, input_size=vocab_size, output_size=rnn_size)
    l_rnn = L.LSTMLayer(l_input, num_units=rnn_size, hid_init=l_input_hid)  # , cell_init=l_init_cell)
    h = L.DropoutLayer(l_rnn, p=dropout_prob)
    l_rnn2 = L.LSTMLayer(h, num_units=rnn_size, hid_init=l_input_hid2)  # , cell_init=l_init_cell2)
    h = L.DropoutLayer(l_rnn2, p=dropout_prob)
    # Before the decoder layer, we need to reshape the sequence into the batch dimension,
    # so that timesteps are decoded independently.
    l_shp = L.ReshapeLayer(h, (-1, rnn_size))
    pred = NCELayer(l_shp, num_units=vocab_size, Z=Z)
    pred = L.ReshapeLayer(pred, (-1, seq_length, vocab_size))
    return l_rnn, l_rnn2, pred
def build_network(self, vocab_size, input_var, mask_var, W_init):
    l_in = L.InputLayer(shape=(None, None, 1), input_var=input_var)
    l_mask = L.InputLayer(shape=(None, None), input_var=mask_var)
    l_embed = L.EmbeddingLayer(l_in, input_size=vocab_size, output_size=EMBED_DIM, W=W_init)

    # two stacked bidirectional LSTM layers over the masked sequence
    l_fwd_1 = L.LSTMLayer(l_embed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_mask,
                          gradient_steps=GRAD_STEPS, precompute_input=True)
    l_bkd_1 = L.LSTMLayer(l_embed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_mask,
                          gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True)
    l_all_1 = L.concat([l_fwd_1, l_bkd_1], axis=2)
    l_fwd_2 = L.LSTMLayer(l_all_1, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_mask,
                          gradient_steps=GRAD_STEPS, precompute_input=True)
    l_bkd_2 = L.LSTMLayer(l_all_1, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_mask,
                          gradient_steps=GRAD_STEPS, precompute_input=True, backwards=True)

    # sum the last step of each forward LSTM with the first step of its backward twin
    l_fwd_1_slice = L.SliceLayer(l_fwd_1, -1, 1)
    l_bkd_1_slice = L.SliceLayer(l_bkd_1, 0, 1)
    y_1 = L.ElemwiseSumLayer([l_fwd_1_slice, l_bkd_1_slice])
    l_fwd_2_slice = L.SliceLayer(l_fwd_2, -1, 1)
    l_bkd_2_slice = L.SliceLayer(l_bkd_2, 0, 1)
    y_2 = L.ElemwiseSumLayer([l_fwd_2_slice, l_bkd_2_slice])
    y = L.concat([y_1, y_2], axis=1)

    g = L.DenseLayer(y, num_units=EMBED_DIM, nonlinearity=lasagne.nonlinearities.tanh)
    # the output layer reuses the transposed embedding matrix (tied weights)
    l_out = L.DenseLayer(g, num_units=vocab_size, W=l_embed.W.T,
                         nonlinearity=lasagne.nonlinearities.softmax)
    return l_out
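# A minimal, self-contained usage sketch (not part of any snippet above): how a
# Lasagne LSTM graph like build_network's is typically compiled into a Theano
# prediction function. All dimensions and variable names here are illustrative
# assumptions.
import numpy as np
import theano
import theano.tensor as T
import lasagne
import lasagne.layers as L

vocab_size, embed_dim, num_hidden, seq_len = 100, 16, 32, 20

input_var = T.imatrix('input')   # (batch, seq_len) word indices
mask_var = T.matrix('mask')      # (batch, seq_len) 0/1 mask

l_in = L.InputLayer(shape=(None, seq_len), input_var=input_var)
l_mask = L.InputLayer(shape=(None, seq_len), input_var=mask_var)
l_emb = L.EmbeddingLayer(l_in, input_size=vocab_size, output_size=embed_dim)
l_lstm = L.LSTMLayer(l_emb, num_hidden, mask_input=l_mask, only_return_final=True)
l_out = L.DenseLayer(l_lstm, num_units=vocab_size,
                     nonlinearity=lasagne.nonlinearities.softmax)

probs = L.get_output(l_out, deterministic=True)   # deterministic=True disables dropout
predict_fn = theano.function([input_var, mask_var], probs)

x = np.random.randint(0, vocab_size, size=(4, seq_len)).astype('int32')
m = np.ones((4, seq_len), dtype=theano.config.floatX)
print(predict_fn(x, m).shape)   # (4, vocab_size)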
def __init__(self, n_words, dim_emb, num_units, n_classes, w_emb=None,
             dropout=0.2, use_final=False, lr=0.001, pretrain=None):
    self.n_words = n_words
    self.dim_emb = dim_emb
    self.num_units = num_units
    self.n_classes = n_classes
    self.lr = lr
    if w_emb is None:
        w_emb = init.Normal()

    self.l_x = layers.InputLayer((None, None))
    self.l_m = layers.InputLayer((None, None))
    self.l_emb = layers.EmbeddingLayer(self.l_x, n_words, dim_emb, W=w_emb)
    self.l_ebd = self.l_emb
    if dropout:
        self.l_emb = layers.dropout(self.l_emb, dropout)

    if use_final:
        # encode the sequence by its final LSTM hidden state
        self.l_enc = layers.LSTMLayer(self.l_emb, num_units, mask_input=self.l_m,
                                      only_return_final=True, grad_clipping=10.0,
                                      gradient_steps=400)
        self.l_rnn = self.l_enc
    else:
        # keep all per-step states and pool them with MeanLayer
        self.l_enc = layers.LSTMLayer(self.l_emb, num_units, mask_input=self.l_m,
                                      only_return_final=False, grad_clipping=10.0,
                                      gradient_steps=400)
        self.l_rnn = self.l_enc
        self.l_enc = MeanLayer(self.l_enc, self.l_m)

    if dropout:
        self.l_enc = layers.dropout(self.l_enc, dropout)
    self.l_y = layers.DenseLayer(self.l_enc, n_classes,
                                 nonlinearity=nonlinearities.softmax)
    if pretrain:
        self.load_pretrain(pretrain)
def recurrent(input_var=None, num_units=512, batch_size=64, seq_length=1, grad_clip=100):
    recurrent = []
    theano_rng = RandomStreams(rng.randint(2 ** 15))
    # we want noise to match the tanh activation range ([-1, 1])
    noise = theano_rng.uniform(size=(batch_size, seq_length, num_units), low=-1.0, high=1.0)
    input_var = noise if input_var is None else input_var
    recurrent.append(ll.InputLayer(shape=(batch_size, seq_length, num_units), input_var=input_var))
    recurrent.append(ll.LSTMLayer(recurrent[-1], num_units, grad_clipping=grad_clip))  # tanh is the default
    recurrent.append(ll.SliceLayer(recurrent[-1], -1, 1))
    recurrent.append(ll.ReshapeLayer(recurrent[-1], ([0], 1, [1])))
    for layer in recurrent:
        print layer.output_shape
    print ""
    return recurrent
def __init__(self, input, n_hidden=500, grad_clip=100., only_return_final=True):
    self.input = input
    gate_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(),
                                  W_hid=lasagne.init.Orthogonal(),
                                  b=initialize_parameters()[1])
    cell_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(),
                                  W_hid=lasagne.init.Orthogonal(),
                                  W_cell=None,
                                  b=initialize_parameters()[1],
                                  nonlinearity=lasagne.nonlinearities.tanh)
    self.output = layers.LSTMLayer(self.input, n_hidden,
                                   ingate=gate_parameters,
                                   forgetgate=gate_parameters,
                                   cell=cell_parameters,
                                   outgate=gate_parameters,
                                   grad_clipping=grad_clip,
                                   only_return_final=only_return_final)
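# Note on the pattern above (a hedged aside, not from the original code): a
# lasagne.layers.Gate object only bundles initializers, so passing the same
# `gate_parameters` instance to ingate, forgetgate and outgate does NOT tie the
# weights -- each gate still receives its own shared variable created from the
# same initializer. A quick check with hypothetical sizes:
import theano.tensor as T
import lasagne
import lasagne.layers as layers

x = T.tensor3('x')
l_in = layers.InputLayer((None, None, 8), input_var=x)
gate = layers.Gate(W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal())
l_lstm = layers.LSTMLayer(l_in, 16, ingate=gate, forgetgate=gate, outgate=gate)

print(l_lstm.W_in_to_ingate is l_lstm.W_in_to_forgetgate)   # False: separate parameters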
def ptb_lstm(input_var, vocabulary_size, hidden_size, seq_len, num_layers,
             dropout, batch_size):
    l_input = L.InputLayer(shape=(batch_size, seq_len), input_var=input_var)
    l_embed = L.EmbeddingLayer(l_input, vocabulary_size, hidden_size,
                               W=init.Uniform(1.0))
    l_lstms = []
    for i in range(num_layers):
        l_lstm = L.LSTMLayer(l_embed if i == 0 else l_lstms[-1], hidden_size,
                             ingate=L.Gate(W_in=init.GlorotUniform(),
                                           W_hid=init.Orthogonal()),
                             forgetgate=L.Gate(W_in=init.GlorotUniform(),
                                               W_hid=init.Orthogonal(),
                                               b=init.Constant(1.0)),
                             cell=L.Gate(W_in=init.GlorotUniform(),
                                         W_hid=init.Orthogonal(),
                                         W_cell=None,
                                         nonlinearity=lasagne.nonlinearities.tanh),
                             outgate=L.Gate(W_in=init.GlorotUniform(),
                                            W_hid=init.Orthogonal()))
        l_lstms.append(l_lstm)
    l_drop = L.DropoutLayer(l_lstms[-1], dropout)
    l_out = L.DenseLayer(l_drop, num_units=vocabulary_size, num_leading_axes=2)
    l_out = L.ReshapeLayer(l_out,
                           (l_out.output_shape[0] * l_out.output_shape[1],
                            l_out.output_shape[2]))
    l_out = L.NonlinearityLayer(l_out, nonlinearity=lasagne.nonlinearities.softmax)
    return l_out
def layer_LSTM(l_hid, hiddensize, nonlinearity, backwards=False, grad_clipping=50, name=""):
    '''A custom LSTM layer that seems to converge faster.'''
    ingate = ll.Gate(W_in=lasagne.init.Orthogonal(1.0), W_hid=lasagne.init.Orthogonal(1.0))
    forgetgate = ll.Gate(W_in=lasagne.init.Orthogonal(1.0), W_hid=lasagne.init.Orthogonal(1.0))
    outgate = ll.Gate(W_in=lasagne.init.Orthogonal(1.0), W_hid=lasagne.init.Orthogonal(1.0))
    cell = ll.Gate(W_cell=None, W_in=lasagne.init.Orthogonal(1.0),
                   W_hid=lasagne.init.Orthogonal(1.0), nonlinearity=nonlinearity)
    # The final nonlinearity should be tanh, otherwise it doesn't converge (why?)
    # peepholes=True by default
    fwd = ll.LSTMLayer(l_hid, num_units=hiddensize, backwards=backwards,
                       ingate=ingate, forgetgate=forgetgate, outgate=outgate, cell=cell,
                       grad_clipping=grad_clipping,
                       nonlinearity=lasagne.nonlinearities.tanh, name=name)
    return fwd
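# Usage sketch for the helper above (assumes `ll` is lasagne.layers, as in the
# snippet; the incoming layer and sizes are illustrative assumptions): calling it
# twice gives a bidirectional pair that can be concatenated on the feature axis.
import lasagne
import lasagne.layers as ll

l_hid = ll.InputLayer((None, None, 32))   # hypothetical (batch, seq_len, features) input
l_fwd = layer_LSTM(l_hid, hiddensize=64, nonlinearity=lasagne.nonlinearities.tanh,
                   backwards=False, name="lstm_fwd")
l_bwd = layer_LSTM(l_hid, hiddensize=64, nonlinearity=lasagne.nonlinearities.tanh,
                   backwards=True, name="lstm_bwd")
l_bi = ll.ConcatLayer([l_fwd, l_bwd], axis=2)   # (batch, seq_len, 128)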
def makeRNN(xInputRNN, hiddenInitRNN, hidden2InitRNN, sequenceLen, vocabularySize, neuralNetworkSz):
    input_Layer = L.InputLayer(input_var=xInputRNN, shape=(None, sequenceLen))
    hidden_Layer = L.InputLayer(input_var=hiddenInitRNN, shape=(None, neuralNetworkSz))
    hidden_Layer2 = L.InputLayer(input_var=hidden2InitRNN, shape=(None, neuralNetworkSz))
    input_Layer = L.EmbeddingLayer(input_Layer, input_size=vocabularySize, output_size=neuralNetworkSz)
    RNN_Layer = L.LSTMLayer(input_Layer, num_units=neuralNetworkSz, hid_init=hidden_Layer)
    h = L.DropoutLayer(RNN_Layer, p=dropOutProbability)
    RNN_Layer2 = L.LSTMLayer(h, num_units=neuralNetworkSz, hid_init=hidden_Layer2)
    h = L.DropoutLayer(RNN_Layer2, p=dropOutProbability)
    # flatten timesteps into the batch dimension before the NCE decoder
    layerShape = L.ReshapeLayer(h, (-1, neuralNetworkSz))
    predictions = NCE(layerShape, num_units=vocabularySize, Z=Z)
    predictions = L.ReshapeLayer(predictions, (-1, sequenceLen, vocabularySize))
    return RNN_Layer, RNN_Layer2, predictions
def __init__(self, incoming, num_units, mask_input, grad_clipping=0, **kwargs):
    incomings = [incoming, mask_input]
    super(MeanLstmLayer, self).__init__(incomings)
    self.num_units = num_units
    self.lstm_layer = layers.LSTMLayer(incoming, num_units=self.num_units,
                                       mask_input=mask_input,
                                       grad_clipping=grad_clipping, **kwargs)
def integrate_captions(input_var=T.imatrix()):
    '''
    :param batch_size: number of images
    :param nb_caption: number of captions used per image
    '''
    ###############################
    # Build Network Configuration #
    ###############################
    print('... Integrating captions to the model')

    # Input of the network : shape = (nb_caption, seq_length)
    network = layers.InputLayer(shape=(None, None), input_var=input_var)

    # Embedding layer : shape = (nb_caption, seq_length, 400)
    vocab_length = get_vocab_length()
    network = layers.EmbeddingLayer(network, vocab_length, output_size=400)

    # LSTM layer : shape = (nb_caption, 500)
    gate_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(),
                                  W_hid=lasagne.init.Orthogonal(),
                                  b=lasagne.init.Constant(0.))
    cell_parameters = layers.Gate(W_in=lasagne.init.Orthogonal(),
                                  W_hid=lasagne.init.Orthogonal(),
                                  W_cell=None,
                                  b=lasagne.init.Constant(0.),
                                  nonlinearity=nonlinearities.tanh)
    network = layers.LSTMLayer(network, num_units=500,
                               ingate=gate_parameters, forgetgate=gate_parameters,
                               cell=cell_parameters, outgate=gate_parameters,
                               grad_clipping=100., only_return_final=True)

    # Dense Layer : shape = (nb_caption, 500)
    network = layers.DenseLayer(network, num_units=500)

    # Reshape layer : shape = (nb_caption, 500, 1, 1)
    network = layers.ReshapeLayer(network, (-1, 500, 1, 1))

    return network
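# Possible downstream usage of integrate_captions (a sketch; the symbolic caption
# variable and the compiled function are assumptions, not part of the snippet):
import theano
import theano.tensor as T
import lasagne.layers as layers

caption_var = T.imatrix('captions')                 # (nb_caption, seq_length)
caption_net = integrate_captions(caption_var)
caption_feat = layers.get_output(caption_net)       # (nb_caption, 500, 1, 1)
caption_fn = theano.function([caption_var], caption_feat)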
def clone(src_net, dst_net, mask_input): """ Clones a lasagne neural network, keeping weights tied. For all layers of src_net in turn, starting at the first: 1. creates a copy of the layer, 2. reuses the original objects for weights and 3. appends the new layer to dst_net. InputLayers are ignored. Recurrent layers (LSTMLayer) are passed mask_input. """ logger.info("Net to be cloned:") for l in layers.get_all_layers(src_net): logger.info(" - {} ({}):".format(l.name, l)) logger.info("Starting to clone..") for l in layers.get_all_layers(src_net): logger.info("src_net[...]: {} ({}):".format(l.name, l)) if type(l) == layers.InputLayer: logger.info(' - skipping') continue if type(l) == layers.DenseLayer: dst_net = layers.DenseLayer( dst_net, num_units=l.num_units, W=l.W, b=l.b, nonlinearity=l.nonlinearity, name=l.name+'2', ) elif type(l) == layers.EmbeddingLayer: dst_net = layers.EmbeddingLayer( dst_net, l.input_size, l.output_size, W=l.W, name=l.name+'2', ) elif type(l) == layers.LSTMLayer: dst_net = layers.LSTMLayer( dst_net, l.num_units, ingate=layers.Gate( W_in=l.W_in_to_ingate, W_hid=l.W_hid_to_ingate, W_cell=l.W_cell_to_ingate, b=l.b_ingate, nonlinearity=l.nonlinearity_ingate ), forgetgate=layers.Gate( W_in=l.W_in_to_forgetgate, W_hid=l.W_hid_to_forgetgate, W_cell=l.W_cell_to_forgetgate, b=l.b_forgetgate, nonlinearity=l.nonlinearity_forgetgate ), cell=layers.Gate( W_in=l.W_in_to_cell, W_hid=l.W_hid_to_cell, W_cell=None, b=l.b_cell, nonlinearity=l.nonlinearity_cell ), outgate=layers.Gate( W_in=l.W_in_to_outgate, W_hid=l.W_hid_to_outgate, W_cell=l.W_cell_to_outgate, b=l.b_outgate, nonlinearity=l.nonlinearity_outgate ), nonlinearity=l.nonlinearity, cell_init=l.cell_init, hid_init=l.hid_init, backwards=l.backwards, learn_init=l.learn_init, peepholes=l.peepholes, gradient_steps=l.gradient_steps, grad_clipping=l.grad_clipping, unroll_scan=l.unroll_scan, precompute_input=l.precompute_input, # mask_input=l.mask_input, # AttributeError: 'LSTMLayer' object has no attribute 'mask_input' name=l.name+'2', mask_input=mask_input, ) elif type(l) == layers.SliceLayer: dst_net = layers.SliceLayer( dst_net, indices=l.slice, axis=l.axis, name=l.name+'2', ) else: raise ValueError("Unhandled layer: {}".format(l)) new_layer = layers.get_all_layers(dst_net)[-1] logger.info('dst_net[...]: {} ({})'.format(new_layer, new_layer.name)) logger.info("Result of cloning:") for l in layers.get_all_layers(dst_net): logger.info(" - {} ({}):".format(l.name, l)) return dst_net
input_sequence = T.matrix('token sequence', 'int32')
input_mask = T.neq(input_sequence, src_voc.PAD)
target_values = T.matrix('actual next token', 'int32')
target_mask = T.neq(target_values, dst_voc.PAD)

CODE_SIZE = 512

l_in = lasagne.layers.InputLayer(shape=(None, None), input_var=input_sequence)
l_mask = lasagne.layers.InputLayer(shape=(None, None), input_var=input_mask)

# encoder
l_emb = L.EmbeddingLayer(l_in, src_voc.len, 128)
l_rnn = L.LSTMLayer(l_emb, 256, nonlinearity=T.tanh, mask_input=l_mask)
l_rnn = L.concat([l_emb, l_rnn], axis=-1)
l_encoded = l_rnn = L.LSTMLayer(l_rnn, CODE_SIZE, nonlinearity=T.tanh, mask_input=l_mask)

# decoder inputs (target sequence shifted by one step for teacher forcing)
l_trans = L.InputLayer((None, None), input_var=target_values[:, :-1])
l_trans_mask = L.InputLayer((None, None), input_var=target_mask[:, :-1])

from agentnet.agent.recurrence import Recurrence
from agentnet.memory import AttentionLayer, LSTMCell
from agentnet.resolver import ProbabilisticResolver, GreedyResolver


class AutoLSTMCell:
def buildModel(self): print(' -- Building...') x_init = sparse.csr_matrix('x', dtype='float32') y_init = T.imatrix('y') gx_init = sparse.csr_matrix('gx', dtype='float32') gy_init = T.ivector('gy') gz_init = T.vector('gz') mask_init = T.fmatrix('subMask') # step train x_input = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=x_init) x_to_label = layers.SparseLayer(x_input, self.y.shape[1], nonlinearity=lg.nonlinearities.softmax) x_to_emd = layers.SparseLayer(x_input, self.embedding_size) W = x_to_emd.W x_to_emd = layers.DenseLayer(x_to_emd, self.y.shape[1], nonlinearity=lg.nonlinearities.softmax) x_concat = lgl.ConcatLayer([x_to_label, x_to_emd], axis=1) x_concat = layers.DenseLayer(x_concat, self.y.shape[1], nonlinearity=lg.nonlinearities.softmax) pred = lgl.get_output(x_concat) step_loss = lgo.categorical_crossentropy(pred, y_init).mean() hid_loss = lgl.get_output(x_to_label) step_loss += lgo.categorical_crossentropy(hid_loss, y_init).mean() emd_loss = lgl.get_output(x_to_emd) step_loss += lgo.categorical_crossentropy(emd_loss, y_init).mean() step_params = lgl.get_all_params(x_concat) step_updates = lg.updates.sgd(step_loss, step_params, learning_rate=self.step_learning_rate) self.step_train = theano.function([x_init, y_init], step_loss, updates=step_updates) self.test_fn = theano.function([x_init], pred) # supervised train gx_input = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=gx_init) gx_to_emd = layers.SparseLayer(gx_input, self.embedding_size, W=W) gx_to_emd = lgl.DenseLayer(gx_to_emd, self.num_ver, nonlinearity=lg.nonlinearities.softmax) gx_pred = lgl.get_output(gx_to_emd) g_loss = lgo.categorical_crossentropy(gx_pred, gy_init).sum() sup_params = lgl.get_all_params(gx_to_emd) sup_updates = lg.updates.sgd(g_loss, sup_params, learning_rate=self.sup_learning_rate) self.sup_train = theano.function([gx_init, gy_init, gz_init], g_loss, updates=sup_updates, on_unused_input='ignore') # handle lstm input cross_entropy = lgo.categorical_crossentropy(gx_pred, gy_init) cross_entropy = T.reshape(cross_entropy, (1, self.subpath_num), ndim=None) mask_input = lgl.InputLayer(shape=(None, self.window_size + 1), input_var=mask_init) sub_path_batch1 = sparse.csr_matrix('x', dtype='float32') sub_path_input1 = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=sub_path_batch1) sub_path_batch2 = sparse.csr_matrix('x', dtype='float32') sub_path_input2 = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=sub_path_batch2) sub_path_batch3 = sparse.csr_matrix('x', dtype='float32') sub_path_input3 = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=sub_path_batch3) sub_path_batch4 = sparse.csr_matrix('x', dtype='float32') sub_path_input4 = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=sub_path_batch4) sub_path_emd1 = layers.SparseLayer(sub_path_input1, self.embedding_size, W=W) sub_path_emd1 = T.reshape(lgl.get_output(sub_path_emd1), (self.subpath_num, 1, self.embedding_size)) sub_path_emd2 = layers.SparseLayer(sub_path_input2, self.embedding_size, W=W) sub_path_emd2 = T.reshape(lgl.get_output(sub_path_emd2), (self.subpath_num, 1, self.embedding_size)) sub_path_emd3 = layers.SparseLayer(sub_path_input3, self.embedding_size, W=W) sub_path_emd3 = T.reshape(lgl.get_output(sub_path_emd3), (self.subpath_num, 1, self.embedding_size)) sub_path_emd4 = layers.SparseLayer(sub_path_input4, self.embedding_size, W=W) sub_path_emd4 = T.reshape(lgl.get_output(sub_path_emd4), (self.subpath_num, 1, self.embedding_size)) sub_path_concat = T.concatenate([sub_path_emd1, sub_path_emd2, 
sub_path_emd3, sub_path_emd4], axis=1) sub_path_concat_layer = lgl.InputLayer(shape=(None, self.window_size + 1, self.embedding_size), input_var=sub_path_concat) # lstm layer lstm_layer = lgl.LSTMLayer(sub_path_concat_layer, self.lstm_hidden_units, grad_clipping=3, mask_input=mask_input) # handle path weight max1 = T.mean(lgl.get_output(lstm_layer), axis=1) max2 = T.mean(max1, axis=1) max2_init = T.fcol('max2') max2_init = T.reshape(max2, ((self.subpath_num, 1))) max2_input = lgl.InputLayer(shape=(self.subpath_num, 1), input_var=max2_init) max2_input = lgl.BatchNormLayer(max2_input) path_weight = lgl.get_output(max2_input) path_weight = lg.nonlinearities.sigmoid(path_weight) path_weight = 1 + 0.3 * path_weight # unsupervised train reweight_loss = T.dot(cross_entropy, path_weight)[0][0] lstm_params = lgl.get_all_params(lstm_layer, trainable=True) lstm_updates = lg.updates.sgd(reweight_loss, lstm_params, learning_rate=0.01) self.lstm_fn = theano.function([gx_init, gy_init, gz_init, sub_path_batch1, sub_path_batch2, sub_path_batch3, sub_path_batch4, mask_init], reweight_loss, updates=lstm_updates, on_unused_input='ignore') alpha_updates = lg.updates.sgd(reweight_loss, sup_params, learning_rate=0.001) self.alpha_fn = theano.function([gx_init, gy_init, gz_init, sub_path_batch1, sub_path_batch2, sub_path_batch3, sub_path_batch4, mask_init], reweight_loss, updates=alpha_updates, on_unused_input='ignore') print(' -- Done!')
def build_network(self, vocab_size, input_var, mask_var, docidx_var, docidx_mask, skip_connect=True): l_in = L.InputLayer(shape=(None, None, 1), input_var=input_var) l_mask = L.InputLayer(shape=(None, None), input_var=mask_var) l_embed = L.EmbeddingLayer(l_in, input_size=vocab_size, output_size=EMBED_DIM, W=self.params['W_emb']) l_embed_noise = L.dropout(l_embed, p=DROPOUT_RATE) # NOTE: Moved initialization of forget gate biases to init_params #forget_gate_1 = L.Gate(b=lasagne.init.Constant(3)) #forget_gate_2 = L.Gate(b=lasagne.init.Constant(3)) # NOTE: LSTM layer provided by Lasagne is slightly different from that used in DeepMind's paper. # In the paper the cell-to-* weights are not diagonal. # the 1st lstm layer in_gate = L.Gate(W_in=self.params['W_lstm1_xi'], W_hid=self.params['W_lstm1_hi'], W_cell=self.params['W_lstm1_ci'], b=self.params['b_lstm1_i'], nonlinearity=lasagne.nonlinearities.sigmoid) forget_gate = L.Gate(W_in=self.params['W_lstm1_xf'], W_hid=self.params['W_lstm1_hf'], W_cell=self.params['W_lstm1_cf'], b=self.params['b_lstm1_f'], nonlinearity=lasagne.nonlinearities.sigmoid) out_gate = L.Gate(W_in=self.params['W_lstm1_xo'], W_hid=self.params['W_lstm1_ho'], W_cell=self.params['W_lstm1_co'], b=self.params['b_lstm1_o'], nonlinearity=lasagne.nonlinearities.sigmoid) cell_gate = L.Gate(W_in=self.params['W_lstm1_xc'], W_hid=self.params['W_lstm1_hc'], W_cell=None, b=self.params['b_lstm1_c'], nonlinearity=lasagne.nonlinearities.tanh) l_fwd_1 = L.LSTMLayer(l_embed_noise, NUM_HIDDEN, ingate=in_gate, forgetgate=forget_gate, cell=cell_gate, outgate=out_gate, peepholes=True, grad_clipping=GRAD_CLIP, mask_input=l_mask, gradient_steps=GRAD_STEPS, precompute_input=True) # the 2nd lstm layer if skip_connect: # construct skip connection from the lookup table to the 2nd layer batch_size, seq_len, _ = input_var.shape # concatenate the last dimension of l_fwd_1 and embed l_fwd_1_shp = L.ReshapeLayer(l_fwd_1, (-1, NUM_HIDDEN)) l_embed_shp = L.ReshapeLayer(l_embed, (-1, EMBED_DIM)) to_next_layer = L.ReshapeLayer( L.concat([l_fwd_1_shp, l_embed_shp], axis=1), (batch_size, seq_len, NUM_HIDDEN + EMBED_DIM)) else: to_next_layer = l_fwd_1 to_next_layer_noise = L.dropout(to_next_layer, p=DROPOUT_RATE) in_gate = L.Gate(W_in=self.params['W_lstm2_xi'], W_hid=self.params['W_lstm2_hi'], W_cell=self.params['W_lstm2_ci'], b=self.params['b_lstm2_i'], nonlinearity=lasagne.nonlinearities.sigmoid) forget_gate = L.Gate(W_in=self.params['W_lstm2_xf'], W_hid=self.params['W_lstm2_hf'], W_cell=self.params['W_lstm2_cf'], b=self.params['b_lstm2_f'], nonlinearity=lasagne.nonlinearities.sigmoid) out_gate = L.Gate(W_in=self.params['W_lstm2_xo'], W_hid=self.params['W_lstm2_ho'], W_cell=self.params['W_lstm2_co'], b=self.params['b_lstm2_o'], nonlinearity=lasagne.nonlinearities.sigmoid) cell_gate = L.Gate(W_in=self.params['W_lstm2_xc'], W_hid=self.params['W_lstm2_hc'], W_cell=None, b=self.params['b_lstm2_c'], nonlinearity=lasagne.nonlinearities.tanh) l_fwd_2 = L.LSTMLayer(to_next_layer_noise, NUM_HIDDEN, ingate=in_gate, forgetgate=forget_gate, cell=cell_gate, outgate=out_gate, peepholes=True, grad_clipping=GRAD_CLIP, mask_input=l_mask, gradient_steps=GRAD_STEPS, precompute_input=True) # slice final states of both lstm layers l_fwd_1_slice = L.SliceLayer(l_fwd_1, -1, 1) l_fwd_2_slice = L.SliceLayer(l_fwd_2, -1, 1) # g will be used to score the words based on their embeddings g = L.DenseLayer(L.concat([l_fwd_1_slice, l_fwd_2_slice], axis=1), num_units=EMBED_DIM, W=self.params['W_dense'], b=self.params['b_dense'], 
nonlinearity=lasagne.nonlinearities.tanh) ## get outputs #g_out = L.get_output(g) # B x D #g_out_val = L.get_output(g, deterministic=True) # B x D ## compute softmax probs #probs,_ = theano.scan(fn=lambda g,d,dm,W: T.nnet.softmax(T.dot(g,W[d,:].T)*dm), # outputs_info=None, # sequences=[g_out,docidx_var,docidx_mask], # non_sequences=self.params['W_emb']) #predicted_probs = probs.reshape(docidx_var.shape) # B x N #probs_val,_ = theano.scan(fn=lambda g,d,dm,W: T.nnet.softmax(T.dot(g,W[d,:].T)*dm), # outputs_info=None, # sequences=[g_out_val,docidx_var,docidx_mask], # non_sequences=self.params['W_emb']) #predicted_probs_val = probs_val.reshape(docidx_var.shape) # B x N #return predicted_probs, predicted_probs_val # W is shared with the lookup table l_out = L.DenseLayer(g, num_units=vocab_size, W=self.params['W_emb'].T, nonlinearity=lasagne.nonlinearities.softmax, b=None) return l_out
def build_model(vocab_size, doc_var, qry_var, doc_mask_var, qry_mask_var,
                W_init=lasagne.init.Normal()):
    l_doc_in = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
    l_qry_in = L.InputLayer(shape=(None, None, 1), input_var=qry_var)

    l_doc_embed = L.EmbeddingLayer(l_doc_in, vocab_size, EMBED_DIM, W=W_init)
    l_qry_embed = L.EmbeddingLayer(l_qry_in, vocab_size, EMBED_DIM, W=l_doc_embed.W)

    l_doc_mask = L.InputLayer(shape=(None, None), input_var=doc_mask_var)
    l_qry_mask = L.InputLayer(shape=(None, None), input_var=qry_mask_var)

    l_doc_fwd = L.LSTMLayer(l_doc_embed, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                            mask_input=l_doc_mask, gradient_steps=GRAD_STEPS,
                            precompute_input=True)
    l_doc_bkd = L.LSTMLayer(l_doc_embed, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                            mask_input=l_doc_mask, gradient_steps=GRAD_STEPS,
                            precompute_input=True, backwards=True)
    l_qry_fwd = L.LSTMLayer(l_qry_embed, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                            mask_input=l_qry_mask, gradient_steps=GRAD_STEPS,
                            precompute_input=True)
    l_qry_bkd = L.LSTMLayer(l_qry_embed, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                            mask_input=l_qry_mask, gradient_steps=GRAD_STEPS,
                            precompute_input=True, backwards=True)

    # last step of the forward pass, first step of the backward pass
    l_doc_fwd_slice = L.SliceLayer(l_doc_fwd, -1, 1)
    l_doc_bkd_slice = L.SliceLayer(l_doc_bkd, 0, 1)
    l_qry_fwd_slice = L.SliceLayer(l_qry_fwd, -1, 1)
    l_qry_bkd_slice = L.SliceLayer(l_qry_bkd, 0, 1)

    r = L.DenseLayer(L.ElemwiseSumLayer([l_doc_fwd_slice, l_doc_bkd_slice]),
                     num_units=NUM_HIDDEN, nonlinearity=lasagne.nonlinearities.tanh)
    u = L.DenseLayer(L.ElemwiseSumLayer([l_qry_fwd_slice, l_qry_bkd_slice]),
                     num_units=NUM_HIDDEN, nonlinearity=lasagne.nonlinearities.tanh)

    g = L.DenseLayer(L.concat([r, u], axis=1), num_units=EMBED_DIM,
                     W=lasagne.init.GlorotNormal(),
                     nonlinearity=lasagne.nonlinearities.tanh)
    # output weights are tied to the (transposed) embedding matrix
    l_out = L.DenseLayer(g, num_units=vocab_size, W=l_doc_embed.W.T,
                         nonlinearity=lasagne.nonlinearities.softmax, b=None)
    return l_out
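# Compilation sketch for build_model above (assumes the EMBED_DIM / NUM_HIDDEN /
# GRAD_CLIP / GRAD_STEPS constants are defined; the answer variable, loss and
# optimizer are illustrative choices, not taken from the original code):
import theano
import theano.tensor as T
import lasagne
import lasagne.layers as L

doc_var, qry_var = T.itensor3('doc'), T.itensor3('qry')            # (batch, len, 1)
doc_mask_var, qry_mask_var = T.matrix('doc_mask'), T.matrix('qry_mask')
answer_var = T.ivector('answer')                                    # gold word index per example

l_out = build_model(10000, doc_var, qry_var, doc_mask_var, qry_mask_var)

probs = L.get_output(l_out)
loss = lasagne.objectives.categorical_crossentropy(probs, answer_var).mean()
params = L.get_all_params(l_out, trainable=True)
updates = lasagne.updates.adam(loss, params, learning_rate=0.001)

train_fn = theano.function([doc_var, qry_var, doc_mask_var, qry_mask_var, answer_var],
                           loss, updates=updates)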
def get_char2word(self, ic, avg=False): suf = '_avg' if avg else '' ec = L.EmbeddingLayer( ic, self.args.vc, self.args.nc, name='ec' + suf, W=HeNormal() if not avg else Constant()) # (100, 24, 32, 16) ec.params[ec.W].remove('regularizable') if self.args.char_model == 'CNN': lds = L.dimshuffle(ec, (0, 3, 1, 2)) # (100, 16, 24, 32) ls = [] for n in self.args.ngrams: lconv = L.Conv2DLayer( lds, self.args.nf, (1, n), untie_biases=True, W=HeNormal('relu') if not avg else Constant(), name='conv_%d' % n + suf) # (100, 64/4, 24, 32-n+1) lpool = L.MaxPool2DLayer( lconv, (1, self.args.max_len - n + 1)) # (100, 64, 24, 1) lpool = L.flatten(lpool, outdim=3) # (100, 16, 24) lpool = L.dimshuffle(lpool, (0, 2, 1)) # (100, 24, 16) ls.append(lpool) xc = L.concat(ls, axis=2) # (100, 24, 64) return xc elif self.args.char_model == 'LSTM': ml = L.ExpressionLayer( ic, lambda x: T.neq(x, 0)) # mask layer (100, 24, 32) ml = L.reshape(ml, (-1, self.args.max_len)) # (2400, 32) gate_params = L.recurrent.Gate(W_in=Orthogonal(), W_hid=Orthogonal()) cell_params = L.recurrent.Gate(W_in=Orthogonal(), W_hid=Orthogonal(), W_cell=None, nonlinearity=tanh) lstm_in = L.reshape( ec, (-1, self.args.max_len, self.args.nc)) # (2400, 32, 16) lstm_f = L.LSTMLayer( lstm_in, self.args.nw / 2, mask_input=ml, grad_clipping=10., learn_init=True, peepholes=False, precompute_input=True, ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, # unroll_scan=True, only_return_final=True, name='forward' + suf) # (2400, 64) lstm_b = L.LSTMLayer( lstm_in, self.args.nw / 2, mask_input=ml, grad_clipping=10., learn_init=True, peepholes=False, precompute_input=True, ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, # unroll_scan=True, only_return_final=True, backwards=True, name='backward' + suf) # (2400, 64) remove_reg(lstm_f) remove_reg(lstm_b) if avg: set_zero(lstm_f) set_zero(lstm_b) xc = L.concat([lstm_f, lstm_b], axis=1) # (2400, 128) xc = L.reshape(xc, (-1, self.args.sw, self.args.nw)) # (100, 24, 256) return xc
def __init__(self, train_raw, test_raw, dim, mode, l2, l1, batch_norm, dropout, batch_size, ihm_C, los_C, ph_C, decomp_C, partition, nbins, **kwargs): print "==> not used params in network class:", kwargs.keys() self.train_raw = train_raw self.test_raw = test_raw self.dim = dim self.mode = mode self.l2 = l2 self.l1 = l1 self.batch_norm = batch_norm self.dropout = dropout self.batch_size = batch_size self.ihm_C = ihm_C self.los_C = los_C self.ph_C = ph_C self.decomp_C = decomp_C self.nbins = nbins if (partition == 'log'): self.get_bin = metrics.get_bin_log self.get_estimate = metrics.get_estimate_log else: assert self.nbins == 10 self.get_bin = metrics.get_bin_custom self.get_estimate = metrics.get_estimate_custom self.train_batch_gen = self.get_batch_gen(self.train_raw) self.test_batch_gen = self.get_batch_gen(self.test_raw) self.input_var = T.tensor3('X') self.input_lens = T.ivector('L') self.ihm_pos = T.ivector('ihm_pos') self.ihm_mask = T.ivector('ihm_mask') self.ihm_label = T.ivector('ihm_label') self.los_mask = T.imatrix('los_mask') self.los_label = T.matrix('los_label') # for regression #self.los_label = T.imatrix('los_label') self.ph_label = T.imatrix('ph_label') self.decomp_mask = T.imatrix('decomp_mask') self.decomp_label = T.imatrix('decomp_label') print "==> Building neural network" # common network network = layers.InputLayer((None, None, self.train_raw[0][0].shape[1]), input_var=self.input_var) if (self.dropout > 0): network = layers.DropoutLayer(network, p=self.dropout) network = layers.LSTMLayer(incoming=network, num_units=dim, only_return_final=False, grad_clipping=10, ingate=lasagne.layers.Gate( W_in=Orthogonal(), W_hid=Orthogonal(), W_cell=Normal(0.1)), forgetgate=lasagne.layers.Gate( W_in=Orthogonal(), W_hid=Orthogonal(), W_cell=Normal(0.1)), cell=lasagne.layers.Gate(W_cell=None, nonlinearity=lasagne.nonlinearities.tanh, W_in=Orthogonal(), W_hid=Orthogonal()), outgate=lasagne.layers.Gate( W_in=Orthogonal(), W_hid=Orthogonal(), W_cell=Normal(0.1))) if (self.dropout > 0): network = layers.DropoutLayer(network, p=self.dropout) lstm_output = layers.get_output(network) self.params = layers.get_all_params(network, trainable=True) self.reg_params = layers.get_all_params(network, regularizable=True) # for each example in minibatch take the last output last_outputs = [] for index in range(self.batch_size): last_outputs.append(lstm_output[index, self.input_lens[index]-1, :]) last_outputs = T.stack(last_outputs) # take 48h outputs for fixed mortality task mid_outputs = [] for index in range(self.batch_size): mid_outputs.append(lstm_output[index, self.ihm_pos[index], :]) mid_outputs = T.stack(mid_outputs) # in-hospital mortality related network ihm_network = layers.InputLayer((None, dim), input_var=mid_outputs) ihm_network = layers.DenseLayer(incoming=ihm_network, num_units=2, nonlinearity=softmax) self.ihm_prediction = layers.get_output(ihm_network) self.ihm_det_prediction = layers.get_output(ihm_network, deterministic=True) self.params += layers.get_all_params(ihm_network, trainable=True) self.reg_params += layers.get_all_params(ihm_network, regularizable=True) self.ihm_loss = (self.ihm_mask * categorical_crossentropy(self.ihm_prediction, self.ihm_label)).mean() # length of stay related network # Regression los_network = layers.InputLayer((None, None, dim), input_var=lstm_output) los_network = layers.ReshapeLayer(los_network, (-1, dim)) los_network = layers.DenseLayer(incoming=los_network, num_units=1, nonlinearity=rectify) los_network = layers.ReshapeLayer(los_network, 
(lstm_output.shape[0], -1)) self.los_prediction = layers.get_output(los_network) self.los_det_prediction = layers.get_output(los_network, deterministic=True) self.params += layers.get_all_params(los_network, trainable=True) self.reg_params += layers.get_all_params(los_network, regularizable=True) self.los_loss = (self.los_mask * squared_error(self.los_prediction, self.los_label)).mean(axis=1).mean(axis=0) # phenotype related network ph_network = layers.InputLayer((None, dim), input_var=last_outputs) ph_network = layers.DenseLayer(incoming=ph_network, num_units=25, nonlinearity=sigmoid) self.ph_prediction = layers.get_output(ph_network) self.ph_det_prediction = layers.get_output(ph_network, deterministic=True) self.params += layers.get_all_params(ph_network, trainable=True) self.reg_params += layers.get_all_params(ph_network, regularizable=True) self.ph_loss = nn_utils.multilabel_loss(self.ph_prediction, self.ph_label) # decompensation related network decomp_network = layers.InputLayer((None, None, dim), input_var=lstm_output) decomp_network = layers.ReshapeLayer(decomp_network, (-1, dim)) decomp_network = layers.DenseLayer(incoming=decomp_network, num_units=2, nonlinearity=softmax) decomp_network = layers.ReshapeLayer(decomp_network, (lstm_output.shape[0], -1, 2)) self.decomp_prediction = layers.get_output(decomp_network)[:, :, 1] self.decomp_det_prediction = layers.get_output(decomp_network, deterministic=True)[:, :, 1] self.params += layers.get_all_params(decomp_network, trainable=True) self.reg_params += layers.get_all_params(decomp_network, regularizable=True) self.decomp_loss = nn_utils.multilabel_loss_with_mask(self.decomp_prediction, self.decomp_label, self.decomp_mask) """ data = next(self.train_batch_gen) print max(data[1]) print lstm_output.eval({self.input_var:data[0]}).shape exit() """ if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.reg_params) else: self.loss_l2 = T.constant(0) if self.l1 > 0: self.loss_l1 = self.l1 * nn_utils.l1_reg(self.reg_params) else: self.loss_l1 = T.constant(0) self.reg_loss = self.loss_l1 + self.loss_l2 self.loss = (ihm_C * self.ihm_loss + los_C * self.los_loss + ph_C * self.ph_loss + decomp_C * self.decomp_loss + self.reg_loss) #updates = lasagne.updates.adadelta(self.loss, self.params, # learning_rate=0.001) #updates = lasagne.updates.momentum(self.loss, self.params, # learning_rate=0.00003) #updates = lasagne.updates.adam(self.loss, self.params) updates = lasagne.updates.adam(self.loss, self.params, beta1=0.5, learning_rate=0.0001) # from DCGAN paper #updates = lasagne.updates.nesterov_momentum(loss, params, momentum=0.9, # learning_rate=0.001, all_inputs = [self.input_var, self.input_lens, self.ihm_pos, self.ihm_mask, self.ihm_label, self.los_mask, self.los_label, self.ph_label, self.decomp_mask, self.decomp_label] train_outputs = [self.ihm_prediction, self.los_prediction, self.ph_prediction, self.decomp_prediction, self.loss, self.ihm_loss, self.los_loss, self.ph_loss, self.decomp_loss, self.reg_loss] test_outputs = [self.ihm_det_prediction, self.los_det_prediction, self.ph_det_prediction, self.decomp_det_prediction, self.loss, self.ihm_loss, self.los_loss, self.ph_loss, self.decomp_loss, self.reg_loss] ## compiling theano functions if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function(inputs=all_inputs, outputs=train_outputs, updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=all_inputs, outputs=test_outputs)
def build_model(vmap, nclasses=2, embedding_dim=50, nhidden=256, batchsize=None, invar=None, maskvar=None, bidirectional=True, pool=True, grad_clip=100, maxlen=MAXLEN): V = len(vmap) W = lasagne.init.Normal() # Input Layer # TODO: should be (batchsize, maxlen, vocab_size) l_in = layer.InputLayer((batchsize, maxlen, V), input_var=invar) l_mask = layer.InputLayer((batchsize, maxlen), input_var=maskvar) ASSUME = {l_in: (200, 140, 94), l_mask: (200, 140)} print 'Input Layer' print 'output:', get_output_shape(l_in, ASSUME) print 'output(mask):', get_output_shape(l_mask, ASSUME) print # Embedding Layer l_emb = layer.EmbeddingLayer(l_in, input_size=V, output_size=embedding_dim, W=W) print 'Embedding Layer' print 'output:', get_output_shape(l_emb, ASSUME) gate_params = layer.recurrent.Gate(W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), b=lasagne.init.Constant(0.)) cell_params = layer.recurrent.Gate( W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), W_cell=None, b=lasagne.init.Constant(0.), nonlinearity=lasagne.nonlinearities.tanh) l_fwd = layer.LSTMLayer(l_emb, num_units=nhidden, grad_clipping=grad_clip, nonlinearity=lasagne.nonlinearities.tanh, mask_input=l_mask, ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, learn_init=True) print 'Forward LSTM' print 'output:', get_output_shape(l_fwd, ASSUME) l_concat = None if bidirectional: l_bwd = layer.LSTMLayer(l_emb, num_units=nhidden, grad_clipping=grad_clip, nonlinearity=lasagne.nonlinearities.tanh, mask_input=l_mask, ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, learn_init=True, backwards=True) print 'Backward LSTM' print 'output:', get_output_shape(l_bwd, ASSUME) def tmean(a, b): agg = theano.tensor.add(a, b) agg /= 2. return agg if pool: l_concat = layer.ElemwiseMergeLayer([l_fwd, l_bwd], tmean) else: l_concat = layer.ConcatLayer([l_fwd, l_bwd]) else: l_concat = layer.ConcatLayer([l_fwd]) print 'Concat' print 'output:', get_output_shape(l_concat, ASSUME) l_concat = layer.DropoutLayer(l_concat, p=0.5) l_lstm2 = layer.LSTMLayer(l_concat, num_units=nhidden, grad_clipping=grad_clip, nonlinearity=lasagne.nonlinearities.tanh, mask_input=l_mask, ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, learn_init=True, only_return_final=True) print 'LSTM #2' print 'output:', get_output_shape(l_lstm2, ASSUME) l_lstm2 = layer.DropoutLayer(l_lstm2, p=0.6) network = layer.DenseLayer(l_lstm2, num_units=nclasses, nonlinearity=lasagne.nonlinearities.softmax) print 'Dense Layer' print 'output:', get_output_shape(network, ASSUME) return network
def _build_net(self, emb_char_filter_size=5, emb_dropout=True, **kwargs): batch_size = self.mask_context_var.shape[0] context_len = self.mask_context_var.shape[1] question_len = self.question_var.shape[1] context_word_len = self.context_char_var.shape[2] question_word_len = self.question_char_var.shape[2] self.batch_size = batch_size self.context_len = context_len ''' Inputs and word embeddings''' l_context_char = LL.InputLayer(shape=(None, None, None), input_var=self.context_char_var) l_question_char = LL.InputLayer(shape=(None, None, None), input_var=self.question_char_var) l_c_mask = LL.InputLayer(shape=(None, None), input_var=self.mask_context_var) l_q_mask = LL.InputLayer(shape=(None, None), input_var=self.mask_question_var) l_c_char_mask = LL.InputLayer(shape=(None, None, None), input_var=self.mask_context_char_var) l_q_char_mask = LL.InputLayer(shape=(None, None, None), input_var=self.mask_question_char_var) l_c_emb = LL.InputLayer(shape=(None, None, self.emb_size), input_var=self.context_var) l_q_emb = LL.InputLayer(shape=(None, None, self.emb_size), input_var=self.question_var) if self.train_unk: l_c_unk_mask = LL.InputLayer(shape=(None, None), input_var=self.mask_context_unk_var) l_q_unk_mask = LL.InputLayer(shape=(None, None), input_var=self.mask_question_unk_var) l_c_emb = TrainUnkLayer(l_c_emb, l_c_unk_mask, output_size=self.emb_size, W=self.word_embeddings[0]) l_q_emb = TrainUnkLayer(l_q_emb, l_q_unk_mask, output_size=self.emb_size, W=l_c_emb.W) if self.negative: l_c_emb = TrainNAWLayer(l_c_emb, l_c_mask, output_size=self.emb_size) ''' Char-embeddings ''' # (batch_size x context_len x context_word_len x emb_char_size) l_c_char_emb = LL.EmbeddingLayer(l_context_char, input_size=self.alphabet_size, output_size=self.emb_char_size) l_q_char_emb = LL.EmbeddingLayer(l_question_char, input_size=self.alphabet_size, output_size=self.emb_char_size, W=l_c_char_emb.W) # here I do multiplication of character embeddings with masks, # because I want to pad them with constant zeros l_c_char_mask = ForgetSizeLayer( LL.dimshuffle(l_c_char_mask, (0, 1, 2, 'x'))) l_q_char_mask = ForgetSizeLayer( LL.dimshuffle(l_q_char_mask, (0, 1, 2, 'x'))) l_c_char_emb = LL.ElemwiseMergeLayer([l_c_char_emb, l_c_char_mask], T.mul) l_q_char_emb = LL.ElemwiseMergeLayer([l_q_char_emb, l_q_char_mask], T.mul) # convolutions l_c_char_emb = LL.dimshuffle( LL.reshape(l_c_char_emb, (batch_size * context_len, context_word_len, self.emb_char_size)), (0, 2, 1)) l_c_char_conv = LL.Conv1DLayer(l_c_char_emb, num_filters=self.num_emb_char_filters, filter_size=emb_char_filter_size, nonlinearity=L.nonlinearities.tanh, pad=self.conv) # (batch_size * context_len x num_filters x context_word_len + filter_size - 1) l_c_char_emb = LL.ExpressionLayer(l_c_char_conv, lambda X: X.max(2), output_shape='auto') l_c_char_emb = LL.reshape( l_c_char_emb, (batch_size, context_len, self.num_emb_char_filters)) l_q_char_emb = LL.dimshuffle( LL.reshape(l_q_char_emb, (batch_size * question_len, question_word_len, self.emb_char_size)), (0, 2, 1)) l_q_char_conv = LL.Conv1DLayer(l_q_char_emb, num_filters=self.num_emb_char_filters, filter_size=emb_char_filter_size, nonlinearity=L.nonlinearities.tanh, W=l_c_char_conv.W, b=l_c_char_conv.b, pad=self.conv) # (batch_size * question_len x num_filters x question_word_len + filter_size - 1) l_q_char_emb = LL.ExpressionLayer(l_q_char_conv, lambda X: X.max(2), output_shape='auto') l_q_char_emb = LL.reshape( l_q_char_emb, (batch_size, question_len, self.num_emb_char_filters)) ''' Concatenating both embeddings ''' 
l_c_emb = LL.concat([l_c_emb, l_c_char_emb], axis=2) l_q_emb = LL.concat([l_q_emb, l_q_char_emb], axis=2) # originally I had dropout here ''' Highway layer allowing for interaction between embeddings ''' l_c_P = LL.reshape(l_c_emb, (batch_size * context_len, self.emb_size + self.num_emb_char_filters)) l_c_P = LL.DenseLayer(l_c_P, num_units=self.rec_size, b=None, nonlinearity=None) l_c_high = HighwayLayer(l_c_P) l_c_emb = LL.reshape(l_c_high, (batch_size, context_len, self.rec_size)) l_q_P = LL.reshape(l_q_emb, (batch_size * question_len, self.emb_size + self.num_emb_char_filters)) l_q_P = LL.DenseLayer(l_q_P, num_units=self.rec_size, W=l_c_P.W, b=None, nonlinearity=None) l_q_high = HighwayLayer(l_q_P, W1=l_c_high.W1, b1=l_c_high.b1, W2=l_c_high.W2, b2=l_c_high.b2) l_q_emb = LL.reshape(l_q_high, (batch_size, question_len, self.rec_size)) ''' Calculating wiq features from https://arxiv.org/abs/1703.04816 ''' l_weighted_feat = WeightedFeatureLayer( [l_c_emb, l_q_emb, l_c_mask, l_q_mask]) # batch_size x context_len l_weighted_feat = LL.dimshuffle(l_weighted_feat, (0, 1, 'x')) # batch_size x context_len l_bin_feat = LL.InputLayer(shape=(None, None), input_var=self.bin_feat_var) l_bin_feat = LL.dimshuffle(l_bin_feat, (0, 1, 'x')) ''' Dropout at the embeddings ''' if emb_dropout: print('Using dropout after wiq calculation.') l_c_emb = LL.dropout(l_c_emb) l_q_emb = LL.dropout(l_q_emb) ''' Here we concatenate wiq features to embeddings''' # both features are concatenated to the embeddings # for the question we fix the features to 1 l_c_emb = LL.concat([l_c_emb, l_bin_feat, l_weighted_feat], axis=2) l_q_emb = LL.pad(l_q_emb, width=[(0, 2)], val=L.utils.floatX(1), batch_ndim=2) ''' Context and question encoding using the same BiLSTM for both ''' # output shape is (batch_size x context_len x rec_size) l_c_enc_forw = LL.LSTMLayer(l_c_emb, num_units=self.rec_size, grad_clipping=100, mask_input=l_c_mask) l_c_enc_back = LL.LSTMLayer(l_c_emb, num_units=self.rec_size, grad_clipping=100, mask_input=l_c_mask, backwards=True) # output shape is (batch_size x question_len x rec_size) l_q_enc_forw = LL.LSTMLayer( l_q_emb, num_units=self.rec_size, grad_clipping=100, mask_input=l_q_mask, ingate=LL.Gate(W_in=l_c_enc_forw.W_in_to_ingate, W_hid=l_c_enc_forw.W_hid_to_ingate, W_cell=l_c_enc_forw.W_cell_to_ingate, b=l_c_enc_forw.b_ingate), forgetgate=LL.Gate(W_in=l_c_enc_forw.W_in_to_forgetgate, W_hid=l_c_enc_forw.W_hid_to_forgetgate, W_cell=l_c_enc_forw.W_cell_to_forgetgate, b=l_c_enc_forw.b_forgetgate), outgate=LL.Gate(W_in=l_c_enc_forw.W_in_to_outgate, W_hid=l_c_enc_forw.W_hid_to_outgate, W_cell=l_c_enc_forw.W_cell_to_outgate, b=l_c_enc_forw.b_outgate), cell=LL.Gate(W_in=l_c_enc_forw.W_in_to_cell, W_hid=l_c_enc_forw.W_hid_to_cell, W_cell=None, b=l_c_enc_forw.b_cell, nonlinearity=L.nonlinearities.tanh)) l_q_enc_back = LL.LSTMLayer( l_q_emb, num_units=self.rec_size, grad_clipping=100, mask_input=l_q_mask, backwards=True, ingate=LL.Gate(W_in=l_c_enc_back.W_in_to_ingate, W_hid=l_c_enc_back.W_hid_to_ingate, W_cell=l_c_enc_back.W_cell_to_ingate, b=l_c_enc_back.b_ingate), forgetgate=LL.Gate(W_in=l_c_enc_back.W_in_to_forgetgate, W_hid=l_c_enc_back.W_hid_to_forgetgate, W_cell=l_c_enc_back.W_cell_to_forgetgate, b=l_c_enc_back.b_forgetgate), outgate=LL.Gate(W_in=l_c_enc_back.W_in_to_outgate, W_hid=l_c_enc_back.W_hid_to_outgate, W_cell=l_c_enc_back.W_cell_to_outgate, b=l_c_enc_back.b_outgate), cell=LL.Gate(W_in=l_c_enc_back.W_in_to_cell, W_hid=l_c_enc_back.W_hid_to_cell, W_cell=None, b=l_c_enc_back.b_cell, 
nonlinearity=L.nonlinearities.tanh)) # batch_size x context_len x 2*rec_size l_c_enc = LL.concat([l_c_enc_forw, l_c_enc_back], axis=2) # batch_size x question_len x 2*rec_size l_q_enc = LL.concat([l_q_enc_forw, l_q_enc_back], axis=2) def proj_init(): return np.vstack([ np.eye(self.rec_size, dtype=theano.config.floatX), np.eye(self.rec_size, dtype=theano.config.floatX) ]) # this is H from the paper, shape: (batch_size * context_len x # rec_size) l_c_proj = LL.reshape(l_c_enc, (batch_size * context_len, 2 * self.rec_size)) l_c_proj = LL.DenseLayer(l_c_proj, num_units=self.rec_size, W=proj_init(), b=None, nonlinearity=L.nonlinearities.tanh) # this is Z from the paper, shape: (batch_size * question_len x # rec_size) l_q_proj = LL.reshape(l_q_enc, (batch_size * question_len, 2 * self.rec_size)) l_q_proj = LL.DenseLayer(l_q_proj, num_units=self.rec_size, W=proj_init(), b=None, nonlinearity=L.nonlinearities.tanh) ''' Additional, weighted question encoding (alphas from paper) ''' l_alpha = LL.DenseLayer( l_q_proj, # batch_size * question_len x 1 num_units=1, b=None, nonlinearity=None) # batch_size x question_len l_alpha = MaskedSoftmaxLayer( LL.reshape(l_alpha, (batch_size, question_len)), l_q_mask) # batch_size x rec_size l_z_hat = BatchedDotLayer([ LL.reshape(l_q_proj, (batch_size, question_len, self.rec_size)), l_alpha ]) return l_c_proj, l_z_hat
def build_model(hyparams, vocab, nclasses=2, batchsize=None, invar=None, maskvar=None, maxlen=MAXLEN): embedding_dim = hyparams.embedding_dim nhidden = hyparams.nhidden bidirectional = hyparams.bidirectional pool = hyparams.pool grad_clip = hyparams.grad_clip init = hyparams.init net = OrderedDict() V = len(vocab) W = lasagne.init.Normal() gate_params = layer.recurrent.Gate( W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), b=lasagne.init.Constant(0.) ) cell_params = layer.recurrent.Gate( W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), W_cell=None, b=lasagne.init.Constant(0.), nonlinearity=lasagne.nonlinearities.tanh ) # define model net['input'] = layer.InputLayer((batchsize, maxlen), input_var=invar) net['mask'] = layer.InputLayer((batchsize, maxlen), input_var=maskvar) net['emb'] = layer.EmbeddingLayer(net['input'], input_size=V, output_size=embedding_dim, W=W) net['fwd1'] = layer.LSTMLayer( net['emb'], num_units=nhidden, grad_clipping=grad_clip, nonlinearity=lasagne.nonlinearities.tanh, mask_input=net['mask'], ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, learn_init=True ) if bidirectional: net['bwd1'] = layer.LSTMLayer( net['emb'], num_units=nhidden, grad_clipping=grad_clip, nonlinearity=lasagne.nonlinearities.tanh, mask_input=net['mask'], ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, learn_init=True, backwards=True ) def tmean(a, b): agg = theano.tensor.add(a, b) agg /= 2. return agg net['pool'] = layer.ElemwiseMergeLayer([net['fwd1'], net['bwd1']], tmean) else: net['pool'] = layer.ConcatLayer([net['fwd1']]) net['dropout1'] = layer.DropoutLayer(net['pool'], p=0.5) net['fwd2'] = layer.LSTMLayer( net['dropout1'], num_units=nhidden, grad_clipping=grad_clip, nonlinearity=lasagne.nonlinearities.tanh, mask_input=net['mask'], ingate=gate_params, forgetgate=gate_params, cell=cell_params, outgate=gate_params, learn_init=True, only_return_final=True ) net['dropout2'] = layer.DropoutLayer(net['fwd2'], p=0.6) net['softmax'] = layer.DenseLayer( net['dropout2'], num_units=nclasses, nonlinearity=lasagne.nonlinearities.softmax ) ASSUME = {net['input']: (200, 140), net['mask']: (200, 140)} logstr = '========== MODEL ========== \n' logstr += 'vocab size: %d\n' % V logstr += 'embedding dim: %d\n' % embedding_dim logstr += 'nhidden: %d\n' % nhidden logstr += 'pooling: %s\n' % pool for lname, lyr in net.items(): logstr += '%s %s\n' % (lname, str(get_output_shape(lyr, ASSUME))) logstr += '=========================== \n' print logstr return net
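# Hedged training-function sketch for the net dictionary returned above (the
# target variable, loss and optimizer are assumptions; `hyparams` and `vocab`
# are expected to exist as in the snippet):
import theano
import theano.tensor as T
import lasagne
import lasagne.layers as layer

X = T.imatrix('X')    # (batchsize, maxlen) token ids
M = T.matrix('M')     # (batchsize, maxlen) mask
y = T.ivector('y')    # class labels

net = build_model(hyparams, vocab, nclasses=2, invar=X, maskvar=M)
output = layer.get_output(net['softmax'])
loss = lasagne.objectives.categorical_crossentropy(output, y).mean()
params = layer.get_all_params(net['softmax'], trainable=True)
updates = lasagne.updates.adam(loss, params)
train_fn = theano.function([X, M, y], loss, updates=updates)

test_output = layer.get_output(net['softmax'], deterministic=True)   # dropout disabled
predict_fn = theano.function([X, M], test_output)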
def buildModel(self): print(' -- Building...') x_init = sparse.csr_matrix('x', dtype='float32') y_init = T.imatrix('y') g_init = T.imatrix('g') ind_init = T.ivector('ind') sub_path_init = T.imatrix('subPathsBatch') mask_init = T.fmatrix('subMask') # step train x_input = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=x_init) g_input = lgl.InputLayer(shape=(None, 2), input_var=g_init) ind_input = lgl.InputLayer(shape=(None, ), input_var=ind_init) pair_second = lgl.SliceLayer(g_input, indices=1, axis=1) pair_first = lgl.SliceLayer(g_input, indices=0, axis=1) pair_first_emd = lgl.EmbeddingLayer(pair_first, input_size=self.num_ver, output_size=self.embedding_size) emd_to_numver = layers.DenseLayer( pair_first_emd, self.num_ver, nonlinearity=lg.nonlinearities.softmax) index_emd = lgl.EmbeddingLayer(ind_input, input_size=self.num_ver, output_size=self.embedding_size, W=pair_first_emd.W) x_to_ydim = layers.SparseLayer(x_input, self.y.shape[1], nonlinearity=lg.nonlinearities.softmax) index_emd = layers.DenseLayer(index_emd, self.y.shape[1], nonlinearity=lg.nonlinearities.softmax) concat_two = lgl.ConcatLayer([x_to_ydim, index_emd], axis=1) concat_two = layers.DenseLayer(concat_two, self.y.shape[1], nonlinearity=lg.nonlinearities.softmax) concat_two_output = lgl.get_output(concat_two) step_loss = lgo.categorical_crossentropy(concat_two_output, y_init).mean() hid_loss = lgl.get_output(x_to_ydim) step_loss += lgo.categorical_crossentropy(hid_loss, y_init).mean() emd_loss = lgl.get_output(index_emd) step_loss += lgo.categorical_crossentropy(emd_loss, y_init).mean() step_params = [ index_emd.W, index_emd.b, x_to_ydim.W, x_to_ydim.b, concat_two.W, concat_two.b ] step_updates = lg.updates.sgd(step_loss, step_params, learning_rate=self.step_learning_rate) self.step_train = theano.function([x_init, y_init, ind_init], step_loss, updates=step_updates, on_unused_input='ignore') self.test_fn = theano.function([x_init, ind_init], concat_two_output, on_unused_input='ignore') # supervised train fc_output = lgl.get_output(emd_to_numver) pair_second_output = lgl.get_output(pair_second) sup_loss = lgo.categorical_crossentropy(fc_output, pair_second_output).sum() sup_params = lgl.get_all_params(emd_to_numver, trainable=True) sup_updates = lg.updates.sgd(sup_loss, sup_params, learning_rate=self.sup_learning_rate) self.sup_train = theano.function([g_init], sup_loss, updates=sup_updates, on_unused_input='ignore') cross_entropy = lgo.categorical_crossentropy(fc_output, pair_second_output) cross_entropy = T.reshape(cross_entropy, (1, self.unsup_batch_size), ndim=None) mask_input = lgl.InputLayer(shape=(None, self.window_size + 1), input_var=mask_init) subPath_in = lgl.InputLayer(shape=(None, self.window_size + 1), input_var=sub_path_init) sub_path_emd = lgl.EmbeddingLayer(subPath_in, input_size=self.num_ver, output_size=self.embedding_size, W=pair_first_emd.W) lstm_layer = lgl.LSTMLayer(sub_path_emd, self.lstm_hidden_units, grad_clipping=3, mask_input=mask_input) # handle path weight max1 = T.mean(lgl.get_output(lstm_layer), axis=1) max2 = T.mean(max1, axis=1) max2_init = T.fcol('max2') max2_init = T.reshape(max2, ((self.subpath_num, 1))) max2_input = lgl.InputLayer(shape=(self.subpath_num, 1), input_var=max2_init) max2_input = lgl.BatchNormLayer(max2_input) path_weight = lgl.get_output(max2_input) path_weight = lg.nonlinearities.sigmoid(path_weight) path_weight = 1 + 0.3 * path_weight # unsupervised train reweight_loss = T.dot(cross_entropy, path_weight)[0][0] lstm_params_all = lgl.get_all_params(lstm_layer, 
trainable=True) lstm_params = list(set(lstm_params_all).difference(set(sup_params))) lstm_updates = lg.updates.sgd(reweight_loss, lstm_params, learning_rate=0.01) self.lstm_fn = theano.function([sub_path_init, g_init, mask_init], reweight_loss, updates=lstm_updates, on_unused_input='ignore') alpha_updates = lg.updates.sgd(reweight_loss, sup_params, learning_rate=0.001) self.alpha_fn = theano.function([sub_path_init, g_init, mask_init], reweight_loss, updates=alpha_updates, on_unused_input='ignore') print(' -- Done!')
def build_network(self, V, C, W, dv, qv, tv, dmv, qmv, fv): # inputs l_docin = L.InputLayer(shape=(None, None), input_var=dv) l_qin = L.InputLayer(shape=(None, None), input_var=qv) l_docmask = L.InputLayer(shape=(None, None), input_var=dmv) l_qmask = L.InputLayer(shape=(None, None), input_var=qmv) l_featin = L.InputLayer(shape=(None, None), input_var=fv) l_docembed = L.EmbeddingLayer(l_docin, input_size=V, output_size=EMBED_DIM, W=W) # B x N x DE l_qembed = L.EmbeddingLayer(l_qin, input_size=V, output_size=EMBED_DIM, W=l_docembed.W) # B x Q x DE l_fembed = L.EmbeddingLayer(l_featin, input_size=2, output_size=2) # B x N x 2 # question lstm l_q_lstm = L.LSTMLayer(l_qembed, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_qmask, \ gradient_steps=GRAD_STEPS, precompute_input=True) # B x Q x D l_q_lstm = L.dropout(l_q_lstm, p=DROPOUT_RATE) l_q_att_in = L.ReshapeLayer( l_q_lstm, (qv.shape[0] * qv.shape[1], NUM_HIDDEN)) # BQ x D l_q_att_1 = L.DenseLayer(l_q_att_in, NUM_HIDDEN, b=None, \ nonlinearity=lasagne.nonlinearities.tanh) # BQ x D l_q_att_2 = L.DenseLayer(l_q_att_1, 1, b=None, nonlinearity=None) # BQ x 1 l_q_att_out = L.ReshapeLayer(l_q_att_2, (qv.shape[0], qv.shape[1])) # B x Q q = L.get_output(l_q_lstm) alphas = T.nnet.softmax(L.get_output(l_q_att_out)) * qmv # B x Q alphas = alphas / alphas.sum(axis=1)[:, np.newaxis] rq = (alphas[:, :, np.newaxis] * q).sum(axis=1) # B x D # evidence lstm rq_tiled = T.reshape(T.tile(rq, (1, dv.shape[1])), (dv.shape[0], dv.shape[1], NUM_HIDDEN)) l_rq_in = L.InputLayer(shape=(None, None, NUM_HIDDEN), input_var=rq_tiled) # B x N x D l_ev = L.ConcatLayer([l_docembed, l_rq_in, l_fembed], axis=2) # B x N x (DE+D+2) l_ev_lstm1 = L.LSTMLayer(l_ev, NUM_HIDDEN, grad_clipping=GRAD_CLIP, mask_input=l_docmask, \ gradient_steps=GRAD_STEPS, precompute_input=True) # B x N x D l_ev_lstm1 = L.dropout(l_ev_lstm1, p=DROPOUT_RATE) l_ev_lstm2 = L.LSTMLayer(l_ev_lstm1, NUM_HIDDEN, grad_clipping=GRAD_CLIP, \ mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, \ backwards=True) # B x N x D l_ev_lstm2 = L.dropout(l_ev_lstm2, p=DROPOUT_RATE) l_ev_lstm3 = L.LSTMLayer(L.ConcatLayer([l_ev_lstm1,l_ev_lstm2], axis=2), NUM_HIDDEN, \ grad_clipping=GRAD_CLIP, mask_input=l_docmask, gradient_steps=GRAD_STEPS, \ precompute_input=True) # B x N x D l_ev_lstm3 = L.dropout(l_ev_lstm3, p=DROPOUT_RATE) # crf l_class_in = L.ReshapeLayer( l_ev_lstm3, (dv.shape[0] * dv.shape[1], NUM_HIDDEN)) # BN x D l_class = L.DenseLayer(l_class_in, C, b=None, nonlinearity=None) # BN x C l_crf_in = L.ReshapeLayer(l_class, (dv.shape[0], dv.shape[1], C)) # B x N x C l_crf = CRFLayer(l_crf_in, C, mask_input=dmv, label_input=tv, normalize=False, \ end_points=True) # 1 l_crfdecode = CRFDecodeLayer(l_crf_in, C, W_sim=l_crf.W_sim, \ W_end_points=l_crf.W_end_points, mask_input=dmv) # B x N # params self.e_net = l_crf self.q_net = l_q_att_out params = L.get_all_params([self.e_net, self.q_net], trainable=True) return L.get_output(l_crf), params, L.get_output(l_crfdecode, deterministic=True)
def __init__(self, train_raw, test_raw, dim, mode, l2, l1,
             batch_norm, dropout, batch_size, **kwargs):
    print "==> not used params in network class:", kwargs.keys()
    self.train_raw = train_raw
    self.test_raw = test_raw
    self.dim = dim
    self.mode = mode
    self.l2 = l2
    self.l1 = l1
    self.batch_norm = batch_norm
    self.dropout = dropout
    self.batch_size = batch_size

    self.train_batch_gen = self.get_batch_gen(self.train_raw)
    self.test_batch_gen = self.get_batch_gen(self.test_raw)

    self.input_var = T.tensor3('X')
    self.input_lens = T.ivector('L')
    self.target_var = T.imatrix('y')

    """
    for i in range(700 // self.batch_size):
        ret = next(self.train_batch_gen)
        print len(ret[0])
        print ret[0][0].shape
        print len(ret[1])
        print type(ret[1][0])
        print "---"
    exit()
    """

    print "==> Building neural network"
    network = layers.InputLayer((None, None, self.train_raw[0][0].shape[1]),
                                input_var=self.input_var)
    # print "!!!!!!!!!!! WARNING: dropout on input is disabled !!!!!!!!!!!!!!!!"
    if (self.dropout > 0):
        network = layers.DropoutLayer(network, p=self.dropout)
    network = layers.LSTMLayer(incoming=network, num_units=dim,
                               grad_clipping=10,
                               ingate=lasagne.layers.Gate(
                                   W_in=Orthogonal(), W_hid=Orthogonal(),
                                   W_cell=Normal(0.1)),
                               forgetgate=lasagne.layers.Gate(
                                   W_in=Orthogonal(), W_hid=Orthogonal(),
                                   W_cell=Normal(0.1)),
                               cell=lasagne.layers.Gate(
                                   W_cell=None,
                                   nonlinearity=lasagne.nonlinearities.tanh,
                                   W_in=Orthogonal(), W_hid=Orthogonal()),
                               outgate=lasagne.layers.Gate(
                                   W_in=Orthogonal(), W_hid=Orthogonal(),
                                   W_cell=Normal(0.1)))
    if (self.dropout > 0):
        network = layers.DropoutLayer(network, p=self.dropout)
    network = layers.LSTMLayer(incoming=network, num_units=dim,
                               only_return_final=False,
                               grad_clipping=10,
                               ingate=lasagne.layers.Gate(
                                   W_in=Orthogonal(), W_hid=Orthogonal(),
                                   W_cell=Normal(0.1)),
                               forgetgate=lasagne.layers.Gate(
                                   W_in=Orthogonal(), W_hid=Orthogonal(),
                                   W_cell=Normal(0.1)),
                               cell=lasagne.layers.Gate(
                                   W_cell=None,
                                   nonlinearity=lasagne.nonlinearities.tanh,
                                   W_in=Orthogonal(), W_hid=Orthogonal()),
                               outgate=lasagne.layers.Gate(
                                   W_in=Orthogonal(), W_hid=Orthogonal(),
                                   W_cell=Normal(0.1)))

    lstm_output = layers.get_output(network)
    self.params = layers.get_all_params(network, trainable=True)
    self.reg_params = layers.get_all_params(network, regularizable=True)

    """
    data = next(self.train_batch_gen)
    print max(data[1])
    print lstm_output.eval({self.input_var: data[0]}).shape
    exit()
    """

    # for each example in minibatch take the last output
    last_outputs = []
    for index in range(self.batch_size):
        last_outputs.append(lstm_output[index, self.input_lens[index] - 1, :])
    last_outputs = T.stack(last_outputs)

    """
    data = next(self.train_batch_gen)
    print max(data[1])
    print last_outputs.eval({self.input_var: data[0],
                             self.input_lens: data[1]}).shape
    exit()
    """

    network = layers.InputLayer(shape=(self.batch_size, self.dim),
                                input_var=last_outputs)
    if (self.dropout > 0):
        network = layers.DropoutLayer(network, p=self.dropout)
    network = layers.DenseLayer(incoming=network,
                                num_units=train_raw[1][0].shape[0],
                                nonlinearity=sigmoid)

    self.prediction = layers.get_output(network)
    self.det_prediction = layers.get_output(network, deterministic=True)
    self.params += layers.get_all_params(network, trainable=True)
    self.reg_params += layers.get_all_params(network, regularizable=True)

    self.loss_multilabel = -(self.target_var * T.log(self.prediction) +
                             (1 - self.target_var) *
                             T.log(1 - self.prediction)).mean(axis=1).mean(axis=0)

    if self.l2 > 0:
        self.loss_l2 = self.l2 * nn_utils.l2_reg(self.reg_params)
    else:
        self.loss_l2 = 0
    if self.l1 > 0:
        self.loss_l1 = self.l1 * nn_utils.l1_reg(self.reg_params)
    else:
        self.loss_l1 = 0
    self.loss = self.loss_multilabel + self.loss_l2 + self.loss_l1

    # updates = lasagne.updates.adadelta(self.loss, self.params,
    #                                    learning_rate=0.001)
    # updates = lasagne.updates.momentum(self.loss, self.params,
    #                                    learning_rate=0.00003)
    # updates = lasagne.updates.adam(self.loss, self.params)
    updates = lasagne.updates.adam(self.loss, self.params, beta1=0.5,
                                   learning_rate=0.0001)  # from DCGAN paper
    # updates = lasagne.updates.nesterov_momentum(loss, params, momentum=0.9,
    #                                             learning_rate=0.001)

    ## compiling theano functions
    if self.mode == 'train':
        print "==> compiling train_fn"
        self.train_fn = theano.function(
            inputs=[self.input_var, self.input_lens, self.target_var],
            outputs=[self.prediction, self.loss],
            updates=updates)

    print "==> compiling test_fn"
    self.test_fn = theano.function(
        inputs=[self.input_var, self.input_lens, self.target_var],
        outputs=[self.det_prediction, self.loss])
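# A minimal training-loop sketch for the class above. The class name
# `Network`, the constructor arguments, `n_train_batches`, and the
# (X, lens, y) batch layout (float32 sequences, int32 lengths, int32
# multi-hot targets) are assumptions inferred from how train_fn/test_fn
# are compiled; they are not taken from the original code.
import numpy as np

net = Network(train_raw, test_raw, dim=256, mode='train', l2=1e-4, l1=0,
              batch_norm=False, dropout=0.3, batch_size=64)

for epoch in range(10):
    losses = []
    for _ in range(n_train_batches):          # n_train_batches is assumed
        X, lens, y = next(net.train_batch_gen)
        pred, loss = net.train_fn(X, lens, y)
        losses.append(loss)
    print "epoch %d, mean train loss %.4f" % (epoch, np.mean(losses))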
def additional_layer(self, idx_layer, emb_layer, avg=False):
    suf = '_avg' if avg else ''
    if self.name == 'char':
        if self.args.char_model == 'cnn':
            lds = L.dimshuffle(emb_layer, (0, 3, 1, 2))  # (100, 16, 26, 32)
            ls = []
            for n in self.args.ngrams:
                lconv = L.Conv2DLayer(lds, self.args.conv_dim, (1, n),
                                      untie_biases=False,
                                      # W=HeNormal('relu') if not avg else Constant(),
                                      W=GlorotNormal('relu') if not avg else Constant(),
                                      name='conv_%d' % n + suf)  # (100, 64/4, 26, 32-n+1)
                lpool = L.MaxPool2DLayer(lconv, (1, self.args.max_word_len - n + 1))  # (100, 64, 26, 1)
                lpool = L.flatten(lpool, outdim=3)       # (100, 16, 26)
                lpool = L.dimshuffle(lpool, (0, 2, 1))   # (100, 26, 16)
                ls.append(lpool)
            xc = L.concat(ls, axis=2, name='echar_concat')  # (100, 26, 64)
            # additional
            # xc = L.DenseLayer(xc, self.args.embw_dim, nonlinearity=None,
            #                   name='echar_affine', num_leading_axes=2,
            #                   W=HeNormal() if not avg else Constant())  # (100, 26, 100)
            return xc
        elif self.args.char_model == 'lstm':
            ml = L.ExpressionLayer(idx_layer, lambda x: T.neq(x, 0))  # mask layer (100, 24, 32)
            ml = L.reshape(ml, (-1, self.args.max_word_len))          # (1500, 32)
            gate_params = L.recurrent.Gate(W_in=Orthogonal(), W_hid=Orthogonal())
            cell_params = L.recurrent.Gate(W_in=Orthogonal(), W_hid=Orthogonal(),
                                           W_cell=None, nonlinearity=tanh)
            lstm_in = L.reshape(emb_layer, (-1, self.args.max_word_len,
                                            self.config['char']['emb_dim']))  # (1500, 32, 16)
            lstm_f = L.LSTMLayer(lstm_in, 32, mask_input=ml, grad_clipping=10.,
                                 learn_init=True, peepholes=False,
                                 precompute_input=True,
                                 ingate=gate_params, forgetgate=gate_params,
                                 cell=cell_params, outgate=gate_params,
                                 # unroll_scan=True,
                                 only_return_final=True,
                                 name='forward' + suf)   # (1500, 32)
            lstm_b = L.LSTMLayer(lstm_in, 32, mask_input=ml, grad_clipping=10.,
                                 learn_init=True, peepholes=False,
                                 precompute_input=True,
                                 ingate=gate_params, forgetgate=gate_params,
                                 cell=cell_params, outgate=gate_params,
                                 # unroll_scan=True,
                                 only_return_final=True, backwards=True,
                                 name='backward' + suf)  # (1500, 32)
            remove_reg(lstm_f)
            remove_reg(lstm_b)
            if avg:
                set_zero(lstm_f)
                set_zero(lstm_b)
            xc = L.concat([lstm_f, lstm_b], axis=1)      # (1500, 64)
            if self.args.lstm_tagger:
                xc = L.reshape(xc, (-1, self.args.max_sent_len, 64))  # (100, 161, 64)
            elif self.args.trans_tagger:
                xc = L.reshape(xc, (-1, self.args.window_size, 64))   # (100, 15, 64)
            else:
                xc = L.reshape(xc, (-1, 26, 64))                      # (100, 26, 64)
            return xc
    elif self.name == 'morph':
        # idx (100, 26/161, 16)  emb (100, 26/161, 16, 32)
        if self.args.morph_model == 'max':
            xm = L.MaxPool2DLayer(emb_layer,
                                  (self.args.max_morph_len, 1))  # (100, 26/161, 1, 32)
            # xm = L.reshape(xm, (-1, 26, self.config['morph']['emb_dim']))  # (100, 26/161, 32)
            xm = L.flatten(xm, outdim=3)                          # (100, 26/161, 32)
            # xm = L.ExpressionLayer(emb_layer, lambda x: T.max(x, 2))
        elif self.args.morph_model == 'avg':
            mask = L.ExpressionLayer(idx_layer, lambda x: T.neq(x, 0))  # (100, 26, 16)
            mask = L.dimshuffle(mask, (0, 1, 2, 'x'))                   # (100, 26, 16, 1)
            mask = L.ExpressionLayer(mask, lambda x: T.extra_ops.repeat(
                x, self.config['morph']['emb_dim'], 3))                 # (100, 26, 16, 1)
            xm = L.ElemwiseMergeLayer([emb_layer, mask],
                                      lambda x, m: T.sum(x * m, 2) / T.sum(m, 2))  # (100, 26, 32)
            # xm = L.reshape(xm, (-1, self.args.feat_shape, self.config['morph']['emb_dim']))  # (100, 26, 32)
        return xm
    else:
        return emb_layer
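# Stand-alone sketch of the char-LSTM idea used above: embed characters,
# collapse (batch, words) into one axis, run forward and backward LSTMs over
# the character axis with a padding mask, and reshape the final states back
# to word positions. All sizes (MAX_SENT_LEN, CHAR_EMB, ...) are illustrative
# assumptions, not values from the original code.
import theano.tensor as T
import lasagne.layers as L

MAX_SENT_LEN, MAX_WORD_LEN = 40, 20
N_CHARS, CHAR_EMB, CHAR_HID = 100, 16, 32

char_idx = T.itensor3('char_idx')                          # (B, S, W)
l_idx = L.InputLayer((None, MAX_SENT_LEN, MAX_WORD_LEN), char_idx)
l_emb = L.EmbeddingLayer(l_idx, N_CHARS, CHAR_EMB)         # (B, S, W, E)

l_mask = L.ExpressionLayer(l_idx, lambda x: T.neq(x, 0))   # index 0 = padding
l_mask = L.reshape(l_mask, (-1, MAX_WORD_LEN))             # (B*S, W)
l_seq = L.reshape(l_emb, (-1, MAX_WORD_LEN, CHAR_EMB))     # (B*S, W, E)

l_fwd = L.LSTMLayer(l_seq, CHAR_HID, mask_input=l_mask, only_return_final=True)
l_bwd = L.LSTMLayer(l_seq, CHAR_HID, mask_input=l_mask, only_return_final=True,
                    backwards=True)
l_char = L.concat([l_fwd, l_bwd], axis=1)                  # (B*S, 2*CHAR_HID)
l_char = L.reshape(l_char, (-1, MAX_SENT_LEN, 2 * CHAR_HID))  # (B, S, 2*CHAR_HID)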
# Recurrent layers expect input of shape
# (batch size, max sequence length, number of features)
l_in = layers.InputLayer(shape=(N_BATCH, MAX_LENGTH, 2))
# The network also needs a way to provide a mask for each sequence. We'll
# use a separate input layer for that. Since the mask only determines
# which indices are part of the sequence for each batch entry, they are
# supplied as matrices of dimensionality (N_BATCH, MAX_LENGTH).
l_mask = layers.InputLayer(shape=(N_BATCH, MAX_LENGTH))
# We're building a bidirectional network, which means we combine two
# LSTMLayers, one of them with the backwards=True keyword argument.
# Setting a value for grad_clipping will clip the gradients in the layer.
# Setting only_return_final=True makes the layers return their output only
# for the final time step, which is all we need for this task.
l_forward = layers.LSTMLayer(
    l_in, N_HIDDEN, mask_input=l_mask, grad_clipping=GRAD_CLIP,
    only_return_final=True)
l_backward = layers.LSTMLayer(
    l_in, N_HIDDEN, mask_input=l_mask, grad_clipping=GRAD_CLIP,
    only_return_final=True, backwards=True)
# Now, we'll concatenate the outputs to combine them.
l_concat = layers.ConcatLayer([l_forward, l_backward])
# Our output layer is a simple dense connection, with 1 output unit.
l_out = layers.DenseLayer(
    l_concat, num_units=1, nonlinearity=lasagne.nonlinearities.tanh)
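# One way to train this small bidirectional regressor, following the usual
# Lasagne pattern: squared-error cost against a per-sequence scalar target.
# The target variable, LEARNING_RATE, and the choice of adagrad are
# assumptions layered on top of the snippet above.
target_values = T.vector('target_output')
network_output = layers.get_output(l_out)
predicted_values = network_output.flatten()

cost = T.mean((predicted_values - target_values) ** 2)
all_params = layers.get_all_params(l_out, trainable=True)
updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

train = theano.function([l_in.input_var, target_values, l_mask.input_var],
                        cost, updates=updates)
compute_cost = theano.function([l_in.input_var, target_values,
                                l_mask.input_var], cost)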
def __init__(self, dim, mode, l2, l1, batch_norm, dropout, batch_size,
             input_dim=76, **kwargs):
    print "==> not used params in network class:", kwargs.keys()
    self.dim = dim
    self.mode = mode
    self.l2 = l2
    self.l1 = l1
    self.batch_norm = batch_norm
    self.dropout = dropout
    self.batch_size = batch_size

    self.input_var = T.tensor3('X')
    self.input_lens = T.ivector('L')
    self.target_var = T.ivector('y')
    self.weight = T.vector('w')

    print "==> Building neural network"
    network = layers.InputLayer((None, None, input_dim),
                                input_var=self.input_var)
    network = layers.LSTMLayer(incoming=network, num_units=dim,
                               only_return_final=False,
                               grad_clipping=10,
                               ingate=lasagne.layers.Gate(
                                   W_in=Orthogonal(), W_hid=Orthogonal(),
                                   W_cell=Normal(0.1)),
                               forgetgate=lasagne.layers.Gate(
                                   W_in=Orthogonal(), W_hid=Orthogonal(),
                                   W_cell=Normal(0.1)),
                               cell=lasagne.layers.Gate(
                                   W_cell=None,
                                   nonlinearity=lasagne.nonlinearities.tanh,
                                   W_in=Orthogonal(), W_hid=Orthogonal()),
                               outgate=lasagne.layers.Gate(
                                   W_in=Orthogonal(), W_hid=Orthogonal(),
                                   W_cell=Normal(0.1)))

    lstm_output = layers.get_output(network)
    self.params = layers.get_all_params(network, trainable=True)
    self.reg_params = layers.get_all_params(network, regularizable=True)

    # for each example in minibatch take the last output
    last_outputs = []
    for index in range(self.batch_size):
        last_outputs.append(lstm_output[index, self.input_lens[index] - 1, :])
    last_outputs = T.stack(last_outputs)

    network = layers.InputLayer(shape=(self.batch_size, self.dim),
                                input_var=last_outputs)
    network = layers.DenseLayer(incoming=network, num_units=2,
                                nonlinearity=softmax)

    self.prediction = layers.get_output(network)
    self.params += layers.get_all_params(network, trainable=True)
    self.reg_params += layers.get_all_params(network, regularizable=True)

    self.loss_ce = (self.weight *
                    categorical_crossentropy(self.prediction,
                                              self.target_var)).mean()
    if self.l2 > 0:
        self.loss_l2 = self.l2 * nn_utils.l2_reg(self.reg_params)
    else:
        self.loss_l2 = 0
    if self.l1 > 0:
        self.loss_l1 = self.l1 * nn_utils.l1_reg(self.reg_params)
    else:
        self.loss_l1 = 0
    self.loss = self.loss_ce + self.loss_l2 + self.loss_l1

    # updates = lasagne.updates.adadelta(self.loss, self.params,
    #                                    learning_rate=0.001)
    # updates = lasagne.updates.momentum(self.loss, self.params,
    #                                    learning_rate=0.00003)
    # updates = lasagne.updates.adam(self.loss, self.params)
    updates = lasagne.updates.adam(self.loss, self.params, beta1=0.5,
                                   learning_rate=0.0001)  # from DCGAN paper
    # updates = lasagne.updates.nesterov_momentum(loss, params, momentum=0.9,
    #                                             learning_rate=0.001)

    ## compiling theano functions
    if self.mode == 'train':
        print "==> compiling train_fn"
        self.train_fn = theano.function(
            inputs=[self.input_var, self.input_lens, self.target_var,
                    self.weight],
            outputs=[self.prediction, self.loss],
            updates=updates)

    print "==> compiling test_fn"
    self.test_fn = theano.function(
        inputs=[self.input_var, self.input_lens, self.target_var,
                self.weight],
        outputs=[self.prediction, self.loss])
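# Example of feeding the per-example weight vector expected by the weighted
# cross-entropy above, e.g. up-weighting the positive class. The `net`
# instance, `batch_gen`, and the 3.0 factor are illustrative assumptions.
import numpy as np

X, lens, y = next(batch_gen)
w = np.where(np.asarray(y) == 1, 3.0, 1.0).astype('float32')
pred, loss = net.train_fn(X, lens, y, w)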
def fcrnn(input_var_list, early_conv_dict_list, late_conv_dict,
          dense_filter_size, final_pool_function=T.max,
          input_size_list=[128], output_size=10, last_late_conv_size=128,
          p_dropout=0.5, num_feat_type=1, num_lstm_unit=512,
          gradient_steps=10):
    assert(len(early_conv_dict_list) == len(input_var_list) == len(input_size_list))

    # early conv layers
    conv_network_list = list()
    total_stride_list = list()
    for jj, [early_conv_dict, input_var, input_size] in enumerate(zip(
            early_conv_dict_list, input_var_list, input_size_list)):
        input_network = lasagne.layers.InputLayer(
            shape=(None, num_feat_type, None, input_size), input_var=input_var)

        total_stride = 1
        network, total_stride = conv_layers(input_network, early_conv_dict,
                                            total_stride,
                                            init_input_size=input_size,
                                            p_dropout=0,
                                            base_name='early{}'.format(jj))
        total_stride_list.append(total_stride)
        conv_network_list.append(network)

    '''
    # upsampling
    conv_network_list = [cl.LocalExtend(net, axis=2, extend_size=ts)
                         for net, ts in zip(conv_network_list,
                                            total_stride_list)]
    '''

    network = layers.ConcatLayer(conv_network_list, axis=1,
                                 cropping=[None, None, 'lower', None],
                                 name='MultisourceConcatenate')

    # late conv layers (dense layers)
    network, total_stride = conv_layers(network, late_conv_dict,
                                        total_stride,
                                        init_input_size=1,
                                        p_dropout=p_dropout,
                                        base_name='late')

    # frame output layer: every frame gets a value
    network = cl.Conv2DXLayer(
        lasagne.layers.dropout(network, p=p_dropout),
        num_filters=last_late_conv_size,
        filter_size=(dense_filter_size, 1),
        nonlinearity=lasagne.nonlinearities.sigmoid,
        W=lasagne.init.GlorotUniform()
    )

    network = layers.ReshapeLayer(network, ([0], [1], -1))
    network = layers.DimshuffleLayer(network, (0, 2, 1))

    # lstm layers
    l_forward = layers.LSTMLayer(network, output_size,
                                 grad_clipping=100,
                                 gradient_steps=gradient_steps,
                                 nonlinearity=lasagne.nonlinearities.sigmoid)
    # l_backward = layers.LSTMLayer(l_forward, output_size,
    #                               grad_clipping=100,
    #                               gradient_steps=gradient_steps,
    #                               nonlinearity=lasagne.nonlinearities.sigmoid,
    #                               backwards=True)
    network = layers.DimshuffleLayer(l_forward, (0, 2, 1))

    # pool over time
    network = layers.GlobalPoolLayer(network,
                                     pool_function=final_pool_function)
    network = layers.ReshapeLayer(network, ([0], -1))

    return network
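# Sketch of attaching a multi-label objective to fcrnn's clip-level sigmoid
# output. The conv dictionaries are placeholders for whatever the author's
# conv_layers helper expects, and the binary cross-entropy loss and Adam
# settings are assumptions, not part of the original function.
import theano
import theano.tensor as T
import lasagne
import lasagne.layers as layers

input_var = T.tensor4('X')                 # (batch, num_feat_type, frames, 128)
target_var = T.matrix('y')                 # (batch, output_size) 0/1 tags

network = fcrnn([input_var], [early_conv_dict], late_conv_dict,
                dense_filter_size=3, input_size_list=[128], output_size=10)

prediction = layers.get_output(network)
loss = lasagne.objectives.binary_crossentropy(prediction, target_var).mean()
params = layers.get_all_params(network, trainable=True)
updates = lasagne.updates.adam(loss, params, learning_rate=1e-4)
train_fn = theano.function([input_var, target_var], loss, updates=updates)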