def _get_normalised_relevance_layer(self, layer, feeder):

    def add_epsilon(Zs):
        tmp = (T.cast(Zs >= 0, theano.config.floatX) * 2.0 - 1.0)
        return Zs + self.epsilon * tmp

    if isinstance(layer, L.DenseLayer):
        forward_layer = L.DenseLayer(layer.input_layer,
                                     layer.num_units,
                                     W=layer.W,
                                     b=layer.b,
                                     nonlinearity=None)
    elif isinstance(layer, L.Conv2DLayer):
        forward_layer = L.Conv2DLayer(layer.input_layer,
                                      num_filters=layer.num_filters,
                                      W=layer.W,
                                      b=layer.b,
                                      stride=layer.stride,
                                      filter_size=layer.filter_size,
                                      flip_filters=layer.flip_filters,
                                      untie_biases=layer.untie_biases,
                                      pad=layer.pad,
                                      nonlinearity=None)
    else:
        raise NotImplementedError()

    forward_layer = L.ExpressionLayer(forward_layer,
                                      lambda x: 1.0 / add_epsilon(x))
    feeder = L.ElemwiseMergeLayer([forward_layer, feeder],
                                  merge_function=T.mul)
    return feeder
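# --- Illustration (not part of the original class): add_epsilon above shifts every
# pre-activation away from zero in the direction of its sign, so the element-wise
# division 1.0 / Zs in the ExpressionLayer cannot blow up. A minimal NumPy sketch
# of the same computation, assuming a small self.epsilon:
import numpy as np

def _add_epsilon_demo(zs, epsilon=1e-4):
    sign = (zs >= 0).astype(zs.dtype) * 2.0 - 1.0  # +1 where zs >= 0, -1 elsewhere
    return zs + epsilon * sign

_zs = np.array([0.0, 1e-9, -1e-9, 2.0], dtype='float32')
print(1.0 / _add_epsilon_demo(_zs))  # finite everywhere, including at exactly zero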
def get_conv_input(self, sidx, tidx, avg=False):
    suf = '_avg' if avg else ''
    feat_embs = [
        self.manager.feats[name].get_emb_layer(sidx, tidx, avg=avg)
        for name in self.args.source_feats
    ]
    # TODO: change the meaning
    if self.args.lex == 'mix':
        concat_emb = L.ElemwiseSumLayer(feat_embs)  # (100, 15, 256)
    else:
        concat_emb = L.concat(feat_embs, axis=2)  # (100, 15, 256+100)

    pos = np.array([0] * (self.args.window_size / 2) + [1] +
                   [0] * (self.args.window_size / 2)).astype(theano.config.floatX)
    post = theano.shared(pos[np.newaxis, :, np.newaxis], borrow=True)  # (1, 15, 1)
    posl = L.InputLayer(
        (None, self.args.window_size, 1),
        input_var=T.extra_ops.repeat(post, sidx.shape[0], axis=0))  # (100, 15, 1)
    conv_in = L.concat([concat_emb, posl], axis=2)  # (100, 15, 256+1)

    if self.args.pos_emb:
        posint = L.flatten(
            L.ExpressionLayer(posl, lambda x: T.cast(x, 'int64')))  # (100, 15)
        pos_emb = L.EmbeddingLayer(
            posint,
            self.args.window_size,
            8,
            name='epos' + suf,
            W=Normal(0.01) if not avg else Constant())  # (100, 15, 8)
        pos_emb.params[pos_emb.W].remove('regularizable')
        conv_in = L.concat([concat_emb, posl, pos_emb], axis=2)  # (100, 15, 256+1+8)

    # # squeeze
    # if self.args.squeeze:
    #     conv_in = L.DenseLayer(conv_in, num_units=self.args.squeeze, name='squeeze'+suf,
    #                            num_leading_axes=2, W=HeNormal('relu'))  # (100, 15, 256)

    conv_in = L.dimshuffle(conv_in, (0, 2, 1))  # (100, 256+1, 15)
    return conv_in
def _invert_GlobalPoolLayer(self, layer, feeder):
    assert isinstance(layer, L.GlobalPoolLayer)
    assert layer.pool_function == T.mean
    assert len(L.get_output_shape(layer.input_layer)) == 4

    target_shape = L.get_output_shape(feeder) + (1, 1)
    if target_shape[0] is None:
        target_shape = (-1, ) + target_shape[1:]

    feeder = L.ReshapeLayer(feeder, target_shape)

    upscaling = L.get_output_shape(layer.input_layer)[2:]
    feeder = L.Upscale2DLayer(feeder, upscaling)

    def expression(x):
        return x / np.prod(upscaling).astype(theano.config.floatX)

    feeder = L.ExpressionLayer(feeder, expression)
    return feeder
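# --- Sanity check (my addition, not from the original code): inverting a mean
# GlobalPoolLayer spreads each pooled value uniformly over the H x W grid with
# weight 1 / (H * W), which matches the gradient of T.mean over that grid.
import numpy as np

_pooled = np.array([[6.0]], dtype='float32')     # (batch=1, channels=1)
_upscaling = (2, 3)                              # spatial size of the pooled input
_spread = np.broadcast_to(_pooled[:, :, None, None],
                          (1, 1) + _upscaling) / np.prod(_upscaling)
print(_spread)        # every cell gets 6 / 6 = 1.0
print(_spread.sum())  # total mass equals the pooled value, 6.0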
def __init__(self, incomings, vocab_size, emb_size, W, WT=None, **kwargs):
    super(EncodingFullLayer, self).__init__(incomings, **kwargs)
    # if len(self.input_shapes[0]) == 3:
    #     batch_size, w_count, w_length = self.input_shapes[0]
    shape = tuple(self.input_shapes[0])
    # else:
    #     shape = tuple(self.input_shapes[0])
    self.WT = None
    # self.reset_zero()
    self.l_in = LL.InputLayer(shape=shape)
    self.l_in_pe = LL.InputLayer(shape=shape + (emb_size, ))
    self.l_emb = LL.EmbeddingLayer(self.l_in,
                                   input_size=vocab_size,
                                   output_size=emb_size,
                                   W=W)
    self.W = self.l_emb.W
    self.l_emb = LL.ElemwiseMergeLayer((self.l_emb, self.l_in_pe),
                                       merge_function=T.mul)
    self.l_emb_res = LL.ExpressionLayer(self.l_emb,
                                        lambda X: X.sum(2),
                                        output_shape='auto')
    # self.l_emb_res = SumLayer(self.l_emb, axis=2)
    if np.any(WT):
        self.l_emb_res = TemporalEncodicgLayer(self.l_emb_res, T=WT)
        self.WT = self.l_emb_res.T

    params = LL.helper.get_all_params(self.l_emb_res, trainable=True)
    values = LL.helper.get_all_param_values(self.l_emb_res, trainable=True)
    for p, v in zip(params, values):
        self.add_param(p, v.shape, name=p.name)

    zero_vec_tensor = T.vector()
    self.zero_vec = np.zeros(emb_size, dtype=theano.config.floatX)
    self.set_zero = theano.function(
        [zero_vec_tensor],
        updates=[(x, T.set_subtensor(x[-1, :], zero_vec_tensor))
                 for x in [self.W]])
def get_char2word(self, ic, avg=False):
    suf = '_avg' if avg else ''
    ec = L.EmbeddingLayer(
        ic,
        self.args.vc,
        self.args.nc,
        name='ec' + suf,
        W=HeNormal() if not avg else Constant())  # (100, 24, 32, 16)
    ec.params[ec.W].remove('regularizable')

    if self.args.char_model == 'CNN':
        lds = L.dimshuffle(ec, (0, 3, 1, 2))  # (100, 16, 24, 32)
        ls = []
        for n in self.args.ngrams:
            lconv = L.Conv2DLayer(
                lds,
                self.args.nf, (1, n),
                untie_biases=True,
                W=HeNormal('relu') if not avg else Constant(),
                name='conv_%d' % n + suf)  # (100, 64/4, 24, 32-n+1)
            lpool = L.MaxPool2DLayer(
                lconv, (1, self.args.max_len - n + 1))  # (100, 64, 24, 1)
            lpool = L.flatten(lpool, outdim=3)  # (100, 16, 24)
            lpool = L.dimshuffle(lpool, (0, 2, 1))  # (100, 24, 16)
            ls.append(lpool)
        xc = L.concat(ls, axis=2)  # (100, 24, 64)
        return xc
    elif self.args.char_model == 'LSTM':
        ml = L.ExpressionLayer(
            ic, lambda x: T.neq(x, 0))  # mask layer (100, 24, 32)
        ml = L.reshape(ml, (-1, self.args.max_len))  # (2400, 32)

        gate_params = L.recurrent.Gate(W_in=Orthogonal(), W_hid=Orthogonal())
        cell_params = L.recurrent.Gate(W_in=Orthogonal(),
                                       W_hid=Orthogonal(),
                                       W_cell=None,
                                       nonlinearity=tanh)

        lstm_in = L.reshape(
            ec, (-1, self.args.max_len, self.args.nc))  # (2400, 32, 16)
        lstm_f = L.LSTMLayer(
            lstm_in,
            self.args.nw / 2,
            mask_input=ml,
            grad_clipping=10.,
            learn_init=True,
            peepholes=False,
            precompute_input=True,
            ingate=gate_params,
            forgetgate=gate_params,
            cell=cell_params,
            outgate=gate_params,
            # unroll_scan=True,
            only_return_final=True,
            name='forward' + suf)  # (2400, 64)
        lstm_b = L.LSTMLayer(
            lstm_in,
            self.args.nw / 2,
            mask_input=ml,
            grad_clipping=10.,
            learn_init=True,
            peepholes=False,
            precompute_input=True,
            ingate=gate_params,
            forgetgate=gate_params,
            cell=cell_params,
            outgate=gate_params,
            # unroll_scan=True,
            only_return_final=True,
            backwards=True,
            name='backward' + suf)  # (2400, 64)

        remove_reg(lstm_f)
        remove_reg(lstm_b)
        if avg:
            set_zero(lstm_f)
            set_zero(lstm_b)
        xc = L.concat([lstm_f, lstm_b], axis=1)  # (2400, 128)
        xc = L.reshape(xc, (-1, self.args.sw, self.args.nw))  # (100, 24, 256)
        return xc
def __init__(self,
             incomings,
             vocab_size,
             emb_size,
             A=lasagne.init.Normal(std=0.1),
             C=lasagne.init.Normal(std=0.1),
             AT=lasagne.init.Normal(std=0.1),
             CT=lasagne.init.Normal(std=0.1),
             nonlin=lasagne.nonlinearities.softmax,
             RN=0.,
             **kwargs):
    super(MemoryLayer, self).__init__(incomings, **kwargs)

    self.vocab_size, self.emb_size = vocab_size, emb_size
    self.nonlin = nonlin
    self.RN = RN
    # self.A, self.C, self.AT, self.CT = A, C, AT, CT

    batch_size, c_count, c_length = self.input_shapes[0]
    _, q_count, _ = self.input_shapes[2]

    self.l_c_in = LL.InputLayer(shape=(batch_size, c_count, c_length))
    self.l_c_in_pe = LL.InputLayer(shape=(batch_size, c_count, c_length,
                                          self.emb_size))
    self.l_u_in = LL.InputLayer(shape=(batch_size, q_count, self.emb_size))

    self.l_c_A_enc = EncodingFullLayer((self.l_c_in, self.l_c_in_pe),
                                       self.vocab_size, self.emb_size, A, AT)
    self.l_c_C_enc = EncodingFullLayer((self.l_c_in, self.l_c_in_pe),
                                       self.vocab_size, self.emb_size, C, CT)
    self.A, self.C = self.l_c_A_enc.W, self.l_c_C_enc.W
    self.AT, self.CT = self.l_c_A_enc.WT, self.l_c_C_enc.WT

    if len(incomings) == 4:  # probabilities over sentences are also given
        self.l_in_ac_prob = LL.InputLayer(shape=(batch_size, c_count, emb_size))
        self.l_c_A_enc_ = LL.ElemwiseMergeLayer(
            (self.l_c_A_enc, self.l_in_ac_prob), merge_function=T.mul)
        self.l_c_C_enc_ = LL.ElemwiseMergeLayer(
            (self.l_c_C_enc, self.l_in_ac_prob), merge_function=T.mul)

    self.l_u_in_tr = LL.DimshuffleLayer(self.l_u_in, pattern=(0, 2, 1))
    if len(incomings) == 4:
        self.l_p = BatchedDotLayer((self.l_c_A_enc_, self.l_u_in_tr))
    else:
        self.l_p = BatchedDotLayer((self.l_c_A_enc, self.l_u_in_tr))

    if self.l_p.output_shape[2] == 1:
        self.l_p = LL.FlattenLayer(self.l_p, outdim=2)
        # self.l_p = LL.DimshuffleLayer(self.l_p, (0, 1))

    if self.nonlin == 'MaxOut':
        raise NotImplementedError
    self.l_p = LL.NonlinearityLayer(self.l_p, nonlinearity=nonlin)
    self.l_p = LL.DimshuffleLayer(self.l_p, (0, 1, 'x'))
    # self.l_p = LL.ReshapeLayer(self.l_p, self.l_p.output_shape + (1,))
    self.l_p = LL.ExpressionLayer(self.l_p,
                                  lambda X: X.repeat(emb_size, 2),
                                  output_shape='auto')
    ## self.l_p = RepeatDimLayer(self.l_p, emb_size, axis=2)
    if len(incomings) == 4:
        self.l_pc = LL.ElemwiseMergeLayer((self.l_p, self.l_c_C_enc_),
                                          merge_function=T.mul)
    else:
        self.l_pc = LL.ElemwiseMergeLayer((self.l_p, self.l_c_C_enc),
                                          merge_function=T.mul)
    self.l_o = LL.ExpressionLayer(self.l_pc,
                                  lambda X: X.sum(1),
                                  output_shape='auto')
    # self.l_o = SumLayer(self.l_pc, axis=1)
    self.l_o = LL.DimshuffleLayer(self.l_o, pattern=(0, 'x', 1))
    self.l_o_u = LL.ElemwiseMergeLayer((self.l_o, self.l_u_in),
                                       merge_function=T.add)

    params = LL.helper.get_all_params(self.l_o_u, trainable=True)
    values = LL.helper.get_all_param_values(self.l_o_u, trainable=True)
    for p, v in zip(params, values):
        self.add_param(p, v.shape, name=p.name)
def conv_concat(_in, _vec):
    n = _in.output_shape[2]
    _bcast = L.ExpressionLayer(
        _vec,
        lambda __X: __X.dimshuffle(0, 1, 'x', 'x') * T.ones(
            (__X.shape[0], __X.shape[1], n, n)),
        output_shape='auto')
    return L.ConcatLayer([_in, _bcast], axis=1)
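# --- Usage sketch (assumed shapes, not from the original source): conv_concat
# tiles a per-sample vector over the spatial grid of a square feature map and
# appends it as extra channels, the usual conditioning trick for conv nets.
import lasagne.layers as L

_feat = L.InputLayer((None, 64, 8, 8))   # hypothetical (batch, channels, 8, 8) maps
_cond = L.InputLayer((None, 10))         # hypothetical (batch, 10) condition vector
print(conv_concat(_feat, _cond).output_shape)  # expected: (None, 74, 8, 8)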
def create_conditon_slices_from(_cond, ish):
    if len(ish) == 2:
        return _cond
    else:
        return L.ExpressionLayer(
            _cond,
            lambda __X: __X.dimshuffle(0, 1, 'x', 'x')
            * T.ones((__X.shape[0], __X.shape[1]) + ish[-2:]),
            output_shape='auto')
def clip(l, b=1):
    """A very simple gradient clipping wrapper, because stupid lasagne
    doesn't support it."""
    return L.ExpressionLayer(l, lambda x: theano.gradient.grad_clip(x, -b, b))
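# --- Usage sketch (my addition): grad_clip leaves the forward pass untouched and
# only bounds the gradient signal flowing back through the wrapped layer to [-b, b].
import theano
import theano.tensor as T
import lasagne.layers as L

_x = T.matrix('x')
_l_in = L.InputLayer((None, 10), input_var=_x)
_l_dense = L.DenseLayer(_l_in, num_units=5)
_l_clipped = clip(_l_dense, b=1)

_cost = L.get_output(_l_clipped).sum()
_grads = T.grad(_cost, L.get_all_params(_l_dense, trainable=True))
_f = theano.function([_x], _grads)  # gradients of W and b, computed from the clipped backprop signal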
def additional_layer(self, idx_layer, emb_layer, avg=False):
    suf = '_avg' if avg else ''
    if self.name == 'char':
        if self.args.char_model == 'cnn':
            lds = L.dimshuffle(emb_layer, (0, 3, 1, 2))  # (100, 16, 26, 32)
            ls = []
            for n in self.args.ngrams:
                lconv = L.Conv2DLayer(
                    lds,
                    self.args.conv_dim, (1, n),
                    untie_biases=False,
                    # W=HeNormal('relu') if not avg else Constant(),
                    W=GlorotNormal('relu') if not avg else Constant(),
                    name='conv_%d' % n + suf)  # (100, 64/4, 26, 32-n+1)
                lpool = L.MaxPool2DLayer(
                    lconv, (1, self.args.max_word_len - n + 1))  # (100, 64, 26, 1)
                lpool = L.flatten(lpool, outdim=3)  # (100, 16, 26)
                lpool = L.dimshuffle(lpool, (0, 2, 1))  # (100, 26, 16)
                ls.append(lpool)
            xc = L.concat(ls, axis=2, name='echar_concat')  # (100, 26, 64)
            # additional
            # xc = L.DenseLayer(xc, self.args.embw_dim, nonlinearity=None, name='echar_affine',
            #                   num_leading_axes=2,
            #                   W=HeNormal() if not avg else Constant())  # (100, 26, 100)
            return xc
        elif self.args.char_model == 'lstm':
            ml = L.ExpressionLayer(
                idx_layer, lambda x: T.neq(x, 0))  # mask layer (100, 24, 32)
            ml = L.reshape(ml, (-1, self.args.max_word_len))  # (1500, 32)

            gate_params = L.recurrent.Gate(W_in=Orthogonal(), W_hid=Orthogonal())
            cell_params = L.recurrent.Gate(W_in=Orthogonal(),
                                           W_hid=Orthogonal(),
                                           W_cell=None,
                                           nonlinearity=tanh)

            lstm_in = L.reshape(
                emb_layer,
                (-1, self.args.max_word_len,
                 self.config['char']['emb_dim']))  # (1500, 32, 16)
            lstm_f = L.LSTMLayer(
                lstm_in,
                32,
                mask_input=ml,
                grad_clipping=10.,
                learn_init=True,
                peepholes=False,
                precompute_input=True,
                ingate=gate_params,
                forgetgate=gate_params,
                cell=cell_params,
                outgate=gate_params,
                # unroll_scan=True,
                only_return_final=True,
                name='forward' + suf)  # (1500, 32)
            lstm_b = L.LSTMLayer(
                lstm_in,
                32,
                mask_input=ml,
                grad_clipping=10.,
                learn_init=True,
                peepholes=False,
                precompute_input=True,
                ingate=gate_params,
                forgetgate=gate_params,
                cell=cell_params,
                outgate=gate_params,
                # unroll_scan=True,
                only_return_final=True,
                backwards=True,
                name='backward' + suf)  # (1500, 32)

            remove_reg(lstm_f)
            remove_reg(lstm_b)
            if avg:
                set_zero(lstm_f)
                set_zero(lstm_b)
            xc = L.concat([lstm_f, lstm_b], axis=1)  # (1500, 64)
            if self.args.lstm_tagger:
                xc = L.reshape(
                    xc, (-1, self.args.max_sent_len, 64))  # (100, 161, 64)
            elif self.args.trans_tagger:
                xc = L.reshape(
                    xc, (-1, self.args.window_size, 64))  # (100, 15, 64)
            else:
                xc = L.reshape(xc, (-1, 26, 64))  # (100, 26, 64)
            return xc
    elif self.name == 'morph':
        # idx (100, 26/161, 16)  emb (100, 26/161, 16, 32)
        if self.args.morph_model == 'max':
            xm = L.MaxPool2DLayer(
                emb_layer, (self.args.max_morph_len, 1))  # (100, 26/161, 1, 32)
            # xm = L.reshape(xm, (-1, 26, self.config['morph']['emb_dim']))  # (100, 26/161, 32)
            xm = L.flatten(xm, outdim=3)  # (100, 26/161, 32)
            # xm = L.ExpressionLayer(emb_layer, lambda x: T.max(x, 2))
        elif self.args.morph_model == 'avg':
            mask = L.ExpressionLayer(
                idx_layer, lambda x: T.neq(x, 0))  # (100, 26, 16)
            mask = L.dimshuffle(mask, (0, 1, 2, 'x'))  # (100, 26, 16, 1)
            mask = L.ExpressionLayer(
                mask,
                lambda x: T.extra_ops.repeat(
                    x, self.config['morph']['emb_dim'], 3))  # (100, 26, 16, 1)
            xm = L.ElemwiseMergeLayer(
                [emb_layer, mask],
                lambda x, m: T.sum(x * m, 2) / T.sum(m, 2))  # (100, 26, 32)
            # xm = L.reshape(xm, (-1, self.args.feat_shape, self.config['morph']['emb_dim']))  # (100, 26, 32)
        return xm
    else:
        return emb_layer
def make_model():
    image = ll.InputLayer((BS, CH, IH, IW), name='step1.image')

    h_read_init = ll.InputLayer(
        (HS, ),
        lasagne.utils.create_param(li.Uniform(), (HS, ),
                                   name='step1.tensor.h_read_init'),
        name='step1.h_read_init')
    h_read_init.add_param(h_read_init.input_var, (HS, ))
    h_write_init = ll.InputLayer(
        (HS, ),
        lasagne.utils.create_param(li.Uniform(), (HS, ),
                                   name='step1.tensor.h_write_init'),
        name='step1.h_write_init')
    h_write_init.add_param(h_write_init.input_var, (HS, ))

    h_read = ll.ExpressionLayer(h_read_init,
                                lambda t: T.tile(T.reshape(t, (1, HS)), (BS, 1)),
                                (BS, HS),
                                name='step1.h_read')
    h_write = ll.ExpressionLayer(h_write_init,
                                 lambda t: T.tile(T.reshape(t, (1, HS)), (BS, 1)),
                                 (BS, HS),
                                 name='step1.h_write')

    canvas = ll.InputLayer(
        (BS, CH, IH, IW),
        lasagne.utils.create_param(li.Constant(0.0), (BS, CH, IH, IW),
                                   name='step1.tensor.canvas'),
        name='step1.canvas')

    image_prev = ll.NonlinearityLayer(canvas, ln.sigmoid, name='step1.image_prev')
    image_error = ll.ElemwiseSumLayer([image, image_prev],
                                      coeffs=[1, -1],
                                      name='step1.image_error')
    image_stack = ll.ConcatLayer([image, image_error], name='step1.image_stack')

    read_params = ll.DenseLayer(h_write, 6, nonlinearity=None,
                                name='step1.read_params')
    read_window = advanced_layers.AttentionLayer([read_params, image_stack],
                                                 (WH, WW),
                                                 name='step1.read_window')
    read_flat = ll.FlattenLayer(read_window, name='step1.read_flat')
    read_code = ll.ConcatLayer([read_flat, h_write], name='step1.read_code')
    read_code_sequence = ll.ReshapeLayer(read_code,
                                         (BS, 1, read_code.output_shape[-1]),
                                         name='step1.read_code_sequence')
    read_rnn = ll.GRULayer(
        read_code_sequence,
        HS,
        only_return_final=True,
        hid_init=h_read,
        name='step1.read_rnn',
    )

    sample_mean = ll.DenseLayer(read_rnn, ENC_NDIM, nonlinearity=None,
                                name='step1.sample_mean')
    sample_logvar2 = ll.DenseLayer(read_rnn, ENC_NDIM, nonlinearity=None,
                                   name='step1.sample_logvar2')
    sample = advanced_layers.SamplingLayer([sample_mean, sample_logvar2],
                                           ENC_VAR,
                                           name='step1.sample')

    write_code = ll.DenseLayer(sample, HS, name='step1.write_code')
    write_code_sequence = ll.ReshapeLayer(write_code,
                                          (BS, 1, write_code.output_shape[-1]),
                                          name='step1.write_code_sequence')
    write_rnn = ll.GRULayer(
        write_code_sequence,
        HS,
        only_return_final=True,
        hid_init=h_write,
        name='step1.write_rnn',
    )
    write_window_flat = ll.DenseLayer(write_rnn, CH * WH * WW,
                                      name='step1.write_window_flat')
    write_window = ll.ReshapeLayer(write_window_flat, (BS, CH, WH, WW),
                                   name='step1.write_window')
    write_params = ll.DenseLayer(h_write, 6, nonlinearity=None,
                                 name='step1.write_params')
    write_image = advanced_layers.AttentionLayer([write_params, write_window],
                                                 (IH, IW),
                                                 name='step1.write_image')
    canvas_next = ll.ElemwiseSumLayer([canvas, write_image],
                                      name='step1.canvas_next')

    def rename(name):
        if name is None:
            return None
        step, real_name = name.split('.', 1)
        step = int(step[4:])
        return 'step%d.%s' % (step + 1, real_name)

    for step in xrange(1, TIME_ROUNDS):
        sample_random_variable_next = sample.random_stream.normal(
            sample.input_shapes[0],
            std=sample.variation_coeff,
        )
        sample_random_variable_next.name = 'step%d.sample.random_variable' % \
            (step + 1)
        canvas, canvas_next = (canvas_next,
                               utils.modified_copy(
                                   canvas_next,
                                   modify={
                                       h_read: read_rnn,
                                       h_write: write_rnn,
                                       canvas: canvas_next,
                                       sample.random_stream: sample.random_stream,
                                       sample.random_variable:
                                           sample_random_variable_next,
                                   },
                                   rename=rename,
                               ))
        h_read = read_rnn
        h_write = write_rnn
        read_rnn = utils.layer_by_name(canvas_next,
                                       'step%d.read_rnn' % (step + 1))
        write_rnn = utils.layer_by_name(canvas_next,
                                        'step%d.write_rnn' % (step + 1))
        sample = utils.layer_by_name(canvas_next, 'step%d.sample' % (step + 1))

    output = ll.NonlinearityLayer(canvas_next, ln.sigmoid, name='output')
    return output
def _build_net(self, emb_char_filter_size=5, emb_dropout=True, **kwargs):

    batch_size = self.mask_context_var.shape[0]
    context_len = self.mask_context_var.shape[1]
    question_len = self.question_var.shape[1]
    context_word_len = self.context_char_var.shape[2]
    question_word_len = self.question_char_var.shape[2]

    self.batch_size = batch_size
    self.context_len = context_len

    ''' Inputs and word embeddings '''

    l_context_char = LL.InputLayer(shape=(None, None, None),
                                   input_var=self.context_char_var)
    l_question_char = LL.InputLayer(shape=(None, None, None),
                                    input_var=self.question_char_var)

    l_c_mask = LL.InputLayer(shape=(None, None),
                             input_var=self.mask_context_var)
    l_q_mask = LL.InputLayer(shape=(None, None),
                             input_var=self.mask_question_var)

    l_c_char_mask = LL.InputLayer(shape=(None, None, None),
                                  input_var=self.mask_context_char_var)
    l_q_char_mask = LL.InputLayer(shape=(None, None, None),
                                  input_var=self.mask_question_char_var)

    l_c_emb = LL.InputLayer(shape=(None, None, self.emb_size),
                            input_var=self.context_var)
    l_q_emb = LL.InputLayer(shape=(None, None, self.emb_size),
                            input_var=self.question_var)

    if self.train_unk:
        l_c_unk_mask = LL.InputLayer(shape=(None, None),
                                     input_var=self.mask_context_unk_var)
        l_q_unk_mask = LL.InputLayer(shape=(None, None),
                                     input_var=self.mask_question_unk_var)

        l_c_emb = TrainUnkLayer(l_c_emb,
                                l_c_unk_mask,
                                output_size=self.emb_size,
                                W=self.word_embeddings[0])
        l_q_emb = TrainUnkLayer(l_q_emb,
                                l_q_unk_mask,
                                output_size=self.emb_size,
                                W=l_c_emb.W)

    if self.negative:
        l_c_emb = TrainNAWLayer(l_c_emb, l_c_mask, output_size=self.emb_size)

    ''' Char-embeddings '''

    # (batch_size x context_len x context_word_len x emb_char_size)
    l_c_char_emb = LL.EmbeddingLayer(l_context_char,
                                     input_size=self.alphabet_size,
                                     output_size=self.emb_char_size)
    l_q_char_emb = LL.EmbeddingLayer(l_question_char,
                                     input_size=self.alphabet_size,
                                     output_size=self.emb_char_size,
                                     W=l_c_char_emb.W)

    # here I do multiplication of character embeddings with masks,
    # because I want to pad them with constant zeros
    l_c_char_mask = ForgetSizeLayer(
        LL.dimshuffle(l_c_char_mask, (0, 1, 2, 'x')))
    l_q_char_mask = ForgetSizeLayer(
        LL.dimshuffle(l_q_char_mask, (0, 1, 2, 'x')))

    l_c_char_emb = LL.ElemwiseMergeLayer([l_c_char_emb, l_c_char_mask], T.mul)
    l_q_char_emb = LL.ElemwiseMergeLayer([l_q_char_emb, l_q_char_mask], T.mul)

    # convolutions
    l_c_char_emb = LL.dimshuffle(
        LL.reshape(l_c_char_emb, (batch_size * context_len, context_word_len,
                                  self.emb_char_size)), (0, 2, 1))
    l_c_char_conv = LL.Conv1DLayer(l_c_char_emb,
                                   num_filters=self.num_emb_char_filters,
                                   filter_size=emb_char_filter_size,
                                   nonlinearity=L.nonlinearities.tanh,
                                   pad=self.conv)
    # (batch_size * context_len x num_filters x context_word_len + filter_size - 1)
    l_c_char_emb = LL.ExpressionLayer(l_c_char_conv,
                                      lambda X: X.max(2),
                                      output_shape='auto')
    l_c_char_emb = LL.reshape(
        l_c_char_emb, (batch_size, context_len, self.num_emb_char_filters))

    l_q_char_emb = LL.dimshuffle(
        LL.reshape(l_q_char_emb, (batch_size * question_len, question_word_len,
                                  self.emb_char_size)), (0, 2, 1))
    l_q_char_conv = LL.Conv1DLayer(l_q_char_emb,
                                   num_filters=self.num_emb_char_filters,
                                   filter_size=emb_char_filter_size,
                                   nonlinearity=L.nonlinearities.tanh,
                                   W=l_c_char_conv.W,
                                   b=l_c_char_conv.b,
                                   pad=self.conv)
    # (batch_size * question_len x num_filters x question_word_len + filter_size - 1)
    l_q_char_emb = LL.ExpressionLayer(l_q_char_conv,
                                      lambda X: X.max(2),
                                      output_shape='auto')
    l_q_char_emb = LL.reshape(
        l_q_char_emb, (batch_size, question_len, self.num_emb_char_filters))

    ''' Concatenating both embeddings '''

    l_c_emb = LL.concat([l_c_emb, l_c_char_emb], axis=2)
    l_q_emb = LL.concat([l_q_emb, l_q_char_emb], axis=2)

    # originally I had dropout here

    ''' Highway layer allowing for interaction between embeddings '''

    l_c_P = LL.reshape(l_c_emb, (batch_size * context_len,
                                 self.emb_size + self.num_emb_char_filters))
    l_c_P = LL.DenseLayer(l_c_P,
                          num_units=self.rec_size,
                          b=None,
                          nonlinearity=None)
    l_c_high = HighwayLayer(l_c_P)
    l_c_emb = LL.reshape(l_c_high, (batch_size, context_len, self.rec_size))

    l_q_P = LL.reshape(l_q_emb, (batch_size * question_len,
                                 self.emb_size + self.num_emb_char_filters))
    l_q_P = LL.DenseLayer(l_q_P,
                          num_units=self.rec_size,
                          W=l_c_P.W,
                          b=None,
                          nonlinearity=None)
    l_q_high = HighwayLayer(l_q_P,
                            W1=l_c_high.W1,
                            b1=l_c_high.b1,
                            W2=l_c_high.W2,
                            b2=l_c_high.b2)
    l_q_emb = LL.reshape(l_q_high, (batch_size, question_len, self.rec_size))

    ''' Calculating wiq features from https://arxiv.org/abs/1703.04816 '''

    # batch_size x context_len
    l_weighted_feat = WeightedFeatureLayer(
        [l_c_emb, l_q_emb, l_c_mask, l_q_mask])
    l_weighted_feat = LL.dimshuffle(l_weighted_feat, (0, 1, 'x'))

    # batch_size x context_len
    l_bin_feat = LL.InputLayer(shape=(None, None),
                               input_var=self.bin_feat_var)
    l_bin_feat = LL.dimshuffle(l_bin_feat, (0, 1, 'x'))

    ''' Dropout at the embeddings '''

    if emb_dropout:
        print('Using dropout after wiq calculation.')
        l_c_emb = LL.dropout(l_c_emb)
        l_q_emb = LL.dropout(l_q_emb)

    ''' Here we concatenate wiq features to embeddings '''

    # both features are concatenated to the embeddings;
    # for the question we fix the features to 1
    l_c_emb = LL.concat([l_c_emb, l_bin_feat, l_weighted_feat], axis=2)
    l_q_emb = LL.pad(l_q_emb,
                     width=[(0, 2)],
                     val=L.utils.floatX(1),
                     batch_ndim=2)

    ''' Context and question encoding using the same BiLSTM for both '''

    # output shape is (batch_size x context_len x rec_size)
    l_c_enc_forw = LL.LSTMLayer(l_c_emb,
                                num_units=self.rec_size,
                                grad_clipping=100,
                                mask_input=l_c_mask)
    l_c_enc_back = LL.LSTMLayer(l_c_emb,
                                num_units=self.rec_size,
                                grad_clipping=100,
                                mask_input=l_c_mask,
                                backwards=True)

    # output shape is (batch_size x question_len x rec_size)
    l_q_enc_forw = LL.LSTMLayer(
        l_q_emb,
        num_units=self.rec_size,
        grad_clipping=100,
        mask_input=l_q_mask,
        ingate=LL.Gate(W_in=l_c_enc_forw.W_in_to_ingate,
                       W_hid=l_c_enc_forw.W_hid_to_ingate,
                       W_cell=l_c_enc_forw.W_cell_to_ingate,
                       b=l_c_enc_forw.b_ingate),
        forgetgate=LL.Gate(W_in=l_c_enc_forw.W_in_to_forgetgate,
                           W_hid=l_c_enc_forw.W_hid_to_forgetgate,
                           W_cell=l_c_enc_forw.W_cell_to_forgetgate,
                           b=l_c_enc_forw.b_forgetgate),
        outgate=LL.Gate(W_in=l_c_enc_forw.W_in_to_outgate,
                        W_hid=l_c_enc_forw.W_hid_to_outgate,
                        W_cell=l_c_enc_forw.W_cell_to_outgate,
                        b=l_c_enc_forw.b_outgate),
        cell=LL.Gate(W_in=l_c_enc_forw.W_in_to_cell,
                     W_hid=l_c_enc_forw.W_hid_to_cell,
                     W_cell=None,
                     b=l_c_enc_forw.b_cell,
                     nonlinearity=L.nonlinearities.tanh))
    l_q_enc_back = LL.LSTMLayer(
        l_q_emb,
        num_units=self.rec_size,
        grad_clipping=100,
        mask_input=l_q_mask,
        backwards=True,
        ingate=LL.Gate(W_in=l_c_enc_back.W_in_to_ingate,
                       W_hid=l_c_enc_back.W_hid_to_ingate,
                       W_cell=l_c_enc_back.W_cell_to_ingate,
                       b=l_c_enc_back.b_ingate),
        forgetgate=LL.Gate(W_in=l_c_enc_back.W_in_to_forgetgate,
                           W_hid=l_c_enc_back.W_hid_to_forgetgate,
                           W_cell=l_c_enc_back.W_cell_to_forgetgate,
                           b=l_c_enc_back.b_forgetgate),
        outgate=LL.Gate(W_in=l_c_enc_back.W_in_to_outgate,
                        W_hid=l_c_enc_back.W_hid_to_outgate,
                        W_cell=l_c_enc_back.W_cell_to_outgate,
                        b=l_c_enc_back.b_outgate),
        cell=LL.Gate(W_in=l_c_enc_back.W_in_to_cell,
                     W_hid=l_c_enc_back.W_hid_to_cell,
                     W_cell=None,
                     b=l_c_enc_back.b_cell,
                     nonlinearity=L.nonlinearities.tanh))

    # batch_size x context_len x 2*rec_size
    l_c_enc = LL.concat([l_c_enc_forw, l_c_enc_back], axis=2)
    # batch_size x question_len x 2*rec_size
    l_q_enc = LL.concat([l_q_enc_forw, l_q_enc_back], axis=2)

    def proj_init():
        return np.vstack([
            np.eye(self.rec_size, dtype=theano.config.floatX),
            np.eye(self.rec_size, dtype=theano.config.floatX)
        ])

    # this is H from the paper, shape: (batch_size * context_len x rec_size)
    l_c_proj = LL.reshape(l_c_enc,
                          (batch_size * context_len, 2 * self.rec_size))
    l_c_proj = LL.DenseLayer(l_c_proj,
                             num_units=self.rec_size,
                             W=proj_init(),
                             b=None,
                             nonlinearity=L.nonlinearities.tanh)

    # this is Z from the paper, shape: (batch_size * question_len x rec_size)
    l_q_proj = LL.reshape(l_q_enc,
                          (batch_size * question_len, 2 * self.rec_size))
    l_q_proj = LL.DenseLayer(l_q_proj,
                             num_units=self.rec_size,
                             W=proj_init(),
                             b=None,
                             nonlinearity=L.nonlinearities.tanh)

    ''' Additional, weighted question encoding (alphas from paper) '''

    # batch_size * question_len x 1
    l_alpha = LL.DenseLayer(l_q_proj,
                            num_units=1,
                            b=None,
                            nonlinearity=None)

    # batch_size x question_len
    l_alpha = MaskedSoftmaxLayer(
        LL.reshape(l_alpha, (batch_size, question_len)), l_q_mask)

    # batch_size x rec_size
    l_z_hat = BatchedDotLayer([
        LL.reshape(l_q_proj, (batch_size, question_len, self.rec_size)),
        l_alpha
    ])

    return l_c_proj, l_z_hat
def __init__(self,
             input_shape,
             output_dim,
             hidden_sizes,
             conv_filters,
             conv_filter_sizes,
             conv_strides,
             conv_pads,
             hidden_W_init=LI.GlorotUniform(),
             hidden_b_init=LI.Constant(0.),
             output_W_init=LI.GlorotUniform(),
             output_b_init=LI.Constant(0.),
             hidden_nonlinearity=LN.rectify,
             output_nonlinearity=LN.softmax,
             name=None,
             input_var=None):
    if name is None:
        prefix = ""
    else:
        prefix = name + "_"

    if len(input_shape) == 3:
        l_in = L.InputLayer(shape=(None, np.prod(input_shape)),
                            input_var=input_var)
        l_hid = L.reshape(l_in, ([0], ) + input_shape)
    elif len(input_shape) == 2:
        l_in = L.InputLayer(shape=(None, np.prod(input_shape)),
                            input_var=input_var)
        input_shape = (1, ) + input_shape
        l_hid = L.reshape(l_in, ([0], ) + input_shape)
    else:
        l_in = L.InputLayer(shape=(None, ) + input_shape,
                            input_var=input_var)
        l_hid = l_in

    assert input_shape[0] % 2 == 0
    l_hid0 = L.SliceLayer(l_hid, slice(None, input_shape[0] // 2), axis=1)
    l_hid1 = L.SliceLayer(l_hid, slice(input_shape[0] // 2, None), axis=1)
    l_hids = [l_hid0, l_hid1]

    for idx, conv_filter, filter_size, stride, pad in zip(
            range(len(conv_filters)),
            conv_filters,
            conv_filter_sizes,
            conv_strides,
            conv_pads,
    ):
        for ihid in range(len(l_hids)):
            if ihid > 0:
                conv_kwargs = dict(W=l_hids[0].W, b=l_hids[0].b)
            else:
                conv_kwargs = dict()
            l_hids[ihid] = L.Conv2DLayer(l_hids[ihid],
                                         num_filters=conv_filter,
                                         filter_size=filter_size,
                                         stride=(stride, stride),
                                         pad=pad,
                                         nonlinearity=hidden_nonlinearity,
                                         name="%sconv_hidden_%d_%d" %
                                         (prefix, idx, ihid),
                                         convolution=wrapped_conv,
                                         **conv_kwargs)

    l_hid = L.ElemwiseSumLayer(l_hids, coeffs=[-1, 1])
    l_hid = L.ExpressionLayer(l_hid, lambda X: X * X)

    for idx, hidden_size in enumerate(hidden_sizes):
        l_hid = L.DenseLayer(
            l_hid,
            num_units=hidden_size,
            nonlinearity=hidden_nonlinearity,
            name="%shidden_%d" % (prefix, idx),
            W=hidden_W_init,
            b=hidden_b_init,
        )
    l_out = L.DenseLayer(
        l_hid,
        num_units=output_dim,
        nonlinearity=output_nonlinearity,
        name="%soutput" % (prefix, ),
        W=output_W_init,
        b=output_b_init,
    )
    self._l_in = l_in
    self._l_out = l_out
    self._input_var = l_in.input_var
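# --- Note (my reading of the block above, not from the original source): the two
# channel halves pass through weight-shared conv stacks, and ElemwiseSumLayer with
# coeffs=[-1, 1] followed by the squaring ExpressionLayer yields the elementwise
# squared difference (h1 - h0) ** 2 between the two streams. Tiny NumPy check:
import numpy as np

_h0 = np.array([1.0, 2.0, 3.0])
_h1 = np.array([0.5, 4.0, 3.0])
print((-1 * _h0 + 1 * _h1) ** 2)  # [0.25, 4.0, 0.0]
print((_h1 - _h0) ** 2)           # identical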
def clip(l, b=1):
    return L.ExpressionLayer(l, lambda x: theano.gradient.grad_clip(x, -b, b))