def get_embedding_layer(self, l_in, extra_vars):
    """Build a bilingual (EN/ZH) embedding over token input `l_in`.

    Two separate embedding tables (optionally loaded from pretrained files)
    are each projected to the listener cell size; a per-example language id
    then switches between the two branches.

    Returns (merged embedding layer, remaining context variables).
    """
    # First extra var is the language-id tensor; the rest are passed through.
    language = extra_vars[0]
    context_vars = extra_vars[1:]
    id_tag = (self.id + '/') if self.id else ''
    # Language id input, one scalar id per example.
    l_lang = InputLayer(shape=(None, ), input_var=language,
                        name=id_tag + 'lang_input')
    if self.options.bilingual_en_embed_file:
        # Pretrained English embeddings; output size comes from the file.
        en_embeddings = load_embeddings(
            self.options.bilingual_en_embed_file, self.seq_vec)
        en_embed_size = en_embeddings.shape[1]
    else:
        # No file given: random-normal init at the configured size.
        en_embeddings = Normal()
        en_embed_size = self.options.bilingual_embed_size
    if self.options.bilingual_zh_embed_file:
        # Pretrained Chinese embeddings; output size comes from the file.
        zh_embeddings = load_embeddings(
            self.options.bilingual_zh_embed_file, self.seq_vec)
        zh_embed_size = zh_embeddings.shape[1]
    else:
        zh_embeddings = Normal()
        zh_embed_size = self.options.bilingual_embed_size
    l_en = EmbeddingLayer(l_in, input_size=len(self.seq_vec.tokens),
                          output_size=en_embed_size,
                          W=en_embeddings,
                          name=id_tag + 'desc_embed_en')
    # Move the embedding axis to position 1 for NINLayer, project to the
    # common listener cell size, then restore the original axis order.
    l_en_transformed = dimshuffle(l_en, (0, 2, 1))
    l_en_transformed = NINLayer(l_en_transformed,
                                num_units=self.options.listener_cell_size,
                                nonlinearity=None,
                                name=id_tag + 'desc_embed_en_transformed')
    l_en_transformed = dimshuffle(l_en_transformed, (0, 2, 1))
    l_zh = EmbeddingLayer(l_in, input_size=len(self.seq_vec.tokens),
                          output_size=zh_embed_size,
                          W=zh_embeddings,
                          name=id_tag + 'desc_embed_zh')
    # Same projection for the Chinese branch.
    l_zh_transformed = dimshuffle(l_zh, (0, 2, 1))
    l_zh_transformed = NINLayer(l_zh_transformed,
                                num_units=self.options.listener_cell_size,
                                nonlinearity=None,
                                name=id_tag + 'desc_embed_zh_transformed')
    l_zh_transformed = dimshuffle(l_zh_transformed, (0, 2, 1))
    # SwitchLayer presumably selects the EN or ZH branch per example
    # according to the language id -- confirm against its definition.
    l_merged = SwitchLayer(l_lang, [l_en_transformed, l_zh_transformed],
                           name=id_tag + 'desc_embed_switch')
    return (l_merged, context_vars)
def build_model(self):
    """Compile the unsupervised user-embedding training function.

    Two embedding tables (target user / context user) are multiplied
    element-wise; the loss is a negative log-sigmoid of the dot product
    scaled by the labels (y_labels is presumably +1/-1 for positive and
    negative pairs, skip-gram style -- confirm against the caller).

    Returns (batch train function, target-user embedding layer).
    """
    # Define tensor variables.
    x_user = T.ivector("x_user")
    x_user_context = T.ivector("x_user_context")
    y_labels = T.vector("y_emb")
    ################################################################################################################
    # Unsupervised embedding learning.
    ################################################################################################################
    l_in_user = InputLayer(shape=(None, ), input_var=x_user)
    l_in_user_context = InputLayer(shape=(None, ), input_var=x_user_context)
    l1_user = EmbeddingLayer(l_in_user,
                             input_size=self.number_of_users,
                             output_size=self.embedding_size,
                             W=lasagne.init.GlorotUniform(gain=1.0))
    l1_user_context = EmbeddingLayer(
        l_in_user_context,
        input_size=self.number_of_users,
        output_size=self.embedding_size,
        W=lasagne.init.GlorotUniform(gain=1.0))
    # Element-wise product; summing it over axis 1 below yields the
    # dot product between the two embedding vectors.
    l_user_user_merge = lasagne.layers.ElemwiseMergeLayer(
        [l1_user, l1_user_context], T.mul)
    self.l.append(l_user_user_merge)
    user_user_embedding_merge = lasagne.layers.get_output(
        l_user_user_merge)
    # -log sigmoid(<u, v> * y): pushes positive pairs together and
    # negative pairs apart.
    user_user_loss = -T.log(
        T.nnet.sigmoid(
            T.sum(user_user_embedding_merge, axis=1) * y_labels)).sum()
    l_user_user_merge_params = lasagne.layers.get_all_params(
        l_user_user_merge, trainable=True)
    user_user_updates = lasagne.updates.adam(
        user_user_loss, l_user_user_merge_params,
        learning_rate=self.learning_rate)
    user_user_batch_train_function = theano.function(
        [x_user, x_user_context, y_labels], user_user_loss,
        updates=user_user_updates,
        on_unused_input="ignore")
    return user_user_batch_train_function, \
        l1_user
def _add_word_embeddings(self):
    """Create token-id inputs, padding masks and word-embedding lookups.

    input_x is a 3-D batch of contexts (batch, context, time); input_y is a
    2-D batch of output sequences (batch, time).
    """
    self._net['input_x'] = InputLayer(shape=(None, None, None),
                                      input_var=T.itensor3(name='input_x'),
                                      name='input_x')
    self._net['input_y'] = InputLayer(shape=(None, None),
                                      input_var=T.imatrix(name='input_y'),
                                      name='input_y')
    # Infer these variables from data passed to computation graph since batch shape may differ in training and
    # prediction phases
    self._batch_size = self._net['input_x'].input_var.shape[0]
    self._input_context_size = self._net['input_x'].input_var.shape[1]
    self._input_seq_len = self._net['input_x'].input_var.shape[2]
    self._output_seq_len = self._net['input_y'].input_var.shape[1]
    # Fold the context axis into the batch axis so mask/embedding layers
    # treat every context utterance as an independent sequence.
    self._net['input_x_batched'] = \
        reshape(self._net['input_x'],
                (self._batch_size * self._input_context_size,
                 self._input_seq_len))
    # Mask is 1 wherever the token differs from the skip (padding) id.
    self._net['input_x_mask'] = NotEqualMaskLayer(
        incoming=self._net['input_x_batched'],
        x=self._skip_token_id,
        name='mask_x')
    self._net['emb_x'] = EmbeddingLayer(
        incoming=self._net['input_x_batched'],
        input_size=self._vocab_size,
        output_size=self._word_embedding_dim,
        W=self._W_init_embedding,
        name='emb_x')  # output shape (batch_size, input_context_size, input_seq_len, embedding_dimension)
    self._net['input_y_mask'] = NotEqualMaskLayer(
        incoming=self._net['input_y'],
        x=self._skip_token_id,
        name='mask_y')
    self._net['emb_y'] = EmbeddingLayer(
        incoming=self._net['input_y'],
        input_size=self._vocab_size,
        output_size=self._word_embedding_dim,
        W=self._W_init_embedding,
        name='emb_y')  # output shape (batch_size, output_seq_len, embedding_dimension)
    if not self._train_word_embedding:
        # Freeze both embedding matrices by removing the 'trainable' tag.
        self._net['emb_x'].params[self._net['emb_x'].W].remove('trainable')
        self._net['emb_y'].params[self._net['emb_y'].W].remove('trainable')
def _get_l_out(self, input_vars):
    """Build the feed-forward listener: embed tokens, optionally run them
    through a dense (+dropout) hidden layer, and softmax the scores.

    Returns (output layer, [input layer]).
    """
    opts = self.options
    prefix = (self.id + '/') if self.id else ''
    tokens_in = InputLayer(shape=(None,), input_var=input_vars[0],
                           name=prefix + 'desc_input')
    # With cell size 0 the embedding itself must already have one unit
    # per color type, since it directly provides the scores.
    n_embed = opts.listener_cell_size or self.color_vec.num_types
    embedded = EmbeddingLayer(tokens_in,
                              input_size=len(self.seq_vec.tokens),
                              output_size=n_embed,
                              name=prefix + 'desc_embed')
    if opts.listener_cell_size == 0:
        scores = embedded  # BiasLayer(embedded, name=prefix + 'bias')
    else:
        hidden = DenseLayer(
            embedded,
            num_units=opts.listener_cell_size,
            nonlinearity=NONLINEARITIES[opts.listener_nonlinearity],
            name=prefix + 'hidden')
        if opts.listener_dropout > 0.0:
            hidden = DropoutLayer(hidden, p=opts.listener_dropout,
                                  name=prefix + 'hidden_drop')
        scores = DenseLayer(hidden, num_units=self.color_vec.num_types,
                            nonlinearity=None, name=prefix + 'scores')
    probs = NonlinearityLayer(scores, nonlinearity=softmax,
                              name=prefix + 'out')
    return probs, [tokens_in]
def test_embedding_2D_input():
    """An EmbeddingLayer over 2-D integer input is a plain row lookup."""
    import numpy as np
    import theano
    import theano.tensor as T
    from lasagne.layers import EmbeddingLayer, InputLayer, helper

    sym_ids = T.imatrix()
    n_batch, n_steps, n_dims, n_words = 2, 3, 5, 3

    layer_in = InputLayer((None, n_steps))
    weights = np.arange(
        n_words * n_dims).reshape((n_words, n_dims)).astype('float32')
    layer_emb = EmbeddingLayer(layer_in, input_size=n_words,
                               output_size=n_dims, W=weights)

    ids = np.array([[0, 1, 2], [0, 0, 2]], dtype='int32')

    # Shape inference must append the embedding dimension.
    assert helper.get_output_shape(
        layer_emb, (n_batch, n_steps)) == (n_batch, n_steps, n_dims)

    # Compiled output must equal direct numpy indexing into W.
    fn = theano.function([sym_ids], helper.get_output(layer_emb, sym_ids))
    np.testing.assert_array_almost_equal(fn(ids), weights[ids])
def __init__(self, vocab):
    """Encoder: token ids -> embedding -> masked LSTM; the LSTM's final
    hidden state is exposed as self.output."""
    ### THEANO GRAPH INPUT ###
    phrase = T.imatrix("encoder phrase tokens")
    self.input_phrase = phrase
    ##########################
    # Mask is nonzero on every non-padding position.
    not_pad = T.neq(phrase, vocab.PAD_ix)
    self.l_in = InputLayer((None, None), phrase, name='context input')
    self.l_mask = InputLayer((None, None), not_pad, name='context mask')
    self.l_emb = EmbeddingLayer(self.l_in, vocab.n_tokens, Config.EMB_SIZE,
                                name="context embedding")
    self.l_lstm = LSTMLayer(self.l_emb,
                            Config.N_LSTM_UNITS,
                            name='encoder_lstm',
                            grad_clipping=Config.LSTM_LAYER_GRAD_CLIP,
                            mask_input=self.l_mask,
                            only_return_final=True,
                            peepholes=False)
    self.output = self.l_lstm
def build_lstm_decorer():
    """Caption decoder: visual feature prepended to word embeddings, fed
    through a dropout'd LSTM, then a per-timestep softmax over the vocab.

    Returns an OrderedDict of all layers, keyed by stage name.
    """
    seq_len = CFG['SEQUENCE LENGTH']
    emb_size = CFG['EMBEDDING SIZE']
    vocab = CFG['VOCAB SIZE']

    net = collections.OrderedDict()
    net['sent_input'] = InputLayer((None, seq_len - 1),
                                   input_var=T.imatrix())
    net['word_emb'] = EmbeddingLayer(net['sent_input'],
                                     input_size=vocab,
                                     output_size=emb_size)
    net['vis_input'] = InputLayer((None, CFG['VIS SIZE']),
                                  input_var=T.matrix())
    net['vis_emb'] = DenseLayer(net['vis_input'],
                                num_units=emb_size,
                                nonlinearity=lasagne.nonlinearities.identity)
    # The visual embedding becomes the first "word" of the decoder input.
    net['vis_emb_reshp'] = ReshapeLayer(net['vis_emb'], (-1, 1, emb_size))
    net['decorder_input'] = ConcatLayer(
        [net['vis_emb_reshp'], net['word_emb']])
    net['feat_dropout'] = DropoutLayer(net['decorder_input'], p=0.5)
    net['mask_input'] = InputLayer((None, seq_len))
    net['lstm'] = LSTMLayer(net['feat_dropout'],
                            num_units=emb_size,
                            mask_input=net['mask_input'],
                            grad_clipping=5.)
    net['lstm_dropout'] = DropoutLayer(net['lstm'], p=0.5)
    # Collapse (batch, time) so the dense softmax scores each step.
    net['lstm_reshp'] = ReshapeLayer(net['lstm_dropout'], (-1, emb_size))
    net['word_prob'] = DenseLayer(net['lstm_reshp'],
                                  num_units=vocab + 2,
                                  nonlinearity=softmax)
    net['sent_prob'] = ReshapeLayer(net['word_prob'],
                                    (-1, seq_len, vocab + 2))
    return net
def build_model(batch_size=128):
    """Language model: embedding -> two stacked LSTMs -> dense scores.

    The output stays flattened over time, which makes the later
    categorical_crossentropy computation straightforward.

    Returns (output layer, symbolic input variable).
    """
    sym_input = T.matrix('input', dtype='int32')
    l_in = lasagne.layers.InputLayer(shape=(None, None,),
                                     input_var=sym_input)

    # Symbolic references to the runtime batch/sequence sizes, available
    # for reshape layers that need them.
    n_batch, n_steps = l_in.input_var.shape

    init_w = np.random.rand(vocab_size, embedding_size).astype(np.float32)
    l_embed = EmbeddingLayer(l_in, input_size=vocab_size,
                             output_size=embedding_size, W=init_w)

    # Clip gradients inside the recurrences to curb explosion.
    grad_clip = 100
    l_rec_1 = lasagne.layers.LSTMLayer(
        l_embed, hiddenLayDim, grad_clipping=grad_clip,
        nonlinearity=lasagne.nonlinearities.tanh)
    l_rec_2 = lasagne.layers.LSTMLayer(
        l_rec_1, hiddenLayDim, grad_clipping=grad_clip,
        nonlinearity=lasagne.nonlinearities.tanh)

    # l_rec_2 is (batch, time, hidden); flatten (batch, time) so the dense
    # layer scores every timestep independently. Output is deliberately
    # NOT reshaped back to 3-D.
    l_flat = ReshapeLayer(l_rec_2, (-1, hiddenLayDim))
    l_out = lasagne.layers.DenseLayer(l_flat,
                                      num_units=vocab_size,
                                      W=lasagne.init.Normal(),
                                      nonlinearity=None)
    return l_out, sym_input
def build_res_stafg():
    """Build the STA-FG captioning network.

    A key-words branch pools the visual features into a global vector; a
    GRU branch and a context-attention GRU branch consume the word
    embeddings; their concatenation is projected to per-word softmax
    probabilities.

    Returns an OrderedDict of all layers, keyed by stage name.
    """
    net = collections.OrderedDict()
    # INPUTS----------------------------------------
    net['sent_input'] = InputLayer((None, CFG['SEQUENCE LENGTH']),
                                   input_var=T.imatrix())
    net['word_emb'] = EmbeddingLayer(net['sent_input'],
                                     input_size=CFG['VOCAB SIZE']+3,
                                     output_size=CFG['WORD VECTOR SIZE'],
                                     W=np.copy(CFG['wemb']))
    net['vis_input'] = InputLayer((None, CFG['VISUAL LENGTH'],
                                   CFG['VIS SIZE']))
    # key words model-------------------------------------
    net['vis_mean_pool'] = FeaturePoolLayer(net['vis_input'],
                                            CFG['VISUAL LENGTH'],
                                            pool_function=T.mean)
    net['ctx_vis_reshp'] = ReshapeLayer(net['vis_mean_pool'],
                                        (-1, CFG['VIS SIZE']))
    net['global_vis'] = DenseLayer(net['ctx_vis_reshp'],
                                   num_units=CFG['EMBEDDING SIZE'],
                                   nonlinearity=linear)
    net['key_words_prob'] = DenseLayer(DropoutLayer(net['global_vis']),
                                       num_units=CFG['VOCAB SIZE']+3,
                                       nonlinearity=sigmoid)
    # gru model--------------------------------------
    net['mask_input'] = InputLayer((None, CFG['SEQUENCE LENGTH']))
    net['sgru'] = GRULayer(net['word_emb'],
                           num_units=CFG['EMBEDDING SIZE'],
                           mask_input=net['mask_input'],
                           hid_init=net['global_vis'])
    net['sta_gru'] = CTXAttentionGRULayer(
        [net['sgru'], net['vis_input'], net['global_vis']],
        num_units=CFG['EMBEDDING SIZE'],
        mask_input=net['mask_input'])
    # BUG FIX: the original concatenated net['sta_gru'] with net['gru'],
    # but no 'gru' key is ever created (the plain GRU branch is stored as
    # 'sgru'), so this raised KeyError. Fuse the attention branch with
    # the plain GRU branch instead.
    net['fusion'] = DropoutLayer(
        ConcatLayer([net['sta_gru'], net['sgru']], axis=2), p=0.5)
    net['fusion_reshp'] = ReshapeLayer(net['fusion'],
                                       (-1, CFG['EMBEDDING SIZE']*2))
    net['word_prob'] = DenseLayer(net['fusion_reshp'],
                                  num_units=CFG['VOCAB SIZE']+3,
                                  nonlinearity=softmax)
    net['sent_prob'] = ReshapeLayer(net['word_prob'],
                                    (-1, CFG['SEQUENCE LENGTH'],
                                     CFG['VOCAB SIZE']+3))
    return net
def _add_word_embeddings(self):
    """Create token-id inputs, padding masks and word-embedding lookups.

    input_x is a 3-D batch of contexts (batch, context, time); input_y is a
    2-D batch of output sequences (batch, time).
    """
    self._net['input_x'] = InputLayer(shape=(None, None, None),
                                      input_var=T.itensor3(name='input_x'),
                                      name='input_x')
    self._net['input_y'] = InputLayer(shape=(None, None),
                                      input_var=T.imatrix(name='input_y'),
                                      name='input_y')
    # These are theano variables and they are computed dynamically as data is passed into the computational graph
    self._batch_size = self._net['input_x'].input_var.shape[0]
    self._input_context_size = self._net['input_x'].input_var.shape[1]
    self._input_seq_len = self._net['input_x'].input_var.shape[2]
    self._output_seq_len = self._net['input_y'].input_var.shape[1]
    # Fold the context axis into the batch axis so mask/embedding layers
    # treat every context utterance as an independent sequence.
    self._net['input_x_batched'] = \
        reshape(self._net['input_x'],
                (self._batch_size * self._input_context_size,
                 self._input_seq_len))
    # Mask is 1 wherever the token differs from the skip (padding) id.
    self._net['input_x_mask'] = NotEqualMaskLayer(
        incoming=self._net['input_x_batched'],
        x=self._skip_token_id,
        name='mask_x')
    self._net['emb_x'] = EmbeddingLayer(
        incoming=self._net['input_x_batched'],
        input_size=self._vocab_size,
        output_size=self._word_embedding_dim,
        W=self._W_init_embedding,
        name='emb_x')  # output shape (batch_size, input_context_size, input_seq_len, embedding_dimension)
    self._net['input_y_mask'] = NotEqualMaskLayer(
        incoming=self._net['input_y'],
        x=self._skip_token_id,
        name='mask_y')
    self._net['emb_y'] = EmbeddingLayer(
        incoming=self._net['input_y'],
        input_size=self._vocab_size,
        output_size=self._word_embedding_dim,
        W=self._W_init_embedding,
        name='emb_y')  # output shape (batch_size, output_seq_len, embedding_dimension)
    if not self._train_word_embedding:
        # Freeze both embedding matrices by removing the 'trainable' tag.
        self._net['emb_x'].params[self._net['emb_x'].W].remove('trainable')
        self._net['emb_y'].params[self._net['emb_y'].W].remove('trainable')
def _add_condition_embeddings(self):
    """Register the condition-id input layer and its embedding lookup."""
    condition_input = InputLayer(
        shape=(None, ),
        input_var=T.ivector(name='in_condition_id'),
        name='input_condition_id')
    self._net['input_condition_id'] = condition_input
    self._net['emb_condition_id'] = EmbeddingLayer(
        incoming=condition_input,
        input_size=self._condition_ids_num,
        output_size=self._condition_embedding_dim,
        name='embedding_condition_id')
def embedding(self, input_dim, cats, output_dim):
    """Build an embedding layer over `cats` categories of size `output_dim`,
    fed by an integer-matrix input of width `input_dim`. Weights are
    uniform in [-0.05, 0.05)."""
    init = np.random.uniform(-0.05, 0.05,
                             (cats, output_dim)).astype("float32")
    shared_w = theano.shared(value=init.astype(theano.config.floatX))
    in_layer = InputLayer((None, input_dim), input_var=T.imatrix())
    return EmbeddingLayer(in_layer, input_size=cats,
                          output_size=output_dim, W=shared_w)
def test_lstm_get_emb_output():
    """LSTMLayer accepts a precomputed embedding output via get_output_for."""
    n_hidden = 10
    n_vocab = 10
    n_emb = 40

    l_in = InputLayer((None, None), input_var=T.imatrix())
    l_emb = EmbeddingLayer(l_in, n_vocab, n_emb)
    l_lstm = LSTMLayer(l_emb, n_hidden)

    # Symbolic outputs at both graph nodes...
    emb_out = lasagne.layers.get_output(l_emb)
    lstm_out = lasagne.layers.get_output(l_lstm)
    # ...and a direct call with the embedding output as the input list.
    direct_out = l_lstm.get_output_for([emb_out])
def _get_l_out(self, input_vars):
    """Build the recurrent listener: embedding -> configurable recurrent
    cell (+dropout) -> dense hidden (+dropout) -> 3-way softmax.

    Returns (output layer, [input layer]).
    """
    listener.check_options(self.options)
    id_tag = (self.id + '/') if self.id else ''
    input_var = input_vars[0]
    l_in = InputLayer(shape=(None, self.seq_vec.max_len),
                      input_var=input_var,
                      name=id_tag + 'desc_input')
    l_in_embed = EmbeddingLayer(
        l_in, input_size=len(self.seq_vec.tokens),
        output_size=self.options.listener_cell_size,
        name=id_tag + 'desc_embed')
    # Recurrent cell class selected by option (e.g. LSTM/GRU).
    cell = CELLS[self.options.listener_cell]
    cell_kwargs = {
        'grad_clipping': self.options.listener_grad_clipping,
        'num_units': self.options.listener_cell_size,
    }
    if self.options.listener_cell == 'LSTM':
        # Custom forget-gate bias (commonly positive so memories persist
        # early in training).
        cell_kwargs['forgetgate'] = Gate(
            b=Constant(self.options.listener_forget_bias))
    if self.options.listener_cell != 'GRU':
        # GRU cells in Lasagne take no top-level nonlinearity kwarg.
        cell_kwargs['nonlinearity'] = NONLINEARITIES[
            self.options.listener_nonlinearity]
    l_rec1 = cell(l_in_embed, name=id_tag + 'rec1', **cell_kwargs)
    if self.options.listener_dropout > 0.0:
        l_rec1_drop = DropoutLayer(l_rec1,
                                   p=self.options.listener_dropout,
                                   name=id_tag + 'rec1_drop')
    else:
        l_rec1_drop = l_rec1
    l_hidden = DenseLayer(
        l_rec1_drop,
        num_units=self.options.listener_cell_size,
        nonlinearity=NONLINEARITIES[self.options.listener_nonlinearity],
        name=id_tag + 'hidden')
    if self.options.listener_dropout > 0.0:
        l_hidden_drop = DropoutLayer(l_hidden,
                                     p=self.options.listener_dropout,
                                     name=id_tag + 'hidden_drop')
    else:
        l_hidden_drop = l_hidden
    # Final 3-class softmax output.
    l_out = DenseLayer(l_hidden_drop, num_units=3, nonlinearity=softmax,
                       name=id_tag + 'scores')
    return l_out, [l_in]
def test_lnlstm_get_emb_output():
    """LNLSTMLayer accepts a precomputed embedding output via get_output_for."""
    n_hidden, n_vocab, n_emb = 10, 10, 40
    n_batches, seqlen = 23, 47

    l_in = InputLayer((n_batches, seqlen),
                      input_var=T.imatrix('input_var'),
                      name="l_in")
    l_emb = EmbeddingLayer(l_in, n_vocab, n_emb, name="l_emb")
    l_lstm = LNLSTMLayer(l_emb, n_hidden, name="l_lstm")

    # Symbolic outputs at both graph nodes...
    emb_out = lasagne.layers.get_output(l_emb)
    lstm_out = lasagne.layers.get_output(l_lstm)
    # ...and a direct call with the embedding output as the input list.
    direct_out = l_lstm.get_output_for([emb_out])
def __init__(self, vocab, num_users):
    """DSSM-style model mapping users and utterances into one semantic
    space; built so a user's vector can be compared against utterances
    they did ('good') and did not ('bad') produce.
    """
    self.vocab = vocab
    # Symbolic inputs: user ids plus a positive and a negative utterance.
    self._user_id = T.ivector('user ids')
    self._good_utterance = T.imatrix('utterance from user')
    self._bad_utterance = T.imatrix('utterance not from user')
    # Shared utterance encoder (token ids -> vector).
    self.l_utt_enc = Enc(vocab)
    self._user_inp = InputLayer((None, ), input_var=self._user_id,
                                name='user ids layer')
    self.l_user_emb = EmbeddingLayer(self._user_inp, num_users,
                                     DssmConfig.USER_EMB_SIZE,
                                     name='user embedding')
    # Project both sides into the common semantic space, with dropout.
    self.l_user_semantic = DenseLayer(self.l_user_emb,
                                      DssmConfig.SEMANTIC_SPACE_SIZE,
                                      name='user representation')
    self.l_user_semantic = dropout(self.l_user_semantic,
                                   p=DssmConfig.DROPOUT_RATE)
    self.l_utt_semantic = DenseLayer(self.l_utt_enc.output,
                                     DssmConfig.SEMANTIC_SPACE_SIZE,
                                     name='utterance representation')
    self.l_utt_semantic = dropout(self.l_utt_semantic,
                                  p=DssmConfig.DROPOUT_RATE)
    # '_d' variants are deterministic (dropout disabled) for evaluation.
    self.user_semantic = get_output(self.l_user_semantic)
    self.user_semantic_d = get_output(self.l_user_semantic,
                                      deterministic=True)
    # The utterance branch is evaluated twice, substituting the encoder's
    # input layer with the good / bad utterance tensors respectively.
    self.good_utt_semantic = get_output(
        self.l_utt_semantic,
        inputs={self.l_utt_enc.l_in: self._good_utterance})
    self.good_utt_semantic_d = get_output(
        self.l_utt_semantic,
        inputs={self.l_utt_enc.l_in: self._good_utterance},
        deterministic=True)
    self.bad_utt_semantic = get_output(
        self.l_utt_semantic,
        inputs={self.l_utt_enc.l_in: self._bad_utterance})
    self.bad_utt_semantic_d = get_output(
        self.l_utt_semantic,
        inputs={self.l_utt_enc.l_in: self._bad_utterance},
        deterministic=True)
    self._build_loss_and_ops()
def get_input_layer(self, input_vars, recurrent_length=0, cell_size=20,
                    context_len=1, id=None):
    """Embed bucketed color ids and flatten the embedding axis.

    recurrent_length == 0 builds a non-recurrent (2-D) input; otherwise a
    leading time axis of that length is preserved.

    Returns (flattened embedding layer, [input layer]).
    """
    prefix = (id + '/') if id else ''
    (input_var,) = input_vars
    recurrent = recurrent_length != 0
    n_cols = context_len * len(self.buckets)
    in_shape = ((None, recurrent_length, n_cols) if recurrent
                else (None, n_cols))
    l_color = InputLayer(shape=in_shape, input_var=input_var,
                         name=prefix + 'color_input')
    # One embedding table covering all buckets' type ids.
    n_ids = sum(b.num_types for b in self.buckets)
    l_embed = EmbeddingLayer(l_color, input_size=n_ids,
                             output_size=cell_size,
                             name=prefix + 'color_embed')
    # Merge all trailing axes into one feature axis.
    flatten_spec = ([0], [1], -1) if recurrent else ([0], -1)
    return reshape(l_embed, flatten_spec), [l_color]
def get_input_layer(self, input_vars, recurrent_length=0, cell_size=20,
                    context_len=1, id=None):
    """Embed color-type ids and flatten the context/embedding axes.

    recurrent_length == 0 builds a non-recurrent (2-D) input; otherwise a
    leading time axis of that length is preserved.

    Returns (flattened embedding layer, [input layer]).
    """
    prefix = (id + '/') if id else ''
    (input_var,) = input_vars
    recurrent = recurrent_length != 0
    in_shape = ((None, recurrent_length, context_len) if recurrent
                else (None, context_len))
    l_color = InputLayer(shape=in_shape, input_var=input_var,
                         name=prefix + 'color_input')
    l_embed = EmbeddingLayer(l_color, input_size=self.num_types,
                             output_size=cell_size,
                             name=prefix + 'color_embed')
    # Collapse (context, embedding) into one feature axis.
    n_features = context_len * cell_size
    out_shape = (([0], recurrent_length, n_features) if recurrent
                 else ([0], n_features))
    l_flat = reshape(l_embed, out_shape,
                     name=prefix + 'color_embed_flattened')
    return l_flat, [l_color]
def modify_context(self, l_context_repr, extra_vars):
    """Prepend a language-id embedding to the context representation.

    Returns (concatenated layer, [language input layer]).
    """
    language = extra_vars[0]
    prefix = (self.id + '/') if self.id else ''
    print('l_context_repr: {}'.format(l_context_repr.output_shape))
    l_lang_input = InputLayer(shape=(None, ), input_var=language,
                              name=prefix + 'lang_input')
    l_lang_embed = EmbeddingLayer(
        l_lang_input,
        input_size=len(self.lang_vec.tokens),
        output_size=self.options.bilingual_lang_embed_size,
        name=prefix + 'lang_embed')
    print('l_lang_embed: {}'.format(l_lang_embed.output_shape))
    l_modified_context = ConcatLayer([l_lang_embed, l_context_repr])
    print('l_modified_context: {}'.format(l_modified_context.output_shape))
    return (l_modified_context, [l_lang_input])
def __init__(self, vocab, enc):
    """One-tick decoder step: given the previous LSTM state, the current
    word id and the encoder's final state, produce the next LSTM state
    and a sampled next word.
    """
    # Define inputs of decoder at each time step.
    self.prev_cell = InputLayer((None, Config.N_LSTM_UNITS), name='cell')
    self.prev_hid = InputLayer((None, Config.N_LSTM_UNITS), name='hid')
    self.input_word = InputLayer((None, ))
    self.encoder_lstm = InputLayer((None, Config.N_LSTM_UNITS),
                                   name='encoder')
    # Embed input word and use the same embeddings as in the encoder
    # (weights are tied via W=enc.l_emb.W).
    self.word_embedding = EmbeddingLayer(self.input_word,
                                         vocab.n_tokens,
                                         Config.EMB_SIZE,
                                         W=enc.l_emb.W,
                                         name='emb')
    # This is not WrongLSTMLayer! *Cell is used for one-tick networks.
    self.new_cell, self.new_hid = LSTMCell(
        self.prev_cell, self.prev_hid,
        input_or_inputs=[self.word_embedding, self.encoder_lstm],
        name='decoder_lstm',
        peepholes=False)
    # Define parts for new word prediction. Bottleneck is a hack for
    # reducing time complexity.
    self.bottleneck = DenseLayer(self.new_hid,
                                 Config.BOTTLENECK_UNITS,
                                 nonlinearity=T.tanh,
                                 name='decoder intermediate')
    # Temperature-scaled softmax over the vocabulary.
    self.next_word_probs = DenseLayer(
        self.bottleneck, vocab.n_tokens,
        nonlinearity=lambda probs: T.nnet.softmax(
            probs / Config.TEMPERATURE),
        name='decoder next word probas')
    # Samples a next-word id from the probabilities.
    self.next_words = ProbabilisticResolver(self.next_word_probs,
                                            assume_normalized=True)
def multi_task_classifier(args, input_var, target_var, wordEmbeddings, seqlen,
                          num_feats, lambda_val=0.5 * 1e-4):
    """Build five parallel 1-D-convolution classifiers over shared input.

    Each task head is: embedding -> reshape -> Conv1D(tanh) -> max-pool ->
    dense(sigmoid) -> dense(softmax), with L2 regularization on every
    layer, trained by adagrad.

    Refactored: the six copy-pasted branch/compile blocks (and large
    commented-out sections for the disabled DocTimeRel / ContextualAspect /
    Permanence tasks) are replaced by two helpers. The return tuple is
    unchanged: (train_fn, val_fn, network) triples for the span, type,
    degree, polarity and contextual-modality tasks, in that order.
    """
    print("Building multi task model with 1D Convolution")

    # NOTE(review): vocab/dim are read transposed (W=wordEmbeddings.T below),
    # so wordEmbeddings is presumably (wordDim, vocab_size) -- kept as-is.
    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]
    kw = 2
    num_filters = seqlen - kw + 1
    pool_size = num_filters

    input = InputLayer((None, seqlen, num_feats), input_var=input_var)
    batchsize, _, _ = input.input_var.shape

    def _branch(num_classes):
        # One task head; returns the layers needed for regularization plus
        # the softmax output layer.
        emb = EmbeddingLayer(input, input_size=vocab_size,
                             output_size=wordDim, W=wordEmbeddings.T)
        reshp = ReshapeLayer(emb, (batchsize, seqlen, num_feats * wordDim))
        conv = DimshuffleLayer(
            Conv1DLayer(reshp, num_filters=num_filters,
                        filter_size=wordDim, stride=1,
                        nonlinearity=tanh, W=GlorotUniform()),
            (0, 2, 1))
        pooled = MaxPool1DLayer(conv, pool_size=pool_size)
        hid = DenseLayer(pooled, num_units=args.hiddenDim,
                         nonlinearity=sigmoid)
        out = DenseLayer(hid, num_units=num_classes, nonlinearity=softmax)
        return emb, conv, hid, out

    def _train_val(emb, conv, hid, network, loss_fn, acc_fn):
        # Compile the adagrad train function and the accuracy val function
        # for one branch, L2-regularizing its four parameterized layers.
        prediction = get_output(network)
        loss = T.mean(loss_fn(prediction, target_var)) + \
            regularize_layer_params_weighted(
                {emb: lambda_val, conv: lambda_val,
                 hid: lambda_val, network: lambda_val}, l2)
        updates = adagrad(loss, get_all_params(network, trainable=True),
                          learning_rate=args.step)
        train_fn = theano.function([input_var, target_var], loss,
                                   updates=updates,
                                   allow_input_downcast=True)
        acc = T.mean(acc_fn(get_output(network, deterministic=True),
                            target_var))
        val_fn = theano.function([input_var, target_var], acc,
                                 allow_input_downcast=True)
        return train_fn, val_fn

    # Build all branches first (matching the original layer-creation order,
    # which fixes the random-initialization sequence), then compile.
    # Span (binary task).
    emb1, conv1d_1, hid_1, network_1 = _branch(2)
    # Type.
    emb3, conv1d_3, hid_3, network_3 = _branch(4)
    # Degree.
    emb4, conv1d_4, hid_4, network_4 = _branch(4)
    # Polarity.
    emb5, conv1d_5, hid_5, network_5 = _branch(3)
    # ContextualModality.
    emb6, conv1d_6, hid_6, network_6 = _branch(5)

    # Span uses binary loss/accuracy; all other tasks are categorical.
    train_fn_1, val_fn_1 = _train_val(emb1, conv1d_1, hid_1, network_1,
                                      binary_crossentropy, binary_accuracy)
    train_fn_3, val_fn_3 = _train_val(emb3, conv1d_3, hid_3, network_3,
                                      categorical_crossentropy,
                                      categorical_accuracy)
    train_fn_4, val_fn_4 = _train_val(emb4, conv1d_4, hid_4, network_4,
                                      categorical_crossentropy,
                                      categorical_accuracy)
    train_fn_5, val_fn_5 = _train_val(emb5, conv1d_5, hid_5, network_5,
                                      categorical_crossentropy,
                                      categorical_accuracy)
    train_fn_6, val_fn_6 = _train_val(emb6, conv1d_6, hid_6, network_6,
                                      categorical_crossentropy,
                                      categorical_accuracy)

    return train_fn_1, val_fn_1, network_1, train_fn_3, val_fn_3, \
        network_3, train_fn_4, val_fn_4, network_4, train_fn_5, val_fn_5, network_5, \
        train_fn_6, val_fn_6, network_6
def _get_l_out(self, input_vars):
    """Build the output layer graph for a Gaussian-scoring listener.

    The description sequence is embedded and run through a (optionally
    bidirectional) recurrent layer; the final hidden state predicts a
    mean vector and a (flattened) covariance matrix in color-repr space.
    Each context color is then scored under that Gaussian and the scores
    are normalized with a softmax.

    :param input_vars: list of Theano variables; ``input_vars[0]`` is the
        description token-index matrix, the rest are context color inputs.
    :return: ``(l_scores, input_layers)`` — the softmax score layer and the
        list of InputLayers (description input first, then context inputs).
    """
    check_options(self.options)
    id_tag = (self.id + '/') if self.id else ''

    input_var = input_vars[0]
    context_vars = input_vars[1:]

    # Token-index input: (batch_size, max_len), embedded to cell_size.
    l_in = InputLayer(shape=(None, self.seq_vec.max_len),
                      input_var=input_var,
                      name=id_tag + 'desc_input')
    l_in_embed = EmbeddingLayer(
        l_in,
        input_size=len(self.seq_vec.tokens),
        output_size=self.options.listener_cell_size,
        name=id_tag + 'desc_embed')

    # Recurrent cell type and shared kwargs chosen from options.
    cell = CELLS[self.options.listener_cell]
    cell_kwargs = {
        'grad_clipping': self.options.listener_grad_clipping,
        'num_units': self.options.listener_cell_size,
    }
    if self.options.listener_cell == 'LSTM':
        # LSTM-only: initialize the forget-gate bias from options.
        cell_kwargs['forgetgate'] = Gate(
            b=Constant(self.options.listener_forget_bias))
    if self.options.listener_cell != 'GRU':
        # GRU has fixed gate nonlinearities; others take one from options.
        cell_kwargs['nonlinearity'] = NONLINEARITIES[
            self.options.listener_nonlinearity]

    # Only the final timestep's hidden state is kept.
    l_rec1 = cell(l_in_embed,
                  name=id_tag + 'rec1',
                  only_return_final=True,
                  **cell_kwargs)
    if self.options.listener_bidi:
        # Bidirectional: concatenate forward and backward final states.
        l_rec1_backwards = cell(l_in_embed,
                                name=id_tag + 'rec1_back',
                                backwards=True,
                                only_return_final=True,
                                **cell_kwargs)
        l_rec1 = ConcatLayer([l_rec1, l_rec1_backwards],
                             axis=1,
                             name=id_tag + 'rec1_bidi_concat')
    if self.options.listener_dropout > 0.0:
        l_rec1_drop = DropoutLayer(l_rec1,
                                   p=self.options.listener_dropout,
                                   name=id_tag + 'rec1_drop')
    else:
        l_rec1_drop = l_rec1

    # Predicted Gaussian mean: (batch_size, repr_size)
    l_pred_mean = DenseLayer(l_rec1_drop,
                             num_units=self.color_vec.output_size,
                             nonlinearity=None,
                             name=id_tag + 'pred_mean')
    # Predicted covariance, flattened: (batch_size, repr_size * repr_size)
    l_pred_covar_vec = DenseLayer(
        l_rec1_drop,
        num_units=self.color_vec.output_size**2,
        # initially produce identity matrix
        b=np.eye(self.color_vec.output_size,
                 dtype=theano.config.floatX).ravel(),
        nonlinearity=None,
        name=id_tag + 'pred_covar_vec')
    # Reshaped covariance: (batch_size, repr_size, repr_size)
    l_pred_covar = reshape(
        l_pred_covar_vec,
        ([0], self.color_vec.output_size, self.color_vec.output_size),
        name=id_tag + 'pred_covar')

    # Context repr has shape (batch_size, context_len * repr_size)
    l_context_repr, context_inputs = self.color_vec.get_input_layer(
        context_vars,
        cell_size=self.options.listener_cell_size,
        context_len=self.context_len,
        id=self.id)
    # One repr-space point per context color.
    l_context_points = reshape(
        l_context_repr,
        ([0], self.context_len, self.color_vec.output_size))

    # Score each context point under the predicted Gaussian, then softmax.
    l_unnorm_scores = GaussianScoreLayer(l_context_points,
                                         l_pred_mean,
                                         l_pred_covar,
                                         name=id_tag + 'gaussian_score')
    l_scores = NonlinearityLayer(l_unnorm_scores,
                                 nonlinearity=softmax,
                                 name=id_tag + 'scores')

    return l_scores, [l_in] + context_inputs
def __init__(self, pre_trained_w_embs=None, pre_trained_c_embs=None,
             w_grams=(3, 4, 5), w_nfs=(50, 50, 50),
             c_grams=(4, 5, 6), c_nfs=(50, 50, 50),
             mlp_layers=(2, ), mlp_dropouts=(0.5, ),
             mlp_nonlinearities=(softmax, ),
             opt_method=lasagne.updates.adadelta,
             opt_args=None,
             **kwargs):
    """Build a dual-channel (word + character) CNN text classifier.

    Both channels embed token indices, run parallel Conv2D + max-pool
    "n-gram" filters, concatenate all pooled features, and feed them
    through an MLP head. Compiles ``self.train_fn`` and ``self.test_fn``.

    :param pre_trained_w_embs: word embeddings; either an ndarray of shape
        ``(vocab, dim)`` or a ``(vocab_size, dim)`` tuple, in which case
        random ``Uniform(0.25)`` embeddings of that shape are sampled.
    :param pre_trained_c_embs: character embeddings, same convention.
    :param w_grams/w_nfs: word-channel filter heights and filter counts.
    :param c_grams/c_nfs: char-channel filter heights and filter counts.
    :param mlp_layers: units per dense layer of the MLP head.
    :param mlp_dropouts: dropout probability before each dense layer
        (``None`` to skip dropout for that layer).
    :param mlp_nonlinearities: nonlinearity per dense layer.
    :param opt_method: lasagne update rule (default adadelta).
    :param opt_args: keyword args for ``opt_method``; defaults to
        ``{'learning_rate': 0.1, 'rho': 0.95, 'epsilon': 1e-6}``.
    """
    # BUG FIX: opt_args used to be a mutable default dict, shared across
    # every instance (and exposed via self.parameters). Use the standard
    # None-sentinel idiom; resolve it before capturing locals() so the
    # recorded parameters match the old behavior.
    if opt_args is None:
        opt_args = {'learning_rate': 0.1, 'rho': 0.95, 'epsilon': 1e-6}

    # Snapshot constructor arguments for later inspection/serialization.
    parameters = locals()
    del parameters['self']
    parameters.update(kwargs)
    self.parameters = parameters

    assert pre_trained_w_embs is not None
    assert pre_trained_c_embs is not None

    # A tuple means "sample random embeddings of this (vocab, dim) shape";
    # an array is used as-is.
    if isinstance(pre_trained_w_embs, tuple):
        w_vocab_size = pre_trained_w_embs[0]
        w_emb_dim = pre_trained_w_embs[1]
        pre_trained_w_embs = Uniform(0.25).sample(
            (w_vocab_size, w_emb_dim))
    else:
        w_vocab_size = pre_trained_w_embs.shape[0]
        w_emb_dim = pre_trained_w_embs.shape[1]
    t_pre_trained_w_embs = theano.shared(pre_trained_w_embs,
                                         name='w_embs',
                                         borrow=True)

    if isinstance(pre_trained_c_embs, tuple):
        c_vocab_size = pre_trained_c_embs[0]
        c_emb_dim = pre_trained_c_embs[1]
        pre_trained_c_embs = Uniform(0.25).sample(
            (c_vocab_size, c_emb_dim))
    else:
        c_vocab_size = pre_trained_c_embs.shape[0]
        c_emb_dim = pre_trained_c_embs.shape[1]
    t_pre_trained_c_embs = theano.shared(pre_trained_c_embs,
                                         name='c_embs',
                                         borrow=True)

    # Symbolic inputs: index matrices for the two channels, plus labels.
    w_sents = T.imatrix(name='w_sents')
    c_sents = T.imatrix(name='c_sents')
    labels = T.ivector(name='labels')

    w_input = InputLayer((None, None), input_var=w_sents)
    c_input = InputLayer((None, None), input_var=c_sents)

    # Embed and add a singleton channel axis for Conv2D:
    # (batch, 1, seq_len, emb_dim)
    w_embs = EmbeddingLayer(w_input,
                            input_size=w_vocab_size,
                            output_size=w_emb_dim,
                            W=t_pre_trained_w_embs)
    w_embs = ReshapeLayer(w_embs, ([0], 1, [1], [2]))
    c_embs = EmbeddingLayer(c_input,
                            input_size=c_vocab_size,
                            output_size=c_emb_dim,
                            W=t_pre_trained_c_embs)
    c_embs = ReshapeLayer(c_embs, ([0], 1, [1], [2]))

    # One Conv2D + max-over-time pool per n-gram size, both channels.
    conv_layers = []
    for w_gram, w_nf in zip(w_grams, w_nfs):
        # shape (batch_size, nfs[i], num_features, 1)
        conv_layer = Conv2DLayer(w_embs,
                                 w_nf, (w_gram, w_emb_dim),
                                 pad=(w_gram - 1, 0))
        # shape (batch_size, nfs[i], 1, 1)
        pooled_layer = MaxLayer(conv_layer, axis=2)
        # shape (batch_size, nfs[i])
        flatten_layer = ReshapeLayer(pooled_layer, ([0], [1]))
        conv_layers.append(flatten_layer)
    for c_gram, c_nf in zip(c_grams, c_nfs):
        # shape (batch_size, nfs[i], num_features, 1)
        conv_layer = Conv2DLayer(c_embs,
                                 c_nf, (c_gram, c_emb_dim),
                                 pad=(c_gram - 1, 0))
        # shape (batch_size, nfs[i], 1, 1)
        pooled_layer = MaxLayer(conv_layer, axis=2)
        # shape (batch_size, nfs[i])
        flatten_layer = ReshapeLayer(pooled_layer, ([0], [1]))
        conv_layers.append(flatten_layer)

    # MLP head over the concatenated pooled features.
    network = ConcatLayer(conv_layers, axis=1)
    for mlp_layer, mlp_dropout, mlp_nonlinearity in zip(
            mlp_layers, mlp_dropouts, mlp_nonlinearities):
        if mlp_dropout is not None:
            network = DropoutLayer(network, p=mlp_dropout)
        network = DenseLayer(network,
                             num_units=mlp_layer,
                             nonlinearity=mlp_nonlinearity)
    self.network = network

    # Create a loss expression for training, i.e., negative log likelihood we want to maximize):
    train_predict = get_output(network)
    train_loss = negative_log_likelihood(train_predict, labels)
    train_acc = T.sum(T.eq(T.argmax(train_predict, axis=1), labels),
                      axis=0)

    # We could add some weight decay as well here, see lasagne.regularization.

    # Here to create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = get_all_params(network, trainable=True)
    updates = opt_method(train_loss, params, **opt_args)

    # correct updates for embeddings[0] by resetting it to its initial value
    # (row 0 is presumably the padding index — TODO confirm with vectorizer)
    updates[t_pre_trained_w_embs] = T.set_subtensor(
        updates[t_pre_trained_w_embs][0, :], t_pre_trained_w_embs[0])
    updates[t_pre_trained_c_embs] = T.set_subtensor(
        updates[t_pre_trained_c_embs][0, :], t_pre_trained_c_embs[0])

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = get_output(network, deterministic=True)
    test_loss = negative_log_likelihood(test_prediction, labels)
    test_acc = T.sum(T.eq(T.argmax(test_prediction, axis=1), labels),
                     axis=0)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    self.train_fn = theano.function([w_sents, c_sents, labels],
                                    [train_loss, train_acc],
                                    updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    self.test_fn = theano.function([w_sents, c_sents, labels],
                                   [test_loss, test_acc])
def _get_l_out(self, input_vars):
    """Build the output layer graph for a speaker with a recurrent context encoder.

    The context colors are encoded by a recurrent pass (final state only),
    tiled across output timesteps, concatenated with the embedded previous
    output tokens, and fed through a stack of recurrent layers ending in a
    per-timestep softmax over the token vocabulary.

    :param input_vars: Theano variables; the last three are the color mask,
        the previous-output token matrix, and the sequence mask — everything
        before them is color input.
    :return: ``(l_out, input_layers)`` — token distributions of shape
        (batch, max_len - 1, vocab) and the list of InputLayers.
    """
    check_options(self.options)
    id_tag = (self.id + '/') if self.id else ''

    color_mask_var, prev_output_var, mask_var = input_vars[-3:]
    color_input_vars = input_vars[:-3]
    # Symbolic (data-dependent) context dimensions taken from the mask.
    num_contexts = color_mask_var.shape[0]
    num_colors = color_mask_var.shape[1]

    l_color_repr, color_inputs = self.color_vec.get_input_layer(
        color_input_vars,
        recurrent_length=0,
        cell_size=self.options.speaker_cell_size,
        context_len=None,
        id=self.id)
    # (num_contexts, num_colors, repr_size): one repr per context color.
    l_color_reshaped = ReshapeLayer(
        l_color_repr,
        (num_contexts, num_colors, self.color_vec.output_size),
        name=id_tag + 'color_reshaped')

    l_color_mask_in = InputLayer(shape=(None, None),
                                 input_var=color_mask_var,
                                 name=id_tag + 'color_mask')

    # Recurrent cell type and shared kwargs; mask_input is first set to the
    # color mask for the context encoder, and reassigned below for the
    # decoder layers.
    cell = CELLS[self.options.speaker_cell]
    cell_kwargs = {
        'mask_input': (None if self.options.speaker_no_mask else
                       l_color_mask_in),
        'grad_clipping': self.options.speaker_grad_clipping,
        'num_units': self.options.speaker_cell_size,
    }
    if self.options.speaker_cell == 'LSTM':
        # LSTM-only: initialize the forget-gate bias from options.
        cell_kwargs['forgetgate'] = Gate(
            b=Constant(self.options.speaker_forget_bias))
    if self.options.speaker_cell != 'GRU':
        # GRU has fixed gate nonlinearities; others take one from options.
        cell_kwargs['nonlinearity'] = NONLINEARITIES[
            self.options.speaker_nonlinearity]

    # Context encoder: final hidden state summarizing the colors, tiled
    # once per output timestep.
    l_context_out = cell(l_color_reshaped,
                         name=id_tag + 'reccontext',
                         only_return_final=True,
                         **cell_kwargs)
    l_context_tiled = RepeatLayer(l_context_out,
                                  self.seq_vec.max_len - 1,
                                  name=id_tag + 'reccontext_tiled')

    # Previous-output tokens, embedded and concatenated with the context.
    l_prev_out = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                            input_var=prev_output_var,
                            name=id_tag + 'prev_input')
    l_prev_embed = EmbeddingLayer(
        l_prev_out,
        input_size=len(self.seq_vec.tokens),
        output_size=self.options.speaker_cell_size,
        name=id_tag + 'prev_embed')
    l_in = ConcatLayer([l_context_tiled, l_prev_embed],
                       axis=2,
                       name=id_tag + 'color_prev')
    l_mask_in = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                           input_var=mask_var,
                           name=id_tag + 'mask_input')
    l_rec_drop = l_in
    # From here on the recurrent layers mask on the sequence mask instead
    # of the color mask.
    cell_kwargs['mask_input'] = (None if self.options.speaker_no_mask else
                                 l_mask_in)

    # All but the last recurrent layer get optional dropout between them.
    for i in range(1, self.options.speaker_recurrent_layers):
        l_rec = cell(l_rec_drop, name=id_tag + 'rec%d' % i, **cell_kwargs)
        if self.options.speaker_dropout > 0.0:
            l_rec_drop = DropoutLayer(l_rec,
                                      p=self.options.speaker_dropout,
                                      name=id_tag + 'rec%d_drop' % i)
        else:
            l_rec_drop = l_rec
    l_rec = cell(l_rec_drop,
                 name=id_tag + 'rec%d' % self.options.speaker_recurrent_layers,
                 **cell_kwargs)
    # Flatten (batch, time, cell) -> (batch*time, cell) for the dense head.
    l_shape = ReshapeLayer(l_rec,
                           (-1, self.options.speaker_cell_size),
                           name=id_tag + 'reshape')
    l_hidden_out = l_shape
    for i in range(1, self.options.speaker_hidden_out_layers + 1):
        l_hidden_out = DenseLayer(
            l_hidden_out,
            num_units=self.options.speaker_cell_size,
            nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
            name=id_tag + 'hidden_out%d' % i)
    l_softmax = DenseLayer(l_hidden_out,
                           num_units=len(self.seq_vec.tokens),
                           nonlinearity=softmax,
                           name=id_tag + 'softmax')
    # Restore the time axis: (batch, max_len - 1, vocab).
    l_out = ReshapeLayer(
        l_softmax,
        (-1, self.seq_vec.max_len - 1, len(self.seq_vec.tokens)),
        name=id_tag + 'out')

    return l_out, color_inputs + [l_color_mask_in, l_prev_out, l_mask_in]
def _get_net(self):
    """Assemble the encoder-decoder graph.

    Returns an OrderedDict mapping layer names to lasagne layers, ending
    with 'l_dist': per-timestep softmax distributions over the vocabulary
    (flattened to 2-D, one row per decoded token).
    """
    # Symbolic token-index matrices for the encoder and decoder sequences.
    l_in_x = InputLayer(shape=(None, None),
                        input_var=T.imatrix(name="enc_ix"),
                        name="encoder_seq_ix")
    l_in_y = InputLayer(shape=(None, None),
                        input_var=T.imatrix(name="dec_ix"),
                        name="decoder_seq_ix")

    # Both sequences share the same embedding table self.W.
    l_emb_x = EmbeddingLayer(incoming=l_in_x,
                             input_size=self.vocab_size,
                             output_size=TOKEN_REPRESENTATION_SIZE,
                             W=self.W,
                             name="embeddings_layer_x")
    l_emb_y = EmbeddingLayer(incoming=l_in_y,
                             input_size=self.vocab_size,
                             output_size=TOKEN_REPRESENTATION_SIZE,
                             W=self.W,
                             name="embeddings_layer_y")
    if not LEARN_WORD_EMBEDDINGS:
        # Freeze the embedding weights.
        l_emb_x.params[l_emb_x.W].remove('trainable')
        l_emb_y.params[l_emb_y.W].remove('trainable')

    # Encoder: only the final hidden state (the "thought vector") is kept.
    l_enc = self.rnn_layer(incoming=l_emb_x,
                           num_units=HIDDEN_LAYER_DIMENSION,
                           grad_clipping=self.gc,
                           only_return_final=True,
                           name='lstm_encoder')
    # Decoder: hidden state initialized from the encoder output.
    # NOTE(review): encoder clips gradients with self.gc but the decoder
    # uses the module constant GRAD_CLIP — confirm this asymmetry is
    # intentional.
    l_dec = self.rnn_layer(incoming=l_emb_y,
                           num_units=HIDDEN_LAYER_DIMENSION,
                           hid_init=l_enc,
                           grad_clipping=GRAD_CLIP,
                           name='lstm_decoder')

    # Drop the final timestep (anything after EOS is irrelevant), then
    # flatten to (batch * time, hidden) so a single dense softmax can turn
    # each decoder state into a probability distribution over tokens.
    l_slice = SliceLayer(incoming=l_dec,
                         indices=slice(0, -1),
                         axis=1,
                         name='slice_layer')
    l_dec_long = ReshapeLayer(incoming=l_slice,
                              shape=(-1, HIDDEN_LAYER_DIMENSION),
                              name='reshape_layer')
    l_dist = DenseLayer(incoming=l_dec_long,
                        num_units=self.vocab_size,
                        nonlinearity=lasagne.nonlinearities.softmax,
                        name="dense_output_probas")

    # No reshape back: the "long" output is compared against flattened
    # one-hot targets directly.
    return OrderedDict([
        ('l_in_x', l_in_x),
        ('l_in_y', l_in_y),
        ('l_emb_x', l_emb_x),
        ('l_emb_y', l_emb_y),
        ('l_enc', l_enc),
        ('l_dec', l_dec),
        ('l_slice', l_slice),
        ('l_dec_long', l_dec_long),
        ('l_dist', l_dist),
    ])
def test_clone(self): # Data for unit testing X_unit = ['abcdef', 'abcdef', 'qwerty'] X_unit = [[ord(c) for c in w] for w in X_unit] X_unit = np.array(X_unit, dtype='int8') n_alerts_unit, l_alerts_unit = X_unit.shape mask_unit = np.ones(X_unit.shape, dtype='int8') # Dimensions n_alerts = None l_alerts = None n_alphabet = 2**7 # All ASCII chars num_units = 10 # Symbolic variables input_var, input_var2 = T.imatrices('inputs', 'inputs2') mask_var, mask_var2 = T.matrices('masks', 'masks2') target_var = T.dvector('targets') # build net for testing l_in = InputLayer(shape=(n_alerts, l_alerts), input_var=input_var, name='INPUT-LAYER') l_emb = EmbeddingLayer(l_in, n_alphabet, n_alphabet, W=np.eye(n_alphabet), name='EMBEDDING-LAYER') l_emb.params[l_emb.W].remove('trainable') # Fix weight l_mask = InputLayer(shape=(n_alerts, l_alerts), input_var=mask_var, name='MASK-INPUT-LAYER') l_lstm = LSTMLayer(l_emb, num_units=num_units, name='LSTM-LAYER', mask_input=l_mask) l_slice = SliceLayer(l_lstm, indices=-1, axis=1, name="SLICE-LAYER") # Only last timestep net = l_slice # clone l_in2 = InputLayer(shape=(n_alerts, l_alerts), input_var=input_var2, name='INPUT-LAYER2') l_mask2 = InputLayer(shape=(n_alerts, l_alerts), input_var=mask_var2, name='MASK-INPUT-LAYER2') net2 = lstm_rnn_tied_weights.clone(net, l_in2, l_mask2) self.assertNotEqual(repr(net), repr(net2)) pred_unit = layers.get_output(net, inputs={ l_in: input_var, l_mask: mask_var }).eval({ input_var: X_unit, mask_var: mask_unit }) pred_unit2 = layers.get_output(net2, inputs={ l_in2: input_var2, l_mask2: mask_var2 }).eval({ input_var2: X_unit, mask_var2: mask_unit }) self.assert_array_equal(pred_unit, pred_unit2)
def __init__(self, vocab_size, n_entities, embedding_size, n_hidden_que, n_hidden_con, n_out_hidden, residual=False, depth_rnn=1, grad_clipping=10, skip_connections=False, bidir=False, dropout=False, **kwargs): ReaderTwoSeqModel.__init__(self, vocab_size, n_entities, embedding_size, residual, depth_rnn, grad_clipping, skip_connections, bidir, dropout) self.n_hidden_question = n_hidden_que self.n_hidden_context = n_hidden_con self.n_out_hidden = n_out_hidden ################## # SEQ PROCESSING # ################## embed_con = EmbeddingLayer(self.in_con, vocab_size, embedding_size) embed_que = EmbeddingLayer(self.in_que, vocab_size, embedding_size, W=embed_con.W) gru_con = create_deep_rnn(embed_con, GRULayer, depth_rnn, layer_mask=self.in_con_mask, num_units=n_hidden_con, grad_clipping=grad_clipping, residual=residual, skip_connections=skip_connections, bidir=bidir)[-1] gru_que = create_deep_rnn(embed_que, GRULayer, depth_rnn, layer_mask=self.in_que_mask, num_units=n_hidden_que, grad_clipping=grad_clipping, residual=residual, skip_connections=skip_connections, bidir=bidir)[-1] ############# # ATTENTION # ############# que_condition = SliceLayer(gru_que, indices=-1, axis=1) batch_size = self.seq_con.shape[0] att = self.create_attention(gru_con, self.in_con_mask, que_condition, batch_size, n_hidden_con, **kwargs) ########## # OUTPUT # ########## out_att = DenseLayer(att, n_out_hidden, nonlinearity=None) out_que = DenseLayer(que_condition, n_out_hidden, nonlinearity=None) out_sum = ElemwiseSumLayer([out_att, out_que]) if dropout: out_sum = DropoutLayer(out_sum, dropout) out_tanh = NonlinearityLayer(out_sum, nonlinearity=T.tanh) out = DenseLayer(out_tanh, self.n_entities, nonlinearity=None) if dropout: out = DropoutLayer(out, dropout) self.net = CandidateOutputLayer(out, self.in_cand, self.in_cand_mask)
def _get_l_out(self, input_vars):
    """Build the output layer graph for a two-layer recurrent listener.

    Context color representations are processed by NIN layers, mean-pooled
    over the context, and concatenated with the embedded description at
    every timestep before a two-layer recurrent encoder. The final state
    is compared (via a broadcast dot product) against per-color features
    built from the context and its pooled difference, yielding softmax
    scores over the context colors.

    :param input_vars: list of Theano variables; ``input_vars[0]`` is the
        description token-index matrix, the rest are context color inputs.
    :return: ``(l_scores, input_layers)`` — the softmax score layer and the
        list of InputLayers (description input first, then context inputs).
    """
    check_options(self.options)
    id_tag = (self.id + '/') if self.id else ''

    input_var = input_vars[0]
    context_vars = input_vars[1:]

    # Token-index input: (batch_size, max_len), embedded to cell_size.
    l_in = InputLayer(shape=(None, self.seq_vec.max_len),
                      input_var=input_var,
                      name=id_tag + 'desc_input')
    l_in_embed = EmbeddingLayer(
        l_in,
        input_size=len(self.seq_vec.tokens),
        output_size=self.options.listener_cell_size,
        name=id_tag + 'desc_embed')

    # Context repr has shape (batch_size, seq_len, context_len * repr_size)
    l_context_repr, context_inputs = self.color_vec.get_input_layer(
        context_vars,
        recurrent_length=self.seq_vec.max_len,
        cell_size=self.options.listener_cell_size,
        context_len=self.context_len,
        id=self.id)
    l_context_repr = reshape(
        l_context_repr,
        ([0], [1], self.context_len, self.color_vec.output_size))
    # Move repr to the feature axis for NIN layers:
    # (batch, repr_size, seq_len, context_len)
    l_hidden_context = dimshuffle(l_context_repr, (0, 3, 1, 2),
                                  name=id_tag + 'shuffle_in')
    for i in range(1, self.options.listener_hidden_color_layers + 1):
        l_hidden_context = NINLayer(
            l_hidden_context,
            num_units=self.options.listener_cell_size,
            nonlinearity=NONLINEARITIES[
                self.options.listener_nonlinearity],
            b=Constant(0.1),
            name=id_tag + 'hidden_context%d' % i)
    # Mean-pool over the context colors (axis 3).
    l_pool = FeaturePoolLayer(l_hidden_context,
                              pool_size=self.context_len,
                              axis=3,
                              pool_function=T.mean,
                              name=id_tag + 'pool')
    l_pool_squeezed = reshape(l_pool, ([0], [1], [2]),
                              name=id_tag + 'pool_squeezed')
    # Back to (batch, seq_len, cell_size) to concat with the embeddings.
    l_pool_shuffle = dimshuffle(l_pool_squeezed, (0, 2, 1),
                                name=id_tag + 'shuffle_out')
    l_concat = ConcatLayer([l_pool_shuffle, l_in_embed],
                           axis=2,
                           name=id_tag + 'concat_inp_context')

    # Recurrent cell type and shared kwargs chosen from options.
    cell = CELLS[self.options.listener_cell]
    cell_kwargs = {
        'grad_clipping': self.options.listener_grad_clipping,
        'num_units': self.options.listener_cell_size,
    }
    if self.options.listener_cell == 'LSTM':
        # LSTM-only: initialize the forget-gate bias from options.
        cell_kwargs['forgetgate'] = Gate(
            b=Constant(self.options.listener_forget_bias))
    if self.options.listener_cell != 'GRU':
        # GRU has fixed gate nonlinearities; others take one from options.
        cell_kwargs['nonlinearity'] = NONLINEARITIES[
            self.options.listener_nonlinearity]

    # l_rec1_drop = l_concat
    l_rec1 = cell(l_concat, name=id_tag + 'rec1', **cell_kwargs)
    if self.options.listener_dropout > 0.0:
        l_rec1_drop = DropoutLayer(l_rec1,
                                   p=self.options.listener_dropout,
                                   name=id_tag + 'rec1_drop')
    else:
        l_rec1_drop = l_rec1
    # Second recurrent layer keeps only the final timestep.
    l_rec2 = cell(l_rec1_drop,
                  name=id_tag + 'rec2',
                  only_return_final=True,
                  **cell_kwargs)
    if self.options.listener_dropout > 0.0:
        l_rec2_drop = DropoutLayer(l_rec2,
                                   p=self.options.listener_dropout,
                                   name=id_tag + 'rec2_drop')
    else:
        l_rec2_drop = l_rec2
    l_rec2_drop = NINLayer(l_rec2_drop,
                           num_units=self.options.listener_cell_size,
                           nonlinearity=None,
                           name=id_tag + 'rec2_dense')

    # Context is fed into the RNN as one copy for each time step; just use
    # the first time step for output.
    # Input shape: (batch_size, repr_size, seq_len, context_len)
    # Output shape: (batch_size, repr_size, context_len)
    l_context_nonrec = SliceLayer(l_hidden_context,
                                  indices=0,
                                  axis=2,
                                  name=id_tag + 'context_nonrec')
    l_pool_nonrec = SliceLayer(l_pool_squeezed,
                               indices=0,
                               axis=2,
                               name=id_tag + 'pool_nonrec')

    # Output shape: (batch_size, repr_size, context_len)
    # Difference of each color from the pooled context mean.
    l_sub = broadcast_sub_layer(
        l_pool_nonrec,
        l_context_nonrec,
        feature_dim=self.options.listener_cell_size,
        id_tag=id_tag)
    # Output shape: (batch_size, repr_size * 2, context_len)
    # NOTE(review): this name duplicates the earlier 'concat_inp_context'
    # layer name — looks like a copy-paste leftover; confirm whether layer
    # names must be unique here.
    l_concat_sub = ConcatLayer([l_context_nonrec, l_sub],
                               axis=1,
                               name=id_tag + 'concat_inp_context')
    # Output shape: (batch_size, cell_size, context_len)
    l_hidden = NINLayer(l_concat_sub,
                        num_units=self.options.listener_cell_size,
                        nonlinearity=None,
                        name=id_tag + 'hidden')
    if self.options.listener_dropout > 0.0:
        l_hidden_drop = DropoutLayer(l_hidden,
                                     p=self.options.listener_dropout,
                                     name=id_tag + 'hidden_drop')
    else:
        l_hidden_drop = l_hidden

    # Score each color by its dot product with the description encoding.
    l_dot = broadcast_dot_layer(
        l_rec2_drop,
        l_hidden_drop,
        feature_dim=self.options.listener_cell_size,
        id_tag=id_tag)
    l_dot_bias = l_dot  # BiasLayer(l_dot, name=id_tag + 'dot_bias')
    l_dot_clipped = NonlinearityLayer(
        l_dot_bias,
        nonlinearity=NONLINEARITIES[self.options.listener_nonlinearity],
        name=id_tag + 'dot_clipped')
    l_scores = NonlinearityLayer(l_dot_clipped,
                                 nonlinearity=softmax,
                                 name=id_tag + 'scores')

    return l_scores, [l_in] + context_inputs
def main(exp_name, embed_data, train_data, train_data_stats, val_data,
         val_data_stats, test_data, test_data_stats, log_path, batch_size,
         num_epochs, unroll_steps, learn_rate, num_dense, dense_dim, penalty,
         reg_coeff):
    """
    Main run function for training model.

    Builds a sum-of-embeddings premise/hypothesis model, compiles the
    Theano train/cost/accuracy functions, and runs the minibatch training
    loop (interruptible with Ctrl-C), logging cost and accuracy via Stats.

    :param exp_name: experiment name, used by Stats for logging
    :param embed_data: path/handle for the embedding table
    :param train_data:
    :param train_data_stats:
    :param val_data:
    :param val_data_stats:
    :param test_data: (currently unused in this function)
    :param test_data_stats: (currently unused in this function)
    :param log_path: (currently unused in this function)
    :param batch_size:
    :param num_epochs:
    :param unroll_steps: fixed sequence length for premise/hypothesis
    :param learn_rate:
    :param num_dense: Number of dense fully connected layers to add after concatenation layer
    :param dense_dim: Dimension of dense FC layers -- note this only applies if num_dense > 1
    :param penalty: Penalty to use for regularization ("l1" or "l2")
    :param reg_coeff: Regularization coeff to use for each layer of network;
                      may want to support different coefficient for different layers
    :return:
    """
    # Set random seed for deterministic results
    np.random.seed(0)
    # Used below to truncate the validation set (overfitting sanity check).
    num_ex_to_train = 30

    # Load embedding table
    table = EmbeddingTable(embed_data)
    vocab_size = table.sizeVocab
    dim_embeddings = table.dimEmbeddings
    embeddings_mat = table.embeddings

    # Premise/hypothesis index matrices, padded/truncated to unroll_steps.
    train_prem, train_hyp = generate_data(train_data,
                                          train_data_stats,
                                          "left",
                                          "right",
                                          table,
                                          seq_len=unroll_steps)
    val_prem, val_hyp = generate_data(val_data,
                                      val_data_stats,
                                      "left",
                                      "right",
                                      table,
                                      seq_len=unroll_steps)
    train_labels = convertLabelsToMat(train_data)
    val_labels = convertLabelsToMat(val_data)

    # To test for overfitting capabilities of model
    if num_ex_to_train > 0:
        val_prem = val_prem[0:num_ex_to_train]
        val_hyp = val_hyp[0:num_ex_to_train]
        val_labels = val_labels[0:num_ex_to_train]

    # Theano expressions for premise/hypothesis inputs to network
    x_p = T.imatrix()
    x_h = T.imatrix()
    target_values = T.fmatrix(name="target_output")

    # Embedding layer for premise
    l_in_prem = InputLayer((batch_size, unroll_steps))
    l_embed_prem = EmbeddingLayer(l_in_prem,
                                  input_size=vocab_size,
                                  output_size=dim_embeddings,
                                  W=embeddings_mat)

    # Embedding layer for hypothesis
    l_in_hyp = InputLayer((batch_size, unroll_steps))
    l_embed_hyp = EmbeddingLayer(l_in_hyp,
                                 input_size=vocab_size,
                                 output_size=dim_embeddings,
                                 W=embeddings_mat)

    # Ensure embedding matrix parameters are not trainable
    l_embed_hyp.params[l_embed_hyp.W].remove('trainable')
    l_embed_prem.params[l_embed_prem.W].remove('trainable')

    # Each sentence is represented as the sum of its token embeddings.
    l_embed_hyp_sum = SumEmbeddingLayer(l_embed_hyp)
    l_embed_prem_sum = SumEmbeddingLayer(l_embed_prem)

    # Concatenate sentence embeddings for premise and hypothesis
    l_concat = ConcatLayer([l_embed_hyp_sum, l_embed_prem_sum])

    l_in = l_concat
    l_output = l_concat
    # Add 'num_dense' dense layers with tanh
    # top layer is softmax
    if num_dense > 1:
        for n in range(num_dense):
            if n == num_dense - 1:
                l_output = DenseLayer(
                    l_in,
                    num_units=NUM_DENSE_UNITS,
                    nonlinearity=lasagne.nonlinearities.softmax)
            else:
                l_in = DenseLayer(l_in,
                                  num_units=dense_dim,
                                  nonlinearity=lasagne.nonlinearities.tanh)
    else:
        l_output = DenseLayer(l_in,
                              num_units=NUM_DENSE_UNITS,
                              nonlinearity=lasagne.nonlinearities.softmax)

    network_output = get_output(l_output, {
        l_in_prem: x_p,
        l_in_hyp: x_h
    })  # Will have shape (batch_size, 3)
    f_dense_output = theano.function([x_p, x_h],
                                     network_output,
                                     on_unused_input='warn')

    # Compute cost
    # NOTE(review): if penalty is neither "l2" nor "l1", p_metric is never
    # bound and the regularization call below raises NameError — confirm
    # the CLI restricts penalty to these two values.
    if penalty == "l2":
        p_metric = l2
    elif penalty == "l1":
        p_metric = l1

    layers = lasagne.layers.get_all_layers(l_output)
    layer_dict = {l: reg_coeff for l in layers}
    # NOTE(review): reg_coeff appears both as the per-layer weight in
    # layer_dict and as a multiplier here, so the penalty is effectively
    # reg_coeff**2 per layer — verify this is intended.
    reg_cost = reg_coeff * regularize_layer_params_weighted(
        layer_dict, p_metric)
    cost = T.mean(
        T.nnet.categorical_crossentropy(network_output,
                                        target_values).mean()) + reg_cost
    compute_cost = theano.function([x_p, x_h, target_values], cost)

    # Compute accuracy
    accuracy = T.mean(T.eq(T.argmax(network_output, axis=-1),
                           T.argmax(target_values, axis=-1)),
                      dtype=theano.config.floatX)
    compute_accuracy = theano.function([x_p, x_h, target_values], accuracy)

    label_output = T.argmax(network_output, axis=-1)
    predict = theano.function([x_p, x_h], label_output)

    # Define update/train functions
    all_params = lasagne.layers.get_all_params(l_output, trainable=True)
    updates = lasagne.updates.rmsprop(cost, all_params, learn_rate)
    train = theano.function([x_p, x_h, target_values],
                            cost,
                            updates=updates)

    # TODO: Augment embedding layer to allow for masking inputs

    stats = Stats(exp_name)
    acc_num = 10  # evaluate accuracy every acc_num * batch_size examples

    #minibatches = getMinibatchesIdx(val_prem.shape[0], batch_size)
    minibatches = getMinibatchesIdx(train_prem.shape[0], batch_size)
    print("Training ...")
    try:
        total_num_ex = 0
        for epoch in xrange(num_epochs):
            for _, minibatch in minibatches:
                total_num_ex += len(minibatch)
                stats.log("Processed {0} total examples in epoch {1}".format(
                    str(total_num_ex), str(epoch)))

                #prem_batch = val_prem[minibatch]
                #hyp_batch = val_hyp[minibatch]
                #labels_batch = val_labels[minibatch]

                prem_batch = train_prem[minibatch]
                hyp_batch = train_hyp[minibatch]
                labels_batch = train_labels[minibatch]

                train(prem_batch, hyp_batch, labels_batch)
                cost_val = compute_cost(prem_batch, hyp_batch, labels_batch)

                stats.recordCost(total_num_ex, cost_val)

                # Periodically compute and log train/dev accuracy
                if total_num_ex % (acc_num * batch_size) == 0:
                    train_acc = compute_accuracy(train_prem, train_hyp,
                                                 train_labels)
                    dev_acc = compute_accuracy(val_prem, val_hyp, val_labels)
                    stats.recordAcc(total_num_ex, train_acc, dataset="train")
                    stats.recordAcc(total_num_ex, dev_acc, dataset="dev")

    except KeyboardInterrupt:
        # Allow the user to stop training early without losing logs.
        pass
def _get_l_out(self, input_vars):
    """Build the output layer graph for a feed-forward-context speaker.

    Color representations (already tiled across timesteps) pass through
    NIN layers, are concatenated with the embedded previous output tokens,
    and feed a stack of recurrent layers ending in a per-timestep softmax
    over the token vocabulary.

    :param input_vars: Theano variables; the last two are the
        previous-output token matrix and the sequence mask — everything
        before them is color input.
    :return: ``(l_out, input_layers)`` — token distributions of shape
        (batch, max_len - 1, vocab) and the list of InputLayers.
    """
    check_options(self.options)
    id_tag = (self.id + '/') if self.id else ''

    prev_output_var, mask_var = input_vars[-2:]
    color_input_vars = input_vars[:-2]

    # Older instances pickled before context_len existed default to 1.
    context_len = self.context_len if hasattr(self, 'context_len') else 1
    l_color_repr, color_inputs = self.color_vec.get_input_layer(
        color_input_vars,
        recurrent_length=self.seq_vec.max_len - 1,
        cell_size=self.options.speaker_cell_size,
        context_len=context_len,
        id=self.id)
    # Move repr to the feature axis for NIN layers, then back.
    l_hidden_color = dimshuffle(l_color_repr, (0, 2, 1))
    for i in range(1, self.options.speaker_hidden_color_layers + 1):
        l_hidden_color = NINLayer(
            l_hidden_color,
            num_units=self.options.speaker_cell_size,
            nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
            name=id_tag + 'hidden_color%d' % i)
    l_hidden_color = dimshuffle(l_hidden_color, (0, 2, 1))

    # Previous-output tokens, embedded and concatenated with the colors.
    l_prev_out = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                            input_var=prev_output_var,
                            name=id_tag + 'prev_input')
    l_prev_embed = EmbeddingLayer(
        l_prev_out,
        input_size=len(self.seq_vec.tokens),
        output_size=self.options.speaker_cell_size,
        name=id_tag + 'prev_embed')
    l_in = ConcatLayer([l_hidden_color, l_prev_embed],
                       axis=2,
                       name=id_tag + 'color_prev')
    l_mask_in = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                           input_var=mask_var,
                           name=id_tag + 'mask_input')
    l_rec_drop = l_in

    # Recurrent cell type and shared kwargs chosen from options.
    cell = CELLS[self.options.speaker_cell]
    cell_kwargs = {
        'mask_input': (None if self.options.speaker_no_mask else l_mask_in),
        'grad_clipping': self.options.speaker_grad_clipping,
        'num_units': self.options.speaker_cell_size,
    }
    if self.options.speaker_cell == 'LSTM':
        # LSTM-only: initialize the forget-gate bias from options.
        cell_kwargs['forgetgate'] = Gate(
            b=Constant(self.options.speaker_forget_bias))
    if self.options.speaker_cell != 'GRU':
        # GRU has fixed gate nonlinearities; others take one from options.
        cell_kwargs['nonlinearity'] = NONLINEARITIES[
            self.options.speaker_nonlinearity]

    # All but the last recurrent layer get optional dropout between them.
    for i in range(1, self.options.speaker_recurrent_layers):
        l_rec = cell(l_rec_drop, name=id_tag + 'rec%d' % i, **cell_kwargs)
        if self.options.speaker_dropout > 0.0:
            l_rec_drop = DropoutLayer(l_rec,
                                      p=self.options.speaker_dropout,
                                      name=id_tag + 'rec%d_drop' % i)
        else:
            l_rec_drop = l_rec
    l_rec = cell(l_rec_drop,
                 name=id_tag + 'rec%d' % self.options.speaker_recurrent_layers,
                 **cell_kwargs)
    # Flatten (batch, time, cell) -> (batch*time, cell) for the dense head.
    l_shape = ReshapeLayer(l_rec,
                           (-1, self.options.speaker_cell_size),
                           name=id_tag + 'reshape')
    l_hidden_out = l_shape
    for i in range(1, self.options.speaker_hidden_out_layers + 1):
        l_hidden_out = DenseLayer(
            l_hidden_out,
            num_units=self.options.speaker_cell_size,
            nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
            name=id_tag + 'hidden_out%d' % i)
    l_softmax = DenseLayer(l_hidden_out,
                           num_units=len(self.seq_vec.tokens),
                           nonlinearity=softmax,
                           name=id_tag + 'softmax')
    # Restore the time axis: (batch, max_len - 1, vocab).
    l_out = ReshapeLayer(
        l_softmax,
        (-1, self.seq_vec.max_len - 1, len(self.seq_vec.tokens)),
        name=id_tag + 'out')

    return l_out, color_inputs + [l_prev_out, l_mask_in]