def build_recur_dropout(incoming1, incoming2, num_units, num_labels, mask,
                        grad_clipping, num_filters, p):
    # Construct Bi-directional LSTM-CNNs-CRF with recurrent dropout.
    # first get some necessary dimensions or parameters
    conv_window = 3
    # shape = [batch, n-step, c_dim, char_length]
    # construct convolution layer
    # shape = [batch, n-step, c_filters, output_length]
    cnn_layer = ConvTimeStep1DLayer(incoming1, num_filters=num_filters,
                                    filter_size=conv_window, pad='full',
                                    nonlinearity=lasagne.nonlinearities.tanh,
                                    name='cnn')
    # infer the pool size for pooling (pool size should go through all time steps of cnn)
    _, _, _, pool_size = cnn_layer.output_shape
    # construct max pool layer
    # shape = [batch, n-step, c_filters, 1]
    pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
    # reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))
    # finally, concatenate the two incoming layers together.
    # shape = [batch, n-step, c_filters + w_dim]
    incoming = lasagne.layers.concat([output_cnn_layer, incoming2], axis=2)
    # dropout for incoming
    incoming = lasagne.layers.DropoutLayer(incoming, p=p, shared_axes=(1,))

    ingate_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                          W_hid=lasagne.init.GlorotUniform(),
                          W_cell=lasagne.init.Uniform(range=0.1))
    outgate_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                           W_hid=lasagne.init.GlorotUniform(),
                           W_cell=lasagne.init.Uniform(range=0.1))
    # according to Jozefowicz et al. (2015), initialize the bias of the forget gate to 1.
    forgetgate_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                              W_hid=lasagne.init.GlorotUniform(),
                              W_cell=lasagne.init.Uniform(range=0.1),
                              b=lasagne.init.Constant(1.))
    # now use tanh as the cell nonlinearity; a purely linear cell is still worth trying.
    cell_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                        W_hid=lasagne.init.GlorotUniform(),
                        W_cell=None, nonlinearity=nonlinearities.tanh)
    lstm_forward = LSTMLayer(incoming, num_units, mask_input=mask,
                             grad_clipping=grad_clipping,
                             nonlinearity=nonlinearities.tanh, peepholes=False,
                             ingate=ingate_forward, outgate=outgate_forward,
                             forgetgate=forgetgate_forward, cell=cell_forward,
                             p=p, name='forward')

    ingate_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                           W_hid=lasagne.init.GlorotUniform(),
                           W_cell=lasagne.init.Uniform(range=0.1))
    outgate_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                            W_hid=lasagne.init.GlorotUniform(),
                            W_cell=lasagne.init.Uniform(range=0.1))
    # according to Jozefowicz et al. (2015), initialize the bias of the forget gate to 1.
    forgetgate_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                               W_hid=lasagne.init.GlorotUniform(),
                               W_cell=lasagne.init.Uniform(range=0.1),
                               b=lasagne.init.Constant(1.))
    # now use tanh as the cell nonlinearity; a purely linear cell is still worth trying.
    cell_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                         W_hid=lasagne.init.GlorotUniform(),
                         W_cell=None, nonlinearity=nonlinearities.tanh)
    lstm_backward = LSTMLayer(incoming, num_units, mask_input=mask,
                              grad_clipping=grad_clipping,
                              nonlinearity=nonlinearities.tanh, peepholes=False,
                              backwards=True, ingate=ingate_backward,
                              outgate=outgate_backward,
                              forgetgate=forgetgate_backward, cell=cell_backward,
                              p=p, name='backward')

    # concatenate the outputs of the forward and backward LSTMs.
    bi_lstm_cnn = lasagne.layers.concat([lstm_forward, lstm_backward], axis=2,
                                        name="bi-lstm")
    # shape = [batch, n-step, 2 * num_units]
    bi_lstm_cnn = lasagne.layers.DropoutLayer(bi_lstm_cnn, p=p, shared_axes=(1,))

    return ChainCRFLayer(bi_lstm_cnn, num_labels, mask_input=mask)
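# Illustrative sketch (not part of the original model code): what shared_axes=(1,)
# does in the DropoutLayer calls above. One dropout mask is sampled per (batch,
# feature) position and reused at every time step, instead of being resampled at
# each step. Wrapped in a helper so it does not run at import time; the helper
# name and toy shapes are assumptions for demonstration only.
def _demo_dropout_shared_over_time():
    import numpy as np
    import theano
    import theano.tensor as T
    import lasagne

    x_var = T.tensor3('x')  # shape = [batch, n-step, dim]
    l_in = lasagne.layers.InputLayer(shape=(None, None, 4), input_var=x_var)
    l_drop = lasagne.layers.DropoutLayer(l_in, p=0.5, shared_axes=(1,))
    # deterministic=False keeps dropout active, as during training
    out = lasagne.layers.get_output(l_drop, deterministic=False)
    fn = theano.function([x_var], out)

    x = np.ones((1, 3, 4), dtype=theano.config.floatX)
    y = fn(x)
    # the same mask is applied at every time step, so all rows along axis 1 match
    assert np.allclose(y[0, 0], y[0, 1]) and np.allclose(y[0, 1], y[0, 2])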
def build_std_dropout_sgru(incoming1, incoming2, num_units, num_labels, mask,
                           grad_clipping, num_filters, p):
    # Construct Bi-directional SGRU-CNNs with standard dropout.
    # first get some necessary dimensions or parameters
    conv_window = 3
    # shape = [batch, n-step, c_dim, char_length]
    incoming1 = lasagne.layers.DropoutLayer(incoming1, p=p)
    # construct convolution layer
    # shape = [batch, n-step, c_filters, output_length]
    cnn_layer = ConvTimeStep1DLayer(incoming1, num_filters=num_filters,
                                    filter_size=conv_window, pad='full',
                                    nonlinearity=lasagne.nonlinearities.tanh,
                                    name='cnn')
    # infer the pool size for pooling (pool size should go through all time steps of cnn)
    _, _, _, pool_size = cnn_layer.output_shape
    # construct max pool layer
    # shape = [batch, n-step, c_filters, 1]
    pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
    # reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))
    # finally, concatenate the two incoming layers together.
    # shape = [batch, n-step, c_filters + w_dim]
    incoming = lasagne.layers.concat([output_cnn_layer, incoming2], axis=2)
    # dropout for incoming
    incoming = lasagne.layers.DropoutLayer(incoming, p=0.2)

    resetgate_input_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                                   W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    resetgate_hidden_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                                    W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    updategate_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                              W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    hidden_update_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                                 W_hid=lasagne.init.GlorotUniform(),
                                 W_cell=None, nonlinearity=nonlinearities.tanh)
    sgru_forward = SGRULayer(incoming, num_units, mask_input=mask,
                             resetgate_input=resetgate_input_forward,
                             resetgate_hidden=resetgate_hidden_forward,
                             updategate=updategate_forward,
                             hidden_update=hidden_update_forward,
                             grad_clipping=grad_clipping, name='forward')

    resetgate_input_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                                    W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    resetgate_hidden_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                                     W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    updategate_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                               W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    hidden_update_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                                  W_hid=lasagne.init.GlorotUniform(),
                                  W_cell=None, nonlinearity=nonlinearities.tanh)
    sgru_backward = SGRULayer(incoming, num_units, mask_input=mask, backwards=True,
                              resetgate_input=resetgate_input_backward,
                              resetgate_hidden=resetgate_hidden_backward,
                              updategate=updategate_backward,
                              hidden_update=hidden_update_backward,
                              grad_clipping=grad_clipping, name='backward')

    # concatenate the outputs of the forward and backward SGRUs.
    bi_sgru_cnn = lasagne.layers.concat([sgru_forward, sgru_backward], axis=2,
                                        name="bi-sgru")
    bi_sgru_cnn = lasagne.layers.DropoutLayer(bi_sgru_cnn, p=p)

    # reshape bi-sgru-cnn to [batch * max_length, 2 * num_units]
    bi_sgru_cnn = lasagne.layers.reshape(bi_sgru_cnn, (-1, [2]))

    # construct output layer (dense layer with softmax)
    layer_output = lasagne.layers.DenseLayer(bi_sgru_cnn, num_units=num_labels,
                                             nonlinearity=nonlinearities.softmax,
                                             name='softmax')
    return layer_output
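# Illustrative sketch (helper name and toy sizes are assumptions, not original
# code) of the reshape trick used at the end of build_std_dropout_sgru: the
# [batch, n-step, dim] sequence is flattened to [batch * n-step, dim] so a plain
# DenseLayer with softmax can score every token, and per-token predictions can
# be reshaped back to [batch, n-step] afterwards.
def _demo_flatten_for_softmax():
    import numpy as np
    import theano
    import theano.tensor as T
    import lasagne
    from lasagne import nonlinearities

    x_var = T.tensor3('x')  # shape = [batch, n-step, dim]
    l_in = lasagne.layers.InputLayer(shape=(None, None, 8), input_var=x_var)
    l_flat = lasagne.layers.reshape(l_in, (-1, [2]))  # [batch * n-step, 8]
    l_soft = lasagne.layers.DenseLayer(l_flat, num_units=5,
                                       nonlinearity=nonlinearities.softmax)
    probs = lasagne.layers.get_output(l_soft, deterministic=True)
    fn = theano.function([x_var], probs)

    x = np.random.randn(2, 7, 8).astype(theano.config.floatX)
    scores = fn(x)  # shape = (2 * 7, 5)
    labels = scores.argmax(axis=-1).reshape(2, 7)  # back to [batch, n-step]
    return labels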
def build_network(word_var, char_var, pos_var, mask_var, word_alphabet, char_alphabet,
                  pos_alphabet, depth, num_units, num_types, grad_clipping=5.0,
                  num_filters=30, p=0.5, mlp=1, peepholes=False, use_char=False,
                  use_pos=False, normalize_digits=True, embedding='glove',
                  embedding_path='data/glove/glove.6B/glove.6B.100d.gz',
                  char_embedding='random', char_path=None):
    def generate_random_embedding(scale, shape):
        return np.random.uniform(-scale, scale, shape).astype(theano.config.floatX)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / WORD_DIM)
        table = np.empty([word_alphabet.size(), WORD_DIM], dtype=theano.config.floatX)
        table[data_utils.UNK_ID, :] = generate_random_embedding(scale, [1, WORD_DIM])
        for word, index in word_alphabet.iteritems():
            ww = word.lower() if caseless else word
            embedd = embedd_dict[ww] if ww in embedd_dict \
                else generate_random_embedding(scale, [1, WORD_DIM])
            table[index, :] = embedd
        print 'construct word table: %s, dimension: %d' % (embedding, table.shape[1])
        return table

    def construct_char_embedding_table():
        if char_embedding == 'random':
            scale = np.sqrt(3.0 / CHARACTER_DIM)
            table = generate_random_embedding(scale, [char_alphabet.size(), CHARACTER_DIM])
        else:
            char_dict, char_dim, caseless = utils.load_word_embedding_dict(
                char_embedding, char_path, normalize_digits=False)
            scale = np.sqrt(3.0 / char_dim)
            table = np.empty([char_alphabet.size(), char_dim], dtype=theano.config.floatX)
            table[data_utils.UNK_ID, :] = generate_random_embedding(scale, [1, char_dim])
            for char, index in char_alphabet.iteritems():
                cc = char.lower() if caseless else char
                char_embedd = char_dict[cc] if cc in char_dict \
                    else generate_random_embedding(scale, [1, char_dim])
                table[index, :] = char_embedd
        print 'construct character table: %s, dimension: %d' % (char_embedding, table.shape[1])
        return table

    def construct_pos_embedding_table():
        scale = np.sqrt(3.0 / POS_DIM)
        table = generate_random_embedding(scale, [pos_alphabet.size(), POS_DIM])
        print 'construct pos table: %s, dimension: %d' % ('random', table.shape[1])
        return table

    def construct_word_input_layer():
        # shape = [batch, n-step]
        layer_word_input = lasagne.layers.InputLayer(shape=(None, None),
                                                     input_var=word_var,
                                                     name='word_input')
        # shape = [batch, n-step, w_dim]
        layer_word_embedding = lasagne.layers.EmbeddingLayer(layer_word_input,
                                                             input_size=word_alphabet.size(),
                                                             output_size=WORD_DIM,
                                                             W=word_table, name='word_embedd')
        return layer_word_embedding

    def construct_pos_input_layer():
        # shape = [batch, n-step]
        layer_pos_input = lasagne.layers.InputLayer(shape=(None, None),
                                                    input_var=pos_var,
                                                    name='pos_input')
        # shape = [batch, n-step, pos_dim]
        layer_pos_embedding = lasagne.layers.EmbeddingLayer(layer_pos_input,
                                                            input_size=pos_alphabet.size(),
                                                            output_size=POS_DIM,
                                                            W=pos_table, name='pos_embedd')
        return layer_pos_embedding

    def construct_char_input_layer():
        # shape = [batch, n-step, char_length]
        layer_char_input = lasagne.layers.InputLayer(
            shape=(None, None, data_utils.MAX_CHAR_LENGTH),
            input_var=char_var, name='char_input')
        # shape = [batch, n-step, char_length, c_dim]
        layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char_input,
                                                             input_size=char_alphabet.size(),
                                                             output_size=CHARACTER_DIM,
                                                             W=char_table, name='char_embedd')
        # shape = [batch, n-step, c_dim, char_length]
        layer_char_embedding = lasagne.layers.DimshuffleLayer(layer_char_embedding,
                                                              pattern=(0, 1, 3, 2))
        return layer_char_embedding

    def construct_bi_lstm_layer():
        lstm_forward = incoming
        lstm_backward = incoming
        assert depth > 0
        for d in xrange(depth):
            ingate_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                                  W_hid=lasagne.init.GlorotUniform(),
                                  W_cell=lasagne.init.Uniform(range=0.1))
            outgate_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                                   W_hid=lasagne.init.GlorotUniform(),
                                   W_cell=lasagne.init.Uniform(range=0.1))
            # according to Jozefowicz et al. (2015), initialize the bias of the forget gate to 1.
            forgetgate_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                                      W_hid=lasagne.init.GlorotUniform(),
                                      W_cell=lasagne.init.Uniform(range=0.1),
                                      b=lasagne.init.Constant(1.))
            # now use tanh as the cell nonlinearity; a purely linear cell is still worth trying.
            cell_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                                W_hid=lasagne.init.GlorotUniform(),
                                W_cell=None, nonlinearity=nonlinearities.tanh)
            lstm_forward = LSTMLayer(lstm_forward, num_units, mask_input=mask,
                                     grad_clipping=grad_clipping,
                                     nonlinearity=nonlinearities.tanh,
                                     peepholes=peepholes, ingate=ingate_forward,
                                     outgate=outgate_forward,
                                     forgetgate=forgetgate_forward,
                                     cell=cell_forward, p=p, name='forward%d' % d)
            lstm_forward = lasagne.layers.DropoutLayer(lstm_forward, p=0.33,
                                                       shared_axes=(1,))
            # ----------------------------------------------------------------------------
            ingate_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                                   W_hid=lasagne.init.GlorotUniform(),
                                   W_cell=lasagne.init.Uniform(range=0.1))
            outgate_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                                    W_hid=lasagne.init.GlorotUniform(),
                                    W_cell=lasagne.init.Uniform(range=0.1))
            # according to Jozefowicz et al. (2015), initialize the bias of the forget gate to 1.
            forgetgate_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                                       W_hid=lasagne.init.GlorotUniform(),
                                       W_cell=lasagne.init.Uniform(range=0.1),
                                       b=lasagne.init.Constant(1.))
            # now use tanh as the cell nonlinearity; a purely linear cell is still worth trying.
            cell_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                                 W_hid=lasagne.init.GlorotUniform(),
                                 W_cell=None, nonlinearity=nonlinearities.tanh)
            lstm_backward = LSTMLayer(lstm_backward, num_units, mask_input=mask,
                                      grad_clipping=grad_clipping,
                                      nonlinearity=nonlinearities.tanh,
                                      peepholes=peepholes, backwards=True,
                                      ingate=ingate_backward,
                                      outgate=outgate_backward,
                                      forgetgate=forgetgate_backward,
                                      cell=cell_backward, p=p, name='backward%d' % d)
            lstm_backward = lasagne.layers.DropoutLayer(lstm_backward, p=0.33,
                                                        shared_axes=(1,))
            # ----------------------------------------------------------------------------

        # concatenate the outputs of the forward and backward LSTMs.
        bi_lstm_cnn = lasagne.layers.concat([lstm_forward, lstm_backward], axis=2,
                                            name="bi-lstm")
        return bi_lstm_cnn

    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(
        embedding, embedding_path, normalize_digits=normalize_digits)
    WORD_DIM = embedd_dim
    POS_DIM = 50
    CHARACTER_DIM = 50

    word_table = construct_word_embedding_table()
    pos_table = construct_pos_embedding_table() if use_pos else None
    char_table = construct_char_embedding_table() if use_char else None
    if char_table is not None:
        CHARACTER_DIM = char_table.shape[1]

    layer_word_input = construct_word_input_layer()
    incoming = layer_word_input

    mask = lasagne.layers.InputLayer(shape=(None, None), input_var=mask_var, name='mask')

    if use_pos:
        layer_pos_input = construct_pos_input_layer()
        incoming = lasagne.layers.concat([incoming, layer_pos_input], axis=2)

    if use_char:
        layer_char_input = construct_char_input_layer()
        # dropout before CNN
        # TODO
        # layer_char_input = lasagne.layers.DropoutLayer(layer_char_input, p=0.15)

        # construct the character-level CNN, as in the builders above.
        conv_window = 3
        # shape = [batch, n-step, c_dim, char_length]
        # construct convolution layer
        # shape = [batch, n-step, c_filters, output_length]
        cnn_layer = ConvTimeStep1DLayer(layer_char_input, num_filters=num_filters,
                                        filter_size=conv_window, pad='full',
                                        nonlinearity=lasagne.nonlinearities.tanh,
                                        name='cnn')
        # infer the pool size for pooling (pool size should go through all time steps of cnn)
        _, _, _, pool_size = cnn_layer.output_shape
        # construct max pool layer
        # shape = [batch, n-step, c_filters, 1]
        pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
        # reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
        output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))
        # finally, concatenate the two incoming layers together.
        # shape = [batch, n-step, c_filters + w_dim]
        incoming = lasagne.layers.concat([output_cnn_layer, incoming], axis=2)

    # dropout for incoming
    incoming = lasagne.layers.DropoutLayer(incoming, p=0.15, shared_axes=(1,))

    # shape = [batch, n-step, 2 * num_units]
    bi_lstm_cnn = construct_bi_lstm_layer()

    # MLP layers
    # shape = [batch, n-step, 100]
    for d in xrange(1, mlp):
        bi_lstm_cnn = lasagne.layers.DenseLayer(bi_lstm_cnn, 100,
                                                nonlinearity=nonlinearities.elu,
                                                num_leading_axes=2, name='dense%d' % d)
        bi_lstm_cnn = lasagne.layers.DropoutLayer(bi_lstm_cnn, p=0.33, shared_axes=(1,))

    bi_lstm_cnn = lasagne.layers.DenseLayer(bi_lstm_cnn, 100,
                                            nonlinearity=nonlinearities.elu,
                                            num_leading_axes=2, name='dense%d' % mlp)

    return TreeBiAffineCRFLayer(bi_lstm_cnn, num_types, mask_input=mask, name='crf')
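# Small numerical sketch (helper name and vocabulary size are assumptions for
# illustration only) of the scale = sqrt(3 / dim) used when initializing the
# embedding tables in build_network: sampling from U(-sqrt(3/d), +sqrt(3/d))
# gives each embedding dimension a variance of 1/d, so a d-dimensional vector
# has an expected squared norm of about 1.
def _demo_embedding_init_scale(dim=100, vocab_size=50000):
    import numpy as np

    scale = np.sqrt(3.0 / dim)
    table = np.random.uniform(-scale, scale, (vocab_size, dim)).astype('float32')
    # per-dimension variance is approximately 1/d
    assert abs(table.var() - 1.0 / dim) < 1e-3
    return table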