def build_network(word_var, char_var, mask_var, word_alphabet, char_alphabet, dropout, num_units, num_labels,
                  grad_clipping=5.0, num_filters=30, p=0.5):
    def generate_random_embedding(scale, shape):
        return np.random.uniform(-scale, scale, shape).astype(theano.config.floatX)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / WORD_DIM)
        table = np.empty([word_alphabet.size(), WORD_DIM], dtype=theano.config.floatX)
        table[data_utils.UNK_ID, :] = generate_random_embedding(scale, [1, WORD_DIM])
        for word, index in word_alphabet.iteritems():
            ww = word.lower() if caseless else word
            embedding = embedd_dict[ww] if ww in embedd_dict else generate_random_embedding(scale, [1, WORD_DIM])
            table[index, :] = embedding
        return table

    def construct_char_embedding_table():
        scale = np.sqrt(3.0 / CHARACTER_DIM)
        table = generate_random_embedding(scale, [char_alphabet.size(), CHARACTER_DIM])
        return table

    def construct_word_input_layer():
        # shape = [batch, n-step]
        layer_word_input = lasagne.layers.InputLayer(shape=(None, None), input_var=word_var, name='word_input')
        # shape = [batch, n-step, w_dim]
        layer_word_embedding = lasagne.layers.EmbeddingLayer(layer_word_input, input_size=word_alphabet.size(),
                                                             output_size=WORD_DIM, W=word_table, name='word_embedd')
        return layer_word_embedding

    def construct_char_input_layer():
        # shape = [batch, n-step, char_length]
        layer_char_input = lasagne.layers.InputLayer(shape=(None, None, data_utils.MAX_CHAR_LENGTH),
                                                     input_var=char_var, name='char_input')
        # shape = [batch, n-step, char_length, c_dim]
        layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char_input, input_size=char_alphabet.size(),
                                                             output_size=CHARACTER_DIM, W=char_table,
                                                             name='char_embedd')
        # shape = [batch, n-step, c_dim, char_length]
        layer_char_embedding = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 1, 3, 2))
        return layer_char_embedding

    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict('glove',
                                                                       "data/glove/glove.6B/glove.6B.100d.gz")
    assert embedd_dim == WORD_DIM

    word_table = construct_word_embedding_table()
    char_table = construct_char_embedding_table()

    layer_char_input = construct_char_input_layer()
    layer_word_input = construct_word_input_layer()
    layer_mask = lasagne.layers.InputLayer(shape=(None, None), input_var=mask_var, name='mask')

    if dropout == 'std':
        return build_std_dropout(layer_char_input, layer_word_input, num_units, num_labels, layer_mask,
                                 grad_clipping, num_filters, p)
    elif dropout == 'recurrent':
        return build_recur_dropout(layer_char_input, layer_word_input, num_units, num_labels, layer_mask,
                                   grad_clipping, num_filters, p)
    else:
        raise ValueError('unknown dropout pattern: %s' % dropout)
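# Illustrative usage sketch (not part of the original source): one plausible way to create the
# Theano symbolic inputs for build_network above. It assumes `import theano.tensor as T` at
# module level (T is used elsewhere in this code) and that the word/char alphabets and number
# of labels come from the data pipeline; the hyper-parameter values are arbitrary placeholders.
def _example_sequence_labeling_build(word_alphabet, char_alphabet, num_labels):
    word_var = T.imatrix(name='word_inputs')   # [batch, n-step] word ids
    char_var = T.itensor3(name='char_inputs')  # [batch, n-step, char_length] character ids
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)  # [batch, n-step] padding mask
    return build_network(word_var, char_var, mask_var, word_alphabet, char_alphabet,
                         dropout='recurrent', num_units=200, num_labels=num_labels)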
def construct_char_embedding_table():
    if char_embedding == 'random':
        scale = np.sqrt(3.0 / CHARACTER_DIM)
        table = generate_random_embedding(scale, [char_alphabet.size(), CHARACTER_DIM])
    else:
        char_dict, char_dim, caseless = utils.load_word_embedding_dict(char_embedding, char_path,
                                                                       normalize_digits=False)
        scale = np.sqrt(3.0 / char_dim)
        table = np.empty([char_alphabet.size(), char_dim], dtype=theano.config.floatX)
        table[data_utils.UNK_ID, :] = generate_random_embedding(scale, [1, char_dim])
        for char, index in char_alphabet.iteritems():
            cc = char.lower() if caseless else char
            char_embedd = char_dict[cc] if cc in char_dict else generate_random_embedding(scale, [1, char_dim])
            table[index, :] = char_embedd
    print 'construct character table: %s, dimension: %d' % (char_embedding, table.shape[1])
    return table
def build_network(word_var, char_var, pos_var, mask_var, word_alphabet, char_alphabet, pos_alphabet, depth,
                  num_units, num_types, grad_clipping=5.0, num_filters=30, p=0.5, mlp=1, peepholes=False,
                  use_char=False, use_pos=False, normalize_digits=True, embedding='glove',
                  embedding_path='data/glove/glove.6B/glove.6B.100d.gz', char_embedding='random', char_path=None):
    def generate_random_embedding(scale, shape):
        return np.random.uniform(-scale, scale, shape).astype(theano.config.floatX)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / WORD_DIM)
        table = np.empty([word_alphabet.size(), WORD_DIM], dtype=theano.config.floatX)
        table[data_utils.UNK_ID, :] = generate_random_embedding(scale, [1, WORD_DIM])
        for word, index in word_alphabet.iteritems():
            ww = word.lower() if caseless else word
            embedd = embedd_dict[ww] if ww in embedd_dict else generate_random_embedding(scale, [1, WORD_DIM])
            table[index, :] = embedd
        print 'construct word table: %s, dimension: %d' % (embedding, table.shape[1])
        return table

    def construct_char_embedding_table():
        if char_embedding == 'random':
            scale = np.sqrt(3.0 / CHARACTER_DIM)
            table = generate_random_embedding(scale, [char_alphabet.size(), CHARACTER_DIM])
        else:
            char_dict, char_dim, caseless = utils.load_word_embedding_dict(char_embedding, char_path,
                                                                           normalize_digits=False)
            scale = np.sqrt(3.0 / char_dim)
            table = np.empty([char_alphabet.size(), char_dim], dtype=theano.config.floatX)
            table[data_utils.UNK_ID, :] = generate_random_embedding(scale, [1, char_dim])
            for char, index in char_alphabet.iteritems():
                cc = char.lower() if caseless else char
                char_embedd = char_dict[cc] if cc in char_dict else generate_random_embedding(scale, [1, char_dim])
                table[index, :] = char_embedd
        print 'construct character table: %s, dimension: %d' % (char_embedding, table.shape[1])
        return table

    def construct_pos_embedding_table():
        scale = np.sqrt(3.0 / POS_DIM)
        table = generate_random_embedding(scale, [pos_alphabet.size(), POS_DIM])
        print 'construct pos table: %s, dimension: %d' % ('random', table.shape[1])
        return table

    def construct_word_input_layer():
        # shape = [batch, n-step]
        layer_word_input = lasagne.layers.InputLayer(shape=(None, None), input_var=word_var, name='word_input')
        # shape = [batch, n-step, w_dim]
        layer_word_embedding = lasagne.layers.EmbeddingLayer(layer_word_input, input_size=word_alphabet.size(),
                                                             output_size=WORD_DIM, W=word_table, name='word_embedd')
        return layer_word_embedding

    def construct_pos_input_layer():
        # shape = [batch, n-step]
        layer_pos_input = lasagne.layers.InputLayer(shape=(None, None), input_var=pos_var, name='pos_input')
        # shape = [batch, n-step, w_dim]
        layer_pos_embedding = lasagne.layers.EmbeddingLayer(layer_pos_input, input_size=pos_alphabet.size(),
                                                            output_size=POS_DIM, W=pos_table, name='pos_embedd')
        return layer_pos_embedding

    def construct_char_input_layer():
        # shape = [batch, n-step, char_length]
        layer_char_input = lasagne.layers.InputLayer(shape=(None, None, data_utils.MAX_CHAR_LENGTH),
                                                     input_var=char_var, name='char_input')
        # shape = [batch, n-step, char_length, c_dim]
        layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char_input, input_size=char_alphabet.size(),
                                                             output_size=CHARACTER_DIM, W=char_table,
                                                             name='char_embedd')
        # shape = [batch, n-step, c_dim, char_length]
        layer_char_embedding = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 1, 3, 2))
        return layer_char_embedding

    def construct_bi_lstm_layer():
        lstm_forward = incoming
        lstm_backward = incoming
        assert depth > 0
        for d in xrange(depth):
            ingate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                  W_cell=lasagne.init.Uniform(range=0.1))
            outgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                   W_cell=lasagne.init.Uniform(range=0.1))
            # according to Jozefowicz et al.(2015), init bias of forget gate to 1.
            forgetgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                      W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
            # now use tanh for nonlinear function of cell, need to try pure linear cell
            cell_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                                nonlinearity=nonlinearities.tanh)
            lstm_forward = LSTMLayer(lstm_forward, num_units, mask_input=mask, grad_clipping=grad_clipping,
                                     nonlinearity=nonlinearities.tanh, peepholes=peepholes, ingate=ingate_forward,
                                     outgate=outgate_forward, forgetgate=forgetgate_forward, cell=cell_forward,
                                     p=p, name='forward%d' % d)
            lstm_forward = lasagne.layers.DropoutLayer(lstm_forward, p=0.33, shared_axes=(1,))

            # ----------------------------------------------------------------------------------------------------

            ingate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                   W_cell=lasagne.init.Uniform(range=0.1))
            outgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                    W_cell=lasagne.init.Uniform(range=0.1))
            # according to Jozefowicz et al.(2015), init bias of forget gate to 1.
            forgetgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                       W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
            # now use tanh for nonlinear function of cell, need to try pure linear cell
            cell_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                                 nonlinearity=nonlinearities.tanh)
            lstm_backward = LSTMLayer(lstm_backward, num_units, mask_input=mask, grad_clipping=grad_clipping,
                                      nonlinearity=nonlinearities.tanh, peepholes=peepholes, backwards=True,
                                      ingate=ingate_backward, outgate=outgate_backward,
                                      forgetgate=forgetgate_backward, cell=cell_backward, p=p,
                                      name='backward%d' % d)
            lstm_backward = lasagne.layers.DropoutLayer(lstm_backward, p=0.33, shared_axes=(1,))

            # ------------------------------------------------------------------------------------------------------

        # concatenate the outputs of forward and backward LSTMs to combine them.
        bi_lstm_cnn = lasagne.layers.concat([lstm_forward, lstm_backward], axis=2, name="bi-lstm")
        return bi_lstm_cnn

    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path,
                                                                       normalize_digits=normalize_digits)
    WORD_DIM = embedd_dim
    POS_DIM = 50
    CHARACTER_DIM = 50

    word_table = construct_word_embedding_table()
    pos_table = construct_pos_embedding_table() if use_pos else None
    char_table = construct_char_embedding_table() if use_char else None
    if char_table is not None:
        CHARACTER_DIM = char_table.shape[1]

    layer_word_input = construct_word_input_layer()
    incoming = layer_word_input

    mask = lasagne.layers.InputLayer(shape=(None, None), input_var=mask_var, name='mask')

    if use_pos:
        layer_pos_input = construct_pos_input_layer()
        incoming = lasagne.layers.concat([incoming, layer_pos_input], axis=2)

    if use_char:
        layer_char_input = construct_char_input_layer()
        # dropout before CNN
        # TODO
        # layer_char_input = lasagne.layers.DropoutLayer(layer_char_input, p=0.15)

        # Construct Bi-directional LSTM-CNNs-CRF with recurrent dropout.
        conv_window = 3
        # shape = [batch, n-step, c_dim, char_length]
        # construct convolution layer
        # shape = [batch, n-step, c_filters, output_length]
        cnn_layer = ConvTimeStep1DLayer(layer_char_input, num_filters=num_filters, filter_size=conv_window,
                                        pad='full', nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
        # infer the pool size for pooling (pool size should go through all time step of cnn)
        _, _, _, pool_size = cnn_layer.output_shape
        # construct max pool layer
        # shape = [batch, n-step, c_filters, 1]
        pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
        # reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
        output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))
        # finally, concatenate the two incoming layers together.
        # shape = [batch, n-step, c_filter&w_dim]
        incoming = lasagne.layers.concat([output_cnn_layer, incoming], axis=2)

    # dropout for incoming
    incoming = lasagne.layers.DropoutLayer(incoming, p=0.15, shared_axes=(1,))

    # shape [batch, n-step, num_units]
    bi_lstm_cnn = construct_bi_lstm_layer()

    # MLP layers
    # shape [batch, n-step, 100]
    for d in xrange(1, mlp):
        bi_lstm_cnn = lasagne.layers.DenseLayer(bi_lstm_cnn, 100, nonlinearity=nonlinearities.elu,
                                                num_leading_axes=2, name='dense%d' % d)
        bi_lstm_cnn = lasagne.layers.DropoutLayer(bi_lstm_cnn, p=0.33, shared_axes=(1,))
    bi_lstm_cnn = lasagne.layers.DenseLayer(bi_lstm_cnn, 100, nonlinearity=nonlinearities.elu, num_leading_axes=2,
                                            name='dense%d' % mlp)

    return TreeBiAffineCRFLayer(bi_lstm_cnn, num_types, mask_input=mask, name='crf')
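# Illustrative usage sketch (not part of the original source): a plausible call into the parsing
# build_network above with POS and character features enabled. The depth, unit counts, and other
# hyper-parameters are arbitrary placeholders, and `import theano.tensor as T` is assumed at
# module level.
def _example_parser_build(word_alphabet, char_alphabet, pos_alphabet, num_types):
    word_var = T.imatrix(name='word_inputs')   # [batch, n-step]
    char_var = T.itensor3(name='char_inputs')  # [batch, n-step, char_length]
    pos_var = T.imatrix(name='pos_inputs')     # [batch, n-step]
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    return build_network(word_var, char_var, pos_var, mask_var, word_alphabet, char_alphabet, pos_alphabet,
                         depth=2, num_units=256, num_types=num_types, mlp=2, use_char=True, use_pos=True)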
def main():
    parser = argparse.ArgumentParser(description='Tuning with bi-directional MAXRU-CNN')
    parser.add_argument('--architec', choices=['sgru', 'lstm', 'gru0', 'gru1'], help='architecture of rnn',
                        required=True)
    parser.add_argument('--num_epochs', type=int, default=1000, help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=16, help='Number of sentences in each batch')
    parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in TARU')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping')
    parser.add_argument('--schedule', nargs='+', type=int, help='schedule for learning rate decay')

    args = parser.parse_args()
    architec = args.architec
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    num_units = args.num_units
    learning_rate = args.learning_rate
    decay_rate = args.decay_rate
    schedule = args.schedule
    grad_clipping = args.grad_clipping
    logger = get_logger("Sentiment Classification (%s)" % (architec))

    def read_dataset(filename):
        data = [[] for _ in _buckets]
        print 'Reading data from %s' % filename
        counter = 0
        with open(filename, "r") as f:
            for line in f:
                counter += 1
                tag, words = line.lower().strip().split(" ||| ")
                words = words.split(" ")
                wids = [w2i[x] for x in words]
                tag = t2i[tag]
                length = len(words)
                for bucket_id, bucket_size in enumerate(_buckets):
                    if length < bucket_size:
                        data[bucket_id].append([words, wids, tag])
                        break
        print "Total number of data: %d" % counter
        return data

    def generate_random_embedding(scale, shape):
        return np.random.uniform(-scale, scale, shape).astype(theano.config.floatX)

    def construct_word_input_layer():
        # shape = [batch, n-step]
        layer_word_input = lasagne.layers.InputLayer(shape=(None, None), input_var=word_var, name='word_input')
        # shape = [batch, n-step, w_dim]
        layer_word_embedding = lasagne.layers.EmbeddingLayer(layer_word_input, input_size=vocab_size,
                                                             output_size=WORD_DIM, W=word_table, name='word_embedd')
        return layer_word_embedding

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / WORD_DIM)
        table = np.empty([vocab_size, WORD_DIM], dtype=theano.config.floatX)
        table[UNK, :] = generate_random_embedding(scale, [1, WORD_DIM])
        for word, index in w2i.iteritems():
            if index == 0:
                continue
            ww = word.lower() if caseless else word
            embedding = embedd_dict[ww] if ww in embedd_dict else generate_random_embedding(scale, [1, WORD_DIM])
            table[index, :] = embedding
        return table

    # Functions to read in the corpus
    w2i = defaultdict(lambda: len(w2i))
    t2i = defaultdict(lambda: len(t2i))
    UNK = w2i["<unk>"]

    data_train = read_dataset('data/sst1/train.txt')
    w2i = defaultdict(lambda: UNK, w2i)
    data_dev = read_dataset('data/sst1/dev.txt')
    data_test = read_dataset('data/sst1/test.txt')
    vocab_size = len(w2i)
    num_labels = len(t2i)

    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict('glove',
                                                                       "data/glove/glove.6B/glove.6B.100d.gz")
    assert embedd_dim == WORD_DIM

    num_data_train = sum([len(bucket) for bucket in data_train])
    num_data_dev = sum([len(bucket) for bucket in data_dev])
    num_data_test = sum([len(bucket) for bucket in data_test])

    logger.info("constructing network...")
    # create variables
    target_var = T.ivector(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    word_var = T.imatrix(name='inputs')

    word_table = construct_word_embedding_table()
    layer_word_input = construct_word_input_layer()
    layer_mask = lasagne.layers.InputLayer(shape=(None, None), input_var=mask_var, name='mask')

    layer_input = layer_word_input
    layer_input = lasagne.layers.DropoutLayer(layer_input, p=0.2)
    layer_rnn = build_RNN(architec, layer_input, layer_mask, num_units, grad_clipping)
    layer_rnn = lasagne.layers.DropoutLayer(layer_rnn, p=0.5)
    network = lasagne.layers.DenseLayer(layer_rnn, num_units=num_labels, nonlinearity=nonlinearities.softmax,
                                        name='softmax')

    # get output of bi-taru-cnn shape=[batch * max_length, #label]
    prediction_train = lasagne.layers.get_output(network)
    prediction_eval = lasagne.layers.get_output(network, deterministic=True)
    final_prediction = T.argmax(prediction_eval, axis=1)

    loss_train = lasagne.objectives.categorical_crossentropy(prediction_train, target_var).mean()
    loss_eval = lasagne.objectives.categorical_crossentropy(prediction_eval, target_var).mean()

    corr_train = lasagne.objectives.categorical_accuracy(prediction_train, target_var).sum()
    corr_eval = lasagne.objectives.categorical_accuracy(prediction_eval, target_var).sum()

    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = adam(loss_train, params=params, learning_rate=learning_rate, beta1=0.9, beta2=0.9)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([word_var, target_var, mask_var], [loss_train, corr_train], updates=updates)
    # Compile a second function evaluating the loss and accuracy of network
    eval_fn = theano.function([word_var, target_var, mask_var], [corr_eval, final_prediction])

    # Finally, launch the training loop.
    logger.info("%s: (#data: %d, batch size: %d, clip: %.1f)" % (architec, num_data_train, batch_size,
                                                                 grad_clipping))

    num_batches = num_data_train / batch_size + 1
    dev_correct = 0.0
    best_epoch = 0
    test_correct = 0.0
    test_total = 0
    lr = learning_rate
    for epoch in range(1, num_epochs + 1):
        print 'Epoch %d (%s, learning rate=%.4f, decay rate=%.4f): ' % (epoch, architec, lr, decay_rate)
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        start_time = time.time()
        num_back = 0
        for batch in xrange(1, num_batches + 1):
            wids, tids, masks = get_batch(data_train, batch_size)
            num = wids.shape[0]
            err, corr = train_fn(wids, tids, masks)
            train_err += err * num
            train_corr += corr
            train_total += num
            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                batch, num_batches, train_err / train_total, train_corr * 100 / train_total, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)
        # update training log after each epoch
        assert train_total == num_batches * batch_size
        sys.stdout.write("\b" * num_back)
        print 'train: %d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
            train_total, train_err / train_total, train_corr * 100 / train_total, time.time() - start_time)

        # evaluate performance on dev data
        dev_corr = 0.0
        dev_total = 0
        for batch in iterate_batch(data_dev, batch_size):
            wids, tids, masks = batch
            num = wids.shape[0]
            corr, predictions = eval_fn(wids, tids, masks)
            dev_corr += corr
            dev_total += num
        assert dev_total == num_data_dev
        print 'dev corr: %d, total: %d, acc: %.2f%%' % (dev_corr, dev_total, dev_corr * 100 / dev_total)

        if dev_correct <= dev_corr:
            dev_correct = dev_corr
            best_epoch = epoch

            # evaluate on test data when better performance detected
            test_corr = 0.0
            test_total = 0
            for batch in iterate_batch(data_test, batch_size):
                wids, tids, masks = batch
                num = wids.shape[0]
                corr, predictions = eval_fn(wids, tids, masks)
                test_corr += corr
                test_total += num
            assert test_total == num_data_test
            test_correct = test_corr

        print "best dev corr: %d, total: %d, acc: %.2f%% (epoch: %d)" % (
            dev_correct, dev_total, dev_correct * 100 / dev_total, best_epoch)
        print "best test corr: %d, total: %d, acc: %.2f%% (epoch: %d)" % (
            test_correct, test_total, test_correct * 100 / test_total, best_epoch)

        if epoch in schedule:
            lr = lr * decay_rate
            updates = adam(loss_train, params=params, learning_rate=lr, beta1=0.9, beta2=0.9)
            train_fn = theano.function([word_var, target_var, mask_var], [loss_train, corr_train],
                                       updates=updates)
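# Example invocation (not part of the original source): the script name below is a placeholder;
# only the flags defined in the argparse block above are used. Note that the learning-rate decay
# check `epoch in schedule` assumes --schedule was supplied, so a list of decay epochs should be
# passed on the command line, e.g.:
#
#   python sentiment_rnn.py --architec lstm --num_epochs 100 --batch_size 16 \
#       --num_units 100 --learning_rate 0.001 --decay_rate 0.5 --grad_clipping 5 \
#       --schedule 40 80
#
# The original script presumably calls main() under a standard __main__ guard:
if __name__ == '__main__':
    main()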