Example #1
def build_network(word_var, char_var, mask_var, word_alphabet, char_alphabet, dropout, num_units, num_labels,
                  grad_clipping=5.0, num_filters=30, p=0.5):
    def generate_random_embedding(scale, shape):
        return np.random.uniform(-scale, scale, shape).astype(theano.config.floatX)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / WORD_DIM)
        table = np.empty([word_alphabet.size(), WORD_DIM], dtype=theano.config.floatX)
        table[data_utils.UNK_ID, :] = generate_random_embedding(scale, [1, WORD_DIM])
        for word, index in word_alphabet.iteritems():
            ww = word.lower() if caseless else word
            embedding = embedd_dict[ww] if ww in embedd_dict else generate_random_embedding(scale, [1, WORD_DIM])
            table[index, :] = embedding
        return table

    def construct_char_embedding_table():
        scale = np.sqrt(3.0 / CHARACTER_DIM)
        table = generate_random_embedding(scale, [char_alphabet.size(), CHARACTER_DIM])
        return table

    def construct_word_input_layer():
        # shape = [batch, n-step]
        layer_word_input = lasagne.layers.InputLayer(shape=(None, None), input_var=word_var, name='word_input')
        # shape = [batch, n-step, w_dim]
        layer_word_embedding = lasagne.layers.EmbeddingLayer(layer_word_input, input_size=word_alphabet.size(),
                                                             output_size=WORD_DIM, W=word_table, name='word_embedd')
        return layer_word_embedding

    def construct_char_input_layer():
        # shape = [batch, n-step, char_length]
        layer_char_input = lasagne.layers.InputLayer(shape=(None, None, data_utils.MAX_CHAR_LENGTH), input_var=char_var,
                                                     name='char_input')

        # shape = [batch, n-step, char_length, c_dim]
        layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char_input, input_size=char_alphabet.size(),
                                                             output_size=CHARACTER_DIM, W=char_table,
                                                             name='char_embedd')
        # shape = [batch, n-step, c_dim, char_length]
        layer_char_embedding = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 1, 3, 2))
        return layer_char_embedding

    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict('glove', "data/glove/glove.6B/glove.6B.100d.gz")
    assert embedd_dim == WORD_DIM

    word_table = construct_word_embedding_table()
    char_table = construct_char_embedding_table()

    layer_char_input = construct_char_input_layer()
    layer_word_input = construct_word_input_layer()
    layer_mask = lasagne.layers.InputLayer(shape=(None, None), input_var=mask_var, name='mask')

    if dropout == 'std':
        return build_std_dropout(layer_char_input, layer_word_input, num_units, num_labels, layer_mask,
                                 grad_clipping, num_filters, p)
    elif dropout == 'recurrent':
        return build_recur_dropout(layer_char_input, layer_word_input, num_units, num_labels, layer_mask,
                                   grad_clipping, num_filters, p)
    else:
        raise ValueError('unknown dropout pattern: %s' % dropout)
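
A minimal usage sketch for the example above. The alphabet objects and num_labels come from the surrounding project's data pipeline and are only placeholders here; the symbolic variables follow the shape comments in the input-layer helpers, and 'std' selects the build_std_dropout variant.

import theano
import theano.tensor as T

# Placeholder symbolic inputs: word ids [batch, n_step],
# char ids [batch, n_step, MAX_CHAR_LENGTH], and a float mask.
word_var = T.imatrix(name='word_inputs')
char_var = T.itensor3(name='char_inputs')
mask_var = T.matrix(name='masks', dtype=theano.config.floatX)

# word_alphabet, char_alphabet and num_labels are assumed to be provided by the
# project's data utilities; num_units=200 is an arbitrary illustrative value.
network = build_network(word_var, char_var, mask_var, word_alphabet, char_alphabet,
                        dropout='std', num_units=200, num_labels=num_labels)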
Example #2
def construct_char_embedding_table():
    if char_embedding == 'random':
        scale = np.sqrt(3.0 / CHARACTER_DIM)
        table = generate_random_embedding(scale, [char_alphabet.size(), CHARACTER_DIM])
    else:
        char_dict, char_dim, caseless = utils.load_word_embedding_dict(char_embedding, char_path,
                                                                       normalize_digits=False)
        scale = np.sqrt(3.0 / char_dim)
        table = np.empty([char_alphabet.size(), char_dim], dtype=theano.config.floatX)
        table[data_utils.UNK_ID, :] = generate_random_embedding(scale, [1, char_dim])
        for char, index in char_alphabet.iteritems():
            cc = char.lower() if caseless else char
            char_embedd = char_dict[cc] if cc in char_dict else generate_random_embedding(scale, [1, char_dim])
            table[index, :] = char_embedd
    print 'construct character table: %s, dimension: %d' % (char_embedding, table.shape[1])
    return table
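
A side note on the scale used here: np.sqrt(3.0 / dim) makes each uniform(-scale, scale) component have variance scale**2 / 3 = 1 / dim, so a random embedding row has an expected squared norm close to 1. A quick standalone check (not part of the example):

import numpy as np

dim = 50
scale = np.sqrt(3.0 / dim)
sample = np.random.uniform(-scale, scale, [100000, dim])
print(sample.var())                      # approximately 1.0 / dim
print((sample ** 2).sum(axis=1).mean())  # approximately 1.0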
Example #3
def construct_char_embedding_table():
    if char_embedding == 'random':
        scale = np.sqrt(3.0 / CHARACTER_DIM)
        table = generate_random_embedding(
            scale, [char_alphabet.size(), CHARACTER_DIM])
    else:
        char_dict, char_dim, caseless = utils.load_word_embedding_dict(
            char_embedding, char_path, normalize_digits=False)
        scale = np.sqrt(3.0 / char_dim)
        table = np.empty([char_alphabet.size(), char_dim],
                         dtype=theano.config.floatX)
        table[data_utils.UNK_ID, :] = generate_random_embedding(
            scale, [1, char_dim])
        for char, index in char_alphabet.iteritems():
            cc = char.lower() if caseless else char
            char_embedd = char_dict[cc] if cc in char_dict \
                else generate_random_embedding(scale, [1, char_dim])
            table[index, :] = char_embedd
    print 'construct character table: %s, dimension: %d' % (char_embedding,
                                                            table.shape[1])
    return table
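
The table constructors in these examples use only two things from the alphabet objects: size() and iteritems() over (item, index) pairs. A hypothetical minimal stand-in illustrating that interface (the real alphabet class lives in the project's data utilities and is not shown here):

class SimpleAlphabet(object):
    # Hypothetical stand-in; only the interface used above is reproduced.
    def __init__(self, items, offset=1):
        # index 0 is typically reserved (e.g. padding / UNK); real items start at `offset`
        self._index = dict((item, i + offset) for i, item in enumerate(items))
        self._offset = offset

    def size(self):
        return len(self._index) + self._offset

    def iteritems(self):
        return iter(self._index.items())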
Example #4
def build_network(word_var, char_var, pos_var, mask_var, word_alphabet, char_alphabet, pos_alphabet,
                  depth, num_units, num_types, grad_clipping=5.0, num_filters=30, p=0.5, mlp=1, peepholes=False,
                  use_char=False, use_pos=False, normalize_digits=True,
                  embedding='glove', embedding_path='data/glove/glove.6B/glove.6B.100d.gz',
                  char_embedding='random', char_path=None):
    def generate_random_embedding(scale, shape):
        return np.random.uniform(-scale, scale, shape).astype(theano.config.floatX)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / WORD_DIM)
        table = np.empty([word_alphabet.size(), WORD_DIM], dtype=theano.config.floatX)
        table[data_utils.UNK_ID, :] = generate_random_embedding(scale, [1, WORD_DIM])
        for word, index in word_alphabet.iteritems():
            ww = word.lower() if caseless else word
            embedd = embedd_dict[ww] if ww in embedd_dict else generate_random_embedding(scale, [1, WORD_DIM])
            table[index, :] = embedd
        print 'construct word table: %s, dimension: %d' % (embedding, table.shape[1])
        return table

    def construct_char_embedding_table():
        if char_embedding == 'random':
            scale = np.sqrt(3.0 / CHARACTER_DIM)
            table = generate_random_embedding(scale, [char_alphabet.size(), CHARACTER_DIM])
        else:
            char_dict, char_dim, caseless = utils.load_word_embedding_dict(char_embedding, char_path,
                                                                           normalize_digits=False)
            scale = np.sqrt(3.0 / char_dim)
            table = np.empty([char_alphabet.size(), char_dim], dtype=theano.config.floatX)
            table[data_utils.UNK_ID, :] = generate_random_embedding(scale, [1, char_dim])
            for char, index in char_alphabet.iteritems():
                cc = char.lower() if caseless else char
                char_embedd = char_dict[cc] if cc in char_dict else generate_random_embedding(scale, [1, char_dim])
                table[index, :] = char_embedd
        print 'construct character table: %s, dimension: %d' % (char_embedding, table.shape[1])
        return table

    def construct_pos_embedding_table():
        scale = np.sqrt(3.0 / POS_DIM)
        table = generate_random_embedding(scale, [pos_alphabet.size(), POS_DIM])
        print 'construct pos table: %s, dimension: %d' % ('random', table.shape[1])
        return table

    def construct_word_input_layer():
        # shape = [batch, n-step]
        layer_word_input = lasagne.layers.InputLayer(shape=(None, None), input_var=word_var, name='word_input')
        # shape = [batch, n-step, w_dim]
        layer_word_embedding = lasagne.layers.EmbeddingLayer(layer_word_input, input_size=word_alphabet.size(),
                                                             output_size=WORD_DIM, W=word_table, name='word_embedd')
        return layer_word_embedding

    def construct_pos_input_layer():
        # shape = [batch, n-step]
        layer_pos_input = lasagne.layers.InputLayer(shape=(None, None), input_var=pos_var, name='pos_input')
        # shape = [batch, n-step, w_dim]
        layer_pos_embedding = lasagne.layers.EmbeddingLayer(layer_pos_input, input_size=pos_alphabet.size(),
                                                            output_size=POS_DIM, W=pos_table, name='pos_embedd')
        return layer_pos_embedding

    def construct_char_input_layer():
        # shape = [batch, n-step, char_length]
        layer_char_input = lasagne.layers.InputLayer(shape=(None, None, data_utils.MAX_CHAR_LENGTH), input_var=char_var,
                                                     name='char_input')

        # shape = [batch, n-step, char_length, c_dim]
        layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char_input, input_size=char_alphabet.size(),
                                                             output_size=CHARACTER_DIM, W=char_table,
                                                             name='char_embedd')
        # shape = [batch, n-step, c_dim, char_length]
        layer_char_embedding = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 1, 3, 2))
        return layer_char_embedding

    def construct_bi_lstm_layer():
        lstm_forward = incoming
        lstm_backward = incoming
        assert depth > 0
        for d in xrange(depth):
            ingate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                  W_cell=lasagne.init.Uniform(range=0.1))
            outgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                   W_cell=lasagne.init.Uniform(range=0.1))
            # according to Jozefowicz et al.(2015), init bias of forget gate to 1.
            forgetgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                      W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
            # now use tanh for nonlinear function of cell, need to try pure linear cell
            cell_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                                nonlinearity=nonlinearities.tanh)
            lstm_forward = LSTMLayer(lstm_forward, num_units, mask_input=mask, grad_clipping=grad_clipping,
                                     nonlinearity=nonlinearities.tanh, peepholes=peepholes,
                                     ingate=ingate_forward, outgate=outgate_forward,
                                     forgetgate=forgetgate_forward, cell=cell_forward, p=p, name='forward%d' % d)
            lstm_forward = lasagne.layers.DropoutLayer(lstm_forward, p=0.33, shared_axes=(1,))
            # ----------------------------------------------------------------------------------------------------
            ingate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                   W_cell=lasagne.init.Uniform(range=0.1))
            outgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                    W_cell=lasagne.init.Uniform(range=0.1))
            # according to Jozefowicz et al.(2015), init bias of forget gate to 1.
            forgetgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                       W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
            # now use tanh for nonlinear function of cell, need to try pure linear cell
            cell_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                                 nonlinearity=nonlinearities.tanh)
            lstm_backward = LSTMLayer(lstm_backward, num_units, mask_input=mask, grad_clipping=grad_clipping,
                                      nonlinearity=nonlinearities.tanh, peepholes=peepholes, backwards=True,
                                      ingate=ingate_backward, outgate=outgate_backward,
                                      forgetgate=forgetgate_backward, cell=cell_backward, p=p, name='backward%d' % d)
            lstm_backward = lasagne.layers.DropoutLayer(lstm_backward, p=0.33, shared_axes=(1,))
            # ------------------------------------------------------------------------------------------------------

        # concatenate the outputs of forward and backward LSTMs to combine them.
        bi_lstm_cnn = lasagne.layers.concat([lstm_forward, lstm_backward], axis=2, name="bi-lstm")
        return bi_lstm_cnn

    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(embedding, embedding_path,
                                                                       normalize_digits=normalize_digits)

    WORD_DIM = embedd_dim
    POS_DIM = 50
    CHARACTER_DIM = 50

    word_table = construct_word_embedding_table()
    pos_table = construct_pos_embedding_table() if use_pos else None
    char_table = construct_char_embedding_table() if use_char else None

    if char_table is not None:
        CHARACTER_DIM = char_table.shape[1]

    layer_word_input = construct_word_input_layer()
    incoming = layer_word_input
    mask = lasagne.layers.InputLayer(shape=(None, None), input_var=mask_var, name='mask')

    if use_pos:
        layer_pos_input = construct_pos_input_layer()
        incoming = lasagne.layers.concat([incoming, layer_pos_input], axis=2)

    if use_char:
        layer_char_input = construct_char_input_layer()
        # dropout before CNN
        # TODO
        # layer_char_input = lasagne.layers.DropoutLayer(layer_char_input, p=0.15)
        # Construct Bi-directional LSTM-CNNs-CRF with recurrent dropout.
        conv_window = 3
        # shape = [batch, n-step, c_dim, char_length]
        # construct convolution layer
        # shape = [batch, n-step, c_filters, output_length]
        cnn_layer = ConvTimeStep1DLayer(layer_char_input, num_filters=num_filters, filter_size=conv_window, pad='full',
                                        nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
        # infer the pool size (the pool should cover every time step of the cnn output)
        _, _, _, pool_size = cnn_layer.output_shape
        # construct max pool layer
        # shape = [batch, n-step, c_filters, 1]
        pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
        # reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
        output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))

        # finally, concatenate the two incoming layers together.
        # shape = [batch, n-step, c_filter&w_dim]
        incoming = lasagne.layers.concat([output_cnn_layer, incoming], axis=2)

    # dropout for incoming
    incoming = lasagne.layers.DropoutLayer(incoming, p=0.15, shared_axes=(1,))
    # shape [batch, n-step, num_units]
    bi_lstm_cnn = construct_bi_lstm_layer()

    # MLP layers
    # shape [batch, n-step, 100]
    for d in xrange(1, mlp):
        bi_lstm_cnn = lasagne.layers.DenseLayer(bi_lstm_cnn, 100, nonlinearity=nonlinearities.elu,
                                                num_leading_axes=2, name='dense%d' % d)
        bi_lstm_cnn = lasagne.layers.DropoutLayer(bi_lstm_cnn, p=0.33, shared_axes=(1,))

    bi_lstm_cnn = lasagne.layers.DenseLayer(bi_lstm_cnn, 100, nonlinearity=nonlinearities.elu,
                                            num_leading_axes=2, name='dense%d' % mlp)

    return TreeBiAffineCRFLayer(bi_lstm_cnn, num_types, mask_input=mask, name='crf')
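
The character CNN above uses pad='full' with a window of 3, so the convolution output has char_length + 2 positions, and the pooling layer then collapses all of them into a single vector per word. ConvTimeStep1DLayer is a custom layer of this project (not part of stock Lasagne), but the same length arithmetic can be checked with the stock 1-D convolution; MAX_CHAR_LENGTH is assumed to be 45 purely for illustration.

import lasagne

# [batch * n_step, c_dim=30, char_length=45]
l_in = lasagne.layers.InputLayer(shape=(None, 30, 45))
l_conv = lasagne.layers.Conv1DLayer(l_in, num_filters=30, filter_size=3, pad='full')
print(l_conv.output_shape)  # (None, 30, 47): 'full' padding adds filter_size - 1 positions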
Example #5
def build_network(word_var,
                  char_var,
                  pos_var,
                  mask_var,
                  word_alphabet,
                  char_alphabet,
                  pos_alphabet,
                  depth,
                  num_units,
                  num_types,
                  grad_clipping=5.0,
                  num_filters=30,
                  p=0.5,
                  mlp=1,
                  peepholes=False,
                  use_char=False,
                  use_pos=False,
                  normalize_digits=True,
                  embedding='glove',
                  embedding_path='data/glove/glove.6B/glove.6B.100d.gz',
                  char_embedding='random',
                  char_path=None):
    def generate_random_embedding(scale, shape):
        return np.random.uniform(-scale, scale,
                                 shape).astype(theano.config.floatX)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / WORD_DIM)
        table = np.empty([word_alphabet.size(), WORD_DIM],
                         dtype=theano.config.floatX)
        table[data_utils.UNK_ID, :] = generate_random_embedding(
            scale, [1, WORD_DIM])
        for word, index in word_alphabet.iteritems():
            ww = word.lower() if caseless else word
            embedd = embedd_dict[
                ww] if ww in embedd_dict else generate_random_embedding(
                    scale, [1, WORD_DIM])
            table[index, :] = embedd
        print 'construct word table: %s, dimension: %d' % (embedding,
                                                           table.shape[1])
        return table

    def construct_char_embedding_table():
        if char_embedding == 'random':
            scale = np.sqrt(3.0 / CHARACTER_DIM)
            table = generate_random_embedding(
                scale, [char_alphabet.size(), CHARACTER_DIM])
        else:
            char_dict, char_dim, caseless = utils.load_word_embedding_dict(
                char_embedding, char_path, normalize_digits=False)
            scale = np.sqrt(3.0 / char_dim)
            table = np.empty([char_alphabet.size(), char_dim],
                             dtype=theano.config.floatX)
            table[data_utils.UNK_ID, :] = generate_random_embedding(
                scale, [1, char_dim])
            for char, index in char_alphabet.iteritems():
                cc = char.lower() if caseless else char
                char_embedd = char_dict[
                    cc] if cc in char_dict else generate_random_embedding(
                        scale, [1, char_dim])
                table[index, :] = char_embedd
        print 'construct character table: %s, dimension: %d' % (char_embedding,
                                                                table.shape[1])
        return table

    def construct_pos_embedding_table():
        scale = np.sqrt(3.0 / POS_DIM)
        table = generate_random_embedding(scale,
                                          [pos_alphabet.size(), POS_DIM])
        print 'construct pos table: %s, dimension: %d' % ('random',
                                                          table.shape[1])
        return table

    def construct_word_input_layer():
        # shape = [batch, n-step]
        layer_word_input = lasagne.layers.InputLayer(shape=(None, None),
                                                     input_var=word_var,
                                                     name='word_input')
        # shape = [batch, n-step, w_dim]
        layer_word_embedding = lasagne.layers.EmbeddingLayer(
            layer_word_input,
            input_size=word_alphabet.size(),
            output_size=WORD_DIM,
            W=word_table,
            name='word_embedd')
        return layer_word_embedding

    def construct_pos_input_layer():
        # shape = [batch, n-step]
        layer_pos_input = lasagne.layers.InputLayer(shape=(None, None),
                                                    input_var=pos_var,
                                                    name='pos_input')
        # shape = [batch, n-step, w_dim]
        layer_pos_embedding = lasagne.layers.EmbeddingLayer(
            layer_pos_input,
            input_size=pos_alphabet.size(),
            output_size=POS_DIM,
            W=pos_table,
            name='pos_embedd')
        return layer_pos_embedding

    def construct_char_input_layer():
        # shape = [batch, n-step, char_length]
        layer_char_input = lasagne.layers.InputLayer(
            shape=(None, None, data_utils.MAX_CHAR_LENGTH),
            input_var=char_var,
            name='char_input')

        # shape = [batch, n-step, char_length, c_dim]
        layer_char_embedding = lasagne.layers.EmbeddingLayer(
            layer_char_input,
            input_size=char_alphabet.size(),
            output_size=CHARACTER_DIM,
            W=char_table,
            name='char_embedd')
        # shape = [batch, n-step, c_dim, char_length]
        layer_char_embedding = lasagne.layers.DimshuffleLayer(
            layer_char_embedding, pattern=(0, 1, 3, 2))
        return layer_char_embedding

    def construct_bi_lstm_layer():
        lstm_forward = incoming
        lstm_backward = incoming
        assert depth > 0
        for d in xrange(depth):
            ingate_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                                  W_hid=lasagne.init.GlorotUniform(),
                                  W_cell=lasagne.init.Uniform(range=0.1))
            outgate_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                                   W_hid=lasagne.init.GlorotUniform(),
                                   W_cell=lasagne.init.Uniform(range=0.1))
            # according to Jozefowicz et al.(2015), init bias of forget gate to 1.
            forgetgate_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                                      W_hid=lasagne.init.GlorotUniform(),
                                      W_cell=lasagne.init.Uniform(range=0.1),
                                      b=lasagne.init.Constant(1.))
            # now use tanh for nonlinear function of cell, need to try pure linear cell
            cell_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                                W_hid=lasagne.init.GlorotUniform(),
                                W_cell=None,
                                nonlinearity=nonlinearities.tanh)
            lstm_forward = LSTMLayer(lstm_forward,
                                     num_units,
                                     mask_input=mask,
                                     grad_clipping=grad_clipping,
                                     nonlinearity=nonlinearities.tanh,
                                     peepholes=peepholes,
                                     ingate=ingate_forward,
                                     outgate=outgate_forward,
                                     forgetgate=forgetgate_forward,
                                     cell=cell_forward,
                                     p=p,
                                     name='forward%d' % d)
            lstm_forward = lasagne.layers.DropoutLayer(lstm_forward,
                                                       p=0.33,
                                                       shared_axes=(1, ))
            # ----------------------------------------------------------------------------------------------------
            ingate_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                                   W_hid=lasagne.init.GlorotUniform(),
                                   W_cell=lasagne.init.Uniform(range=0.1))
            outgate_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                                    W_hid=lasagne.init.GlorotUniform(),
                                    W_cell=lasagne.init.Uniform(range=0.1))
            # according to Jozefowicz et al.(2015), init bias of forget gate to 1.
            forgetgate_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                                       W_hid=lasagne.init.GlorotUniform(),
                                       W_cell=lasagne.init.Uniform(range=0.1),
                                       b=lasagne.init.Constant(1.))
            # now use tanh for nonlinear function of cell, need to try pure linear cell
            cell_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                                 W_hid=lasagne.init.GlorotUniform(),
                                 W_cell=None,
                                 nonlinearity=nonlinearities.tanh)
            lstm_backward = LSTMLayer(lstm_backward,
                                      num_units,
                                      mask_input=mask,
                                      grad_clipping=grad_clipping,
                                      nonlinearity=nonlinearities.tanh,
                                      peepholes=peepholes,
                                      backwards=True,
                                      ingate=ingate_backward,
                                      outgate=outgate_backward,
                                      forgetgate=forgetgate_backward,
                                      cell=cell_backward,
                                      p=p,
                                      name='backward%d' % d)
            lstm_backward = lasagne.layers.DropoutLayer(lstm_backward,
                                                        p=0.33,
                                                        shared_axes=(1, ))
            # ------------------------------------------------------------------------------------------------------

        # concatenate the outputs of forward and backward LSTMs to combine them.
        bi_lstm_cnn = lasagne.layers.concat([lstm_forward, lstm_backward],
                                            axis=2,
                                            name="bi-lstm")
        return bi_lstm_cnn

    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(
        embedding, embedding_path, normalize_digits=normalize_digits)

    WORD_DIM = embedd_dim
    POS_DIM = 50
    CHARACTER_DIM = 50

    word_table = construct_word_embedding_table()
    pos_table = construct_pos_embedding_table() if use_pos else None
    char_table = construct_char_embedding_table() if use_char else None

    if char_table is not None:
        CHARACTER_DIM = char_table.shape[1]

    layer_word_input = construct_word_input_layer()
    incoming = layer_word_input
    mask = lasagne.layers.InputLayer(shape=(None, None),
                                     input_var=mask_var,
                                     name='mask')

    if use_pos:
        layer_pos_input = construct_pos_input_layer()
        incoming = lasagne.layers.concat([incoming, layer_pos_input], axis=2)

    if use_char:
        layer_char_input = construct_char_input_layer()
        # dropout before CNN
        # TODO
        # layer_char_input = lasagne.layers.DropoutLayer(layer_char_input, p=0.15)
        # Construct Bi-directional LSTM-CNNs-CRF with recurrent dropout.
        conv_window = 3
        # shape = [batch, n-step, c_dim, char_length]
        # construct convolution layer
        # shape = [batch, n-step, c_filters, output_length]
        cnn_layer = ConvTimeStep1DLayer(
            layer_char_input,
            num_filters=num_filters,
            filter_size=conv_window,
            pad='full',
            nonlinearity=lasagne.nonlinearities.tanh,
            name='cnn')
        # infer the pool size (the pool should cover every time step of the cnn output)
        _, _, _, pool_size = cnn_layer.output_shape
        # construct max pool layer
        # shape = [batch, n-step, c_filters, 1]
        pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
        # reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
        output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))

        # finally, concatenate the two incoming layers together.
        # shape = [batch, n-step, c_filter&w_dim]
        incoming = lasagne.layers.concat([output_cnn_layer, incoming], axis=2)

    # dropout for incoming
    incoming = lasagne.layers.DropoutLayer(incoming, p=0.15, shared_axes=(1, ))
    # shape [batch, n-step, num_units]
    bi_lstm_cnn = construct_bi_lstm_layer()

    # MLP layers
    # shape [batch, n-step, 100]
    for d in xrange(1, mlp):
        bi_lstm_cnn = lasagne.layers.DenseLayer(
            bi_lstm_cnn,
            100,
            nonlinearity=nonlinearities.elu,
            num_leading_axes=2,
            name='dense%d' % d)
        bi_lstm_cnn = lasagne.layers.DropoutLayer(bi_lstm_cnn,
                                                  p=0.33,
                                                  shared_axes=(1, ))

    bi_lstm_cnn = lasagne.layers.DenseLayer(bi_lstm_cnn,
                                            100,
                                            nonlinearity=nonlinearities.elu,
                                            num_leading_axes=2,
                                            name='dense%d' % mlp)

    return TreeBiAffineCRFLayer(bi_lstm_cnn,
                                num_types,
                                mask_input=mask,
                                name='crf')
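
The DropoutLayer calls above pass shared_axes=(1,), i.e. one dropout mask per sequence that is reused at every time step, which is the "recurrent dropout" referred to in these examples. A small sketch illustrating the effect, assuming only Theano and Lasagne:

import numpy as np
import theano
import theano.tensor as T
import lasagne

x = T.tensor3('x')  # [batch, n_step, dim]
l_in = lasagne.layers.InputLayer(shape=(None, None, 4), input_var=x)
l_drop = lasagne.layers.DropoutLayer(l_in, p=0.5, shared_axes=(1,))
f = theano.function([x], lasagne.layers.get_output(l_drop))

out = f(np.ones((1, 5, 4), dtype=theano.config.floatX))
# Each of the 4 feature columns is either all zeros or all 2.0 (rescaled by 1/(1-p))
# across the 5 time steps, because the mask is shared over axis 1.
print(out)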
Example #6
def main():
    parser = argparse.ArgumentParser(
        description='Tuning with bi-directional MAXRU-CNN')
    parser.add_argument('--architec',
                        choices=['sgru', 'lstm', 'gru0', 'gru1'],
                        help='architecture of rnn',
                        required=True)
    parser.add_argument('--num_epochs',
                        type=int,
                        default=1000,
                        help='Number of training epochs')
    parser.add_argument('--batch_size',
                        type=int,
                        default=16,
                        help='Number of sentences in each batch')
    parser.add_argument('--num_units',
                        type=int,
                        default=100,
                        help='Number of hidden units in TARU')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.1,
                        help='Learning rate')
    parser.add_argument('--decay_rate',
                        type=float,
                        default=0.1,
                        help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping',
                        type=float,
                        default=0,
                        help='Gradient clipping')
    parser.add_argument('--schedule',
                        nargs='+',
                        type=int,
                        help='schedule for learning rate decay')
    args = parser.parse_args()

    architec = args.architec
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    num_units = args.num_units
    learning_rate = args.learning_rate
    decay_rate = args.decay_rate
    schedule = args.schedule
    grad_clipping = args.grad_clipping
    logger = get_logger("Sentiment Classification (%s)" % (architec))

    def read_dataset(filename):
        data = [[] for _ in _buckets]
        print 'Reading data from %s' % filename
        counter = 0
        with open(filename, "r") as f:
            for line in f:
                counter += 1
                tag, words = line.lower().strip().split(" ||| ")
                words = words.split(" ")
                wids = [w2i[x] for x in words]
                tag = t2i[tag]
                length = len(words)
                for bucket_id, bucket_size in enumerate(_buckets):
                    if length < bucket_size:
                        data[bucket_id].append([words, wids, tag])
                        break

        print "Total number of data: %d" % counter
        return data

    def generate_random_embedding(scale, shape):
        return np.random.uniform(-scale, scale,
                                 shape).astype(theano.config.floatX)

    def construct_word_input_layer():
        # shape = [batch, n-step]
        layer_word_input = lasagne.layers.InputLayer(shape=(None, None),
                                                     input_var=word_var,
                                                     name='word_input')
        # shape = [batch, n-step, w_dim]
        layer_word_embedding = lasagne.layers.EmbeddingLayer(
            layer_word_input,
            input_size=vocab_size,
            output_size=WORD_DIM,
            W=word_table,
            name='word_embedd')
        return layer_word_embedding

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / WORD_DIM)
        table = np.empty([vocab_size, WORD_DIM], dtype=theano.config.floatX)
        table[UNK, :] = generate_random_embedding(scale, [1, WORD_DIM])
        for word, index in w2i.iteritems():
            if index == 0:
                continue
            ww = word.lower() if caseless else word
            embedding = embedd_dict[
                ww] if ww in embedd_dict else generate_random_embedding(
                    scale, [1, WORD_DIM])
            table[index, :] = embedding
        return table

    # Functions to read in the corpus
    w2i = defaultdict(lambda: len(w2i))
    t2i = defaultdict(lambda: len(t2i))
    UNK = w2i["<unk>"]

    data_train = read_dataset('data/sst1/train.txt')
    w2i = defaultdict(lambda: UNK, w2i)
    data_dev = read_dataset('data/sst1/dev.txt')
    data_test = read_dataset('data/sst1/test.txt')
    vocab_size = len(w2i)
    num_labels = len(t2i)

    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(
        'glove', "data/glove/glove.6B/glove.6B.100d.gz")
    assert embedd_dim == WORD_DIM

    num_data_train = sum([len(bucket) for bucket in data_train])
    num_data_dev = sum([len(bucket) for bucket in data_dev])
    num_data_test = sum([len(bucket) for bucket in data_test])

    logger.info("constructing network...")
    # create variables
    target_var = T.ivector(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    word_var = T.imatrix(name='inputs')

    word_table = construct_word_embedding_table()
    layer_word_input = construct_word_input_layer()
    layer_mask = lasagne.layers.InputLayer(shape=(None, None),
                                           input_var=mask_var,
                                           name='mask')

    layer_input = layer_word_input

    layer_input = lasagne.layers.DropoutLayer(layer_input, p=0.2)

    layer_rnn = build_RNN(architec, layer_input, layer_mask, num_units,
                          grad_clipping)
    layer_rnn = lasagne.layers.DropoutLayer(layer_rnn, p=0.5)

    network = lasagne.layers.DenseLayer(layer_rnn,
                                        num_units=num_labels,
                                        nonlinearity=nonlinearities.softmax,
                                        name='softmax')

    # get network outputs, shape = [batch, num_labels]
    prediction_train = lasagne.layers.get_output(network)
    prediction_eval = lasagne.layers.get_output(network, deterministic=True)
    final_prediction = T.argmax(prediction_eval, axis=1)

    loss_train = lasagne.objectives.categorical_crossentropy(
        prediction_train, target_var).mean()
    loss_eval = lasagne.objectives.categorical_crossentropy(
        prediction_eval, target_var).mean()

    corr_train = lasagne.objectives.categorical_accuracy(
        prediction_train, target_var).sum()
    corr_eval = lasagne.objectives.categorical_accuracy(
        prediction_eval, target_var).sum()

    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = adam(loss_train,
                   params=params,
                   learning_rate=learning_rate,
                   beta1=0.9,
                   beta2=0.9)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([word_var, target_var, mask_var],
                               [loss_train, corr_train],
                               updates=updates)
    # Compile a second function evaluating the loss and accuracy of network
    eval_fn = theano.function([word_var, target_var, mask_var],
                              [corr_eval, final_prediction])

    # Finally, launch the training loop.
    logger.info("%s: (#data: %d, batch size: %d, clip: %.1f)" %
                (architec, num_data_train, batch_size, grad_clipping))

    num_batches = num_data_train / batch_size + 1
    dev_correct = 0.0
    best_epoch = 0
    test_correct = 0.0
    test_total = 0
    lr = learning_rate
    for epoch in range(1, num_epochs + 1):
        print 'Epoch %d (%s, learning rate=%.4f, decay rate=%.4f): ' % (
            epoch, architec, lr, decay_rate)
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        start_time = time.time()
        num_back = 0
        for batch in xrange(1, num_batches + 1):
            wids, tids, masks = get_batch(data_train, batch_size)
            num = wids.shape[0]
            err, corr = train_fn(wids, tids, masks)
            train_err += err * num
            train_corr += corr
            train_total += num
            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                batch, num_batches, train_err / train_total,
                train_corr * 100 / train_total, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)
        # update training log after each epoch
        assert train_total == num_batches * batch_size
        sys.stdout.write("\b" * num_back)
        print 'train: %d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
            train_total, train_err / train_total,
            train_corr * 100 / train_total, time.time() - start_time)

        # evaluate performance on dev data
        dev_corr = 0.0
        dev_total = 0
        for batch in iterate_batch(data_dev, batch_size):
            wids, tids, masks = batch
            num = wids.shape[0]
            corr, predictions = eval_fn(wids, tids, masks)
            dev_corr += corr
            dev_total += num

        assert dev_total == num_data_dev
        print 'dev corr: %d, total: %d, acc: %.2f%%' % (
            dev_corr, dev_total, dev_corr * 100 / dev_total)

        if dev_correct <= dev_corr:
            dev_correct = dev_corr
            best_epoch = epoch

            # evaluate on test data when better performance detected
            test_corr = 0.0
            test_total = 0
            for batch in iterate_batch(data_test, batch_size):
                wids, tids, masks = batch
                num = wids.shape[0]
                corr, predictions = eval_fn(wids, tids, masks)
                test_corr += corr
                test_total += num

            assert test_total == num_data_test
            test_correct = test_corr
        print "best dev  corr: %d, total: %d, acc: %.2f%% (epoch: %d)" % (
            dev_correct, dev_total, dev_correct * 100 / dev_total, best_epoch)
        print "best test corr: %d, total: %d, acc: %.2f%%(epoch: %d)" % (
            test_correct, test_total, test_correct * 100 / test_total,
            best_epoch)

        if epoch in schedule:
            lr = lr * decay_rate
            updates = adam(loss_train,
                           params=params,
                           learning_rate=lr,
                           beta1=0.9,
                           beta2=0.9)
            train_fn = theano.function([word_var, target_var, mask_var],
                                       [loss_train, corr_train],
                                       updates=updates)
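
The training loop above relies on get_batch and iterate_batch, which are defined elsewhere in the project and not shown in this example. A hypothetical sketch of the padding they would have to perform, given the [words, wids, tag] bucket entries produced by read_dataset above; the helper name pad_examples is illustrative only.

import numpy as np
import theano

def pad_examples(examples, length):
    # examples: list of [words, wids, tag] entries from one bucket;
    # returns padded word ids, target labels, and a 1.0/0.0 float mask.
    wids = np.zeros((len(examples), length), dtype=np.int32)
    masks = np.zeros((len(examples), length), dtype=theano.config.floatX)
    tids = np.zeros(len(examples), dtype=np.int32)
    for i, (words, ids, tag) in enumerate(examples):
        wids[i, :len(ids)] = ids
        masks[i, :len(ids)] = 1.0
        tids[i] = tag
    return wids, tids, masks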