Example #1
def build_recur_dropout(incoming1, incoming2, num_units, num_labels, mask, grad_clipping, num_filters, p):
    """Construct a bi-directional LSTM-CNN-CRF network with recurrent dropout."""
    # hyper-parameters for the character-level CNN
    conv_window = 3
    # shape = [batch, n-step, c_dim, char_length]
    # construct convolution layer
    # shape = [batch, n-step, c_filters, output_length]
    cnn_layer = ConvTimeStep1DLayer(incoming1, num_filters=num_filters, filter_size=conv_window, pad='full',
                                    nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
    # infer the pool size so that max pooling covers the full output length of the CNN
    _, _, _, pool_size = cnn_layer.output_shape
    # construct max pool layer
    # shape = [batch, n-step, c_filters, 1]
    pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
    # reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))

    # finally, concatenate the character-level CNN output with the word-level input.
    # shape = [batch, n-step, c_filters + w_dim]
    incoming = lasagne.layers.concat([output_cnn_layer, incoming2], axis=2)

    # dropout for the concatenated input (mask shared across time steps)
    incoming = lasagne.layers.DropoutLayer(incoming, p=p, shared_axes=(1,))

    ingate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                          W_cell=lasagne.init.Uniform(range=0.1))
    outgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                           W_cell=lasagne.init.Uniform(range=0.1))
    # following Jozefowicz et al. (2015), initialize the forget-gate bias to 1.
    forgetgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                              W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
    # use tanh as the cell nonlinearity for now; a purely linear cell is still to be tried.
    cell_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                        nonlinearity=nonlinearities.tanh)
    lstm_forward = LSTMLayer(incoming, num_units, mask_input=mask, grad_clipping=grad_clipping,
                             nonlinearity=nonlinearities.tanh, peepholes=False,
                             ingate=ingate_forward, outgate=outgate_forward,
                             forgetgate=forgetgate_forward, cell=cell_forward, p=p, name='forward')

    ingate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                           W_cell=lasagne.init.Uniform(range=0.1))
    outgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                            W_cell=lasagne.init.Uniform(range=0.1))
    # following Jozefowicz et al. (2015), initialize the forget-gate bias to 1.
    forgetgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                               W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
    # use tanh as the cell nonlinearity for now; a purely linear cell is still to be tried.
    cell_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                         nonlinearity=nonlinearities.tanh)
    lstm_backward = LSTMLayer(incoming, num_units, mask_input=mask, grad_clipping=grad_clipping,
                              nonlinearity=nonlinearities.tanh, peepholes=False, backwards=True,
                              ingate=ingate_backward, outgate=outgate_backward,
                              forgetgate=forgetgate_backward, cell=cell_backward, p=p, name='backward')

    # concatenate the outputs of the forward and backward LSTMs.
    bi_lstm_cnn = lasagne.layers.concat([lstm_forward, lstm_backward], axis=2, name="bi-lstm")
    # shape = [batch, n-step, 2 * num_units]
    bi_lstm_cnn = lasagne.layers.DropoutLayer(bi_lstm_cnn, p=p, shared_axes=(1,))

    return ChainCRFLayer(bi_lstm_cnn, num_labels, mask_input=mask)
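
A minimal usage sketch for this builder follows. The theano input variables, vocabulary sizes, and embedding dimensions are illustrative assumptions, and the custom layers used inside build_recur_dropout (ConvTimeStep1DLayer, PoolTimeStep1DLayer, LSTMLayer, ChainCRFLayer) still come from the surrounding project.

# Hypothetical wiring for build_recur_dropout; the shapes, vocabulary sizes and
# theano variables below are assumptions for illustration only.
import theano.tensor as T
import lasagne

MAX_CHAR_LENGTH = 45   # assumed maximum word length in characters
NUM_CHARS = 100        # assumed character vocabulary size
NUM_WORDS = 10000      # assumed word vocabulary size

char_var = T.itensor3('char')   # [batch, n-step, char_length]
word_var = T.imatrix('word')    # [batch, n-step]
mask_var = T.matrix('mask')     # [batch, n-step]

# character branch: embed characters, then dimshuffle to [batch, n-step, c_dim, char_length]
layer_char = lasagne.layers.InputLayer(shape=(None, None, MAX_CHAR_LENGTH), input_var=char_var, name='char_input')
layer_char = lasagne.layers.EmbeddingLayer(layer_char, input_size=NUM_CHARS, output_size=30, name='char_embedd')
layer_char = lasagne.layers.DimshuffleLayer(layer_char, pattern=(0, 1, 3, 2))

# word branch: shape = [batch, n-step, w_dim]
layer_word = lasagne.layers.InputLayer(shape=(None, None), input_var=word_var, name='word_input')
layer_word = lasagne.layers.EmbeddingLayer(layer_word, input_size=NUM_WORDS, output_size=100, name='word_embedd')

mask = lasagne.layers.InputLayer(shape=(None, None), input_var=mask_var, name='mask')

crf = build_recur_dropout(layer_char, layer_word, num_units=200, num_labels=17,
                          mask=mask, grad_clipping=5.0, num_filters=30, p=0.5)
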
Example #2
def build_std_dropout_sgru(incoming1, incoming2, num_units, num_labels, mask, grad_clipping, num_filters, p):
    """Construct a bi-directional SGRU-CNN sequence labeler with standard dropout (softmax output, no CRF layer)."""
    # hyper-parameters for the character-level CNN
    conv_window = 3
    # shape = [batch, n-step, c_dim, char_length]
    incoming1 = lasagne.layers.DropoutLayer(incoming1, p=p)

    # construct convolution layer
    # shape = [batch, n-step, c_filters, output_length]
    cnn_layer = ConvTimeStep1DLayer(incoming1, num_filters=num_filters, filter_size=conv_window, pad='full',
                                    nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
    # infer the pool size so that max pooling covers the full output length of the CNN
    _, _, _, pool_size = cnn_layer.output_shape
    # construct max pool layer
    # shape = [batch, n-step, c_filters, 1]
    pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
    # reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))

    # finally, concatenate the character-level CNN output with the word-level input.
    # shape = [batch, n-step, c_filters + w_dim]
    incoming = lasagne.layers.concat([output_cnn_layer, incoming2], axis=2)

    # dropout for the concatenated input (rate hard-coded to 0.2 here, independent of p)
    incoming = lasagne.layers.DropoutLayer(incoming, p=0.2)

    resetgate_input_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    resetgate_hidden_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    updategate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    hidden_update_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                 W_cell=None, nonlinearity=nonlinearities.tanh)
    sgru_forward = SGRULayer(incoming, num_units, mask_input=mask,
                             resetgate_input=resetgate_input_forward, resetgate_hidden=resetgate_hidden_forward,
                             updategate=updategate_forward, hidden_update=hidden_update_forward,
                             grad_clipping=grad_clipping, name='forward')

    resetgate_input_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    resetgate_hidden_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    updategate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    hidden_update_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                  W_cell=None, nonlinearity=nonlinearities.tanh)
    sgru_backward = SGRULayer(incoming, num_units, mask_input=mask, backwards=True,
                              resetgate_input=resetgate_input_backward, resetgate_hidden=resetgate_hidden_backward,
                              updategate=updategate_backward, hidden_update=hidden_update_backward,
                              grad_clipping=grad_clipping, name='backward')

    # concatenate the outputs of the forward and backward SGRUs.
    bi_sgru_cnn = lasagne.layers.concat([sgru_forward, sgru_backward], axis=2, name="bi-sgru")

    bi_sgru_cnn = lasagne.layers.DropoutLayer(bi_sgru_cnn, p=p)

    # reshape bi-sgru-cnn from [batch, n-step, 2 * num_units] to [batch * n-step, 2 * num_units]
    bi_sgru_cnn = lasagne.layers.reshape(bi_sgru_cnn, (-1, [2]))

    # construct output layer (dense layer with softmax)
    layer_output = lasagne.layers.DenseLayer(bi_sgru_cnn, num_units=num_labels, nonlinearity=nonlinearities.softmax,
                                             name='softmax')

    return layer_output
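
Because this variant ends in a flattened softmax rather than a CRF, its output has shape [batch * n-step, num_labels]. Below is a minimal sketch of recovering per-token label predictions, assuming layer_output was produced by build_std_dropout_sgru with inputs wired as in the sketch after Example #1; the variable names are assumptions, not part of the original code.

import theano
import theano.tensor as T
import lasagne

# deterministic=True disables the dropout layers at prediction time
prediction = lasagne.layers.get_output(layer_output, deterministic=True)  # [batch * n-step, num_labels]
# fold the flattened output back to [batch, n-step, num_labels] using the symbolic sizes of word_var
prediction = prediction.reshape((word_var.shape[0], word_var.shape[1], -1))
# compile a function that returns the arg-max label index for every token
predict_fn = theano.function([word_var, char_var, mask_var], T.argmax(prediction, axis=2))
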
Example #3
def build_network(word_var,
                  char_var,
                  pos_var,
                  mask_var,
                  word_alphabet,
                  char_alphabet,
                  pos_alphabet,
                  depth,
                  num_units,
                  num_types,
                  grad_clipping=5.0,
                  num_filters=30,
                  p=0.5,
                  mlp=1,
                  peepholes=False,
                  use_char=False,
                  use_pos=False,
                  normalize_digits=True,
                  embedding='glove',
                  embedding_path='data/glove/glove.6B/glove.6B.100d.gz',
                  char_embedding='random',
                  char_path=None):
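    """Build a stack of `depth` bi-directional LSTM layers over word/char/POS embeddings, topped with a TreeBiAffineCRFLayer."""
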
    def generate_random_embedding(scale, shape):
        return np.random.uniform(-scale, scale,
                                 shape).astype(theano.config.floatX)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / WORD_DIM)
        table = np.empty([word_alphabet.size(), WORD_DIM],
                         dtype=theano.config.floatX)
        table[data_utils.UNK_ID, :] = generate_random_embedding(
            scale, [1, WORD_DIM])
        for word, index in word_alphabet.iteritems():
            ww = word.lower() if caseless else word
            embedd = embedd_dict[
                ww] if ww in embedd_dict else generate_random_embedding(
                    scale, [1, WORD_DIM])
            table[index, :] = embedd
        print 'construct word table: %s, dimension: %d' % (embedding,
                                                           table.shape[1])
        return table

    def construct_char_embedding_table():
        if char_embedding == 'random':
            scale = np.sqrt(3.0 / CHARACTER_DIM)
            table = generate_random_embedding(
                scale, [char_alphabet.size(), CHARACTER_DIM])
        else:
            char_dict, char_dim, caseless = utils.load_word_embedding_dict(
                char_embedding, char_path, normalize_digits=False)
            scale = np.sqrt(3.0 / char_dim)
            table = np.empty([char_alphabet.size(), char_dim],
                             dtype=theano.config.floatX)
            table[data_utils.UNK_ID, :] = generate_random_embedding(
                scale, [1, char_dim])
            for char, index in char_alphabet.iteritems():
                cc = char.lower() if caseless else char
                char_embedd = char_dict[
                    cc] if cc in char_dict else generate_random_embedding(
                        scale, [1, char_dim])
                table[index, :] = char_embedd
        print 'construct character table: %s, dimension: %d' % (char_embedding,
                                                                table.shape[1])
        return table

    def construct_pos_embedding_table():
        scale = np.sqrt(3.0 / POS_DIM)
        table = generate_random_embedding(scale,
                                          [pos_alphabet.size(), POS_DIM])
        print 'construct pos table: %s, dimension: %d' % ('random',
                                                          table.shape[1])
        return table

    def construct_word_input_layer():
        # shape = [batch, n-step]
        layer_word_input = lasagne.layers.InputLayer(shape=(None, None),
                                                     input_var=word_var,
                                                     name='word_input')
        # shape = [batch, n-step, w_dim]
        layer_word_embedding = lasagne.layers.EmbeddingLayer(
            layer_word_input,
            input_size=word_alphabet.size(),
            output_size=WORD_DIM,
            W=word_table,
            name='word_embedd')
        return layer_word_embedding

    def construct_pos_input_layer():
        # shape = [batch, n-step]
        layer_pos_input = lasagne.layers.InputLayer(shape=(None, None),
                                                    input_var=pos_var,
                                                    name='pos_input')
        # shape = [batch, n-step, w_dim]
        layer_pos_embedding = lasagne.layers.EmbeddingLayer(
            layer_pos_input,
            input_size=pos_alphabet.size(),
            output_size=POS_DIM,
            W=pos_table,
            name='pos_embedd')
        return layer_pos_embedding

    def construct_char_input_layer():
        # shape = [batch, n-step, char_length]
        layer_char_input = lasagne.layers.InputLayer(
            shape=(None, None, data_utils.MAX_CHAR_LENGTH),
            input_var=char_var,
            name='char_input')

        # shape = [batch, n-step, char_length, c_dim]
        layer_char_embedding = lasagne.layers.EmbeddingLayer(
            layer_char_input,
            input_size=char_alphabet.size(),
            output_size=CHARACTER_DIM,
            W=char_table,
            name='char_embedd')
        # shape = [batch, n-step, c_dim, char_length]
        layer_char_embedding = lasagne.layers.DimshuffleLayer(
            layer_char_embedding, pattern=(0, 1, 3, 2))
        return layer_char_embedding

    def construct_bi_lstm_layer():
        lstm_forward = incoming
        lstm_backward = incoming
        assert depth > 0
        for d in xrange(depth):
            ingate_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                                  W_hid=lasagne.init.GlorotUniform(),
                                  W_cell=lasagne.init.Uniform(range=0.1))
            outgate_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                                   W_hid=lasagne.init.GlorotUniform(),
                                   W_cell=lasagne.init.Uniform(range=0.1))
            # following Jozefowicz et al. (2015), initialize the forget-gate bias to 1.
            forgetgate_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                                      W_hid=lasagne.init.GlorotUniform(),
                                      W_cell=lasagne.init.Uniform(range=0.1),
                                      b=lasagne.init.Constant(1.))
            # use tanh as the cell nonlinearity for now; a purely linear cell is still to be tried.
            cell_forward = Gate(W_in=lasagne.init.GlorotUniform(),
                                W_hid=lasagne.init.GlorotUniform(),
                                W_cell=None,
                                nonlinearity=nonlinearities.tanh)
            lstm_forward = LSTMLayer(lstm_forward,
                                     num_units,
                                     mask_input=mask,
                                     grad_clipping=grad_clipping,
                                     nonlinearity=nonlinearities.tanh,
                                     peepholes=peepholes,
                                     ingate=ingate_forward,
                                     outgate=outgate_forward,
                                     forgetgate=forgetgate_forward,
                                     cell=cell_forward,
                                     p=p,
                                     name='forward%d' % d)
            lstm_forward = lasagne.layers.DropoutLayer(lstm_forward,
                                                       p=0.33,
                                                       shared_axes=(1, ))
            # ----------------------------------------------------------------------------------------------------
            ingate_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                                   W_hid=lasagne.init.GlorotUniform(),
                                   W_cell=lasagne.init.Uniform(range=0.1))
            outgate_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                                    W_hid=lasagne.init.GlorotUniform(),
                                    W_cell=lasagne.init.Uniform(range=0.1))
            # following Jozefowicz et al. (2015), initialize the forget-gate bias to 1.
            forgetgate_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                                       W_hid=lasagne.init.GlorotUniform(),
                                       W_cell=lasagne.init.Uniform(range=0.1),
                                       b=lasagne.init.Constant(1.))
            # use tanh as the cell nonlinearity for now; a purely linear cell is still to be tried.
            cell_backward = Gate(W_in=lasagne.init.GlorotUniform(),
                                 W_hid=lasagne.init.GlorotUniform(),
                                 W_cell=None,
                                 nonlinearity=nonlinearities.tanh)
            lstm_backward = LSTMLayer(lstm_backward,
                                      num_units,
                                      mask_input=mask,
                                      grad_clipping=grad_clipping,
                                      nonlinearity=nonlinearities.tanh,
                                      peepholes=peepholes,
                                      backwards=True,
                                      ingate=ingate_backward,
                                      outgate=outgate_backward,
                                      forgetgate=forgetgate_backward,
                                      cell=cell_backward,
                                      p=p,
                                      name='backward%d' % d)
            lstm_backward = lasagne.layers.DropoutLayer(lstm_backward,
                                                        p=0.33,
                                                        shared_axes=(1, ))
            # ------------------------------------------------------------------------------------------------------

        # concatenate the outputs of the forward and backward LSTMs.
        bi_lstm_cnn = lasagne.layers.concat([lstm_forward, lstm_backward],
                                            axis=2,
                                            name="bi-lstm")
        return bi_lstm_cnn

    embedd_dict, embedd_dim, caseless = utils.load_word_embedding_dict(
        embedding, embedding_path, normalize_digits=normalize_digits)

    WORD_DIM = embedd_dim
    POS_DIM = 50
    CHARACTER_DIM = 50

    word_table = construct_word_embedding_table()
    pos_table = construct_pos_embedding_table() if use_pos else None
    char_table = construct_char_embedding_table() if use_char else None

    if char_table is not None:
        CHARACTER_DIM = char_table.shape[1]

    layer_word_input = construct_word_input_layer()
    incoming = layer_word_input
    mask = lasagne.layers.InputLayer(shape=(None, None),
                                     input_var=mask_var,
                                     name='mask')

    if use_pos:
        layer_pos_input = construct_pos_input_layer()
        incoming = lasagne.layers.concat([incoming, layer_pos_input], axis=2)

    if use_char:
        layer_char_input = construct_char_input_layer()
        # dropout before CNN
        # TODO
        # layer_char_input = lasagne.layers.DropoutLayer(layer_char_input, p=0.15)
        # construct the character-level CNN block over the char embeddings
        conv_window = 3
        # shape = [batch, n-step, c_dim, char_length]
        # construct convolution layer
        # shape = [batch, n-step, c_filters, output_length]
        cnn_layer = ConvTimeStep1DLayer(
            layer_char_input,
            num_filters=num_filters,
            filter_size=conv_window,
            pad='full',
            nonlinearity=lasagne.nonlinearities.tanh,
            name='cnn')
        # infer the pool size so that max pooling covers the full output length of the CNN
        _, _, _, pool_size = cnn_layer.output_shape
        # construct max pool layer
        # shape = [batch, n-step, c_filters, 1]
        pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
        # reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
        output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))

        # finally, concatenate the character-level CNN output with the word (and POS) embeddings.
        # shape = [batch, n-step, c_filters + w_dim]
        incoming = lasagne.layers.concat([output_cnn_layer, incoming], axis=2)

    # dropout for the concatenated input (rate hard-coded to 0.15, mask shared across time steps)
    incoming = lasagne.layers.DropoutLayer(incoming, p=0.15, shared_axes=(1, ))
    # shape = [batch, n-step, 2 * num_units]
    bi_lstm_cnn = construct_bi_lstm_layer()

    # MLP layers
    # shape [batch, n-step, 100]
    for d in xrange(1, mlp):
        bi_lstm_cnn = lasagne.layers.DenseLayer(
            bi_lstm_cnn,
            100,
            nonlinearity=nonlinearities.elu,
            num_leading_axes=2,
            name='dense%d' % d)
        bi_lstm_cnn = lasagne.layers.DropoutLayer(bi_lstm_cnn,
                                                  p=0.33,
                                                  shared_axes=(1, ))

    bi_lstm_cnn = lasagne.layers.DenseLayer(bi_lstm_cnn,
                                            100,
                                            nonlinearity=nonlinearities.elu,
                                            num_leading_axes=2,
                                            name='dense%d' % mlp)

    return TreeBiAffineCRFLayer(bi_lstm_cnn,
                                num_types,
                                mask_input=mask,
                                name='crf')
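
A minimal call sketch for build_network is shown below. The alphabet objects and num_types are assumed to come from the project's data_utils pipeline, and the loss/decoding interface of TreeBiAffineCRFLayer is project-specific, so only standard Lasagne bookkeeping is shown afterwards.

import theano.tensor as T
import lasagne

# hypothetical symbolic inputs matching build_network's signature
word_var = T.imatrix('word')
char_var = T.itensor3('char')
pos_var = T.imatrix('pos')
mask_var = T.matrix('mask')

# word_alphabet, char_alphabet, pos_alphabet and num_types are assumed to be
# provided by the project's data pipeline; the hyper-parameters are illustrative.
network = build_network(word_var, char_var, pos_var, mask_var,
                        word_alphabet, char_alphabet, pos_alphabet,
                        depth=2, num_units=256, num_types=num_types,
                        grad_clipping=5.0, num_filters=30, p=0.33,
                        mlp=1, use_char=True, use_pos=True)

# standard Lasagne bookkeeping: collect the trainable parameters for an optimizer
params = lasagne.layers.get_all_params(network, trainable=True)
print 'trainable parameter tensors: %d' % len(params)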