Example No. 1
def build_model():
    x = nn.Variable((batch_size, sentence_length_source))
    mask = get_mask(x)
    y = nn.Variable((batch_size, sentence_length_target))

    enc_input = time_distributed(PF.embed)(
        x, vocab_size_source, embedding_size, name='enc_embeddings') * mask
    # -> (batch_size, sentence_length_source, embedding_size)

    # teacher forcing: prepend '<bos>' and drop the last target token
    dec_input = F.concatenate(F.constant(w2i_target['<bos>'],
                                         shape=(batch_size, 1)),
                              y[:, :sentence_length_target - 1],
                              axis=1)

    dec_input = time_distributed(PF.embed)(dec_input,
                                           vocab_size_target,
                                           embedding_size,
                                           name='dec_embeddings')
    # -> (batch_size, sentence_length_target, embedding_size)

    # encoder
    with nn.parameter_scope('encoder'):
        enc_output, c, h = lstm(enc_input,
                                hidden,
                                mask=mask,
                                return_sequences=True,
                                return_state=True)
        # -> (batch_size, sentence_length_source, hidden), (batch_size, hidden), (batch_size, hidden)

    # decoder
    with nn.parameter_scope('decoder'):
        dec_output = lstm(dec_input,
                          hidden,
                          initial_state=(c, h),
                          return_sequences=True)
        # -> (batch_size, sentence_length_target, hidden)

        attention_output = global_attention(dec_output,
                                            enc_output,
                                            mask=mask,
                                            score='dot')
        # -> (batch_size, sentence_length_target, hidden)

    output = F.concatenate(dec_output, attention_output, axis=2)

    output = time_distributed(PF.affine)(output,
                                         vocab_size_target,
                                         name='output')
    # -> (batch_size, sentence_length_target, vocab_size_target)

    t = F.reshape(y, (batch_size, sentence_length_target, 1))

    entropy = time_distributed_softmax_cross_entropy(output, t)

    mask = F.sum(F.sign(t), axis=2)  # do not predict 'pad'.
    count = F.sum(mask, axis=1)

    entropy *= mask
    loss = F.mean(F.sum(entropy, axis=1) / count)
    return x, y, loss
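The time_distributed wrapper used above (and in the later examples) applies a parametric function independently at every time step while sharing its parameters. It is not shown on this page; a minimal sketch of a compatible definition (the source's actual version may differ):

import nnabla.functions as F

def time_distributed(func):
    def wrapper(x, *args, **kwargs):
        # x: (batch_size, time, ...); apply func per time step with shared parameters
        batch_size = x.shape[0]
        outputs = []
        for x_t in F.split(x, axis=1):
            y_t = func(x_t, *args, **kwargs)
            outputs.append(F.reshape(y_t, (batch_size, 1) + y_t.shape[1:]))
        return F.concatenate(*outputs, axis=1)
    return wrapper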
Example No. 2
def build_self_attention_model(train=True):
    x = nn.Variable((batch_size, max_len))
    t = nn.Variable((batch_size, 1))
    mask = get_mask(x)
    # large negative bias at pad positions so softmax assigns them ~zero weight
    attention_mask = (F.constant(1, shape=mask.shape) - mask) * F.constant(
        np.finfo(np.float32).min, shape=mask.shape)
    with nn.parameter_scope('embedding'):
        h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
    with nn.parameter_scope('forward'):
        h_f = lstm(h,
                   hidden_size,
                   mask=mask,
                   return_sequences=True,
                   return_state=False)
    with nn.parameter_scope('backward'):
        h_b = lstm(h[:, ::-1, ],
                   hidden_size,
                   mask=mask,
                   return_sequences=True,
                   return_state=False)[:, ::-1, ]
    h = F.concatenate(h_f, h_b, axis=2)
    if train:
        h = F.dropout(h, p=dropout_ratio)
    with nn.parameter_scope('da'):
        a = F.tanh(time_distributed(PF.affine)(h, da))
        if train:
            a = F.dropout(a, p=dropout_ratio)
    with nn.parameter_scope('r'):
        a = time_distributed(PF.affine)(a, r)
        if train:
            a = F.dropout(a, p=dropout_ratio)
        a = F.softmax(a + attention_mask, axis=1)
    # attention-weighted sum of hidden states -> (batch_size, r, 2 * hidden_size)
    m = F.batch_matmul(a, h, transpose_a=True)
    with nn.parameter_scope('output_mlp'):
        output = F.relu(PF.affine(m, output_mlp_size))
        if train:
            output = F.dropout(output, p=dropout_ratio)
    with nn.parameter_scope('output'):
        y = F.sigmoid(PF.affine(output, 1))

    accuracy = F.mean(F.equal(F.round(y), t))
    loss = F.mean(F.binary_cross_entropy(
        y, t)) + attention_penalty_coef * frobenius(
            F.batch_matmul(a, a, transpose_a=True) - batch_eye(batch_size, r))
    return x, t, accuracy, loss
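The attention penalty above relies on two helpers that are not shown, frobenius and batch_eye. One way they could be defined (a sketch; the source's definitions may differ):

import numpy as np
import nnabla as nn
import nnabla.functions as F

def batch_eye(batch_size, size):
    # (batch_size, size, size) stack of identity matrices, fed as a constant
    return nn.Variable.from_numpy_array(
        np.tile(np.eye(size, dtype=np.float32), (batch_size, 1, 1)))

def frobenius(x):
    # squared Frobenius norm per sample, averaged over the batch
    return F.mean(F.sum(x ** 2, axis=(1, 2)))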
Example No. 3
def predict(x):
    with nn.auto_forward():
        x = x.reshape((1, sentence_length_source))
        enc_input = nn.Variable.from_numpy_array(x)
        mask = get_mask(enc_input)
        enc_input = time_distributed(PF.embed)(enc_input,
                                               vocab_size_source,
                                               embedding_size,
                                               name='enc_embeddings') * mask

        # encoder
        with nn.parameter_scope('encoder'):
            enc_output, c, h = lstm(enc_input,
                                    hidden,
                                    mask=mask,
                                    return_sequences=True,
                                    return_state=True)

        # decode
        pad = nn.Variable.from_numpy_array(np.array([w2i_target['<bos>']]))
        x = PF.embed(pad,
                     vocab_size_target,
                     embedding_size,
                     name='dec_embeddings')

        # start decoding from the encoder's final cell/hidden state
        _cell, _hidden = c, h

        word_index = 0
        ret = []
        i = 0
        while i2w_target[word_index] != '。' and i < 20:
            with nn.parameter_scope('decoder'):
                with nn.parameter_scope('lstm'):
                    _cell, _hidden = lstm_cell(x, _cell, _hidden)
                    q = F.reshape(_hidden, (1, 1, hidden))
                    attention_output = global_attention(q,
                                                        enc_output,
                                                        mask=mask,
                                                        score='dot')
            attention_output = F.reshape(attention_output, (1, hidden))
            output = F.concatenate(_hidden, attention_output, axis=1)
            output = PF.affine(output, vocab_size_target, name='output')

            word_index = np.argmax(output.d[0])
            ret.append(word_index)
            x = nn.Variable.from_numpy_array(
                np.array([word_index], dtype=np.int32))
            x = PF.embed(x,
                         vocab_size_target,
                         embedding_size,
                         name='dec_embeddings')

            i += 1

        return ret
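A usage sketch for predict(), assuming the source sentence has already been tokenized and mapped to indices with an (assumed) w2i_source dictionary, then zero-padded to sentence_length_source:

import numpy as np

indices = np.array([w2i_source[w] for w in tokens], dtype=np.int32)  # tokens, w2i_source assumed
source = np.zeros(sentence_length_source, dtype=np.int32)
source[:len(indices)] = indices    # pad the tail with 0 (= pad index)
predicted = predict(source)
print(''.join(i2w_target[i] for i in predicted))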
Example No. 4
def build_model(train=True, get_embeddings=False):
    x = nn.Variable((batch_size, sentence_length, ptb_dataset.word_length))
    mask = expand_dims(F.sign(x), axis=-1)
    t = nn.Variable((batch_size, sentence_length))

    with nn.parameter_scope('char_embedding'):
        h = PF.embed(x, char_vocab_size, char_embedding_dim) * mask
    # h: (batch_size, sentence_length, word_length, char_embedding_dim)
    h = F.transpose(h, (0, 3, 1, 2))
    # -> (batch_size, char_embedding_dim, sentence_length, word_length)
    output = []
    for f, f_size in zip(filters, filster_sizes):
        # convolve over characters within each word, then max-pool over the whole word
        _h = PF.convolution(h, f, kernel=(1, f_size), pad=(0, f_size//2), name='conv_{}'.format(f_size))
        _h = F.max_pooling(_h, kernel=(1, ptb_dataset.word_length))
        output.append(_h)
    h = F.concatenate(*output, axis=1)
    h = F.transpose(h, (0, 2, 1, 3))
    # -> (batch_size, sentence_length, sum(filters), 1)

    mask = get_mask(F.sum(x, axis=2))
    embeddings = F.reshape(h, (batch_size, sentence_length, sum(filters))) * mask

    if get_embeddings:
        return x, embeddings

    with nn.parameter_scope('highway1'):
        h = time_distributed(highway)(embeddings)
    with nn.parameter_scope('highway2'):
        h = time_distributed(highway)(h)
    with nn.parameter_scope('lstm1'):
        h = lstm(h, lstm_size, mask=mask, return_sequences=True)
    with nn.parameter_scope('lstm2'):
        h = lstm(h, lstm_size, mask=mask, return_sequences=True)
    with nn.parameter_scope('hidden'):
        h = F.relu(time_distributed(PF.affine)(h, lstm_size))
    if train:
        h = F.dropout(h, p=dropout_ratio)
    with nn.parameter_scope('output'):
        y = time_distributed(PF.affine)(h, word_vocab_size)

    mask = F.sign(t) # do not predict 'pad'.
    entropy = time_distributed_softmax_cross_entropy(y, expand_dims(t, axis=-1)) * mask
    count = F.sum(mask, axis=1)
    loss = F.mean(F.div2(F.sum(entropy, axis=1), count))
    return x, t, loss
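The highway scopes above call a highway function that is not shown. A sketch of a standard highway layer matching this usage (the source's definition may differ):

import nnabla.functions as F
import nnabla.parametric_functions as PF

def highway(x):
    # x: (batch_size, features); applied per time step via time_distributed
    size = x.shape[1]
    gate = F.sigmoid(PF.affine(x, size, name='transform_gate'))
    candidate = F.relu(PF.affine(x, size, name='candidate'))
    # carry the input through where the gate is closed
    return gate * candidate + (1 - gate) * x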
Example No. 5
                                     batch_size,
                                     shuffle=True,
                                     with_file_cache=False)


def global_average_pooling_1d(x, mask):
    count = F.sum(mask, axis=1)
    # x is already masked by the caller; divide by the real-token count, not max_len
    global_average_pooled = F.sum(x, axis=1) / count
    return global_average_pooled


x = nn.Variable((batch_size, max_len))
t = nn.Variable((batch_size, 1))
mask = get_mask(x)
with nn.parameter_scope('embedding'):
    h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
h = global_average_pooling_1d(h, mask)
with nn.parameter_scope('output'):
    y = F.sigmoid(PF.affine(h, 1))

accuracy = F.mean(F.equal(F.round(y), t))
loss = F.mean(F.binary_cross_entropy(y, t))

# Create solver.
solver = S.Adam()
solver.set_parameters(nn.get_parameters())

trainer = Trainer(inputs=[x, t],
                  loss=loss,
                  metrics={
                      'cross entropy': loss,
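Every example on this page calls a get_mask helper that is not shown. A possible definition, assuming index 0 is the pad token (a sketch; the source's version may differ):

import nnabla.functions as F

def get_mask(x):
    # x: (batch_size, max_len) of word indices, 0 = pad
    # -> (batch_size, max_len, 1): 1.0 at real tokens, 0.0 at pads
    return F.reshape(F.sign(x), x.shape + (1,))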
Example No. 6
                                       batch_size,
                                       shuffle=True,
                                       with_file_cache=False)

x = nn.Variable((batch_size, sentence_length))
mask = get_mask(x)
t = nn.Variable((batch_size, sentence_length))

with nn.parameter_scope('embedding'):
    h = PF.embed(x, vocab_size, embedding_size) * mask
with nn.parameter_scope('lstm1'):
    h = lstm(h, hidden_size, mask=mask, return_sequences=True)
with nn.parameter_scope('lstm2'):
    h = lstm(h, hidden_size, mask=mask, return_sequences=True)
with nn.parameter_scope('output'):
    y = time_distributed(PF.affine)(h, vocab_size)

mask = F.sum(mask, axis=2)  # do not predict 'pad'.
entropy = time_distributed_softmax_cross_entropy(y, expand_dims(
    t, axis=-1)) * mask
# count = F.sum(mask, axis=1)
# loss = F.mean(F.div2(F.sum(entropy, axis=1), count))
loss = F.sum(entropy) / F.sum(mask)

# Create solver.
solver = S.Momentum(1e-2, momentum=0.9)
solver.set_parameters(nn.get_parameters())

trainer = Trainer(inputs=[x, t],
                  loss=loss,
                  metrics={'PPL': np.e**loss},
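The PPL metric is simply exp of the mean per-token cross entropy, which is why np.e**loss is reported. The loss itself goes through a time_distributed_softmax_cross_entropy helper that is not shown; a compatible sketch (the source's version may differ):

import nnabla.functions as F

def time_distributed_softmax_cross_entropy(y, t):
    # y: (batch_size, time, vocab_size), t: (batch_size, time, 1)
    # -> (batch_size, time) per-token cross entropy
    ce = F.softmax_cross_entropy(y, t, axis=2)
    return F.reshape(ce, ce.shape[:2])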
Example No. 7
train_data_iter = data_iterator_simple(load_train_func,
                                       len(x_train),
                                       batch_size,
                                       shuffle=True,
                                       with_file_cache=False)
valid_data_iter = data_iterator_simple(load_valid_func,
                                       len(x_valid),
                                       batch_size,
                                       shuffle=True,
                                       with_file_cache=False)

x = nn.Variable((batch_size, sentence_length))
mask = get_mask(x)
t = nn.Variable((batch_size, sentence_length))
with nn.parameter_scope('embedding'):
    h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
with nn.parameter_scope('rnn'):
    h = simple_rnn(h, hidden_size, mask=mask, return_sequences=True)
with nn.parameter_scope('output'):
    y = time_distributed(PF.affine)(h, vocab_size)

mask = F.sum(mask, axis=2)
entropy = time_distributed_softmax_cross_entropy(y, expand_dims(
    t, axis=-1)) * mask
count = F.sum(mask, axis=1)
loss = F.mean(F.div2(F.sum(entropy, axis=1), count))

# Create solver.
solver = S.Momentum(1e-2, momentum=0.9)
solver.set_parameters(nn.get_parameters())
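
A minimal training-loop sketch that could follow the solver setup above (the epoch count is illustrative, and the original likely wraps this in the Trainer helper used by the other examples):

num_batches = len(x_train) // batch_size       # assumed to match the iterator above

for epoch in range(10):                        # epoch count is illustrative
    total_loss = 0.0
    for _ in range(num_batches):
        x.d, t.d = train_data_iter.next()      # padded index batches
        loss.forward()
        solver.zero_grad()
        loss.backward()
        solver.update()
        total_loss += float(loss.d)
    print('epoch {}: train PPL = {:.2f}'.format(epoch, np.e ** (total_loss / num_batches)))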