def build_model():
    x = nn.Variable((batch_size, sentence_length_source))
    mask = get_mask(x)
    y = nn.Variable((batch_size, sentence_length_target))

    enc_input = time_distributed(PF.embed)(
        x, vocab_size_source, embedding_size, name='enc_embeddings') * mask
    # -> (batch_size, sentence_length_source, embedding_size)

    dec_input = F.concatenate(
        F.constant(w2i_target['<bos>'], shape=(batch_size, 1)),
        y[:, :sentence_length_target - 1],
        axis=1)
    dec_input = time_distributed(PF.embed)(
        dec_input, vocab_size_target, embedding_size, name='dec_embeddings')
    # -> (batch_size, sentence_length_target, embedding_size)

    # encoder
    with nn.parameter_scope('encoder'):
        enc_output, c, h = lstm(enc_input, hidden, mask=mask,
                                return_sequences=True, return_state=True)
        # -> (batch_size, sentence_length_source, hidden),
        #    (batch_size, hidden), (batch_size, hidden)

    # decoder
    with nn.parameter_scope('decoder'):
        dec_output = lstm(dec_input, hidden, initial_state=(c, h),
                          return_sequences=True)
        # -> (batch_size, sentence_length_target, hidden)
        attention_output = global_attention(dec_output, enc_output,
                                            mask=mask, score='dot')
        # -> (batch_size, sentence_length_target, hidden)

    output = F.concatenate(dec_output, attention_output, axis=2)
    output = time_distributed(PF.affine)(output, vocab_size_target,
                                         name='output')
    # -> (batch_size, sentence_length_target, vocab_size_target)

    t = F.reshape(y, (batch_size, sentence_length_target, 1))
    entropy = time_distributed_softmax_cross_entropy(output, t)

    mask = F.sum(F.sign(t), axis=2)  # do not predict 'pad'.
    count = F.sum(mask, axis=1)
    entropy *= mask
    loss = F.mean(F.sum(entropy, axis=1) / count)
    return x, y, loss
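The `time_distributed` wrapper used above (and in the snippets below) is a helper assumed from the surrounding project, not an NNabla built-in. A minimal sketch, assuming it applies a function to each time step of a `(batch_size, seq_len, ...)` input and restacks the results:

import nnabla.functions as F

def time_distributed(func):
    # Sketch of the assumed helper: apply `func` independently at each time step.
    def wrapper(x, *args, **kwargs):
        # F.split drops axis=1, yielding one (batch_size, ...) slice per step.
        outputs = [func(x_step, *args, **kwargs) for x_step in F.split(x, axis=1)]
        # Restack along the time axis -> (batch_size, seq_len, ...).
        return F.stack(*outputs, axis=1)
    return wrapper

Because every per-step call runs in the same parameter scope (e.g. `name='enc_embeddings'`), the weights are shared across all time steps.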
def build_self_attention_model(train=True):
    x = nn.Variable((batch_size, max_len))
    t = nn.Variable((batch_size, 1))
    mask = get_mask(x)
    attention_mask = (F.constant(1, shape=mask.shape) - mask) * F.constant(
        np.finfo(np.float32).min, shape=mask.shape)

    with nn.parameter_scope('embedding'):
        h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
    with nn.parameter_scope('forward'):
        h_f = lstm(h, hidden_size, mask=mask,
                   return_sequences=True, return_state=False)
    with nn.parameter_scope('backward'):
        h_b = lstm(h[:, ::-1], hidden_size, mask=mask,
                   return_sequences=True, return_state=False)[:, ::-1]
    h = F.concatenate(h_f, h_b, axis=2)
    if train:
        h = F.dropout(h, p=dropout_ratio)

    with nn.parameter_scope('da'):
        a = F.tanh(time_distributed(PF.affine)(h, da))
    if train:
        a = F.dropout(a, p=dropout_ratio)
    with nn.parameter_scope('r'):
        a = time_distributed(PF.affine)(a, r)
    if train:
        a = F.dropout(a, p=dropout_ratio)

    a = F.softmax(a + attention_mask, axis=1)
    m = F.batch_matmul(a, h, transpose_a=True)

    with nn.parameter_scope('output_mlp'):
        output = F.relu(PF.affine(m, output_mlp_size))
    if train:
        output = F.dropout(output, p=dropout_ratio)
    with nn.parameter_scope('output'):
        y = F.sigmoid(PF.affine(output, 1))

    accuracy = F.mean(F.equal(F.round(y), t))
    loss = F.mean(F.binary_cross_entropy(y, t)) \
        + attention_penalty_coef * frobenius(
            F.batch_matmul(a, a, transpose_a=True) - batch_eye(batch_size, r))
    return x, t, accuracy, loss
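`frobenius` and `batch_eye` in the penalty term are also assumed helpers; the term is the usual orthogonality penalty that pushes the `r` attention heads apart. A rough sketch of what they are taken to compute:

import numpy as np
import nnabla as nn
import nnabla.functions as F

def frobenius(x):
    # Frobenius norm: square root of the sum of squared entries.
    return F.pow_scalar(F.sum(F.pow_scalar(x, 2.0)), 0.5)

def batch_eye(batch_size, size):
    # A (batch_size, size, size) stack of identity matrices to compare against.
    return nn.Variable.from_numpy_array(
        np.tile(np.eye(size, dtype=np.float32), (batch_size, 1, 1)))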
def predict(x):
    with nn.auto_forward():
        x = x.reshape((1, sentence_length_source))
        enc_input = nn.Variable.from_numpy_array(x)
        mask = get_mask(enc_input)
        enc_input = time_distributed(PF.embed)(
            enc_input, vocab_size_source, embedding_size,
            name='enc_embeddings') * mask

        # encoder
        with nn.parameter_scope('encoder'):
            enc_output, c, h = lstm(enc_input, hidden, mask=mask,
                                    return_sequences=True, return_state=True)

        # decode
        pad = nn.Variable.from_numpy_array(np.array([w2i_target['<bos>']]))
        x = PF.embed(pad, vocab_size_target, embedding_size,
                     name='dec_embeddings')

        _cell, _hidden = c, h
        word_index = 0
        ret = []
        i = 0
        while i2w_target[word_index] != '。' and i < 20:
            with nn.parameter_scope('decoder'):
                with nn.parameter_scope('lstm'):
                    _cell, _hidden = lstm_cell(x, _cell, _hidden)
                q = F.reshape(_hidden, (1, 1, hidden))
                attention_output = global_attention(q, enc_output,
                                                    mask=mask, score='dot')
                attention_output = F.reshape(attention_output, (1, hidden))
            output = F.concatenate(_hidden, attention_output, axis=1)
            output = PF.affine(output, vocab_size_target, name='output')

            word_index = np.argmax(output.d[0])
            ret.append(word_index)

            x = nn.Variable.from_numpy_array(
                np.array([word_index], dtype=np.int32))
            x = PF.embed(x, vocab_size_target, embedding_size,
                         name='dec_embeddings')
            i += 1
    return ret
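A hypothetical call to `predict`: the source sentence is mapped to ids with the (assumed) `w2i_source` vocabulary, padded to `sentence_length_source`, and the greedy output ids are mapped back through `i2w_target`:

# Hypothetical usage; `w2i_source`, `i2w_target` and the '<pad>'/'<unk>' ids
# come from the data preparation step and are assumptions here.
words = 'i am a student .'.split()
ids = [w2i_source.get(w, w2i_source['<unk>']) for w in words]
ids += [w2i_source['<pad>']] * (sentence_length_source - len(ids))
predicted = predict(np.array(ids, dtype=np.int32))
print(''.join(i2w_target[i] for i in predicted))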
def build_model(train=True, get_embeddings=False):
    x = nn.Variable((batch_size, sentence_length, ptb_dataset.word_length))
    mask = expand_dims(F.sign(x), axis=-1)
    t = nn.Variable((batch_size, sentence_length))

    with nn.parameter_scope('char_embedding'):
        h = PF.embed(x, char_vocab_size, char_embedding_dim) * mask
    h = F.transpose(h, (0, 3, 1, 2))

    output = []
    for f, f_size in zip(filters, filster_sizes):
        _h = PF.convolution(h, f, kernel=(1, f_size), pad=(0, f_size // 2),
                            name='conv_{}'.format(f_size))
        _h = F.max_pooling(_h, kernel=(1, ptb_dataset.word_length))
        output.append(_h)
    h = F.concatenate(*output, axis=1)
    h = F.transpose(h, (0, 2, 1, 3))

    mask = get_mask(F.sum(x, axis=2))
    embeddings = F.reshape(h, (batch_size, sentence_length, sum(filters))) * mask

    if get_embeddings:
        return x, embeddings

    with nn.parameter_scope('highway1'):
        h = time_distributed(highway)(embeddings)
    with nn.parameter_scope('highway2'):
        h = time_distributed(highway)(h)
    with nn.parameter_scope('lstm1'):
        h = lstm(h, lstm_size, mask=mask, return_sequences=True)
    with nn.parameter_scope('lstm2'):
        h = lstm(h, lstm_size, mask=mask, return_sequences=True)
    with nn.parameter_scope('hidden'):
        h = F.relu(time_distributed(PF.affine)(h, lstm_size))
    if train:
        h = F.dropout(h, p=dropout_ratio)
    with nn.parameter_scope('output'):
        y = time_distributed(PF.affine)(h, word_vocab_size)

    mask = F.sign(t)  # do not predict 'pad'.
    entropy = time_distributed_softmax_cross_entropy(
        y, expand_dims(t, axis=-1)) * mask
    count = F.sum(mask, axis=1)
    loss = F.mean(F.div2(F.sum(entropy, axis=1), count))
    return x, t, loss
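`highway` above is assumed to be a standard highway layer (a transform gate mixing a nonlinear projection with the identity), applied per word via `time_distributed`. A minimal sketch for a `(batch, features)` input:

import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF

def highway(x):
    # Sketch of a highway layer: y = t * g(W_h x + b_h) + (1 - t) * x,
    # where t = sigmoid(W_t x + b_t) is the transform gate.
    size = x.shape[-1]
    with nn.parameter_scope('plain'):
        g = F.relu(PF.affine(x, size))
    with nn.parameter_scope('transform_gate'):
        t = F.sigmoid(PF.affine(x, size))
    return t * g + (1 - t) * x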
batch_size, shuffle=True, with_file_cache=False)


def global_average_pooling_1d(x, mask):
    count = F.sum(mask, axis=1)
    global_average_pooled = F.sum(x, axis=1) / count
    return global_average_pooled


x = nn.Variable((batch_size, max_len))
t = nn.Variable((batch_size, 1))
mask = get_mask(x)
with nn.parameter_scope('embedding'):
    h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
h = global_average_pooling_1d(h, mask)
with nn.parameter_scope('output'):
    y = F.sigmoid(PF.affine(h, 1))

accuracy = F.mean(F.equal(F.round(y), t))
loss = F.mean(F.binary_cross_entropy(y, t))

# Create solver.
solver = S.Adam()
solver.set_parameters(nn.get_parameters())

trainer = Trainer(inputs=[x, t], loss=loss, metrics={
    'cross entropy': loss,
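`get_mask` is assumed to build a 0/1 padding mask from the id matrix (id 0 being '<pad>'), with a trailing singleton axis so it broadcasts over the embedding dimension; the character-level model above builds exactly this shape with `expand_dims(F.sign(x), axis=-1)`. A minimal sketch:

import nnabla.functions as F

def get_mask(x):
    # 1 at real tokens (id > 0), 0 at '<pad>' positions;
    # shape (batch_size, seq_len, 1) so it broadcasts over feature axes.
    return F.reshape(F.sign(x), x.shape + (1,))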
batch_size, shuffle=True, with_file_cache=False)

x = nn.Variable((batch_size, sentence_length))
mask = get_mask(x)
t = nn.Variable((batch_size, sentence_length))
with nn.parameter_scope('embedding'):
    h = PF.embed(x, vocab_size, embedding_size) * mask
with nn.parameter_scope('lstm1'):
    h = lstm(h, hidden_size, mask=mask, return_sequences=True)
with nn.parameter_scope('lstm2'):
    h = lstm(h, hidden_size, mask=mask, return_sequences=True)
with nn.parameter_scope('output'):
    y = time_distributed(PF.affine)(h, vocab_size)

mask = F.sum(mask, axis=2)  # do not predict 'pad'.
entropy = time_distributed_softmax_cross_entropy(
    y, expand_dims(t, axis=-1)) * mask
# count = F.sum(mask, axis=1)
# loss = F.mean(F.div2(F.sum(entropy, axis=1), count))
loss = F.sum(entropy) / F.sum(mask)

# Create solver.
solver = S.Momentum(1e-2, momentum=0.9)
solver.set_parameters(nn.get_parameters())

trainer = Trainer(inputs=[x, t], loss=loss, metrics={'PPL': np.e**loss},
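`time_distributed_softmax_cross_entropy` is taken to be the per-time-step counterpart of `F.softmax_cross_entropy`, returning a `(batch_size, seq_len)` loss matrix that the mask then zeroes out at padded positions. A minimal sketch:

import nnabla.functions as F

def time_distributed_softmax_cross_entropy(y, t):
    # y: (batch_size, seq_len, vocab_size), t: (batch_size, seq_len, 1).
    losses = [F.softmax_cross_entropy(y_step, t_step)
              for y_step, t_step in zip(F.split(y, axis=1), F.split(t, axis=1))]
    # Each element is (batch_size, 1); concatenate back to (batch_size, seq_len).
    return F.concatenate(*losses, axis=1)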
train_data_iter = data_iterator_simple(load_train_func, len(x_train), batch_size,
                                       shuffle=True, with_file_cache=False)
valid_data_iter = data_iterator_simple(load_valid_func, len(x_valid), batch_size,
                                       shuffle=True, with_file_cache=False)

x = nn.Variable((batch_size, sentence_length))
mask = get_mask(x)
t = nn.Variable((batch_size, sentence_length))
with nn.parameter_scope('embedding'):
    h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
with nn.parameter_scope('rnn'):
    h = simple_rnn(h, hidden_size, mask=mask, return_sequences=True)
with nn.parameter_scope('output'):
    y = time_distributed(PF.affine)(h, vocab_size)

mask = F.sum(mask, axis=2)
entropy = time_distributed_softmax_cross_entropy(
    y, expand_dims(t, axis=-1)) * mask
count = F.sum(mask, axis=1)
loss = F.mean(F.div2(F.sum(entropy, axis=1), count))

# Create solver.
solver = S.Momentum(1e-2, momentum=0.9)
solver.set_parameters(nn.get_parameters())
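`simple_rnn` is assumed to be an Elman-style recurrent layer that respects the padding mask (a padded step carries the previous hidden state forward unchanged). A minimal sketch under those assumptions:

import nnabla.functions as F
import nnabla.parametric_functions as PF

def simple_rnn(inputs, units, mask=None, return_sequences=False):
    # inputs: (batch_size, seq_len, features); mask: (batch_size, seq_len, 1).
    batch_size = inputs.shape[0]
    h = F.constant(0, shape=(batch_size, units))
    steps = F.split(inputs, axis=1)
    mask_steps = F.split(mask, axis=1) if mask is not None else [None] * len(steps)
    hs = []
    for x_step, m_step in zip(steps, mask_steps):
        h_new = F.tanh(PF.affine(F.concatenate(x_step, h, axis=1), units))
        if m_step is not None:
            # m_step is (batch_size, 1); padded rows keep the old state.
            h = m_step * h_new + (1 - m_step) * h
        else:
            h = h_new
        hs.append(h)
    return F.stack(*hs, axis=1) if return_sequences else h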