def forward(self, q, k, v, lengths, speaker_embed, start_index,
            force_monotonic=False, prev_coeffs=None, window=None):
    # add position encoding as an inductive bias
    if self.has_bias:  # multi-speaker model
        omega_q = 2 * F.sigmoid(
            F.squeeze(self.q_pos_affine(speaker_embed), axes=[-1]))
        omega_k = 2 * self.omega_initial * F.sigmoid(F.squeeze(
            self.k_pos_affine(speaker_embed), axes=[-1]))
    else:  # single-speaker case
        batch_size = q.shape[0]
        omega_q = F.ones((batch_size, ), dtype="float32")
        omega_k = F.ones((batch_size, ), dtype="float32") * self.omega_default

    q += self.position_encoding_weight * positional_encoding(q, start_index, omega_q)
    k += self.position_encoding_weight * positional_encoding(k, 0, omega_k)

    q, k, v = self.q_affine(q), self.k_affine(k), self.v_affine(v)
    activations = F.matmul(q, k, transpose_y=True)
    activations /= np.sqrt(self.attention_dim)

    if self.training:
        # mask the <pad> parts from the encoder
        mask = F.sequence_mask(lengths, dtype="float32")
        attn_bias = F.scale(1. - mask, -1000)
        activations += F.unsqueeze(attn_bias, [1])
    elif force_monotonic:
        assert window is not None
        backward_step, forward_step = window
        T_enc = k.shape[1]
        batch_size, T_dec, _ = q.shape  # actually T_dec = 1 here
        alpha = F.fill_constant((batch_size, T_dec), value=0, dtype="int64") \
            if prev_coeffs is None \
            else F.argmax(prev_coeffs, axis=-1)
        backward = F.sequence_mask(alpha - backward_step, maxlen=T_enc, dtype="bool")
        forward = F.sequence_mask(alpha + forward_step, maxlen=T_enc, dtype="bool")
        mask = F.cast(F.logical_xor(backward, forward), "float32")
        # print("mask's shape:", mask.shape)
        attn_bias = F.scale(1. - mask, -1000)
        activations += attn_bias

    # softmax
    coefficients = F.softmax(activations, axis=-1)
    # context vector
    coefficients = F.dropout(coefficients, 1. - self.keep_prob,
                             dropout_implementation='upscale_in_train')
    contexts = F.matmul(coefficients, v)
    # context normalization
    enc_lengths = F.cast(F.unsqueeze(lengths, axes=[1, 2]), "float32")
    contexts *= F.sqrt(enc_lengths)
    # out affine
    contexts = self.out_affine(contexts)
    return contexts, coefficients
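# Illustration (NumPy sketch, made-up values): the monotonic window above is built by
# XOR-ing two sequence masks around the previously attended position alpha.
import numpy as np

def window_mask(alpha, backward_step, forward_step, T_enc):
    # positions in [alpha - backward_step, alpha + forward_step) get 1, the rest 0
    positions = np.arange(T_enc)
    backward = positions < (alpha - backward_step)   # ones strictly before the window
    forward = positions < (alpha + forward_step)     # ones before the window's end
    return np.logical_xor(backward, forward).astype("float32")

# previously attended encoder position 4, window of 1 step back / 3 steps ahead
print(window_mask(alpha=4, backward_step=1, forward_step=3, T_enc=10))
# [0. 0. 0. 1. 1. 1. 1. 0. 0. 0.]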
def model_func(inputs, is_train=True):
    # inputs = [src, src_sequence_length, trg, trg_sequence_length, label]
    # src = fluid.data(name="src", shape=[None, None], dtype="int64")
    # source-language input
    src = inputs[0]
    src_sequence_length = inputs[1]
    src_embedding = fluid.embedding(
        input=src,
        size=[source_dict_size, hidden_dim],
        dtype="float32",
        param_attr=fluid.ParamAttr(name="src_emb_table"))

    # encoder
    encoder_output, encoder_state = encoder(src_embedding, src_sequence_length)
    encoder_output_proj = layers.fc(input=encoder_output,
                                    size=decoder_size,
                                    num_flatten_dims=2,
                                    bias_attr=False)
    src_mask = layers.sequence_mask(src_sequence_length,
                                    maxlen=layers.shape(src)[1],
                                    dtype="float32")
    encoder_padding_mask = (src_mask - 1.0) * 1e9

    # target-language input: provided during training, absent during inference
    trg = inputs[2] if is_train else None

    # decoder
    output = decoder(encoder_output=encoder_output,
                     encoder_output_proj=encoder_output_proj,
                     encoder_state=encoder_state,
                     encoder_padding_mask=encoder_padding_mask,
                     trg=trg,
                     is_train=is_train)
    return output
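# Illustration (NumPy sketch, made-up lengths): (src_mask - 1.0) * 1e9 turns the 0/1
# sequence mask into an additive attention bias -- 0 for real tokens, -1e9 for padding --
# so padded positions receive ~zero weight after softmax.
import numpy as np

lengths = np.array([3, 1])                     # valid lengths for a batch of 2
max_len = 4
src_mask = (np.arange(max_len)[None, :] < lengths[:, None]).astype("float32")
encoder_padding_mask = (src_mask - 1.0) * 1e9  # 0 on real tokens, -1e9 on padding

scores = np.zeros((2, max_len)) + encoder_padding_mask   # add to attention logits
attn = np.exp(scores) / np.exp(scores).sum(-1, keepdims=True)
print(attn.round(3))
# [[0.333 0.333 0.333 0.   ]
#  [1.    0.    0.    0.   ]]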
def model_func(inputs, is_train=True):
    src = inputs[0]
    src_sequence_length = inputs[1]
    # source embedding
    src_embeder = lambda x: fluid.embedding(
        input=x,
        size=[source_dict_size, hidden_dim],
        dtype="float32",
        param_attr=fluid.ParamAttr(name="src_emb_table"))
    src_embedding = src_embeder(src)

    # encoder
    encoder_output, encoder_state = encoder(src_embedding, src_sequence_length)
    encoder_output_proj = layers.fc(input=encoder_output,
                                    size=decoder_size,
                                    num_flatten_dims=2,
                                    bias_attr=False)
    src_mask = layers.sequence_mask(src_sequence_length,
                                    maxlen=layers.shape(src)[1],
                                    dtype="float32")
    encoder_padding_mask = (src_mask - 1.0) * 1e9

    trg = inputs[2] if is_train else None

    # decoder
    output = decoder(encoder_output=encoder_output,
                     encoder_output_proj=encoder_output_proj,
                     encoder_state=encoder_state,
                     encoder_padding_mask=encoder_padding_mask,
                     trg=trg,
                     is_train=is_train)
    return output
def test_with_input_lengths(self): mp = self.mp.clone() sp = self.sp rnn1 = self.rnn1 rnn2 = self.rnn2 exe = self.executor scope = self.scope x = np.random.randn(12, 4, 16) if not self.time_major: x = np.transpose(x, [1, 0, 2]) sequence_length = np.array([12, 10, 9, 8], dtype=np.int64) y1, (h1, c1) = rnn1(x, sequence_length=sequence_length) with paddle.fluid.unique_name.guard(): with paddle.static.program_guard(mp, sp): x_data = paddle.data( "input", [-1, -1, 16], dtype=paddle.framework.get_default_dtype()) seq_len = paddle.data("seq_len", [-1], dtype="int64") mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype()) if self.time_major: mask = paddle.transpose(mask, [1, 0]) y, (h, c) = rnn2(x_data, sequence_length=seq_len) y = paddle.multiply(y, mask, axis=0) feed_dict = {x_data.name: x, seq_len.name: sequence_length} with paddle.static.scope_guard(scope): y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c]) np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5) np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5)
def forward(self, src, src_length):
    # encoding
    encoder_output, encoder_final_state = self.encoder(src, src_length)

    # decoder initial states
    decoder_initial_states = [
        encoder_final_state,
        self.decoder.lstm_attention.cell.get_initial_states(
            batch_ref=encoder_output, shape=[self.hidden_size])
    ]
    # attention mask to avoid paying attention to paddings
    src_mask = layers.sequence_mask(src_length,
                                    maxlen=layers.shape(src)[1],
                                    dtype=encoder_output.dtype)
    encoder_padding_mask = (src_mask - 1.0) * 1e9
    encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])

    # Tile the batch dimension with beam_size
    encoder_output = BeamSearchDecoder.tile_beam_merge_with_batch(
        encoder_output, self.beam_size)
    encoder_padding_mask = BeamSearchDecoder.tile_beam_merge_with_batch(
        encoder_padding_mask, self.beam_size)

    # dynamic decoding with beam search
    rs, _ = self.beam_search_decoder(
        inits=decoder_initial_states,
        encoder_output=encoder_output,
        encoder_padding_mask=encoder_padding_mask)
    return rs
def _build_decoder(self, enc_final_state, mode='train', beam_size=10): output_layer = lambda x: layers.fc( x, size=self.tar_vocab_size, num_flatten_dims=len(x.shape) - 1, param_attr=fluid.ParamAttr( name="output_w", initializer=fluid.initializer.UniformInitializer( low=-self.init_scale, high=self.init_scale)), bias_attr=False) dec_cell = AttentionDecoderCell(self.num_layers, self.hidden_size, self.dropout, self.init_scale) dec_initial_states = [ enc_final_state, dec_cell.get_initial_states(batch_ref=self.enc_output, shape=[self.hidden_size]) ] max_src_seq_len = layers.shape(self.src)[1] src_mask = layers.sequence_mask(self.src_sequence_length, maxlen=max_src_seq_len, dtype='float32') enc_padding_mask = (src_mask - 1.0) if mode == 'train': dec_output, _ = rnn(cell=dec_cell, inputs=self.tar_emb, initial_states=dec_initial_states, sequence_length=None, enc_output=self.enc_output, enc_padding_mask=enc_padding_mask) dec_output = output_layer(dec_output) elif mode == 'beam_search': output_layer = lambda x: layers.fc( x, size=self.tar_vocab_size, num_flatten_dims=len(x.shape) - 1, param_attr=fluid.ParamAttr(name="output_w"), bias_attr=False) beam_search_decoder = BeamSearchDecoder( dec_cell, self.beam_start_token, self.beam_end_token, beam_size, embedding_fn=self.tar_embeder, output_fn=output_layer) enc_output = beam_search_decoder.tile_beam_merge_with_batch( self.enc_output, beam_size) enc_padding_mask = beam_search_decoder.tile_beam_merge_with_batch( enc_padding_mask, beam_size) outputs, _ = dynamic_decode(beam_search_decoder, inits=dec_initial_states, max_step_num=self.beam_max_step_num, enc_output=enc_output, enc_padding_mask=enc_padding_mask) return outputs return dec_output
def loss_func(logits, label, trg_sequence_length):
    probs = layers.softmax(logits)
    loss = layers.cross_entropy(input=probs, label=label)
    trg_mask = layers.sequence_mask(trg_sequence_length,
                                    maxlen=layers.shape(logits)[1],
                                    dtype="float32")
    avg_cost = layers.reduce_sum(loss * trg_mask) / layers.reduce_sum(trg_mask)
    return avg_cost
def simple_rnn(rnn_input,
               init_hidden,
               hidden_size,
               kernel_param_attr=None,
               recurrent_param_attr=None,
               bias_attr=None,
               act='relu',
               sequence_length=None,
               name='simple_rnn'):
    # Transpose to (sequence x batch x hidden)
    rnn_input = layers.transpose(rnn_input, [1, 0, 2])

    # Generate mask from the valid lengths
    mask = None
    if sequence_length:
        max_seq_len = layers.shape(rnn_input)[0]
        mask = layers.sequence_mask(sequence_length,
                                    maxlen=max_seq_len,
                                    dtype='float32')
        mask = layers.transpose(mask, [1, 0])

    # Init
    simple_rnn = SimpleRNN_unit(rnn_input, hidden_size, kernel_param_attr,
                                recurrent_param_attr, bias_attr, act)

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(rnn_input)
        if mask:
            step_mask = rnn.step_input(mask)
        if init_hidden:
            pre_hidden = rnn.memory(init=init_hidden)
        else:
            pre_hidden = rnn.memory(batch_ref=rnn_input,
                                    shape=[-1, hidden_size])
        last_hidden = simple_rnn(step_in, pre_hidden)
        if mask:
            # keep the previous hidden state for padded time steps
            last_hidden = layers.elementwise_mul(
                last_hidden, step_mask, axis=0) - layers.elementwise_mul(
                    pre_hidden, (step_mask - 1), axis=0)
        rnn.update_memory(pre_hidden, last_hidden)
        rnn.step_output(last_hidden)

    rnn_out = rnn()
    last_hidden = rnn_out[-1]
    last_hidden = layers.reshape(last_hidden, shape=[1, -1, hidden_size])
    # Transpose back to (batch x sequence x hidden)
    rnn_output = layers.transpose(rnn_out, [1, 0, 2])
    last_hidden = layers.transpose(last_hidden, [1, 0, 2])
    return rnn_output, last_hidden
def learn(self, probs, label, weight=None, length=None):
    loss = layers.cross_entropy(input=probs, label=label, soft_label=False)
    max_seq_len = layers.shape(probs)[1]
    mask = layers.sequence_mask(length, maxlen=max_seq_len, dtype="float32")
    loss = loss * mask
    loss = layers.reduce_mean(loss, dim=[0])
    loss = layers.reduce_sum(loss)
    optimizer = fluid.optimizer.Adam(self.lr)
    optimizer.minimize(loss)
    return loss
def loss_func(logits, label, trg_sequence_length):
    probs = layers.softmax(logits)
    # cross-entropy loss
    loss = layers.cross_entropy(input=probs, label=label)
    # build a mask from the target lengths and use it to drop the loss
    # computed on padding positions
    trg_mask = layers.sequence_mask(trg_sequence_length,
                                    maxlen=layers.shape(logits)[1],
                                    dtype="float32")
    avg_cost = layers.reduce_sum(loss * trg_mask) / layers.reduce_sum(trg_mask)
    return avg_cost
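# Illustration (NumPy sketch, made-up numbers): the masked average above sums the loss
# over real tokens only and divides by the number of real tokens, so padding affects
# neither the numerator nor the denominator.
import numpy as np

loss = np.array([[0.7, 0.3, 0.9, 0.2],
                 [0.5, 0.4, 0.1, 0.6]])        # per-token loss, padded to length 4
trg_lengths = np.array([4, 2])
trg_mask = (np.arange(4)[None, :] < trg_lengths[:, None]).astype("float32")

avg_cost = (loss * trg_mask).sum() / trg_mask.sum()
print(avg_cost)   # (0.7+0.3+0.9+0.2+0.5+0.4) / 6 = 0.5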
def predict_test_util(place, mode): place = paddle.set_device(place) paddle.seed(123) np.random.seed(123) class Net(paddle.nn.Layer): def __init__(self): super(Net, self).__init__() self.rnn = getattr(paddle.nn, mode)(16, 32, 2, direction="bidirectional", dropout=0.1) def forward(self, input): return self.rnn(input) x = paddle.randn((4, 10, 16)) x.stop_gradient = False seq_len = paddle.to_tensor(np.array([10, 6, 8, 5])) mask = sequence_mask(seq_len, maxlen=10, dtype=x.dtype) mask = paddle.unsqueeze(mask, [2]) rnn = Net() y, _ = rnn(x) y = y * mask loss = paddle.mean(y) loss.backward() optimizer = paddle.optimizer.Adam( learning_rate=0.1, parameters=rnn.parameters()) optimizer.step() rnn.eval() y, _ = rnn(x) # `jit.to_static` would include a train_program, eval mode might cause # some errors currently, such as dropout grad op gets `is_test == True`. rnn.train() rnn = paddle.jit.to_static( rnn, [paddle.static.InputSpec( shape=[None, None, 16], dtype=x.dtype)]) paddle.jit.save(rnn, "./inference/%s_infer" % mode) paddle.enable_static() new_scope = paddle.static.Scope() with paddle.static.scope_guard(new_scope): exe = paddle.static.Executor(place) [inference_program, feed_target_names, fetch_targets] = paddle.static.load_inference_model( "./inference/%s_infer" % mode, exe) results = exe.run(inference_program, feed={feed_target_names[0]: x.numpy()}, fetch_list=fetch_targets) np.testing.assert_equal( y.numpy(), results[0]) # eval results equal predict results paddle.disable_static()
def spec_loss(self, decoded, input, num_frames=None):
    if num_frames is None:
        l1_loss = F.reduce_mean(F.abs(decoded - input))
    else:
        # mask the <pad> part of the decoder
        num_channels = decoded.shape[-1]
        l1_loss = F.abs(decoded - input)
        mask = F.sequence_mask(num_frames, dtype="float32")
        l1_loss *= F.unsqueeze(mask, axes=[-1])
        l1_loss = F.reduce_sum(l1_loss) / F.scale(F.reduce_sum(mask), num_channels)
    return l1_loss
def forward(self, outputs, labels):
    predict, (trg_length, label) = outputs[0], labels
    # for target padding mask
    mask = layers.sequence_mask(
        trg_length, maxlen=layers.shape(predict)[1], dtype=predict.dtype)

    cost = layers.softmax_with_cross_entropy(
        logits=predict, label=label, soft_label=False)
    masked_cost = layers.elementwise_mul(cost, mask, axis=0)
    batch_mean_cost = layers.reduce_mean(masked_cost, dim=[0])
    seq_cost = layers.reduce_sum(batch_mean_cost)
    return seq_cost
def _birnn_encoder(self, inputs, input_len, name_lens, name_pos, name_tok_len): """forward Args: inputs (Variable): shape=[batch_size, max_seq_len, hidden_size] input_len (Variable): shape=[batch_size] name_lens (Variable): shape=[batch_size] name_pos (Variable): shape=[batch_size, max_name_len, max_tokens] name_tok_len (Variable): shape=[batch_size, max_name_len] Returns: TODO Raises: NULL """ rnn_output, rnn_final_state = self._rnn_encoder.forward( inputs, input_len) max_name_len = name_pos.shape[1] name_begin = name_pos[:, :, 0] name_repr_mask = layers.sequence_mask(name_lens, max_name_len, dtype=name_tok_len.dtype) len_delta = layers.elementwise_mul(name_tok_len - 1, name_repr_mask, axis=0) name_end = name_begin + len_delta if self._bidirectional: name_fwd_repr_gathered = nn_utils.batch_gather_2d( rnn_output, name_end)[:, :, :self._hidden_size] name_bwd_repr_gathered = nn_utils.batch_gather_2d( rnn_output, name_begin)[:, :, self._hidden_size:] name_repr_gathered = layers.concat( input=[name_fwd_repr_gathered, name_bwd_repr_gathered], axis=-1) new_hidden_size = self._hidden_size * 2 else: name_repr_gathered = layers.gather_nd(rnn_output, name_end) new_hidden_size = self._hidden_size name_repr_tmp = layers.reshape( name_repr_gathered, shape=[-1, max_name_len, new_hidden_size]) name_repr_mask = layers.cast(name_repr_mask, dtype=name_repr_tmp.dtype) name_repr = layers.elementwise_mul(name_repr_tmp, name_repr_mask, axis=0) return name_repr, None
def def_seq2seq_model(num_layers, hidden_size, dropout_prob, src_vocab_size,
                      trg_vocab_size):
    "vanilla seq2seq model"
    # data
    source = fluid.data(name="src", shape=[None, None], dtype="int64")
    source_length = fluid.data(name="src_sequence_length",
                               shape=[None],
                               dtype="int64")
    target = fluid.data(name="trg", shape=[None, None], dtype="int64")
    target_length = fluid.data(name="trg_sequence_length",
                               shape=[None],
                               dtype="int64")
    label = fluid.data(name="label", shape=[None, None, 1], dtype="int64")

    # embedding
    src_emb = fluid.embedding(source, (src_vocab_size, hidden_size))
    # target embedding, sized with the target vocabulary
    tar_emb = fluid.embedding(target, (trg_vocab_size, hidden_size))

    # encoder
    enc_cell = EncoderCell(num_layers, hidden_size, dropout_prob)
    enc_output, enc_final_state = dynamic_rnn(cell=enc_cell,
                                              inputs=src_emb,
                                              sequence_length=source_length)

    # decoder
    dec_cell = DecoderCell(num_layers, hidden_size, dropout_prob)
    dec_output, dec_final_state = dynamic_rnn(cell=dec_cell,
                                              inputs=tar_emb,
                                              initial_states=enc_final_state)
    logits = layers.fc(dec_output,
                       size=trg_vocab_size,
                       num_flatten_dims=len(dec_output.shape) - 1,
                       bias_attr=False)

    # loss
    loss = layers.softmax_with_cross_entropy(logits=logits,
                                             label=label,
                                             soft_label=False)
    loss = layers.unsqueeze(loss, axes=[2])
    max_tar_seq_len = layers.shape(target)[1]
    tar_mask = layers.sequence_mask(target_length,
                                    maxlen=max_tar_seq_len,
                                    dtype="float32")
    loss = loss * tar_mask
    loss = layers.reduce_mean(loss, dim=[0])
    loss = layers.reduce_sum(loss)

    # optimizer
    optimizer = fluid.optimizer.Adam(0.001)
    optimizer.minimize(loss)
    return loss
def __call__(self, src, src_length, trg=None, trg_length=None): # encoder encoder_output, encoder_final_state = self.encoder( self.src_embeder(src), src_length) decoder_initial_states = [ encoder_final_state, self.decoder.decoder_cell.get_initial_states( batch_ref=encoder_output, shape=[encoder_output.shape[-1]]) ] src_mask = layers.sequence_mask(src_length, maxlen=layers.shape(src)[1], dtype="float32") encoder_padding_mask = (src_mask - 1.0) * 1e9 encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1]) # decoder decoder_kwargs = { "inputs": self.trg_embeder(trg), "sequence_length": trg_length, } if self.decoder.decoding_strategy == "train_greedy" else ( { "embedding_fn": self.trg_embeder, "beam_size": self.beam_size, "start_token": self.start_token, "end_token": self.end_token } if self.decoder.decoding_strategy == "beam_search" else { "embedding_fn": self.trg_embeder, "start_tokens": layers.fill_constant_batch_size_like(input=encoder_output, shape=[-1], dtype=src.dtype, value=self.start_token), "end_token": self.end_token }) decoder_kwargs["output_layer"] = self.output_layer (decoder_output, decoder_final_state, dec_seq_lengths) = self.decoder(decoder_initial_states, encoder_output, encoder_padding_mask, **decoder_kwargs) if self.decoder.decoding_strategy == "beam_search": # for inference return decoder_output logits, samples, sample_length = (decoder_output.cell_outputs, decoder_output.sample_ids, dec_seq_lengths) probs = layers.softmax(logits) return probs, samples, sample_length
def _compute_loss(self, dec_output):
    loss = layers.softmax_with_cross_entropy(logits=dec_output,
                                             label=self.label,
                                             soft_label=False)
    loss = layers.unsqueeze(loss, axes=[2])
    max_tar_seq_len = layers.shape(self.tar)[1]
    tar_mask = layers.sequence_mask(self.tar_sequence_length,
                                    maxlen=max_tar_seq_len,
                                    dtype='float32')
    loss = loss * tar_mask
    loss = layers.reduce_mean(loss, dim=[0])
    loss = layers.reduce_sum(loss)
    return loss
def recv_func(msg):
    pad_value = L.assign(input=np.array([0.0], dtype=np.float32))
    output, length = L.sequence_pad(msg, pad_value, maxlen=max_neigh)
    mask = L.sequence_mask(length, dtype="float32", maxlen=max_neigh)
    mask = L.unsqueeze(mask, [2])
    input_mask = (L.matmul(mask, mask, transpose_y=True) - 1) * -10000
    for layer in range(num_layers):
        output = self_attention_and_residual(output,
                                             hidden_size,
                                             input_mask,
                                             name="cross_feat_%s" % layer,
                                             maxlen=max_neigh)
    return L.reduce_sum(output * mask, 1) / L.reduce_sum(mask, 1)
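# Illustration (NumPy sketch, made-up values): matmul(mask, mask^T) builds a pairwise
# validity matrix that is 1 only where both neighbour slots are real. How the resulting
# bias is applied (its sign convention) depends on self_attention_and_residual; the
# sketch below only reproduces the arithmetic in recv_func.
import numpy as np

mask = np.array([1., 1., 1., 0., 0.]).reshape(-1, 1)   # 5 slots, first 3 are real
pairwise = mask @ mask.T                               # (max_neigh, max_neigh)
print(pairwise.astype(int))
# [[1 1 1 0 0]
#  [1 1 1 0 0]
#  [1 1 1 0 0]
#  [0 0 0 0 0]
#  [0 0 0 0 0]]
bias = (pairwise - 1) * -10000                         # same arithmetic as recv_func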
def _select_table(condition,
                  inputs,
                  table_enc,
                  table_len,
                  table_mask_by_col,
                  ptr_net,
                  grammar,
                  name=None):
    """select_table.

    Args:
        condition (TYPE): NULL
        inputs (Variable): shape = [batch_size, max_len, hidden_size].
            max_len is always 1 at inference time.
        table_enc (TYPE): NULL
        table_len (TYPE): NULL
        table_mask_by_col (Variable):
        ptr_net (TYPE): NULL
        grammar (TYPE): NULL
        name (str):

    Returns: TODO

    Raises: NULL
    """
    condition = layers.cast(condition, dtype='float32')

    table_mask_by_len = layers.sequence_mask(table_len,
                                             maxlen=grammar.MAX_TABLE,
                                             dtype='float32')
    table_mask_by_len = layers.reshape(table_mask_by_len, [-1, grammar.MAX_TABLE])
    table_mask_by_col = layers.reshape(table_mask_by_col, [-1, grammar.MAX_TABLE])
    table_mask = layers.elementwise_mul(table_mask_by_len, table_mask_by_col)
    predicts = ptr_net.forward(inputs, table_enc, table_mask)

    zeros_l = tensor.fill_constant_batch_size_like(
        predicts, shape=[-1, grammar.grammar_size], dtype='float32', value=-INF)
    zeros_r = tensor.fill_constant_batch_size_like(
        predicts,
        shape=[-1, grammar.MAX_COLUMN + grammar.MAX_VALUE],
        dtype='float32',
        value=-INF)
    final_output = tensor.concat([zeros_l, predicts, zeros_r], axis=-1)
    true_final_output = layers.elementwise_mul(final_output, condition, axis=0)
    return true_final_output
def _select_column(condition,
                   inputs,
                   column_enc,
                   column_len,
                   ptr_net,
                   grammar,
                   column2table_mask,
                   name=None):
    """select_column.

    Args:
        condition (TYPE): NULL
        inputs (Variable): shape = [batch_size, max_len, hidden_size].
            max_len is always 1 at inference time.
        column_enc (TYPE): NULL
        column_len (TYPE): NULL
        ptr_net (TYPE): NULL
        grammar (TYPE): NULL
        column2table_mask (Variable):
        name (str):

    Returns: TODO

    Raises: NULL
    """
    condition = layers.cast(condition, dtype='float32')

    column_mask = layers.sequence_mask(column_len,
                                       maxlen=grammar.MAX_COLUMN,
                                       dtype='float32')
    column_mask = layers.reshape(column_mask, [-1, grammar.MAX_COLUMN])
    predicts = ptr_net.forward(inputs, column_enc, column_mask)

    pred_ids = layers.argmax(predicts, axis=-1)
    valid_table_mask = nn_utils.batch_gather(column2table_mask, pred_ids)

    ## concat zeros to vocab size
    zeros_l = tensor.fill_constant_batch_size_like(
        predicts,
        shape=[-1, grammar.grammar_size + grammar.MAX_TABLE],
        dtype='float32',
        value=-INF)
    zeros_r = tensor.fill_constant_batch_size_like(
        predicts, shape=[-1, grammar.MAX_VALUE], dtype='float32', value=-INF)
    final_output = tensor.concat([zeros_l, predicts, zeros_r], axis=-1)
    true_final_output = layers.elementwise_mul(final_output, condition, axis=0)
    true_valid_table_mask = layers.elementwise_mul(valid_table_mask, condition, axis=0)
    return true_final_output, true_valid_table_mask
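# Illustration (NumPy sketch, made-up sizes): the -INF blocks place the pointer scores
# inside one global output layout [grammar | tables | columns | values], so a softmax
# over the concatenated vector can only pick a column.
import numpy as np

NEG_INF = -1e9
grammar_size, max_table, max_column, max_value = 4, 2, 3, 2

predicts = np.array([[0.2, 1.5, -0.3]])                     # scores over the columns
zeros_l = np.full((1, grammar_size + max_table), NEG_INF)   # grammar + table slots
zeros_r = np.full((1, max_value), NEG_INF)                  # value slots
final_output = np.concatenate([zeros_l, predicts, zeros_r], axis=-1)

probs = np.exp(final_output - final_output.max())
probs /= probs.sum(-1, keepdims=True)
print(probs.round(3))   # only the three column positions get non-zero probability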
def _compute_loss(self, dec_output):
    loss = layers.softmax_with_cross_entropy(logits=dec_output,
                                             label=self.label,
                                             soft_label=False)
    loss = layers.reshape(loss, shape=[self.batch_size, -1])
    max_tar_seq_len = layers.shape(self.tar)[1]
    tar_mask = layers.sequence_mask(self.tar_sequence_length,
                                    maxlen=max_tar_seq_len,
                                    dtype='float32')
    loss = loss * tar_mask
    loss = layers.reduce_mean(loss, dim=[0])
    loss = layers.reduce_sum(loss)
    loss.permissions = True
    return loss
def test_with_input_lengths(self):
    rnn1 = self.rnn1
    rnn2 = self.rnn2

    x = np.random.randn(12, 4, 16)
    if not self.time_major:
        x = np.transpose(x, [1, 0, 2])
    sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)

    y1, h1 = rnn1(x, sequence_length=sequence_length)

    seq_len = paddle.to_variable(sequence_length)
    mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
    if self.time_major:
        mask = paddle.transpose(mask, [1, 0])
    y2, h2 = rnn2(paddle.to_variable(x), sequence_length=seq_len)
    y2 = paddle.multiply(y2, mask, axis=0)

    np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
    np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
def _compute_loss(self, mean, logvars, dec_output):
    kl_loss = self._kl_dvg(mean, logvars)

    rec_loss = layers.softmax_with_cross_entropy(logits=dec_output,
                                                 label=self.label,
                                                 soft_label=False)
    rec_loss = layers.reshape(rec_loss, shape=[self.batch_size, -1])
    max_tar_seq_len = layers.shape(self.tar)[1]
    tar_mask = layers.sequence_mask(self.tar_sequence_length,
                                    maxlen=max_tar_seq_len,
                                    dtype='float32')
    rec_loss = rec_loss * tar_mask
    rec_loss = layers.reduce_mean(rec_loss, dim=[0])
    rec_loss = layers.reduce_sum(rec_loss)

    loss = kl_loss * self.kl_weight + rec_loss
    return loss, kl_loss, rec_loss
def _select_value(condition, inputs, value_enc, value_len, ptr_net, grammar,
                  name=None):
    """select_value.

    Args:
        condition (TYPE): NULL
        inputs (TYPE): NULL
        value_enc (TYPE): NULL
        value_len (TYPE): NULL
        ptr_net (TYPE): NULL
        grammar (TYPE): NULL

    Returns: TODO

    Raises: NULL
    """
    condition = layers.cast(condition, dtype='float32')

    value_mask = layers.sequence_mask(value_len,
                                      maxlen=grammar.MAX_VALUE,
                                      dtype='float32')
    value_mask = layers.reshape(value_mask, [-1, grammar.MAX_VALUE])
    predicts = ptr_net.forward(inputs, value_enc, value_mask)

    ## concat zeros to vocab size
    zeros_l = tensor.fill_constant_batch_size_like(
        predicts,
        shape=[
            -1, grammar.grammar_size + grammar.MAX_TABLE + grammar.MAX_COLUMN
        ],
        dtype='float32',
        value=-INF)
    final_output = tensor.concat([zeros_l, predicts], axis=-1)
    true_final_output = layers.elementwise_mul(final_output, condition, axis=0)
    return true_final_output
def test_with_input_lengths(self):
    rnn1 = self.rnn1
    rnn2 = self.rnn2

    x = np.random.randn(12, 4, 16)
    if not self.time_major:
        x = np.transpose(x, [1, 0, 2])
    sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)

    y1, (fw_h1, bw_h1) = rnn1(x, sequence_length=sequence_length)

    seq_len = paddle.to_tensor(sequence_length)
    mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
    if self.time_major:
        mask = paddle.transpose(mask, [1, 0])
    y2, (fw_h2, bw_h2) = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
    mask = paddle.unsqueeze(mask, -1)
    y2 = paddle.multiply(y2, mask)

    np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
    np.testing.assert_allclose(fw_h1, fw_h2.numpy(), atol=1e-8, rtol=1e-5)
    np.testing.assert_allclose(bw_h1, bw_h2.numpy(), atol=1e-8, rtol=1e-5)
def forward(self, src, src_length, trg):
    # encoder
    encoder_output, encoder_final_state = self.encoder(src, src_length)

    # decoder initial states: use input_feed and the structure is
    # [[h,c] * num_layers, input_feed], consistent with DecoderCell.states
    decoder_initial_states = [
        encoder_final_state,
        self.decoder.lstm_attention.cell.get_initial_states(
            batch_ref=encoder_output, shape=[self.hidden_size])
    ]
    # attention mask to avoid paying attention to paddings
    src_mask = layers.sequence_mask(src_length,
                                    maxlen=layers.shape(src)[1],
                                    dtype=encoder_output.dtype)
    encoder_padding_mask = (src_mask - 1.0) * 1e9
    encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])

    # decoder with attention
    predict = self.decoder(trg, decoder_initial_states, encoder_output,
                           encoder_padding_mask)
    return predict
def gen_mask(valid_lengths, max_len, dtype="float32"):
    """
    Generate a mask tensor from valid lengths. Note that it returns a
    *reversed* mask: indices within the valid lengths correspond to 0, and
    those within the padding area correspond to 1.

    Assume that valid_lengths = [2, 5, 7] and max_len = 7; the generated mask is
    [[0, 0, 1, 1, 1, 1, 1],
     [0, 0, 0, 0, 0, 1, 1],
     [0, 0, 0, 0, 0, 0, 0]].

    Args:
        valid_lengths (Variable): shape(B, ), dtype: int64. A rank-1 Tensor
            containing the valid lengths (timesteps) of each example, where B
            means batch_size.
        max_len (int): The length (number of time steps) of the mask.
        dtype (str, optional): A string that specifies the data type of the
            returned mask. Defaults to 'float32'.

    Returns:
        mask (Variable): shape(B, max_len), dtype float32, a mask computed
            from valid lengths.
    """
    mask = F.sequence_mask(valid_lengths, maxlen=max_len, dtype=dtype)
    mask = 1 - mask
    return mask
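# Illustration (NumPy sketch): the reversed mask from gen_mask reproduced with plain
# NumPy, using the same example as the docstring, which makes the 0/1 convention easy
# to check outside the graph.
import numpy as np

valid_lengths = np.array([2, 5, 7])
max_len = 7
mask = (np.arange(max_len)[None, :] < valid_lengths[:, None]).astype("float32")
reversed_mask = 1 - mask
print(reversed_mask.astype(int))
# [[0 0 1 1 1 1 1]
#  [0 0 0 0 0 1 1]
#  [0 0 0 0 0 0 0]]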
def _simple_sum_encoder(self, inputs, input_len, name_lens, name_pos, name_tok_len): """forward Args: inputs (Variable): shape=[batch_size, max_seq_len, hidden_size] input_len (Variable): shape=[batch_size] name_lens (Variable): shape=[batch_size] name_pos (Variable): shape=[batch_size, max_name_len, max_tokens] name_tok_len (Variable): shape=[batch_size, max_name_len] Returns: TODO Raises: NULL """ max_name_len = name_pos.shape[1] max_name_tok_len = name_pos.shape[2] hidden_size = inputs.shape[2] name_pos_1d = layers.reshape( name_pos, shape=[-1, max_name_len * max_name_tok_len]) name_enc = nn_utils.batch_gather_2d(inputs, name_pos_1d) name_enc = layers.reshape( name_enc, shape=[-1, max_name_len, max_name_tok_len, hidden_size]) # shape = [batch_size, name_len, token_len, hidden_size] name_tok_mask = layers.sequence_mask(name_tok_len, maxlen=max_name_tok_len, dtype=name_enc.dtype) name_enc_masked = layers.elementwise_mul(name_enc, name_tok_mask, axis=0) # shape = [batch_size, name_len, hidden_size] output = layers.reduce_sum(name_enc_masked, dim=2) return output, None
def basic_lstm(input, init_hidden, init_cell, hidden_size, num_layers=1, sequence_length=None, dropout_prob=0.0, bidirectional=False, batch_first=True, param_attr=None, bias_attr=None, gate_activation=None, activation=None, forget_bias=1.0, dtype='float32', name='basic_lstm'): """ LSTM implementation using basic operators, supports multiple layers and bidirectional LSTM. .. math:: i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i) f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias ) o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o) \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c) c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t} h_t &= o_t \odot tanh(c_t) Args: input (Variable): lstm input tensor, if batch_first = False, shape should be ( seq_len x batch_size x input_size ) if batch_first = True, shape should be ( batch_size x seq_len x hidden_size ) init_hidden(Variable|None): The initial hidden state of the LSTM This is a tensor with shape ( num_layers x batch_size x hidden_size) if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) and can be reshaped to a tensor with shape ( num_layers x 2 x batch_size x hidden_size) to use. If it's None, it will be set to all 0. init_cell(Variable|None): The initial hidden state of the LSTM This is a tensor with shape ( num_layers x batch_size x hidden_size) if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) and can be reshaped to a tensor with shape ( num_layers x 2 x batch_size x hidden_size) to use. If it's None, it will be set to all 0. hidden_size (int): Hidden size of the LSTM num_layers (int): The total number of layers of the LSTM sequence_length (Variabe|None): A tensor (shape [batch_size]) stores each real length of each instance, This tensor will be convert to a mask to mask the padding ids If it's None means NO padding ids dropout_prob(float|0.0): Dropout prob, dropout ONLY work after rnn output of each layers, NOT between time steps bidirectional (bool|False): If it is bidirectional batch_first (bool|True): The shape format of the input and output tensors. If true, the shape format should be :attr:`[batch_size, seq_len, hidden_size]`. If false, the shape format should be :attr:`[seq_len, batch_size, hidden_size]`. By default this function accepts input and emits output in batch-major form to be consistent with most of data format, though a bit less efficient because of extra transposes. param_attr(ParamAttr|None): The parameter attribute for the learnable weight matrix. Note: If it is set to None or one attribute of ParamAttr, lstm_unit will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. bias_attr (ParamAttr|None): The parameter attribute for the bias of LSTM unit. If it is set to None or one attribute of ParamAttr, lstm_unit will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. gate_activation (function|None): The activation function for gates (actGate). Default: 'fluid.layers.sigmoid' activation (function|None): The activation function for cell (actNode). 
Default: 'fluid.layers.tanh' forget_bias (float|1.0) : Forget bias used to compute the forget gate dtype(string): Data type used in this unit name(string): Name used to identify parameters and biases Returns: rnn_out(Tensor), last_hidden(Tensor), last_cell(Tensor) - rnn_out is the result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) \ if is_bidirec set to True, it's shape will be ( seq_len x batch_sze x hidden_size*2) - last_hidden is the hidden state of the last step of LSTM \ with shape ( num_layers x batch_size x hidden_size ) \ if is_bidirec set to True, it's shape will be ( num_layers*2 x batch_size x hidden_size), and can be reshaped to a tensor ( num_layers x 2 x batch_size x hidden_size) to use. - last_cell is the hidden state of the last step of LSTM \ with shape ( num_layers x batch_size x hidden_size ) \ if is_bidirec set to True, it's shape will be ( num_layers*2 x batch_size x hidden_size), and can be reshaped to a tensor ( num_layers x 2 x batch_size x hidden_size) to use. Examples: .. code-block:: python import paddle.fluid.layers as layers from paddle.fluid.contrib.layers import basic_lstm batch_size = 20 input_size = 128 hidden_size = 256 num_layers = 2 dropout = 0.5 bidirectional = True batch_first = False input = layers.data( name = "input", shape = [-1, batch_size, input_size], dtype='float32') pre_hidden = layers.data( name = "pre_hidden", shape=[-1, hidden_size], dtype='float32') pre_cell = layers.data( name = "pre_cell", shape=[-1, hidden_size], dtype='float32') sequence_length = layers.data( name="sequence_length", shape=[-1], dtype='int32') rnn_out, last_hidden, last_cell = basic_lstm( input, pre_hidden, pre_cell, \ hidden_size, num_layers = num_layers, \ sequence_length = sequence_length, dropout_prob=dropout, bidirectional = bidirectional, \ batch_first = batch_first) """ fw_unit_list = [] for i in range(num_layers): new_name = name + "_layers_" + str(i) if param_attr is not None and param_attr.name is not None: layer_param_attr = copy.deepcopy(param_attr) layer_param_attr.name += "_fw_w_" + str(i) else: layer_param_attr = param_attr if bias_attr is not None and bias_attr.name is not None: layer_bias_attr = copy.deepcopy(bias_attr) layer_bias_attr.name += "_fw_b_" + str(i) else: layer_bias_attr = bias_attr fw_unit_list.append( BasicLSTMUnit(new_name, hidden_size, param_attr=layer_param_attr, bias_attr=layer_bias_attr, gate_activation=gate_activation, activation=activation, forget_bias=forget_bias, dtype=dtype)) if bidirectional: bw_unit_list = [] for i in range(num_layers): new_name = name + "_reverse_layers_" + str(i) if param_attr is not None and param_attr.name is not None: layer_param_attr = copy.deepcopy(param_attr) layer_param_attr.name += "_bw_w_" + str(i) else: layer_param_attr = param_attr if bias_attr is not None and bias_attr.name is not None: layer_bias_attr = copy.deepcopy(bias_attr) layer_bias_attr.name += "_bw_b_" + str(i) else: layer_bias_attr = param_attr bw_unit_list.append( BasicLSTMUnit(new_name, hidden_size, param_attr=layer_param_attr, bias_attr=layer_bias_attr, gate_activation=gate_activation, activation=activation, forget_bias=forget_bias, dtype=dtype)) if batch_first: input = layers.transpose(input, [1, 0, 2]) mask = None if sequence_length: max_seq_len = layers.shape(input)[0] mask = layers.sequence_mask(sequence_length, maxlen=max_seq_len, dtype='float32') mask = layers.transpose(mask, [1, 0]) direc_num = 1 if bidirectional: direc_num = 2 # convert to [num_layers, 2, batch_size, hidden_size] if init_hidden: init_hidden = 
layers.reshape( init_hidden, shape=[num_layers, direc_num, -1, hidden_size]) init_cell = layers.reshape( init_cell, shape=[num_layers, direc_num, -1, hidden_size]) # forward direction def get_single_direction_output(rnn_input, unit_list, mask=None, direc_index=0): rnn = StaticRNN() with rnn.step(): step_input = rnn.step_input(rnn_input) if mask: step_mask = rnn.step_input(mask) for i in range(num_layers): if init_hidden: pre_hidden = rnn.memory(init=init_hidden[i, direc_index]) pre_cell = rnn.memory(init=init_cell[i, direc_index]) else: pre_hidden = rnn.memory(batch_ref=rnn_input, shape=[-1, hidden_size]) pre_cell = rnn.memory(batch_ref=rnn_input, shape=[-1, hidden_size]) new_hidden, new_cell = unit_list[i](step_input, pre_hidden, pre_cell) if mask: new_hidden = layers.elementwise_mul( new_hidden, step_mask, axis=0) - layers.elementwise_mul(pre_hidden, (step_mask - 1), axis=0) new_cell = layers.elementwise_mul( new_cell, step_mask, axis=0) - layers.elementwise_mul( pre_cell, (step_mask - 1), axis=0) rnn.update_memory(pre_hidden, new_hidden) rnn.update_memory(pre_cell, new_cell) rnn.step_output(new_hidden) rnn.step_output(new_cell) step_input = new_hidden if dropout_prob != None and dropout_prob > 0.0: step_input = layers.dropout( step_input, dropout_prob=dropout_prob, dropout_implementation='upscale_in_train') rnn.step_output(step_input) rnn_out = rnn() last_hidden_array = [] last_cell_array = [] rnn_output = rnn_out[-1] for i in range(num_layers): last_hidden = rnn_out[i * 2] last_hidden = last_hidden[-1] last_hidden_array.append(last_hidden) last_cell = rnn_out[i * 2 + 1] last_cell = last_cell[-1] last_cell_array.append(last_cell) last_hidden_output = layers.concat(last_hidden_array, axis=0) last_hidden_output = layers.reshape( last_hidden_output, shape=[num_layers, -1, hidden_size]) last_cell_output = layers.concat(last_cell_array, axis=0) last_cell_output = layers.reshape(last_cell_output, shape=[num_layers, -1, hidden_size]) return rnn_output, last_hidden_output, last_cell_output # seq_len, batch_size, hidden_size fw_rnn_out, fw_last_hidden, fw_last_cell = get_single_direction_output( input, fw_unit_list, mask, direc_index=0) if bidirectional: bw_input = layers.reverse(input, axis=[0]) bw_mask = None if mask: bw_mask = layers.reverse(mask, axis=[0]) bw_rnn_out, bw_last_hidden, bw_last_cell = get_single_direction_output( bw_input, bw_unit_list, bw_mask, direc_index=1) bw_rnn_out = layers.reverse(bw_rnn_out, axis=[0]) rnn_out = layers.concat([fw_rnn_out, bw_rnn_out], axis=2) last_hidden = layers.concat([fw_last_hidden, bw_last_hidden], axis=1) last_hidden = layers.reshape( last_hidden, shape=[num_layers * direc_num, -1, hidden_size]) last_cell = layers.concat([fw_last_cell, bw_last_cell], axis=1) last_cell = layers.reshape( last_cell, shape=[num_layers * direc_num, -1, hidden_size]) if batch_first: rnn_out = layers.transpose(rnn_out, [1, 0, 2]) return rnn_out, last_hidden, last_cell else: rnn_out = fw_rnn_out last_hidden = fw_last_hidden last_cell = fw_last_cell if batch_first: rnn_out = layers.transpose(rnn_out, [1, 0, 2]) return rnn_out, last_hidden, last_cell
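# Illustration (NumPy sketch, made-up values): the step-mask update
# new * mask - pre * (mask - 1) used inside get_single_direction_output keeps the
# previous hidden state at padded time steps (mask == 0) and takes the new state at
# real time steps (mask == 1).
import numpy as np

pre_hidden = np.array([[1., 1.], [2., 2.], [3., 3.]])
new_hidden = np.array([[9., 9.], [8., 8.], [7., 7.]])
step_mask = np.array([1., 1., 0.])[:, None]   # third sample is already past its length

carried = new_hidden * step_mask - pre_hidden * (step_mask - 1.)
print(carried)
# [[9. 9.]
#  [8. 8.]
#  [3. 3.]]   <- padded sample keeps its previous state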
def basic_gru(input, init_hidden, hidden_size, num_layers=1, sequence_length=None, dropout_prob=0.0, bidirectional=False, batch_first=True, param_attr=None, bias_attr=None, gate_activation=None, activation=None, dtype='float32', name='basic_gru'): """ GRU implementation using basic operator, supports multiple layers and bidirectional gru. .. math:: u_t & = actGate(W_ux xu_{t} + W_uh h_{t-1} + b_u) r_t & = actGate(W_rx xr_{t} + W_rh h_{t-1} + b_r) m_t & = actNode(W_cx xm_t + W_ch dot(r_t, h_{t-1}) + b_m) h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t) Args: input (Variable): GRU input tensor, if batch_first = False, shape should be ( seq_len x batch_size x input_size ) if batch_first = True, shape should be ( batch_size x seq_len x hidden_size ) init_hidden(Variable|None): The initial hidden state of the GRU This is a tensor with shape ( num_layers x batch_size x hidden_size) if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) and can be reshaped to tensor with ( num_layers x 2 x batch_size x hidden_size) to use. If it's None, it will be set to all 0. hidden_size (int): Hidden size of the GRU num_layers (int): The total number of layers of the GRU sequence_length (Variabe|None): A Tensor (shape [batch_size]) stores each real length of each instance, This tensor will be convert to a mask to mask the padding ids If it's None means NO padding ids dropout_prob(float|0.0): Dropout prob, dropout ONLY works after rnn output of each layers, NOT between time steps bidirectional (bool|False): If it is bidirectional batch_first (bool|True): The shape format of the input and output tensors. If true, the shape format should be :attr:`[batch_size, seq_len, hidden_size]`. If false, the shape format should be :attr:`[seq_len, batch_size, hidden_size]`. By default this function accepts input and emits output in batch-major form to be consistent with most of data format, though a bit less efficient because of extra transposes. param_attr(ParamAttr|None): The parameter attribute for the learnable weight matrix. Note: If it is set to None or one attribute of ParamAttr, gru_unit will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. bias_attr (ParamAttr|None): The parameter attribute for the bias of GRU unit. If it is set to None or one attribute of ParamAttr, gru_unit will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. gate_activation (function|None): The activation function for gates (actGate). Default: 'fluid.layers.sigmoid' activation (function|None): The activation function for cell (actNode). Default: 'fluid.layers.tanh' dtype(string): data type used in this unit name(string): name used to identify parameters and biases Returns: rnn_out(Tensor),last_hidden(Tensor) - rnn_out is result of GRU hidden, with shape (seq_len x batch_size x hidden_size) \ if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2) - last_hidden is the hidden state of the last step of GRU \ shape is ( num_layers x batch_size x hidden_size ) \ if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size), can be reshaped to a tensor with shape( num_layers x 2 x batch_size x hidden_size) Examples: .. 
code-block:: python import paddle.fluid.layers as layers from paddle.fluid.contrib.layers import basic_gru batch_size = 20 input_size = 128 hidden_size = 256 num_layers = 2 dropout = 0.5 bidirectional = True batch_first = False input = layers.data( name = "input", shape = [-1, batch_size, input_size], dtype='float32') pre_hidden = layers.data( name = "pre_hidden", shape=[-1, hidden_size], dtype='float32') sequence_length = layers.data( name="sequence_length", shape=[-1], dtype='int32') rnn_out, last_hidden = basic_gru( input, pre_hidden, hidden_size, num_layers = num_layers, \ sequence_length = sequence_length, dropout_prob=dropout, bidirectional = bidirectional, \ batch_first = batch_first) """ fw_unit_list = [] for i in range(num_layers): new_name = name + "_layers_" + str(i) if param_attr is not None and param_attr.name is not None: layer_param_attr = copy.deepcopy(param_attr) layer_param_attr.name += "_fw_w_" + str(i) else: layer_param_attr = param_attr if bias_attr is not None and bias_attr.name is not None: layer_bias_attr = copy.deepcopy(bias_attr) layer_bias_attr.name += "_fw_b_" + str(i) else: layer_bias_attr = bias_attr fw_unit_list.append( BasicGRUUnit(new_name, hidden_size, layer_param_attr, layer_bias_attr, gate_activation, activation, dtype)) if bidirectional: bw_unit_list = [] for i in range(num_layers): new_name = name + "_reverse_layers_" + str(i) if param_attr is not None and param_attr.name is not None: layer_param_attr = copy.deepcopy(param_attr) layer_param_attr.name += "_bw_w_" + str(i) else: layer_param_attr = param_attr if bias_attr is not None and bias_attr.name is not None: layer_bias_attr = copy.deepcopy(bias_attr) layer_bias_attr.name += "_bw_b_" + str(i) else: layer_bias_attr = bias_attr bw_unit_list.append( BasicGRUUnit(new_name, hidden_size, layer_param_attr, layer_bias_attr, gate_activation, activation, dtype)) if batch_first: input = layers.transpose(input, [1, 0, 2]) mask = None if sequence_length: max_seq_len = layers.shape(input)[0] mask = layers.sequence_mask(sequence_length, maxlen=max_seq_len, dtype='float32') mask = layers.transpose(mask, [1, 0]) direc_num = 1 if bidirectional: direc_num = 2 if init_hidden: init_hidden = layers.reshape( init_hidden, shape=[num_layers, direc_num, -1, hidden_size]) def get_single_direction_output(rnn_input, unit_list, mask=None, direc_index=0): rnn = StaticRNN() with rnn.step(): step_input = rnn.step_input(rnn_input) if mask: step_mask = rnn.step_input(mask) for i in range(num_layers): if init_hidden: pre_hidden = rnn.memory(init=init_hidden[i, direc_index]) else: pre_hidden = rnn.memory(batch_ref=rnn_input, shape=[-1, hidden_size], ref_batch_dim_idx=1) new_hidden = unit_list[i](step_input, pre_hidden) if mask: new_hidden = layers.elementwise_mul( new_hidden, step_mask, axis=0) - layers.elementwise_mul(pre_hidden, (step_mask - 1), axis=0) rnn.update_memory(pre_hidden, new_hidden) rnn.step_output(new_hidden) step_input = new_hidden if dropout_prob != None and dropout_prob > 0.0: step_input = layers.dropout( step_input, dropout_prob=dropout_prob, ) rnn.step_output(step_input) rnn_out = rnn() last_hidden_array = [] rnn_output = rnn_out[-1] for i in range(num_layers): last_hidden = rnn_out[i] last_hidden = last_hidden[-1] last_hidden_array.append(last_hidden) last_hidden_output = layers.concat(last_hidden_array, axis=0) last_hidden_output = layers.reshape( last_hidden_output, shape=[num_layers, -1, hidden_size]) return rnn_output, last_hidden_output # seq_len, batch_size, hidden_size fw_rnn_out, fw_last_hidden = 
get_single_direction_output(input, fw_unit_list, mask, direc_index=0) if bidirectional: bw_input = layers.reverse(input, axis=[0]) bw_mask = None if mask: bw_mask = layers.reverse(mask, axis=[0]) bw_rnn_out, bw_last_hidden = get_single_direction_output(bw_input, bw_unit_list, bw_mask, direc_index=1) bw_rnn_out = layers.reverse(bw_rnn_out, axis=[0]) rnn_out = layers.concat([fw_rnn_out, bw_rnn_out], axis=2) last_hidden = layers.concat([fw_last_hidden, bw_last_hidden], axis=1) last_hidden = layers.reshape( last_hidden, shape=[num_layers * direc_num, -1, hidden_size]) if batch_first: rnn_out = layers.transpose(rnn_out, [1, 0, 2]) return rnn_out, last_hidden else: rnn_out = fw_rnn_out last_hidden = fw_last_hidden if batch_first: rnn_out = layers.transpose(rnn_out, [1, 0, 2]) return rnn_out, last_hidden