def attn_flow(q_enc, p_enc, p_ids_name, args):
    """Bidirectional Attention layer"""
    tag = p_ids_name + "__"
    drnn = layers.DynamicRNN()
    with drnn.block():
        h_cur = drnn.step_input(p_enc)
        u_all = drnn.static_input(q_enc)
        h_expd = layers.sequence_expand(x=h_cur, y=u_all)
        # Similarity of the current passage step against every question step.
        s_t_mul = layers.elementwise_mul(x=u_all, y=h_expd, axis=0)
        s_t_sum = layers.reduce_sum(input=s_t_mul, dim=1, keep_dim=True)
        s_t_re = layers.reshape(s_t_sum, shape=[-1, 0])
        s_t = layers.sequence_softmax(input=s_t_re)
        # Context-to-query attention: question summary for this passage step.
        u_expr = layers.elementwise_mul(x=u_all, y=s_t, axis=0)
        u_expr = layers.sequence_pool(input=u_expr, pool_type='sum')
        b_t = layers.sequence_pool(input=s_t_sum, pool_type='max')
        drnn.output(u_expr, b_t)
    U_expr, b = drnn()
    # Query-to-context attention: weight passage steps by their best match.
    b_norm = layers.sequence_softmax(input=b)
    h_expr = layers.elementwise_mul(x=p_enc, y=b_norm, axis=0)
    h_expr = layers.sequence_pool(input=h_expr, pool_type='sum')
    H_expr = layers.sequence_expand(x=h_expr, y=p_enc)
    H_expr = layers.lod_reset(x=H_expr, y=p_enc)
    h_u = layers.elementwise_mul(x=p_enc, y=U_expr, axis=0)
    h_h = layers.elementwise_mul(x=p_enc, y=H_expr, axis=0)
    g = layers.concat(input=[p_enc, U_expr, h_u, h_h], axis=1)
    return dropout(g, args)
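
# Usage sketch (hypothetical wiring, not taken from this file): attn_flow
# expects q_enc/p_enc to be LoD-level-1 sequence tensors of equal hidden
# width, e.g. outputs of the repo's bi_lstm_encoder; the repo's
# dropout(g, args) helper is stubbed here so the snippet stands alone.
import paddle.fluid as fluid
import paddle.fluid.layers as layers

def dropout(x, args):  # stub for the repo helper called by attn_flow
    return x

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    q_enc = layers.data(name='q_enc', shape=[128], dtype='float32', lod_level=1)
    p_enc = layers.data(name='p_enc', shape=[128], dtype='float32', lod_level=1)
    g = attn_flow(q_enc, p_enc, 'p_ids', args=None)  # g has width 4 * 128
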
def custom_dynamic_rnn(p_vec, init_state, decoder_size):
    context = layers.fc(input=p_vec, size=decoder_size, act=None)
    drnn = layers.DynamicRNN()
    with drnn.block():
        H_s = drnn.step_input(p_vec)  # drives one step per passage position
        ctx = drnn.static_input(context)
        c_prev = drnn.memory(init=init_state, need_reorder=True)
        m_prev = drnn.memory(init=init_state, need_reorder=True)
        m_prev1 = layers.fc(input=m_prev, size=decoder_size, act=None)
        m_prev1 = layers.sequence_expand(x=m_prev1, y=ctx)
        # Attention over all passage positions, conditioned on the last state.
        Fk = ctx + m_prev1
        Fk = layers.fc(input=Fk, size=decoder_size, act='tanh')
        logits = layers.fc(input=Fk, size=1, act=None)
        scores = layers.sequence_softmax(input=logits)
        attn_ctx = layers.elementwise_mul(x=ctx, y=scores, axis=0)
        attn_ctx = layers.sequence_pool(input=attn_ctx, pool_type='sum')
        # The LSTM consumes the pooled context and the unexpanded prev state.
        hidden_t, cell_t = lstm_step(
            attn_ctx,
            hidden_t_prev=m_prev,
            cell_t_prev=c_prev,
            size=decoder_size)
        drnn.update_memory(ex_mem=m_prev, new_mem=hidden_t)
        drnn.update_memory(ex_mem=c_prev, new_mem=cell_t)
        drnn.output(scores)
    beta = drnn()
    return beta
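
# The same attention step in plain NumPy (illustrative sketch for a single
# sequence; w_f and v are hypothetical projection parameters): score each
# passage position against the previous decoder state, softmax within the
# sequence, then pool a context vector -- the pattern the DynamicRNN body
# above builds symbolically.
import numpy as np

def pointer_step(ctx, m_prev, w_f, v):
    # ctx: [T, H] projected passage; m_prev: [H] previous hidden state
    f = np.tanh((ctx + m_prev) @ w_f)   # [T, H]
    logits = f @ v                      # [T]
    e = np.exp(logits - logits.max())
    scores = e / e.sum()                # attention over passage positions
    return scores, scores @ ctx         # weights and pooled context
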
def test_sequence_softmax(self):
    program = Program()
    with program_guard(program):
        seq_data = layers.data(
            name='seq_data', shape=[10, 10], dtype='float32', lod_level=1)
        seq = layers.fc(input=seq_data, size=20)
        self.assertIsNotNone(layers.sequence_softmax(seq))
        print(str(program))
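
# What sequence_softmax computes, as a NumPy sketch (assumed semantics:
# softmax normalized within each LoD sequence rather than over the whole
# batch). For LoD offsets [0, 2, 5], rows 0-1 and rows 2-4 are normalized
# independently.
import numpy as np

def seq_softmax(x, lod):
    out = np.empty_like(x)
    for lo, hi in zip(lod[:-1], lod[1:]):
        e = np.exp(x[lo:hi] - x[lo:hi].max())
        out[lo:hi] = e / e.sum()
    return out

print(seq_softmax(np.array([1., 2., 3., 4., 5.]), [0, 2, 5]))
# -> [0.269, 0.731, 0.090, 0.245, 0.665]
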
def static_rnn(step, p_vec=p_vec, init_state=None, para_name='', args=args):
    tag = para_name + "static_rnn_"
    ctx = layers.fc(
        input=p_vec,
        param_attr=fluid.ParamAttr(name=tag + 'context_fc_w'),
        bias_attr=fluid.ParamAttr(name=tag + 'context_fc_b'),
        size=hidden_size,
        act=None)
    beta = []
    c_prev = init_state
    m_prev = init_state
    for i in range(step):
        m_prev0 = layers.fc(
            input=m_prev,
            size=hidden_size,
            act=None,
            param_attr=fluid.ParamAttr(name=tag + 'm_prev0_fc_w'),
            bias_attr=fluid.ParamAttr(name=tag + 'm_prev0_fc_b'))
        m_prev1 = layers.sequence_expand(x=m_prev0, y=ctx)
        Fk = ctx + m_prev1
        Fk = layers.tanh(Fk)
        logits = layers.fc(
            input=Fk,
            size=1,
            act=None,
            param_attr=fluid.ParamAttr(name=tag + 'logits_fc_w'),
            bias_attr=fluid.ParamAttr(name=tag + 'logits_fc_b'))
        scores = layers.sequence_softmax(input=logits)
        attn_ctx = layers.elementwise_mul(x=p_vec, y=scores, axis=0)
        attn_ctx = layers.sequence_pool(input=attn_ctx, pool_type='sum')
        hidden_t, cell_t = lstm_step(
            attn_ctx,
            hidden_t_prev=m_prev,
            cell_t_prev=c_prev,
            size=hidden_size,
            para_name=tag,
            args=args)
        m_prev = hidden_t
        c_prev = cell_t
        beta.append(scores)
    return beta
def simple_attention(self, encoder_vec, encoder_proj, decoder_state,
                     decoder_size):
    decoder_state_proj = layers.fc(
        input=decoder_state,
        size=decoder_size,
        bias_attr=False,
        name="decoder_state_proj_fc")
    decoder_state_expand = layers.sequence_expand(
        x=decoder_state_proj, y=encoder_proj)
    concated = layers.elementwise_add(encoder_proj, decoder_state_expand)
    concated = layers.tanh(x=concated)
    attention_weights = layers.fc(
        input=concated,
        size=1,
        act=None,
        bias_attr=False,
        name="attention_weights_fc")
    attention_weights = layers.sequence_softmax(input=attention_weights)
    weights_reshape = layers.reshape(x=attention_weights, shape=[-1])
    scaled = layers.elementwise_mul(
        x=encoder_vec, y=weights_reshape, axis=0)
    context = layers.sequence_pool(input=scaled, pool_type='sum')
    return context
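
# Dense-tensor sketch of the same additive (Bahdanau-style) attention for one
# sequence in NumPy (shapes assumed: T encoder steps, width H; `v` plays the
# role of attention_weights_fc):
import numpy as np

def additive_attention(enc_vec, enc_proj, dec_state_proj, v):
    scores = np.tanh(enc_proj + dec_state_proj) @ v   # [T]
    e = np.exp(scores - scores.max())
    weights = e / e.sum()                             # sequence softmax
    return weights @ enc_vec                          # pooled context [H]

T, H = 4, 8
rng = np.random.default_rng(0)
context = additive_attention(rng.random((T, H)), rng.random((T, H)),
                             rng.random(H), rng.random(H))
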
def point_network_decoder(p_vec, q_vec, hidden_size, args):
    """Output layer - pointer network"""
    tag = 'pn_decoder_'
    init_random = fluid.initializer.Normal(loc=0.0, scale=1.0)

    random_attn = layers.create_parameter(
        shape=[1, hidden_size],
        dtype='float32',
        default_initializer=init_random)
    random_attn = layers.fc(
        input=random_attn,
        size=hidden_size,
        act=None,
        param_attr=fluid.ParamAttr(name=tag + 'random_attn_fc_w'),
        bias_attr=fluid.ParamAttr(name=tag + 'random_attn_fc_b'))
    random_attn = layers.reshape(random_attn, shape=[-1])
    U = layers.fc(input=q_vec,
                  param_attr=fluid.ParamAttr(name=tag + 'q_vec_fc_w'),
                  bias_attr=False,
                  size=hidden_size,
                  act=None) + random_attn
    U = layers.tanh(U)
    logits = layers.fc(input=U,
                       param_attr=fluid.ParamAttr(name=tag + 'logits_fc_w'),
                       bias_attr=fluid.ParamAttr(name=tag + 'logits_fc_b'),
                       size=1,
                       act=None)
    scores = layers.sequence_softmax(input=logits)
    pooled_vec = layers.elementwise_mul(x=q_vec, y=scores, axis=0)
    pooled_vec = layers.sequence_pool(input=pooled_vec, pool_type='sum')

    init_state = layers.fc(
        input=pooled_vec,
        param_attr=fluid.ParamAttr(name=tag + 'init_state_fc_w'),
        bias_attr=fluid.ParamAttr(name=tag + 'init_state_fc_b'),
        size=hidden_size,
        act=None)

    def custom_dynamic_rnn(p_vec, init_state, hidden_size, para_name, args):
        tag = para_name + "custom_dynamic_rnn_"

        def static_rnn(step,
                       p_vec=p_vec,
                       init_state=None,
                       para_name='',
                       args=args):
            tag = para_name + "static_rnn_"
            ctx = layers.fc(
                input=p_vec,
                param_attr=fluid.ParamAttr(name=tag + 'context_fc_w'),
                bias_attr=fluid.ParamAttr(name=tag + 'context_fc_b'),
                size=hidden_size,
                act=None)
            beta = []
            c_prev = init_state
            m_prev = init_state
            for i in range(step):
                m_prev0 = layers.fc(
                    input=m_prev,
                    size=hidden_size,
                    act=None,
                    param_attr=fluid.ParamAttr(name=tag + 'm_prev0_fc_w'),
                    bias_attr=fluid.ParamAttr(name=tag + 'm_prev0_fc_b'))
                m_prev1 = layers.sequence_expand(x=m_prev0, y=ctx)
                Fk = ctx + m_prev1
                Fk = layers.tanh(Fk)
                logits = layers.fc(
                    input=Fk,
                    size=1,
                    act=None,
                    param_attr=fluid.ParamAttr(name=tag + 'logits_fc_w'),
                    bias_attr=fluid.ParamAttr(name=tag + 'logits_fc_b'))
                scores = layers.sequence_softmax(input=logits)
                attn_ctx = layers.elementwise_mul(x=p_vec, y=scores, axis=0)
                attn_ctx = layers.sequence_pool(input=attn_ctx, pool_type='sum')
                hidden_t, cell_t = lstm_step(
                    attn_ctx,
                    hidden_t_prev=m_prev,
                    cell_t_prev=c_prev,
                    size=hidden_size,
                    para_name=tag,
                    args=args)
                m_prev = hidden_t
                c_prev = cell_t
                beta.append(scores)
            return beta

        return static_rnn(
            2, p_vec=p_vec, init_state=init_state, para_name=para_name)

    fw_outputs = custom_dynamic_rnn(p_vec, init_state, hidden_size,
                                    tag + "fw_", args)
    bw_outputs = custom_dynamic_rnn(p_vec, init_state, hidden_size,
                                    tag + "bw_", args)

    start_prob = layers.elementwise_add(
        x=fw_outputs[0], y=bw_outputs[1], axis=0) / 2
    end_prob = layers.elementwise_add(
        x=fw_outputs[1], y=bw_outputs[0], axis=0) / 2

    return start_prob, end_prob
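
# Decoding-time sketch (hypothetical post-processing, not part of this file):
# once start_prob/end_prob are fetched as dense arrays for one passage, the
# answer span is usually the (i, j) maximizing start_prob[i] * end_prob[j]
# subject to i <= j.
def best_span(start_prob, end_prob, max_span_len=None):
    best, span = -1.0, (0, 0)
    for i, p_s in enumerate(start_prob):
        hi = len(end_prob) if max_span_len is None else min(
            len(end_prob), i + max_span_len)
        for j in range(i, hi):
            if p_s * end_prob[j] > best:
                best, span = p_s * end_prob[j], (i, j)
    return span
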
def attn_flow(q_enc, p_enc, p_ids_name):
    # Note: `args`, `embedding_dim` and `bi_lstm_encoder` come from the
    # enclosing scope (see the bidaf model below).
    tag = p_ids_name + "::"
    # Context-to-query attention: per passage step, a softmax-weighted
    # summary of the question.
    drnn = layers.DynamicRNN()
    with drnn.block():
        h_cur = drnn.step_input(p_enc)
        u_all = drnn.static_input(q_enc)
        h_expd = layers.sequence_expand(x=h_cur, y=u_all)
        s_t_ = layers.elementwise_mul(x=u_all, y=h_expd, axis=0)
        s_t1 = layers.reduce_sum(input=s_t_, dim=1)
        s_t = layers.sequence_softmax(input=s_t1)
        u_expr = layers.elementwise_mul(x=u_all, y=s_t, axis=0)
        u_expr = layers.sequence_pool(input=u_expr, pool_type='sum')
        drnn.output(u_expr)
    U_expr = drnn()

    # Query-to-context attention: max-pooled similarity per passage step.
    drnn2 = layers.DynamicRNN()
    with drnn2.block():
        h_cur = drnn2.step_input(p_enc)
        u_all = drnn2.static_input(q_enc)
        h_expd = layers.sequence_expand(x=h_cur, y=u_all)
        s_t_ = layers.elementwise_mul(x=u_all, y=h_expd, axis=0)
        s_t2 = layers.reduce_sum(input=s_t_, dim=1, keep_dim=True)
        b_t = layers.sequence_pool(input=s_t2, pool_type='max')
        drnn2.output(b_t)
    b = drnn2()

    b_norm = layers.sequence_softmax(input=b)
    h_expr = layers.elementwise_mul(x=p_enc, y=b_norm, axis=0)
    h_expr = layers.sequence_pool(input=h_expr, pool_type='sum')
    H_expr = layers.sequence_expand(x=h_expr, y=p_enc)
    H_expr = layers.lod_reset(x=H_expr, y=p_enc)
    h_u = layers.elementwise_mul(x=H_expr, y=U_expr, axis=0)
    h_h = layers.elementwise_mul(x=H_expr, y=p_enc, axis=0)
    g = layers.concat(input=[H_expr, U_expr, h_u, h_h], axis=1)

    # fusion
    m = bi_lstm_encoder(input_seq=g, gate_size=embedding_dim)

    if args.debug:
        layers.Print(U_expr, message=tag + 'U_expr')
        layers.Print(H_expr, message=tag + 'H_expr')
        layers.Print(b, message=tag + 'b')
        layers.Print(b_norm, message=tag + 'b_norm')
        layers.Print(g, message=tag + 'g')
        layers.Print(m, message=tag + 'm')
        layers.Print(h_h, message=tag + 'h_h')
        layers.Print(q_enc, message=tag + 'q_enc')
        layers.Print(p_enc, message=tag + 'p_enc')
    return m, g
def bidaf(embedding_dim, encoder_size, decoder_size, source_dict_dim,
          target_dict_dim, max_length, args):
    def bi_lstm_encoder(input_seq, gate_size):
        # A bi-directional lstm encoder implementation.
        # Linear transformation part for input gate, output gate, forget gate
        # and cell activation vectors need be done outside of dynamic_lstm.
        # So the output size is 4 times of gate_size.
        input_forward_proj = layers.fc(
            input=input_seq, size=gate_size * 4, act='tanh', bias_attr=False)
        forward, _ = layers.dynamic_lstm(
            input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
        input_reversed_proj = layers.fc(
            input=input_seq, size=gate_size * 4, act='tanh', bias_attr=False)
        reversed, _ = layers.dynamic_lstm(
            input=input_reversed_proj,
            size=gate_size * 4,
            is_reverse=True,
            use_peepholes=False)
        encoder_out = layers.concat(input=[forward, reversed], axis=1)
        return encoder_out

    def encoder(input_name):
        input_ids = layers.data(
            name=input_name, shape=[1], dtype='int64', lod_level=1)
        input_embedding = layers.embedding(
            input=input_ids,
            size=[source_dict_dim, embedding_dim],
            dtype='float32',
            is_sparse=True)
        encoder_out = bi_lstm_encoder(
            input_seq=input_embedding, gate_size=embedding_dim)
        return encoder_out

    def attn_flow(q_enc, p_enc, p_ids_name):
        tag = p_ids_name + "::"
        # Context-to-query attention: per passage step, a softmax-weighted
        # summary of the question.
        drnn = layers.DynamicRNN()
        with drnn.block():
            h_cur = drnn.step_input(p_enc)
            u_all = drnn.static_input(q_enc)
            h_expd = layers.sequence_expand(x=h_cur, y=u_all)
            s_t_ = layers.elementwise_mul(x=u_all, y=h_expd, axis=0)
            s_t1 = layers.reduce_sum(input=s_t_, dim=1)
            s_t = layers.sequence_softmax(input=s_t1)
            u_expr = layers.elementwise_mul(x=u_all, y=s_t, axis=0)
            u_expr = layers.sequence_pool(input=u_expr, pool_type='sum')
            drnn.output(u_expr)
        U_expr = drnn()

        # Query-to-context attention: max-pooled similarity per passage step.
        drnn2 = layers.DynamicRNN()
        with drnn2.block():
            h_cur = drnn2.step_input(p_enc)
            u_all = drnn2.static_input(q_enc)
            h_expd = layers.sequence_expand(x=h_cur, y=u_all)
            s_t_ = layers.elementwise_mul(x=u_all, y=h_expd, axis=0)
            s_t2 = layers.reduce_sum(input=s_t_, dim=1, keep_dim=True)
            b_t = layers.sequence_pool(input=s_t2, pool_type='max')
            drnn2.output(b_t)
        b = drnn2()

        b_norm = layers.sequence_softmax(input=b)
        h_expr = layers.elementwise_mul(x=p_enc, y=b_norm, axis=0)
        h_expr = layers.sequence_pool(input=h_expr, pool_type='sum')
        H_expr = layers.sequence_expand(x=h_expr, y=p_enc)
        H_expr = layers.lod_reset(x=H_expr, y=p_enc)
        h_u = layers.elementwise_mul(x=H_expr, y=U_expr, axis=0)
        h_h = layers.elementwise_mul(x=H_expr, y=p_enc, axis=0)
        g = layers.concat(input=[H_expr, U_expr, h_u, h_h], axis=1)

        # fusion
        m = bi_lstm_encoder(input_seq=g, gate_size=embedding_dim)

        if args.debug:
            layers.Print(U_expr, message=tag + 'U_expr')
            layers.Print(H_expr, message=tag + 'H_expr')
            layers.Print(b, message=tag + 'b')
            layers.Print(b_norm, message=tag + 'b_norm')
            layers.Print(g, message=tag + 'g')
            layers.Print(m, message=tag + 'm')
            layers.Print(h_h, message=tag + 'h_h')
            layers.Print(q_enc, message=tag + 'q_enc')
            layers.Print(p_enc, message=tag + 'p_enc')
        return m, g

    def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
        def linear(inputs):
            return layers.fc(input=inputs, size=size, bias_attr=True)

        # Each gate takes its own linear projection of [h_{t-1}; x_t].
        forget_gate = layers.sigmoid(x=linear([hidden_t_prev, x_t]))
        input_gate = layers.sigmoid(x=linear([hidden_t_prev, x_t]))
        output_gate = layers.sigmoid(x=linear([hidden_t_prev, x_t]))
        cell_tilde = layers.tanh(x=linear([hidden_t_prev, x_t]))

        cell_t = layers.sums(input=[
            layers.elementwise_mul(x=forget_gate, y=cell_t_prev),
            layers.elementwise_mul(x=input_gate, y=cell_tilde)
        ])
        hidden_t = layers.elementwise_mul(
            x=output_gate, y=layers.tanh(x=cell_t))
        return hidden_t, cell_t

    # point network
    def point_network_decoder(p_vec, q_vec, decoder_size):
        random_attn = layers.gaussian_random(shape=[1, decoder_size])
        random_attn = layers.sequence_expand(x=random_attn, y=q_vec)
        random_attn = layers.fc(input=random_attn, size=decoder_size, act=None)
        U = layers.fc(input=q_vec, size=decoder_size, act=None) + random_attn
        U = layers.tanh(U)
        logits = layers.fc(input=U, size=1, act=None)
        scores = layers.sequence_softmax(input=logits)
        pooled_vec = layers.elementwise_mul(x=q_vec, y=scores, axis=0)
        pooled_vec = layers.sequence_pool(input=pooled_vec, pool_type='sum')
        init_state = layers.fc(input=pooled_vec, size=decoder_size, act=None)

        def custom_dynamic_rnn(p_vec, init_state, decoder_size):
            context = layers.fc(input=p_vec, size=decoder_size, act=None)
            drnn = layers.DynamicRNN()
            with drnn.block():
                H_s = drnn.step_input(p_vec)  # one step per passage position
                ctx = drnn.static_input(context)
                c_prev = drnn.memory(init=init_state, need_reorder=True)
                m_prev = drnn.memory(init=init_state, need_reorder=True)
                m_prev1 = layers.fc(input=m_prev, size=decoder_size, act=None)
                m_prev1 = layers.sequence_expand(x=m_prev1, y=ctx)
                Fk = ctx + m_prev1
                Fk = layers.fc(input=Fk, size=decoder_size, act='tanh')
                logits = layers.fc(input=Fk, size=1, act=None)
                scores = layers.sequence_softmax(input=logits)
                attn_ctx = layers.elementwise_mul(x=ctx, y=scores, axis=0)
                attn_ctx = layers.sequence_pool(input=attn_ctx, pool_type='sum')
                hidden_t, cell_t = lstm_step(
                    attn_ctx,
                    hidden_t_prev=m_prev,
                    cell_t_prev=c_prev,
                    size=decoder_size)
                drnn.update_memory(ex_mem=m_prev, new_mem=hidden_t)
                drnn.update_memory(ex_mem=c_prev, new_mem=cell_t)
                drnn.output(scores)
            beta = drnn()
            return beta

        fw_outputs = custom_dynamic_rnn(p_vec, init_state, decoder_size)
        bw_outputs = custom_dynamic_rnn(p_vec, init_state, decoder_size)

        def sequence_slice(x, index):
            # Read step `index` of each sequence out through a LoD tensor array.
            idx = layers.fill_constant(shape=[1], value=index, dtype='int32')
            idx.stop_gradient = True
            from paddle.fluid.layers.control_flow import lod_rank_table
            from paddle.fluid.layers.control_flow import lod_tensor_to_array
            from paddle.fluid.layers.control_flow import array_read
            from paddle.fluid.layers.control_flow import array_to_lod_tensor
            table = lod_rank_table(x, level=0)
            table.stop_gradient = True
            array = lod_tensor_to_array(x, table)
            slice_array = array_read(array=array, i=idx)
            return array_to_lod_tensor(slice_array, table)

        # Average the two decoding passes to get the boundary distributions.
        start_prob = layers.elementwise_add(
            x=sequence_slice(fw_outputs, 0),
            y=sequence_slice(bw_outputs, 1),
            axis=0) / 2
        end_prob = layers.elementwise_add(
            x=sequence_slice(fw_outputs, 1),
            y=sequence_slice(bw_outputs, 0),
            axis=0) / 2
        return start_prob, end_prob

    q_enc = encoder('q_ids')
    if args.single_doc:
        p_enc = encoder('p_ids')
        m, g = attn_flow(q_enc, p_enc, 'p_ids')
    else:
        p_ids_names = []
        ms = []
        gs = []
        for i in range(args.doc_num):
            p_ids_name = "pids_%d" % i
            p_ids_names.append(p_ids_name)
            p_enc = encoder(p_ids_name)
            m_i, g_i = attn_flow(q_enc, p_enc, p_ids_name)
            ms.append(m_i)
            gs.append(g_i)
        m = layers.sequence_concat(x=ms, axis=0)
        g = layers.sequence_concat(x=gs, axis=0)

    if args.simple_decode:
        m2 = bi_lstm_encoder(input_seq=m, gate_size=embedding_dim)
        gm1 = layers.concat(input=[g, m], axis=1)
        gm2 = layers.concat(input=[g, m2], axis=1)
        # Size-1 logits per step, normalized over each sequence below.
        start_prob = layers.fc(input=gm1, size=1, act=None)
        end_prob = layers.fc(input=gm2, size=1, act=None)
        start_prob = layers.sequence_softmax(start_prob)
        end_prob = layers.sequence_softmax(end_prob)
    else:
        p_vec = m  # m is already a single concatenated sequence
        q_vec = bi_lstm_encoder(input_seq=q_enc, gate_size=embedding_dim)
        start_prob, end_prob = point_network_decoder(
            p_vec=p_vec, q_vec=q_vec, decoder_size=decoder_size)

    pred = layers.concat(input=[start_prob, end_prob], axis=0)

    start_labels = layers.data(
        name="start_labels", shape=[1], dtype='float32', lod_level=1)
    end_labels = layers.data(
        name="end_labels", shape=[1], dtype='float32', lod_level=1)
    label = layers.concat(input=[start_labels, end_labels], axis=0)
    label.stop_gradient = True

    # compute loss
    cost = layers.cross_entropy(input=pred, label=label, soft_label=True)
    cost = layers.reduce_sum(cost) / args.batch_size

    if args.debug:
        layers.Print(pred, message='pred')
        layers.Print(label, message='label')
        layers.Print(start_labels, message='start_labels')
        layers.Print(cost, message='cost')

    if args.single_doc:
        feeding_list = ['q_ids', 'start_labels', 'end_labels', 'p_ids']
    else:
        feeding_list = ['q_ids', 'start_labels', 'end_labels'] + p_ids_names
    return cost, feeding_list
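
# Hypothetical training wiring (sketch): bidaf returns the loss plus the feed
# order, so training is the usual Fluid loop. The optimizer, learning rate,
# and data reader here are assumptions, not from this file.
import paddle.fluid as fluid

cost, feeding_list = bidaf(embedding_dim, encoder_size, decoder_size,
                           source_dict_dim, target_dict_dim, max_length, args)
fluid.optimizer.Adam(learning_rate=1e-3).minimize(cost)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
# for batch in reader():  # each batch: LoD tensors in feeding_list order
#     loss, = exe.run(fluid.default_main_program(),
#                     feed=dict(zip(feeding_list, batch)),
#                     fetch_list=[cost])
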
def point_network_decoder(p_vec, q_vec, decoder_size):
    random_attn = layers.gaussian_random(shape=[1, decoder_size])
    random_attn = layers.sequence_expand(x=random_attn, y=q_vec)
    random_attn = layers.fc(input=random_attn, size=decoder_size, act=None)
    U = layers.fc(input=q_vec, size=decoder_size, act=None) + random_attn
    U = layers.tanh(U)
    logits = layers.fc(input=U, size=1, act=None)
    scores = layers.sequence_softmax(input=logits)
    pooled_vec = layers.elementwise_mul(x=q_vec, y=scores, axis=0)
    pooled_vec = layers.sequence_pool(input=pooled_vec, pool_type='sum')
    init_state = layers.fc(input=pooled_vec, size=decoder_size, act=None)

    def custom_dynamic_rnn(p_vec, init_state, decoder_size):
        context = layers.fc(input=p_vec, size=decoder_size, act=None)
        drnn = layers.DynamicRNN()
        with drnn.block():
            H_s = drnn.step_input(p_vec)  # one step per passage position
            ctx = drnn.static_input(context)
            c_prev = drnn.memory(init=init_state, need_reorder=True)
            m_prev = drnn.memory(init=init_state, need_reorder=True)
            m_prev1 = layers.fc(input=m_prev, size=decoder_size, act=None)
            m_prev1 = layers.sequence_expand(x=m_prev1, y=ctx)
            Fk = ctx + m_prev1
            Fk = layers.fc(input=Fk, size=decoder_size, act='tanh')
            logits = layers.fc(input=Fk, size=1, act=None)
            scores = layers.sequence_softmax(input=logits)
            attn_ctx = layers.elementwise_mul(x=ctx, y=scores, axis=0)
            attn_ctx = layers.sequence_pool(input=attn_ctx, pool_type='sum')
            hidden_t, cell_t = lstm_step(
                attn_ctx,
                hidden_t_prev=m_prev,
                cell_t_prev=c_prev,
                size=decoder_size)
            drnn.update_memory(ex_mem=m_prev, new_mem=hidden_t)
            drnn.update_memory(ex_mem=c_prev, new_mem=cell_t)
            drnn.output(scores)
        beta = drnn()
        return beta

    fw_outputs = custom_dynamic_rnn(p_vec, init_state, decoder_size)
    bw_outputs = custom_dynamic_rnn(p_vec, init_state, decoder_size)

    def sequence_slice(x, index):
        # Read step `index` of each sequence out through a LoD tensor array.
        idx = layers.fill_constant(shape=[1], value=index, dtype='int32')
        idx.stop_gradient = True
        from paddle.fluid.layers.control_flow import lod_rank_table
        from paddle.fluid.layers.control_flow import lod_tensor_to_array
        from paddle.fluid.layers.control_flow import array_read
        from paddle.fluid.layers.control_flow import array_to_lod_tensor
        table = lod_rank_table(x, level=0)
        table.stop_gradient = True
        array = lod_tensor_to_array(x, table)
        slice_array = array_read(array=array, i=idx)
        return array_to_lod_tensor(slice_array, table)

    # Average the two decoding passes to get the boundary distributions.
    start_prob = layers.elementwise_add(
        x=sequence_slice(fw_outputs, 0),
        y=sequence_slice(bw_outputs, 1),
        axis=0) / 2
    end_prob = layers.elementwise_add(
        x=sequence_slice(fw_outputs, 1),
        y=sequence_slice(bw_outputs, 0),
        axis=0) / 2
    return start_prob, end_prob