def forward(self, features):
    """Score a (title, comment) pair with a two-tower bag-of-words model.

    Args:
        features: tuple (title_ids, comment_ids) of int64 id tensors;
            pad id is assumed to be 0.

    Returns:
        Sigmoid probabilities in PREDICT mode, otherwise the raw scaled
        dot-product score, shape [batch, 1].
    """
    def FC(inputs, name, i, act):
        # One fully-connected layer; parameters are named per tower
        # ('title'/'comment') and per layer index i.
        return L.fc(inputs, self.hidden_size, act=act,
                    param_attr=F.ParamAttr(
                        name='%s.fc.w_%d' % (name, i),
                        initializer=F.initializer.XavierInitializer(
                            fan_in=self.hidden_size,
                            fan_out=self.hidden_size)),
                    bias_attr=F.ParamAttr(
                        name='%s.fc.b_%d' % (name, i),
                        initializer=F.initializer.Constant(0.)))

    title_ids, comment_ids = features
    # Both towers share a single embedding table named 'emb'.
    embedding_attr = F.ParamAttr(
        name='emb',
        initializer=F.initializer.XavierInitializer(
            fan_in=self.vocab_size, fan_out=self.embedding_size))
    title_encoded = L.embedding(
        title_ids, [self.vocab_size, self.embedding_size],
        param_attr=embedding_attr)
    comment_encoded = L.embedding(
        comment_ids, [self.vocab_size, self.embedding_size],
        param_attr=embedding_attr)
    # Vsum: masked sum over the sequence axis, then softsign.
    zero = L.fill_constant(shape=[1], dtype='int64', value=0)
    # Mask is 1.0 for real tokens, 0.0 for pads (pad id == 0).
    # NOTE(review): assumes the ids are shaped so the mask broadcasts
    # over the embedding dim (e.g. [B, seq, 1]) — confirm with the reader.
    title_pad = L.cast(L.logical_not(L.equal(title_ids, zero)), 'float32')
    comment_pad = L.cast(L.logical_not(L.equal(comment_ids, zero)), 'float32')
    title_encoded = L.reduce_sum(title_encoded * title_pad, dim=1)
    title_encoded = L.softsign(title_encoded)
    comment_encoded = L.reduce_sum(comment_encoded * comment_pad, dim=1)
    comment_encoded = L.softsign(comment_encoded)
    for i in range(self.num_layers):
        title_encoded = FC(title_encoded, 'title', i, 'tanh')
    for i in range(self.num_layers):
        comment_encoded = FC(comment_encoded, 'comment', i, 'tanh')
    # Scaled dot product between the two tower outputs.
    score = L.reduce_sum(
        title_encoded * comment_encoded, dim=1,
        keep_dim=True) / np.sqrt(self.hidden_size)
    if self.mode is propeller.RunMode.PREDICT:
        probs = L.sigmoid(score)
        return probs
    else:
        return score
def forward(self, pred, target):
    """Batch-all style triplet loss over pairwise L2 distances.

    Args:
        pred: embedding tensor, shape [batch, dim].
        target: label tensor; only target[:, 0] is used, flipped
            (1 - target) before building positive/negative pair masks.

    Returns:
        Scalar loss, averaged over the number of active triplets.
    """
    target = 1 - target[:, 0]
    batch_size, vector_size = pred.shape[0], pred.shape[1]
    # Pairwise Euclidean distances of the L2-normalized embeddings:
    # d^2(i, j) = |i|^2 + |j|^2 - 2 i.j
    pred = L.l2_normalize(pred, axis=1, epsilon=1e-10)
    square_norm = L.reduce_sum(L.square(pred), dim=1)
    dist = L.elementwise_add(-2.0 * L.matmul(pred, pred, transpose_y=True),
                             square_norm,
                             axis=0)
    dist = L.elementwise_add(dist, square_norm, axis=1)
    # Clamp tiny float-error negatives before sqrt.
    dist = L.elementwise_max(dist, L.zeros_like(dist))
    dist = L.sqrt(dist)
    # ap_dist: [n, n, 1] anchor-positive; an_dist: [n, 1, n] anchor-negative.
    ap_dist = L.reshape(dist, (0, 0, 1))
    an_dist = L.reshape(dist, (0, 1, -1))
    # loss[i, j, k] = d(i, j) - d(i, k) + margin
    # NOTE(review): 'magin' looks like a typo for 'margin' in the
    # attribute defined elsewhere — confirm before renaming.
    loss = L.expand(ap_dist, (1, 1, batch_size)) - L.expand(
        an_dist, (1, batch_size, 1)) + self.magin
    # Exclude the diagonal (i == j pairs).
    indice_equal = L.diag(
        L.fill_constant((batch_size, ), dtype='float32', value=1.0))
    indice_not_equal = 1.0 - indice_equal
    # broad_matrix[i, j] = flipped_label[i] + flipped_label[j]:
    # 0 -> both positive-class, 1 -> mixed pair.
    broad_matrix = L.expand(L.reshape(target, (-1, 1)),
                            (1, batch_size)) + L.expand(
                                L.reshape(target, (1, -1)), (batch_size, 1))
    pp = L.cast(L.equal(broad_matrix, L.zeros_like(broad_matrix)),
                dtype='float32')
    pp = L.reshape(indice_not_equal * pp, (0, 0, 1))
    pn = L.cast(L.equal(broad_matrix,
                        L.zeros_like(broad_matrix) + 1),
                dtype='float32')
    # NOTE(review): pp is reshaped with (0, 0, 1) but pn with (1, 0, -1);
    # confirm the asymmetry is intentional.
    pn = L.reshape(indice_not_equal * pn, (1, 0, -1))
    # apn selects valid (anchor, positive, negative) triplets.
    apn = L.expand(pp, (1, 1, batch_size)) * L.expand(pn,
                                                      (batch_size, 1, 1))
    loss = loss * L.cast(apn, dtype='float32')
    loss = L.elementwise_max(loss, L.zeros_like(loss))
    # Average only over triplets still violating the margin.
    num_tri = L.reduce_sum(
        L.cast(L.greater_than(loss, L.zeros_like(loss)), dtype='float32'))
    loss = L.reduce_sum(loss) * self.loss_weight / (num_tri + 1e-16)
    return loss
def _grammar_step(self, logits, next_cell_states, decode_states, actions, gmr_mask):
    """Perform one grammar-constrained beam-search decoding step.

    Args:
        logits (Variable): shape = [batch_size, beam_size, vocab_size]
        next_cell_states (Variable): NULL
        decode_states (StateWrapper): NULL
        actions (Variable): grammar actions for this step.
        gmr_mask (Variable): grammar mask restricting legal tokens.

    Returns:
        (OutputWrapper, StateWrapper): this step's top-k output and the
        updated decoding states.

    Raises:
        NULL
    """
    # Restrict logits to grammar-legal tokens only.
    logits, valid_table_mask = self._output_layer(
        logits, actions, gmr_mask, decode_states.valid_table_mask)

    # Cache vocab size, both as a python int and as a tensor.
    self._vocab_size = logits.shape[-1]
    self._vocab_size_tensor = layers.fill_constant(
        shape=[1], dtype='int64', value=logits.shape[-1])

    # Log-probs; finished beams are masked so they cannot grow further.
    step_log_probs = layers.log(layers.softmax(logits))
    step_log_probs = self._mask_finished_probs(step_log_probs,
                                               decode_states.finished)

    scores = layers.reshape(step_log_probs,
                            [-1, self._beam_size * self._vocab_size])
    topk_scores, topk_indices = layers.topk(input=scores, k=self._beam_size)
    topk_scores = layers.reshape(topk_scores, shape=[-1])
    topk_indices = layers.reshape(topk_indices, shape=[-1])

    # Beam each top-k entry came from.
    beam_indices = layers.elementwise_floordiv(topk_indices,
                                               self._vocab_size_tensor)
    # Token id of each top-k entry.
    token_indices = layers.elementwise_mod(topk_indices,
                                           self._vocab_size_tensor)

    # Re-gather step_log_probs according to the top-k sources.
    next_log_probs = nn_utils.batch_gather(
        layers.reshape(step_log_probs,
                       [-1, self._beam_size * self._vocab_size]),
        topk_indices)

    def _beam_gather(x, beam_indices):
        """reshape x to beam dim, and gather each beam_indices

        Args:
            x (TYPE): NULL

        Returns:
            Variable
        """
        x = self.split_batch_beams(x)
        return nn_utils.batch_gather(x, beam_indices)

    next_cell_states = layers.utils.map_structure(
        lambda x: _beam_gather(x, beam_indices), next_cell_states)
    next_finished = _beam_gather(decode_states.finished, beam_indices)
    next_lens = _beam_gather(decode_states.lengths, beam_indices)

    # Length grows by 1 only for beams that are not yet finished.
    next_lens = layers.elementwise_add(
        next_lens,
        layers.cast(layers.logical_not(next_finished), next_lens.dtype))
    next_finished = layers.logical_or(
        next_finished, layers.equal(token_indices, self._end_token_tensor))

    decode_output = OutputWrapper(topk_scores, token_indices, beam_indices)
    decode_states = StateWrapper(next_cell_states, next_log_probs,
                                 next_finished, next_lens, valid_table_mask)

    return decode_output, decode_states
def build_position_ids(src_ids, dst_ids):
    """Build position ids for a src/dst pair.

    Src positions run 0..src_seqlen-1; dst positions continue from
    src_seqlen but are shifted left by the number of pad tokens in src
    (pad id assumed to be 0), so dst positions stay contiguous.
    """
    shape = L.shape(src_ids)
    batch, src_len = shape[0], shape[1]
    dst_len = src_len - 1  # without cls

    def _tiled_range(start, stop, length):
        # [1, length, 1] id ramp, tiled to [batch, length, 1].
        ramp = L.reshape(
            L.range(start, stop, 1, dtype='int32'), [1, length, 1],
            inplace=True)
        return L.expand(ramp, [batch, 1, 1])

    src_position_ids = _tiled_range(0, src_len, src_len)

    # Per-example pad count (pad id == 0) -> [B, 1, 1].
    zero = L.fill_constant([1], dtype='int64', value=0)
    pad_flag = L.cast(L.equal(src_ids, zero), "int32")
    src_pad_len = L.reduce_sum(pad_flag, 1, keep_dim=True)

    dst_position_ids = _tiled_range(src_len, src_len + dst_len, dst_len)
    dst_position_ids = dst_position_ids - src_pad_len  # [B, slot_seqlen, 1]

    position_ids = L.concat([src_position_ids, dst_position_ids], 1)
    position_ids = L.cast(position_ids, 'int64')
    position_ids.stop_gradient = True
    return position_ids
def __init__(self, label, pred):
    """Element-wise equality metric.

    Args:
        label: ground-truth tensor.
        pred: prediction tensor; must have the same shape as ``label``.

    Raises:
        ValueError: if the two shapes differ.
    """
    if label.shape != pred.shape:
        # Bug fix: the message promises shapes but the original formatted
        # the whole tensors' repr; report the shapes themselves.
        raise ValueError(
            'expect label shape == pred shape, got: label.shape=%s, pred.shape = %s'
            % (repr(label.shape), repr(pred.shape)))
    self.eq = L.equal(pred, label)
    self.reset()
def grow_top_k(step_idx, alive_seq, alive_log_prob, parant_idx):
    """Expand every alive beam by one token and take the new top-k.

    Args:
        step_idx: current decode step (tensor).
        alive_seq: token ids decoded so far, one row per beam.
        alive_log_prob: cumulative log-prob of each alive beam.
        parant_idx: parent beam index of each beam (unused here).

    Returns:
        Tuple (full token sequences, top-k log probs, top-k scores,
        finished flags, selected beam indices, generated ids,
        decoder attention output, new hidden states, new cell states).
    """
    pre_ids = alive_seq
    dec_step_emb = layers.embedding(
        input=pre_ids,
        size=[self.tar_vocab_size, self.hidden_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(
            name='target_embedding',
            initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale, high=self.init_scale)))

    dec_att_out, new_hidden_array, new_cell_array = decoder_step(
        dec_step_emb, pre_feed, pre_hidden_array, pre_cell_array,
        enc_memory)

    projection = layers.matmul(dec_att_out, softmax_weight)
    logits = layers.softmax(projection)
    current_log = layers.elementwise_add(x=layers.log(logits),
                                         y=alive_log_prob,
                                         axis=0)
    # GNMT length penalty: ((step + 6) / 6) ** alpha.
    base_1 = layers.cast(step_idx, 'float32') + 6.0
    base_1 /= 6.0
    length_penalty = layers.pow(base_1, alpha)
    # Fix: removed an unused duplicate 'len_pen' computation of the same
    # penalty that was never read.

    current_log = layers.reshape(current_log, shape=[1, -1])
    current_log = current_log / length_penalty
    topk_scores, topk_indices = layers.topk(input=current_log, k=beam_size)
    topk_scores = layers.reshape(topk_scores, shape=[-1])

    # Undo the penalty so log-probs stay comparable across steps.
    topk_log_probs = topk_scores * length_penalty

    # Flat top-k index -> (token, beam) pair.
    generate_id = layers.reshape(topk_indices,
                                 shape=[-1]) % self.tar_vocab_size
    selected_beam = layers.reshape(topk_indices,
                                   shape=[-1]) // self.tar_vocab_size

    topk_finished = layers.equal(generate_id, eos_ids)
    topk_finished = layers.cast(topk_finished, 'float32')

    generate_id = layers.reshape(generate_id, shape=[-1, 1])
    # Append the new token to the selected parent beam's history.
    pre_tokens_list = layers.gather(tokens, selected_beam)
    full_tokens_list = layers.concat([pre_tokens_list, generate_id],
                                     axis=1)

    return full_tokens_list, topk_log_probs, topk_scores, topk_finished, selected_beam, generate_id, \
        dec_att_out, new_hidden_array, new_cell_array
def grow_topk(i, logits, alive_seq, alive_log_probs, states):
    """Grow each alive beam by one token and keep the best 2*beam_size."""
    logits = layers.reshape(logits, [batch_size, beam_size, -1])
    candidate_log_probs = layers.log(layers.softmax(logits, axis=2))
    # Cumulative log-prob of every (beam, token) candidate.
    log_probs = layers.elementwise_add(candidate_log_probs,
                                       alive_log_probs, 0)

    # NOTE(review): standard GNMT penalty is ((5 + len) / 6) ** alpha;
    # here only (i + 1) is divided by 6 — confirm this is intended.
    length_penalty = np.power(5.0 + (i + 1.0) / 6.0, alpha)
    curr_scores = log_probs / length_penalty
    flat_curr_scores = layers.reshape(curr_scores, [batch_size, -1])

    topk_scores, topk_ids = layers.topk(flat_curr_scores, k=beam_size * 2)

    # Undo the penalty so log-probs stay comparable across steps.
    topk_log_probs = topk_scores * length_penalty

    # Flat index -> (beam, token) pair.
    topk_beam_index = topk_ids // self.trg_vocab_size
    topk_ids = topk_ids % self.trg_vocab_size

    # use gather as gather_nd, TODO: use gather_nd
    topk_seq = gather_2d_by_gather(alive_seq, topk_beam_index,
                                   beam_size, batch_size)
    topk_seq = layers.concat(
        [topk_seq,
         layers.reshape(topk_ids, topk_ids.shape + [1])],
        axis=2)
    states = update_states(states, topk_beam_index, beam_size)
    eos = layers.fill_constant(shape=topk_ids.shape,
                               dtype="int64",
                               value=eos_id)
    topk_finished = layers.cast(layers.equal(topk_ids, eos), "float32")

    #topk_seq: [batch_size, 2*beam_size, i+1]
    #topk_log_probs, topk_scores, topk_finished: [batch_size, 2*beam_size]
    return topk_seq, topk_log_probs, topk_scores, topk_finished, states
def _build_position_ids(self, src_ids):
    """Build position ids for slot-concatenated inputs.

    Slot A keeps positions [0, slot_seqlen); each of the num_b B-slots
    continues from slot_seqlen, shifted left by A's pad count so the
    positions stay contiguous across the pad gap.
    """
    src_shape = L.shape(src_ids)
    src_seqlen = src_shape[1]
    src_batch = src_shape[0]

    slot_seqlen = self.slot_seqlen

    # NOTE(review): '/' here — if this yields a float (py3 semantics),
    # L.expand below receives a non-integer multiplier; confirm whether
    # '//' is needed.
    num_b = (src_seqlen / slot_seqlen) - 1

    a_position_ids = L.reshape(
        L.range(0, slot_seqlen, 1, dtype='int32'), [1, slot_seqlen, 1],
        inplace=True)  # [1, slot_seqlen, 1]
    a_position_ids = L.expand(
        a_position_ids, [src_batch, 1, 1])  # [B, slot_seqlen * num_b, 1]

    zero = L.fill_constant([1], dtype='int64', value=0)
    input_mask = L.cast(L.equal(src_ids[:, :slot_seqlen], zero),
                        "int32")  # assume pad id == 0 [B, slot_seqlen, 1]
    a_pad_len = L.reduce_sum(input_mask, 1)  # [B, 1, 1]

    b_position_ids = L.reshape(
        L.range(slot_seqlen, 2 * slot_seqlen, 1, dtype='int32'),
        [1, slot_seqlen, 1],
        inplace=True)  # [1, slot_seqlen, 1]
    b_position_ids = L.expand(
        b_position_ids,
        [src_batch, num_b, 1])  # [B, slot_seqlen * num_b, 1]
    # Shift B positions left by A's pad count.
    b_position_ids = b_position_ids - a_pad_len  # [B, slot_seqlen * num_b, 1]

    position_ids = L.concat([a_position_ids, b_position_ids], 1)
    position_ids = L.cast(position_ids, 'int64')
    position_ids.stop_gradient = True
    return position_ids
def _build_input_mask(self, src_ids):
    """Attention mask: 1.0 for real tokens, 0.0 for pads (pad id == 0).

    Args:
        src_ids: int64 token-id tensor.

    Returns:
        float32 tensor shaped like ``src_ids``, with stop_gradient set.
    """
    zero = L.fill_constant([1], dtype='int64', value=0)
    input_mask = L.logical_not(L.equal(src_ids, zero))  # assume pad id == 0
    # Fix: cast explicitly to 'float32' — 'float' resolves to float64,
    # inconsistent with the float32 masks used elsewhere in this file.
    input_mask = L.cast(input_mask, 'float32')
    input_mask.stop_gradient = True
    return input_mask
def points_nms(heat, kernel=2):
    """Suppress non-maxima of the heat map (kernel must be 2)."""
    # Asymmetric padding aligns the 2x2 max-pool window with each point.
    pooled = L.pool2d(heat,
                      pool_size=kernel,
                      pool_stride=1,
                      pool_padding=[[0, 0], [0, 0], [1, 0], [1, 0]],
                      pool_type='max')
    # A point survives iff it equals the pooled local maximum.
    return heat * L.cast(L.equal(pooled, heat), 'float32')
def __init__(self, label, pred):
    """Distributed element-wise equality metric (allgathered across workers).

    Args:
        label: ground-truth tensor.
        pred: prediction tensor; must have the same shape as ``label``.

    Raises:
        ValueError: if the two shapes differ.
    """
    if label.shape != pred.shape:
        # Bug fix: report the shapes, as the message promises, instead of
        # the tensors' full repr.
        raise ValueError(
            'expect label shape == pred shape, got: label.shape=%s, pred.shape = %s'
            % (repr(label.shape), repr(pred.shape)))
    self.eq = _allgather_2dim(L.cast(L.equal(pred, label), 'int64'))
    self.reset()
def get_enc_bias(source_inputs):
    """ get_enc_bias """
    # A position is padding iff the sum of |values| over the last dim is
    # exactly zero; such positions get a -1e9 attention bias.
    inputs_f32 = layers.cast(source_inputs, 'float32')
    abs_sum = layers.reduce_sum(layers.abs(inputs_f32), dim=-1)
    zero = layers.fill_constant([1], 'float32', value=0)
    pad_flag = layers.cast(layers.equal(abs_sum, zero), 'float32')
    bias = pad_flag * -1e9
    # Insert two singleton axes: [B, L] -> [B, 1, 1, L].
    return layers.unsqueeze(layers.unsqueeze(bias, axes=[1]), axes=[1])
def test_return_var_tuple(self):
    """layers.case with tuple-returning branches: first true pred wins."""

    def make_branch(shape_a, val_a, shape_b, val_b):
        # Each branch returns a pair of constants with distinct shapes.
        def branch():
            return layers.fill_constant(
                shape=shape_a, dtype='int32', value=val_a), \
                layers.fill_constant(
                    shape=shape_b, dtype='float32', value=val_b)
        return branch

    fn_1 = make_branch([1, 2], 1, [2, 3], 2)
    fn_2 = make_branch([3, 4], 3, [4, 5], 4)
    fn_3 = make_branch([5], 5, [5, 6], 6)

    main_program = Program()
    startup_program = Program()
    with program_guard(main_program, startup_program):
        x = layers.fill_constant(shape=[1], dtype='float32', value=1)
        y = layers.fill_constant(shape=[1], dtype='float32', value=1)
        z = layers.fill_constant(shape=[1], dtype='float32', value=3)

        pred_1 = layers.equal(x, y)  # true
        pred_2 = layers.equal(x, z)  # false

        out = layers.case(((pred_1, fn_1), (pred_2, fn_2)), fn_3)

        place = fluid.CUDAPlace(
            0) if core.is_compiled_with_cuda() else fluid.CPUPlace()
        exe = fluid.Executor(place)
        ret = exe.run(main_program, fetch_list=out)

        # fn_1 is selected, so the outputs are its two constants.
        self.assertTrue(
            np.allclose(np.asarray(ret[0]), np.full((1, 2), 1, np.int32)))
        self.assertTrue(
            np.allclose(np.asarray(ret[1]), np.full((2, 3), 2, np.float32)))
def matrix_nms(seg_masks, cate_labels, cate_scores, kernel='gaussian', sigma=2.0, sum_masks=None):
    """Matrix NMS for multi-class masks.

    Args:
        seg_masks (Tensor): shape (n, h, w), binary (0/1) masks
        cate_labels (Tensor): shape (n), mask labels in descending order
        cate_scores (Tensor): shape (n), mask scores in descending order
        kernel (str): 'linear' or 'gauss'
        sigma (float): std in gaussian method
        sum_masks (Tensor): shape (n, ), area of each of the n objects

    Returns:
        Tensor: cate_scores_update, tensors of shape (n)
    """
    n_samples = L.shape(cate_labels)[0]  # number of objects
    seg_masks = L.reshape(seg_masks, (n_samples, -1))  # [n, h*w]
    # inter: pairwise intersection area (masks times their own transpose).
    inter_matrix = L.matmul(seg_masks, seg_masks, transpose_y=True)  # [n, n]
    # union: sum_masks broadcast to n rows.
    sum_masks_x = L.expand(L.reshape(sum_masks, (1, -1)),
                           [n_samples, 1])  # [n, n]
    # iou.
    iou_matrix = inter_matrix / (
        sum_masks_x + L.transpose(sum_masks_x, [1, 0]) - inter_matrix)
    rows = L.range(0, n_samples, 1, 'int32')
    cols = L.range(0, n_samples, 1, 'int32')
    rows = L.expand(L.reshape(rows, (1, -1)), [n_samples, 1])
    cols = L.expand(L.reshape(cols, (-1, 1)), [1, n_samples])
    # Keep only the strict upper triangle: each mask vs the
    # higher-scored masks that precede it.
    tri_mask = L.cast(rows > cols, 'float32')
    iou_matrix = tri_mask * iou_matrix  # [n, n]

    # label_specific matrix: pairs sharing the same class label.
    cate_labels_x = L.expand(L.reshape(cate_labels, (1, -1)),
                             [n_samples, 1])  # [n, n]
    label_matrix = L.cast(
        L.equal(cate_labels_x, L.transpose(cate_labels_x, [1, 0])),
        'float32')
    label_matrix = tri_mask * label_matrix  # [n, n] upper triangle only

    # IoU compensation: max IoU each mask has with a higher-scored one.
    compensate_iou = L.reduce_max(iou_matrix * label_matrix, dim=0)
    compensate_iou = L.expand(L.reshape(compensate_iou, (1, -1)),
                              [n_samples, 1])  # [n, n]
    compensate_iou = L.transpose(compensate_iou, [1, 0])  # [n, n]

    # IoU decay
    decay_iou = iou_matrix * label_matrix

    # matrix nms
    if kernel == 'gaussian':
        decay_matrix = L.exp(-1 * sigma * (decay_iou ** 2))
        compensate_matrix = L.exp(-1 * sigma * (compensate_iou ** 2))
        decay_coefficient = L.reduce_min(
            (decay_matrix / compensate_matrix), dim=0)
    elif kernel == 'linear':
        decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
        decay_coefficient = L.reduce_min(decay_matrix, dim=0)
    else:
        raise NotImplementedError

    # update the score.
    cate_scores_update = cate_scores * decay_coefficient
    return cate_scores_update
def build_model(self):
    """Build a 3-layer GCN with SAGPool readout and a sigmoid CE loss."""
    node_features = self.graph_wrapper.node_feat["feat"]

    output = self.gcn(gw=self.graph_wrapper,
                      feature=node_features,
                      hidden_size=self.hidden_size,
                      activation="relu",
                      norm=self.graph_wrapper.node_feat["norm"],
                      name="gcn_layer_1")
    output1 = output
    output = self.gcn(gw=self.graph_wrapper,
                      feature=output,
                      hidden_size=self.hidden_size,
                      activation="relu",
                      norm=self.graph_wrapper.node_feat["norm"],
                      name="gcn_layer_2")
    output2 = output
    output = self.gcn(gw=self.graph_wrapper,
                      feature=output,
                      hidden_size=self.hidden_size,
                      activation="relu",
                      norm=self.graph_wrapper.node_feat["norm"],
                      name="gcn_layer_3")
    # Jumping-knowledge style concat of all three layers' embeddings.
    output = L.concat(input=[output1, output2, output], axis=-1)

    output, ratio_length = sag_pool(gw=self.graph_wrapper,
                                    feature=output,
                                    ratio=self.pooling_ratio,
                                    graph_id=self.graph_id,
                                    dataset=self.args.dataset_name,
                                    name="sag_pool_1")
    output = L.lod_reset(output, self.graph_wrapper.graph_lod)
    # Readout: (sum / kept-node count) and max pooling, concatenated.
    cat1 = L.sequence_pool(output, "sum")
    ratio_length = L.cast(ratio_length, dtype="float32")
    cat1 = L.elementwise_div(cat1, ratio_length, axis=-1)
    cat2 = L.sequence_pool(output, "max")
    output = L.concat(input=[cat2, cat1], axis=-1)

    # Classification head.
    output = L.fc(output, size=self.hidden_size, act="relu")
    output = L.dropout(output, dropout_prob=self.dropout_ratio)
    output = L.fc(output, size=self.hidden_size // 2, act="relu")
    output = L.fc(output,
                  size=self.num_classes,
                  act=None,
                  param_attr=fluid.ParamAttr(name="final_fc"))

    self.labels = L.cast(self.labels, dtype="float32")
    loss = L.sigmoid_cross_entropy_with_logits(x=output, label=self.labels)
    self.loss = L.mean(loss)
    pred = L.sigmoid(output)
    self.pred = L.argmax(x=pred, axis=-1)
    # Correct-count against the 1-dim label view for accuracy reporting.
    correct = L.equal(self.pred, self.labels_1dim)
    correct = L.cast(correct, dtype="int32")
    self.correct = L.reduce_sum(correct)
def forward(self, features):
    """ERNIE sequence-classification forward.

    Args:
        features: tuple (src_ids, sent_ids); pad id assumed to be 0.

    Returns:
        softmax probabilities in PREDICT mode, raw logits otherwise.
    """
    src_ids, sent_ids = features
    dtype = 'float16' if self.hparam['fp16'] else 'float32'
    zero = L.fill_constant([1], dtype='int64', value=0)
    # Attention mask: 1 for real tokens, 0 for pads.
    input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)),
                        dtype)  # assume pad id == 0
    #input_mask = L.unsqueeze(input_mask, axes=[2])
    d_shape = L.shape(src_ids)
    seqlen = d_shape[1]
    batch_size = d_shape[0]
    # Position ids 0..seqlen-1, tiled per batch row -> [B, seqlen, 1].
    pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])
    pos_ids = L.expand(pos_ids, [batch_size, 1])
    pos_ids = L.unsqueeze(pos_ids, axes=[2])
    pos_ids = L.cast(pos_ids, 'int64')
    pos_ids.stop_gradient = True
    input_mask.stop_gradient = True
    # Task ids are constant and unused at the moment.
    task_ids = L.zeros_like(src_ids) + self.hparam.task_id
    task_ids.stop_gradient = True

    bert = ErnieModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
        task_ids=task_ids,
        input_mask=input_mask,
        config=self.hparam,
        use_fp16=self.hparam['fp16']
    )

    # Pooled [CLS] representation + dropout before the classifier head.
    cls_feats = bert.get_pooled_output()
    cls_feats = L.dropout(
        x=cls_feats,
        dropout_prob=0.1,
        dropout_implementation="upscale_in_train"
    )

    logits = L.fc(
        input=cls_feats,
        size=self.hparam['num_label'],
        param_attr=F.ParamAttr(
            name="cls_out_w",
            initializer=F.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=F.ParamAttr(
            name="cls_out_b",
            initializer=F.initializer.Constant(0.))
    )

    propeller.summary.histogram('pred', logits)

    if self.mode is propeller.RunMode.PREDICT:
        probs = L.softmax(logits)
        return probs
    else:
        return logits
def bow(ids):
    """Bag-of-words encoding: masked embedding sum followed by softsign."""
    word_emb = L.embedding(
        input=ids,
        size=[self.config.vocab_size, self.config.emb_size],
        dtype=self._emb_dtype,
        param_attr=F.ParamAttr(name=self._word_emb_name,
                               initializer=self._param_initializer),
        is_sparse=False)
    # Zero out pad positions (pad id == 0) before summing.
    pad_id = L.fill_constant(shape=[1], dtype='int64', value=0)
    not_pad = L.cast(L.logical_not(L.equal(ids, pad_id)), 'float32')
    pooled = L.softsign(L.reduce_sum(word_emb * not_pad, dim=1))
    return pooled
def empty(cls, stack_data, dtype='bool'):
    """Return True if stack is empty(pos == 0)

    Args:
        stack_data (TYPE): NULL
        dtype (str): result dtype. Default is bool.

    Returns:
        Variable shape=[-1], dtype=params<dtype>

    Raises:
        NULL
    """
    is_empty = layers.equal(stack_data.pos,
                            layers.zeros_like(stack_data.pos))
    # Cast only when the caller asked for a non-bool result.
    return is_empty if dtype == 'bool' else layers.cast(is_empty, dtype=dtype)
def build_model(self, enc_input, dec_input, tgt_label, label_weights):
    """Build the model with source encoding and target decoding"""
    enc_word_output, enc_sen_output = self.encode(enc_input)
    dec_output = self.decode(dec_input, enc_word_output, enc_sen_output)

    # Token-accuracy numerator: weighted count of exact-match predictions.
    predict_token_idx = layers.argmax(dec_output, axis=-1)
    correct_token_idx = layers.cast(layers.equal(
        tgt_label, layers.reshape(predict_token_idx, shape=[-1, 1])),
                                    dtype='float32')
    weighted_correct = layers.elementwise_mul(x=correct_token_idx,
                                              y=label_weights,
                                              axis=0)
    sum_correct = layers.reduce_sum(weighted_correct)
    sum_correct.stop_gradient = True

    # Padding index do not contribute to the total loss. The weights is used to
    # cancel padding index in calculating the loss.
    if self._label_smooth_eps:
        # TODO: use fluid.input.one_hot after softmax_with_cross_entropy removing
        # the enforcement that the last dimension of label must be 1.
        tgt_label = layers.label_smooth(label=layers.one_hot(
            input=tgt_label, depth=self.voc_size),
                                        epsilon=self._label_smooth_eps)

    cost = layers.softmax_with_cross_entropy(
        logits=dec_output,
        label=tgt_label,
        soft_label=True if self._label_smooth_eps else False)
    weighted_cost = layers.elementwise_mul(x=cost, y=label_weights, axis=0)
    sum_cost = layers.reduce_sum(weighted_cost)
    token_num = layers.reduce_sum(label_weights)
    token_num.stop_gradient = True
    # Per-token average loss over non-pad tokens.
    avg_cost = sum_cost / token_num

    graph_vars = {
        "loss": avg_cost,
        "sum_correct": sum_correct,
        "token_num": token_num,
    }
    # Keep the fetch targets alive across program pruning.
    for k, v in graph_vars.items():
        v.persistable = True
    return graph_vars
def grow_topk(self, i, logits, alive_seq, alive_log_probs, cache, enc_output, enc_bias):
    """ grow_topk

    Expand each alive beam by one token and keep the best 2*beam_size
    candidates per batch row.

    Returns:
        (topk_seq, topk_log_probs, topk_scores, topk_finished, cache)
        with topk_seq: [batch_size, 2*beam_size, i+1] and the other
        three: [batch_size, 2*beam_size].
    """
    logits = layers.reshape(logits, [self.batch_size, self.beam_size, -1])

    candidate_log_probs = layers.log(layers.softmax(logits, axis=2))
    log_probs = candidate_log_probs + layers.unsqueeze(alive_log_probs,
                                                       axes=[2])

    # GNMT length penalty: ((step + 6) / 6) ** alpha.
    base_1 = layers.cast(i, 'float32') + 6.0
    base_1 /= 6.0
    length_penalty = layers.pow(base_1, self.alpha)
    #length_penalty = layers.pow(((5.0 + layers.cast(i+1, 'float32')) / 6.0), self.alpha)

    curr_scores = log_probs / length_penalty
    flat_curr_scores = layers.reshape(
        curr_scores, [self.batch_size, self.beam_size * self.vocab_size])

    topk_scores, topk_ids = layers.topk(flat_curr_scores,
                                        k=self.beam_size * 2)

    # Undo the penalty so log-probs stay comparable across steps.
    topk_log_probs = topk_scores * length_penalty

    # Flat index -> (beam, token) pair.
    select_beam_index = topk_ids // self.vocab_size
    select_id = topk_ids % self.vocab_size

    #layers.Print(select_id, message="select_id", summarize=1024)
    #layers.Print(topk_scores, message="topk_scores", summarize=10000000)

    # Offset per-row beam indices into the flat (batch*beam) axis.
    flat_select_beam_index = layers.reshape(
        select_beam_index, [-1]) + self.gather_top2k_append_index

    # Bug fix: layers.gather expects the index Variable itself, not a
    # python list wrapping it.
    topk_seq = layers.gather(alive_seq, flat_select_beam_index)
    topk_seq = layers.reshape(topk_seq,
                              [self.batch_size, 2 * self.beam_size, -1])

    #concat with current ids
    topk_seq = layers.concat(
        [topk_seq, layers.unsqueeze(select_id, axes=[2])], axis=2)
    topk_finished = layers.cast(layers.equal(select_id, self.eos_id),
                                'float32')

    #gather cache
    self.gather_cache(cache, flat_select_beam_index)

    #topk_seq: [batch_size, 2*beam_size, i+1]
    #topk_log_probs, topk_scores, topk_finished: [batch_size, 2*beam_size]
    return topk_seq, topk_log_probs, topk_scores, topk_finished, cache
def forward(self, features):
    """ERNIE sequence-labeling forward.

    Args:
        features: tuple (src_ids, sent_ids, input_seqlen); pad id
            assumed to be 0.

    Returns:
        (logits, input_seqlen): per-token label logits and the original
        sequence lengths.
    """
    src_ids, sent_ids, input_seqlen = features
    zero = L.fill_constant([1], dtype='int64', value=0)
    # Bug fix: the attention mask must be 1 for real tokens and 0 for
    # pads — matching the other ErnieModel call in this file — but the
    # original marked pads with 1 (missing logical_not).
    input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)),
                        'float32')  # assume pad id == 0
    #input_mask = L.unsqueeze(input_mask, axes=[2])
    d_shape = L.shape(src_ids)
    seqlen = d_shape[1]
    batch_size = d_shape[0]
    # Position ids 0..seqlen-1, tiled per batch row -> [B, seqlen, 1].
    pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])
    pos_ids = L.expand(pos_ids, [batch_size, 1])
    pos_ids = L.unsqueeze(pos_ids, axes=[2])
    pos_ids = L.cast(pos_ids, 'int64')
    pos_ids.stop_gradient = True
    input_mask.stop_gradient = True
    # Task ids are constant and unused at the moment.
    task_ids = L.zeros_like(src_ids) + self.hparam.task_id
    task_ids.stop_gradient = True

    model = ErnieModel(src_ids=src_ids,
                       position_ids=pos_ids,
                       sentence_ids=sent_ids,
                       task_ids=task_ids,
                       input_mask=input_mask,
                       config=self.hparam,
                       use_fp16=self.hparam['use_fp16'])

    enc_out = model.get_sequence_output()
    logits = L.fc(
        input=enc_out,
        size=self.num_label,
        num_flatten_dims=2,
        param_attr=F.ParamAttr(
            name="cls_seq_label_out_w",
            initializer=F.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=F.ParamAttr(name="cls_seq_label_out_b",
                              initializer=F.initializer.Constant(0.)))

    propeller.summary.histogram('pred', logits)

    return logits, input_seqlen
def _check_finished(decoder, next_inputs, finished, outputs_array):
    """check finished instance by next_inputs.action, and update finished tag and write END to outputs

    Args:
        decoder (TYPE): NULL
        next_inputs (TYPE): NULL
        finished (TYPE): NULL
        outputs_array (TYPE): NULL

    Returns:
        TODO

    Raises:
        NULL
    """
    # Newly finished = (action == STOP) and not already finished.
    act_stop = tensor.fill_constant_batch_size_like(
        next_inputs.action,
        shape=next_inputs.action.shape,
        value=decoder._grammar.ACTION_STOP,
        dtype='int64')
    new_finished = layers.logical_and(
        layers.equal(next_inputs.action, act_stop),
        layers.logical_not(finished))

    end_token_id = tensor.fill_constant_batch_size_like(
        outputs_array.data,
        shape=[-1],
        value=decoder._grammar.END,
        dtype=outputs_array.data.dtype)

    # Tentatively push END (out of place), then keep the pushed state
    # only for instances that just finished.
    out_data_tmp, out_pos_tmp = data_structure.Array.push(outputs_array,
                                                          end_token_id,
                                                          in_place=False)
    new_data, new_pos = nn_utils.ifelse(
        new_finished, [out_data_tmp, out_pos_tmp],
        [outputs_array.data, outputs_array.pos])

    layers.assign(new_data, outputs_array.data)
    layers.assign(new_pos, outputs_array.pos)
    # Merge the new flags into `finished` in place.
    layers.logical_or(finished, new_finished, out=finished)
def training_network(self, img, caption):
    """Build the training graph: image features -> decoder -> masked loss.

    Args:
        img: input image batch.
        caption: int64 caption ids, shape [batch, sentence_length].

    Returns:
        Scalar loss normalized by the number of non-pad target tokens.
    """
    # build caption and mask
    target = caption[:, 1:]
    source = caption[:, :-1]
    padding_filled = layers.fill_constant_batch_size_like(
        target,
        shape=[-1, decoder_config['sentence_length'] - 1],
        dtype='int64',
        value=config.dc['padding_idx'])
    # mask: 1.0 for real target tokens, 0.0 for padding.
    mask = layers.equal(target, padding_filled)
    mask = layers.cast(layers.logical_not(mask), 'float32')
    scale_factor = layers.reduce_sum(mask)
    mask.stop_gradient = True
    scale_factor.stop_gradient = True

    # mdl
    decoder = Decoder(decoder_config['hidden_dim'], rnn_layer=1)
    image_embed, global_image_feat = self._img2feature(
        img)  # [batch, k+1, hidden], [batch, hidden]

    # TODO: rework this — do the embedding either inside the rnn or
    # outside it! (translated from the original author's note)
    seq_out = decoder.call(global_image_feat, image_embed,
                           embedding_function, words=source)

    loss = layers.squeeze(ImageCaptionModel.loss(target, seq_out),
                          axes=[2])
    # Zero out loss at padding positions, then normalize by token count.
    loss = layers.elementwise_mul(loss, mask)
    output_loss = layers.elementwise_div(layers.reduce_sum(loss),
                                         scale_factor,
                                         name='loss')
    return output_loss
def decode_with_grammar(decoder, inits, decode_vocab, max_step_num, **kwargs):
    """A modification of paddle.fluid.layers.dynamic_decode(...).

    Dynamic decoding performs :code:`decoder.step()` repeatedly until
    the returned Tensor indicating finished status contains all True
    values or the number of decoding step reachs to :attr:`max_step_num`.

    :code:`decoder.initialize()` would be called once before the
    decoding loop. If the `decoder` has implemented `finalize` method,
    :code:`decoder.finalize()` would be called once after the decoding
    loop.

    Args:
        decoder(Decoder): An instance of `Decoder`.
        inits(tuple): Argument passed to `decoder.initialize`.
        decode_vocab(DecoderDynamicVocab): namedtuple(table table_len column column_len value value_len)
        max_step_num(int): The maximum number of steps.
        **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`.

    Returns:
        tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \
            outputs and states, both are Tensor or nested structure of Tensor. \
            `final_outputs` has the same structure and data types as \
            :code:`decoder.output_dtype` , and each Tenser in `final_outputs` \
            is the stacked of all decoding steps' outputs, which might be revised \
            by :code:`decoder.finalize` . `final_states` is the counterpart \
            at last time step of initial states returned by :code:`decoder.initialize` , \
            thus has the same structure with it and has tensors with same shapes \
            and data types.
    """
    step_cnt = tensor.fill_constant(shape=[1], dtype="int64", value=1)
    max_step_num_tensor = tensor.fill_constant(shape=[1],
                                               dtype="int64",
                                               value=max_step_num - 2)

    # shape = [batch_size, beam_size, ...]
    initial_inputs, initial_states, initial_finished = decoder.initialize(
        inits, decode_vocab)
    global_inputs, global_states, global_finished = (initial_inputs,
                                                     initial_states,
                                                     initial_finished)
    inputs = initial_inputs
    states = initial_states

    # Buffers that accumulate the decoded outputs.
    outputs_arr_data = tensor.fill_constant_batch_size_like(
        inputs.input,
        shape=[-1, decoder.beam_size, max_step_num],
        dtype=decoder.output_dtype.predicted_ids,
        value=0)
    outputs_arr_pos = tensor.fill_constant_batch_size_like(
        inputs.input, shape=[-1, decoder.beam_size, 1], dtype='int64',
        value=0)
    outputs_array = data_structure.ArrayData(
        decoder.merge_batch_beams(outputs_arr_data),
        decoder.merge_batch_beams(outputs_arr_pos))

    sequence_lengths = tensor.cast(tensor.zeros_like(initial_finished),
                                   "int64")

    # Stack structures that track grammar-constrained decoding state.
    grammar_stack_dat = tensor.fill_constant_batch_size_like(
        inputs.input,
        shape=[-1, decoder.beam_size, max_step_num * STACK_EXPAND_TIMES],
        dtype='int64',
        value=0)
    grammar_stack_pos = tensor.fill_constant_batch_size_like(
        inputs.input, shape=[-1, decoder.beam_size, 1], dtype='int64',
        value=0)
    grammar_stack = data_structure.StackData(
        decoder.merge_batch_beams(grammar_stack_dat),
        decoder.merge_batch_beams(grammar_stack_pos))

    ###### Decode in a loop until every instance is finished ######
    # Termination: global_finished/next_finished all True, or the step
    # count exceeds max_step_num.
    cond = layers.logical_not((layers.reduce_all(initial_finished)))
    while_op = layers.While(cond)
    with while_op.block():
        # step_outputs --> OutputWrapper
        # next_states --> StateWrapper
        # next_inputs --> DecoderInputsWrapper
        step_outputs, next_states, next_inputs = decoder.step(
            inputs, states, **kwargs)
        predicted_ids = step_outputs.predicted_ids
        _save_predict_output(outputs_array, predicted_ids,
                             next_states.finished)

        pred_gmr_type = decoder.grammar_type(predicted_ids)
        cond_type_leaf = layers.equal(pred_gmr_type, decoder.GMR_TYPE.LEAF)
        cond_type_midd = layers.equal(pred_gmr_type, decoder.GMR_TYPE.MID)

        _process_type_leaf(cond_type_leaf, decoder, grammar_stack,
                           next_inputs, next_states.finished)
        _process_type_midd(cond_type_midd, decoder, grammar_stack,
                           next_inputs, predicted_ids)

        ##next_sequence_lengths = layers.elementwise_add(sequence_lengths,
        ##        tensor.cast(layers.logical_not(global_finished), sequence_lengths.dtype))

        _check_finished(decoder, next_inputs, next_states.finished,
                        outputs_array)

        layers.utils.map_structure(tensor.assign, next_inputs,
                                   global_inputs)
        layers.utils.map_structure(tensor.assign, next_states,
                                   global_states)
        tensor.assign(next_states.finished, global_finished)
        ##tensor.assign(next_sequence_lengths, sequence_lengths)

        # Update the loop condition.
        layers.increment(x=step_cnt, value=1.0, in_place=True)
        layers.logical_and(
            layers.logical_not(layers.reduce_all(next_states.finished)),
            layers.less_equal(step_cnt, max_step_num_tensor), cond)

    final_outputs = outputs_array.data
    final_states = global_states

    final_outputs, final_states = decoder.finalize(final_outputs,
                                                   global_states,
                                                   sequence_lengths)

    return final_outputs, final_states
def __init__(self, label, pred):
    """Element-wise equality metric.

    NOTE(review): unlike the other metric __init__ variants in this
    file, no label/pred shape check is performed here — confirm whether
    one is needed.
    """
    self.eq = L.equal(pred, label)
    self.reset()
def knowledge_seq2seq(config):
    """Build the knowledge-grounded seq2seq graph (fluid static graph).

    Depending on ``config.run_type``:
      * "train": returns ``[bow_loss, kl_loss, nll_loss, final_loss]``
      * "test":  returns ``(final_score, final_ids, final_index)``, each
        reshaped to ``[max_dec_len, beam_size * batch_size]`` (beam search).

    NOTE(review): the "train" branch reads ``bow_loss`` even when
    ``config.use_bow`` is False, and ``weight_target`` / ``target_query`` /
    ``target_att`` even when ``config.use_posterior`` is False — those
    combinations would raise NameError at graph-build time. It looks like
    training requires both flags on; confirm against the training config.
    """
    emb_size = config.embed_size
    hidden_size = config.hidden_size
    input_size = emb_size
    num_layers = config.num_layers
    bi_direc = config.bidirectional
    batch_size = config.batch_size
    vocab_size = config.vocab_size
    run_type = config.run_type

    # LoD (variable-length) token id inputs.
    enc_input = layers.data(name="enc_input", shape=[1], dtype='int64', lod_level=1)  # enc_input --> goal
    enc_mask = layers.data(name="enc_mask", shape=[-1, 1], dtype='float32')
    goal_input = layers.data(name="goal_input", shape=[1], dtype='int64', lod_level=1)  # goal_input --> x
    cue_input = layers.data(name="cue_input", shape=[1], dtype='int64', lod_level=1)  # cue_input --> kg
    # cue_mask = layers.data(name='cue_mask', shape=[-1, 1], dtype='float32')
    memory_mask = layers.data(name='memory_mask', shape=[-1, 1], dtype='float32')
    tar_input = layers.data(name='tar_input', shape=[1], dtype='int64', lod_level=1)  # tar_input --> y
    # tar_mask = layers.data(name="tar_mask", shape=[-1, 1], dtype='float32')

    rnn_hidden_size = hidden_size
    if bi_direc:
        # Bidirectional encoder: halve per-direction size so concat == hidden_size.
        rnn_hidden_size //= 2

    # Encode dialogue context and goal with two separately-named encoders.
    enc_out, enc_last_hidden = \
        rnn_encoder(enc_input, vocab_size, input_size, rnn_hidden_size,
                    batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, name="rnn_enc")
    goal_out, goal_last_hidden = \
        rnn_encoder(goal_input, vocab_size, input_size, rnn_hidden_size,
                    batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, name="rnn_enc1")

    # Fuse context and goal final states, project back to 2*rnn_hidden_size.
    context_goal_out = fluid.layers.concat(
        input=[enc_last_hidden, goal_last_hidden], axis=2)
    context_goal_out = layers.reshape(context_goal_out,
                                      shape=[-1, 1, rnn_hidden_size * 4])
    # context_goal_out = layers.squeeze(context_goal_out, axes=[1])
    context_goal_out = fluid.layers.fc(context_goal_out,
                                       size=rnn_hidden_size * 2,
                                       bias_attr=False)
    context_goal_out = layers.unsqueeze(context_goal_out, axes=[0])
    # Bridge to the decoder's initial hidden state.
    bridge_out = fc(context_goal_out, hidden_size, hidden_size, name="bridge")
    bridge_out = layers.tanh(bridge_out)

    cue_last_mask = layers.data(name='cue_last_mask', shape=[-1], dtype='float32')
    # Encode the knowledge (cue) sentences.
    knowledge_out, knowledge_last_hidden = \
        rnn_encoder(cue_input, vocab_size, input_size, rnn_hidden_size,
                    batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, last_mask=cue_last_mask,
                    name="knowledge_enc")

    # Prior knowledge attention: bridge state attends over knowledge memory.
    query = layers.slice(bridge_out, axes=[0], starts=[0], ends=[1])
    query = layers.squeeze(query, axes=[0])
    query = layers.unsqueeze(query, axes=[1])
    query = layers.reshape(query, shape=[batch_size, -1, hidden_size])
    cue_memory = layers.slice(knowledge_last_hidden, axes=[0], starts=[0], ends=[1])
    cue_memory = layers.reshape(cue_memory, shape=[batch_size, -1, hidden_size])
    memory_mask = layers.reshape(memory_mask, shape=[batch_size, 1, -1])
    weighted_cue, cue_att = dot_attention(query, cue_memory, mask=memory_mask)
    cue_att = layers.reshape(cue_att, shape=[batch_size, -1])

    knowledge = weighted_cue
    if config.use_posterior:
        # Posterior knowledge attention: condition on the target response
        # (teacher signal); used for the KL term and as decoder knowledge.
        print("config.use_posterior", config.use_posterior)
        target_out, target_last_hidden = \
            rnn_encoder(tar_input, vocab_size, input_size, rnn_hidden_size,
                        batch_size, num_layers, bi_direc,
                        dropout=0.0, batch_first=True, name="knowledge_enc1")
        target_goal_out = fluid.layers.concat(
            input=[target_last_hidden, goal_last_hidden], axis=2)
        target_goal_out = layers.reshape(target_goal_out,
                                         shape=[-1, 1, rnn_hidden_size * 4])
        # target_goal_out = layers.squeeze(target_goal_out, axes=[1])
        target_goal_out = fluid.layers.fc(target_goal_out,
                                          size=rnn_hidden_size * 2,
                                          bias_attr=False)
        target_goal_out = layers.unsqueeze(target_goal_out, axes=[0])
        # get attention
        # target_query = layers.slice(target_last_hidden, axes=[0], starts=[0], ends=[1])
        target_query = layers.slice(target_goal_out, axes=[0], starts=[0], ends=[1])
        target_query = layers.squeeze(target_query, axes=[0])
        target_query = layers.unsqueeze(target_query, axes=[1])
        target_query = layers.reshape(target_query,
                                      shape=[batch_size, -1, hidden_size])
        weight_target, target_att = dot_attention(target_query, cue_memory,
                                                  mask=memory_mask)
        target_att = layers.reshape(target_att, shape=[batch_size, -1])
        # add to output
        knowledge = weight_target

    enc_memory_mask = layers.data(name="enc_memory_mask", shape=[-1, 1],
                                  dtype='float32')
    enc_memory_mask = layers.unsqueeze(enc_memory_mask, axes=[1])
    # decoder init_hidden, enc_memory, enc_mask
    dec_init_hidden = bridge_out
    pad_value = fluid.layers.assign(input=np.array([0.0], dtype='float32'))
    # Pad the LoD encoder output into a dense [batch, max_len, hidden] memory.
    enc_memory, origl_len_1 = layers.sequence_pad(x=enc_out, pad_value=pad_value)
    enc_memory.persistable = True

    gru_unit = GRU_unit(input_size + hidden_size, hidden_size,
                        num_layers=num_layers, dropout=0.0,
                        name="decoder_gru_unit")
    cue_gru_unit = GRU_unit(hidden_size + hidden_size, hidden_size,
                            num_layers=num_layers, dropout=0.0,
                            name="decoder_cue_gru_unit")

    tgt_vocab_size = config.vocab_size
    if run_type == "train":
        if config.use_bow:
            # Bag-of-words auxiliary loss: predict response words from knowledge.
            bow_logits = fc(knowledge, hidden_size, hidden_size, name='bow_fc_1')
            bow_logits = layers.tanh(bow_logits)
            bow_logits = fc(bow_logits, hidden_size, tgt_vocab_size, name='bow_fc_2')
            bow_logits = layers.softmax(bow_logits)
            bow_label = layers.data(name='bow_label', shape=[-1, config.max_len],
                                    dtype='int64')
            bow_mask = layers.data(name="bow_mask", shape=[-1, config.max_len],
                                   dtype='float32')
            bow_logits = layers.expand(bow_logits, [1, config.max_len, 1])
            bow_logits = layers.reshape(bow_logits, shape=[-1, tgt_vocab_size])
            bow_label = layers.reshape(bow_label, shape=[-1, 1])
            bow_loss = layers.cross_entropy(bow_logits, bow_label, soft_label=False)
            bow_loss = layers.reshape(bow_loss, shape=[-1, config.max_len])
            bow_loss *= bow_mask
            bow_loss = layers.reduce_sum(bow_loss, dim=[1])
            bow_loss = layers.reduce_mean(bow_loss)

        dec_input = layers.data(name="dec_input", shape=[-1, 1, 1], dtype='int64')
        dec_mask = layers.data(name="dec_mask", shape=[-1, 1], dtype='float32')
        # Train-time decoder is fed the posterior-weighted knowledge.
        dec_knowledge = weight_target
        knowledge_goal_out = fluid.layers.concat(
            input=[dec_knowledge, target_query], axis=2)
        knowledge_goal_out = layers.reshape(knowledge_goal_out,
                                            shape=[-1, 1, rnn_hidden_size * 4])
        # knowledge_goal_out = layers.squeeze(knowledge_goal_out, axes=[1])
        knowledge_goal_out = fluid.layers.fc(knowledge_goal_out,
                                             size=rnn_hidden_size * 2,
                                             bias_attr=False)
        # NOTE(review): knowledge_goal_out is built but never consumed below —
        # confirm whether it was meant to feed rnn_decoder.
        knowledge_goal_out = layers.unsqueeze(knowledge_goal_out, axes=[0])

        decoder_logits = \
            rnn_decoder(gru_unit, cue_gru_unit, dec_input, input_size,
                        hidden_size, num_layers, enc_memory, enc_memory_mask,
                        dec_knowledge, vocab_size,
                        init_hidden=dec_init_hidden, mask=dec_mask,
                        dropout=config.dropout)

        target_label = layers.data(name='target_label', shape=[-1, 1], dtype='int64')
        target_mask = layers.data(name='target_mask', shape=[-1, 1], dtype='float32')

        # Token-level NLL, masked and averaged per sequence then per batch.
        decoder_logits = layers.reshape(decoder_logits, shape=[-1, tgt_vocab_size])
        target_label = layers.reshape(target_label, shape=[-1, 1])
        nll_loss = layers.cross_entropy(decoder_logits, target_label,
                                        soft_label=False)
        nll_loss = layers.reshape(nll_loss, shape=[batch_size, -1])
        nll_loss *= target_mask
        nll_loss = layers.reduce_sum(nll_loss, dim=[1])
        nll_loss = layers.reduce_mean(nll_loss)

        # KL(posterior || prior) over knowledge attention; posterior side is
        # detached so only the prior attention receives this gradient.
        prior_attn = cue_att + 1e-10
        posterior_att = target_att
        posterior_att.stop_gradient = True
        prior_attn = layers.log(prior_attn)
        kl_loss = posterior_att * (layers.log(posterior_att + 1e-10) - prior_attn)
        kl_loss = layers.reduce_mean(kl_loss)

        # Scalar schedule factor (e.g. annealing) fed at run time.
        kl_and_nll_factor = layers.data(name='kl_and_nll_factor', shape=[1],
                                        dtype='float32')
        kl_and_nll_factor = layers.reshape(kl_and_nll_factor, shape=[-1])

        final_loss = bow_loss + kl_loss * kl_and_nll_factor + nll_loss * kl_and_nll_factor

        return [bow_loss, kl_loss, nll_loss, final_loss]
    elif run_type == "test":
        # ---- beam search decoding ----
        beam_size = config.beam_size
        batch_size = config.batch_size
        token = layers.fill_constant(shape=[batch_size * beam_size, 1],
                                     value=config.bos_id, dtype='int64')
        token = layers.reshape(token, shape=[-1, 1])
        max_decode_len = config.max_dec_len
        dec_knowledge = knowledge
        INF = 100000000.0

        # Initial scores: only beam 0 of each batch item is live (score 0),
        # the rest start at -INF so the first step expands a single beam.
        init_score_np = np.ones([beam_size * batch_size], dtype='float32') * -INF
        for i in range(batch_size):
            init_score_np[i * beam_size] = 0.0
        pre_score = layers.assign(init_score_np)

        # Flat offsets of each batch item's beam block, for gather indexing.
        pos_index_np = np.arange(batch_size).reshape(-1, 1)
        pos_index_np = \
            np.tile(pos_index_np, (1, beam_size)).reshape(-1).astype('int32') * beam_size
        pos_index = layers.assign(pos_index_np)

        id_array = []
        score_array = []
        index_array = []

        # Tile encoder memory / masks / knowledge / init hidden across beams.
        init_enc_memory = layers.expand(enc_memory, [1, beam_size, 1])
        init_enc_memory = layers.reshape(
            init_enc_memory, shape=[batch_size * beam_size, -1, hidden_size])
        init_enc_mask = layers.expand(enc_memory_mask, [1, beam_size, 1])
        init_enc_mask = layers.reshape(init_enc_mask,
                                       shape=[batch_size * beam_size, 1, -1])
        dec_knowledge = layers.reshape(dec_knowledge, shape=[-1, 1, hidden_size])
        init_dec_knowledge = layers.expand(dec_knowledge, [1, beam_size, 1])
        init_dec_knowledge = layers.reshape(
            init_dec_knowledge, shape=[batch_size * beam_size, -1, hidden_size])
        dec_init_hidden = layers.expand(dec_init_hidden, [1, 1, beam_size])
        dec_init_hidden = layers.reshape(dec_init_hidden,
                                         shape=[1, -1, hidden_size])

        length_average = config.length_average
        UNK = config.unk_id
        EOS = config.eos_id
        for i in range(1, max_decode_len + 1):
            dec_emb = get_embedding(token, input_size, vocab_size)
            dec_out, dec_last_hidden = \
                decoder_step(gru_unit, cue_gru_unit, dec_emb, dec_init_hidden,
                             input_size, hidden_size, init_enc_memory,
                             init_enc_mask, init_dec_knowledge, mask=None)
            output_in_size = hidden_size + hidden_size

            rnnout = layers.dropout(dec_out, dropout_prob=config.dropout,
                                    is_test=True)
            rnnout = fc(rnnout, output_in_size, hidden_size, name='dec_out_fc1')
            rnnout = fc(rnnout, hidden_size, vocab_size, name='dec_out_fc2')

            log_softmax_output = log_softmax(rnnout)
            log_softmax_output = layers.squeeze(log_softmax_output, axes=[1])

            # Accumulate beam scores; with length_average the running score is
            # a length-normalized mean of per-step log-probs.
            if i > 1:
                if length_average:
                    log_softmax_output = layers.elementwise_add(
                        (log_softmax_output / i),
                        (pre_score * (1.0 - 1.0 / i)), axis=0)
                else:
                    log_softmax_output = layers.elementwise_add(
                        log_softmax_output, pre_score, axis=0)
            else:
                log_softmax_output = layers.elementwise_add(log_softmax_output,
                                                            pre_score, axis=0)

            # Top-k over the flattened (beam x vocab) scores per batch item.
            log_softmax_output = layers.reshape(log_softmax_output,
                                                shape=[batch_size, -1])
            topk_score, topk_index = layers.topk(log_softmax_output, k=beam_size)
            topk_score = layers.reshape(topk_score, shape=[-1])
            topk_index = layers.reshape(topk_index, shape=[-1])

            vocab_var = layers.fill_constant([1], dtype='int64', value=vocab_size)
            new_token = topk_index % vocab_var   # token id within vocab
            index = topk_index // vocab_var      # parent beam index
            id_array.append(new_token)
            index_array.append(index)
            index = index + pos_index            # flat index into beam states

            score_array.append(topk_score)

            # Penalize beams that just emitted EOS or UNK so they are not
            # extended further.
            eos_ids = layers.fill_constant([beam_size * batch_size],
                                           dtype='int64', value=EOS)
            unk_ids = layers.fill_constant([beam_size * batch_size],
                                           dtype='int64', value=UNK)
            eos_eq = layers.cast(layers.equal(new_token, eos_ids), dtype='float32')
            topk_score += eos_eq * -100000000.0
            unk_eq = layers.cast(layers.equal(new_token, unk_ids), dtype='float32')
            topk_score += unk_eq * -100000000.0

            # update
            token = new_token
            pre_score = topk_score
            token = layers.reshape(token, shape=[-1, 1])

            # Re-gather all per-beam states according to the selected parents.
            index = layers.cast(index, dtype='int32')
            dec_last_hidden = layers.squeeze(dec_last_hidden, axes=[0])
            dec_init_hidden = layers.gather(dec_last_hidden, index=index)
            dec_init_hidden = layers.unsqueeze(dec_init_hidden, axes=[0])
            init_enc_memory = layers.gather(init_enc_memory, index)
            init_enc_mask = layers.gather(init_enc_mask, index)
            init_dec_knowledge = layers.gather(init_dec_knowledge, index)

        final_score = layers.concat(score_array, axis=0)
        final_ids = layers.concat(id_array, axis=0)
        final_index = layers.concat(index_array, axis=0)

        final_score = layers.reshape(
            final_score, shape=[max_decode_len, beam_size * batch_size])
        final_ids = layers.reshape(
            final_ids, shape=[max_decode_len, beam_size * batch_size])
        final_index = layers.reshape(
            final_index, shape=[max_decode_len, beam_size * batch_size])

        return final_score, final_ids, final_index
def _greedy_search(self,
                   src_word,
                   src_pos,
                   src_slf_attn_bias,
                   trg_word,
                   trg_src_attn_bias,
                   bos_id=0,
                   eos_id=1,
                   max_len=256):
    """Greedy (beam size 1) decoding for the transformer, dygraph mode.

    The ``trg_word`` parameter is immediately overwritten with a BOS tensor
    (kept in the signature for interface parity with beam_search).

    Returns:
        (finished_seq, finished_scores): decoded ids shaped
        [batch, 1, steps] and the final accumulated log-prob per sequence.
    """
    # run encoder
    enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias)

    # constant number
    batch_size = enc_output.shape[0]
    max_len = (enc_output.shape[1] + 20) if max_len is None else max_len
    end_token_tensor = layers.fill_constant(shape=[batch_size, 1],
                                            dtype="int64",
                                            value=eos_id)

    predict_ids = []
    log_probs = layers.fill_constant(shape=[batch_size, 1],
                                     dtype="float32",
                                     value=0)
    trg_word = layers.fill_constant(shape=[batch_size, 1],
                                    dtype="int64",
                                    value=bos_id)

    finished = layers.fill_constant(shape=[batch_size, 1],
                                    dtype="bool",
                                    value=0)

    ## init states (caches) for transformer
    # Zero-length k/v caches; the decoder appends one step per iteration.
    caches = [{
        "k": layers.fill_constant(shape=[batch_size, self.n_head, 0, self.d_key],
                                  dtype=enc_output.dtype,
                                  value=0),
        "v": layers.fill_constant(
            shape=[batch_size, self.n_head, 0, self.d_value],
            dtype=enc_output.dtype,
            value=0),
    } for i in range(self.n_layer)]

    for i in range(max_len):
        trg_pos = layers.fill_constant(shape=trg_word.shape,
                                       dtype="int64",
                                       value=i)
        logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
                              enc_output, caches)
        step_log_probs = layers.log(layers.softmax(logits))
        # NOTE(review): unlike beam_search there is no mask_probs here, so
        # already-finished rows keep accumulating log-probs until every row
        # finishes — confirm this is intended for the reported scores.
        log_probs = layers.elementwise_add(x=step_log_probs,
                                           y=log_probs,
                                           axis=0)
        scores = log_probs
        # k=1 => greedy argmax over the vocabulary.
        topk_scores, topk_indices = layers.topk(input=scores, k=1)

        finished = layers.logical_or(
            finished, layers.equal(topk_indices, end_token_tensor))
        trg_word = topk_indices
        log_probs = topk_scores

        predict_ids.append(topk_indices)

        if layers.reduce_all(finished).numpy():
            break

    predict_ids = layers.stack(predict_ids, axis=0)
    # [steps, batch, 1] -> [batch, 1, steps]
    finished_seq = layers.transpose(predict_ids, [1, 2, 0])
    finished_scores = topk_scores

    return finished_seq, finished_scores
def grammar_output(inputs, actions, gmr_mask, last_col2tbl_mask, decode_vocab, grammar, name=None, column2table=None):
    """Output logits according to the grammar.

    Args:
        inputs (Variable): shape = [batch_size, max_len, hidden_size].
            At infer time max_len is always 1.
        actions (Variable): shape = [batch_size, max_len].
            At infer time max_len is always 1.
        gmr_mask (Variable): shape = [batch_size, max_len, grammar_size].
            At infer time max_len is always 1.
        last_col2tbl_mask (Variable): shape = [batch_size, max_len, max_table].
            When the previous decode step selected a column, this is that
            column's corresponding table mask.
        decode_vocab (DecoderDynamicVocab): (table, table_len, column,
            column_len, value, value_len, column2table_mask). Here
            column2table_mask is the table mask aligned one-to-one with
            columns.
        grammar (Grammar): NULL
        name (str): prefix for Variable names, used to share parameters
            across multiple calls. Default None means no sharing.
        column2table: unused here — kept for interface compatibility.

    Returns:
        (Variable, Variable)
        output: vocabulary output probabilities
        valid_table_mask: only meaningful at prediction time

    Raises: NULL
    """
    batch_size = layers.shape(inputs)[0]
    max_len = inputs.shape[1]
    vocab_size = grammar.vocab_size

    # One constant tensor per action kind, to compare against `actions`.
    action_shape = [batch_size, max_len]
    act_apply_rule = tensor.fill_constant(shape=action_shape,
                                          value=grammar.ACTION_APPLY,
                                          dtype='int64')
    act_stop = tensor.fill_constant(shape=action_shape,
                                    value=grammar.ACTION_STOP,
                                    dtype='int64')
    act_select_t = tensor.fill_constant(shape=action_shape,
                                        value=grammar.ACTION_SELECT_T,
                                        dtype='int64')
    act_select_c = tensor.fill_constant(shape=action_shape,
                                        value=grammar.ACTION_SELECT_C,
                                        dtype='int64')
    act_select_v = tensor.fill_constant(shape=action_shape,
                                        value=grammar.ACTION_SELECT_V,
                                        dtype='int64')

    # Per-position condition masks; APPLY and STOP share one branch.
    cond_apply_rule = layers.logical_or(layers.equal(actions, act_apply_rule),
                                        layers.equal(actions, act_stop))
    cond_select_t = layers.equal(actions, act_select_t)
    cond_select_c = layers.equal(actions, act_select_c)
    cond_select_v = layers.equal(actions, act_select_v)

    # expand vocab to [-1, max_len, ...]
    if max_len == 1:
        expand_to_seq_len = lambda x: layers.unsqueeze(x, [1])
    else:
        expand_to_seq_len = lambda x: layers.expand(
            layers.unsqueeze(x, [1]), [1, max_len] + [1] * (len(x.shape) - 1))
    table_enc = expand_to_seq_len(decode_vocab.table)
    table_len = expand_to_seq_len(decode_vocab.table_len)
    column_enc = expand_to_seq_len(decode_vocab.column)
    column_len = expand_to_seq_len(decode_vocab.column_len)
    value_enc = expand_to_seq_len(decode_vocab.value)
    value_len = expand_to_seq_len(decode_vocab.value_len)
    column2table_mask = expand_to_seq_len(decode_vocab.column2table_mask)

    # merge batch & seq_len dim
    inputs = nn_utils.merge_first_ndim(inputs, n=2)
    actions = nn_utils.merge_first_ndim(actions, n=2)
    gmr_mask = nn_utils.merge_first_ndim(gmr_mask, n=2)
    last_col2tbl_mask = nn_utils.merge_first_ndim(last_col2tbl_mask, n=2)
    table_enc = nn_utils.merge_first_ndim(table_enc, n=2)
    table_len = nn_utils.merge_first_ndim(table_len, n=2)
    column_enc = nn_utils.merge_first_ndim(column_enc, n=2)
    column_len = nn_utils.merge_first_ndim(column_len, n=2)
    value_enc = nn_utils.merge_first_ndim(value_enc, n=2)
    value_len = nn_utils.merge_first_ndim(value_len, n=2)
    column2table_mask = nn_utils.merge_first_ndim(column2table_mask, n=2)
    cond_apply_rule = nn_utils.merge_first_ndim(cond_apply_rule, n=2)
    cond_select_t = nn_utils.merge_first_ndim(cond_select_t, n=2)
    cond_select_c = nn_utils.merge_first_ndim(cond_select_c, n=2)
    cond_select_v = nn_utils.merge_first_ndim(cond_select_v, n=2)

    # Fixed names => pointer-network parameters are shared across calls.
    t_ptr_net = models.PointerNetwork(score_type="affine",
                                      name='gmr_output_t_ptr')
    c_ptr_net = models.PointerNetwork(score_type="affine",
                                      name='gmr_output_c_ptr')
    v_ptr_net = models.PointerNetwork(score_type="affine",
                                      name='gmr_output_v_ptr')

    ## core processing logic ##
    # Each helper scores its vocab segment only where its condition holds.
    apply_rule_output = _apply_rule(cond_apply_rule, inputs, gmr_mask,
                                    grammar, name=name)
    select_t_output = \
        _select_table(cond_select_t, inputs, table_enc, table_len,
                      last_col2tbl_mask, t_ptr_net, grammar)
    select_c_output, valid_table_mask = \
        _select_column(cond_select_c, inputs, column_enc, column_len,
                       c_ptr_net, grammar, column2table_mask)
    select_v_output = _select_value(cond_select_v, inputs, value_enc,
                                    value_len, v_ptr_net, grammar)

    # NOTE(review): `fluider` is presumably a project wrapper that supports
    # a variadic elementwise_add (plain fluid.layers.elementwise_add only
    # takes two tensors) — confirm against the project's fluider module.
    output = fluider.elementwise_add(apply_rule_output, select_t_output,
                                     select_c_output, select_v_output,
                                     axis=0)
    output = layers.reshape(output, shape=[batch_size, max_len, vocab_size])
    return output, valid_table_mask
def beam_search(self,
                src_word,
                src_pos,
                src_slf_attn_bias,
                trg_word,
                trg_src_attn_bias,
                bos_id=0,
                eos_id=1,
                beam_size=4,
                max_len=256):
    """Beam-search decoding for the transformer, dygraph mode.

    The ``trg_word`` parameter is immediately overwritten with a BOS tensor
    (kept for interface parity with the training forward).

    Returns:
        (finished_seq, finished_scores): backtracked beam sequences shaped
        [batch, beam, steps] and the final top-k scores.
    """

    def expand_to_beam_size(tensor, beam_size):
        # [batch, ...] -> [batch, beam, ...] by tiling a new dim 1.
        tensor = layers.reshape(tensor,
                                [tensor.shape[0], 1] + tensor.shape[1:])
        tile_dims = [1] * len(tensor.shape)
        tile_dims[1] = beam_size
        return layers.expand(tensor, tile_dims)

    def merge_batch_beams(tensor):
        # [batch, beam, ...] -> [batch*beam, ...]
        return layers.reshape(tensor, [tensor.shape[0] * tensor.shape[1]] +
                              tensor.shape[2:])

    def split_batch_beams(tensor):
        # [batch*beam, ...] -> [batch, beam, ...]
        return fluid.layers.reshape(tensor,
                                    shape=[-1, beam_size] +
                                    list(tensor.shape[1:]))

    def mask_probs(probs, finished, noend_mask_tensor):
        # TODO: use where_op
        # For finished beams, force the distribution to "EOS only" so they
        # keep their score; unfinished beams keep their real probs.
        finished = layers.cast(finished, dtype=probs.dtype)
        probs = layers.elementwise_mul(layers.expand(
            layers.unsqueeze(finished, [2]), [1, 1, self.trg_vocab_size]),
                                       noend_mask_tensor,
                                       axis=-1) - layers.elementwise_mul(
                                           probs, (finished - 1), axis=0)
        return probs

    def gather(x, indices, batch_pos):
        # Gather along the beam dim via (batch, beam) coordinates.
        topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2)
        return layers.gather_nd(x, topk_coordinates)

    # run encoder
    enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias)

    # constant number
    inf = float(1. * 1e7)
    batch_size = enc_output.shape[0]
    max_len = (enc_output.shape[1] + 20) if max_len is None else max_len
    vocab_size_tensor = layers.fill_constant(shape=[1],
                                             dtype="int64",
                                             value=self.trg_vocab_size)
    end_token_tensor = to_variable(
        np.full([batch_size, beam_size], eos_id, dtype="int64"))

    noend_array = [-inf] * self.trg_vocab_size
    noend_array[eos_id] = 0
    noend_mask_tensor = to_variable(np.array(noend_array, dtype="float32"))
    batch_pos = layers.expand(
        layers.unsqueeze(
            to_variable(np.arange(0, batch_size, 1, dtype="int64")), [1]),
        [1, beam_size])

    predict_ids = []
    parent_ids = []
    ### initialize states of beam search ###
    # Only beam 0 starts live (score 0); others start at -inf.
    log_probs = to_variable(
        np.array([[0.] + [-inf] * (beam_size - 1)] * batch_size,
                 dtype="float32"))
    finished = to_variable(np.full([batch_size, beam_size], 0, dtype="bool"))
    ### initialize inputs and states of transformer decoder ###
    ## init inputs for decoder, shaped `[batch_size*beam_size, ...]`
    trg_word = layers.fill_constant(shape=[batch_size * beam_size, 1],
                                    dtype="int64",
                                    value=bos_id)
    trg_pos = layers.zeros_like(trg_word)
    trg_src_attn_bias = merge_batch_beams(
        expand_to_beam_size(trg_src_attn_bias, beam_size))
    enc_output = merge_batch_beams(
        expand_to_beam_size(enc_output, beam_size))
    ## init states (caches) for transformer, need to be updated according to selected beam
    caches = [{
        "k": layers.fill_constant(
            shape=[batch_size * beam_size, self.n_head, 0, self.d_key],
            dtype=enc_output.dtype,
            value=0),
        "v": layers.fill_constant(
            shape=[batch_size * beam_size, self.n_head, 0, self.d_value],
            dtype=enc_output.dtype,
            value=0),
    } for i in range(self.n_layer)]

    for i in range(max_len):
        trg_pos = layers.fill_constant(shape=trg_word.shape,
                                       dtype="int64",
                                       value=i)
        caches = map_structure(  # can not be reshaped since the 0 size
            lambda x: x if i == 0 else merge_batch_beams(x), caches)
        logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
                              enc_output, caches)
        caches = map_structure(split_batch_beams, caches)
        step_log_probs = split_batch_beams(
            fluid.layers.log(fluid.layers.softmax(logits)))

        step_log_probs = mask_probs(step_log_probs, finished,
                                    noend_mask_tensor)
        log_probs = layers.elementwise_add(x=step_log_probs,
                                           y=log_probs,
                                           axis=0)
        # Flatten beam x vocab so top-k picks across all beam extensions.
        log_probs = layers.reshape(log_probs,
                                   [-1, beam_size * self.trg_vocab_size])
        scores = log_probs
        topk_scores, topk_indices = fluid.layers.topk(input=scores,
                                                      k=beam_size)
        # Decompose the flat index into (parent beam, token id).
        beam_indices = fluid.layers.elementwise_floordiv(
            topk_indices, vocab_size_tensor)
        token_indices = fluid.layers.elementwise_mod(
            topk_indices, vocab_size_tensor)

        # update states: re-gather per-beam state by selected parents.
        caches = map_structure(
            lambda x: gather(x, beam_indices, batch_pos), caches)
        log_probs = gather(log_probs, topk_indices, batch_pos)
        finished = gather(finished, beam_indices, batch_pos)
        finished = layers.logical_or(
            finished, layers.equal(token_indices, end_token_tensor))
        trg_word = layers.reshape(token_indices, [-1, 1])

        predict_ids.append(token_indices)
        parent_ids.append(beam_indices)

        if layers.reduce_all(finished).numpy():
            break

    predict_ids = layers.stack(predict_ids, axis=0)
    parent_ids = layers.stack(parent_ids, axis=0)
    # Backtrack parent pointers into full sequences.
    finished_seq = layers.transpose(
        layers.gather_tree(predict_ids, parent_ids), [1, 2, 0])
    finished_scores = topk_scores

    return finished_seq, finished_scores
def build_model(self, model_configs):
    """Build the fluid graph of an LSTM language model.

    Defines input placeholders, embedding + RNN + softmax layers, the masked
    sequence cross-entropy loss, and an accuracy-style metric that excludes
    correct predictions of the UNK and PAD symbols. Stores the resulting
    variables/programs on ``self`` (loss_, correct_, pred_, program_, ...).
    """
    self.update_params(model_configs)

    # Placeholders. NOTE(review): shape=[None, ...] with fluid.layers.data
    # looks like the fluid 1.8-style API (see the "only in paddle 1.8"
    # comment below) — confirm the targeted paddle version.
    features = fluid.layers.data(name="features",
                                 shape=[None, self.seq_len_],
                                 dtype='int64')
    labels = fluid.layers.data(name="labels",
                               shape=[None, self.seq_len_],
                               dtype='int64')
    sequence_length_ph = fluid.layers.data(name="seq_len_ph",
                                           shape=[None],
                                           dtype='int64')
    sequence_mask_ph = fluid.layers.data(name="seq_mask_ph",
                                         shape=[None],
                                         dtype='float32')
    init_hidden = fluid.layers.data(
        name="init_hidden",
        shape=[None, self.num_layers_, self.n_hidden_],
        dtype='float32')
    init_cell = fluid.layers.data(
        name="init_cell",
        shape=[None, self.num_layers_, self.n_hidden_],
        dtype='float32')

    # [batch, layers, hidden] -> [layers, batch, hidden] for the RNN.
    init_hidden = layers.transpose(init_hidden, perm=[1, 0, 2])
    init_cell = layers.transpose(init_cell, perm=[1, 0, 2])

    # NOTE(review): these two reshaped variables are never used below —
    # _build_rnn_graph receives the transposed tensors instead; confirm.
    init_hidden_reshape = layers.reshape(
        init_hidden, shape=[self.num_layers_, -1, self.n_hidden_])
    init_cell_reshape = layers.reshape(
        init_cell, shape=[self.num_layers_, -1, self.n_hidden_])

    features = layers.reshape(features, shape=[-1, self.seq_len_, 1])

    # word embedding
    inputs = layers.embedding(
        input=features,
        size=[self.vocab_size_, self.n_hidden_],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(
            name='embedding_para',
            initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale_, high=self.init_scale_)))

    # LSTM
    output, last_hidden, last_cell = self._build_rnn_graph(
        inputs, init_hidden, init_cell, sequence_length_ph)

    output = layers.reshape(output,
                            shape=[-1, self.seq_len_, self.n_hidden_],
                            inplace=True)
    self.last_hidden_ = layers.reshape(
        last_hidden, [-1, self.num_layers_, self.n_hidden_])
    self.last_cell_ = layers.reshape(
        last_cell, [-1, self.num_layers_, self.n_hidden_])

    # softmax
    softmax_w = layers.create_parameter(
        [self.n_hidden_, self.vocab_size_],
        dtype="float32",
        name="softmax_w",
        default_initializer=fluid.initializer.UniformInitializer(
            low=-self.init_scale_, high=self.init_scale_))
    softmax_b = layers.create_parameter(
        [self.vocab_size_],
        dtype="float32",
        name='softmax_b',
        default_initializer=fluid.initializer.UniformInitializer(
            low=-self.init_scale_, high=self.init_scale_))

    logits = layers.matmul(output, softmax_w)
    logits = layers.elementwise_add(logits, softmax_b)
    logits = layers.reshape(logits,
                            shape=[-1, self.vocab_size_],
                            inplace=True)

    # correct predictions
    labels_reshaped = layers.reshape(labels, [-1])
    pred = layers.cast(layers.argmax(logits, 1), dtype="int64")
    correct_pred = layers.cast(layers.equal(pred, labels_reshaped),
                               dtype="int64")
    self.pred_ = pred

    # predicting unknown is always considered wrong
    # only in paddle 1.8
    unk_tensor = layers.fill_constant(layers.shape(labels_reshaped),
                                      value=self.unk_symbol_,
                                      dtype='int64')
    pred_unk = layers.cast(layers.equal(pred, unk_tensor), dtype="int64")
    correct_unk = layers.elementwise_mul(pred_unk, correct_pred)

    # predicting padding is always considered wrong
    pad_tensor = layers.fill_constant(layers.shape(labels_reshaped),
                                      value=self.pad_symbol_,
                                      dtype='int64')
    pred_pad = layers.cast(layers.equal(pred, pad_tensor), dtype="int64")
    correct_pad = layers.elementwise_mul(pred_pad, correct_pred)

    # Reshape logits to be a 3-D tensor for sequence loss
    logits = layers.reshape(logits, [-1, self.seq_len_, self.vocab_size_])
    labels = layers.reshape(labels, [-1, self.seq_len_, 1])
    loss = layers.softmax_with_cross_entropy(logits=logits,
                                             label=labels,
                                             soft_label=False,
                                             return_softmax=False)
    # Zero out loss on padded positions before averaging.
    sequence_mask = layers.reshape(sequence_mask_ph, [-1, self.seq_len_, 1])
    loss = layers.reduce_mean(layers.elementwise_mul(loss, sequence_mask))

    # Correct count, excluding "correct" UNK/PAD predictions.
    eval_metric_ops = fluid.layers.reduce_sum(correct_pred) \
        - fluid.layers.reduce_sum(correct_unk) \
        - fluid.layers.reduce_sum(correct_pad)

    self.loss_ = loss
    self.correct_ = eval_metric_ops
    self.input_name_list_ = [
        'features', 'labels', 'seq_len_ph', 'seq_mask_ph', 'init_hidden',
        'init_cell'
    ]
    self.target_var_names_ = [
        self.loss_, self.last_hidden_, self.last_cell_, self.correct_
    ]

    self.program_ = fluid.default_main_program()
    self.startup_program_ = fluid.default_startup_program()