Example #1
def lossweighed(ce_loss, labels):
    one_hot = fluid.one_hot(input=labels, depth=args["num_labels"])
    lw = fluid.layers.matmul(one_hot, weight)
    lw = fluid.layers.reduce_sum(lw, dim=1)
    loss = fluid.layers.elementwise_mul(lw, ce_loss)
    loss = fluid.layers.mean(loss)
    return loss
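A note on the math: the matmul followed by reduce_sum is just a per-label lookup, selecting row labels[i] of the weight matrix as the loss weight of example i. A minimal NumPy sketch of the equivalence (the shape of the global `weight`, assumed [num_labels, 1] here, is not shown in the snippet):

import numpy as np

num_labels = 4
weight = np.array([[1.0], [2.0], [0.5], [1.5]])  # per-class loss weights, [num_labels, 1]
labels = np.array([2, 0, 3])

one_hot = np.eye(num_labels)[labels]             # [N, num_labels]
lw = (one_hot @ weight).sum(axis=1)              # [N]
assert np.allclose(lw, weight[labels, 0])        # same as a plain lookup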
Example #2
    def test_api_with_dygraph(self):
        depth = 10
        label = np.array([np.random.randint(0, depth - 1)
                          for i in range(6)]).reshape([6, 1])
        with fluid.dygraph.guard():
            one_hot_label = fluid.one_hot(
                input=fluid.dygraph.to_variable(label), depth=depth)
Example #3
    def create_loss_op(self, predict, label, epsilon=1e-7):
        """compute loss with tensor

         Args:
         predict: model output tensor activated by softmax
         label: a non-sparse tensor

         Returns:
         loss: cross-entropy loss
         """
        if self.loss_type == "nl" and self.model_type == "train":
            one_hot_label = fluid.one_hot(label, depth=predict.shape[-1])
            one_hot_label = FL.squeeze(one_hot_label, axes=[-2])
            # log
            neg_prob = 1 - predict
            log_neg_prob = FL.log(
                fluid.layers.clip(neg_prob, min=epsilon, max=1.))
            ce_loss = -1 * log_neg_prob * one_hot_label
            cost = FL.reduce_sum(ce_loss, dim=-1, keep_dim=True)
        else:  # PL or evaluation
            cost = FL.cross_entropy(predict, label)

        loss = FL.mean(cost)

        return loss
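In the "nl" (negative learning) branch the per-example cost is -log(1 - p_y): the model is pushed away from the possibly-noisy label rather than toward it. A small NumPy check of that branch, with the same epsilon clipping:

import numpy as np

epsilon = 1e-7
predict = np.array([[0.7, 0.2, 0.1]])           # softmax output, [N, C]
one_hot = np.array([[0.0, 1.0, 0.0]])           # label = class 1
neg_prob = np.clip(1.0 - predict, epsilon, 1.0)
cost = (-np.log(neg_prob) * one_hot).sum(-1)    # -log(1 - p_y)
assert np.isclose(cost[0], -np.log(1.0 - 0.2))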
Example #4
def soft_dice_loss(logits, labels):
    probs = L.softmax(logits, axis=-1)
    one_hot = F.one_hot(labels, depth=probs.shape[-1])
    intersection = L.reduce_sum(probs * one_hot, dim=-1)
    # union = L.reduce_sum(probs, axis=-1) + L.reduce_sum(labels, axis=-1)
    loss = 1 - intersection
    return L.reduce_mean(loss)
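The commented-out union term can indeed be dropped: softmax probabilities and one-hot labels each sum to 1 over the class axis, so the union is the constant 2 and the Dice score 2*intersection/union collapses to the intersection, leaving loss = 1 - intersection. A quick NumPy check:

import numpy as np

probs = np.array([[0.6, 0.3, 0.1]])             # rows sum to 1
one_hot = np.array([[1.0, 0.0, 0.0]])
inter = (probs * one_hot).sum(-1)
union = probs.sum(-1) + one_hot.sum(-1)         # always 2 per row
assert np.allclose(2 * inter / union, inter)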
Example #5
def chunk_softmax(logits, labels, topk=10):
    after_exp = L.exp(logits)
    out, _ = L.argsort(after_exp, axis=-1)
    denorm = L.reduce_sum(out[:, -topk:], dim=-1, keep_dim=True)
    probs = after_exp / denorm
    one_hot = F.one_hot(labels, depth=probs.shape[-1])
    loss = -L.reduce_sum(one_hot * L.log(probs)) / logits.shape[0]
    return loss
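Here the denominator keeps only the top-k largest exponentials (argsort is ascending, so out[:, -topk:] are the k biggest), which makes every probability slightly larger than under a full softmax; the loss is then ordinary cross-entropy with this truncated normalizer. The same computation in NumPy:

import numpy as np

logits = np.random.randn(2, 8)
labels = np.array([3, 5])
topk = 4

after_exp = np.exp(logits)
denorm = np.sort(after_exp, axis=-1)[:, -topk:].sum(-1, keepdims=True)
probs = after_exp / denorm                      # >= full-softmax probs
loss = -np.log(probs[np.arange(2), labels]).sum() / logits.shape[0]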
Example #6
def beam_search_step(state, logits, eos_id, beam_width, is_first_step,
                     length_penalty):
    """logits.shape == [B*W, V]"""
    _, vocab_size = logits.shape

    bsz, beam_width = state.log_probs.shape
    onehot_eos = L.cast(F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size),
                        'int64')  #[1, V]

    probs = L.log(L.softmax(logits))  #[B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  #[B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  #[B*W, V]

    not_finished = 1 - L.reshape(state.finished, [-1, 1])  #[B*W,1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  #[B*W,V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add

    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)
    if is_first_step:
        allscore = L.reshape(
            allscore,
            [bsz, beam_width, -1])[:, 0, :]  # first step only considers beam 0
    scores, idx = L.topk(allscore, k=beam_width)  #[B, W]
    next_beam_id = idx // vocab_size  #[B, W]
    next_word_id = idx % vocab_size

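    # `idx != -1` always holds, so L.where(...)[:, :1] just enumerates the
    # batch index of every beam entry; concatenated with the flat top-k index
    # it forms [batch, index] coordinates for gather_nd over [B, W*V] tensors.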
    gather_idx = L.concat([L.where(idx != -1)[:, :1],
                           L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)

    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1],
         L.reshape(next_beam_id, [-1, 1])], 1)
    next_finished = L.reshape(
        L.gather_nd(state.finished, gather_idx), state.finished.shape
    )  # gather the new beam state according to the new beam id
    #log.debug(gather_idx.numpy())
    #log.debug(state.finished.numpy())
    #log.debug(next_finished.numpy())

    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')

    #log.debug(next_word_id.numpy())
    #log.debug(next_beam_id.numpy())
    next_state = BeamSearchState(log_probs=next_probs,
                                 lengths=next_len,
                                 finished=next_finished)
    output = BeamSearchOutput(scores=scores,
                              predicted_ids=next_word_id,
                              beam_parent_ids=next_beam_id)

    return output, next_state
Example #7
    def _labelsmoothing(self, target):
        if target.shape[-1] != self._class_dim:
            one_hot_target = fluid.one_hot(input=target, depth=self._class_dim)
        else:
            one_hot_target = target
        soft_target = fluid.layers.label_smooth(label=one_hot_target,
                                                epsilon=self._epsilon,
                                                dtype="float32")
        return soft_target
Example #8
def beam_search_step(state, logits, eos_id, beam_width, is_first_step,
                     length_penalty):
    """logits.shape == [B*W, V]"""
    beam_size, vocab_size = logits.shape  # batch size is 1 in this hub module, so the first dim (bsz * beam_size) equals beam_size
    logits_np = logits.numpy()
    for i in range(beam_size):
        logits_np[i][17963] = 0  # suppress the [UNK] token by zeroing its logit
    logits = D.to_variable(logits_np)

    bsz, beam_width = state.log_probs.shape
    onehot_eos = L.cast(F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size),
                        'int64')  #[1, V]

    probs = L.log(L.softmax(logits))  #[B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  #[B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  #[B*W, V]

    not_finished = 1 - L.reshape(state.finished, [-1, 1])  #[B*W,1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  #[B*W,V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add

    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)
    if is_first_step:
        allscore = L.reshape(
            allscore,
            [bsz, beam_width, -1])[:, 0, :]  # first step only considers beam 0
    scores, idx = L.topk(allscore, k=beam_width)  #[B, W]
    next_beam_id = idx // vocab_size  #[B, W]
    next_word_id = idx % vocab_size

    gather_idx = L.concat([L.where(idx != -1)[:, :1],
                           L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)

    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1],
         L.reshape(next_beam_id, [-1, 1])], 1)
    next_finished = L.reshape(
        L.gather_nd(state.finished, gather_idx), state.finished.shape
    )  # gather the new beam state according to the new beam id

    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')

    next_state = BeamSearchState(log_probs=next_probs,
                                 lengths=next_len,
                                 finished=next_finished)
    output = BeamSearchOutput(scores=scores,
                              predicted_ids=next_word_id,
                              beam_parent_ids=next_beam_id)

    return output, next_state
Example #9
def cross_entropy_label_smooth(preds, targets, epsilon):
    preds = fluid.layers.softmax(preds)
    targets_one_hot = fluid.one_hot(input=targets, depth=args.class_num)
    targets_smooth = fluid.layers.label_smooth(targets_one_hot,
                                               epsilon=epsilon,
                                               dtype="float32")
    loss = fluid.layers.cross_entropy(input=preds,
                                      label=targets_smooth,
                                      soft_label=True)
    return loss
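With a uniform prior, fluid.layers.label_smooth computes (1 - epsilon) * one_hot + epsilon / num_classes, and the soft-label cross-entropy is -sum(target * log(p)). A NumPy sketch of what this example evaluates, assuming the default uniform prior:

import numpy as np

epsilon, num_classes = 0.1, 4
one_hot = np.array([[0.0, 1.0, 0.0, 0.0]])
preds = np.array([[0.1, 0.7, 0.1, 0.1]])        # already softmaxed

smoothed = (1 - epsilon) * one_hot + epsilon / num_classes
loss = -(smoothed * np.log(preds)).sum(-1)      # soft-label cross-entropy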
Example #10
    def req_cost(self, program, score):
        score = fluid.one_hot(score, CLASSIFY_NUM)
        loss = program.current_block().create_var(name="cosnn_loss_tmp",
                                                  dtype="float32",
                                                  shape=[1])
        layers.py_func(func=_gt_score_loss,
                       x=[self.layers_out, score],
                       out=loss,
                       backward_func=_backward_gt_score)
        # loss = layers.cross_entropy(self.layers_out, score)
        return layers.mean(loss)
Example #11
    def _run(self, depth):
        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
        one_hot_label = fluid.one_hot(input=label, depth=depth)

        place = fluid.NPUPlace(0)
        label_data = np.array([np.random.randint(0, 10 - 1)
                               for i in range(6)]).reshape([6, 1])

        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        ret = exe.run(feed={'label': label_data, },
                      fetch_list=[one_hot_label],
                      return_numpy=False)
Example #12
    def test_api_with_dygraph(self):
        depth = 10
        label = np.array([np.random.randint(0, depth - 1)
                          for i in range(6)]).reshape([6, 1])
        with fluid.dygraph.guard():
            one_hot_label = fluid.one_hot(
                input=fluid.dygraph.to_variable(label), depth=depth)

            one_hot_label = paddle.nn.functional.one_hot(
                fluid.dygraph.to_variable(label), depth)
            with _test_eager_guard():
                one_hot_label = paddle.nn.functional.one_hot(
                    paddle.to_tensor(label), depth)
Example #13
    def build_program(self, backward=False, dtype=None):
        import paddle.fluid as fluid

        self.name = "one_hot"
        with fluid.program_guard(self.main_program, self.startup_program):
            input = fluid.data(name='input',
                               shape=config.input_shape,
                               dtype='int32',
                               lod_level=0)
            input.stop_gradient = False
            result = fluid.one_hot(input=input, depth=config.depth)

            self.feed_vars = [input]
            self.fetch_vars = [result]
Example #14
    def arc_margin_product(self,
                           input,
                           label,
                           out_dim,
                           m,
                           s,
                           easy_margin=False):
        # input = fluid.layers.l2_normalize(input, axis=1)
        input_norm = fluid.layers.sqrt(
            fluid.layers.reduce_sum(fluid.layers.square(input), dim=1))
        input = fluid.layers.elementwise_div(input, input_norm, axis=0)

        if self.weight is None:
            self.weight = fluid.layers.create_parameter(
                shape=[self.class_dim, input.shape[1]],
                dtype='float32',
                name='weight_norm',
                attr=fluid.param_attr.ParamAttr(
                    initializer=fluid.initializer.Xavier()))
        # weight = fluid.layers.l2_normalize(weight, axis=1)
        weight_norm = fluid.layers.sqrt(
            fluid.layers.reduce_sum(fluid.layers.square(self.weight), dim=1))
        weight = fluid.layers.elementwise_div(self.weight, weight_norm, axis=0)
        weight = fluid.layers.transpose(weight, perm=[1, 0])
        cosine = fluid.layers.mul(input, weight)
        sine = fluid.layers.sqrt(1.0 - fluid.layers.square(cosine) + 1e-6)

        cos_m = math.cos(m)
        sin_m = math.sin(m)
        phi = cosine * cos_m - sine * sin_m

        th = math.cos(math.pi - m)
        mm = math.sin(math.pi - m) * m
        if easy_margin:
            phi = self.paddle_where_more_than(cosine, 0, phi, cosine)
        else:
            phi = self.paddle_where_more_than(cosine, th, phi, cosine - mm)

        one_hot = fluid.one_hot(input=label, depth=out_dim)
        one_hot = fluid.layers.squeeze(input=one_hot, axes=[1])
        output = fluid.layers.elementwise_mul(
            one_hot, phi) + fluid.layers.elementwise_mul(
                (1.0 - one_hot), cosine)
        output = output * s
        return output
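The trigonometry implements the ArcFace additive angular margin: with cosine = cos(theta), phi = cosine * cos(m) - sine * sin(m) is exactly cos(theta + m), and the one-hot mix applies the margin only to the target class before scaling by s. A NumPy check of the identity:

import numpy as np

m = 0.5
theta = np.linspace(0.1, 3.0, 5)                # stay inside (0, pi) so sine >= 0
cosine = np.cos(theta)
sine = np.sqrt(1.0 - cosine ** 2)
phi = cosine * np.cos(m) - sine * np.sin(m)
assert np.allclose(phi, np.cos(theta + m))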
Example #15
    def arc_margin_product(self, input, label, out_dim, s=32.0, m=0.50,
                           mode=2):
        input_norm = fluid.layers.sqrt(
            fluid.layers.reduce_sum(
                fluid.layers.square(input), dim=1))
        input = fluid.layers.elementwise_div(input, input_norm, axis=0)

        weight = fluid.layers.create_parameter(
            shape=[out_dim, input.shape[1]],
            dtype='float32',
            name='weight_norm',
            attr=fluid.param_attr.ParamAttr(
                initializer=fluid.initializer.Xavier(),
                regularizer=fluid.regularizer.L2Decay(4e-4)))

        weight_norm = fluid.layers.sqrt(
            fluid.layers.reduce_sum(
                fluid.layers.square(weight), dim=1))
        weight = fluid.layers.elementwise_div(weight, weight_norm, axis=0)
        weight = fluid.layers.transpose(weight, perm=[1, 0])
        cosine = fluid.layers.mul(input, weight)
        sine = fluid.layers.sqrt(1.0 - fluid.layers.square(cosine))

        cos_m = math.cos(m)
        sin_m = math.sin(m)
        phi = cosine * cos_m - sine * sin_m

        th = math.cos(math.pi - m)
        mm = math.sin(math.pi - m) * m

        if mode == 1:
            phi = self.paddle_where_more_than(cosine, 0, phi, cosine)
        elif mode == 2:
            phi = self.paddle_where_more_than(cosine, th, phi, cosine - mm)
        else:
            pass

        one_hot = fluid.one_hot(input=label, depth=out_dim)
        output = fluid.layers.elementwise_mul(
            one_hot, phi) + fluid.layers.elementwise_mul(
                (1.0 - one_hot), cosine)
        output = output * s
        return output
Example #16
    def __init__(self, pretrain_path, N, K, max_length, hidden_size,
                 att_dim, induction_iters, relation_size):
        """
        Args:
            pretrain_path: str. Path for word embedding and word id.
            N: int. N-way.
            K: int. K-shot.
            max_length: int. 
            hidden_size: int.
            att_dim: int.
            induction_iters: int.
            relation_size: int."""
        totalQ = fluid.data(name="totalQ", shape=[None], dtype="int32")  # total query
        total_Q = totalQ[0]
        support = fluid.data(name="support", shape=[None, N, K, max_length], dtype="int64")  # [B, N, K, T]
        support_len = fluid.data(name="support_len", shape=[None, N, K], dtype="int64")  # [B, N, K]
        query = fluid.data(name="query", shape=[None, None, max_length], dtype="int64")  # [B, totalQ, T]
        query_len = fluid.data(name="query_len", shape=[None, None], dtype="int64")  # [B, totalQ]
        label = fluid.data(name="label", shape=[None, None], dtype="int64")  # [B, totalQ]

        # Three separate data readers are required (train / val / test).
        # See https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/data_preparing/static_mode/use_py_reader.html
        self.train_reader = fluid.io.DataLoader.from_generator(
            feed_list=[totalQ, support, support_len, query, query_len, label],
            capacity=8, iterable=True
        )
        self.val_reader = fluid.io.DataLoader.from_generator(
            feed_list=[totalQ, support, support_len, query, query_len, label],
            capacity=8, iterable=True
        )
        self.test_reader = fluid.io.DataLoader.from_generator(
            feed_list=[totalQ, support, support_len, query, query_len, label],
            capacity=8, iterable=True
        )

        # 1. Encoder
        word_vec, vocab_size, embed_size = self.__load_embed_matrix(pretrain_path)

        support = fluid.layers.reshape(support, shape=[-1, max_length])  # [BNK, T]
        support_len = fluid.layers.reshape(support_len, shape=[-1])  # [BNK]
        support_emb = self.encoder_module(
            support, support_len, max_length, word_vec,
            vocab_size, embed_size, hidden_size, att_dim)  # [BNK, 2H]
        support_emb = fluid.layers.reshape(support_emb, shape=[-1, N, K, 2 * hidden_size])  # [B, N, K, 2H]

        query = fluid.layers.reshape(query, shape=[-1, max_length])  # [B*totalQ, T]
        query_len = fluid.layers.reshape(query_len, shape=[-1])  # [B*totalQ]
        query_emb = self.encoder_module(
            query, query_len, max_length, word_vec,
            vocab_size, embed_size, hidden_size, att_dim)  # [B*totalQ, 2H]
        query_emb = fluid.layers.reshape(query_emb, shape=[-1, total_Q, 2 * hidden_size])  # [B, totalQ, 2H]
        
        # 2. Induction
        class_emb = self.induction_module(support_emb, N, K, induction_iters,
                                          hidden_size)  # [B, N, 1, 2H]
        
        # 3. Relation
        relation_score = self.relation_module(class_emb, query_emb, N, total_Q,
                                              hidden_size, relation_size)  # [B, totalQ, N]
        
        # Return
        label_onehot = fluid.one_hot(label, depth=N)  # [B, totalQ, N]
        self.loss = fluid.layers.mse_loss(relation_score, label_onehot)  # [1]
        self.mean_acc = fluid.layers.accuracy(
            input=fluid.layers.reshape(relation_score, shape=[-1, N]),
            label=fluid.layers.reshape(label, shape=[-1, 1]))  # [1]
        self.prediction = fluid.layers.argmax(relation_score, axis=-1)  # [B, totalQ]
Example #17
def seq2seq(model, tokenizer, args):
    log.info('Training starts with args: %r' % args)
    attn_id = tokenizer.vocab[args.attn_token]

    def gen_mask(batch_ids, mask_type='bidi', query_len=None, pad_value=0):
        if query_len is None:
            query_len = batch_ids.shape[1]
        if mask_type != 'empty':
            mask = (batch_ids != pad_value).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
            if mask_type == 'causal':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask)
            elif mask_type == 'causal_without_diag':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask, -1)
            elif mask_type == 'diag':
                assert query_len == batch_ids.shape[1]
                mask = np.stack([np.diag(np.diag(m)) for m in mask], 0)
        else:
            mask = np.zeros_like(batch_ids).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
        return mask

    def make_some_noice(ids):
        if args.use_random_noice:
            noice_ids = np.random.randint(1,
                                          len(tokenizer.vocab),
                                          size=ids.shape)
        else:
            noice_ids = np.ones_like(ids) * tokenizer.vocab['[NOISE]']
        pos, = np.where(np.ones_like(ids))
        np.random.shuffle(pos)
        pos = pos[:int(args.noise_prob * len(pos))]
        ids[pos, ] = noice_ids[pos, ]
        return ids

    def map_fn(example_id, src_ids, tgt_ids):
        src_ids = src_ids[:args.max_encode_len]
        tgt_ids = tgt_ids[:args.max_decode_len]
        src_ids, src_sids = tokenizer.build_for_ernie(src_ids)
        src_pids = np.arange(len(src_ids))

        tgt_ids, tgt_sids = tokenizer.build_for_ernie(tgt_ids)
        tgt_pids = np.arange(len(tgt_ids)) + len(src_ids)  # continuous positions
        tgt_sids = np.ones_like(tgt_sids) * args.tgt_type_id

        attn_ids = np.ones_like(tgt_ids) * attn_id
        if args.noise_prob > 0.:
            tgt_labels = deepcopy(tgt_ids)
            tgt_ids = make_some_noice(tgt_ids)  # corrupted
        else:
            tgt_labels = tgt_ids

        return (example_id, src_ids, src_pids, src_sids, tgt_ids, tgt_pids,
                tgt_sids, attn_ids, tgt_labels)

    def after_padding(example_id, src_ids, src_pids, src_sids, tgt_ids,
                      tgt_pids, tgt_sids, attn_ids, tgt_labels):
        '''
        attention mask:
        ***  src,  tgt, attn
        src  00,   01,   02
        tgt  10,   11,   12
        attn 20,   21,   22

        ***   s1, s2 | t1 t2 t3| attn1 attn2 attn3
        s1    1,  1  | 0, 0, 0,| 0,    0,    0,
        s2    1,  1  | 0, 0, 0,| 0,    0,    0,
        -
        t1    1,  1, | 1, 0, 0,| 0,    0,    0,
        t2    1,  1, | 1, 1, 0,| 0,    0,    0,
        t3    1,  1, | 1, 1, 1,| 0,    0,    0,
        -
        attn1 1,  1, | 0, 0, 0,| 1,    0,    0,
        attn2 1,  1, | 1, 0, 0,| 0,    1,    0,
        attn3 1,  1, | 1, 1, 0,| 0,    0,    1,

        for details, see Fig3. https://arxiv.org/abs/2001.11314
        '''

        src_len = src_ids.shape[1]
        tgt_len = tgt_ids.shape[1]
        mask_00 = gen_mask(src_ids, 'bidi', query_len=src_len)
        mask_01 = gen_mask(tgt_ids, 'empty', query_len=src_len)
        mask_02 = gen_mask(attn_ids, 'empty', query_len=src_len)

        mask_10 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_11 = gen_mask(tgt_ids, 'causal', query_len=tgt_len)
        mask_12 = gen_mask(attn_ids, 'empty', query_len=tgt_len)

        mask_20 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_21 = gen_mask(tgt_ids, 'causal_without_diag', query_len=tgt_len)
        mask_22 = gen_mask(attn_ids, 'diag', query_len=tgt_len)
        '''
        mask = np.concatenate([
            np.concatenate([mask_00, mask_01, mask_02], 2),
            np.concatenate([mask_10, mask_11, mask_12], 2),
            np.concatenate([mask_20, mask_21, mask_22], 2),
        ], 1)

        ids = np.concatenate([src_ids, tgt_ids, attn_ids], 1)
        pids = np.concatenate([src_pids, tgt_pids, tgt_pids], 1)
        sids = np.concatenate([src_sids, tgt_sids, tgt_sids], 1)

        '''

        mask_src_2_src = mask_00
        mask_tgt_2_srctgt = np.concatenate([mask_10, mask_11], 2)
        mask_attn_2_srctgtattn = np.concatenate([mask_20, mask_21, mask_22], 2)

        tgt_labels = tgt_labels[np.where(tgt_labels != 0)]
        return (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
                tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
                mask_attn_2_srctgtattn, tgt_labels)

    bytes_vocab = {k.encode('utf8'): v for k, v in tokenizer.vocab.items()}
    feature_column = propeller.data.FeatureColumns([
        propeller.data.LabelColumn('id'),
        propeller.data.TextColumn('src',
                                  unk_id=tokenizer.unk_id,
                                  vocab_dict=bytes_vocab),
        propeller.data.TextColumn('tgt',
                                  unk_id=tokenizer.unk_id,
                                  vocab_dict=bytes_vocab),
    ])

    train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=False, repeat=True, use_gz=False) \
                                   .map(map_fn)

    dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
                                   .map(map_fn) \
                                   .padded_batch(args.eval_bsz) \
                                   .map(after_padding)

    log.debug('shard %d of %d' %
              (D.parallel.Env().dev_id, D.parallel.Env().nranks))
    train_ds = train_ds.shard(
        D.parallel.Env().nranks,
        D.parallel.Env().dev_id).shuffle(10000).padded_batch(
            args.bsz).map(after_padding)
    dev_ds = dev_ds.shard(D.parallel.Env().nranks, D.parallel.Env().dev_id)

    shapes = [[None, None]] * 7 + [[None, None, None]] * 3 + [[None]]
    types = ['int64'] * 11

    train_ds.data_shapes = shapes
    train_ds.data_types = types
    dev_ds.data_shapes = shapes
    dev_ds.data_types = types

    vocab_size, _ = model.word_emb.weight.shape
    ctx = D.parallel.prepare_context()
    model = D.parallel.DataParallel(model, ctx)
    g_clip = F.clip.GradientClipByGlobalNorm(1.0)
    opt = AdamW(learning_rate=LinearDecay(
        args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps),
                parameter_list=model.parameters(),
                weight_decay=args.wd,
                grad_clip=g_clip)
    attn_id = tokenizer.vocab[args.attn_token]
    for step, data in enumerate(train_ds.start(place)):
        (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids, tgt_pids,
         attn_ids, mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
         tgt_labels) = data

        _, __, info = model(src_ids,
                            sent_ids=src_sids,
                            pos_ids=src_pids,
                            attn_bias=mask_src_2_src,
                            encode_only=True)
        cached_k, cached_v = info['caches']
        _, __, info = model(tgt_ids,
                            sent_ids=tgt_sids,
                            pos_ids=tgt_pids,
                            attn_bias=mask_tgt_2_srctgt,
                            past_cache=(cached_k, cached_v),
                            encode_only=True)
        cached_k2, cached_v2 = info['caches']
        past_cache_k = [
            L.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)
        ]
        past_cache_v = [
            L.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)
        ]
        if args.label_smooth > 0.:
            tgt_labels = L.label_smooth(F.one_hot(tgt_labels, vocab_size),
                                        epsilon=args.label_smooth)
        loss, _, __ = model(attn_ids,
                            sent_ids=tgt_sids,
                            pos_ids=tgt_pids,
                            attn_bias=mask_attn_2_srctgtattn,
                            past_cache=(past_cache_k, past_cache_v),
                            tgt_labels=tgt_labels,
                            tgt_pos=L.where(attn_ids == attn_id))

        scaled_loss = model.scale_loss(loss)
        scaled_loss.backward()
        model.apply_collective_grads()
        opt.minimize(scaled_loss)
        model.clear_gradients()
        if step % 10 == 0:
            loss = loss.numpy()
            ppl = np.exp(loss)
            log.debug('[step %d]train loss %.5f, ppl %.5f, lr %.3e' %
                      (step, loss, ppl, opt.current_step_lr()))
        if args.save_dir is not None and step % 1000 == 0 and D.parallel.Env(
        ).dev_id == 0:
            F.save_dygraph(model.state_dict(), args.save_dir)
        if args.predict_output_dir is not None and step > args.skip_eval_steps and step % args.eval_steps == 0:
            assert os.path.exists(
                args.predict_output_dir
            ), 'predict_output_dir not found: %s' % args.predict_output_dir
            log.debug('doing predict on gpu %d...' % D.parallel.Env().dev_id)
            evaluate(model, dev_ds, step, args)
        if step > args.max_steps:
            break
    evaluate(model, dev_ds, step, args)

    if args.save_dir is not None:
        F.save_dygraph(model.state_dict(), args.save_dir)
Example #18
def softmax_with_loss(logit,
                      label,
                      ignore_mask=None,
                      num_classes=2,
                      weight=None):
    ignore_mask = fluid.layers.cast(ignore_mask, 'float32')
    label = fluid.layers.elementwise_min(
        label, fluid.layers.assign(np.array([num_classes - 1],
                                            dtype=np.int32)))
    logit = fluid.layers.transpose(logit, [0, 2, 3, 1])
    logit = fluid.layers.reshape(logit, [-1, num_classes])
    label = fluid.layers.reshape(label, [-1, 1])
    label = fluid.layers.cast(label, 'int64')
    ignore_mask = fluid.layers.reshape(ignore_mask, [-1, 1])
    if weight is None:
        loss, probs = fluid.layers.softmax_with_cross_entropy(
            logit,
            label,
            ignore_index=cfg.DATASET.IGNORE_INDEX,
            return_softmax=True)
    else:
        label = fluid.layers.squeeze(label, axes=[-1])
        label_one_hot = fluid.one_hot(input=label, depth=num_classes)
        if isinstance(weight, list):
            assert len(
                weight
            ) == num_classes, "weight length must equal num of classes"
            weight = fluid.layers.assign(np.array([weight], dtype='float32'))
        elif isinstance(weight, str):
            assert weight.lower(
            ) == 'dynamic', 'if weight is string, must be dynamic!'
            tmp = []
            total_num = fluid.layers.cast(
                fluid.layers.shape(label)[0], 'float32')
            for i in range(num_classes):
                cls_pixel_num = fluid.layers.reduce_sum(label_one_hot[:, i])
                ratio = total_num / (cls_pixel_num + 1)
                tmp.append(ratio)
            weight = fluid.layers.concat(tmp)
            weight = weight / fluid.layers.reduce_sum(weight) * num_classes
        elif isinstance(weight, fluid.layers.Variable):
            pass
        else:
            raise ValueError(
                'Expect weight is a list, string or Variable, but receive {}'.
                format(type(weight)))
        weight = fluid.layers.reshape(weight, [1, num_classes])
        weighted_label_one_hot = fluid.layers.elementwise_mul(
            label_one_hot, weight)
        probs = fluid.layers.softmax(logit)
        # weighted_label_one_hot = weighted_label_one_hot*(1-probs)
        loss = fluid.layers.cross_entropy(
            probs,
            weighted_label_one_hot,
            soft_label=True,
            ignore_index=cfg.DATASET.IGNORE_INDEX)
        weighted_label_one_hot.stop_gradient = True

    loss = loss * ignore_mask
    avg_loss = fluid.layers.mean(loss) / (fluid.layers.mean(ignore_mask) +
                                          cfg.MODEL.DEFAULT_EPSILON)

    label.stop_gradient = True
    ignore_mask.stop_gradient = True
    return avg_loss
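In the 'dynamic' branch each class weight is an inverse pixel frequency, total / (count_i + 1), renormalized so the weights sum to num_classes (i.e. average weight 1). A NumPy sketch of that weighting, assuming integer labels in [0, num_classes):

import numpy as np

num_classes = 3
label = np.array([0, 0, 0, 1, 2, 2])            # flattened pixel labels
counts = np.bincount(label, minlength=num_classes)
weight = len(label) / (counts + 1.0)            # rarer classes get larger weights
weight = weight / weight.sum() * num_classes    # normalize: weights sum to num_classes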
Example #19
def get_reg_loss(pred_reg, reg_label, fg_mask, point_num, loc_scope,
                 loc_bin_size, num_head_bin, anchor_size,
                 get_xz_fine=True, get_y_by_bin=False, loc_y_scope=0.5,
                 loc_y_bin_size=0.25, get_ry_fine=False):

    """
    Bin-based 3D bounding boxes regression loss. See https://arxiv.org/abs/1812.04244 for more details.

    :param pred_reg: (N, C)
    :param reg_label: (N, 7) [dx, dy, dz, h, w, l, ry]
    :param loc_scope: constant
    :param loc_bin_size: constant
    :param num_head_bin: constant
    :param anchor_size: (N, 3) or (3)
    :param get_xz_fine:
    :param get_y_by_bin:
    :param loc_y_scope:
    :param loc_y_bin_size:
    :param get_ry_fine:
    :return:
    """
    fg_num = fluid.layers.cast(fluid.layers.reduce_sum(fg_mask), dtype=pred_reg.dtype)
    fg_num = fluid.layers.clip(fg_num, min=1.0, max=point_num)
    fg_scale = float(point_num) / fg_num

    per_loc_bin_num = int(loc_scope / loc_bin_size) * 2
    loc_y_bin_num = int(loc_y_scope / loc_y_bin_size) * 2

    reg_loss_dict = {}

    # xz localization loss
    x_offset_label, y_offset_label, z_offset_label = reg_label[:, 0:1], reg_label[:, 1:2], reg_label[:, 2:3]
    x_shift = fluid.layers.clip(x_offset_label + loc_scope, 0., loc_scope * 2 - 1e-3)
    z_shift = fluid.layers.clip(z_offset_label + loc_scope, 0., loc_scope * 2 - 1e-3)
    x_bin_label = fluid.layers.cast(x_shift / loc_bin_size, dtype='int64')
    z_bin_label = fluid.layers.cast(z_shift / loc_bin_size, dtype='int64')

    x_bin_l, x_bin_r = 0, per_loc_bin_num
    z_bin_l, z_bin_r = per_loc_bin_num, per_loc_bin_num * 2
    start_offset = z_bin_r

    loss_x_bin = fluid.layers.softmax_with_cross_entropy(pred_reg[:, x_bin_l: x_bin_r], x_bin_label)
    loss_x_bin = fluid.layers.reduce_mean(loss_x_bin * fg_mask) * fg_scale
    loss_z_bin = fluid.layers.softmax_with_cross_entropy(pred_reg[:, z_bin_l: z_bin_r], z_bin_label)
    loss_z_bin = fluid.layers.reduce_mean(loss_z_bin * fg_mask) * fg_scale
    reg_loss_dict['loss_x_bin'] = loss_x_bin
    reg_loss_dict['loss_z_bin'] = loss_z_bin
    loc_loss = loss_x_bin + loss_z_bin

    if get_xz_fine:
        x_res_l, x_res_r = per_loc_bin_num * 2, per_loc_bin_num * 3
        z_res_l, z_res_r = per_loc_bin_num * 3, per_loc_bin_num * 4
        start_offset = z_res_r

        x_res_label = x_shift - (fluid.layers.cast(x_bin_label, dtype=x_shift.dtype) * loc_bin_size + loc_bin_size / 2.)
        z_res_label = z_shift - (fluid.layers.cast(z_bin_label, dtype=z_shift.dtype) * loc_bin_size + loc_bin_size / 2.)
        x_res_norm_label = x_res_label / loc_bin_size
        z_res_norm_label = z_res_label / loc_bin_size

        x_bin_onehot = fluid.one_hot(x_bin_label[:, 0], depth=per_loc_bin_num)
        z_bin_onehot = fluid.one_hot(z_bin_label[:, 0], depth=per_loc_bin_num)

        loss_x_res = fluid.layers.smooth_l1(fluid.layers.reduce_sum(pred_reg[:, x_res_l: x_res_r] * x_bin_onehot, dim=1, keep_dim=True), x_res_norm_label)
        loss_x_res = fluid.layers.reduce_mean(loss_x_res * fg_mask) * fg_scale
        loss_z_res = fluid.layers.smooth_l1(fluid.layers.reduce_sum(pred_reg[:, z_res_l: z_res_r] * z_bin_onehot, dim=1, keep_dim=True), z_res_norm_label)
        loss_z_res = fluid.layers.reduce_mean(loss_z_res * fg_mask) * fg_scale
        reg_loss_dict['loss_x_res'] = loss_x_res
        reg_loss_dict['loss_z_res'] = loss_z_res
        loc_loss += loss_x_res + loss_z_res

    # y localization loss
    if get_y_by_bin:
        y_bin_l, y_bin_r = start_offset, start_offset + loc_y_bin_num
        y_res_l, y_res_r = y_bin_r, y_bin_r + loc_y_bin_num
        start_offset = y_res_r

        y_shift = fluid.layers.clip(y_offset_label + loc_y_scope, 0., loc_y_scope * 2 - 1e-3)
        y_bin_label = fluid.layers.cast(y_shift / loc_y_bin_size, dtype='int64')
        y_res_label = y_shift - (fluid.layers.cast(y_bin_label, dtype=y_shift.dtype) * loc_y_bin_size + loc_y_bin_size / 2.)
        y_res_norm_label = y_res_label / loc_y_bin_size

        y_bin_onehot = fluid.one_hot(y_bin_label[:, 0], depth=loc_y_bin_num)

        loss_y_bin = fluid.layers.softmax_with_cross_entropy(pred_reg[:, y_bin_l: y_bin_r], y_bin_label)
        loss_y_bin = fluid.layers.reduce_mean(loss_y_bin * fg_mask) * fg_scale
        loss_y_res = fluid.layers.smooth_l1(fluid.layers.reduce_sum(pred_reg[:, y_res_l: y_res_r] * y_bin_onehot, dim=1, keep_dim=True), y_res_norm_label)
        loss_y_res = fluid.layers.reduce_mean(loss_y_res * fg_mask) * fg_scale

        reg_loss_dict['loss_y_bin'] = loss_y_bin
        reg_loss_dict['loss_y_res'] = loss_y_res

        loc_loss += loss_y_bin + loss_y_res
    else:
        y_offset_l, y_offset_r = start_offset, start_offset + 1
        start_offset = y_offset_r

        loss_y_offset = fluid.layers.smooth_l1(fluid.layers.reduce_sum(pred_reg[:, y_offset_l: y_offset_r], dim=1, keep_dim=True), y_offset_label)
        loss_y_offset = fluid.layers.reduce_mean(loss_y_offset * fg_mask) * fg_scale
        reg_loss_dict['loss_y_offset'] = loss_y_offset
        loc_loss += loss_y_offset

    # angle loss
    ry_bin_l, ry_bin_r = start_offset, start_offset + num_head_bin
    ry_res_l, ry_res_r = ry_bin_r, ry_bin_r + num_head_bin

    ry_label = reg_label[:, 6:7]

    if get_ry_fine:
        # divide pi/2 into several bins
        angle_per_class = (np.pi / 2) / num_head_bin

        ry_label = ry_label % (2 * np.pi)  # 0 ~ 2pi
        opposite_flag = fluid.layers.logical_and(ry_label > np.pi * 0.5, ry_label < np.pi * 1.5)
        opposite_flag = fluid.layers.cast(opposite_flag, dtype=ry_label.dtype)
        shift_angle = (ry_label + opposite_flag * np.pi + np.pi * 0.5) % (2 * np.pi)  # (0 ~ pi)
        shift_angle.stop_gradient = True

        shift_angle = fluid.layers.clip(shift_angle - np.pi * 0.25, min=1e-3, max=np.pi * 0.5 - 1e-3)  # (0, pi/2)

        # bin center is (5, 10, 15, ..., 85)
        ry_bin_label = fluid.layers.cast(shift_angle / angle_per_class, dtype='int64')
        ry_res_label = shift_angle - (fluid.layers.cast(ry_bin_label, dtype=shift_angle.dtype) * angle_per_class + angle_per_class / 2)
        ry_res_norm_label = ry_res_label / (angle_per_class / 2)

    else:
        # divide 2pi into several bins
        angle_per_class = (2 * np.pi) / num_head_bin
        heading_angle = ry_label % (2 * np.pi)  # 0 ~ 2pi

        shift_angle = (heading_angle + angle_per_class / 2) % (2 * np.pi)
        shift_angle.stop_gradient = True
        ry_bin_label = fluid.layers.cast(shift_angle / angle_per_class, dtype='int64')
        ry_res_label = shift_angle - (fluid.layers.cast(ry_bin_label, dtype=shift_angle.dtype) * angle_per_class + angle_per_class / 2)
        ry_res_norm_label = ry_res_label / (angle_per_class / 2)

    ry_bin_onehot = fluid.one_hot(ry_bin_label[:, 0], depth=num_head_bin)
    loss_ry_bin = fluid.layers.softmax_with_cross_entropy(pred_reg[:, ry_bin_l:ry_bin_r], ry_bin_label)
    loss_ry_bin = fluid.layers.reduce_mean(loss_ry_bin * fg_mask) * fg_scale
    loss_ry_res = fluid.layers.smooth_l1(fluid.layers.reduce_sum(pred_reg[:, ry_res_l: ry_res_r] * ry_bin_onehot, dim=1, keep_dim=True), ry_res_norm_label)
    loss_ry_res = fluid.layers.reduce_mean(loss_ry_res * fg_mask) * fg_scale

    reg_loss_dict['loss_ry_bin'] = loss_ry_bin
    reg_loss_dict['loss_ry_res'] = loss_ry_res
    angle_loss = loss_ry_bin + loss_ry_res

    # size loss
    size_res_l, size_res_r = ry_res_r, ry_res_r + 3
    assert pred_reg.shape[1] == size_res_r, '%d vs %d' % (pred_reg.shape[1], size_res_r)

    anchor_size_var = fluid.layers.zeros(shape=[3], dtype=reg_label.dtype)
    fluid.layers.assign(np.array(anchor_size).astype('float32'), anchor_size_var)
    size_res_norm_label = (reg_label[:, 3:6] - anchor_size_var) / anchor_size_var
    size_res_norm_label = fluid.layers.reshape(size_res_norm_label, shape=[-1, 1], inplace=True)
    size_res_norm = pred_reg[:, size_res_l:size_res_r]
    size_res_norm = fluid.layers.reshape(size_res_norm, shape=[-1, 1], inplace=True)
    size_loss = fluid.layers.smooth_l1(size_res_norm, size_res_norm_label)
    size_loss = fluid.layers.reshape(size_loss, shape=[-1, 3])
    size_loss = fluid.layers.reduce_mean(size_loss * fg_mask) * fg_scale

    # Total regression loss
    reg_loss_dict['loss_loc'] = loc_loss
    reg_loss_dict['loss_angle'] = angle_loss
    reg_loss_dict['loss_size'] = size_loss

    return loc_loss, angle_loss, size_loss, reg_loss_dict
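The bin/residual encoding used throughout this loss: an offset in [-loc_scope, loc_scope) is shifted to [0, 2 * loc_scope), quantized into bins of loc_bin_size, and the leftover residual is normalized by the bin size; decoding is the exact inverse. A NumPy sketch for one axis:

import numpy as np

loc_scope, loc_bin_size = 3.0, 0.5
x = np.array([-2.3, 0.0, 1.7])

shift = np.clip(x + loc_scope, 0.0, 2 * loc_scope - 1e-3)
bin_idx = (shift / loc_bin_size).astype(np.int64)
res = shift - (bin_idx * loc_bin_size + loc_bin_size / 2.0)
res_norm = res / loc_bin_size                   # regression target in [-0.5, 0.5)

decoded = (bin_idx * loc_bin_size + loc_bin_size / 2.0) + res_norm * loc_bin_size - loc_scope
assert np.allclose(decoded, np.clip(x, -loc_scope, loc_scope - 1e-3))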
Example #20
    def _build_decoder(self,
                       z_mean=None,
                       z_log_var=None,
                       enc_output=None,
                       mode='train',
                       beam_size=10):
        dec_input = layers.dropout(self.tar_emb,
                                   dropout_prob=self.dec_dropout_in,
                                   dropout_implementation="upscale_in_train")

        # `output_layer` will be used within BeamSearchDecoder
        output_layer = lambda x: layers.fc(x,
                                           size=self.tar_vocab_size,
                                           num_flatten_dims=len(x.shape) - 1,
                                           name="output_w")

        # `sample_output_layer` samples an id from the logits distribution instead of argmax(logits)
        # it will be used within BeamSearchDecoder
        sample_output_layer = lambda x: layers.unsqueeze(
            fluid.one_hot(layers.unsqueeze(
                layers.sampling_id(layers.softmax(
                    layers.squeeze(output_layer(x), [1])),
                                   dtype='int'), [1]),
                          depth=self.tar_vocab_size), [1])

        if mode == 'train':
            latent_z = self._sampling(z_mean, z_log_var)
        else:
            latent_z = layers.gaussian_random_batch_size_like(
                self.tar, shape=[-1, self.latent_size])
        dec_first_hidden_cell = layers.fc(latent_z,
                                          2 * self.hidden_size *
                                          self.num_layers,
                                          name='fc_hc')
        dec_first_hidden, dec_first_cell = layers.split(
            dec_first_hidden_cell, 2)
        if self.num_layers > 1:
            dec_first_hidden = layers.split(dec_first_hidden, self.num_layers)
            dec_first_cell = layers.split(dec_first_cell, self.num_layers)
        else:
            dec_first_hidden = [dec_first_hidden]
            dec_first_cell = [dec_first_cell]
        dec_initial_states = [[h, c]
                              for h, c in zip(dec_first_hidden, dec_first_cell)
                              ]
        dec_cell = DecoderCell(self.num_layers, self.hidden_size, latent_z,
                               self.param_attr_initializer,
                               self.param_attr_scale, self.dec_dropout_out)

        if mode == 'train':
            dec_output, _ = rnn(cell=dec_cell,
                                inputs=dec_input,
                                initial_states=dec_initial_states,
                                sequence_length=self.tar_sequence_length)
            dec_output = output_layer(dec_output)

            return dec_output
        elif mode == 'greedy':
            start_token = 1
            end_token = 2
            max_length = 100
            beam_search_decoder = BeamSearchDecoder(
                dec_cell,
                start_token,
                end_token,
                beam_size=1,
                embedding_fn=self.tar_embeder,
                output_fn=output_layer)
            outputs, _ = dynamic_decode(beam_search_decoder,
                                        inits=dec_initial_states,
                                        max_step_num=max_length)
            return outputs

        elif mode == 'sampling':
            start_token = 1
            end_token = 2
            max_length = 100
            beam_search_decoder = BeamSearchDecoder(
                dec_cell,
                start_token,
                end_token,
                beam_size=1,
                embedding_fn=self.tar_embeder,
                output_fn=sample_output_layer)

            outputs, _ = dynamic_decode(beam_search_decoder,
                                        inits=dec_initial_states,
                                        max_step_num=max_length)
            return outputs
        else:
            print("mode not supprt", mode)
Example #21
    def finetune(
            self,
            train_path,
            dev_path=None,
            save_dir="ernie_gen_result",
            init_ckpt_path=None,
            use_gpu=True,
            max_steps=500,
            batch_size=8,
            max_encode_len=50,
            max_decode_len=50,
            learning_rate=5e-5,
            warmup_proportion=0.1,
            weight_decay=0.1,
            noise_prob=0,
            label_smooth=0,
            beam_width=5,
            length_penalty=1.0,
            log_interval=100,
            save_interval=200,
    ):
        """
        Finetune on the specified dataset.

        Args:
            train_path(str): the train dataset path.
            dev_path(str): the dev dataset path.
            save_dir(str): the model params and dev dataset predict result save path.
            init_ckpt_path(str): incremental training load path.
            use_gpu(bool): use gpu or not.
            max_steps(int): max training steps.
            batch_size(int): the batch size.
            max_encode_len(int): the max encode length.
            max_decode_len(int): the max decode length.
            learning_rate(float): the learning rate.
            warmup_proportion(float): the warmup proportion.
            weight_decay(float): the weight decay magnitude.
            noise_prob(float): the noise probability. See the ERNIE-GEN paper for details.
            label_smooth(float): the label smooth magnitude.
            beam_width(int): the beam size during evaluating the dev dataset.
            length_penalty(float): the length penalty during evaluating the dev dataset.
            log_interval(int): the log interval.
            save_interval(int): the save interval. dev set will be evaluated after saving.

        Returns:
            result(dict): a dictionary of the form::
                {
                    last_save_path(str): last model save path.
                    last_ppl(float): last model ppl.
                }
        """
        self.max_encode_len = max_encode_len
        self.max_decode_len = max_decode_len
        self.noise_prob = noise_prob

        place = F.CUDAPlace(0) if use_gpu else F.CPUPlace()

        with F.dygraph.guard(place):
            if init_ckpt_path is not None:
                logger.info('loading checkpoint from %s' % init_ckpt_path)
                sd, _ = D.load_dygraph(init_ckpt_path)
                self.model.set_dict(sd)

            feature_column = propeller.data.FeatureColumns([
                propeller.data.LabelColumn('id'),
                propeller.data.TextColumn(
                    'src',
                    unk_id=self.tokenizer.unk_id,
                    vocab_dict=self.tokenizer.vocab,
                    tokenizer=self.tokenizer.tokenize),
                propeller.data.TextColumn(
                    'tgt',
                    unk_id=self.tokenizer.unk_id,
                    vocab_dict=self.tokenizer.vocab,
                    tokenizer=self.tokenizer.tokenize),
            ])

            train_ds = feature_column.build_dataset('train', data_file=train_path, shuffle=False,
                                                    repeat=True, use_gz=False)\
                .map(self._map_fn).shuffle(10000).padded_batch(batch_size).map(self._after_padding)
            train_ds.data_shapes = [[None, None]] * 7 + [[None, None, None]
                                                         ] * 3 + [[None]]
            train_ds.data_types = ['int64'] * 11

            if dev_path:
                dev_ds = feature_column.build_dataset('dev', data_file=dev_path, shuffle=False,
                                                    repeat=False, use_gz=False) \
                    .map(self._map_fn) \
                    .padded_batch(1) \
                    .map(self._after_padding)
                dev_ds.data_shapes = [[None, None]] * 7 + [[None, None, None]
                                                           ] * 3 + [[None]]
                dev_ds.data_types = ['int64'] * 11

            vocab_size, _ = self.model.word_emb.weight.shape
            g_clip = F.clip.GradientClipByGlobalNorm(1.0)
            opt = AdamW(
                learning_rate=LinearDecay(learning_rate,
                                          int(warmup_proportion * max_steps),
                                          max_steps),
                parameter_list=self.model.parameters(),
                weight_decay=weight_decay,
                grad_clip=g_clip)

            loss = None

            save_path = None
            ppl = None

            if save_dir and not os.path.exists(save_dir):
                os.makedirs(save_dir)
            for step, data in enumerate(train_ds.start(place)):
                (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
                 tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
                 mask_attn_2_srctgtattn, tgt_labels) = data

                _, __, info = self.model(
                    src_ids,
                    sent_ids=src_sids,
                    pos_ids=src_pids,
                    attn_bias=mask_src_2_src,
                    encode_only=True)
                cached_k, cached_v = info['caches']
                _, __, info = self.model(
                    tgt_ids,
                    sent_ids=tgt_sids,
                    pos_ids=tgt_pids,
                    attn_bias=mask_tgt_2_srctgt,
                    past_cache=(cached_k, cached_v),
                    encode_only=True)
                cached_k2, cached_v2 = info['caches']
                past_cache_k = [
                    L.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)
                ]
                past_cache_v = [
                    L.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)
                ]
                if label_smooth > 0.:
                    tgt_labels = L.label_smooth(
                        F.one_hot(tgt_labels, vocab_size), epsilon=label_smooth)
                loss, _, __ = self.model(
                    attn_ids,
                    sent_ids=tgt_sids,
                    pos_ids=tgt_pids,
                    attn_bias=mask_attn_2_srctgtattn,
                    past_cache=(past_cache_k, past_cache_v),
                    tgt_labels=tgt_labels,
                    tgt_pos=L.where(attn_ids == self.tokenizer.vocab['[MASK]']))

                loss.backward()
                opt.minimize(loss)
                self.model.clear_gradients()

                if step % log_interval == 0:
                    loss_np = loss.numpy()
                    ppl = np.exp(loss_np)
                    logger.info(
                        '[step %d / %d]train loss %.5f, ppl %.5f, elr %.3e' %
                        (step, max_steps, loss_np, ppl, opt.current_step_lr()))
                if save_dir and step % save_interval == 0 and step > 0:
                    loss_np = loss.numpy()
                    ppl = np.exp(loss_np)
                    save_name = "step_%s_ppl_%.5f" % (step, ppl)
                    save_path = os.path.join(save_dir, save_name)
                    logger.info("save the model in %s" % save_path)
                    F.save_dygraph(self.model.state_dict(), save_path)

                    if dev_path:
                        logger.info('evaluating...')
                        res = self._evaluate(dev_ds, place, beam_width,
                                             length_penalty)
                        output_path = os.path.join(
                            save_dir, "step_%s_ppl_%.5f.txt" % (step, ppl))
                        logger.info(
                            'save the predict result in %s' % output_path)
                        with open(output_path, 'w') as fout:
                            fout.write(('\n'.join(res)))

                if step > max_steps:
                    break

            if loss:
                loss_np = loss.numpy()
                ppl = np.exp(loss_np)
                logger.info('[final step %d]train loss %.5f, ppl %.5f, elr %.3e'
                            % (step, loss_np, ppl, opt.current_step_lr()))
                if save_dir:
                    save_name = "step_%s_ppl_%.5f" % (step, ppl)
                    save_path = os.path.join(save_dir, save_name)
                    logger.info("save the model in %s" % save_path)
                    F.save_dygraph(self.model.state_dict(), save_path)

                    if dev_path:
                        logger.info('evaluating...')
                        res = self._evaluate(dev_ds, place, beam_width,
                                             length_penalty)
                        output_path = os.path.join(
                            save_dir, "step_%s_ppl_%.5f.txt" % (step, ppl))
                        logger.info(
                            'save the predict result in %s' % output_path)
                        with open(output_path, 'w') as fout:
                            fout.write(('\n'.join(res)))

            result = {
                "last_save_path": "%s.pdparams" % save_path,
                "last_ppl": ppl[0],
            }

            return result
Example #22
def test_bad_x():
    label = fluid.layers.data(name="label",
                              shape=[4],
                              append_batch_size=False,
                              dtype="float32")
    one_hot_label = fluid.one_hot(input=label, depth=4)
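This helper exercises the error path: fluid.one_hot accepts only integer label tensors, so the float32 input should raise, and the enclosing test presumably wraps test_bad_x in an assertRaises-style check.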
Example #23
def focal_loss(logits, label, gamma=1):
    probs = L.softmax(logits, axis=-1)
    one_hot = F.one_hot(label, depth=probs.shape[-1])
    loss = -L.reduce_sum(one_hot * (
        (1.0 - probs)**gamma) * L.log(probs)) / logits.shape[0]
    return loss
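This is the multi-class focal loss -sum((1 - p)^gamma * log(p)) over the target class, averaged over the batch; at gamma = 0 it reduces to plain cross-entropy. A NumPy check of that limit:

import numpy as np

probs = np.array([[0.7, 0.2, 0.1], [0.3, 0.6, 0.1]])
one_hot = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])
gamma = 0.0

focal = -(one_hot * (1 - probs) ** gamma * np.log(probs)).sum() / len(probs)
ce = -(one_hot * np.log(probs)).sum() / len(probs)
assert np.isclose(focal, ce)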
Example #24
    def _init_train(self):
        
        instances = self.instances
        Backbone = self.Backbone
        bb_conf = self.bb_conf
        bb_name = self.bb_name
        dev_count = self.dev_count
        num_instances = len(instances)
        mrs = self.mrs

        # set first_target/main task instance
        main_inst = None
        for inst in instances:
            if inst.is_target:
                main_inst = inst
                inst.is_first_target = True
                break
        main_conf = main_inst.config
        if not os.path.exists(main_conf['save_path']):
            os.makedirs(main_conf['save_path'])
            os.makedirs(os.path.join(main_conf['save_path'], 'ckpt'))
        
        # prepare backbone
        train_backbone = Backbone(bb_conf, phase='train')
        pred_backbone = Backbone(bb_conf, phase='pred')

        # create reader, task
        # then check i/o across reader, backbone and task_layer
        task_attrs = []
        pred_task_attrs = []
        for inst in instances:
            train_reader = inst.Reader(inst.config, phase='train')
            inst.reader['train'] = train_reader
            train_parad = inst.Paradigm(inst.config, phase='train', backbone_config=bb_conf)
            inst.task_layer['train'] = train_parad
            task_attr_from_reader = _encode_inputs(train_parad.inputs_attrs['reader'], inst.name)
            task_attrs.append(task_attr_from_reader)

            _check_io(train_backbone.inputs_attr, train_reader.outputs_attr, in_name=bb_name+'_backbone', out_name='reader.train')
            _check_io(train_parad.inputs_attrs['reader'], train_reader.outputs_attr, in_name='task_paradigm.train.reader', out_name='reader.train')
            _check_io(train_parad.inputs_attrs['backbone'], train_backbone.outputs_attr, in_name='task_paradigm.train.backbone', out_name=bb_name+'_backbone')

            if inst.is_target:
                if 'pred_file' not in inst.config:
                    inst.config['pred_file'] = ''
                pred_reader = inst.Reader(inst.config, phase='pred')
                pred_parad = inst.Paradigm(inst.config, phase='pred', backbone_config=bb_conf)
                inst.task_layer['pred'] = pred_parad
                task_attr_from_reader = _encode_inputs(pred_parad.inputs_attrs['reader'], inst.name)
                pred_task_attrs.append(task_attr_from_reader)
                _check_io(pred_backbone.inputs_attr, pred_reader.outputs_attr, in_name=bb_name+'_backbone', out_name='reader.pred')
                _check_io(pred_parad.inputs_attrs['reader'], pred_reader.outputs_attr, in_name='task_paradigm.pred.reader', out_name='reader.pred')
                _check_io(pred_parad.inputs_attrs['backbone'], pred_backbone.outputs_attr, in_name='task_paradigm.pred.backbone', out_name=bb_name+'_backbone')

        # merge reader input attrs from backbone and task_instances
        joint_input_names, joint_shape_and_dtypes, name_to_position = merge_input_attrs(train_backbone.inputs_attr, task_attrs)
        pred_joint_input_names, pred_joint_shape_and_dtypes, _ = merge_input_attrs(pred_backbone.inputs_attr, pred_task_attrs, insert_taskid=False, insert_batchsize=False, insert_seqlen=False, insert_batchsize_x_seqlen=False)
        # shapes: [task_id, shapes_of_backbone, shapes_of_inst1, ..., shapes_of_instN]

        if DEBUG:
            print('----- for debug -----')
            print('joint input names:')
            print(joint_input_names)
            print('joint input shape and dtypes:')
            print(joint_shape_and_dtypes)

        # load data
        for inst in instances:
            print(inst.name+": preparing data...", end='')
            inst.reader['train'].load_data()
            print('ok!')

        # merge dataset iterators and create net input vars
        iterators = []
        prefixes = []
        mrs = []

        for inst in instances:
            iterators.append(inst.reader['train'].iterator())
            prefixes.append(inst.name)
            mrs.append(inst.mix_ratio)

        joint_iterator_fn = create_joint_iterator_fn(iterators, prefixes, joint_shape_and_dtypes, mrs, name_to_position, dev_count=dev_count, verbose=VERBOSE, return_type='dict')
        self._joint_iterator_fn = joint_iterator_fn

        input_attrs = [[i, j, k] for i, (j,k) in zip(joint_input_names, joint_shape_and_dtypes)]
        pred_input_attrs = [[i, j, k] for i, (j,k) in zip(pred_joint_input_names, pred_joint_shape_and_dtypes)]
        # net_inputs = create_net_inputs(input_attrs, async=True, iterator_fn=joint_iterator_fn, dev_count=dev_count, n_prefetch=3)
        net_inputs = create_net_inputs(input_attrs, async=False)
        self._net_inputs = net_inputs

        # build backbone and task layers
        train_prog = fluid.default_main_program()
        train_init_prog = fluid.default_startup_program()
        bb_output_vars = train_backbone.build(net_inputs, scope_name='__paddlepalm_')

        assert sorted(bb_output_vars.keys()) == sorted(train_backbone.outputs_attr.keys())

        pred_prog = fluid.Program()
        pred_init_prog = fluid.Program()

        with fluid.program_guard(main_program=pred_prog, startup_program=pred_init_prog):
            pred_net_inputs = create_net_inputs(pred_input_attrs)
            pred_bb_output_vars = pred_backbone.build(pred_net_inputs, scope_name='__paddlepalm_')

        fluid.framework.switch_main_program(train_prog)
        fluid.framework.switch_startup_program(train_init_prog)

        task_output_vars = {}
        for inst in instances:
            task_inputs = {'backbone': bb_output_vars}
            task_inputs_from_reader = _decode_inputs(net_inputs, inst.name)
            task_inputs['reader'] = task_inputs_from_reader
       
            scope = inst.task_reuse_scope + '/'
            with fluid.unique_name.guard(scope):
               
                output_vars = inst.build_task_layer(task_inputs, phase='train', scope=scope)
                output_vars = {inst.name+'/'+key: val for key, val in output_vars.items()}
                old = len(task_output_vars) # for debug
                task_output_vars.update(output_vars)
                assert len(task_output_vars) - old == len(output_vars) # for debug
            # prepare predict vars for saving inference model
            if inst.is_target:
                with fluid.program_guard(pred_prog, pred_init_prog):
                    cur_inputs = _decode_inputs(pred_net_inputs, inst.name)
                    inst.pred_input = cur_inputs
                    pred_task_inputs = {'backbone': pred_bb_output_vars, 'reader': cur_inputs}
                    scope = inst.task_reuse_scope + '/'
                    with fluid.unique_name.guard(scope):
                        inst.build_task_layer(pred_task_inputs, phase='pred', scope=scope)


        bb_fetches = {k: v.name for k,v in bb_output_vars.items()}
        task_fetches = {k: v.name for k,v in task_output_vars.items()}
        fetches = task_fetches
        fetches['__task_id'] = net_inputs['__task_id'].name

        # compute loss
        task_id_var = net_inputs['__task_id']
        task_id_vec = fluid.one_hot(task_id_var, num_instances)
        losses = fluid.layers.concat([task_output_vars[inst.name+'/loss'] for inst in instances], axis=0)
        loss = layers.reduce_sum(task_id_vec * losses)

        main_reader = main_inst.reader['train']

        num_examples = main_reader.num_examples
        for inst in instances:
            max_train_steps = int(main_conf['num_epochs'] * inst.mix_ratio * (num_examples // main_conf['batch_size'] // dev_count))
            if inst.is_target:
                print('{}: expected train steps {}.'.format(inst.name, max_train_steps))
            inst.steps_pur_epoch = inst.reader['train'].num_examples // main_conf['batch_size'] // dev_count
            inst.expected_train_steps = max_train_steps

        global_max_train_steps = int(main_conf['num_epochs'] * sum(mrs) * (num_examples // main_conf['batch_size'] // dev_count))
        print('Estimated overall train steps {}.'.format(global_max_train_steps))

        if 'warmup_proportion' in main_conf and main_conf['warmup_proportion'] > 0:
            warmup_steps = int(global_max_train_steps * main_conf['warmup_proportion'])
            print('Warmup steps: '+str(warmup_steps))
        else:
            warmup_steps = 0

        # build optimizer
        if 'optimizer' in main_conf:
            optim_mod = importlib.import_module(OPTIMIZER_DIR + '.' + main_conf['optimizer'])
            optimize = getattr(optim_mod, OPTIMIZE_METHOD)
            optimize(loss, main_conf, max_train_steps, warmup_steps, fluid.default_main_program())

            loss.persistable = True
            if main_conf.get('use_ema', False):
                assert 'ema_decay' in main_conf, "ema_decay should be set when use_ema is enabled."
                ema = fluid.optimizer.ExponentialMovingAverage(main_conf['ema_decay'])
                ema.update()

        # prepare for train
        self.train_backbone = train_backbone
        self.train_program = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
        self.saver_program = fluid.default_main_program()

        self.main_inst = main_inst
        self.fetches = fetches
        self.has_init_train = True
        self.has_init_pred = True

        self.exe.run(fluid.default_startup_program())
        print("\nRandomly initialize parameters...\n")
Example #25
0
    input_layer7, out_logits7 = model7.x2paddle_net(input=adv_image)
    out7 = fluid.layers.softmax(out_logits7[0])

    model8 = models.__dict__[model_name8]()
    input_layer8, out_logits8 = model8.x2paddle_net(input=adv_image)
    out8 = fluid.layers.softmax(out_logits8[0])

    model9 = models.__dict__[model_name9]()
    input_layer9, out_logits9 = model9.x2paddle_net(input=adv_image)
    out9 = fluid.layers.softmax(out_logits9[0])

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    one_hot_label = fluid.one_hot(input=label, depth=121)
    one_hot_label2 = fluid.one_hot(input=label2, depth=121)
    smooth_label = fluid.layers.label_smooth(label=one_hot_label,
                                             epsilon=0.1,
                                             dtype="float32")[0]
    smooth_label2 = fluid.layers.label_smooth(label=one_hot_label2,
                                              epsilon=0.1,
                                              dtype="float32")[0]
    # try three loss functions
    # the first one
    loss_logp = -1*fluid.layers.log(1-fluid.layers.matmul(out1,one_hot_label[0],transpose_y=True))\
            -1*fluid.layers.log(1-fluid.layers.matmul(out2,one_hot_label[0],transpose_y=True))\
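Example #25 feeds the one-hot labels through fluid.layers.label_smooth, which replaces a one-hot target y with (1 - epsilon) * y + epsilon / depth, softening the target distribution before the loss is computed. A minimal sketch of just that step (hypothetical depth of 5):

import numpy as np
import paddle.fluid as fluid

depth = 5
label = fluid.data(name='label', shape=[1], dtype='int64')
one_hot = fluid.one_hot(input=label, depth=depth)                # [1, 5]
smooth = fluid.layers.label_smooth(label=one_hot, epsilon=0.1,
                                   dtype='float32')

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
out, = exe.run(feed={'label': np.array([2], dtype='int64')},
               fetch_list=[smooth])
print(out)  # -> [[0.02, 0.02, 0.92, 0.02, 0.02]]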
Example #26
0
        def beam_search():
            """Beam search function"""

            max_len = layers.fill_constant(shape=[1],
                                           dtype=start_tokens.dtype,
                                           value=self.max_out_len,
                                           force_cpu=True)
            min_len = layers.fill_constant(shape=[1],
                                           dtype=start_tokens.dtype,
                                           value=self.min_out_len)
            neg_inf = layers.fill_constant(shape=[1],
                                           dtype='float32',
                                           value=-INF)
            step_idx = layers.fill_constant(shape=[1],
                                            dtype=start_tokens.dtype,
                                            value=0,
                                            force_cpu=True)
            step_next_idx = layers.fill_constant(shape=[1],
                                                 dtype=start_tokens.dtype,
                                                 value=1,
                                                 force_cpu=True)
            cond = layers.less_than(x=step_idx,
                                    y=max_len)  # default force_cpu=True
            while_op = layers.While(cond)
            # array states will be stored for each step.
            ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)),
                                     step_idx)
            scores = layers.array_write(init_scores, step_idx)
            # cell states will be overwritten at each step.
            # caches contain the states of previous steps in decoder self-attention
            # and the static encoder output projections in encoder-decoder attention,
            # kept to avoid redundant computation.
            caches = [
                {
                    "k":  # for self attention
                        layers.fill_constant_batch_size_like(
                            input=start_tokens,
                            shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                            dtype=enc_words_output.dtype,
                            value=0),
                    "v":  # for self attention
                        layers.fill_constant_batch_size_like(
                            input=start_tokens,
                            shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                            dtype=enc_words_output.dtype,
                            value=0),
                    "static_k_word":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_words_output.dtype),
                    "static_v_word":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_words_output.dtype),
                    "static_k_sent":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_sents_output.dtype),
                    "static_v_sent":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_sents_output.dtype)
                } for i in range(self._dec_n_layer)
            ]

            trigram_blocking = TrigramBlocking(start_tokens,
                                               self.tokenizer,
                                               use_fp16=self._use_fp16,
                                               beam_size=self.beam_size)

            with while_op.block():
                pre_ids = layers.array_read(array=ids, i=step_idx)
                pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
                # Since beam_search_op doesn't enforce pre_ids' shape, we can do an
                # inplace reshape here, which actually changes the shape of pre_ids.
                pre_scores = layers.array_read(array=scores, i=step_idx)
                # gather cell states corresponding to selected parent
                pre_src_words_attn_bias = layers.gather(
                    tgt_src_words_attn_bias, index=parent_idx)
                pre_src_sents_attn_bias = layers.gather(
                    tgt_src_sents_attn_bias, index=parent_idx)
                pre_graph_attn_bias = layers.gather(graph_attn_bias,
                                                    index=parent_idx)
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=pre_src_sents_attn_bias,  # can't use lod tensor here
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype),
                    y=step_idx,
                    axis=0)

                logits = self.decode(
                    dec_input=(pre_ids, pre_pos, None, pre_src_words_attn_bias,
                               pre_src_sents_attn_bias, pre_graph_attn_bias),
                    enc_words_output=enc_words_output,
                    enc_sents_output=enc_sents_output,
                    caches=caches,
                    gather_idx=parent_idx)

                # prevent generating the end token while length is less than min_out_len
                eos_index = layers.fill_constant(
                    shape=[layers.shape(logits)[0]],
                    dtype='int64',
                    value=self.eos_idx)
                eos_index = fluid.one_hot(eos_index, depth=self.voc_size)
                less_cond = layers.cast(layers.less_than(x=step_idx,
                                                         y=min_len),
                                        dtype='float32')
                less_val = layers.elementwise_mul(less_cond, neg_inf)
                eos_val = layers.elementwise_mul(eos_index, less_val, axis=0)
                revised_logits = layers.elementwise_add(logits,
                                                        eos_val,
                                                        axis=0)

                # top-k reduction across beams; also contains special handling of
                # finished beams and finished sentences (batch reduction)
                topk_scores, topk_indices = layers.topk(
                    input=layers.softmax(revised_logits), k=self.beam_size)

                # Roll back the previous scores for the length penalty:
                # previous scores were already length-penalized, so before applying
                # this timestep's penalty we must undo the old one. We therefore
                # store the penalized score in `scores` while computing with the
                # un-penalized score.
                # -> safe for step_idx == 0 (initialization), because the previous score == 0
                pre_timestep_length_penalty = fluid.layers.pow(
                    ((5.0 + fluid.layers.cast(step_idx, pre_scores.dtype)) /
                     6.0), self.len_penalty)
                pre_scores_wo_len_penalty = fluid.layers.elementwise_mul(
                    pre_scores, pre_timestep_length_penalty)

                # calc trigram-blocking delta scores for current alive sequence
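                # (trigram blocking, following Paulus et al., 2017: candidates that
                # would repeat an already-generated trigram receive a large negative
                # score delta, effectively removing them from the beam)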
                if self.block_trigram:
                    trigram_blocking.update_seq(pre_ids, parent_idx)
                    trigram_blocking.expand_cand_seq(topk_indices)
                    fluid.layers.py_func(
                        func=trigram_blocking.blocking_forward,
                        x=[
                            trigram_blocking.cand_seq,
                            trigram_blocking.id2is_full_token
                        ],
                        out=trigram_blocking.delta_score_out,
                        backward_func=None)
                    layers.Print(trigram_blocking.delta_score_out,
                                 summarize=100,
                                 message="trigram_blocking.delta_score_out")
                    pre_scores_wo_len_penalty = fluid.layers.elementwise_add(
                        x=trigram_blocking.delta_score_out,
                        y=pre_scores_wo_len_penalty,
                        axis=0)
                # => [N, topk]

                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores),
                    y=pre_scores_wo_len_penalty,
                    axis=0)

                cur_timestep_length_penalty = layers.pow(
                    ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) /
                     6.0), self.len_penalty)
                curr_scores = layers.elementwise_div(
                    accu_scores, cur_timestep_length_penalty)

                # beam_search op uses lod to differentiate branches.
                curr_scores = layers.lod_reset(curr_scores, pre_ids)
                topk_indices = layers.lod_reset(topk_indices, pre_ids)
                selected_ids, selected_scores, gather_idx = layers.beam_search(
                    pre_ids=pre_ids,
                    pre_scores=pre_scores,
                    ids=topk_indices,
                    scores=curr_scores,
                    beam_size=self.beam_size,
                    end_id=self.eos_idx,
                    return_parent_idx=True)

                layers.increment(x=step_idx, value=1.0, in_place=True)
                layers.increment(x=step_next_idx, value=1.0, in_place=True)
                # cell states (caches) have already been updated in wrap_decoder;
                # only the beam-search states need updating here.
                layers.array_write(selected_ids, i=step_idx, array=ids)
                layers.array_write(selected_scores, i=step_idx, array=scores)
                layers.assign(gather_idx, parent_idx)
                layers.assign(pre_src_words_attn_bias, tgt_src_words_attn_bias)
                layers.assign(pre_src_sents_attn_bias, tgt_src_sents_attn_bias)
                layers.assign(pre_graph_attn_bias, graph_attn_bias)

                length_cond = layers.less_than(x=step_idx, y=max_len)
                finish_cond = layers.logical_not(
                    layers.is_empty(x=selected_ids))
                layers.logical_and(x=length_cond, y=finish_cond, out=cond)

            finished_ids, finished_scores = layers.beam_search_decode(
                ids, scores, beam_size=self.beam_size, end_id=self.eos_idx)

            return finished_ids, finished_scores
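The min-length guard inside the loop above is another one_hot use: while step_idx < min_len, a one-hot row for the EOS token is scaled by -INF and added to the logits, so EOS can never win the top-k. Pulled out as a standalone helper (a sketch under the same fluid 1.x API; the helper name is ours):

import paddle.fluid as fluid
import paddle.fluid.layers as layers

def mask_eos_before_min_len(logits, step_idx, min_len, eos_id, vocab_size):
    """Add -inf to the EOS logit while step_idx < min_len."""
    eos_index = layers.fill_constant(shape=[layers.shape(logits)[0]],
                                     dtype='int64', value=eos_id)
    eos_onehot = fluid.one_hot(eos_index, depth=vocab_size)       # [B, V]
    neg_inf = layers.fill_constant(shape=[1], dtype='float32', value=-1e18)
    # 1.0 while still below the minimum length, else 0.0
    less_cond = layers.cast(layers.less_than(x=step_idx, y=min_len),
                            dtype='float32')
    less_val = layers.elementwise_mul(less_cond, neg_inf)         # [1]
    eos_val = layers.elementwise_mul(eos_onehot, less_val, axis=0)
    return layers.elementwise_add(logits, eos_val, axis=0)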
Example #27
0
    def fast_decode(self):
        """create model for inference"""
        if self.task_type == "dialog":
            emb_num = 4
        else:
            emb_num = 3
        input_shapes = [[-1, self.max_seq_len, 1]] * emb_num + \
                       [[-1, self.max_seq_len, self.max_seq_len]]
        input_dtypes = ['int64'] * emb_num + ['float32']
        input_lod_levels = [0] * emb_num + [0]

        shapes = input_shapes + [[-1, 1, 1], [-1, 1, 1], [-1, 1], [-1],
                                 [-1, 1, self.max_seq_len], [-1, 1]]
        dtypes = input_dtypes + [
            'int64', 'int64', 'float32', 'int32', 'float32', 'int64'
        ]
        lod_levels = input_lod_levels + [2, 2, 2, 0, 0, 0]

        inputs = self.to_tensor(shapes, dtypes, lod_levels)
        pyreader = fluid.io.DataLoader.from_generator(feed_list=inputs,
                                                      capacity=70,
                                                      iterable=False)
        emb_ids = {}
        for key, value in zip(self.emb_keys, inputs[:emb_num]):
            emb_ids[key] = value

        input_mask = inputs[emb_num]
        tgt_ids, tgt_pos, init_scores, parent_idx, tgt_input_mask, data_ids = inputs[
            -6:]

        unimo = UNIMOModel(emb_ids=emb_ids,
                           input_mask=input_mask,
                           config=self.gene_config,
                           task_type=self.task_type,
                           decoding=True,
                           gather_idx=parent_idx)

        max_len = layers.fill_constant(shape=[1],
                                       dtype=tgt_ids.dtype,
                                       value=self.max_out_len,
                                       force_cpu=True)
        min_len = layers.fill_constant(shape=[1],
                                       dtype=tgt_ids.dtype,
                                       value=self.min_out_len,
                                       force_cpu=True)
        neg_inf = layers.fill_constant(shape=[1], dtype='float32', value=-1e18)
        step_idx = layers.fill_constant(shape=[1],
                                        dtype=tgt_ids.dtype,
                                        value=0,
                                        force_cpu=True)
        step_next_idx = layers.fill_constant(shape=[1],
                                             dtype=tgt_ids.dtype,
                                             value=1,
                                             force_cpu=True)
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)

        ids = layers.array_write(layers.reshape(tgt_ids, (-1, 1)), step_idx)
        pos_biases = layers.array_write(tgt_pos, step_idx)
        scores = layers.array_write(init_scores, step_idx)
        tgt_masks = layers.array_write(tgt_input_mask, step_idx)

        trigram_blocking = TrigramBlocking(tgt_ids,
                                           self.tokenizer,
                                           beam_size=self.beam_size)

        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            pos_bias = layers.array_read(array=pos_biases, i=step_idx)
            pos_bias = layers.gather(input=pos_bias, index=parent_idx)

            def gen_batch_like(value,
                               dtype="int64",
                               shape=[-1, 1, 1],
                               is_scalar=True):
                """generate batch"""
                if is_scalar:
                    return layers.fill_constant_batch_size_like(
                        input=parent_idx,
                        value=value,
                        shape=shape,
                        dtype=dtype)
                else:
                    return layers.elementwise_mul(
                        x=layers.fill_constant_batch_size_like(
                            input=parent_idx,
                            value=1,
                            shape=shape,
                            dtype=dtype),
                        y=value,
                        axis=0)

            tmp_mask = layers.array_read(tgt_masks, i=step_idx)
            tmp_mask = layers.gather(input=tmp_mask, index=parent_idx)
            append_1_mask = gen_batch_like(1.0, dtype=tmp_mask.dtype)
            pre_mask = layers.concat([tmp_mask, append_1_mask], axis=2)

            pre_pos = gen_batch_like(step_idx, is_scalar=False)
            pre_pos = pre_pos + pos_bias  # positions start from 2

            pre_sent = gen_batch_like(self.tgt_type_id, dtype=pre_ids.dtype)

            dec_emb_ids = {"word_embedding": pre_ids, "pos_embedding": pre_pos}
            if self.task_type == "dialog":
                role_ids = gen_batch_like(0)
                turn_ids = gen_batch_like(0)
                dec_emb_ids["role_embedding"] = role_ids
                dec_emb_ids["turn_embedding"] = turn_ids
            else:
                dec_emb_ids["sent_embedding"] = pre_sent

            dec_out = unimo.encode(emb_ids=dec_emb_ids,
                                   input_mask=pre_mask,
                                   gather_idx=parent_idx)
            fc_out = self.cal_logit(dec_out, None)

            # prevent generating the end token while length is less than min_out_len
            eos_index = layers.fill_constant(shape=[layers.shape(fc_out)[0]],
                                             dtype='int64',
                                             value=self.eos_id)
            eos_index = fluid.one_hot(eos_index, depth=self.vocab_size)
            less_cond = layers.cast(layers.less_than(x=step_idx, y=min_len),
                                    dtype='float32')
            less_val = layers.elementwise_mul(less_cond, neg_inf)
            eos_val = layers.elementwise_mul(eos_index, less_val, axis=0)
            revised_logits = layers.elementwise_add(fc_out, eos_val, axis=0)

            # top-k reduction across beams; also contains special handling of
            # finished beams and finished sentences (batch reduction)
            topk_scores, topk_indices = layers.topk(
                input=layers.softmax(revised_logits), k=self.beam_size)

            # Roll back the previous scores for the length penalty:
            # previous scores were already length-penalized, so before applying
            # this timestep's penalty we must undo the old one. We therefore
            # store the penalized score in `scores` while computing with the
            # un-penalized score.
            # -> safe for step_idx == 0 (initialization), because the previous score == 0
            pre_timestep_length_penalty = fluid.layers.pow(
                ((5.0 + fluid.layers.cast(step_idx, pre_scores.dtype)) / 6.0),
                self.length_penalty)
            pre_scores_wo_len_penalty = fluid.layers.elementwise_mul(
                pre_scores, pre_timestep_length_penalty)

            # calc trigram-blocking delta scores for current alive sequence
            if self.block_trigram:
                trigram_blocking.update_seq(pre_ids, parent_idx)
                trigram_blocking.expand_cand_seq(topk_indices)
                fluid.layers.py_func(func=trigram_blocking.blocking_forward,
                                     x=[
                                         trigram_blocking.cand_seq,
                                         trigram_blocking.id2is_full_token
                                     ],
                                     out=trigram_blocking.delta_score_out,
                                     backward_func=None)
                pre_scores_wo_len_penalty = fluid.layers.elementwise_add(
                    x=trigram_blocking.delta_score_out,
                    y=pre_scores_wo_len_penalty,
                    axis=0)
            # => [N, topk]
            accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                                 y=pre_scores_wo_len_penalty,
                                                 axis=0)

            cur_timestep_length_penalty = layers.pow(
                ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) / 6.0),
                self.length_penalty)
            curr_scores = layers.elementwise_div(accu_scores,
                                                 cur_timestep_length_penalty)

            # beam_search op uses lod to differentiate branches.
            curr_scores = layers.lod_reset(curr_scores, pre_ids)
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            selected_ids, selected_scores, gather_idx = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=curr_scores,
                beam_size=self.beam_size,
                end_id=self.eos_id,
                return_parent_idx=True)

            layers.increment(x=step_idx, value=1.0, in_place=True)
            layers.increment(x=step_next_idx, value=1.0, in_place=True)
            # cell states (caches) have already been updated in wrap_decoder;
            # only the beam-search states need updating here.
            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.array_write(pre_mask, i=step_idx, array=tgt_masks)
            layers.array_write(pos_bias, i=step_idx, array=pos_biases)
            layers.assign(gather_idx, parent_idx)

            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=self.beam_size, end_id=self.eos_id)

        graph_vars = {
            "finished_ids": finished_ids,
            "finished_scores": finished_scores,
            "data_ids": data_ids
        }

        for k, v in graph_vars.items():
            v.persistable = True

        return pyreader, graph_vars
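Both decoders apply the GNMT length penalty lp(t) = ((5 + t) / 6) ** alpha and keep the penalized score in `scores`, which is why each step first multiplies by the previous step's penalty to recover the raw log-probability sum before dividing by the current one. The same bookkeeping in plain Python (toy numbers, hypothetical alpha):

# GNMT length penalty: lp(t) = ((5 + t) / 6) ** alpha
alpha = 0.6                      # hypothetical penalty strength

def lp(t):
    return ((5.0 + t) / 6.0) ** alpha

raw = -4.2                       # accumulated log-prob after 3 steps (toy value)
stored = raw / lp(3)             # what `scores` holds: the penalized score

new_logp = -1.1                  # log-prob of the token chosen at step 4
rolled_back = stored * lp(3)     # undo the old penalty -> raw again
curr_score = (rolled_back + new_logp) / lp(4)
print(round(curr_score, 3))      # the value written back into `scores`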