示例#1
0
    def forward(self, features):
        """Score (title, comment) pairs with two bag-of-words towers.

        Args:
            features: tuple of (title_ids, comment_ids) int64 id tensors;
                pad id is assumed to be 0.

        Returns:
            Sigmoid probabilities in PREDICT mode, raw scores otherwise.
        """
        title_ids, comment_ids = features

        def _dense(tensor, name, idx, act):
            # One named FC layer; parameter names are fixed per tower and
            # layer index so the two towers learn distinct weights.
            w_attr = F.ParamAttr(
                name='%s.fc.w_%d' % (name, idx),
                initializer=F.initializer.XavierInitializer(
                    fan_in=self.hidden_size, fan_out=self.hidden_size))
            b_attr = F.ParamAttr(
                name='%s.fc.b_%d' % (name, idx),
                initializer=F.initializer.Constant(0.))
            return L.fc(tensor,
                        self.hidden_size,
                        act=act,
                        param_attr=w_attr,
                        bias_attr=b_attr)

        emb_attr = F.ParamAttr(
            name='emb',
            initializer=F.initializer.XavierInitializer(
                fan_in=self.vocab_size, fan_out=self.embedding_size))
        emb_shape = [self.vocab_size, self.embedding_size]

        title_vec = L.embedding(title_ids, emb_shape, param_attr=emb_attr)
        comment_vec = L.embedding(comment_ids, emb_shape, param_attr=emb_attr)

        # Masked vector-sum pooling: positions whose id == 0 (pad) are zeroed.
        zero = L.fill_constant(shape=[1], dtype='int64', value=0)
        title_mask = L.cast(L.logical_not(L.equal(title_ids, zero)), 'float32')
        comment_mask = L.cast(L.logical_not(L.equal(comment_ids, zero)),
                              'float32')

        title_vec = L.softsign(L.reduce_sum(title_vec * title_mask, dim=1))
        comment_vec = L.softsign(
            L.reduce_sum(comment_vec * comment_mask, dim=1))

        for idx in range(self.num_layers):
            title_vec = _dense(title_vec, 'title', idx, 'tanh')
        for idx in range(self.num_layers):
            comment_vec = _dense(comment_vec, 'comment', idx, 'tanh')

        # Scaled dot product between the two tower outputs.
        score = L.reduce_sum(title_vec * comment_vec,
                             dim=1,
                             keep_dim=True) / np.sqrt(self.hidden_size)
        if self.mode is propeller.RunMode.PREDICT:
            return L.sigmoid(score)
        return score
示例#2
0
    def forward(self, pred, target):
        """Triplet loss over all in-batch pairs of L2-normalized embeddings.

        Args:
            pred: [batch_size, vector_size] embedding matrix.
            target: label tensor; only column 0 is used, flipped as 1 - t.

        Returns:
            Scalar loss: hinged triplet margins summed, scaled by
            self.loss_weight and divided by the number of active triplets.
        """
        # flip the first label column (0 <-> 1)
        target = 1 - target[:, 0]
        batch_size, vector_size = pred.shape[0], pred.shape[1]

        pred = L.l2_normalize(pred, axis=1, epsilon=1e-10)

        # pairwise squared L2 distances via ||a||^2 - 2ab + ||b||^2
        square_norm = L.reduce_sum(L.square(pred), dim=1)
        dist = L.elementwise_add(-2.0 * L.matmul(pred, pred, transpose_y=True),
                                 square_norm,
                                 axis=0)
        dist = L.elementwise_add(dist, square_norm, axis=1)
        # clamp tiny negatives caused by floating-point error before sqrt
        dist = L.elementwise_max(dist, L.zeros_like(dist))
        dist = L.sqrt(dist)

        # anchor-positive distances broadcast along dim 2,
        # anchor-negative distances broadcast along dim 1
        ap_dist = L.reshape(dist, (0, 0, 1))
        an_dist = L.reshape(dist, (0, 1, -1))

        # NOTE(review): attribute is spelled 'magin' (margin?) — keep in sync
        # with wherever the attribute is defined.
        loss = L.expand(ap_dist, (1, 1, batch_size)) - L.expand(
            an_dist, (1, batch_size, 1)) + self.magin

        # identity matrix marks (i, i) pairs, which are excluded below
        indice_equal = L.diag(
            L.fill_constant((batch_size, ), dtype='float32', value=1.0))
        indice_not_equal = 1.0 - indice_equal

        # broad_matrix[i, j] = target[i] + target[j]
        broad_matrix = L.expand(L.reshape(target, (-1, 1)),
                                (1, batch_size)) + L.expand(
                                    L.reshape(target, (1, -1)),
                                    (batch_size, 1))

        # pp[i, j] == 1 iff target[i] + target[j] == 0 (both labels 0), i != j
        pp = L.cast(L.equal(broad_matrix, L.zeros_like(broad_matrix)),
                    dtype='float32')
        pp = L.reshape(indice_not_equal * pp, (0, 0, 1))

        # pn[i, j] == 1 iff target[i] + target[j] == 1 (mixed labels), i != j
        pn = L.cast(L.equal(broad_matrix,
                            L.zeros_like(broad_matrix) + 1),
                    dtype='float32')
        pn = L.reshape(indice_not_equal * pn, (1, 0, -1))

        # valid (anchor, positive, negative) triplet mask
        apn = L.expand(pp,
                       (1, 1, batch_size)) * L.expand(pn, (batch_size, 1, 1))

        loss = loss * L.cast(apn, dtype='float32')
        # hinge: only triplets violating the margin contribute
        loss = L.elementwise_max(loss, L.zeros_like(loss))

        # number of triplets with positive loss, for averaging
        num_tri = L.reduce_sum(
            L.cast(L.greater_than(loss, L.zeros_like(loss)), dtype='float32'))

        loss = L.reduce_sum(loss) * self.loss_weight / (num_tri + 1e-16)

        return loss
示例#3
0
    def _grammar_step(self, logits, next_cell_states, decode_states, actions, gmr_mask):
        """Run one beam-search decode step constrained by the grammar.

        Args:
            logits (Variable): shape = [batch_size, beam_size, vocab_size]
            next_cell_states (Variable): NULL
            decode_states (StateWrapper): NULL
            actions (Variable): grammar actions for the current step.
            gmr_mask (Variable): grammar mask for the current step.

        Returns: (OutputWrapper, StateWrapper) for this step.

        Raises: NULL

        """
        # restrict logits to tokens that satisfy the grammar rules
        logits, valid_table_mask = self._output_layer(logits, actions, gmr_mask, decode_states.valid_table_mask)

        # record vocab size (also as a tensor for elementwise ops below)
        self._vocab_size = logits.shape[-1]
        self._vocab_size_tensor = layers.fill_constant(shape=[1], dtype='int64', value=logits.shape[-1])

        # log-probs, with finished beams masked out
        step_log_probs = layers.log(layers.softmax(logits))
        step_log_probs = self._mask_finished_probs(step_log_probs, decode_states.finished)

        # flatten beams and select top-k over beam_size * vocab_size candidates
        scores = layers.reshape(step_log_probs, [-1, self._beam_size * self._vocab_size])
        topk_scores, topk_indices = layers.topk(input=scores, k=self._beam_size)
        topk_scores = layers.reshape(topk_scores, shape=[-1])
        topk_indices = layers.reshape(topk_indices, shape=[-1])

        # beam index each top-k entry came from
        beam_indices = layers.elementwise_floordiv(topk_indices, self._vocab_size_tensor)
        # token id of each top-k entry
        token_indices = layers.elementwise_mod(topk_indices, self._vocab_size_tensor)

        # regroup step_log_probs according to the top-k origins
        next_log_probs = nn_utils.batch_gather(
                layers.reshape(step_log_probs, [-1, self._beam_size * self._vocab_size]),
                topk_indices)
        def _beam_gather(x, beam_indices):
            """reshape x to beam dim, and gather each beam_indices
            Args:
                x (TYPE): NULL
                beam_indices (TYPE): beam indices to gather along.
            Returns: Variable
            """
            x = self.split_batch_beams(x)
            return nn_utils.batch_gather(x, beam_indices)
        next_cell_states = layers.utils.map_structure(lambda x: _beam_gather(x, beam_indices),
                                                      next_cell_states)
        next_finished = _beam_gather(decode_states.finished, beam_indices)
        next_lens = _beam_gather(decode_states.lengths, beam_indices)

        # unfinished beams grow by one token; finished beams keep their length
        next_lens = layers.elementwise_add(next_lens,
                layers.cast(layers.logical_not(next_finished), next_lens.dtype))
        # a beam is finished once it emits the end token
        next_finished = layers.logical_or(next_finished,
                layers.equal(token_indices, self._end_token_tensor))

        decode_output = OutputWrapper(topk_scores, token_indices, beam_indices)
        decode_states = StateWrapper(next_cell_states, next_log_probs, next_finished, next_lens, valid_table_mask)

        return decode_output, decode_states
示例#4
0
        def build_position_ids(src_ids, dst_ids):
            """Concatenate position ids for src (with cls) and dst (without).

            dst positions continue after src's and are shifted back by the
            number of padding tokens in src (pad id assumed to be 0).
            """
            shape = L.shape(src_ids)
            batch, src_len = shape[0], shape[1]
            dst_len = src_len - 1  # dst carries no cls token

            # [1, src_len, 1] -> [B, src_len, 1]
            src_pos = L.reshape(
                L.range(0, src_len, 1, dtype='int32'), [1, src_len, 1],
                inplace=True)
            src_pos = L.expand(src_pos, [batch, 1, 1])

            # count pad tokens per example (pad id == 0): [B, 1, 1]
            zero = L.fill_constant([1], dtype='int64', value=0)
            pad_mask = L.cast(L.equal(src_ids, zero), "int32")
            pad_len = L.reduce_sum(pad_mask, 1, keep_dim=True)

            # [1, dst_len, 1] -> [B, dst_len, 1], shifted back past padding
            dst_pos = L.reshape(
                L.range(src_len, src_len + dst_len, 1, dtype='int32'),
                [1, dst_len, 1],
                inplace=True)
            dst_pos = L.expand(dst_pos, [batch, 1, 1]) - pad_len

            out = L.cast(L.concat([src_pos, dst_pos], 1), 'int64')
            out.stop_gradient = True
            return out
示例#5
0
 def __init__(self, label, pred):
     """Element-wise equality metric state.

     Args:
         label: tensor of gold labels; must share `pred`'s static shape.
         pred: tensor of predictions.

     Raises:
         ValueError: if the static shapes differ.
     """
     if label.shape != pred.shape:
         raise ValueError(
             'expect label shape == pred shape, got: label.shape=%s, pred.shape = %s' % (repr(label), repr(pred)))
     # NOTE(review): the message formats repr(label)/repr(pred) rather than
     # their .shape attributes — presumably the Variable repr includes the
     # shape; confirm.
     self.eq = L.equal(pred, label)
     self.reset()
            def grow_top_k(step_idx, alive_seq, alive_log_prob, parant_idx):
                """Expand every alive beam by one step and take the top-k.

                Args:
                    step_idx: current decode step (tensor).
                    alive_seq: token ids of alive beams so far.
                    alive_log_prob: cumulative log-probs of alive beams.
                    parant_idx: parent beam indices (unused in this body).

                Returns: (full token seqs, top-k log probs, top-k scores,
                finished flags, selected beams, generated ids, decoder
                output, new hidden/cell state arrays).
                """
                pre_ids = alive_seq

                # embed the previously generated tokens
                dec_step_emb = layers.embedding(
                    input=pre_ids,
                    size=[self.tar_vocab_size, self.hidden_size],
                    dtype='float32',
                    is_sparse=False,
                    param_attr=fluid.ParamAttr(
                        name='target_embedding',
                        initializer=fluid.initializer.UniformInitializer(
                            low=-self.init_scale, high=self.init_scale)))

                # one attention decoder step (pre_* / enc_memory come from
                # the enclosing scope)
                dec_att_out, new_hidden_array, new_cell_array = decoder_step(
                    dec_step_emb, pre_feed, pre_hidden_array, pre_cell_array,
                    enc_memory)

                projection = layers.matmul(dec_att_out, softmax_weight)

                logits = layers.softmax(projection)
                # accumulate log-probs along each beam
                current_log = layers.elementwise_add(x=layers.log(logits),
                                                     y=alive_log_prob,
                                                     axis=0)
                # length penalty: ((step + 6) / 6) ** alpha
                base_1 = layers.cast(step_idx, 'float32') + 6.0
                base_1 /= 6.0
                length_penalty = layers.pow(base_1, alpha)

                # NOTE(review): len_pen is computed but never used below —
                # presumably superseded by length_penalty; confirm.
                len_pen = layers.pow(
                    ((5. + layers.cast(step_idx + 1, 'float32')) / 6.), alpha)

                current_log = layers.reshape(current_log, shape=[1, -1])

                current_log = current_log / length_penalty
                topk_scores, topk_indices = layers.topk(input=current_log,
                                                        k=beam_size)

                topk_scores = layers.reshape(topk_scores, shape=[-1])

                # undo the penalty to recover raw cumulative log-probs
                topk_log_probs = topk_scores * length_penalty

                # flat top-k index -> token id within the vocab
                generate_id = layers.reshape(topk_indices,
                                             shape=[-1]) % self.tar_vocab_size

                # flat top-k index -> originating beam
                selected_beam = layers.reshape(
                    topk_indices, shape=[-1]) // self.tar_vocab_size

                topk_finished = layers.equal(generate_id, eos_ids)

                topk_finished = layers.cast(topk_finished, 'float32')

                generate_id = layers.reshape(generate_id, shape=[-1, 1])

                # append the new token to each selected parent sequence
                pre_tokens_list = layers.gather(tokens, selected_beam)

                full_tokens_list = layers.concat(
                    [pre_tokens_list, generate_id], axis=1)


                return full_tokens_list, topk_log_probs, topk_scores, topk_finished, selected_beam, generate_id, \
                        dec_att_out, new_hidden_array, new_cell_array
示例#7
0
        def grow_topk(i, logits, alive_seq, alive_log_probs, states):
            """Grow each alive beam and keep the 2*beam_size best candidates.

            Args:
                i: current decode step index.
                logits: flat decoder logits for the current step.
                alive_seq: [batch_size, beam_size, seq_len] token ids so far.
                alive_log_probs: cumulative log-probs per alive beam.
                states: decoder states, reordered to match the selection.

            Returns: (topk_seq, topk_log_probs, topk_scores, topk_finished,
            states) with 2*beam_size candidates per batch element.
            """
            logits = layers.reshape(logits, [batch_size, beam_size, -1])
            candidate_log_probs = layers.log(layers.softmax(logits, axis=2))
            log_probs = layers.elementwise_add(candidate_log_probs,
                                               alive_log_probs, 0)

            # GNMT length penalty ((5 + len) / 6) ** alpha. The previous code
            # computed 5.0 + (i + 1.0) / 6.0, misplacing the parentheses
            # relative to the sibling beam-search implementations in this file.
            length_penalty = np.power((5.0 + i + 1.0) / 6.0, alpha)
            curr_scores = log_probs / length_penalty
            flat_curr_scores = layers.reshape(curr_scores, [batch_size, -1])

            # 2x beam_size so finished candidates can be replaced downstream
            topk_scores, topk_ids = layers.topk(flat_curr_scores,
                                                k=beam_size * 2)

            # undo the penalty to recover raw cumulative log-probs
            topk_log_probs = topk_scores * length_penalty

            # flat index -> (beam, token id)
            topk_beam_index = topk_ids // self.trg_vocab_size
            topk_ids = topk_ids % self.trg_vocab_size

            # use gather as gather_nd, TODO: use gather_nd
            topk_seq = gather_2d_by_gather(alive_seq, topk_beam_index,
                                           beam_size, batch_size)
            topk_seq = layers.concat(
                [topk_seq,
                 layers.reshape(topk_ids, topk_ids.shape + [1])],
                axis=2)
            states = update_states(states, topk_beam_index, beam_size)
            eos = layers.fill_constant(shape=topk_ids.shape,
                                       dtype="int64",
                                       value=eos_id)
            topk_finished = layers.cast(layers.equal(topk_ids, eos), "float32")

            #topk_seq: [batch_size, 2*beam_size, i+1]
            #topk_log_probs, topk_scores, topk_finished: [batch_size, 2*beam_size]
            return topk_seq, topk_log_probs, topk_scores, topk_finished, states
示例#8
0
    def _build_position_ids(self, src_ids):
        """Build position ids for a multi-slot input where the B slots'
        positions are shifted back by the pad count of the A slot.

        Args:
            src_ids: token id tensor; pad id assumed to be 0.

        Returns: int64 position id tensor with stop_gradient=True.
        """
        src_shape = L.shape(src_ids)
        src_seqlen = src_shape[1]
        src_batch = src_shape[0]

        slot_seqlen = self.slot_seqlen

        # number of B slots following the single A slot
        # NOTE(review): true division on an int32 shape tensor — presumably
        # floor division (//) was intended; confirm the dtype expand() accepts.
        num_b = (src_seqlen / slot_seqlen) - 1
        a_position_ids = L.reshape(L.range(0, slot_seqlen, 1, dtype='int32'),
                                   [1, slot_seqlen, 1],
                                   inplace=True)  # [1, slot_seqlen, 1]
        a_position_ids = L.expand(
            a_position_ids, [src_batch, 1, 1])  # [B, slot_seqlen * num_b, 1]

        # count pad tokens in slot A (pad id == 0)
        zero = L.fill_constant([1], dtype='int64', value=0)
        input_mask = L.cast(L.equal(src_ids[:, :slot_seqlen], zero),
                            "int32")  # assume pad id == 0 [B, slot_seqlen, 1]
        a_pad_len = L.reduce_sum(input_mask, 1)  # [B, 1, 1]

        b_position_ids = L.reshape(L.range(slot_seqlen,
                                           2 * slot_seqlen,
                                           1,
                                           dtype='int32'), [1, slot_seqlen, 1],
                                   inplace=True)  # [1, slot_seqlen, 1]
        b_position_ids = L.expand(
            b_position_ids,
            [src_batch, num_b, 1])  # [B, slot_seqlen * num_b, 1]
        # shift B's positions back past A's padding
        b_position_ids = b_position_ids - a_pad_len  # [B, slot_seqlen * num_b, 1]

        position_ids = L.concat([a_position_ids, b_position_ids], 1)
        position_ids = L.cast(position_ids, 'int64')
        position_ids.stop_gradient = True
        return position_ids
示例#9
0
 def _build_input_mask(self, src_ids):
     """Float mask over tokens: 0.0 at pad positions (id == 0), 1.0 elsewhere."""
     pad_id = L.fill_constant([1], dtype='int64', value=0)
     is_token = L.logical_not(L.equal(src_ids, pad_id))  # assume pad id == 0
     mask = L.cast(is_token, 'float')
     mask.stop_gradient = True
     return mask
示例#10
0
def points_nms(heat, kernel=2):
    """Suppress non-peak points: keep heat only where it equals its local
    2x2 max-pool (kernel must be 2)."""
    pooled = L.pool2d(heat,
                      pool_size=kernel,
                      pool_stride=1,
                      pool_padding=[[0, 0], [0, 0], [1, 0], [1, 0]],
                      pool_type='max')
    is_peak = L.cast(L.equal(pooled, heat), 'float32')
    return heat * is_peak
示例#11
0
 def __init__(self, label, pred):
     """Distributed equality metric: allgathered int64 element-wise match mask.

     Args:
         label: tensor of gold labels; must share `pred`'s static shape.
         pred: tensor of predictions.

     Raises:
         ValueError: if the static shapes differ.
     """
     if label.shape != pred.shape:
         raise ValueError(
             'expect label shape == pred shape, got: label.shape=%s, pred.shape = %s'
             % (repr(label), repr(pred)))
     # equality mask gathered across all workers
     self.eq = _allgather_2dim(L.cast(L.equal(pred, label), 'int64'))
     self.reset()
示例#12
0
def get_enc_bias(source_inputs):
    """Build an attention bias of -1e9 at all-zero (padding) encoder
    positions, shaped for broadcasting over attention logits.
    """
    inputs_f32 = layers.cast(source_inputs, 'float32')
    token_magnitude = layers.reduce_sum(layers.abs(inputs_f32), dim=-1)
    zero = layers.fill_constant([1], 'float32', value=0)
    pad_flag = layers.cast(layers.equal(token_magnitude, zero), 'float32')
    bias = pad_flag * -1e9
    return layers.unsqueeze(layers.unsqueeze(bias, axes=[1]), axes=[1])
示例#13
0
    def test_return_var_tuple(self):
        """layers.case must support branch fns that return tuples of vars."""

        def make_branch(shape_a, val_a, shape_b, val_b):
            # Factory for a branch returning (int32 tensor, float32 tensor).
            def branch():
                return (layers.fill_constant(shape=shape_a,
                                             dtype='int32',
                                             value=val_a),
                        layers.fill_constant(shape=shape_b,
                                             dtype='float32',
                                             value=val_b))

            return branch

        fn_1 = make_branch([1, 2], 1, [2, 3], 2)
        fn_2 = make_branch([3, 4], 3, [4, 5], 4)
        fn_3 = make_branch([5], 5, [5, 6], 6)

        main_program = Program()
        startup_program = Program()
        with program_guard(main_program, startup_program):
            x = layers.fill_constant(shape=[1], dtype='float32', value=1)
            y = layers.fill_constant(shape=[1], dtype='float32', value=1)
            z = layers.fill_constant(shape=[1], dtype='float32', value=3)

            pred_1 = layers.equal(x, y)  # true
            pred_2 = layers.equal(x, z)  # false

            out = layers.case(((pred_1, fn_1), (pred_2, fn_2)), fn_3)

            if core.is_compiled_with_cuda():
                place = fluid.CUDAPlace(0)
            else:
                place = fluid.CPUPlace()
            exe = fluid.Executor(place)
            ret = exe.run(main_program, fetch_list=out)

            # pred_1 holds, so fn_1's constants are returned
            self.assertTrue(
                np.allclose(np.asarray(ret[0]), np.full((1, 2), 1, np.int32)))
            self.assertTrue(
                np.allclose(np.asarray(ret[1]), np.full((2, 3), 2,
                                                        np.float32)))
示例#14
0
def matrix_nms(seg_masks, cate_labels, cate_scores, kernel='gaussian', sigma=2.0, sum_masks=None):
    """Matrix NMS for multi-class masks.

    Args:
        seg_masks (Tensor): shape (n, h, w), binary (0/1) masks
        cate_labels (Tensor): shape (n), mask labels in descending order
        cate_scores (Tensor): shape (n), mask scores in descending order
        kernel (str):  'linear' or 'gaussian'
        sigma (float): std in gaussian method
        sum_masks (Tensor):  shape (n, ), area of each of the n objects

    Returns:
        Tensor: cate_scores_update, tensors of shape (n)
    """
    n_samples = L.shape(cate_labels)[0]   # number of detected objects
    seg_masks = L.reshape(seg_masks, (n_samples, -1))   # [n, h*w]
    # inter.
    inter_matrix = L.matmul(seg_masks, seg_masks, transpose_y=True)   # [n, n] pairwise intersection areas (masks times own transpose)
    # union.
    sum_masks_x = L.expand(L.reshape(sum_masks, (1, -1)), [n_samples, 1])     # [n, n]  sum_masks repeated over n rows
    # iou.
    iou_matrix = inter_matrix / (sum_masks_x + L.transpose(sum_masks_x, [1, 0]) - inter_matrix)
    rows = L.range(0, n_samples, 1, 'int32')
    cols = L.range(0, n_samples, 1, 'int32')
    rows = L.expand(L.reshape(rows, (1, -1)), [n_samples, 1])
    cols = L.expand(L.reshape(cols, (-1, 1)), [1, n_samples])
    tri_mask = L.cast(rows > cols, 'float32')
    iou_matrix = tri_mask * iou_matrix   # [n, n]   keep strict triangle (higher-scored mask vs lower) only

    # label_specific matrix.
    cate_labels_x = L.expand(L.reshape(cate_labels, (1, -1)), [n_samples, 1])     # [n, n]  cate_labels repeated over n rows
    label_matrix = L.cast(L.equal(cate_labels_x, L.transpose(cate_labels_x, [1, 0])), 'float32')
    label_matrix = tri_mask * label_matrix   # [n, n]   same-class pairs, triangle part only

    # IoU compensation
    compensate_iou = L.reduce_max(iou_matrix * label_matrix, dim=0)
    compensate_iou = L.expand(L.reshape(compensate_iou, (1, -1)), [n_samples, 1])     # [n, n]
    compensate_iou = L.transpose(compensate_iou, [1, 0])      # [n, n]

    # IoU decay
    decay_iou = iou_matrix * label_matrix

    # # matrix nms
    if kernel == 'gaussian':
        decay_matrix = L.exp(-1 * sigma * (decay_iou ** 2))
        compensate_matrix = L.exp(-1 * sigma * (compensate_iou ** 2))
        decay_coefficient = L.reduce_min((decay_matrix / compensate_matrix), dim=0)
    elif kernel == 'linear':
        decay_matrix = (1-decay_iou)/(1-compensate_iou)
        decay_coefficient = L.reduce_min(decay_matrix, dim=0)
    else:
        raise NotImplementedError

    # update the score.
    cate_scores_update = cate_scores * decay_coefficient
    return cate_scores_update
示例#15
0
    def build_model(self):
        """3-layer GCN with SAG pooling and a sigmoid classification head.

        Side effects: sets self.labels (cast to float32), self.loss,
        self.pred and self.correct on the instance.
        """
        node_features = self.graph_wrapper.node_feat["feat"]

        # three GCN layers; all three outputs are concatenated below
        output = self.gcn(gw=self.graph_wrapper,
                          feature=node_features,
                          hidden_size=self.hidden_size,
                          activation="relu",
                          norm=self.graph_wrapper.node_feat["norm"],
                          name="gcn_layer_1")
        output1 = output
        output = self.gcn(gw=self.graph_wrapper,
                          feature=output,
                          hidden_size=self.hidden_size,
                          activation="relu",
                          norm=self.graph_wrapper.node_feat["norm"],
                          name="gcn_layer_2")
        output2 = output
        output = self.gcn(gw=self.graph_wrapper,
                          feature=output,
                          hidden_size=self.hidden_size,
                          activation="relu",
                          norm=self.graph_wrapper.node_feat["norm"],
                          name="gcn_layer_3")

        output = L.concat(input=[output1, output2, output], axis=-1)

        # self-attention graph pooling, then per-graph sum/max readout
        output, ratio_length = sag_pool(gw=self.graph_wrapper,
                                        feature=output,
                                        ratio=self.pooling_ratio,
                                        graph_id=self.graph_id,
                                        dataset=self.args.dataset_name,
                                        name="sag_pool_1")
        output = L.lod_reset(output, self.graph_wrapper.graph_lod)
        cat1 = L.sequence_pool(output, "sum")
        ratio_length = L.cast(ratio_length, dtype="float32")
        # normalize the sum-pool by the kept-node count
        cat1 = L.elementwise_div(cat1, ratio_length, axis=-1)
        cat2 = L.sequence_pool(output, "max")
        output = L.concat(input=[cat2, cat1], axis=-1)

        # MLP classification head
        output = L.fc(output, size=self.hidden_size, act="relu")
        output = L.dropout(output, dropout_prob=self.dropout_ratio)
        output = L.fc(output, size=self.hidden_size // 2, act="relu")
        output = L.fc(output,
                      size=self.num_classes,
                      act=None,
                      param_attr=fluid.ParamAttr(name="final_fc"))

        # sigmoid cross-entropy loss over float labels
        self.labels = L.cast(self.labels, dtype="float32")
        loss = L.sigmoid_cross_entropy_with_logits(x=output, label=self.labels)
        self.loss = L.mean(loss)
        # accuracy bookkeeping: argmax prediction vs 1-d labels
        pred = L.sigmoid(output)
        self.pred = L.argmax(x=pred, axis=-1)
        correct = L.equal(self.pred, self.labels_1dim)
        correct = L.cast(correct, dtype="int32")
        self.correct = L.reduce_sum(correct)
示例#16
0
    def forward(self, features):
        """ERNIE sequence classification head.

        Args:
            features: tuple of (src_ids, sent_ids) int64 id tensors;
                pad id is assumed to be 0.

        Returns:
            Softmax probabilities in PREDICT mode, raw logits otherwise.
        """
        src_ids, sent_ids = features
        dtype = 'float16' if self.hparam['fp16'] else 'float32'
        # attention mask: 1.0 for real tokens, 0.0 where id == 0 (pad)
        zero = L.fill_constant([1], dtype='int64', value=0)
        input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)), dtype) # assume pad id == 0
        #input_mask = L.unsqueeze(input_mask, axes=[2])
        # positional ids 0..seqlen-1, tiled over the batch
        d_shape = L.shape(src_ids)
        seqlen = d_shape[1]
        batch_size = d_shape[0]
        pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])
        pos_ids = L.expand(pos_ids, [batch_size, 1])
        pos_ids = L.unsqueeze(pos_ids, axes=[2])
        pos_ids = L.cast(pos_ids, 'int64')
        pos_ids.stop_gradient = True
        input_mask.stop_gradient = True
        task_ids = L.zeros_like(src_ids) + self.hparam.task_id # constant task ids; unused placeholder at the moment
        task_ids.stop_gradient = True

        bert = ErnieModel(
            src_ids=src_ids,
            position_ids=pos_ids,
            sentence_ids=sent_ids,
            task_ids=task_ids,
            input_mask=input_mask,
            config=self.hparam,
            use_fp16=self.hparam['fp16']
        )

        # [CLS] pooled representation -> dropout -> linear classifier
        cls_feats = bert.get_pooled_output()

        cls_feats = L.dropout(
            x=cls_feats,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train"
        )

        logits = L.fc(
            input=cls_feats,
            size=self.hparam['num_label'],
            param_attr=F.ParamAttr(
                name="cls_out_w",
                initializer=F.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=F.ParamAttr(
                name="cls_out_b", initializer=F.initializer.Constant(0.))
        )

        propeller.summary.histogram('pred', logits)

        if self.mode is propeller.RunMode.PREDICT:
            probs = L.softmax(logits)
            return probs
        else:
            return logits
示例#17
0
        def bow(ids):
            """Bag-of-words encoder: embed, zero out pad positions (id == 0),
            sum over the sequence axis and squash with softsign."""
            emb = L.embedding(
                input=ids,
                size=[self.config.vocab_size, self.config.emb_size],
                dtype=self._emb_dtype,
                param_attr=F.ParamAttr(name=self._word_emb_name,
                                       initializer=self._param_initializer),
                is_sparse=False)

            pad_id = L.fill_constant(shape=[1], dtype='int64', value=0)
            keep = L.cast(L.logical_not(L.equal(ids, pad_id)), 'float32')
            pooled = L.reduce_sum(emb * keep, dim=1)
            return L.softsign(pooled)
示例#18
0
    def empty(cls, stack_data, dtype='bool'):
        """Return True if stack is empty(pos == 0)

        Args:
            stack_data (TYPE): NULL
            dtype (str): result dtype. Default is bool.

        Returns: Variable
                 shape=[-1], dtype=params<dtype>

        Raises: NULL
        """
        is_empty = layers.equal(stack_data.pos,
                                layers.zeros_like(stack_data.pos))
        if dtype == 'bool':
            return is_empty
        return layers.cast(is_empty, dtype=dtype)
示例#19
0
    def build_model(self, enc_input, dec_input, tgt_label, label_weights):
        """Build the model with source encoding and target decoding.

        Args:
            enc_input: encoder inputs.
            dec_input: decoder inputs.
            tgt_label: gold target token ids, last dim 1.
            label_weights: per-token weights (0 at padding positions).

        Returns:
            dict of persistable vars: loss, sum_correct, token_num.
        """

        enc_word_output, enc_sen_output = self.encode(enc_input)
        dec_output = self.decode(dec_input, enc_word_output, enc_sen_output)

        # token-accuracy numerator, weighted so padding does not count
        predict_token_idx = layers.argmax(dec_output, axis=-1)
        correct_token_idx = layers.cast(layers.equal(
            tgt_label, layers.reshape(predict_token_idx, shape=[-1, 1])),
                                        dtype='float32')
        weighted_correct = layers.elementwise_mul(x=correct_token_idx,
                                                  y=label_weights,
                                                  axis=0)
        sum_correct = layers.reduce_sum(weighted_correct)
        sum_correct.stop_gradient = True

        # Padding index do not contribute to the total loss. The weights is used to
        # cancel padding index in calculating the loss.
        if self._label_smooth_eps:
            # TODO: use fluid.input.one_hot after softmax_with_cross_entropy removing
            # the enforcement that the last dimension of label must be 1.
            tgt_label = layers.label_smooth(label=layers.one_hot(
                input=tgt_label, depth=self.voc_size),
                                            epsilon=self._label_smooth_eps)

        cost = layers.softmax_with_cross_entropy(
            logits=dec_output,
            label=tgt_label,
            soft_label=True if self._label_smooth_eps else False)

        # average cost per real (non-pad) token
        weighted_cost = layers.elementwise_mul(x=cost, y=label_weights, axis=0)
        sum_cost = layers.reduce_sum(weighted_cost)
        token_num = layers.reduce_sum(label_weights)
        token_num.stop_gradient = True
        avg_cost = sum_cost / token_num

        graph_vars = {
            "loss": avg_cost,
            "sum_correct": sum_correct,
            "token_num": token_num,
        }
        # keep fetch targets alive across program pruning
        for k, v in graph_vars.items():
            v.persistable = True

        return graph_vars
    def grow_topk(self, i, logits, alive_seq, alive_log_probs, cache, enc_output, enc_bias):
        """Grow every alive beam by one token and keep the 2*beam_size best.

        Args:
            i: current decode step (tensor).
            logits: decoder logits for the current step.
            alive_seq: flat token sequences of alive beams.
            alive_log_probs: [batch, beam] cumulative log-probs.
            cache: decoder cache, reordered in place via self.gather_cache.
            enc_output, enc_bias: encoder results (not used directly here).

        Returns: (topk_seq, topk_log_probs, topk_scores, topk_finished, cache).
        """
        logits = layers.reshape(logits, [self.batch_size, self.beam_size, -1])
        
        candidate_log_probs = layers.log(layers.softmax(logits, axis=2))
        log_probs = candidate_log_probs + layers.unsqueeze(alive_log_probs, axes=[2]) 
        
        # length penalty: ((i + 6) / 6) ** alpha
        base_1 = layers.cast(i, 'float32') + 6.0
        base_1 /= 6.0
        length_penalty = layers.pow(base_1, self.alpha)
        #length_penalty = layers.pow(((5.0 + layers.cast(i+1, 'float32')) / 6.0), self.alpha)
        
        curr_scores = log_probs / length_penalty
        flat_curr_scores = layers.reshape(curr_scores, [self.batch_size, self.beam_size * self.vocab_size])

        # 2x beam_size so finished candidates can be replaced downstream
        topk_scores, topk_ids = layers.topk(flat_curr_scores, k=self.beam_size * 2)
        
        # undo the penalty to recover raw cumulative log-probs
        topk_log_probs = topk_scores * length_penalty

        # flat index -> (beam, token id)
        select_beam_index = topk_ids // self.vocab_size
        select_id = topk_ids % self.vocab_size

        #layers.Print(select_id, message="select_id", summarize=1024)
        #layers.Print(topk_scores, message="topk_scores", summarize=10000000)
        
        # offset beam indices into the flattened [batch * beam] layout
        flat_select_beam_index = layers.reshape(select_beam_index, [-1]) + self.gather_top2k_append_index
        
        # NOTE(review): the index is wrapped in a list here — confirm
        # layers.gather accepts that form.
        topk_seq = layers.gather(alive_seq, [flat_select_beam_index])
        topk_seq = layers.reshape(topk_seq, [self.batch_size, 2 * self.beam_size, -1])
        
        
        #concat with current ids
        topk_seq = layers.concat([topk_seq, layers.unsqueeze(select_id, axes=[2])], axis=2)
        topk_finished = layers.cast(layers.equal(select_id, self.eos_id), 'float32') 
        
        #gather cache
        self.gather_cache(cache, flat_select_beam_index)

        #topk_seq: [batch_size, 2*beam_size, i+1]
        #topk_log_probs, topk_scores, topk_finished: [batch_size, 2*beam_size]
        return topk_seq, topk_log_probs, topk_scores, topk_finished, cache
示例#21
0
    def forward(self, features):
        """ERNIE sequence-labeling head: per-token label logits.

        Args:
            features: tuple of (src_ids, sent_ids, input_seqlen).

        Returns:
            (logits, input_seqlen) where logits has num_flatten_dims=2
            (per-token scores of size self.num_label).
        """
        src_ids, sent_ids, input_seqlen = features
        zero = L.fill_constant([1], dtype='int64', value=0)
        # NOTE(review): this mask is 1.0 at pad positions (id == 0); other
        # forward() variants in this file apply logical_not before the cast —
        # confirm the polarity ErnieModel expects here.
        input_mask = L.cast(L.equal(src_ids, zero),
                            'float32')  # assume pad id == 0
        #input_mask = L.unsqueeze(input_mask, axes=[2])
        # positional ids 0..seqlen-1, tiled over the batch
        d_shape = L.shape(src_ids)
        seqlen = d_shape[1]
        batch_size = d_shape[0]
        pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])
        pos_ids = L.expand(pos_ids, [batch_size, 1])
        pos_ids = L.unsqueeze(pos_ids, axes=[2])
        pos_ids = L.cast(pos_ids, 'int64')
        pos_ids.stop_gradient = True
        input_mask.stop_gradient = True
        task_ids = L.zeros_like(
            src_ids) + self.hparam.task_id  # constant task ids; unused placeholder at the moment
        task_ids.stop_gradient = True

        model = ErnieModel(src_ids=src_ids,
                           position_ids=pos_ids,
                           sentence_ids=sent_ids,
                           task_ids=task_ids,
                           input_mask=input_mask,
                           config=self.hparam,
                           use_fp16=self.hparam['use_fp16'])

        # per-token features -> label logits (num_flatten_dims=2 keeps the
        # sequence axis intact)
        enc_out = model.get_sequence_output()
        logits = L.fc(
            input=enc_out,
            size=self.num_label,
            num_flatten_dims=2,
            param_attr=F.ParamAttr(
                name="cls_seq_label_out_w",
                initializer=F.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=F.ParamAttr(name="cls_seq_label_out_b",
                                  initializer=F.initializer.Constant(0.)))

        propeller.summary.histogram('pred', logits)

        return logits, input_seqlen
示例#22
0
def _check_finished(decoder, next_inputs, finished, outputs_array):
    """Mark instances whose next action is STOP as finished.

    Newly finished instances get the grammar END token pushed onto
    `outputs_array`; the `finished` tensor is updated in place.

    Args:
        decoder (TYPE): NULL
        next_inputs (TYPE): NULL
        finished (TYPE): NULL
        outputs_array (TYPE): NULL

    Returns: TODO

    Raises: NULL
    """
    # Instances that emit ACTION_STOP now but were not finished before.
    stop_action = tensor.fill_constant_batch_size_like(
        next_inputs.action,
        shape=next_inputs.action.shape,
        value=decoder._grammar.ACTION_STOP,
        dtype='int64')
    just_finished = layers.logical_and(
        layers.equal(next_inputs.action, stop_action),
        layers.logical_not(finished))

    # Candidate array state with END appended (computed out of place).
    end_tokens = tensor.fill_constant_batch_size_like(
        outputs_array.data,
        shape=[-1],
        value=decoder._grammar.END,
        dtype=outputs_array.data.dtype)
    pushed_data, pushed_pos = data_structure.Array.push(outputs_array,
                                                        end_tokens,
                                                        in_place=False)

    # Keep the pushed state only for rows that just finished.
    sel_data, sel_pos = nn_utils.ifelse(
        just_finished, [pushed_data, pushed_pos],
        [outputs_array.data, outputs_array.pos])

    # Commit the selection, then fold the new finishes into `finished`.
    layers.assign(sel_data, outputs_array.data)
    layers.assign(sel_pos, outputs_array.pos)
    layers.logical_or(finished, just_finished, out=finished)
示例#23
0
    def training_network(self, img, caption):
        """Build the training graph: teacher-forced decoding of the caption
        plus pad-masked cross-entropy, normalized by the non-pad token count.
        """
        # Shifted pair: predict token t+1 (target) from tokens <= t (source).
        tgt_words = caption[:, 1:]
        src_words = caption[:, :-1]

        # Zero out loss at PAD positions; `norm` counts real tokens.
        pad_ids = layers.fill_constant_batch_size_like(tgt_words, shape=[-1, decoder_config['sentence_length'] - 1],
                                                       dtype='int64', value=config.dc['padding_idx'])
        is_pad = layers.equal(tgt_words, pad_ids)
        token_mask = layers.cast(layers.logical_not(is_pad), 'float32')
        norm = layers.reduce_sum(token_mask)
        token_mask.stop_gradient = True
        norm.stop_gradient = True

        # Model: image features drive the attention RNN decoder.
        decoder = Decoder(decoder_config['hidden_dim'], rnn_layer=1)
        image_embed, global_image_feat = self._img2feature(img)  # [batch, k+1, hidden], [batch, hidden]

        # TODO (translated from original note): embedding should be done
        # either inside the RNN or outside it — this needs rework.
        seq_out = decoder.call(global_image_feat, image_embed, embedding_function, words=src_words)

        token_loss = layers.squeeze(ImageCaptionModel.loss(tgt_words, seq_out), axes=[2])
        token_loss = layers.elementwise_mul(token_loss, token_mask)
        output_loss = layers.elementwise_div(layers.reduce_sum(token_loss), norm, name='loss')
        return output_loss
示例#24
0
def decode_with_grammar(decoder, inits, decode_vocab, max_step_num, **kwargs):
    """A modification of paddle.fluid.layers.dynamic_decode(...).
    Dynamic decoding performs :code:`decoder.step()` repeatedly until the returned
    Tensor indicating finished status contains all True values or the number of
    decoding step reachs to :attr:`max_step_num`.
    :code:`decoder.initialize()` would be called once before the decoding loop.
    If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()`
    would be called once after the decoding loop.

    Args:
        decoder(Decoder): An instance of `Decoder`.
        inits(tuple): Argument passed to `decoder.initialize`. 
        decode_vocab(DecoderDynamicVocab): namedtuple(table table_len column column_len value value_len)
        max_step_num(int): The maximum number of steps.
        **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`. 

    Returns:
        tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \
            outputs and states, both are Tensor or nested structure of Tensor. \
            `final_outputs` has the same structure and data types as \
            :code:`decoder.output_dtype` , and each Tenser in `final_outputs` \
            is the stacked of all decoding steps' outputs, which might be revised \
            by :code:`decoder.finalize` . `final_states` is the counterpart \
            at last time step of initial states returned by :code:`decoder.initialize` , \
            thus has the same structure with it and has tensors with same shapes \
            and data types.
    """
    # Step counter starts at 1; the loop bound is max_step_num - 2
    # (NOTE(review): the -2 margin presumably reserves room for END/STOP
    # tokens — confirm against the grammar definition).
    step_cnt = tensor.fill_constant(shape=[1], dtype="int64", value=1)
    max_step_num_tensor = tensor.fill_constant(shape=[1],
                                               dtype="int64",
                                               value=max_step_num - 2)

    # shape = [batch_size, beam_size, ...]
    initial_inputs, initial_states, initial_finished = decoder.initialize(
        inits, decode_vocab)
    # Globals are the loop-carried variables updated via tensor.assign below.
    global_inputs, global_states, global_finished = (initial_inputs,
                                                     initial_states,
                                                     initial_finished)
    inputs = initial_inputs
    states = initial_states

    # Buffer holding the decoded output ids (data + write position).
    outputs_arr_data = tensor.fill_constant_batch_size_like(
        inputs.input,
        shape=[-1, decoder.beam_size, max_step_num],
        dtype=decoder.output_dtype.predicted_ids,
        value=0)
    outputs_arr_pos = tensor.fill_constant_batch_size_like(
        inputs.input, shape=[-1, decoder.beam_size, 1], dtype='int64', value=0)
    outputs_array = data_structure.ArrayData(
        decoder.merge_batch_beams(outputs_arr_data),
        decoder.merge_batch_beams(outputs_arr_pos))

    sequence_lengths = tensor.cast(tensor.zeros_like(initial_finished),
                                   "int64")

    # Grammar-constrained decoding state: a stack of pending grammar symbols.
    grammar_stack_dat = tensor.fill_constant_batch_size_like(
        inputs.input,
        shape=[-1, decoder.beam_size, max_step_num * STACK_EXPAND_TIMES],
        dtype='int64',
        value=0)
    grammar_stack_pos = tensor.fill_constant_batch_size_like(
        inputs.input, shape=[-1, decoder.beam_size, 1], dtype='int64', value=0)
    grammar_stack = data_structure.StackData(
        decoder.merge_batch_beams(grammar_stack_dat),
        decoder.merge_batch_beams(grammar_stack_pos))

    ############  Decode in a loop until every instance is finished  ############
    #   Finish condition: global_finished/next_finished && max_step_num
    cond = layers.logical_not((layers.reduce_all(initial_finished)))
    while_op = layers.While(cond)
    with while_op.block():
        # step_outputs --> OutputWrapper
        # next_states  --> StateWrapper
        # next_inputs  --> DecoderInputsWrapper
        step_outputs, next_states, next_inputs = decoder.step(
            inputs, states, **kwargs)
        predicted_ids = step_outputs.predicted_ids
        # Record this step's predictions for all not-yet-finished rows.
        _save_predict_output(outputs_array, predicted_ids,
                             next_states.finished)

        pred_gmr_type = decoder.grammar_type(predicted_ids)
        cond_type_leaf = layers.equal(pred_gmr_type, decoder.GMR_TYPE.LEAF)
        cond_type_midd = layers.equal(pred_gmr_type, decoder.GMR_TYPE.MID)

        # Leaf tokens pop the grammar stack; mid tokens push their expansion.
        _process_type_leaf(cond_type_leaf, decoder, grammar_stack, next_inputs,
                           next_states.finished)
        _process_type_midd(cond_type_midd, decoder, grammar_stack, next_inputs,
                           predicted_ids)

        ##next_sequence_lengths = layers.elementwise_add(sequence_lengths,
        ##                        tensor.cast(layers.logical_not(global_finished), sequence_lengths.dtype))

        _check_finished(decoder, next_inputs, next_states.finished,
                        outputs_array)

        # Write the step results back into the loop-carried variables.
        layers.utils.map_structure(tensor.assign, next_inputs, global_inputs)
        layers.utils.map_structure(tensor.assign, next_states, global_states)
        tensor.assign(next_states.finished, global_finished)
        ##tensor.assign(next_sequence_lengths, sequence_lengths)

        # Update the loop condition: not all finished AND step budget left.
        layers.increment(x=step_cnt, value=1.0, in_place=True)
        layers.logical_and(
            layers.logical_not(layers.reduce_all(next_states.finished)),
            layers.less_equal(step_cnt, max_step_num_tensor), cond)

    final_outputs = outputs_array.data
    final_states = global_states

    final_outputs, final_states = decoder.finalize(final_outputs,
                                                   global_states,
                                                   sequence_lengths)

    return final_outputs, final_states
示例#25
0
 def __init__(self, label, pred):
     """Build the element-wise equality tensor between `pred` and `label`
     and reset the accumulated metric state."""
     # eq is True at positions where the prediction matches the label.
     self.eq = L.equal(pred, label)
     self.reset()
def knowledge_seq2seq(config):
    """Build the knowledge-grounded seq2seq graph (prior/posterior attention
    over knowledge, as in PostKS-style models).

    Encodes the context (`enc_input`), goal (`goal_input`) and knowledge
    (`cue_input`) sequences, attends over knowledge, then:
      * run_type == "train": returns [bow_loss, kl_loss, nll_loss, final_loss]
      * run_type == "test":  returns (final_score, final_ids, final_index)
        from a hand-rolled beam search.

    NOTE(review): the train branch unconditionally references `bow_loss`
    (defined only when config.use_bow) and `weight_target`/`target_query`
    (defined only when config.use_posterior) — training therefore requires
    both flags to be enabled or it raises NameError.
    """
    emb_size = config.embed_size
    hidden_size = config.hidden_size
    input_size = emb_size
    num_layers = config.num_layers
    bi_direc = config.bidirectional
    batch_size = config.batch_size
    vocab_size = config.vocab_size
    run_type = config.run_type

    # --- data layers (LoD tensors for the variable-length inputs) ---
    enc_input = layers.data(name="enc_input",
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  #enc_input --> goal
    enc_mask = layers.data(name="enc_mask", shape=[-1, 1], dtype='float32')
    goal_input = layers.data(name="goal_input",
                             shape=[1],
                             dtype='int64',
                             lod_level=1)  #goal_input --> x
    cue_input = layers.data(name="cue_input",
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  #cue_input --> kg
    #cue_mask = layers.data(name='cue_mask', shape=[-1, 1], dtype='float32')
    memory_mask = layers.data(name='memory_mask',
                              shape=[-1, 1],
                              dtype='float32')
    tar_input = layers.data(name='tar_input',
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  #tar_input --> y
    # tar_mask = layers.data(name="tar_mask", shape=[-1, 1], dtype='float32')

    # Bidirectional halves the per-direction hidden size.
    rnn_hidden_size = hidden_size
    if bi_direc:
        rnn_hidden_size //= 2

    # --- encoders: context, goal, knowledge ---
    enc_out, enc_last_hidden = \
        rnn_encoder(enc_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, name="rnn_enc")
    goal_out, goal_last_hidden = \
        rnn_encoder(goal_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, name="rnn_enc1")
    # Fuse context + goal final states into the decoder bridge input.
    context_goal_out = fluid.layers.concat(
        input=[enc_last_hidden, goal_last_hidden], axis=2)
    context_goal_out = layers.reshape(context_goal_out,
                                      shape=[-1, 1, rnn_hidden_size * 4])
    # context_goal_out = layers.squeeze(context_goal_out, axes=[1])
    context_goal_out = fluid.layers.fc(context_goal_out,
                                       size=rnn_hidden_size * 2,
                                       bias_attr=False)
    context_goal_out = layers.unsqueeze(context_goal_out, axes=[0])
    bridge_out = fc(context_goal_out, hidden_size, hidden_size, name="bridge")
    bridge_out = layers.tanh(bridge_out)

    cue_last_mask = layers.data(name='cue_last_mask',
                                shape=[-1],
                                dtype='float32')
    knowledge_out, knowledge_last_hidden = \
        rnn_encoder(cue_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, last_mask=cue_last_mask, name="knowledge_enc")

    # --- prior attention: bridge state attends over knowledge entries ---
    query = layers.slice(bridge_out, axes=[0], starts=[0], ends=[1])
    query = layers.squeeze(query, axes=[0])
    query = layers.unsqueeze(query, axes=[1])
    query = layers.reshape(query, shape=[batch_size, -1, hidden_size])
    cue_memory = layers.slice(knowledge_last_hidden,
                              axes=[0],
                              starts=[0],
                              ends=[1])
    cue_memory = layers.reshape(cue_memory,
                                shape=[batch_size, -1, hidden_size])
    memory_mask = layers.reshape(memory_mask, shape=[batch_size, 1, -1])

    weighted_cue, cue_att = dot_attention(query, cue_memory, mask=memory_mask)

    cue_att = layers.reshape(cue_att, shape=[batch_size, -1])

    knowledge = weighted_cue
    if config.use_posterior:
        # --- posterior attention: target response attends over knowledge ---
        print("config.use_posterior", config.use_posterior)
        target_out, target_last_hidden = \
            rnn_encoder(tar_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                        dropout=0.0, batch_first=True, name="knowledge_enc1")
        target_goal_out = fluid.layers.concat(
            input=[target_last_hidden, goal_last_hidden], axis=2)
        target_goal_out = layers.reshape(target_goal_out,
                                         shape=[-1, 1, rnn_hidden_size * 4])
        # target_goal_out = layers.squeeze(target_goal_out, axes=[1])
        target_goal_out = fluid.layers.fc(target_goal_out,
                                          size=rnn_hidden_size * 2,
                                          bias_attr=False)
        target_goal_out = layers.unsqueeze(target_goal_out, axes=[0])

        # get attenion
        # target_query = layers.slice(target_last_hidden, axes=[0], starts=[0], ends=[1])
        target_query = layers.slice(target_goal_out,
                                    axes=[0],
                                    starts=[0],
                                    ends=[1])
        target_query = layers.squeeze(target_query, axes=[0])
        target_query = layers.unsqueeze(target_query, axes=[1])
        target_query = layers.reshape(target_query,
                                      shape=[batch_size, -1, hidden_size])

        weight_target, target_att = dot_attention(target_query,
                                                  cue_memory,
                                                  mask=memory_mask)
        target_att = layers.reshape(target_att, shape=[batch_size, -1])
        # add to output
        knowledge = weight_target

    enc_memory_mask = layers.data(name="enc_memory_mask",
                                  shape=[-1, 1],
                                  dtype='float32')
    enc_memory_mask = layers.unsqueeze(enc_memory_mask, axes=[1])
    # decoder init_hidden, enc_memory, enc_mask
    dec_init_hidden = bridge_out
    pad_value = fluid.layers.assign(input=np.array([0.0], dtype='float32'))

    # Pad the LoD encoder outputs into a dense [batch, max_len, hidden] memory.
    enc_memory, origl_len_1 = layers.sequence_pad(x=enc_out,
                                                  pad_value=pad_value)
    enc_memory.persistable = True

    gru_unit = GRU_unit(input_size + hidden_size,
                        hidden_size,
                        num_layers=num_layers,
                        dropout=0.0,
                        name="decoder_gru_unit")

    cue_gru_unit = GRU_unit(hidden_size + hidden_size,
                            hidden_size,
                            num_layers=num_layers,
                            dropout=0.0,
                            name="decoder_cue_gru_unit")

    tgt_vocab_size = config.vocab_size
    if run_type == "train":
        if config.use_bow:
            # Bag-of-words auxiliary loss: predict the response words
            # directly from the selected knowledge.
            bow_logits = fc(knowledge,
                            hidden_size,
                            hidden_size,
                            name='bow_fc_1')
            bow_logits = layers.tanh(bow_logits)
            bow_logits = fc(bow_logits,
                            hidden_size,
                            tgt_vocab_size,
                            name='bow_fc_2')
            bow_logits = layers.softmax(bow_logits)

            bow_label = layers.data(name='bow_label',
                                    shape=[-1, config.max_len],
                                    dtype='int64')
            bow_mask = layers.data(name="bow_mask",
                                   shape=[-1, config.max_len],
                                   dtype='float32')

            bow_logits = layers.expand(bow_logits, [1, config.max_len, 1])
            bow_logits = layers.reshape(bow_logits, shape=[-1, tgt_vocab_size])
            bow_label = layers.reshape(bow_label, shape=[-1, 1])
            bow_loss = layers.cross_entropy(bow_logits,
                                            bow_label,
                                            soft_label=False)
            bow_loss = layers.reshape(bow_loss, shape=[-1, config.max_len])

            bow_loss *= bow_mask
            bow_loss = layers.reduce_sum(bow_loss, dim=[1])
            bow_loss = layers.reduce_mean(bow_loss)

        dec_input = layers.data(name="dec_input",
                                shape=[-1, 1, 1],
                                dtype='int64')
        dec_mask = layers.data(name="dec_mask", shape=[-1, 1], dtype='float32')

        # NOTE(review): `weight_target` / `target_query` only exist when
        # config.use_posterior is set — NameError otherwise.
        dec_knowledge = weight_target

        knowledge_goal_out = fluid.layers.concat(
            input=[dec_knowledge, target_query], axis=2)
        knowledge_goal_out = layers.reshape(knowledge_goal_out,
                                            shape=[-1, 1, rnn_hidden_size * 4])
        # knowledge_goal_out = layers.squeeze(knowledge_goal_out, axes=[1])
        knowledge_goal_out = fluid.layers.fc(knowledge_goal_out,
                                             size=rnn_hidden_size * 2,
                                             bias_attr=False)
        knowledge_goal_out = layers.unsqueeze(knowledge_goal_out, axes=[0])

        # Teacher-forced decoding over the padded encoder memory.
        decoder_logits = \
            rnn_decoder(gru_unit, cue_gru_unit, dec_input, input_size, hidden_size, num_layers,
                         enc_memory, enc_memory_mask, dec_knowledge, vocab_size,
                         init_hidden=dec_init_hidden, mask=dec_mask, dropout=config.dropout)

        target_label = layers.data(name='target_label',
                                   shape=[-1, 1],
                                   dtype='int64')
        target_mask = layers.data(name='target_mask',
                                  shape=[-1, 1],
                                  dtype='float32')

        decoder_logits = layers.reshape(decoder_logits,
                                        shape=[-1, tgt_vocab_size])
        target_label = layers.reshape(target_label, shape=[-1, 1])

        # Masked NLL over target tokens, averaged per batch row.
        nll_loss = layers.cross_entropy(decoder_logits,
                                        target_label,
                                        soft_label=False)
        nll_loss = layers.reshape(nll_loss, shape=[batch_size, -1])
        nll_loss *= target_mask
        nll_loss = layers.reduce_sum(nll_loss, dim=[1])
        nll_loss = layers.reduce_mean(nll_loss)

        # KL(posterior || prior) over knowledge attention; posterior is
        # detached so only the prior side receives gradient.
        prior_attn = cue_att + 1e-10
        posterior_att = target_att
        posterior_att.stop_gradient = True

        prior_attn = layers.log(prior_attn)

        kl_loss = posterior_att * (layers.log(posterior_att + 1e-10) -
                                   prior_attn)
        kl_loss = layers.reduce_mean(kl_loss)

        # Scalar schedule factor fed from the trainer (e.g. KL annealing).
        kl_and_nll_factor = layers.data(name='kl_and_nll_factor',
                                        shape=[1],
                                        dtype='float32')
        kl_and_nll_factor = layers.reshape(kl_and_nll_factor, shape=[-1])

        # NOTE(review): `bow_loss` only exists when config.use_bow is set.
        final_loss = bow_loss + kl_loss * kl_and_nll_factor + nll_loss * kl_and_nll_factor

        return [bow_loss, kl_loss, nll_loss, final_loss]

    elif run_type == "test":
        # --- hand-rolled beam search over a fixed number of steps ---
        beam_size = config.beam_size
        batch_size = config.batch_size
        token = layers.fill_constant(shape=[batch_size * beam_size, 1],
                                     value=config.bos_id,
                                     dtype='int64')

        token = layers.reshape(token, shape=[-1, 1])
        max_decode_len = config.max_dec_len

        dec_knowledge = knowledge
        INF = 100000000.0

        # Only the first hypothesis of each beam starts alive (score 0);
        # the rest start at -INF so step 1 expands a single hypothesis.
        init_score_np = np.ones([beam_size * batch_size],
                                dtype='float32') * -INF

        for i in range(batch_size):
            init_score_np[i * beam_size] = 0.0

        pre_score = layers.assign(init_score_np)

        # Row offsets that map per-batch beam indices to flat indices.
        pos_index_np = np.arange(batch_size).reshape(-1, 1)
        pos_index_np = \
            np.tile(pos_index_np, (1, beam_size)).reshape(-1).astype('int32') * beam_size

        pos_index = layers.assign(pos_index_np)

        id_array = []
        score_array = []
        index_array = []
        # Tile encoder memory / masks / knowledge across beams.
        init_enc_memory = layers.expand(enc_memory, [1, beam_size, 1])
        init_enc_memory = layers.reshape(
            init_enc_memory, shape=[batch_size * beam_size, -1, hidden_size])
        init_enc_mask = layers.expand(enc_memory_mask, [1, beam_size, 1])
        init_enc_mask = layers.reshape(init_enc_mask,
                                       shape=[batch_size * beam_size, 1, -1])

        dec_knowledge = layers.reshape(dec_knowledge,
                                       shape=[-1, 1, hidden_size])
        init_dec_knowledge = layers.expand(dec_knowledge, [1, beam_size, 1])
        init_dec_knowledge = layers.reshape(
            init_dec_knowledge,
            shape=[batch_size * beam_size, -1, hidden_size])

        dec_init_hidden = layers.expand(dec_init_hidden, [1, 1, beam_size])
        dec_init_hidden = layers.reshape(dec_init_hidden,
                                         shape=[1, -1, hidden_size])

        length_average = config.length_average
        UNK = config.unk_id
        EOS = config.eos_id
        for i in range(1, max_decode_len + 1):
            dec_emb = get_embedding(token, input_size, vocab_size)
            dec_out, dec_last_hidden = \
                decoder_step(gru_unit, cue_gru_unit,
                             dec_emb, dec_init_hidden, input_size, hidden_size,
                             init_enc_memory, init_enc_mask, init_dec_knowledge, mask=None)
            output_in_size = hidden_size + hidden_size

            rnnout = layers.dropout(dec_out,
                                    dropout_prob=config.dropout,
                                    is_test=True)
            rnnout = fc(rnnout,
                        output_in_size,
                        hidden_size,
                        name='dec_out_fc1')
            rnnout = fc(rnnout, hidden_size, vocab_size, name='dec_out_fc2')

            log_softmax_output = log_softmax(rnnout)
            log_softmax_output = layers.squeeze(log_softmax_output, axes=[1])

            # Accumulate hypothesis scores (optionally length-averaged).
            if i > 1:
                if length_average:
                    log_softmax_output = layers.elementwise_add(
                        (log_softmax_output / i),
                        (pre_score * (1.0 - 1.0 / i)),
                        axis=0)
                else:
                    log_softmax_output = layers.elementwise_add(
                        log_softmax_output, pre_score, axis=0)
            else:
                log_softmax_output = layers.elementwise_add(log_softmax_output,
                                                            pre_score,
                                                            axis=0)

            # Top-k over (beam * vocab) candidates per batch row.
            log_softmax_output = layers.reshape(log_softmax_output,
                                                shape=[batch_size, -1])

            topk_score, topk_index = layers.topk(log_softmax_output,
                                                 k=beam_size)
            topk_score = layers.reshape(topk_score, shape=[-1])
            topk_index = layers.reshape(topk_index, shape=[-1])

            # Decompose flat index into (parent beam, token id).
            vocab_var = layers.fill_constant([1],
                                             dtype='int64',
                                             value=vocab_size)
            new_token = topk_index % vocab_var

            index = topk_index // vocab_var
            id_array.append(new_token)
            index_array.append(index)
            index = index + pos_index

            score_array.append(topk_score)

            # Kill hypotheses that emitted EOS/UNK so they are not expanded.
            eos_ids = layers.fill_constant([beam_size * batch_size],
                                           dtype='int64',
                                           value=EOS)
            unk_ids = layers.fill_constant([beam_size * batch_size],
                                           dtype='int64',
                                           value=UNK)
            eos_eq = layers.cast(layers.equal(new_token, eos_ids),
                                 dtype='float32')

            topk_score += eos_eq * -100000000.0

            unk_eq = layers.cast(layers.equal(new_token, unk_ids),
                                 dtype='float32')
            topk_score += unk_eq * -100000000.0

            # update
            token = new_token
            pre_score = topk_score
            token = layers.reshape(token, shape=[-1, 1])

            # Reorder decoder state/memory along the surviving beams.
            index = layers.cast(index, dtype='int32')
            dec_last_hidden = layers.squeeze(dec_last_hidden, axes=[0])
            dec_init_hidden = layers.gather(dec_last_hidden, index=index)
            dec_init_hidden = layers.unsqueeze(dec_init_hidden, axes=[0])
            init_enc_memory = layers.gather(init_enc_memory, index)
            init_enc_mask = layers.gather(init_enc_mask, index)
            init_dec_knowledge = layers.gather(init_dec_knowledge, index)

        final_score = layers.concat(score_array, axis=0)
        final_ids = layers.concat(id_array, axis=0)
        final_index = layers.concat(index_array, axis=0)

        final_score = layers.reshape(
            final_score, shape=[max_decode_len, beam_size * batch_size])
        final_ids = layers.reshape(
            final_ids, shape=[max_decode_len, beam_size * batch_size])
        final_index = layers.reshape(
            final_index, shape=[max_decode_len, beam_size * batch_size])

        return final_score, final_ids, final_index
示例#27
0
    def _greedy_search(self,
                       src_word,
                       src_pos,
                       src_slf_attn_bias,
                       trg_word,
                       trg_src_attn_bias,
                       bos_id=0,
                       eos_id=1,
                       max_len=256):
        """Greedy (top-1) decoding: run the decoder step by step until every
        sequence in the batch has emitted `eos_id` or `max_len` steps elapse.

        Returns:
            (finished_seq, finished_scores): predicted id tensor of shape
            [batch, 1, steps] and the final cumulative log-prob per sequence.
        """
        # Encode the source once; decoding reuses it via the K/V caches.
        enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias)

        batch_size = enc_output.shape[0]
        if max_len is None:
            max_len = enc_output.shape[1] + 20

        # Per-row EOS sentinel used to detect finished sequences.
        end_token_tensor = layers.fill_constant(shape=[batch_size, 1],
                                                dtype="int64",
                                                value=eos_id)

        predict_ids = []
        log_probs = layers.fill_constant(shape=[batch_size, 1],
                                         dtype="float32",
                                         value=0)
        trg_word = layers.fill_constant(shape=[batch_size, 1],
                                        dtype="int64",
                                        value=bos_id)
        finished = layers.fill_constant(shape=[batch_size, 1],
                                        dtype="bool",
                                        value=0)

        # One empty (length-0) key/value cache per decoder layer.
        caches = []
        for _ in range(self.n_layer):
            caches.append({
                "k": layers.fill_constant(
                    shape=[batch_size, self.n_head, 0, self.d_key],
                    dtype=enc_output.dtype,
                    value=0),
                "v": layers.fill_constant(
                    shape=[batch_size, self.n_head, 0, self.d_value],
                    dtype=enc_output.dtype,
                    value=0),
            })

        for step in range(max_len):
            trg_pos = layers.fill_constant(shape=trg_word.shape,
                                           dtype="int64",
                                           value=step)
            logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
                                  enc_output, caches)
            step_log_probs = layers.log(layers.softmax(logits))
            # Running sum of per-step log-probs along the greedy path.
            log_probs = layers.elementwise_add(x=step_log_probs,
                                               y=log_probs,
                                               axis=0)
            topk_scores, topk_indices = layers.topk(input=log_probs, k=1)

            finished = layers.logical_or(
                finished, layers.equal(topk_indices, end_token_tensor))
            trg_word = topk_indices
            log_probs = topk_scores
            predict_ids.append(topk_indices)

            # Early exit once every sequence has produced EOS.
            if layers.reduce_all(finished).numpy():
                break

        predict_ids = layers.stack(predict_ids, axis=0)
        finished_seq = layers.transpose(predict_ids, [1, 2, 0])
        finished_scores = topk_scores

        return finished_seq, finished_scores
示例#28
0
def grammar_output(inputs,
                   actions,
                   gmr_mask,
                   last_col2tbl_mask,
                   decode_vocab,
                   grammar,
                   name=None,
                   column2table=None):
    """Output logits according to the grammar.

    For each position, the action id decides which sub-module produces the
    distribution: grammar-rule application (APPLY/STOP), table selection,
    column selection or value selection. The four mutually-masked results
    are summed into one vocab-sized output.

    Args:
        inputs (Variable): shape = [batch_size, max_len, hidden_size].
            During inference max_len is always 1.
        actions (Variable): shape = [batch_size, max_len]. During inference
            max_len is always 1.
        gmr_mask (Variable): shape = [batch_size, max_len, grammar_size].
            During inference max_len is always 1.
        last_col2tbl_mask (Variable): shape = [batch_size, max_len, max_table].
            When the previous decoding step selected a column, this is the
            table mask corresponding to that column.
        decode_vocab (DecoderDynamicVocab): (table, table_len, column,
            column_len, value, value_len, column2table_mask). Here
            column2table_mask is the table mask aligned one-to-one with
            column.
        grammar (Grammar): NULL
        name (str): name prefix for Variables, used for parameter sharing
            across repeated calls. Defaults to None, meaning parameters are
            not shared.
        column2table: unused in this function body — NOTE(review): presumably
            kept for caller-interface compatibility; confirm against callers.

    Returns: (Variable, Variable)
        output: output probabilities over the vocabulary
        valid_table_mask: only meaningful during the prediction phase

    Raises: NULL
    """
    batch_size = layers.shape(inputs)[0]
    max_len = inputs.shape[1]
    vocab_size = grammar.vocab_size

    # One constant tensor per action id so each position can be compared
    # against the action type it carries.
    action_shape = [batch_size, max_len]
    act_apply_rule = tensor.fill_constant(shape=action_shape,
                                          value=grammar.ACTION_APPLY,
                                          dtype='int64')
    act_stop = tensor.fill_constant(shape=action_shape,
                                    value=grammar.ACTION_STOP,
                                    dtype='int64')
    act_select_t = tensor.fill_constant(shape=action_shape,
                                        value=grammar.ACTION_SELECT_T,
                                        dtype='int64')
    act_select_c = tensor.fill_constant(shape=action_shape,
                                        value=grammar.ACTION_SELECT_C,
                                        dtype='int64')
    act_select_v = tensor.fill_constant(shape=action_shape,
                                        value=grammar.ACTION_SELECT_V,
                                        dtype='int64')
    # APPLY and STOP are both handled by the rule-application branch.
    cond_apply_rule = layers.logical_or(layers.equal(actions, act_apply_rule),
                                        layers.equal(actions, act_stop))
    cond_select_t = layers.equal(actions, act_select_t)
    cond_select_c = layers.equal(actions, act_select_c)
    cond_select_v = layers.equal(actions, act_select_v)

    # expand vocab to [-1, max_len, ...]
    if max_len == 1:
        expand_to_seq_len = lambda x: layers.unsqueeze(x, [1])
    else:
        expand_to_seq_len = lambda x: layers.expand(layers.unsqueeze(
            x, [1]), [1, max_len] + [1] * (len(x.shape) - 1))
    table_enc = expand_to_seq_len(decode_vocab.table)
    table_len = expand_to_seq_len(decode_vocab.table_len)
    column_enc = expand_to_seq_len(decode_vocab.column)
    column_len = expand_to_seq_len(decode_vocab.column_len)
    value_enc = expand_to_seq_len(decode_vocab.value)
    value_len = expand_to_seq_len(decode_vocab.value_len)
    column2table_mask = expand_to_seq_len(decode_vocab.column2table_mask)

    # merge batch & seq_len dim so the sub-modules see a flat batch
    inputs = nn_utils.merge_first_ndim(inputs, n=2)
    actions = nn_utils.merge_first_ndim(actions, n=2)
    gmr_mask = nn_utils.merge_first_ndim(gmr_mask, n=2)
    last_col2tbl_mask = nn_utils.merge_first_ndim(last_col2tbl_mask, n=2)
    table_enc = nn_utils.merge_first_ndim(table_enc, n=2)
    table_len = nn_utils.merge_first_ndim(table_len, n=2)
    column_enc = nn_utils.merge_first_ndim(column_enc, n=2)
    column_len = nn_utils.merge_first_ndim(column_len, n=2)
    value_enc = nn_utils.merge_first_ndim(value_enc, n=2)
    value_len = nn_utils.merge_first_ndim(value_len, n=2)
    column2table_mask = nn_utils.merge_first_ndim(column2table_mask, n=2)
    cond_apply_rule = nn_utils.merge_first_ndim(cond_apply_rule, n=2)
    cond_select_t = nn_utils.merge_first_ndim(cond_select_t, n=2)
    cond_select_c = nn_utils.merge_first_ndim(cond_select_c, n=2)
    cond_select_v = nn_utils.merge_first_ndim(cond_select_v, n=2)

    # Pointer networks for table/column/value selection; fixed names so
    # parameters are shared across calls.
    t_ptr_net = models.PointerNetwork(score_type="affine",
                                      name='gmr_output_t_ptr')
    c_ptr_net = models.PointerNetwork(score_type="affine",
                                      name='gmr_output_c_ptr')
    v_ptr_net = models.PointerNetwork(score_type="affine",
                                      name='gmr_output_v_ptr')

    ## core processing logic ##
    apply_rule_output = _apply_rule(cond_apply_rule,
                                    inputs,
                                    gmr_mask,
                                    grammar,
                                    name=name)
    select_t_output = \
            _select_table(cond_select_t, inputs, table_enc, table_len, last_col2tbl_mask, t_ptr_net, grammar)
    select_c_output, valid_table_mask = \
            _select_column(cond_select_c, inputs, column_enc, column_len, c_ptr_net, grammar, column2table_mask)
    select_v_output = _select_value(cond_select_v, inputs, value_enc,
                                    value_len, v_ptr_net, grammar)

    # The four branch outputs are disjoint (each masked by its condition),
    # so a sum merges them into one distribution.
    output = fluider.elementwise_add(apply_rule_output,
                                     select_t_output,
                                     select_c_output,
                                     select_v_output,
                                     axis=0)
    output = layers.reshape(output, shape=[batch_size, max_len, vocab_size])
    return output, valid_table_mask
示例#29
0
    def beam_search(self,
                    src_word,
                    src_pos,
                    src_slf_attn_bias,
                    trg_word,
                    trg_src_attn_bias,
                    bos_id=0,
                    eos_id=1,
                    beam_size=4,
                    max_len=256):
        """Decode with beam search (dygraph mode).

        Args:
            src_word, src_pos, src_slf_attn_bias: encoder inputs, forwarded
                to ``self.encoder``.
            trg_word: not used as given; re-initialized below with ``bos_id``.
            trg_src_attn_bias: cross-attention bias, tiled per beam.
            bos_id (int): id of the first token fed to the decoder.
            eos_id (int): id that marks a hypothesis as finished.
            beam_size (int): number of hypotheses kept per example.
            max_len (int): maximum number of decoding steps; if None, uses
                encoder sequence length + 20.

        Returns:
            (finished_seq, finished_scores):
                finished_seq: predicted token ids gathered through the beam
                    tree, transposed to [batch_size, beam_size, seq_len].
                finished_scores: the last step's top-k scores.
        """
        def expand_to_beam_size(tensor, beam_size):
            # [batch, ...] -> [batch, beam_size, ...] by tiling dim 1.
            tensor = layers.reshape(tensor,
                                    [tensor.shape[0], 1] + tensor.shape[1:])
            tile_dims = [1] * len(tensor.shape)
            tile_dims[1] = beam_size
            return layers.expand(tensor, tile_dims)

        def merge_batch_beams(tensor):
            # [batch, beam, ...] -> [batch*beam, ...]
            return layers.reshape(tensor, [tensor.shape[0] * tensor.shape[1]] +
                                  tensor.shape[2:])

        def split_batch_beams(tensor):
            # [batch*beam, ...] -> [batch, beam, ...]
            return fluid.layers.reshape(tensor,
                                        shape=[-1, beam_size] +
                                        list(tensor.shape[1:]))

        def mask_probs(probs, finished, noend_mask_tensor):
            # For finished beams, force every token but EOS to -inf so the
            # beam's score stays frozen.
            # TODO: use where_op
            finished = layers.cast(finished, dtype=probs.dtype)
            probs = layers.elementwise_mul(layers.expand(
                layers.unsqueeze(finished, [2]), [1, 1, self.trg_vocab_size]),
                                           noend_mask_tensor,
                                           axis=-1) - layers.elementwise_mul(
                                               probs, (finished - 1), axis=0)
            return probs

        def gather(x, indices, batch_pos):
            # Gather along the beam dim using (batch, beam) coordinates.
            topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2)
            return layers.gather_nd(x, topk_coordinates)

        # run encoder
        enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias)

        # constant number
        inf = float(1. * 1e7)
        batch_size = enc_output.shape[0]
        max_len = (enc_output.shape[1] + 20) if max_len is None else max_len
        vocab_size_tensor = layers.fill_constant(shape=[1],
                                                 dtype="int64",
                                                 value=self.trg_vocab_size)
        end_token_tensor = to_variable(
            np.full([batch_size, beam_size], eos_id, dtype="int64"))
        # noend_mask: 0 at eos_id, -inf elsewhere (used by mask_probs).
        noend_array = [-inf] * self.trg_vocab_size
        noend_array[eos_id] = 0
        noend_mask_tensor = to_variable(np.array(noend_array, dtype="float32"))
        # batch_pos[b, k] == b; pairs with beam indices for gather_nd.
        batch_pos = layers.expand(
            layers.unsqueeze(
                to_variable(np.arange(0, batch_size, 1, dtype="int64")), [1]),
            [1, beam_size])

        predict_ids = []
        parent_ids = []
        ### initialize states of beam search ###
        # Only beam 0 starts live (score 0); others start at -inf so the
        # first topk does not pick duplicates of the same prefix.
        log_probs = to_variable(
            np.array([[0.] + [-inf] * (beam_size - 1)] * batch_size,
                     dtype="float32"))
        finished = to_variable(
            np.full([batch_size, beam_size], 0, dtype="bool"))
        ### initialize inputs and states of transformer decoder ###
        ## init inputs for decoder, shaped `[batch_size*beam_size, ...]`
        trg_word = layers.fill_constant(shape=[batch_size * beam_size, 1],
                                        dtype="int64",
                                        value=bos_id)
        trg_pos = layers.zeros_like(trg_word)
        trg_src_attn_bias = merge_batch_beams(
            expand_to_beam_size(trg_src_attn_bias, beam_size))
        enc_output = merge_batch_beams(
            expand_to_beam_size(enc_output, beam_size))
        ## init states (caches) for transformer, need to be updated according to selected beam
        caches = [{
            "k":
            layers.fill_constant(
                shape=[batch_size * beam_size, self.n_head, 0, self.d_key],
                dtype=enc_output.dtype,
                value=0),
            "v":
            layers.fill_constant(
                shape=[batch_size * beam_size, self.n_head, 0, self.d_value],
                dtype=enc_output.dtype,
                value=0),
        } for i in range(self.n_layer)]

        for i in range(max_len):
            trg_pos = layers.fill_constant(shape=trg_word.shape,
                                           dtype="int64",
                                           value=i)
            caches = map_structure(  # can not be reshaped since the 0 size
                lambda x: x if i == 0 else merge_batch_beams(x), caches)
            logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
                                  enc_output, caches)
            caches = map_structure(split_batch_beams, caches)
            step_log_probs = split_batch_beams(
                fluid.layers.log(fluid.layers.softmax(logits)))
            step_log_probs = mask_probs(step_log_probs, finished,
                                        noend_mask_tensor)
            # Accumulate sequence scores, then flatten beams*vocab so topk
            # picks across all candidate extensions of all beams.
            log_probs = layers.elementwise_add(x=step_log_probs,
                                               y=log_probs,
                                               axis=0)
            log_probs = layers.reshape(log_probs,
                                       [-1, beam_size * self.trg_vocab_size])
            scores = log_probs
            topk_scores, topk_indices = fluid.layers.topk(input=scores,
                                                          k=beam_size)
            # Decompose flat index into (source beam, token id).
            beam_indices = fluid.layers.elementwise_floordiv(
                topk_indices, vocab_size_tensor)
            token_indices = fluid.layers.elementwise_mod(
                topk_indices, vocab_size_tensor)

            # update states
            caches = map_structure(
                lambda x: gather(x, beam_indices, batch_pos), caches)
            log_probs = gather(log_probs, topk_indices, batch_pos)
            finished = gather(finished, beam_indices, batch_pos)
            finished = layers.logical_or(
                finished, layers.equal(token_indices, end_token_tensor))
            trg_word = layers.reshape(token_indices, [-1, 1])

            predict_ids.append(token_indices)
            parent_ids.append(beam_indices)

            # Stop early once every beam in every batch has emitted EOS.
            if layers.reduce_all(finished).numpy():
                break

        predict_ids = layers.stack(predict_ids, axis=0)
        parent_ids = layers.stack(parent_ids, axis=0)
        # gather_tree follows parent_ids back through time to reconstruct
        # full hypotheses from per-step selections.
        finished_seq = layers.transpose(
            layers.gather_tree(predict_ids, parent_ids), [1, 2, 0])
        finished_scores = topk_scores

        return finished_seq, finished_scores
示例#30
0
    def build_model(self, model_configs):
        """Build the static-graph LSTM language model.

        Constructs data placeholders, embedding, the RNN graph, a softmax
        projection, the masked cross-entropy loss, and a correctness metric
        that excludes UNK and padding predictions. Results are stored on
        ``self`` (loss_, correct_, pred_, last_hidden_, last_cell_,
        input_name_list_, target_var_names_, program_, startup_program_).

        Args:
            model_configs: configuration consumed by ``self.update_params``.
        """
        self.update_params(model_configs)
        # Input placeholders; names must match self.input_name_list_ below.
        features = fluid.layers.data(name="features",
                                     shape=[None, self.seq_len_],
                                     dtype='int64')
        labels = fluid.layers.data(name="labels",
                                   shape=[None, self.seq_len_],
                                   dtype='int64')
        sequence_length_ph = fluid.layers.data(name="seq_len_ph",
                                               shape=[None],
                                               dtype='int64')
        sequence_mask_ph = fluid.layers.data(name="seq_mask_ph",
                                             shape=[None],
                                             dtype='float32')

        # Initial RNN states, fed as [batch, num_layers, hidden].
        init_hidden = fluid.layers.data(
            name="init_hidden",
            shape=[None, self.num_layers_, self.n_hidden_],
            dtype='float32')
        init_cell = fluid.layers.data(
            name="init_cell",
            shape=[None, self.num_layers_, self.n_hidden_],
            dtype='float32')

        # Rearrange to layer-major [num_layers, batch, hidden] for the RNN.
        init_hidden = layers.transpose(init_hidden, perm=[1, 0, 2])
        init_cell = layers.transpose(init_cell, perm=[1, 0, 2])

        init_hidden_reshape = layers.reshape(
            init_hidden, shape=[self.num_layers_, -1, self.n_hidden_])
        init_cell_reshape = layers.reshape(
            init_cell, shape=[self.num_layers_, -1, self.n_hidden_])

        features = layers.reshape(features, shape=[-1, self.seq_len_, 1])

        # word embedding
        inputs = layers.embedding(
            input=features,
            size=[self.vocab_size_, self.n_hidden_],
            dtype='float32',
            is_sparse=False,
            param_attr=fluid.ParamAttr(
                name='embedding_para',
                initializer=fluid.initializer.UniformInitializer(
                    low=-self.init_scale_, high=self.init_scale_)))

        # LSTM
        output, last_hidden, last_cell = self._build_rnn_graph(
            inputs, init_hidden, init_cell, sequence_length_ph)

        output = layers.reshape(output,
                                shape=[-1, self.seq_len_, self.n_hidden_],
                                inplace=True)
        # Final states exposed back in batch-major layout.
        self.last_hidden_ = layers.reshape(
            last_hidden, [-1, self.num_layers_, self.n_hidden_])
        self.last_cell_ = layers.reshape(
            last_cell, [-1, self.num_layers_, self.n_hidden_])

        # softmax projection: hidden -> vocab logits
        softmax_w = layers.create_parameter(
            [self.n_hidden_, self.vocab_size_],
            dtype="float32",
            name="softmax_w",
            default_initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale_, high=self.init_scale_))
        softmax_b = layers.create_parameter(
            [self.vocab_size_],
            dtype="float32",
            name='softmax_b',
            default_initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale_, high=self.init_scale_))

        logits = layers.matmul(output, softmax_w)
        logits = layers.elementwise_add(logits, softmax_b)
        logits = layers.reshape(logits,
                                shape=[-1, self.vocab_size_],
                                inplace=True)

        # correct predictions
        labels_reshaped = layers.reshape(labels, [-1])
        pred = layers.cast(layers.argmax(logits, 1), dtype="int64")
        correct_pred = layers.cast(layers.equal(pred, labels_reshaped),
                                   dtype="int64")
        self.pred_ = pred

        # predicting unknown is always considered wrong
        # only in paddle 1.8
        unk_tensor = layers.fill_constant(layers.shape(labels_reshaped),
                                          value=self.unk_symbol_,
                                          dtype='int64')
        pred_unk = layers.cast(layers.equal(pred, unk_tensor), dtype="int64")
        correct_unk = layers.elementwise_mul(pred_unk, correct_pred)

        # predicting padding is always considered wrong
        pad_tensor = layers.fill_constant(layers.shape(labels_reshaped),
                                          value=self.pad_symbol_,
                                          dtype='int64')
        pred_pad = layers.cast(layers.equal(pred, pad_tensor), dtype="int64")
        correct_pad = layers.elementwise_mul(pred_pad, correct_pred)

        # Reshape logits to be a 3-D tensor for sequence loss
        logits = layers.reshape(logits, [-1, self.seq_len_, self.vocab_size_])

        labels = layers.reshape(labels, [-1, self.seq_len_, 1])
        loss = layers.softmax_with_cross_entropy(logits=logits,
                                                 label=labels,
                                                 soft_label=False,
                                                 return_softmax=False)
        # Zero out loss contributions from padded positions.
        sequence_mask = layers.reshape(sequence_mask_ph,
                                       [-1, self.seq_len_, 1])
        loss = layers.reduce_mean(layers.elementwise_mul(loss, sequence_mask))

        # Correct count minus the UNK/pad "hits" excluded above.
        eval_metric_ops = fluid.layers.reduce_sum(correct_pred) \
                - fluid.layers.reduce_sum(correct_unk) \
                - fluid.layers.reduce_sum(correct_pad)

        self.loss_ = loss
        self.correct_ = eval_metric_ops
        self.input_name_list_ = [
            'features', 'labels', 'seq_len_ph', 'seq_mask_ph', 'init_hidden',
            'init_cell'
        ]
        self.target_var_names_ = [
            self.loss_, self.last_hidden_, self.last_cell_, self.correct_
        ]

        self.program_ = fluid.default_main_program()
        self.startup_program_ = fluid.default_startup_program()