예제 #1
0
        def exist_objs_3(keep, masks, classes, scores, upsampled_size_out, resize_shape, ori_shape):
            """Keep the selected instances, cap them by score and produce binary masks.

            The instances indexed by `keep` are gathered, the best-scoring
            cfg['max_per_img'] of them are retained, and their masks are resized
            to `upsampled_size_out`, cropped to `resize_shape`, resized again to
            `ori_shape[:2]` and thresholded with cfg['mask_thr'].

            Returns:
                (masks, classes, scores) of the retained instances.
            """
            keep_inds = L.reshape(keep, (-1,))
            keep_inds.stop_gradient = True
            kept_masks = L.gather(masks, keep_inds)      # [M4, s4, s4]  mask probabilities of M4 objects
            kept_scores = L.gather(scores, keep_inds)    # [M4, ]  scores of M4 objects
            kept_classes = L.gather(classes, keep_inds)  # [M4, ]  class ids of M4 objects

            # Fifth filtering step: keep only the cfg['max_per_img'] best-scoring objects.
            _, order = L.argsort(kept_scores, axis=-1, descending=True)
            top_inds = order[:cfg['max_per_img']]
            top_inds.stop_gradient = True

            top_masks = L.gather(kept_masks, top_inds)      # [M5, s4, s4]  mask probabilities of M5 objects
            top_scores = L.gather(kept_scores, top_inds)    # [M5, ]  scores of M5 objects
            top_classes = L.gather(kept_classes, top_inds)  # [M5, ]  class ids of M5 objects

            # Interpolate to the size of the network input tensor.
            batched = L.unsqueeze(top_masks, axes=[0])
            resized = L.resize_bilinear(batched, out_shape=upsampled_size_out, align_corners=False, align_mode=0)
            # Crop away the padded (black) border.
            resized = L.slice(resized, axes=[2], starts=[0], ends=[resize_shape[0]])
            resized = L.slice(resized, axes=[3], starts=[0], ends=[resize_shape[1]])
            # Interpolate to the original image size.
            resized = L.resize_bilinear(resized, out_shape=ori_shape[:2], align_corners=False, align_mode=0)
            # Binarize the masks and drop the leading axis added above.
            binary_masks = L.cast(resized > cfg['mask_thr'], 'float32')[0]
            return binary_masks, top_classes, top_scores
예제 #2
0
                def is_finished(alive_log_prob, finished_scores,
                                finished_in_finished):
                    """Build the condition variable returned for the decoding loop.

                    The returned variable is `sum(finished_in_finished) < beam_size`.

                    NOTE(review): the length-penalty bound (`bound_is_met`) is
                    computed below but does not take part in the returned
                    condition; the ops are kept so the built graph is unchanged.
                    """
                    max_out_len = 200
                    penalty_base = layers.fill_constant(
                        [1],
                        dtype='float32',
                        value=((5.0 + max_out_len) / 6.0))
                    max_length_penalty = layers.pow(penalty_base, alpha)

                    # First entry (along axis 0) of the alive log-probs, scaled
                    # by the penalty for the maximum output length.
                    first_alive_log_prob = layers.slice(
                        alive_log_prob, starts=[0], ends=[1], axes=[0])
                    lower_bound_alive_score = first_alive_log_prob / max_length_penalty

                    # Entries whose flag is 0 get -INF added before the minimum
                    # over all entries is taken.
                    masked_finished_scores = finished_scores * finished_in_finished
                    masked_finished_scores += (
                        1.0 - finished_in_finished) * -INF
                    lowest_finished_score = layers.reduce_min(
                        masked_finished_scores)

                    met = layers.cast(
                        layers.less_than(lower_bound_alive_score,
                                         lowest_finished_score), 'float32')
                    bound_is_met = layers.reduce_sum(met)

                    finished_eos_num = layers.reduce_sum(finished_in_finished)
                    beam_size_const = layers.fill_constant(
                        [1], dtype='float32', value=beam_size)

                    return layers.less_than(finished_eos_num, beam_size_const)
def decoder_step(gru_unit,
                 cue_gru_unit,
                 step_in,
                 hidden,
                 input_size,
                 hidden_size,
                 memory,
                 memory_mask,
                 knowledge,
                 mask=None):
    """Run one decoder step that fuses an input GRU and a knowledge ("cue") GRU.

    Args:
        gru_unit: callable `(input, hidden, mask)` returning `(output, last_hidden)`.
        cue_gru_unit: callable with the same contract, fed with the knowledge input.
        step_in: decoder input for this step; a new axis 1 is added before use.
        hidden: previous hidden state; its first slice along axis 0 is used as
            the attention query.
        input_size: unused here, kept for interface compatibility.
        hidden_size: width passed to the three `fc` projections.
        memory: attention memory passed to `dot_attention`.
        memory_mask: mask passed to `dot_attention`.
        knowledge: knowledge vector, concatenated with the attention context.
        mask: optional step mask; when given, the new hidden state is only
            applied where the mask is set.

    Returns:
        (real_out, new_hidden): the attention context concatenated with the
        transposed fused hidden state, and the fused hidden state itself.
    """
    # Use the first slice of `hidden` along axis 0 as the attention query.
    top_hidden = layers.slice(hidden, axes=[0], starts=[0], ends=[1])
    top_hidden = layers.squeeze(top_hidden, axes=[0])
    top_hidden = layers.unsqueeze(top_hidden, axes=[1])

    weight_memory, _ = dot_attention(top_hidden, memory, memory_mask)

    step_in = layers.unsqueeze(step_in, axes=[1])
    rnn_input_list = [step_in, weight_memory]
    # When the leading dimension is unknown (-1), reshape the knowledge to the
    # attention context's static shape so the two can be concatenated.
    if weight_memory.shape[0] == -1:
        knowledge_1 = layers.reshape(knowledge, shape=weight_memory.shape)
    else:
        knowledge_1 = knowledge
    cue_input_list = [knowledge_1, weight_memory]
    output_list = [weight_memory]

    rnn_input = layers.concat(rnn_input_list, axis=2)
    rnn_input = layers.squeeze(rnn_input, axes=[1])
    _, rnn_last_hidden = gru_unit(rnn_input, hidden, mask)

    cue_input = layers.concat(cue_input_list, axis=2)
    cue_input = layers.squeeze(cue_input, axes=[1])
    _, cue_rnn_last_hidden = cue_gru_unit(cue_input, hidden, mask)

    h_y = layers.tanh(
        fc(rnn_last_hidden, hidden_size, hidden_size, name="dec_fc1"))
    h_cue = layers.tanh(
        fc(cue_rnn_last_hidden, hidden_size, hidden_size, name="dec_fc2"))

    # Gate k in (0, 1) blends the two states: k * h_y + (1 - k) * h_cue.
    concate_y_cue = layers.concat([h_y, h_cue], axis=2)
    k = layers.sigmoid(fc(concate_y_cue, hidden_size * 2, 1, name='dec_fc3'))

    new_hidden = h_y * k - h_cue * (k - 1.0)

    new_hidden_tmp = layers.transpose(new_hidden, perm=[1, 0, 2])
    output_list.append(new_hidden_tmp)

    real_out = layers.concat(output_list, axis=2)

    # Compare against None explicitly: the truth value of a tensor is not a
    # reliable "was a mask supplied" test.
    if mask is not None:
        mask_tmp = layers.unsqueeze(mask, axes=[0])
        new_hidden = layers.elementwise_mul((new_hidden - hidden),
                                            mask_tmp,
                                            axis=0)
        new_hidden += hidden

    return real_out, new_hidden
예제 #4
0
    def _recognition_network(self,
                             token_ids,
                             type_ids,
                             pos_ids,
                             role_ids,
                             recognition_mask):
        """Encode the input with an extra mask-token embedding and classify it.

        Returns:
            (logits, checkpoints): logits of width `self.latent_type_size`
            computed from the first position of the encoder output, and the
            checkpoints returned by `self._encode`.
        """
        # One mask-token id per batch item, embedded with the token embedding table.
        mask_token = layers.fill_constant_batch_size_like(
            input=token_ids, shape=[-1, 1, 1], value=self.mask_id, dtype="int64")
        token_emb_attr = fluid.ParamAttr(
            name=self.token_emb_name, initializer=self.param_initializer)
        mask_emb = layers.embedding(
            input=mask_token,
            size=[self.vocab_size, self.emb_size],
            dtype=self.dtype,
            param_attr=token_emb_attr)

        emb_out, attn_mask = self._gen_input(
            token_ids, type_ids, pos_ids, role_ids, recognition_mask, aux_emb=mask_emb)
        enc_out, checkpoints = self._encode(emb_out, attn_mask)

        # Take position 0 along axis 1 as the recognition feature.
        first_pos = layers.slice(
            input=enc_out, axes=[1], starts=[0], ends=[1])
        hidden_attr = fluid.ParamAttr(
            name="recognition_fc.w_0", initializer=self.param_initializer)
        hidden_feat = layers.fc(
            input=first_pos,
            size=self.hidden_size,
            act="tanh",
            param_attr=hidden_attr,
            bias_attr="recognition_fc.b_0")

        latent_attr = fluid.ParamAttr(
            name=self.latent_emb_name, initializer=self.param_initializer)
        logits = layers.fc(
            input=hidden_feat,
            size=self.latent_type_size,
            param_attr=latent_attr,
            bias_attr="recognition_bias")
        return logits, checkpoints
예제 #5
0
def encoder_1(x_emb,
              vocab_size,
              emb_size,
              init_hidden=None,
              init_cell=None,
              para_name='',
              args=None):
    """Apply two `lstmp_encoder` layers to `x_emb`.

    Args:
        x_emb: input embedding fed to every layer.
        vocab_size: unused here, kept for interface compatibility.
        emb_size: forwarded to `lstmp_encoder`.
        init_hidden: optional initial hidden states; slice `i` along axis 0
            is used for layer `i`.
        init_cell: optional initial cell states, sliced the same way.
        para_name: prefix for the per-layer parameter names.
        args: forwarded to `lstmp_encoder`.

    Returns:
        (last_out, rnn_outs_ori): the (residual) output of the last layer and
        the list of raw per-layer outputs.
    """
    rnn_input = x_emb
    rnn_outs = []
    rnn_outs_ori = []
    num_layers = 2
    # Compare against None explicitly instead of relying on tensor truthiness.
    has_init_state = init_hidden is not None and init_cell is not None
    for i in range(num_layers):
        if has_init_state:
            h0 = layers.squeeze(layers.slice(init_hidden,
                                             axes=[0],
                                             starts=[i],
                                             ends=[i + 1]),
                                axes=[0])
            c0 = layers.squeeze(layers.slice(init_cell,
                                             axes=[0],
                                             starts=[i],
                                             ends=[i + 1]),
                                axes=[0])
        else:
            h0 = c0 = None
        rnn_out, _, _ = lstmp_encoder(
            rnn_input, hidden_size, h0, c0,
            para_name + 'layer{}'.format(i + 1), emb_size, test_mode, args)
        rnn_out_ori = rnn_out
        # NOTE(review): rnn_input is never reassigned in this loop, so every
        # layer reads x_emb and the residual below adds x_emb - confirm intended.
        if i > 0:
            rnn_out = rnn_out + rnn_input
        rnn_out.stop_gradient = True
        rnn_outs.append(rnn_out)
        rnn_outs_ori.append(rnn_out_ori)
    return rnn_outs[-1], rnn_outs_ori
예제 #6
0
    def test_slice(self):
        """Build a three-axis slice op on a data layer inside a fresh Program."""
        slice_axes = [0, 1, 2]
        slice_starts = [1, 0, 2]
        slice_ends = [3, 3, 4]

        program = Program()
        with program_guard(program):
            data = layers.data(
                name="input", shape=[3, 4, 5, 6], dtype='float32')

            out = layers.slice(
                data, axes=slice_axes, starts=slice_starts, ends=slice_ends)
예제 #7
0
파일: elmo.py 프로젝트: zxn-sdsf/Senta
def encoder_wrapper(x_emb,
                    vocab_size,
                    emb_size,
                    init_hidden=None,
                    init_cell=None,
                    para_name='',
                    args=None):
    """Apply two `lstmp_encoder` layers to `x_emb`.

    Args:
        x_emb: input embedding fed to every layer.
        vocab_size: unused here, kept for interface compatibility.
        emb_size: forwarded to `lstmp_encoder`.
        init_hidden: optional initial hidden states; slice `i` along axis 0
            is used for layer `i`.
        init_cell: optional initial cell states, sliced the same way.
        para_name: prefix for the per-layer parameter names.
        args: forwarded to `lstmp_encoder`.

    Returns:
        (rnn_outs, rnn_outs_ori): per-layer outputs with the residual applied
        (gradients stopped) and the raw per-layer outputs.
    """
    rnn_input = x_emb
    rnn_outs = []
    rnn_outs_ori = []
    num_layers = 2
    # Compare against None explicitly instead of relying on tensor truthiness.
    has_init_state = init_hidden is not None and init_cell is not None
    for i in range(num_layers):
        if has_init_state:
            h0 = layers.squeeze(layers.slice(init_hidden,
                                             axes=[0],
                                             starts=[i],
                                             ends=[i + 1]),
                                axes=[0])
            c0 = layers.squeeze(layers.slice(init_cell,
                                             axes=[0],
                                             starts=[i],
                                             ends=[i + 1]),
                                axes=[0])
        else:
            h0 = c0 = None
        rnn_out, _, _ = lstmp_encoder(
            rnn_input, hidden_size, h0, c0,
            para_name + 'layer{}'.format(i + 1), emb_size, args)
        rnn_out_ori = rnn_out
        # NOTE(review): rnn_input is never reassigned in this loop, so every
        # layer reads x_emb and the residual below adds x_emb - confirm intended.
        if i > 0:
            rnn_out = rnn_out + rnn_input
        rnn_out.stop_gradient = True
        rnn_outs.append(rnn_out)
        rnn_outs_ori.append(rnn_out_ori)
    return rnn_outs, rnn_outs_ori
예제 #8
0
    def metrics(self, predictions, label):
        """Build the accuracy, F1 and MRR metrics for a `(qid, logits)` prediction pair.

        MRR is computed from column 1 of the logits; accuracy and F1 use the
        argmax over axis 1.
        """
        qid, logits = predictions

        pos_logits = L.slice(logits, axes=[1], starts=[1], ends=[2])
        mrr = propeller.metrics.Mrr(qid, label, pos_logits)

        # Argmax drops axis 1; add it back before handing it to the metrics.
        pred_label = L.unsqueeze(L.argmax(logits, axis=1), axes=[1])
        f1 = propeller.metrics.F1(label, pred_label)
        acc = propeller.metrics.Acc(label, pred_label)

        return {'acc': acc, 'f1': f1, 'mrr': mrr}
예제 #9
0
    def _dequeue_and_enqueue(self, keys):
        """Overwrite the queue columns starting at `self.queue_ptr` with `keys`.

        `keys` is transposed with perm [1, 0] and spliced into `self.queue`
        along axis 1 in place of the columns [ptr, ptr + batch_size); the
        pointer then advances by the batch size modulo `self.K`.

        NOTE(review): nothing here enforces `self.K % batch_size == 0`; if the
        batch runs past column K the rebuilt queue becomes wider than K.
        """
        batch_size = keys.shape[0]
        ptr = int(self.queue_ptr)
        end = ptr + batch_size

        pieces = []
        if ptr != 0:
            # Columns before the insertion point are kept as they are.
            pieces.append(
                L.slice(self.queue, axes=[1], starts=[0], ends=[ptr]))
        pieces.append(L.transpose(keys, perm=[1, 0]))
        if ptr == 0 or end != self.K:
            # Columns after the inserted batch. The end index deliberately
            # overshoots K; presumably the slice op clamps it to the axis
            # length - TODO confirm.
            pieces.append(
                L.slice(self.queue, axes=[1], starts=[end],
                        ends=[self.K + 100]))
        self.queue = L.concat(pieces, axis=1)

        self.queue_ptr = end % self.K  # move pointer
def gru_rnn(input,
            input_size,
            hidden_size,
            init_hidden=None,
            batch_first=False,
            mask=None,
            num_layers=1,
            dropout=0.0,
            name="gru"):
    """Run a `GRU_unit` over a sequence with `PaddingRNN`.

    Args:
        input: input sequence; transposed with perm [1, 0, 2] first when
            `batch_first` is true.
        input_size: input width passed to `GRU_unit`.
        hidden_size: hidden width passed to `GRU_unit`.
        init_hidden: initial value of the recurrent memory.
        batch_first: whether `input` (and `mask`) have the batch axis first.
        mask: optional per-step mask, fed to the unit step by step.
        num_layers: number of stacked layers in the unit.
        dropout: dropout value passed to `GRU_unit`.
        name: prefix for the unit's name.

    Returns:
        (rnn_out, last_hidden): the per-step outputs (transposed back when
        `batch_first`) and the hidden state of the final step reshaped to
        `[num_layers, -1, hidden_size]`.
    """

    gru_unit = GRU_unit(input_size,
                        hidden_size,
                        num_layers=num_layers,
                        dropout=dropout,
                        name=name + "_gru_unit")

    # Compare against None explicitly instead of relying on tensor truthiness.
    has_mask = mask is not None

    if batch_first:
        input = layers.transpose(x=input, perm=[1, 0, 2])
        if has_mask:
            mask = layers.transpose(mask, perm=[1, 0])

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(input)
        step_mask = None

        if has_mask:
            step_mask = rnn.step_input(mask)

        pre_hidden = rnn.memory(init=init_hidden)
        new_hidden, last_hidden = gru_unit(step_in, pre_hidden, step_mask)
        rnn.update_memory(pre_hidden, last_hidden)
        rnn.step_output(new_hidden)
        rnn.step_output(last_hidden)

    rnn_res = rnn()
    rnn_out = rnn_res[0]
    # Take the last entry along axis 0; the huge end index just means "to the end".
    last_hidden = layers.slice(rnn_res[1],
                               axes=[0],
                               starts=[-1],
                               ends=[1000000000])
    last_hidden = layers.reshape(last_hidden,
                                 shape=[num_layers, -1, hidden_size])

    # The original bound the transposed output to a different name (`rnnout`)
    # that only existed when batch_first was true, so the default path raised
    # NameError at the return.
    if batch_first:
        rnn_out = layers.transpose(x=rnn_out, perm=[1, 0, 2])

    return rnn_out, last_hidden
예제 #11
0
def knowledge_task(enc_output: layers.data,
                   mask_pos_list: List[List[int]],
                   type_list: List[List[str]],
                   entities_size: int,
                   property_size: int,
                   name='knowledge'):
    """
    The knowledge task for the pre-train stage.

    Each masked position is projected by one of two classifiers depending on
    its type: 's' (entity, `entities_size` outputs) or 'p' (property,
    `property_size` outputs). Types are matched case-insensitively; any other
    type is skipped.

    :param enc_output: encoder output, sliced per (batch item, position)
    :param mask_pos_list: masked positions, one list per batch item
    :param type_list: mask types, parallel to mask_pos_list
    :param entities_size: output size of the 's' classifier
    :param property_size: output size of the 'p' classifier
    :param name: prefix for the parameter and layer names
    :return: (S_output, P_output), each concatenated along axis 0
    """
    assert len(mask_pos_list) == len(
        type_list
    ), "InputError: Type list must have the same length as mask_pos_list"

    def _classify(feat, out_size, suffix):
        # Shared fc builder; parameters are named <name><suffix>_w / _b so
        # every call with the same suffix reuses the same weights.
        return layers.fc(feat,
                         out_size,
                         param_attr=fluid.ParamAttr(name=name + suffix + "_w"),
                         bias_attr=fluid.ParamAttr(name=name + suffix + "_b"),
                         name=name + suffix)

    entity_logits = []
    property_logits = []
    for batch_id, (positions, kinds) in enumerate(
            zip(mask_pos_list, type_list)):
        for pos, kind in zip(positions, kinds):
            # Cut out the single feature vector at (batch_id, pos).
            feat = layers.slice(enc_output,
                                axes=[0, 1, 2],
                                starts=[batch_id, pos, 0],
                                ends=[batch_id + 1, pos + 1, INT_MAX])
            kind_key = kind.lower()
            if kind_key == 'p':
                property_logits.append(_classify(feat, property_size, "_P"))
            elif kind_key == 's':
                entity_logits.append(_classify(feat, entities_size, "_S"))

    S_output = layers.concat(entity_logits, axis=0)
    P_output = layers.concat(property_logits, axis=0)
    return S_output, P_output
    def gru_step(self, input, hidden, mask=None):
        """Run one step of the stacked GRU.

        Args:
            input: input of the first layer for this step.
            hidden: previous hidden states; slice `i` along axis 0 belongs to
                layer `i`.
            mask: optional mask; where it is 0 the previous hidden state of a
                layer is kept instead of the newly computed one.

        Returns:
            (output, last_hidden): the output of the top layer (after dropout
            when enabled) and the new hidden states reshaped to
            `[num_layers, -1, hidden_size]`.
        """
        hidden_array = []
        for i in range(self.num_layers):
            hidden_temp = layers.slice(hidden,
                                       axes=[0],
                                       starts=[i],
                                       ends=[i + 1])
            hidden_temp = layers.reshape(hidden_temp,
                                         shape=[-1, self.hidden_size])
            hidden_array.append(hidden_temp)

        last_hidden_array = []
        for k in range(self.num_layers):
            trans_input = layers.matmul(input, self.weight_input_array[k])
            trans_input += self.bias_input_array[k]
            trans_hidden = layers.matmul(hidden_array[k],
                                         self.weight_hidden_array[k])
            trans_hidden += self.bias_hidden_array[k]

            # Each projection carries three gate pre-activations side by side.
            input_array = layers.split(trans_input, num_or_sections=3, dim=-1)
            trans_array = layers.split(trans_hidden, num_or_sections=3, dim=-1)

            reset_gate = layers.sigmoid(input_array[0] + trans_array[0])
            input_gate = layers.sigmoid(input_array[1] + trans_array[1])
            new_gate = layers.tanh(input_array[2] +
                                   reset_gate * trans_array[2])

            # Equivalent to (1 - input_gate) * new_gate + input_gate * h_prev.
            new_hidden = new_gate + input_gate * (hidden_array[k] - new_gate)

            # Compare against None explicitly instead of relying on tensor
            # truthiness.
            if mask is not None:
                neg_mask = layers.fill_constant_batch_size_like(
                    input=mask, shape=[1], value=1.0, dtype='float32') - mask
                new_hidden = new_hidden * mask + hidden_array[k] * neg_mask

            last_hidden_array.append(new_hidden)
            input = new_hidden

            if self.dropout and self.dropout > 0.0:
                input = layers.dropout(input, dropout_prob=self.dropout)

        last_hidden = layers.concat(last_hidden_array, 0)
        last_hidden = layers.reshape(
            last_hidden, shape=[self.num_layers, -1, self.hidden_size])

        return input, last_hidden
예제 #13
0
def fluid_sequence_get_seq_len(lodtensor):
    """
    Compute the length of every sequence in a LoD tensor.

    A column of ones with the same LoD as the input is padded and summed per
    sequence.

    args:
        lodtensor: lod = [[0,4,7]]
    return:
        seq_len: lod = []
             data = [4, 3]
             shape = [-1, 1]
    """
    lodtensor_slice = layers.slice(lodtensor, axes=[1], starts=[0], ends=[1])
    # `shape` is an attribute, not a method; calling it inside the message
    # would raise TypeError and hide the actual assertion failure.
    assert lodtensor_slice.shape == (-1, 1), (lodtensor_slice.shape)
    ones = layers.cast(lodtensor_slice * 0 + 1,
                       'float32')  # (batch*seq_len, 1)
    ones = layers.lod_reset(ones, lodtensor)
    ones_padded = fluid_sequence_pad(ones, 0)  # (batch, max_seq_len, 1)
    ones_padded = layers.squeeze(ones_padded, [2])  # (batch, max_seq_len)
    seq_len = layers.cast(layers.reduce_sum(ones_padded, 1, keep_dim=True),
                          'int64')  # (batch, 1)
    return seq_len
예제 #14
0
    def _calc_bow_logits(self, enc_out, checkpoints, bow_pos):
        """Compute vocabulary logits from the first position of the encoder output.

        The transformed feature is appended to `checkpoints` as a side effect.
        """
        # Take position 0 along axis 1, flatten, then pick the rows in bow_pos.
        first_pos = layers.slice(input=enc_out, axes=[1], starts=[0], ends=[1])
        flat_feat = layers.reshape(x=first_pos, shape=[-1, self.hidden_size])
        gather_index = layers.cast(x=bow_pos, dtype="int32")
        bow_feat = layers.gather(input=flat_feat, index=gather_index)

        trans_w_attr = fluid.ParamAttr(name="bow_trans_fc.w_0",
                                       initializer=self.param_initializer)
        trans_b_attr = fluid.ParamAttr(name="bow_trans_fc.b_0")
        bow_trans_feat = layers.fc(input=bow_feat,
                                   size=self.emb_size,
                                   act=self.hidden_act,
                                   param_attr=trans_w_attr,
                                   bias_attr=trans_b_attr)
        bow_trans_feat = pre_process_layer(bow_trans_feat,
                                           self.post_cls_cmd,
                                           name="bow_trans")

        checkpoints.append(bow_trans_feat)

        if not self.weight_sharing:
            # Independent output projection; the bias is optional.
            if self.cls_bias:
                bow_out_bias_attr = fluid.ParamAttr(name="bow_out_fc.b_0")
            else:
                bow_out_bias_attr = False
            out_w_attr = fluid.ParamAttr(name="bow_out_fc.w_0",
                                         initializer=self.param_initializer)
            return layers.fc(input=bow_trans_feat,
                             size=self.vocab_size,
                             param_attr=out_w_attr,
                             bias_attr=bow_out_bias_attr)

        # Weight sharing: project with the transposed token embedding table.
        token_emb = fluid.default_main_program().global_block().var(
            self.token_emb_name)
        fc_out = layers.matmul(x=bow_trans_feat, y=token_emb, transpose_y=True)
        if self.cls_bias:
            fc_out += layers.create_parameter(
                shape=[self.vocab_size],
                dtype=self.dtype,
                attr=fluid.ParamAttr(name="bow_out_fc.b_0"),
                is_bias=True)
        return fc_out
예제 #15
0
파일: net.py 프로젝트: sshuster/Parakeet
def crop(x, audio_start, audio_length):
    """Crop the upsampled condition to match audio_length.

    The upsampled condition has the same number of time steps as the whole
    audio. Audio is randomly sliced into short clips while the condition is
    not, so the condition must be cropped to exactly match the time steps of
    the audio slice.

    Args:
        x (Variable): shape(B, C, T), dtype float32, the upsampled condition.
        audio_start (Variable): shape(B, ), dtype: int64, the index of the starting point.
        audio_length (int): the length of the audio (number of samples it contains).

    Returns:
        Variable: shape(B, C, audio_length), cropped condition.
    """
    starts = audio_start.numpy()
    # One crop per example: take [start, start + audio_length) along the time axis.
    cropped = [
        F.slice(x[i],
                axes=[1],
                starts=[starts[i]],
                ends=[starts[i] + audio_length])
        for i in range(x.shape[0])
    ]
    return F.stack(cropped)
예제 #16
0
 def false_fn(array, start, end):
     """Return `array` sliced to [start, end) along axis 0.

     NOTE(review): `slice` here is called with keyword arguments, so it is
     presumably a tensor slice op imported elsewhere in the file, not the
     builtin - confirm.
     """
     return slice(array, starts=[start], ends=[end], axes=[0])
    def encoder_static(input_embedding,
                       len=3,
                       init_hidden=None,
                       init_cell=None):
        """Unroll a stacked LSTM for `len` steps with explicit ops.

        Args:
            input_embedding: input sequence, split into `len` pieces along axis 1.
            len: number of time steps to unroll (shadows the builtin; kept
                because it is part of the call interface).
            init_hidden: initial hidden states; slice `i` along axis 0 is
                used for layer `i`.
            init_cell: initial cell states, sliced the same way.

        Returns:
            (real_res, last_hidden, last_cell): the per-step outputs of the
            top layer and the final hidden and cell states of every layer.
        """
        weight_1_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(init_hidden,
                                      axes=[0],
                                      starts=[i],
                                      ends=[i + 1])
            pre_cell = layers.slice(init_cell,
                                    axes=[0],
                                    starts=[i],
                                    ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden,
                                        shape=[-1, hidden_size],
                                        inplace=True)
            pre_cell = layers.reshape(pre_cell,
                                      shape=[-1, hidden_size],
                                      inplace=True)
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = layers.split(input_embedding,
                                     num_or_sections=len,
                                     dim=1)

        for index in range(len):
            step_input = sliced_inputs[index]
            step_input = layers.reshape(step_input,
                                        shape=[-1, hidden_size],
                                        inplace=True)
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                # One matmul yields all four gate pre-activations.
                nn = layers.concat([step_input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                in_gate, cand, forget_gate, out_gate = layers.split(
                    gate_input, num_or_sections=4, dim=-1)

                c = pre_cell * layers.sigmoid(forget_gate) + layers.sigmoid(
                    in_gate) * layers.tanh(cand)
                m = layers.tanh(c) * layers.sigmoid(out_gate)

                hidden_array[k] = m
                cell_array[k] = c
                step_input = m

                if dropout is not None and dropout > 0.0:
                    step_input = layers.dropout(
                        step_input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            res.append(step_input)

        last_hidden = layers.concat(hidden_array, 1)
        last_hidden = layers.reshape(last_hidden,
                                     shape=[-1, num_layers, hidden_size],
                                     inplace=True)
        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = layers.concat(cell_array, 1)
        last_cell = layers.reshape(last_cell,
                                   shape=[-1, num_layers, hidden_size])
        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = layers.concat(res, 0)
        real_res = layers.reshape(real_res,
                                  shape=[len, -1, hidden_size],
                                  inplace=True)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell
예제 #18
0
    def forward(self):
        """Build the loss for one batch read from `self.pyreader`.

        Source and destination ids are embedded, combined with softmax
        weights from the "alpha" table, scored against each other and trained
        with a weighted sigmoid cross-entropy. The loss is stored in
        `self.loss` and returned.
        """
        src, dst = L.read_file(self.pyreader)

        # Large end index for axis 0 so the slice keeps the whole batch.
        batch_end = int(math.pow(2, 30)) - 1
        num_dst = self.neg_num + 1
        src_id = L.slice(src, [0, 1, 2, 3], [0, 0, 0, 0],
                         [batch_end, 1, 1, 1])
        dst_id = L.slice(dst, [0, 1, 2, 3], [0, 0, 0, 0],
                         [batch_end, num_dst, 1, 1])

        if self.is_sparse:
            # sparse mode use 2 dims input.
            src = L.reshape(src, [-1, 1])
            dst = L.reshape(dst, [-1, 1])

        # [b, 1, f, h]
        src_embed = split_embedding(src, self.num_nodes, self.hidden_size,
                                    self.embed_init, "weight", self.num_part,
                                    self.is_sparse)

        # [b, n+1, f, h]
        dst_embed = split_embedding(dst, self.num_nodes, self.hidden_size,
                                    self.embed_init, "weight", self.num_part,
                                    self.is_sparse)

        if self.is_sparse:
            src_embed = L.reshape(
                src_embed, [-1, 1, self.num_featuers, self.hidden_size])
            dst_embed = L.reshape(
                dst_embed, [-1, num_dst, self.num_featuers, self.hidden_size])

        # Both lookups name their parameter "alpha".
        # [b, 1, 1, f]
        src_alpha = L.embedding(src_id, [self.num_nodes, self.num_featuers],
                                param_attr=F.ParamAttr(name="alpha"))
        src_weight = L.softmax(src_alpha)
        # [b, n+1, 1, f]
        dst_alpha = L.embedding(dst_id, [self.num_nodes, self.num_featuers],
                                param_attr=F.ParamAttr(name="alpha"))
        dst_weight = L.softmax(dst_alpha)

        # [b, 1, h]
        src_sum = L.squeeze(L.matmul(src_weight, src_embed), axes=[2])
        # [b, n+1, h]
        dst_sum = L.squeeze(L.matmul(dst_weight, dst_embed), axes=[2])

        # [batch_size, 1, neg_num+1]
        logits = L.matmul(src_sum, dst_sum, transpose_y=True)

        # The first slot is labelled 1, the remaining neg_num slots 0.
        pos_label = L.fill_constant_batch_size_like(logits, [-1, 1, 1],
                                                    "float32", 1)
        neg_label = L.fill_constant_batch_size_like(logits,
                                                    [-1, 1, self.neg_num],
                                                    "float32", 0)
        label = L.concat([pos_label, neg_label], -1)

        # The first slot is weighted by neg_num, the remaining slots by 1.
        pos_weight = L.fill_constant_batch_size_like(logits, [-1, 1, 1],
                                                     "float32", self.neg_num)
        neg_weight = L.fill_constant_batch_size_like(logits,
                                                     [-1, 1, self.neg_num],
                                                     "float32", 1)
        weight = L.concat([pos_weight, neg_weight], -1)

        weight.stop_gradient = True
        label.stop_gradient = True

        weighted = L.sigmoid_cross_entropy_with_logits(logits, label) * weight
        mean_loss = L.reduce_mean(weighted)
        loss = mean_loss * ((self.neg_num + 1) / 2 / self.neg_num)
        loss.persistable = True
        self.loss = loss
        return loss
예제 #19
0
    def encoder_static(input_embedding, len=3, init_hidden=None,
                       init_cell=None):
        """Unroll a stacked LSTM for `len` steps, using fused ops when available.

        Args:
            input_embedding: input sequence, split into `len` pieces along axis 1.
            len: number of time steps to unroll (shadows the builtin; kept
                because it is part of the call interface).
            init_hidden: initial hidden states; slice `i` along axis 0 is
                used for layer `i`.
            init_cell: initial cell states, sliced the same way.

        Returns:
            (real_res, last_hidden, last_cell): the per-step outputs of the
            top layer and the final hidden and cell states of every layer.
        """
        # Resolve the optional fused op once instead of re-running the import
        # inside the innermost loop.
        # fluid.contrib.layers.fused_elemwise_activation can do a fused
        # operation, like:
        # 1) x + sigmoid(y); x + tanh(y)
        # 2) tanh(x + y)
        # The set of unary operations supported by the fused op is limited.
        try:
            from paddle.fluid.contrib.layers import fused_elemwise_activation
        except ImportError:
            fused_elemwise_activation = None

        weight_1_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(
                init_hidden, axes=[0], starts=[i], ends=[i + 1])
            pre_cell = layers.slice(
                init_cell, axes=[0], starts=[i], ends=[i + 1])
            pre_hidden = layers.reshape(
                pre_hidden, shape=[-1, hidden_size], inplace=True)
            pre_cell = layers.reshape(
                pre_cell, shape=[-1, hidden_size], inplace=True)
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = layers.split(
            input_embedding, num_or_sections=len, dim=1)

        for index in range(len):
            step_input = sliced_inputs[index]
            step_input = layers.reshape(
                step_input, shape=[-1, hidden_size], inplace=True)
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                # One matmul yields all four gate pre-activations.
                nn = layers.concat([step_input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                in_gate, cand, forget_gate, out_gate = layers.split(
                    gate_input, num_or_sections=4, dim=-1)

                if fused_elemwise_activation is not None:
                    # layers.sigmoid(in_gate) * layers.tanh(cand)
                    tmp0 = fused_elemwise_activation(
                        x=layers.tanh(cand),
                        y=in_gate,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                    # pre_cell * layers.sigmoid(forget_gate)
                    tmp1 = fused_elemwise_activation(
                        x=pre_cell,
                        y=forget_gate,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                    c = tmp0 + tmp1
                    # layers.tanh(c) * layers.sigmoid(out_gate)
                    m = fused_elemwise_activation(
                        x=layers.tanh(c),
                        y=out_gate,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                else:
                    c = pre_cell * layers.sigmoid(forget_gate) + layers.sigmoid(
                        in_gate) * layers.tanh(cand)
                    m = layers.tanh(c) * layers.sigmoid(out_gate)

                hidden_array[k] = m
                cell_array[k] = c
                step_input = m

                if dropout is not None and dropout > 0.0:
                    step_input = layers.dropout(
                        step_input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            res.append(step_input)

        last_hidden = layers.concat(hidden_array, 1)
        last_hidden = layers.reshape(
            last_hidden, shape=[-1, num_layers, hidden_size], inplace=True)
        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = layers.concat(cell_array, 1)
        last_cell = layers.reshape(
            last_cell, shape=[-1, num_layers, hidden_size])
        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = layers.concat(res, 0)
        real_res = layers.reshape(
            real_res, shape=[len, -1, hidden_size], inplace=True)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell
예제 #20
0
    def create_model(self, pyreader_name, is_prediction=False):
        """Build the py_reader-fed network and return ``(pyreader, graph_vars)``.

        When ``is_prediction`` is true the call is delegated to
        ``self.fast_decode(pyreader_name)`` and its result is returned as is.
        Otherwise eleven feed slots are declared, the attention biases are
        broadcast to per-head shapes (with gradients stopped), the id tensors
        get a trailing unit axis, and everything is handed to
        ``self.build_model``.
        """
        if is_prediction:
            return self.fast_decode(pyreader_name)

        para_num = self.max_para_num
        para_len = self.max_para_len
        tgt_len = self.max_tgt_len

        # One (shape, dtype) entry per feed slot, in read_file() order.
        feed_specs = [
            ([-1, para_num, para_len], 'int64'),  # src_word
            ([-1, para_num, para_len], 'int64'),  # src_word_pos
            ([-1, para_num], 'int64'),  # src_sent_pos
            ([-1, para_num, para_len], 'float32'),  # src_words_slf_attn_bias
            ([-1, para_num], 'float32'),  # src_sents_slf_attn_bias
            ([-1, para_num, para_num], 'float32'),  # graph_attn_bias
            ([-1, tgt_len], 'int64'),  # trg_word
            ([-1, tgt_len], 'int64'),  # trg_pos
            ([-1, tgt_len, tgt_len], 'float32'),  # trg_slf_attn_bias
            ([-1, 1], 'int64'),  # tgt_label
            ([-1, 1], 'float32'),  # label_weights
        ]

        pyreader = fluid.layers.py_reader(
            capacity=50,
            shapes=[shape for shape, _ in feed_specs],
            dtypes=[dtype for _, dtype in feed_specs],
            lod_levels=[0] * len(feed_specs),
            name=pyreader_name,
            use_double_buffer=True)

        (src_word, src_word_pos, src_sent_pos, words_bias, sents_bias,
         graph_bias, trg_word, trg_pos, trg_self_bias, tgt_label,
         label_weights) = fluid.layers.read_file(pyreader)

        def frozen(var):
            # Biases are constants as far as back-propagation is concerned.
            var.stop_gradient = True
            return var

        def broadcast(var, axes, times):
            # Insert unit axes, then tile them to the requested repeat counts.
            return frozen(
                layers.expand(layers.unsqueeze(var, axes=axes),
                              expand_times=times))

        n_head = self._n_head

        # Encoder-side self-attention biases, tiled per head.
        src_words_slf_attn_bias = broadcast(
            words_bias, [2, 3], [1, 1, n_head, para_len, 1])
        src_sents_slf_attn_bias = broadcast(
            sents_bias, [1, 2], [1, n_head, para_num, 1])
        graph_attn_bias = broadcast(graph_bias, [1], [1, n_head, 1, 1])
        trg_slf_attn_bias = broadcast(trg_self_bias, [1], [1, n_head, 1, 1])

        # Decoder-to-encoder biases: take one query row of the encoder bias
        # and repeat it for every target position.
        words_bias_row = layers.slice(src_words_slf_attn_bias,
                                      axes=[3],
                                      starts=[0],
                                      ends=[1])
        tgt_src_words_attn_bias = frozen(
            layers.expand(words_bias_row,
                          expand_times=[1, 1, 1, tgt_len, 1]))

        sents_bias_row = layers.slice(src_sents_slf_attn_bias,
                                      axes=[2],
                                      starts=[0],
                                      ends=[1])
        tgt_src_sents_attn_bias = frozen(
            layers.expand(sents_bias_row, expand_times=[1, 1, tgt_len, 1]))

        # Give every id / label tensor an explicit trailing axis of size 1.
        src_word = layers.reshape(src_word, [-1, para_num, para_len, 1])
        src_word_pos = layers.reshape(src_word_pos,
                                      [-1, para_num, para_len, 1])
        src_sent_pos = layers.reshape(src_sent_pos, [-1, para_num, 1])
        trg_word = layers.reshape(trg_word, [-1, tgt_len, 1])
        trg_pos = layers.reshape(trg_pos, [-1, tgt_len, 1])
        tgt_label = layers.reshape(tgt_label, [-1, 1])
        label_weights = layers.reshape(label_weights, [-1, 1])

        encoder_inputs = (src_word, src_word_pos, src_sent_pos,
                          src_words_slf_attn_bias, src_sents_slf_attn_bias,
                          graph_attn_bias)
        decoder_inputs = (trg_word, trg_pos, trg_slf_attn_bias,
                          tgt_src_words_attn_bias, tgt_src_sents_attn_bias,
                          graph_attn_bias)

        graph_vars = self.build_model(enc_input=encoder_inputs,
                                      dec_input=decoder_inputs,
                                      tgt_label=tgt_label,
                                      label_weights=label_weights)

        return pyreader, graph_vars
예제 #21
0
    def gru_attention_infer(self, decoder_boot, max_length, char_num,
                            word_vector_dim, encoded_vector, encoded_proj,
                            decoder_size):
        """Build the inference-time attention GRU decoding loop.

        A ``layers.While`` block runs until ``max_length`` steps have been
        taken or the top-k result is empty. Each step embeds the previous
        ids, computes an attention context with ``self.simple_attention``,
        advances a ``gru_unit`` and keeps the top ``beam_size`` (fixed to 1)
        entry of a softmax over ``char_num`` classes.

        Args:
            decoder_boot: initial decoder state, written to slot 0 of the
                state array.
            max_length: maximum number of decoding steps.
            char_num: embedding table height and size of the output softmax.
            word_vector_dim: embedding width.
            encoded_vector: forwarded unchanged to ``self.simple_attention``.
            encoded_proj: forwarded unchanged to ``self.simple_attention``.
            decoder_size: GRU hidden size (the projections are 3x this wide).

        Returns:
            ``full_ids``: an int64 tensor that starts as one column of ones
            and gets the selected ids concatenated along axis 1 on every
            step.
        """
        init_state = decoder_boot
        # k=1 in topk below, i.e. only the best candidate is kept each step.
        beam_size = 1
        array_len = layers.fill_constant(
            shape=[1], dtype='int64', value=max_length)
        # Step counter; also the index used for every array read/write.
        counter = layers.zeros(shape=[1], dtype='int64', force_cpu=True)

        # fill the first element with init_state
        state_array = layers.create_array('float32')
        layers.array_write(init_state, array=state_array, i=counter)

        # ids, scores as memory
        ids_array = layers.create_array('int64')
        scores_array = layers.create_array('float32')
        rois_shape = layers.shape(init_state)
        # Despite the name this holds shape[0] + 1, so the range below
        # produces the offsets 0..shape[0] used as LoD information.
        batch_size = layers.slice(
            rois_shape, axes=[0], starts=[0], ends=[1]) + 1
        lod_level = layers.range(
            start=0, end=batch_size, step=1, dtype=batch_size.dtype)

        # Initial ids are all zero; the same offsets are set as LoD and then
        # appended once more as an additional LoD level.
        init_ids = layers.fill_constant_batch_size_like(
            input=init_state, shape=[-1, 1], value=0, dtype='int64')
        init_ids = layers.lod_reset(init_ids, lod_level)
        init_ids = layers.lod_append(init_ids, lod_level)

        # Initial scores are all one and share the LoD of init_ids.
        init_scores = layers.fill_constant_batch_size_like(
            input=init_state, shape=[-1, 1], value=1, dtype='float32')
        init_scores = layers.lod_reset(init_scores, init_ids)
        layers.array_write(init_ids, array=ids_array, i=counter)
        layers.array_write(init_scores, array=scores_array, i=counter)

        # Accumulator for the decoded ids; seeded with a column of ones.
        full_ids = fluid.layers.fill_constant_batch_size_like(
            input=init_state, shape=[-1, 1], dtype='int64', value=1)

        cond = layers.less_than(x=counter, y=array_len)
        while_op = layers.While(cond=cond)
        with while_op.block():
            # Read the memories written for the current counter value.
            pre_ids = layers.array_read(array=ids_array, i=counter)
            pre_state = layers.array_read(array=state_array, i=counter)
            pre_score = layers.array_read(array=scores_array, i=counter)
            pre_ids_emb = layers.embedding(
                input=pre_ids,
                size=[char_num, word_vector_dim],
                dtype='float32')

            context = self.simple_attention(encoded_vector, encoded_proj,
                                            pre_state, decoder_size)

            # expand the recursive_sequence_lengths of pre_state
            # to be the same with pre_score
            pre_state_expanded = layers.sequence_expand(pre_state, pre_score)
            context_expanded = layers.sequence_expand(context, pre_score)

            # Project context and previous-id embedding to the GRU input
            # width; explicit names keep the weights shared across steps.
            fc_1 = layers.fc(input=context_expanded,
                             size=decoder_size * 3,
                             bias_attr=False,
                             name="rnn_fc1")

            fc_2 = layers.fc(input=pre_ids_emb,
                             size=decoder_size * 3,
                             bias_attr=False,
                             name="rnn_fc2")

            decoder_inputs = fc_1 + fc_2
            current_state, _, _ = layers.gru_unit(
                input=decoder_inputs,
                hidden=pre_state_expanded,
                size=decoder_size * 3)
            current_state_with_lod = layers.lod_reset(
                x=current_state, y=pre_score)
            # use score to do beam search
            current_score = layers.fc(input=current_state_with_lod,
                                      size=char_num,
                                      bias_attr=True,
                                      act='softmax',
                                      name="rnn_out_fc")
            topk_scores, topk_indices = layers.topk(current_score, k=beam_size)

            # Append this step's ids and write the result back into full_ids
            # so the variable returned after the loop sees the update.
            new_ids = fluid.layers.concat([full_ids, topk_indices], axis=1)
            fluid.layers.assign(new_ids, full_ids)

            # Advance the counter in place before writing the next slot.
            layers.increment(x=counter, value=1, in_place=True)

            # update the memories
            layers.array_write(current_state, array=state_array, i=counter)
            layers.array_write(topk_indices, array=ids_array, i=counter)
            layers.array_write(topk_scores, array=scores_array, i=counter)

            # update the break condition:
            # up to the max length or all candidates of
            # source sentences have ended.
            length_cond = layers.less_than(x=counter, y=array_len)
            finish_cond = layers.logical_not(layers.is_empty(x=topk_indices))
            # Written into `cond` itself so the While op re-evaluates it.
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)
        return full_ids
예제 #22
0
def convlstm2d_rnn(rnn_input,
                   init_hidden,
                   init_cell,
                   padding,
                   hidden_h,
                   hidden_w,
                   filters,
                   filter_size,
                   drop_out=None,
                   sequence_length=None,
                   name='conv_lstm_2d'):
    """Unroll a ``ConvLSTM2D_unit`` over a sequence with ``PaddingRNN``.

    Args:
        rnn_input: input tensor; it is permuted with ``[1, 0, 4, 2, 3]``
            before stepping (assumes a (batch, seq, H, W, C) layout so the
            result is (seq, batch, C, H, W) -- TODO confirm with callers).
        init_hidden: initial hidden state, or ``None``.
        init_cell: initial cell state, or ``None``. Both must be given for
            either to be used; otherwise memories of shape
            ``[-1, filters, hidden_h, hidden_w]`` are created from
            ``rnn_input``.
        padding: forwarded to ``ConvLSTM2D_unit``.
        hidden_h: height of the state memories.
        hidden_w: width of the state memories.
        filters: forwarded to ``ConvLSTM2D_unit``; also the channel count of
            the state memories.
        filter_size: forwarded to ``ConvLSTM2D_unit``.
        drop_out: dropout probability, or ``None`` to disable.
        sequence_length: optional per-sample lengths; when given, steps past
            a sample's length keep the previous hidden/cell state.
        name: accepted for interface compatibility; not used in this body.

    Returns:
        ``(rnn_out, last_hidden)``, both permuted with ``[1, 0, 3, 4, 2]``.
    """
    # transpose : (sequence x batch x C x H x W)
    rnn_input = layers.transpose(rnn_input, [1, 0, 4, 2, 3])

    # Build a step-major mask from the sequence lengths. Optional tensors
    # are compared against None explicitly: relying on the truth value of a
    # tensor object is ambiguous and framework-version dependent.
    mask = None
    if sequence_length is not None:
        max_seq_len = layers.shape(rnn_input)[0]
        mask = layers.sequence_mask(sequence_length,
                                    maxlen=max_seq_len,
                                    dtype='float32')
        mask = layers.transpose(mask, [1, 0])

    conv_lstm_2d = ConvLSTM2D_unit(filters, filter_size, padding)

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(rnn_input)

        if mask is not None:
            step_mask = rnn.step_input(mask)

        if init_hidden is not None and init_cell is not None:
            pre_hidden = rnn.memory(init=init_hidden)
            pre_cell = rnn.memory(init=init_cell)
        else:
            state_shape = [-1, filters, hidden_h, hidden_w]
            pre_hidden = rnn.memory(batch_ref=rnn_input, shape=state_shape)
            pre_cell = rnn.memory(batch_ref=rnn_input, shape=state_shape)

        # The first element returned by the unit is not needed here.
        _, last_hidden, last_cell = conv_lstm_2d(step_in, pre_hidden,
                                                 pre_cell)

        if mask is not None:
            # new * m - old * (m - 1): take the new state where m == 1 and
            # carry the previous state forward where m == 0.
            last_hidden = dot(last_hidden, step_mask, axis=0) - dot(
                pre_hidden, (step_mask - 1), axis=0)
            last_cell = dot(last_cell, step_mask, axis=0) - dot(
                pre_cell, (step_mask - 1), axis=0)

        rnn.update_memory(pre_hidden, last_hidden)
        rnn.update_memory(pre_cell, last_cell)

        rnn.step_output(last_hidden)
        rnn.step_output(last_cell)

        step_input = last_hidden

        # NOTE(review): the dropout result is never registered as a step
        # output or memory, so it does not influence the returned tensors.
        # Kept to leave the constructed graph unchanged -- confirm intent.
        if drop_out is not None and drop_out > 0.0:
            step_input = layers.dropout(
                step_input,
                dropout_prob=drop_out,
                dropout_implementation='upscale_in_train')

    rnn_res = rnn()
    rnn_out = rnn_res[0]
    # NOTE(review): rnn_res[1] is the second registered step output (the
    # cell state), yet its last step is returned as `last_hidden`.
    # Behaviour preserved -- confirm whether rnn_res[0] was intended.
    last_hidden = layers.slice(rnn_res[1],
                               axes=[0],
                               starts=[-1],
                               ends=[1000000000])

    rnn_out = layers.transpose(rnn_out, [1, 0, 3, 4, 2])
    last_hidden = layers.transpose(last_hidden, [1, 0, 3, 4, 2])

    return rnn_out, last_hidden
예제 #23
0
파일: lm_model.py 프로젝트: wbj0110/models
    def build(self):
        """Assemble the bidirectional language-model graph.

        Declares the forward/backward token feeds and the initial-state
        feeds, runs ``encoder`` once per direction, and stores on ``self``:
        ``loss``, ``grad_vars`` / ``grad_vars_name``, ``feed_order``,
        ``last_hidden`` and ``last_cell``.
        """
        args = self.args
        emb_size = args.embed_size
        proj_size = args.embed_size
        hidden_size = args.hidden_size
        num_layers = args.num_layers

        def token_feed(feed_name):
            # int64 LoD-level-1 feed with one id per position.
            return layers.data(name=feed_name,
                               shape=[1],
                               dtype='int64',
                               lod_level=1)

        def layer_range(stacked, first, last):
            # Select layers [first, last) along the leading axis.
            return layers.slice(stacked,
                                axes=[0],
                                starts=[first],
                                ends=[last])

        x_f = token_feed("x")
        y_f = token_feed("y")
        x_b = token_feed("x_r")
        y_b = token_feed("y_r")

        # Initial states for both directions arrive as single flat feeds and
        # are reshaped to [2 * num_layers, -1, width] before being split.
        raw_hiddens = layers.data(name="init_hiddens",
                                  shape=[1],
                                  dtype='float32')
        raw_cells = layers.data(name="init_cells",
                                shape=[1],
                                dtype='float32')
        init_hiddens = layers.reshape(raw_hiddens,
                                      shape=[2 * num_layers, -1, proj_size])
        init_cells = layers.reshape(raw_cells,
                                    shape=[2 * num_layers, -1, hidden_size])

        # First half of the layers -> forward, second half -> backward.
        init_hidden = layer_range(init_hiddens, 0, num_layers)
        init_cell = layer_range(init_cells, 0, num_layers)
        init_hidden_r = layer_range(init_hiddens, num_layers, 2 * num_layers)
        init_cell_r = layer_range(init_cells, num_layers, 2 * num_layers)

        custom_samples = custom_samples_r = custom_probabilities = None
        if args.use_custom_samples:
            sample_shape = [args.n_negative_samples_batch + 1]
            custom_samples = layers.data(name="custom_samples",
                                         shape=sample_shape,
                                         dtype='int64',
                                         lod_level=1)
            custom_samples_r = layers.data(name="custom_samples_r",
                                           shape=sample_shape,
                                           dtype='int64',
                                           lod_level=1)
            custom_probabilities = layers.data(name="custom_probabilities",
                                               shape=sample_shape,
                                               dtype='float32',
                                               lod_level=1)

        forward, fw_hiddens, fw_hiddens_ori, fw_cells, fw_projs = encoder(
            x_f,
            y_f,
            self.vocab_size,
            emb_size,
            init_hidden,
            init_cell,
            para_name='fw_',
            custom_samples=custom_samples,
            custom_probabilities=custom_probabilities,
            test_mode=self.test_mode,
            args=args)
        backward, bw_hiddens, bw_hiddens_ori, bw_cells, bw_projs = encoder(
            x_b,
            y_b,
            self.vocab_size,
            emb_size,
            init_hidden_r,
            init_cell_r,
            para_name='bw_',
            custom_samples=custom_samples_r,
            custom_probabilities=custom_probabilities,
            test_mode=self.test_mode,
            args=args)

        # Overall loss: mean over the last entry of both directions.
        self.loss = layers.reduce_mean(
            layers.concat([forward[-1], backward[-1]]))
        self.loss.persistable = True

        self.grad_vars = [x_f, y_f, x_b, y_b, self.loss]
        self.grad_vars_name = ['x', 'y', 'x_r', 'y_r', 'final_loss']

        fw_vars_name = [
            'x_emb', 'proj', 'loss', 'init_hidden', 'init_cell', 'rnn_out',
            'rnn_out2', 'cell', 'cell2', 'xproj', 'xproj2'
        ]
        bw_vars_name = [
            'x_emb_r', 'proj_r', 'loss_r', 'init_hidden_r', 'init_cell_r',
            'rnn_out_r', 'rnn_out2_r', 'cell_r', 'cell2_r', 'xproj_r',
            'xproj2_r'
        ]
        fw_vars = (forward + [init_hidden, init_cell] + fw_hiddens +
                   fw_cells + fw_projs)
        bw_vars = (backward + [init_hidden_r, init_cell_r] + bw_hiddens +
                   bw_cells + bw_projs)

        # Interleave forward/backward entries: fw, bw, fw, bw, ...
        for idx, (fw_name, bw_name) in enumerate(
                zip(fw_vars_name, bw_vars_name)):
            self.grad_vars.append(fw_vars[idx])
            self.grad_vars.append(bw_vars[idx])
            self.grad_vars_name.append(fw_name)
            self.grad_vars_name.append(bw_name)

        self.feed_order = ['x', 'y', 'x_r', 'y_r']
        if args.use_custom_samples:
            self.feed_order += [
                'custom_samples', 'custom_samples_r', 'custom_probabilities'
            ]

        # Last time step of every layer's hidden / cell sequence, stacked
        # along axis 0 with the forward layers first.
        hidden_tails = [
            fluid.layers.sequence_last_step(input=seq)
            for seq in fw_hiddens_ori + bw_hiddens_ori
        ]
        cell_tails = [
            fluid.layers.sequence_last_step(input=seq)
            for seq in fw_cells + bw_cells
        ]
        self.last_hidden = layers.concat(hidden_tails, axis=0)
        self.last_hidden.persistable = True
        self.last_cell = layers.concat(cell_tails, axis=0)
        self.last_cell.persistable = True
def knowledge_seq2seq(config):
    """Build the knowledge-grounded seq2seq graph.

    The shared part encodes the context (``enc_input``), the goal
    (``goal_input``) and the knowledge (``cue_input``), derives the decoder's
    initial hidden state from context + goal, and attends over the knowledge
    memory (prior attention). With ``config.use_posterior`` a second
    attention, queried by target + goal, is built as well.

    Args:
        config: object exposing ``embed_size``, ``hidden_size``,
            ``num_layers``, ``bidirectional``, ``batch_size``,
            ``vocab_size``, ``run_type``, ``use_posterior``; for training
            additionally ``use_bow``, ``max_len``, ``dropout``; for testing
            additionally ``beam_size``, ``bos_id``, ``max_dec_len``,
            ``length_average``, ``unk_id``, ``eos_id``, ``dropout``.

    Returns:
        * ``run_type == "train"``:
          ``[bow_loss, kl_loss, nll_loss, final_loss]``.
        * ``run_type == "test"``: ``(final_score, final_ids, final_index)``,
          each reshaped to ``[max_dec_len, beam_size * batch_size]``.
        * any other ``run_type``: ``None``.

    Raises:
        ValueError: if ``run_type == "train"`` while ``config.use_posterior``
            or ``config.use_bow`` is falsy. The training losses need the
            posterior attention and the bag-of-words loss, which only exist
            when those flags are enabled.
    """
    emb_size = config.embed_size
    hidden_size = config.hidden_size
    input_size = emb_size
    num_layers = config.num_layers
    bi_direc = config.bidirectional
    batch_size = config.batch_size
    vocab_size = config.vocab_size
    run_type = config.run_type

    # Fail early with a clear message instead of an UnboundLocalError after
    # half of the graph has already been constructed.
    if run_type == "train":
        disabled = [
            flag for flag in ("use_posterior", "use_bow")
            if not getattr(config, flag)
        ]
        if disabled:
            raise ValueError(
                "knowledge_seq2seq: run_type='train' requires "
                "config.use_posterior and config.use_bow to be enabled "
                "(disabled: {})".format(", ".join(disabled)))

    enc_input = layers.data(name="enc_input",
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  # enc_input --> goal
    # Declared so the feed variable exists; not consumed further below.
    enc_mask = layers.data(name="enc_mask", shape=[-1, 1], dtype='float32')
    goal_input = layers.data(name="goal_input",
                             shape=[1],
                             dtype='int64',
                             lod_level=1)  # goal_input --> x
    cue_input = layers.data(name="cue_input",
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  # cue_input --> kg
    memory_mask = layers.data(name='memory_mask',
                              shape=[-1, 1],
                              dtype='float32')
    tar_input = layers.data(name='tar_input',
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  # tar_input --> y

    # A bidirectional encoder uses half the width per direction.
    rnn_hidden_size = hidden_size
    if bi_direc:
        rnn_hidden_size //= 2

    enc_out, enc_last_hidden = \
        rnn_encoder(enc_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, name="rnn_enc")
    _, goal_last_hidden = \
        rnn_encoder(goal_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, name="rnn_enc1")

    # Bridge: fuse the context and goal summaries into the decoder's
    # initial hidden state.
    context_goal_out = fluid.layers.concat(
        input=[enc_last_hidden, goal_last_hidden], axis=2)
    context_goal_out = layers.reshape(context_goal_out,
                                      shape=[-1, 1, rnn_hidden_size * 4])
    context_goal_out = fluid.layers.fc(context_goal_out,
                                       size=rnn_hidden_size * 2,
                                       bias_attr=False)
    context_goal_out = layers.unsqueeze(context_goal_out, axes=[0])
    bridge_out = fc(context_goal_out, hidden_size, hidden_size, name="bridge")
    bridge_out = layers.tanh(bridge_out)

    cue_last_mask = layers.data(name='cue_last_mask',
                                shape=[-1],
                                dtype='float32')
    _, knowledge_last_hidden = \
        rnn_encoder(cue_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, last_mask=cue_last_mask, name="knowledge_enc")

    # Prior attention: the bridge output queries the knowledge memory.
    query = layers.slice(bridge_out, axes=[0], starts=[0], ends=[1])
    query = layers.squeeze(query, axes=[0])
    query = layers.unsqueeze(query, axes=[1])
    query = layers.reshape(query, shape=[batch_size, -1, hidden_size])
    cue_memory = layers.slice(knowledge_last_hidden,
                              axes=[0],
                              starts=[0],
                              ends=[1])
    cue_memory = layers.reshape(cue_memory,
                                shape=[batch_size, -1, hidden_size])
    memory_mask = layers.reshape(memory_mask, shape=[batch_size, 1, -1])

    weighted_cue, cue_att = dot_attention(query, cue_memory, mask=memory_mask)

    cue_att = layers.reshape(cue_att, shape=[batch_size, -1])

    knowledge = weighted_cue
    if config.use_posterior:
        print("config.use_posterior", config.use_posterior)
        _, target_last_hidden = \
            rnn_encoder(tar_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                        dropout=0.0, batch_first=True, name="knowledge_enc1")
        target_goal_out = fluid.layers.concat(
            input=[target_last_hidden, goal_last_hidden], axis=2)
        target_goal_out = layers.reshape(target_goal_out,
                                         shape=[-1, 1, rnn_hidden_size * 4])
        target_goal_out = fluid.layers.fc(target_goal_out,
                                          size=rnn_hidden_size * 2,
                                          bias_attr=False)
        target_goal_out = layers.unsqueeze(target_goal_out, axes=[0])

        # Posterior attention: target + goal query the same memory.
        target_query = layers.slice(target_goal_out,
                                    axes=[0],
                                    starts=[0],
                                    ends=[1])
        target_query = layers.squeeze(target_query, axes=[0])
        target_query = layers.unsqueeze(target_query, axes=[1])
        target_query = layers.reshape(target_query,
                                      shape=[batch_size, -1, hidden_size])

        weight_target, target_att = dot_attention(target_query,
                                                  cue_memory,
                                                  mask=memory_mask)
        target_att = layers.reshape(target_att, shape=[batch_size, -1])
        # The posterior-weighted knowledge replaces the prior one.
        knowledge = weight_target

    enc_memory_mask = layers.data(name="enc_memory_mask",
                                  shape=[-1, 1],
                                  dtype='float32')
    enc_memory_mask = layers.unsqueeze(enc_memory_mask, axes=[1])
    # decoder init_hidden, enc_memory, enc_mask
    dec_init_hidden = bridge_out
    pad_value = fluid.layers.assign(input=np.array([0.0], dtype='float32'))

    # Second return value (the original lengths) is not needed.
    enc_memory, _ = layers.sequence_pad(x=enc_out, pad_value=pad_value)
    enc_memory.persistable = True

    gru_unit = GRU_unit(input_size + hidden_size,
                        hidden_size,
                        num_layers=num_layers,
                        dropout=0.0,
                        name="decoder_gru_unit")

    cue_gru_unit = GRU_unit(hidden_size + hidden_size,
                            hidden_size,
                            num_layers=num_layers,
                            dropout=0.0,
                            name="decoder_cue_gru_unit")

    tgt_vocab_size = config.vocab_size
    if run_type == "train":
        # use_bow is guaranteed truthy by the validation at the top; the
        # condition is kept so the structure mirrors the configuration flag.
        if config.use_bow:
            bow_logits = fc(knowledge,
                            hidden_size,
                            hidden_size,
                            name='bow_fc_1')
            bow_logits = layers.tanh(bow_logits)
            bow_logits = fc(bow_logits,
                            hidden_size,
                            tgt_vocab_size,
                            name='bow_fc_2')
            bow_logits = layers.softmax(bow_logits)

            bow_label = layers.data(name='bow_label',
                                    shape=[-1, config.max_len],
                                    dtype='int64')
            bow_mask = layers.data(name="bow_mask",
                                   shape=[-1, config.max_len],
                                   dtype='float32')

            # Score the same distribution against every label position.
            bow_logits = layers.expand(bow_logits, [1, config.max_len, 1])
            bow_logits = layers.reshape(bow_logits, shape=[-1, tgt_vocab_size])
            bow_label = layers.reshape(bow_label, shape=[-1, 1])
            bow_loss = layers.cross_entropy(bow_logits,
                                            bow_label,
                                            soft_label=False)
            bow_loss = layers.reshape(bow_loss, shape=[-1, config.max_len])

            bow_loss *= bow_mask
            bow_loss = layers.reduce_sum(bow_loss, dim=[1])
            bow_loss = layers.reduce_mean(bow_loss)

        dec_input = layers.data(name="dec_input",
                                shape=[-1, 1, 1],
                                dtype='int64')
        dec_mask = layers.data(name="dec_mask", shape=[-1, 1], dtype='float32')

        dec_knowledge = weight_target

        # NOTE(review): the result of this projection is never consumed.
        # It is kept because removing the unnamed fc would change the
        # auto-generated names of parameters created after it.
        knowledge_goal_out = fluid.layers.concat(
            input=[dec_knowledge, target_query], axis=2)
        knowledge_goal_out = layers.reshape(knowledge_goal_out,
                                            shape=[-1, 1, rnn_hidden_size * 4])
        knowledge_goal_out = fluid.layers.fc(knowledge_goal_out,
                                             size=rnn_hidden_size * 2,
                                             bias_attr=False)
        knowledge_goal_out = layers.unsqueeze(knowledge_goal_out, axes=[0])

        decoder_logits = \
            rnn_decoder(gru_unit, cue_gru_unit, dec_input, input_size, hidden_size, num_layers,
                         enc_memory, enc_memory_mask, dec_knowledge, vocab_size,
                         init_hidden=dec_init_hidden, mask=dec_mask, dropout=config.dropout)

        target_label = layers.data(name='target_label',
                                   shape=[-1, 1],
                                   dtype='int64')
        target_mask = layers.data(name='target_mask',
                                  shape=[-1, 1],
                                  dtype='float32')

        decoder_logits = layers.reshape(decoder_logits,
                                        shape=[-1, tgt_vocab_size])
        target_label = layers.reshape(target_label, shape=[-1, 1])

        # Masked negative log-likelihood: sum per sample, mean over batch.
        nll_loss = layers.cross_entropy(decoder_logits,
                                        target_label,
                                        soft_label=False)
        nll_loss = layers.reshape(nll_loss, shape=[batch_size, -1])
        nll_loss *= target_mask
        nll_loss = layers.reduce_sum(nll_loss, dim=[1])
        nll_loss = layers.reduce_mean(nll_loss)

        # KL(posterior || prior) over the knowledge attention; the posterior
        # is treated as a constant and 1e-10 guards log(0).
        prior_attn = cue_att + 1e-10
        posterior_att = target_att
        posterior_att.stop_gradient = True

        prior_attn = layers.log(prior_attn)

        kl_loss = posterior_att * (layers.log(posterior_att + 1e-10) -
                                   prior_attn)
        kl_loss = layers.reduce_mean(kl_loss)

        kl_and_nll_factor = layers.data(name='kl_and_nll_factor',
                                        shape=[1],
                                        dtype='float32')
        kl_and_nll_factor = layers.reshape(kl_and_nll_factor, shape=[-1])

        final_loss = bow_loss + kl_loss * kl_and_nll_factor + nll_loss * kl_and_nll_factor

        return [bow_loss, kl_loss, nll_loss, final_loss]

    elif run_type == "test":
        beam_size = config.beam_size
        token = layers.fill_constant(shape=[batch_size * beam_size, 1],
                                     value=config.bos_id,
                                     dtype='int64')

        token = layers.reshape(token, shape=[-1, 1])
        max_decode_len = config.max_dec_len

        dec_knowledge = knowledge
        INF = 100000000.0

        # Only the first beam of every sample starts alive (score 0); the
        # rest start at -INF so the first step does not pick duplicates.
        init_score_np = np.ones([beam_size * batch_size],
                                dtype='float32') * -INF
        init_score_np[::beam_size] = 0.0

        pre_score = layers.assign(init_score_np)

        # Offset of each sample's first beam in the flattened layout.
        pos_index_np = np.arange(batch_size).reshape(-1, 1)
        pos_index_np = \
            np.tile(pos_index_np, (1, beam_size)).reshape(-1).astype('int32') * beam_size

        pos_index = layers.assign(pos_index_np)

        id_array = []
        score_array = []
        index_array = []

        # Replicate encoder memory, its mask and the knowledge vector once
        # per beam.
        init_enc_memory = layers.expand(enc_memory, [1, beam_size, 1])
        init_enc_memory = layers.reshape(
            init_enc_memory, shape=[batch_size * beam_size, -1, hidden_size])
        init_enc_mask = layers.expand(enc_memory_mask, [1, beam_size, 1])
        init_enc_mask = layers.reshape(init_enc_mask,
                                       shape=[batch_size * beam_size, 1, -1])

        dec_knowledge = layers.reshape(dec_knowledge,
                                       shape=[-1, 1, hidden_size])
        init_dec_knowledge = layers.expand(dec_knowledge, [1, beam_size, 1])
        init_dec_knowledge = layers.reshape(
            init_dec_knowledge,
            shape=[batch_size * beam_size, -1, hidden_size])

        dec_init_hidden = layers.expand(dec_init_hidden, [1, 1, beam_size])
        dec_init_hidden = layers.reshape(dec_init_hidden,
                                         shape=[1, -1, hidden_size])

        length_average = config.length_average
        UNK = config.unk_id
        EOS = config.eos_id
        output_in_size = hidden_size + hidden_size
        for step in range(1, max_decode_len + 1):
            dec_emb = get_embedding(token, input_size, vocab_size)
            dec_out, dec_last_hidden = \
                decoder_step(gru_unit, cue_gru_unit,
                             dec_emb, dec_init_hidden, input_size, hidden_size,
                             init_enc_memory, init_enc_mask, init_dec_knowledge, mask=None)

            rnnout = layers.dropout(dec_out,
                                    dropout_prob=config.dropout,
                                    is_test=True)
            rnnout = fc(rnnout,
                        output_in_size,
                        hidden_size,
                        name='dec_out_fc1')
            rnnout = fc(rnnout, hidden_size, vocab_size, name='dec_out_fc2')

            log_softmax_output = log_softmax(rnnout)
            log_softmax_output = layers.squeeze(log_softmax_output, axes=[1])

            if step > 1 and length_average:
                # Running average of per-step log-probabilities.
                log_softmax_output = layers.elementwise_add(
                    (log_softmax_output / step),
                    (pre_score * (1.0 - 1.0 / step)),
                    axis=0)
            else:
                # Plain accumulation (always used for the first step).
                log_softmax_output = layers.elementwise_add(
                    log_softmax_output, pre_score, axis=0)

            # One row per sample, so top-k ranks across all of its beams.
            log_softmax_output = layers.reshape(log_softmax_output,
                                                shape=[batch_size, -1])

            topk_score, topk_index = layers.topk(log_softmax_output,
                                                 k=beam_size)
            topk_score = layers.reshape(topk_score, shape=[-1])
            topk_index = layers.reshape(topk_index, shape=[-1])

            # Flat index -> (source beam, token id).
            vocab_var = layers.fill_constant([1],
                                             dtype='int64',
                                             value=vocab_size)
            new_token = topk_index % vocab_var

            index = topk_index // vocab_var
            id_array.append(new_token)
            index_array.append(index)
            index = index + pos_index

            score_array.append(topk_score)

            eos_ids = layers.fill_constant([beam_size * batch_size],
                                           dtype='int64',
                                           value=EOS)
            unk_ids = layers.fill_constant([beam_size * batch_size],
                                           dtype='int64',
                                           value=UNK)

            # Push beams that just emitted EOS or UNK to -INF so they are
            # not extended. The recorded score was appended above, before
            # the penalty.
            eos_eq = layers.cast(layers.equal(new_token, eos_ids),
                                 dtype='float32')
            topk_score += eos_eq * -INF

            unk_eq = layers.cast(layers.equal(new_token, unk_ids),
                                 dtype='float32')
            topk_score += unk_eq * -INF

            # Carry state over to the next step, reordered to follow the
            # surviving beams.
            token = new_token
            pre_score = topk_score
            token = layers.reshape(token, shape=[-1, 1])

            index = layers.cast(index, dtype='int32')
            dec_last_hidden = layers.squeeze(dec_last_hidden, axes=[0])
            dec_init_hidden = layers.gather(dec_last_hidden, index=index)
            dec_init_hidden = layers.unsqueeze(dec_init_hidden, axes=[0])
            init_enc_memory = layers.gather(init_enc_memory, index)
            init_enc_mask = layers.gather(init_enc_mask, index)
            init_dec_knowledge = layers.gather(init_dec_knowledge, index)

        final_score = layers.concat(score_array, axis=0)
        final_ids = layers.concat(id_array, axis=0)
        final_index = layers.concat(index_array, axis=0)

        final_score = layers.reshape(
            final_score, shape=[max_decode_len, beam_size * batch_size])
        final_ids = layers.reshape(
            final_ids, shape=[max_decode_len, beam_size * batch_size])
        final_index = layers.reshape(
            final_index, shape=[max_decode_len, beam_size * batch_size])

        return final_score, final_ids, final_index
예제 #25
0
파일: lm_model.py 프로젝트: wbj0110/models
def encoder(x,
            y,
            vocab_size,
            emb_size,
            init_hidden=None,
            init_cell=None,
            para_name='',
            custom_samples=None,
            custom_probabilities=None,
            test_mode=False,
            args=None):
    """Build a stacked ``lstmp_encoder`` language model and its softmax loss.

    Args:
        x: Input token ids fed to the embedding lookup.
        y: Target token ids used as labels for the loss.
        vocab_size: Vocabulary size (embedding rows and softmax classes).
        emb_size: Embedding width; also the second dimension of the softmax
            weight.
        init_hidden: Optional initial hidden states. Layer ``i`` uses the
            slice ``[i, i + 1)`` along axis 0 with that axis squeezed away.
        init_cell: Optional initial cell states, sliced like ``init_hidden``.
        para_name: Prefix for the per-layer name passed to ``lstmp_encoder``.
        custom_samples: Accepted for interface compatibility; not used here.
        custom_probabilities: Accepted for interface compatibility; not used
            here.
        test_mode: Forwarded to ``dropout`` and ``lstmp_encoder``; when true
            the sampled-softmax loss is never used.
        args: Configuration object. This function reads ``num_layers``,
            ``hidden_size``, ``sample_softmax``, ``n_negative_samples_batch``
            and ``random_seed`` from it and forwards it to the helpers.

    Returns:
        A 5-tuple ``([x_emb, projection, loss], rnn_outs, rnn_outs_ori,
        cells, projs)`` where ``rnn_outs`` holds each layer's output after the
        residual add and dropout, ``rnn_outs_ori`` the raw layer outputs,
        ``cells`` the cell outputs after dropout and ``projs`` the
        ``input_proj`` values returned by ``lstmp_encoder``.
    """
    x_emb = layers.embedding(input=x,
                             size=[vocab_size, emb_size],
                             dtype='float32',
                             is_sparse=False,
                             param_attr=fluid.ParamAttr(name='embedding_para'))

    # The defaults are None, so test for None explicitly instead of relying on
    # the truth value of a graph tensor.
    use_init_state = init_hidden is not None and init_cell is not None

    def _state_for_layer(state, layer_idx):
        """Return the slice of ``state`` for one layer, layer axis removed."""
        layer_state = layers.slice(state,
                                   axes=[0],
                                   starts=[layer_idx],
                                   ends=[layer_idx + 1])
        return layers.squeeze(layer_state, axes=[0])

    rnn_input = x_emb
    rnn_outs = []
    rnn_outs_ori = []
    cells = []
    projs = []
    for i in range(args.num_layers):
        rnn_input = dropout(rnn_input, test_mode, args)
        if use_init_state:
            h0 = _state_for_layer(init_hidden, i)
            c0 = _state_for_layer(init_cell, i)
        else:
            h0 = c0 = None
        rnn_out, cell, input_proj = lstmp_encoder(
            rnn_input, args.hidden_size, h0, c0,
            para_name + 'layer{}'.format(i + 1), emb_size, test_mode, args)
        rnn_out_ori = rnn_out
        # Residual connection for every layer after the first one.
        if i > 0:
            rnn_out = rnn_out + rnn_input
        rnn_out = dropout(rnn_out, test_mode, args)
        cell = dropout(cell, test_mode, args)
        rnn_outs.append(rnn_out)
        rnn_outs_ori.append(rnn_out_ori)
        rnn_input = rnn_out
        cells.append(cell)
        projs.append(input_proj)

    # Output projection: logits = top_layer_output @ softmax_weight^T + bias.
    softmax_weight = layers.create_parameter([vocab_size, emb_size],
                                             dtype="float32",
                                             name="softmax_weight")
    softmax_bias = layers.create_parameter([vocab_size],
                                           dtype="float32",
                                           name='softmax_bias')
    projection = layers.matmul(rnn_outs[-1], softmax_weight, transpose_y=True)
    projection = layers.elementwise_add(projection, softmax_bias)

    projection = layers.reshape(projection, shape=[-1, vocab_size])

    if args.sample_softmax and (not test_mode):
        # Sampled softmax is only used outside test mode.
        loss = layers.sampled_softmax_with_cross_entropy(
            logits=projection,
            label=y,
            num_samples=args.n_negative_samples_batch,
            seed=args.random_seed)
    else:
        # Full softmax: labels are expanded to one-hot and passed as soft
        # labels.
        label = layers.one_hot(input=y, depth=vocab_size)
        loss = layers.softmax_with_cross_entropy(logits=projection,
                                                 label=label,
                                                 soft_label=True)
    return [x_emb, projection, loss], rnn_outs, rnn_outs_ori, cells, projs
예제 #26
0
    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
        """Run a multi-layer LSTM over ``input_embedding`` with ``PaddingRNN``.

        Reads ``num_layers``, ``hidden_size``, ``init_scale``, ``dropout`` and
        ``num_steps`` from the enclosing scope.

        Args:
            input_embedding: Embedded input sequence. It is transposed with
                ``perm=[1, 0, 2]`` before being fed step by step, and the
                result is transposed back the same way.
            len: Unused; kept for interface compatibility.
            init_hidden: Initial hidden states; layer ``i`` uses the slice
                ``[i, i + 1)`` along axis 0, reshaped to ``[-1, hidden_size]``.
            init_cell: Initial cell states, sliced like ``init_hidden``.

        Returns:
            ``(real_res, last_hidden, last_cell)``: the top-layer output
            sequence, and the per-layer hidden / cell states taken at step
            ``num_steps - 1`` and concatenated along axis 0.
        """
        weight_1_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        for i in range(num_layers):
            # One fused weight/bias per layer produces all four gates at once.
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(
                init_hidden, axes=[0], starts=[i], ends=[i + 1])
            pre_cell = layers.slice(
                init_cell, axes=[0], starts=[i], ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
        rnn = PaddingRNN()

        with rnn.step():
            step_in = rnn.step_input(input_embedding)
            for k in range(num_layers):
                pre_hidden = rnn.memory(init=hidden_array[k])
                pre_cell = rnn.memory(init=cell_array[k])
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                # The weight is [hidden_size * 2, hidden_size * 4], so the
                # concatenated [input, hidden] must be hidden_size * 2 wide.
                concat_in = layers.concat([step_in, pre_hidden], 1)
                gate_input = layers.matmul(x=concat_in, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                # Split the fused projection into its four hidden_size-wide
                # gate pre-activations.
                in_gate = layers.slice(
                    gate_input, axes=[1], starts=[0], ends=[hidden_size])
                cand = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size],
                    ends=[hidden_size * 2])
                forget_gate = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size * 2],
                    ends=[hidden_size * 3])
                out_gate = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size * 3],
                    ends=[hidden_size * 4])

                c = pre_cell * layers.sigmoid(forget_gate) + layers.sigmoid(
                    in_gate) * layers.tanh(cand)
                m = layers.tanh(c) * layers.sigmoid(out_gate)

                rnn.update_memory(pre_hidden, m)
                rnn.update_memory(pre_cell, c)

                # Outputs 2*k and 2*k+1 are this layer's hidden and cell.
                rnn.step_output(m)
                rnn.step_output(c)

                step_in = m

                if dropout is not None and dropout > 0.0:
                    step_in = layers.dropout(
                        step_in,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            # The last output is the top layer's (possibly dropped-out) result.
            rnn.step_output(step_in)
        rnnout = rnn()

        last_hidden_array = []
        last_cell_array = []
        real_res = rnnout[-1]
        for i in range(num_layers):
            m = rnnout[i * 2]
            c = rnnout[i * 2 + 1]
            m.stop_gradient = True
            c.stop_gradient = True
            # Keep only the final time step of each layer's state sequence.
            last_h = layers.slice(
                m, axes=[0], starts=[num_steps - 1], ends=[num_steps])
            last_hidden_array.append(last_h)
            last_c = layers.slice(
                c, axes=[0], starts=[num_steps - 1], ends=[num_steps])
            last_cell_array.append(last_c)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
        last_hidden = layers.concat(last_hidden_array, 0)
        last_cell = layers.concat(last_cell_array, 0)

        return real_res, last_hidden, last_cell
예제 #27
0
    def call(self, global_img_feat, p_img_feat, embedding_fn, words=None):
        """Build the word-by-word decoding loop with sentinel attention.

        In ``'eval'`` mode the loop feeds back its own argmax prediction,
        starting from ``config.data['start_idx']``, and returns the predicted
        ids. In ``'train'`` mode it reads the input word for each step from
        ``words`` and returns the per-step word predictions.

        Args:
            global_img_feat: Global image feature, concatenated with the word
                embedding at every step.
            p_img_feat: Per-region image features attended over.
            embedding_fn: Callable mapping word ids to embeddings.
            words: Ground-truth word ids; transposed to ``[seq, batch]`` when
                the mode is not ``'eval'``.

        Returns:
            The stacked loop outputs transposed to batch-major layout.
        """
        # Image region features.
        img_feat = layers.fc(p_img_feat, self.hid_size, num_flatten_dims=2, act='tanh')  # [batch, k, hid]
        img_feat_emb = layers.fc(p_img_feat, self.hid_size, num_flatten_dims=2)

        eval_mode = self.mode == 'eval'
        train_mode = self.mode == 'train'

        if eval_mode:
            word = layers.fill_constant_batch_size_like(global_img_feat, [-1],
                                                        dtype='int64',
                                                        value=config.data['start_idx'])
        else:
            words = layers.transpose(words, [1, 0])  # [seq, batch]
            words.stop_gradient = True

        # LSTM state initialisation.
        hid = create_zero_state(global_img_feat)
        cell = create_zero_state(global_img_feat)

        # While-loop bookkeeping: step budget and the output array.
        if train_mode:
            num_steps = decoder_config['sentence_length'] - 1
        else:
            num_steps = decoder_config['infer_max_length']
        step_outputs = layers.create_array('int64' if eval_mode else 'float32')
        max_step = layers.fill_constant(shape=[1], dtype='int64', value=num_steps)
        step = layers.fill_constant(shape=[1], dtype='int64', value=0)
        cond = layers.less_than(step, max_step)
        loop = layers.While(cond)

        with loop.block():
            if train_mode:
                # Teacher forcing: take the ground-truth word of this step.
                pos = layers.cast(step, 'int32')
                word = layers.slice(words, axes=[0], starts=pos, ends=pos + 1)
                word = layers.squeeze(word, [0])
                word.stop_gradient = True

            word_emb = embedding_fn(word)
            # Maybe adding instead of concatenating would work better here?
            lstm_in = layers.concat([word_emb, global_img_feat], axis=-1)  # [batch, feat]
            new_hid, new_cell = layers.lstm_unit(lstm_in, hid, cell,
                                                 param_attr=fluid.ParamAttr('lstm_w'),
                                                 bias_attr=fluid.ParamAttr('lstm_b'))
            # Sentinel gate uses the step input and the previous hidden state.
            gate_from_input = layers.fc(lstm_in, size=self.hid_size)
            gate_from_hidden = layers.fc(hid, size=self.hid_size)
            sentinel_gate = layers.sigmoid(gate_from_input + gate_from_hidden)  # [batch, hidden]
            sentinel = layers.elementwise_mul(sentinel_gate, layers.tanh(new_cell))  # [batch, hidden]

            # Carry the new LSTM state into the next iteration.
            layers.assign(new_hid, hid)
            layers.assign(new_cell, cell)

            num_regions = layers.shape(p_img_feat)[1]

            proj_hid = layers.fc(new_hid, self.hid_size, act='tanh')
            # Attention: weights (alpha) over the k regions plus the sentinel.
            hid_emb = layers.fc(proj_hid, self.hid_size)  # [batch, hidden]
            tiled_hid_emb = layers.expand(layers.unsqueeze(hid_emb, 1),
                                          [1, num_regions + 1, 1])  # [batch, k+1, hidden]
            sentinel_emb = layers.unsqueeze(layers.fc(sentinel, self.hid_size), axes=1)  # [batch, 1, hidden]
            feat_emb = layers.concat([img_feat_emb, sentinel_emb], axis=1)  # [batch, k+1, hidden]
            attn_hidden = layers.tanh(feat_emb + tiled_hid_emb)  # [batch, k+1, hidden]
            alpha = layers.fc(attn_hidden, size=1, num_flatten_dims=2, act='softmax')  # [batch, k+1, 1]

            # Attention: context vector.
            context = layers.concat([img_feat, layers.unsqueeze(sentinel, axes=1)], axis=1)  # [batch, k+1, hidden]
            context = layers.elementwise_mul(context, alpha, axis=0)
            context = layers.reduce_mean(context, dim=1)  # [batch, hidden]

            out = layers.fc(context + proj_hid, self.hid_size, act='tanh')

            word_pred = weight_tying_fc(out)  # [batch, vocab]

            if eval_mode:
                # Greedy decoding: feed the argmax back in as the next word.
                next_word = layers.argmax(word_pred, axis=-1)
                layers.assign(next_word, word)
                next_word = layers.cast(next_word, 'float32')
                layers.array_write(next_word, step, array=step_outputs)
            else:
                layers.array_write(word_pred, step, array=step_outputs)
            layers.increment(step)
            layers.less_than(step, max_step, cond=cond)

        if train_mode:
            stacked, _ = layers.tensor_array_to_tensor(step_outputs, axis=0, use_stack=True)
            return layers.transpose(stacked, [1, 0, 2])

        stacked = layers.tensor_array_to_tensor(step_outputs, axis=0, use_stack=True)[0]
        return layers.transpose(stacked, [1, 0])
예제 #28
0
    def fast_decode(self, pyreader_name):
        """Build the inference graph: encode the input, then beam-search decode.

        Args:
            pyreader_name: Name given to the ``py_reader`` that feeds the graph.

        Returns:
            ``(pyreader, graph_vars)`` where ``graph_vars`` maps
            ``"finished_ids"``, ``"finished_scores"`` and ``"data_ids"`` to
            graph variables, each marked persistable.
        """

        # One reader slot per model input; the trailing comments name the slots.
        pyreader = fluid.layers.py_reader(
            capacity=50,
            shapes=[
                [-1, self.max_para_num, self.max_para_len],  # src_word
                [-1, self.max_para_num, self.max_para_len],  # src_word_pos
                [-1, self.max_para_num],  # src_sent_pos
                [-1, self.max_para_num,
                 self.max_para_len],  # src_words_slf_attn_bias
                [-1, self.max_para_num],  # src_sents_slf_attn_bias
                [-1, self.max_para_num, self.max_para_num],  # graph_attn_bias
                [-1, 1],  # start_tokens
                [-1, 1],  # init_scores
                [-1],  # parent_idx
                [-1, 1]  # data_ids
            ],
            dtypes=[
                'int64', 'int64', 'int64', 'float32', 'float32', 'float32',
                'int64', 'float32', 'int64', 'int64'
            ],
            lod_levels=[0, 0, 0, 0, 0, 0, 2, 2, 0, 0],
            name=pyreader_name,
            use_double_buffer=True)

        (src_word, src_word_pos, src_sent_pos, src_words_slf_attn_bias, src_sents_slf_attn_bias,
         graph_attn_bias, start_tokens, init_scores, parent_idx, data_ids) = \
            fluid.layers.read_file(pyreader)

        # Tile the attention biases: word bias over heads and query positions,
        # sentence bias over heads and query paragraphs, graph bias over heads.
        src_words_slf_attn_bias = layers.expand(
            layers.unsqueeze(src_words_slf_attn_bias, axes=[2, 3]),
            expand_times=[1, 1, self._n_head, self.max_para_len, 1])
        src_words_slf_attn_bias.stop_gradient = True
        src_sents_slf_attn_bias = layers.expand(
            layers.unsqueeze(src_sents_slf_attn_bias, axes=[1, 2]),
            expand_times=[1, self._n_head, self.max_para_num, 1])
        src_sents_slf_attn_bias.stop_gradient = True
        graph_attn_bias = layers.expand(layers.unsqueeze(graph_attn_bias,
                                                         axes=[1]),
                                        expand_times=[1, self._n_head, 1, 1])
        graph_attn_bias.stop_gradient = True

        # Decoder-to-encoder biases keep a single query position (index 0)
        # of the corresponding self-attention bias.
        tgt_src_words_attn_bias = layers.slice(src_words_slf_attn_bias,
                                               axes=[3],
                                               starts=[0],
                                               ends=[1])
        tgt_src_words_attn_bias.stop_gradient = True
        tgt_src_sents_attn_bias = layers.slice(src_sents_slf_attn_bias,
                                               axes=[2],
                                               starts=[0],
                                               ends=[1])
        tgt_src_sents_attn_bias.stop_gradient = True

        # Add a trailing unit axis to the id / position inputs.
        src_word = layers.reshape(
            src_word, [-1, self.max_para_num, self.max_para_len, 1])
        src_word_pos = layers.reshape(
            src_word_pos, [-1, self.max_para_num, self.max_para_len, 1])
        src_sent_pos = layers.reshape(src_sent_pos, [-1, self.max_para_num, 1])

        enc_input = (src_word, src_word_pos, src_sent_pos,
                     src_words_slf_attn_bias, src_sents_slf_attn_bias,
                     graph_attn_bias)
        enc_words_output, enc_sents_output = self.encode(enc_input=enc_input)

        def beam_search():
            """Build the beam-search While loop; return finished ids and scores."""

            max_len = layers.fill_constant(shape=[1],
                                           dtype=start_tokens.dtype,
                                           value=self.max_out_len,
                                           force_cpu=True)
            min_len = layers.fill_constant(shape=[1],
                                           dtype=start_tokens.dtype,
                                           value=self.min_out_len)
            neg_inf = layers.fill_constant(shape=[1],
                                           dtype='float32',
                                           value=-INF)
            step_idx = layers.fill_constant(shape=[1],
                                            dtype=start_tokens.dtype,
                                            value=0,
                                            force_cpu=True)
            # Always step_idx + 1; used for the current step's length penalty.
            step_next_idx = layers.fill_constant(shape=[1],
                                                 dtype=start_tokens.dtype,
                                                 value=1,
                                                 force_cpu=True)
            cond = layers.less_than(x=step_idx,
                                    y=max_len)  # default force_cpu=True
            while_op = layers.While(cond)
            # array states will be stored for each step.
            ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)),
                                     step_idx)
            scores = layers.array_write(init_scores, step_idx)
            # cell states will be overwritten at each step.
            # caches contains states of history steps in decoder self-attention
            # and static encoder output projections in encoder-decoder attention
            # to reduce redundant computation.
            caches = [
                {
                    "k":  # for self attention
                        layers.fill_constant_batch_size_like(
                            input=start_tokens,
                            shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                            dtype=enc_words_output.dtype,
                            value=0),
                    "v":  # for self attention
                        layers.fill_constant_batch_size_like(
                            input=start_tokens,
                            shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                            dtype=enc_words_output.dtype,
                            value=0),
                    "static_k_word":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_words_output.dtype),
                    "static_v_word":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_words_output.dtype),
                    "static_k_sent":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_sents_output.dtype),
                    "static_v_sent":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_sents_output.dtype)
                } for i in range(self._dec_n_layer)
            ]

            trigram_blocking = TrigramBlocking(start_tokens,
                                               self.tokenizer,
                                               use_fp16=self._use_fp16,
                                               beam_size=self.beam_size)

            with while_op.block():
                pre_ids = layers.array_read(array=ids, i=step_idx)
                # Since beam_search_op doesn't enforce pre_ids' shape, we can do
                # an inplace reshape here, which actually changes the shape of
                # pre_ids.
                pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
                pre_scores = layers.array_read(array=scores, i=step_idx)
                # gather cell states corresponding to selected parent
                pre_src_words_attn_bias = layers.gather(
                    tgt_src_words_attn_bias, index=parent_idx)
                pre_src_sents_attn_bias = layers.gather(
                    tgt_src_sents_attn_bias, index=parent_idx)
                pre_graph_attn_bias = layers.gather(graph_attn_bias,
                                                    index=parent_idx)
                # Position input: a tensor of ones scaled by the step index.
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=
                        pre_src_sents_attn_bias,  # can't use lod tensor here
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype),
                    y=step_idx,
                    axis=0)

                logits = self.decode(
                    dec_input=(pre_ids, pre_pos, None, pre_src_words_attn_bias,
                               pre_src_sents_attn_bias, pre_graph_attn_bias),
                    enc_words_output=enc_words_output,
                    enc_sents_output=enc_sents_output,
                    caches=caches,
                    gather_idx=parent_idx)

                # prevent generating end token if length less than min_out_len:
                # add -INF to the EOS logit while step_idx < min_len.
                eos_index = layers.fill_constant(
                    shape=[layers.shape(logits)[0]],
                    dtype='int64',
                    value=self.eos_idx)
                eos_index = fluid.one_hot(eos_index, depth=self.voc_size)
                less_cond = layers.cast(layers.less_than(x=step_idx,
                                                         y=min_len),
                                        dtype='float32')
                less_val = layers.elementwise_mul(less_cond, neg_inf)
                eos_val = layers.elementwise_mul(eos_index, less_val, axis=0)
                revised_logits = layers.elementwise_add(logits,
                                                        eos_val,
                                                        axis=0)

                # topK reduction across beams, also contains special handling
                # of ended beams and ended sentences (batch reduction)
                topk_scores, topk_indices = layers.topk(
                    input=layers.softmax(revised_logits), k=self.beam_size)

                # Roll back the length penalty on the previous scores.
                # The stored previous scores are already length-penalized, so
                # the penalty must be undone before this timestep's penalty is
                # applied. Because of this, `scores` stores the penalized
                # score while the accumulation uses the un-penalized one.
                # -> safe for step_idx == 0 (initialization state), because
                #    previous-score == 0
                pre_timestep_length_penalty = fluid.layers.pow(
                    ((5.0 + fluid.layers.cast(step_idx, pre_scores.dtype)) /
                     6.0), self.len_penalty)
                pre_scores_wo_len_penalty = fluid.layers.elementwise_mul(
                    pre_scores, pre_timestep_length_penalty)

                # calc trigram-blocking delta scores for current alive sequence
                if self.block_trigram:
                    trigram_blocking.update_seq(pre_ids, parent_idx)
                    trigram_blocking.expand_cand_seq(topk_indices)
                    fluid.layers.py_func(
                        func=trigram_blocking.blocking_forward,
                        x=[
                            trigram_blocking.cand_seq,
                            trigram_blocking.id2is_full_token
                        ],
                        out=trigram_blocking.delta_score_out,
                        backward_func=None)
                    # NOTE(review): looks like leftover debug output — confirm
                    # whether this Print should stay in the inference graph.
                    layers.Print(trigram_blocking.delta_score_out,
                                 summarize=100,
                                 message="trigram_blocking.delta_score_out")
                    pre_scores_wo_len_penalty = fluid.layers.elementwise_add(
                        x=trigram_blocking.delta_score_out,
                        y=pre_scores_wo_len_penalty,
                        axis=0)
                # => [N, topk]

                # Accumulate log-probabilities on top of the un-penalized
                # previous scores.
                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores),
                    y=pre_scores_wo_len_penalty,
                    axis=0)

                # Apply this timestep's length penalty: ((5 + len) / 6) ** p.
                cur_timestep_length_penalty = layers.pow(
                    ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) /
                     6.0), self.len_penalty)
                curr_scores = layers.elementwise_div(
                    accu_scores, cur_timestep_length_penalty)

                # beam_search op uses lod to differentiate branches.
                curr_scores = layers.lod_reset(curr_scores, pre_ids)
                topk_indices = layers.lod_reset(topk_indices, pre_ids)
                selected_ids, selected_scores, gather_idx = layers.beam_search(
                    pre_ids=pre_ids,
                    pre_scores=pre_scores,
                    ids=topk_indices,
                    scores=curr_scores,
                    beam_size=self.beam_size,
                    end_id=self.eos_idx,
                    return_parent_idx=True)

                layers.increment(x=step_idx, value=1.0, in_place=True)
                layers.increment(x=step_next_idx, value=1.0, in_place=True)
                # cell states(caches) have been updated in wrap_decoder,
                # only need to update beam search states here.
                layers.array_write(selected_ids, i=step_idx, array=ids)
                layers.array_write(selected_scores, i=step_idx, array=scores)
                layers.assign(gather_idx, parent_idx)
                layers.assign(pre_src_words_attn_bias, tgt_src_words_attn_bias)
                layers.assign(pre_src_sents_attn_bias, tgt_src_sents_attn_bias)
                layers.assign(pre_graph_attn_bias, graph_attn_bias)

                # Continue while under max_len and some beam is still alive.
                length_cond = layers.less_than(x=step_idx, y=max_len)
                finish_cond = layers.logical_not(
                    layers.is_empty(x=selected_ids))
                layers.logical_and(x=length_cond, y=finish_cond, out=cond)

            finished_ids, finished_scores = layers.beam_search_decode(
                ids, scores, beam_size=self.beam_size, end_id=self.eos_idx)

            return finished_ids, finished_scores

        finished_ids, finished_scores = beam_search()

        graph_vars = {
            "finished_ids": finished_ids,
            "finished_scores": finished_scores,
            "data_ids": data_ids
        }

        # Mark every returned variable persistable.
        for k, v in graph_vars.items():
            v.persistable = True

        return pyreader, graph_vars
    def __call__(self,
                 location,
                 confidence,
                 gt_box,
                 gt_label,
                 landmark_predict,
                 lmk_label,
                 lmk_ignore_flag,
                 prior_box,
                 prior_box_var=None):
        """Compute an SSD-style loss with an additional landmark term.

        The loss combines a softmax classification loss, a smooth-L1 box
        regression loss and a smooth-L1 landmark loss (weighted by 0.4).
        Negative examples are selected with the ``mine_hard_examples`` op
        using the ``'max_negative'`` mining type.

        Args:
            location: Predicted box offsets.
            confidence: Predicted class scores; its static shape must have
                exactly three entries.
            gt_box: Ground-truth boxes.
            gt_label: Ground-truth labels.
            landmark_predict: Predicted landmark offsets.
            lmk_label: Ground-truth landmarks, encoded via ``self.decode_lmk``.
            lmk_ignore_flag: Per-ground-truth flag multiplied into the
                landmark loss weight.
            prior_box: Prior (anchor) boxes.
            prior_box_var: Optional prior box variances.

        Returns:
            The per-sample loss summed over priors (``keep_dim=True``),
            divided by ``sum(target_loc_weight) + 1`` when ``self.normalize``
            is set.
        """

        def _flatten_from_axis_2(var):
            return layers.flatten(x=var, axis=2)

        def _column_shape(var):
            # Keep every leading dimension (0) and fold the rest into (-1, 1).
            return (0, ) * (len(var.shape) - 1) + (-1, 1)

        op_helper = LayerHelper('ssd_loss')  #, **locals())
        # Unpacking requires the static shape to have exactly three entries.
        _num, _num_prior, _num_class = confidence.shape
        conf_shape = layers.shape(confidence)

        # 1. Match ground-truth boxes to prior boxes.
        # 1.1 IoU similarity between ground-truth boxes and prior boxes.
        iou = iou_similarity(x=gt_box, y=prior_box)
        # 1.2 Bipartite matching on the IoU matrix.
        matched_indices, matched_dist = bipartite_match(
            iou, self.match_type, self.overlap_threshold)

        # 2. Confidence loss used only for mining hard examples.
        # 2.1 Target labels from the initial match.
        label_col = layers.reshape(x=gt_label, shape=_column_shape(gt_label))
        label_col.stop_gradient = True
        mining_label, _ = target_assign(label_col,
                                        matched_indices,
                                        mismatch_value=self.background_label)
        # 2.2 Classification loss on the flattened predictions.
        conf_flat = _flatten_from_axis_2(confidence)
        mining_label = tensor.cast(x=mining_label, dtype='int64')
        mining_label = _flatten_from_axis_2(mining_label)
        mining_label.stop_gradient = True
        mining_loss = layers.softmax_with_cross_entropy(conf_flat,
                                                        mining_label)

        # 3. Mine hard examples.
        batch_prior_shape = layers.slice(conf_shape,
                                         axes=[0],
                                         starts=[0],
                                         ends=[2])
        batch_prior_shape.stop_gradient = True
        mining_loss = layers.reshape(x=mining_loss,
                                     shape=(-1, 0),
                                     actual_shape=batch_prior_shape)
        mining_loss.stop_gradient = True
        neg_indices = op_helper.create_variable_for_type_inference(
            dtype='int32')
        refined_match = op_helper.create_variable_for_type_inference(
            dtype=matched_indices.dtype)
        mining_inputs = {
            'ClsLoss': mining_loss,
            'LocLoss': None,
            'MatchIndices': matched_indices,
            'MatchDist': matched_dist,
        }
        mining_outputs = {
            'NegIndices': neg_indices,
            'UpdatedMatchIndices': refined_match
        }
        mining_attrs = {
            'neg_pos_ratio': self.neg_pos_ratio,
            'neg_dist_threshold': self.neg_overlap,
            # Only 'max_negative' mining is supported for now.
            'mining_type': 'max_negative',
            # Max number of negative boxes; only meaningful for
            # the `hard_example` mining type.
            'sample_size': None,
        }
        op_helper.append_op(type='mine_hard_examples',
                            inputs=mining_inputs,
                            outputs=mining_outputs,
                            attrs=mining_attrs)

        # 4. Assign classification, regression and landmark targets.
        # 4.1 Encode ground-truth boxes relative to the prior boxes.
        encoded_bbox = box_coder(prior_box=prior_box,
                                 prior_box_var=prior_box_var,
                                 target_box=gt_box,
                                 code_type='encode_center_size')
        # 4.2 Regression targets.
        bbox_target, bbox_weight = target_assign(
            encoded_bbox,
            refined_match,
            mismatch_value=self.background_label)
        # 4.3 Classification targets (including mined negatives).
        cls_target, cls_weight = target_assign(
            label_col,
            refined_match,
            negative_indices=neg_indices,
            mismatch_value=self.background_label)

        bbox_weight = bbox_weight * cls_target

        # 4.4 Landmark targets, masked by the ignore flag.
        encoded_lmk = self.decode_lmk(lmk_label, prior_box, prior_box_var)
        lmk_target, lmk_weight = target_assign(
            encoded_lmk,
            refined_match,
            mismatch_value=self.background_label)
        ignore_col = layers.reshape(x=lmk_ignore_flag,
                                    shape=_column_shape(lmk_ignore_flag))
        ignore_target, ignore_weight = target_assign(
            ignore_col,
            refined_match,
            mismatch_value=self.background_label)

        lmk_weight = lmk_weight * ignore_target
        lmk_pred_flat = _flatten_from_axis_2(landmark_predict)
        lmk_target = _flatten_from_axis_2(lmk_target)
        lmk_weight = _flatten_from_axis_2(lmk_weight)
        lmk_loss = layers.smooth_l1(lmk_pred_flat, lmk_target)
        lmk_loss = lmk_loss * lmk_weight
        # Targets and weights carry no gradient.
        lmk_target.stop_gradient = True
        lmk_weight.stop_gradient = True
        ignore_target.stop_gradient = True
        ignore_weight.stop_gradient = True

        # 5. Compute the loss.
        # 5.1 Confidence loss.
        cls_target = _flatten_from_axis_2(cls_target)
        cls_target = tensor.cast(x=cls_target, dtype='int64')

        conf_loss = layers.softmax_with_cross_entropy(conf_flat, cls_target)
        cls_weight = _flatten_from_axis_2(cls_weight)
        conf_loss = conf_loss * cls_weight

        # cls_target and cls_weight carry no gradient.
        cls_target.stop_gradient = True
        cls_weight.stop_gradient = True

        # 5.2 Regression loss.
        loc_flat = _flatten_from_axis_2(location)
        bbox_target = _flatten_from_axis_2(bbox_target)

        loc_loss = layers.smooth_l1(loc_flat, bbox_target)
        bbox_weight = _flatten_from_axis_2(bbox_weight)
        loc_loss = loc_loss * bbox_weight

        # bbox_target and bbox_weight carry no gradient.
        bbox_target.stop_gradient = True
        bbox_weight.stop_gradient = True

        # 5.3 Overall weighted loss.
        total = self.conf_loss_weight * conf_loss + self.loc_loss_weight * loc_loss + 0.4 * lmk_loss
        # Reshape to [N, Np]: N is the batch size, Np the number of priors.
        total = layers.reshape(x=total,
                               shape=(-1, 0),
                               actual_shape=batch_prior_shape)
        total = layers.reduce_sum(total, dim=1, keep_dim=True)
        if not self.normalize:
            return total

        normalizer = layers.reduce_sum(bbox_weight) + 1
        return total / normalizer