Example #1
    def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
        """
        Scaled Dot-Product Attention
        """

        # FIXME(guosheng): Optimize the shape in reshape_op or softmax_op.

        # The current implementation of softmax_op only supports 2-D tensors,
        # so it cannot be used directly here. Moreover, the shape of `product`
        # inferred at compile time is not its actual run-time shape, so it
        # cannot be used to set the shape attribute of reshape_op.
        # A softmax is therefore defined here as a temporary workaround.

        def __softmax(x, eps=1e-9):
            exp_out = layers.exp(x=x)
            sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
            return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)

        scaled_q = layers.scale(x=q, scale=d_model**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
        weights = __softmax(layers.elementwise_add(x=product, y=attn_bias))
        if dropout_rate:
            weights = layers.dropout(
                weights, dropout_prob=dropout_rate, is_test=False)
        out = layers.matmul(weights, v)
        return out
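The manual __softmax above is a plain exponentiate-and-normalize over the last axis. As a quick sanity check (NumPy only, not part of the original code), an equivalent sketch; the max subtraction is an extra stability step not present in the original:

    import numpy as np

    def softmax_last_axis(x):
        # same exp / sum formula as __softmax above; subtracting the row max
        # is added here purely for numerical stability
        e = np.exp(x - x.max(axis=-1, keepdims=True))
        return e / e.sum(axis=-1, keepdims=True)

    print(softmax_last_axis(np.array([[1.0, 2.0, 3.0]])))  # ~[[0.09 0.24 0.67]]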
Example #2
def _ernie_model_forward(self,
                         src_ids,
                         sent_ids=None,
                         pos_ids=None,
                         input_mask=None,
                         attn_bias=None,
                         past_cache=None,
                         use_causal_mask=False,
                         num_layers=12,
                         depth=1.,
                         head_mask=None):
    assert len(
        src_ids.shape
    ) == 2, 'expect src_ids.shape = [batch, sequence], got %s' % (repr(
        src_ids.shape))
    assert attn_bias is not None if past_cache else True, 'if `past_cache` is specified, attn_bias should not be None'
    d_batch = L.shape(src_ids)[0]
    d_seqlen = L.shape(src_ids)[1]
    if pos_ids is None:
        pos_ids = L.reshape(L.range(0, d_seqlen, 1, dtype='int32'), [1, -1])
        pos_ids = L.cast(pos_ids, 'int64')
    if attn_bias is None:
        if input_mask is None:
            input_mask = L.cast(src_ids != 0, 'float32')
        assert len(input_mask.shape) == 2
        input_mask = L.unsqueeze(input_mask, axes=[-1])
        attn_bias = L.matmul(input_mask, input_mask, transpose_y=True)
        if use_causal_mask:
            sequence = L.reshape(
                L.range(0, d_seqlen, 1, dtype='float32') + 1., [1, 1, -1, 1])
            causal_mask = L.cast(
                (L.matmul(sequence, 1. / sequence, transpose_y=True) >= 1.),
                'float32')
            attn_bias *= causal_mask
    else:
        assert len(
            attn_bias.shape
        ) == 3, 'expect attn_bias to be rank 3, got %r' % attn_bias.shape
    attn_bias = (1. - attn_bias) * -10000.0
    attn_bias = L.unsqueeze(attn_bias, [1])
    attn_bias.stop_gradient = True

    if sent_ids is None:
        sent_ids = L.zeros_like(src_ids)

    if head_mask is not None:
        if len(head_mask.shape) == 1:
            head_mask = L.unsqueeze(
                L.unsqueeze(L.unsqueeze(L.unsqueeze(head_mask, 0), 0), -1), -1)
            head_mask = L.expand(head_mask,
                                 expand_times=[num_layers, 1, 1, 1, 1])
        elif len(head_mask.shape) == 2:
            head_mask = L.unsqueeze(L.unsqueeze(L.unsqueeze(head_mask, 1), -1),
                                    -1)

    else:
        head_mask = [None] * num_layers

    src_embedded = self.word_emb(src_ids)
    pos_embedded = self.pos_emb(pos_ids)
    sent_embedded = self.sent_emb(sent_ids)
    embedded = src_embedded + pos_embedded + sent_embedded

    embedded = self.dropout(self.ln(embedded))

    encoded, hidden_list, cache_list = self.encoder_stack(
        embedded,
        attn_bias,
        past_cache=past_cache,
        num_layers=num_layers,
        depth_mult=depth,
        head_mask=head_mask)
    if self.pooler is not None:
        pooled = self.pooler(encoded[:, 0, :])
    else:
        pooled = None

    additional_info = {
        'hiddens': hidden_list,
        'caches': cache_list,
    }

    if self.return_additional_info:
        return pooled, encoded, additional_info
    else:
        return pooled, encoded
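In the use_causal_mask branch above, the lower-triangular mask comes from an outer product of position indices: with sequence = [1, 2, ..., T], the ratio sequence_i / sequence_j is >= 1 exactly when i >= j. A standalone NumPy illustration of the same trick (not part of the model code):

    import numpy as np

    T = 4
    sequence = (np.arange(T, dtype='float32') + 1.).reshape(-1, 1)   # [T, 1], values 1..T
    causal_mask = (sequence @ (1. / sequence).T >= 1.).astype('float32')
    print(causal_mask)
    # [[1. 0. 0. 0.]
    #  [1. 1. 0. 0.]
    #  [1. 1. 1. 0.]
    #  [1. 1. 1. 1.]]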
Example #3
    def scaled_dot_product_attention(q, k, v, attn_bias,
                                     biaffine_transformation,
                                     biaffine_transformation_bias,
                                     structure_mask, with_ent_structure, d_key,
                                     dropout_rate):
        """
        Scaled Dot-Product Attention
        """
        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)

        if with_ent_structure:
            # TRANSFORMATION
            # 1. reshape the inputs
            # q: [bs, n_head, seq, hidden] -> [bs, 1, n_head, seq, hidden] -> [bs, 5, n_head, seq, hidden]
            # -> [5, n_head, bs, seq, hidden] -> [5, n_head, bs * seq, hidden]
            # transformation: [dependencies(5), n_head, hidden, hidden]
            # k: [bs, n_head, seq, hidden] -> [bs, 1, n_head, seq, hidden]
            q_ = layers.unsqueeze(scaled_q, [1])
            q_ = layers.expand(q_,
                               [1, biaffine_transformation.shape[0], 1, 1, 1])
            q_ = layers.transpose(q_, perm=[1, 2, 0, 3, 4])
            q_ = layers.reshape(
                q_,
                shape=[0, 0, -1, biaffine_transformation.shape[3]],
                inplace=True)
            k_ = layers.unsqueeze(k, [1])
            k_ = layers.expand(k_,
                               [1, biaffine_transformation.shape[0], 1, 1, 1])

            # 2. apply the matmuls
            # q * transformation: [5, n_head, bs * seq, hidden]
            # q * transformation: [5, n_head, bs * seq, hidden] -> [5, n_head, bs, seq, hidden]
            # -> [bs, dependencies(5), n_head, seq, hidden]
            # q * transformation * k: [bs, dependencies(5), n_head, seq, seq]
            structured_bias = layers.matmul(x=q_, y=biaffine_transformation)
            structured_bias = layers.reshape(
                structured_bias,
                shape=[0, 0, -1, k_.shape[3], k_.shape[4]],
                inplace=True)
            structured_bias = layers.transpose(structured_bias,
                                               perm=[2, 0, 1, 3, 4])
            structured_bias = layers.matmul(x=structured_bias,
                                            y=k_,
                                            transpose_y=True)

            structured_bias = layers.elementwise_add(
                structured_bias, biaffine_transformation_bias, axis=1)

            # mask & apply
            structured_bias = structured_bias * structure_mask
            structured_bias = layers.reduce_sum(structured_bias, dim=1)
            product += structured_bias

        if attn_bias:
            product += attn_bias
        weights = layers.softmax(product)
        if dropout_rate:
            weights = layers.dropout(weights,
                                     dropout_prob=dropout_rate,
                                     dropout_implementation="upscale_in_train",
                                     is_test=False)
        out = layers.matmul(weights, v)
        return out
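The reshape/transpose sequence in the with_ent_structure branch computes, for every dependency d and head n, the bilinear score q W[d, n] k^T. An unoptimized einsum equivalent, shown only to make the shapes explicit (NumPy, random data):

    import numpy as np

    bs, n_head, seq, hidden, deps = 2, 4, 8, 16, 5
    q = np.random.rand(bs, n_head, seq, hidden).astype('float32')
    k = np.random.rand(bs, n_head, seq, hidden).astype('float32')
    W = np.random.rand(deps, n_head, hidden, hidden).astype('float32')  # biaffine_transformation

    # structured_bias[b, d, n, i, j] = q[b, n, i] @ W[d, n] @ k[b, n, j]
    structured_bias = np.einsum('bnih,dnhg,bnjg->bdnij', q, W, k)
    print(structured_bias.shape)  # (2, 5, 4, 8, 8) == [bs, dependencies, n_head, seq, seq]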
Example #4
    def forward(self):
        src, dsts = L.read_file(self.pyreader)
        if self.is_sparse:
            # sparse mode uses 2-D input.
            src = L.reshape(src, [-1, 1])
            dsts = L.reshape(dsts, [-1, 1])

        if self.num_part is not None and self.num_part != 1 and not self.is_distributed:
            src_embed = split_embedding(src,
                                        self.num_nodes,
                                        self.hidden_size,
                                        self.embed_init,
                                        "weight",
                                        self.num_part,
                                        self.is_sparse,
                                        learning_rate=self.embedding_lr)

            dsts_embed = split_embedding(dsts,
                                         self.num_nodes,
                                         self.hidden_size,
                                         self.embed_init,
                                         "weight",
                                         self.num_part,
                                         self.is_sparse,
                                         learning_rate=self.embedding_lr)
        else:
            src_embed = L.embedding(src, (self.num_nodes, self.hidden_size),
                                    self.is_sparse,
                                    self.is_distributed,
                                    param_attr=F.ParamAttr(
                                        name="weight",
                                        learning_rate=self.embedding_lr,
                                        initializer=self.embed_init))
            dsts_embed = L.embedding(dsts, (self.num_nodes, self.hidden_size),
                                     self.is_sparse,
                                     self.is_distributed,
                                     param_attr=F.ParamAttr(
                                         name="weight",
                                         learning_rate=self.embedding_lr,
                                         initializer=self.embed_init))

        if self.is_sparse:
            # reshape back
            src_embed = L.reshape(src_embed, [-1, 1, self.hidden_size])
            dsts_embed = L.reshape(dsts_embed,
                                   [-1, self.neg_num + 1, self.hidden_size])
        logits = L.matmul(src_embed, dsts_embed,
                          transpose_y=True)  # [batch_size, 1, neg_num+1]

        pos_label = L.fill_constant_batch_size_like(logits, [-1, 1, 1],
                                                    "float32", 1)
        neg_label = L.fill_constant_batch_size_like(logits,
                                                    [-1, 1, self.neg_num],
                                                    "float32", 0)
        label = L.concat([pos_label, neg_label], -1)

        pos_weight = L.fill_constant_batch_size_like(logits, [-1, 1, 1],
                                                     "float32", self.neg_num)
        neg_weight = L.fill_constant_batch_size_like(logits,
                                                     [-1, 1, self.neg_num],
                                                     "float32", 1)
        weight = L.concat([pos_weight, neg_weight], -1)

        weight.stop_gradient = True
        label.stop_gradient = True

        loss = L.sigmoid_cross_entropy_with_logits(logits, label)
        loss = loss * weight
        loss = L.reduce_mean(loss)
        loss = loss * ((self.neg_num + 1) / 2 / self.neg_num)
        loss.persistable = True
        self.loss = loss
        return loss
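The labels and weights above score one positive destination against neg_num negatives per source node: the positive column gets label 1 with weight neg_num, each negative gets label 0 with weight 1, and the trailing (neg_num + 1) / 2 / neg_num factor turns the plain mean into a weight-normalized mean. A NumPy sketch of the same weighted sigmoid cross-entropy (illustrative shapes and random logits, not the original graph code):

    import numpy as np

    batch, neg_num = 3, 5
    logits = np.random.randn(batch, 1, neg_num + 1)   # [batch, 1, neg_num+1]
    label = np.concatenate([np.ones((batch, 1, 1)), np.zeros((batch, 1, neg_num))], -1)
    weight = np.concatenate([np.full((batch, 1, 1), float(neg_num)), np.ones((batch, 1, neg_num))], -1)

    # numerically stable sigmoid cross-entropy with logits, then weighted mean
    ce = np.maximum(logits, 0) - logits * label + np.log1p(np.exp(-np.abs(logits)))
    loss = (ce * weight).mean() * ((neg_num + 1) / 2 / neg_num)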
Example #5
    def forward(self, src_ids, sent_ids=None, pos_ids=None, input_mask=None, attn_bias=None, past_cache=None,
                use_causal_mask=False):
        """
        Args:
            src_ids (`Variable` of shape `[batch_size, seq_len]`): 
                Indices of input sequence tokens in the vocabulary.
            sent_ids (optional, `Variable` of shape `[batch_size, seq_len]`): 
                aka token_type_ids, Segment token indices to indicate first and second portions of the inputs.
                if None, assume all tokens come from `segment_a`
            pos_ids(optional, `Variable` of shape `[batch_size, seq_len]`): 
                Indices of positions of each input sequence tokens in the position embeddings.
            input_mask(optional `Variable` of shape `[batch_size, seq_len]`): 
                Mask to avoid performing attention on the padding token indices of the encoder input.
            attn_bias(optional, `Variable` of shape `[batch_size, seq_len, seq_len]` or `False`):
                3D version of `input_mask`; if set, it overrides `input_mask`; if set to `False`, no attention mask is applied
            past_cache(optional, tuple of two lists: cached key and cached value,
                each is a list of `Variable`s of shape `[batch_size, seq_len, hidden_size]`):
                cached key/value tensors that will be concatenated to the generated key/value when performing self attention.
                if set, `attn_bias` should not be None.

        Returns:
            pooled (`Variable` of shape `[batch_size, hidden_size]`):
                output logits of pooler classifier
            encoded(`Variable` of shape `[batch_size, seq_len, hidden_size]`):
                output logits of transformer stack
            info (Dictionary):
                additional intermediate info, includes: all hidden states and k/v caches.
        """
        # d_batch, d_seqlen = src_ids.shape
        assert len(src_ids.shape) == 2, 'expect src_ids.shape = [batch, sequence], got %s' % (repr(src_ids.shape))
        assert attn_bias is not None if past_cache else True, 'if `past_cache` is specified, attn_bias should not be None'
        d_batch = L.shape(src_ids)[0]
        d_seqlen = L.shape(src_ids)[1]
        if pos_ids is None:
            pos_ids = L.reshape(L.range(0, d_seqlen, 1, dtype='int32'), [1, -1])
            pos_ids = L.cast(pos_ids, 'int64')
        if attn_bias is None:
            if input_mask is None:
                input_mask = L.cast(src_ids != 0, 'float32')
            assert len(input_mask.shape) == 2
            input_mask = L.unsqueeze(input_mask, axes=[-1])
            attn_bias = L.matmul(input_mask, input_mask, transpose_y=True)
            if use_causal_mask:
                sequence = L.reshape(L.range(0, d_seqlen, 1, dtype='float32') + 1., [1, 1, -1, 1])
                causal_mask = L.cast((L.matmul(sequence, 1. / sequence, transpose_y=True) >= 1.), 'float32')
                attn_bias *= causal_mask
        else:
            assert len(attn_bias.shape) == 3, 'expect attn_bias to be rank 3, got %r' % attn_bias.shape
        attn_bias = (1. - attn_bias) * -10000.0
        attn_bias = L.unsqueeze(attn_bias, [1])
        attn_bias = L.expand(attn_bias, [1, self.n_head, 1, 1])  # avoid broadcast =_=
        attn_bias.stop_gradient = True

        if sent_ids is None:
            sent_ids = L.zeros_like(src_ids)

        src_embedded = self.word_emb(src_ids)
        pos_embedded = self.pos_emb(pos_ids)
        sent_embedded = self.sent_emb(sent_ids)
        embedded = src_embedded + pos_embedded + sent_embedded

        embedded = self.dropout(self.ln(embedded))

        encoded, hidden_list, cache_list = self.encoder_stack(embedded, attn_bias, past_cache=past_cache)
        if self.pooler is not None:
            pooled = self.pooler(encoded[:, 0, :])
        else:
            pooled = None

        additional_info = {
            'hiddens': hidden_list,
            'caches': cache_list,
        }

        if self.return_additional_info:
            return pooled, encoded, additional_info
        else:
            return pooled, encoded
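A minimal usage sketch for the forward signature documented above; `model`, `src_ids`, and `sent_ids` are hypothetical names, assuming an initialized ERNIE model instance and int64 Variables of shape [batch_size, seq_len]:

    # only src_ids is required; pos_ids, sent_ids, input_mask and attn_bias default sensibly
    pooled, encoded = model(src_ids, sent_ids=sent_ids)
    # if return_additional_info is enabled on the model, a third dict with
    # 'hiddens' and 'caches' is also returned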
Example #6
    def _build_decoder(self,
                       enc_last_hidden,
                       enc_last_cell,
                       mode='train',
                       beam_size=10):
        softmax_weight = layers.create_parameter([self.hidden_size, self.tar_vocab_size], dtype="float32", name="softmax_weight", \
                    default_initializer=fluid.initializer.UniformInitializer(low=-self.init_scale, high=self.init_scale))
        if mode == 'train':
            dec_output, dec_last_hidden, dec_last_cell = basic_lstm( self.tar_emb, enc_last_hidden, enc_last_cell, \
                    self.hidden_size, num_layers=self.num_layers, \
                    batch_first=self.batch_first, \
                    dropout_prob=self.dropout, \
                    param_attr = ParamAttr( initializer=fluid.initializer.UniformInitializer(low=-self.init_scale, high=self.init_scale) ), \
                    bias_attr = ParamAttr( initializer = fluid.initializer.Constant(0.0) ))

            dec_output = layers.matmul(dec_output, softmax_weight)

            return dec_output
        elif mode == 'beam_search' or mode == 'greedy_search':
            dec_unit_list = []
            name = 'basic_lstm'
            for i in range(self.num_layers):
                new_name = name + "_layers_" + str(i)
                dec_unit_list.append(
                    BasicLSTMUnit(new_name, self.hidden_size, dtype='float32'))

            def decoder_step(current_in, pre_hidden_array, pre_cell_array):
                new_hidden_array = []
                new_cell_array = []

                step_in = current_in
                for i in range(self.num_layers):
                    pre_hidden = pre_hidden_array[i]
                    pre_cell = pre_cell_array[i]

                    new_hidden, new_cell = dec_unit_list[i](step_in,
                                                            pre_hidden,
                                                            pre_cell)

                    new_hidden_array.append(new_hidden)
                    new_cell_array.append(new_cell)

                    step_in = new_hidden

                return step_in, new_hidden_array, new_cell_array

            if mode == 'beam_search':
                max_src_seq_len = layers.shape(self.src)[1]
                max_length = max_src_seq_len * 2
                #max_length = layers.fill_constant( [1], dtype='int32', value = 10)
                pre_ids = layers.fill_constant([1, 1], dtype='int64', value=1)
                full_ids = layers.fill_constant([1, 1], dtype='int64', value=1)

                score = layers.fill_constant([1], dtype='float32', value=0.0)

                #eos_ids = layers.fill_constant( [1, 1], dtype='int64', value=2)

                pre_hidden_array = []
                pre_cell_array = []
                pre_feed = layers.fill_constant([beam_size, self.hidden_size],
                                                dtype='float32',
                                                value=0)
                for i in range(self.num_layers):
                    pre_hidden_array.append(
                        layers.expand(enc_last_hidden[i], [beam_size, 1]))
                    pre_cell_array.append(
                        layers.expand(enc_last_cell[i], [beam_size, 1]))

                eos_ids = layers.fill_constant([beam_size],
                                               dtype='int64',
                                               value=2)
                init_score = np.zeros((beam_size)).astype('float32')
                init_score[1:] = -INF
                pre_score = layers.assign(init_score)
                #pre_score = layers.fill_constant( [1,], dtype='float32', value= 0.0)
                tokens = layers.fill_constant([beam_size, 1],
                                              dtype='int64',
                                              value=1)

                enc_memory = layers.expand(self.enc_output, [beam_size, 1, 1])

                pre_tokens = layers.fill_constant([beam_size, 1],
                                                  dtype='int64',
                                                  value=1)

                finished_seq = layers.fill_constant([beam_size, 1],
                                                    dtype='int64',
                                                    value=0)
                finished_scores = layers.fill_constant([beam_size],
                                                       dtype='float32',
                                                       value=-INF)
                finished_flag = layers.fill_constant([beam_size],
                                                     dtype='float32',
                                                     value=0.0)

                step_idx = layers.fill_constant(shape=[1],
                                                dtype='int32',
                                                value=0)
                cond = layers.less_than(x=step_idx,
                                        y=max_length)  # default force_cpu=True

                parent_idx = layers.fill_constant([1], dtype='int32', value=0)
                while_op = layers.While(cond)

                def compute_topk_scores_and_seq(sequences,
                                                scores,
                                                scores_to_gather,
                                                flags,
                                                beam_size,
                                                select_beam=None,
                                                generate_id=None):
                    scores = layers.reshape(scores, shape=[1, -1])
                    _, topk_indexs = layers.topk(scores, k=beam_size)

                    topk_indexs = layers.reshape(topk_indexs, shape=[-1])

                    # gather result

                    top_seq = layers.gather(sequences, topk_indexs)
                    topk_flags = layers.gather(flags, topk_indexs)
                    topk_gather_scores = layers.gather(scores_to_gather,
                                                       topk_indexs)

                    if select_beam:
                        topk_beam = layers.gather(select_beam, topk_indexs)
                    else:
                        topk_beam = select_beam

                    if generate_id:
                        topk_id = layers.gather(generate_id, topk_indexs)
                    else:
                        topk_id = generate_id
                    return top_seq, topk_gather_scores, topk_flags, topk_beam, topk_id

                def grow_alive(curr_seq, curr_scores, curr_log_probs,
                               curr_finished, select_beam, generate_id):
                    curr_scores += curr_finished * -INF
                    return compute_topk_scores_and_seq(curr_seq,
                                                       curr_scores,
                                                       curr_log_probs,
                                                       curr_finished,
                                                       beam_size,
                                                       select_beam,
                                                       generate_id=generate_id)

                def grow_finished(finished_seq, finished_scores, finished_flag,
                                  curr_seq, curr_scores, curr_finished):
                    finished_seq = layers.concat([
                        finished_seq,
                        layers.fill_constant(
                            [beam_size, 1], dtype='int64', value=1)
                    ],
                                                 axis=1)
                    curr_scores += (1.0 - curr_finished) * -INF
                    #layers.Print( curr_scores, message="curr scores")
                    curr_finished_seq = layers.concat([finished_seq, curr_seq],
                                                      axis=0)
                    curr_finished_scores = layers.concat(
                        [finished_scores, curr_scores], axis=0)
                    curr_finished_flags = layers.concat(
                        [finished_flag, curr_finished], axis=0)

                    return compute_topk_scores_and_seq(curr_finished_seq,
                                                       curr_finished_scores,
                                                       curr_finished_scores,
                                                       curr_finished_flags,
                                                       beam_size)

                def is_finished(alive_log_prob, finished_scores,
                                finished_in_finished):

                    max_out_len = 200
                    max_length_penalty = layers.pow(
                        layers.fill_constant([1],
                                             dtype='float32',
                                             value=((5.0 + max_out_len) /
                                                    6.0)), alpha)

                    lower_bound_alive_score = layers.slice(
                        alive_log_prob, starts=[0], ends=[1],
                        axes=[0]) / max_length_penalty

                    lowest_score_of_finished_in_finished = finished_scores * finished_in_finished
                    lowest_score_of_finished_in_finished += (
                        1.0 - finished_in_finished) * -INF
                    lowest_score_of_finished_in_finished = layers.reduce_min(
                        lowest_score_of_finished_in_finished)

                    met = layers.less_than(
                        lower_bound_alive_score,
                        lowest_score_of_finished_in_finished)
                    met = layers.cast(met, 'float32')
                    bound_is_met = layers.reduce_sum(met)

                    finished_eos_num = layers.reduce_sum(finished_in_finished)

                    finish_cond = layers.less_than(
                        finished_eos_num,
                        layers.fill_constant([1],
                                             dtype='float32',
                                             value=beam_size))

                    return finish_cond

                def grow_top_k(step_idx, alive_seq, alive_log_prob,
                               parent_idx):
                    pre_ids = alive_seq

                    dec_step_emb = layers.embedding(
                        input=pre_ids,
                        size=[self.tar_vocab_size, self.hidden_size],
                        dtype='float32',
                        is_sparse=False,
                        param_attr=fluid.ParamAttr(
                            name='target_embedding',
                            initializer=fluid.initializer.UniformInitializer(
                                low=-self.init_scale, high=self.init_scale)))

                    dec_att_out, new_hidden_array, new_cell_array = decoder_step(
                        dec_step_emb, pre_hidden_array, pre_cell_array)

                    projection = layers.matmul(dec_att_out, softmax_weight)

                    logits = layers.softmax(projection)
                    current_log = layers.elementwise_add(x=layers.log(logits),
                                                         y=alive_log_prob,
                                                         axis=0)
                    base_1 = layers.cast(step_idx, 'float32') + 6.0
                    base_1 /= 6.0
                    length_penalty = layers.pow(base_1, alpha)

                    len_pen = layers.pow(
                        ((5. + layers.cast(step_idx + 1, 'float32')) / 6.),
                        alpha)

                    current_log = layers.reshape(current_log, shape=[1, -1])

                    current_log = current_log / length_penalty
                    topk_scores, topk_indices = layers.topk(input=current_log,
                                                            k=beam_size)

                    topk_scores = layers.reshape(topk_scores, shape=[-1])

                    topk_log_probs = topk_scores * length_penalty

                    generate_id = layers.reshape(
                        topk_indices, shape=[-1]) % self.tar_vocab_size

                    selected_beam = layers.reshape(
                        topk_indices, shape=[-1]) // self.tar_vocab_size

                    topk_finished = layers.equal(generate_id, eos_ids)

                    topk_finished = layers.cast(topk_finished, 'float32')

                    generate_id = layers.reshape(generate_id, shape=[-1, 1])

                    pre_tokens_list = layers.gather(tokens, selected_beam)

                    full_tokens_list = layers.concat(
                        [pre_tokens_list, generate_id], axis=1)


                    return full_tokens_list, topk_log_probs, topk_scores, topk_finished, selected_beam, generate_id, \
                            dec_att_out, new_hidden_array, new_cell_array

                with while_op.block():
                    topk_seq, topk_log_probs, topk_scores, topk_finished, topk_beam, topk_generate_id, attention_out, new_hidden_array, new_cell_array = \
                        grow_top_k(  step_idx, pre_tokens, pre_score, parent_idx)
                    alive_seq, alive_log_prob, _, alive_beam, alive_id = grow_alive(
                        topk_seq, topk_scores, topk_log_probs, topk_finished,
                        topk_beam, topk_generate_id)

                    finished_seq_2, finished_scores_2, finished_flags_2, _, _ = grow_finished(
                        finished_seq, finished_scores, finished_flag, topk_seq,
                        topk_scores, topk_finished)

                    finished_cond = is_finished(alive_log_prob,
                                                finished_scores_2,
                                                finished_flags_2)

                    layers.increment(x=step_idx, value=1.0, in_place=True)

                    layers.assign(alive_beam, parent_idx)
                    layers.assign(alive_id, pre_tokens)
                    layers.assign(alive_log_prob, pre_score)
                    layers.assign(alive_seq, tokens)
                    layers.assign(finished_seq_2, finished_seq)
                    layers.assign(finished_scores_2, finished_scores)
                    layers.assign(finished_flags_2, finished_flag)

                    # update init_hidden, init_cell, input_feed
                    new_feed = layers.gather(attention_out, parent_idx)
                    layers.assign(new_feed, pre_feed)
                    for i in range(self.num_layers):
                        new_hidden_var = layers.gather(new_hidden_array[i],
                                                       parent_idx)
                        layers.assign(new_hidden_var, pre_hidden_array[i])
                        new_cell_var = layers.gather(new_cell_array[i],
                                                     parent_idx)
                        layers.assign(new_cell_var, pre_cell_array[i])

                    length_cond = layers.less_than(x=step_idx, y=max_length)
                    layers.logical_and(x=length_cond,
                                       y=finished_cond,
                                       out=cond)

                tokens_with_eos = tokens

                all_seq = layers.concat([tokens_with_eos, finished_seq],
                                        axis=0)
                all_score = layers.concat([pre_score, finished_scores], axis=0)
                _, topk_index = layers.topk(all_score, k=beam_size)
                topk_index = layers.reshape(topk_index, shape=[-1])
                final_seq = layers.gather(all_seq, topk_index)
                final_score = layers.gather(all_score, topk_index)

                return final_seq
            elif mode == 'greedy_search':
                max_src_seq_len = layers.shape(self.src)[1]
                max_length = max_src_seq_len * 2
                #max_length = layers.fill_constant( [1], dtype='int32', value = 10)
                pre_ids = layers.fill_constant([1, 1], dtype='int64', value=1)
                full_ids = layers.fill_constant([1, 1], dtype='int64', value=1)

                score = layers.fill_constant([1], dtype='float32', value=0.0)

                eos_ids = layers.fill_constant([1, 1], dtype='int64', value=2)

                pre_hidden_array = []
                pre_cell_array = []
                pre_feed = layers.fill_constant([1, self.hidden_size],
                                                dtype='float32',
                                                value=0)
                for i in range(self.num_layers):
                    pre_hidden_array.append(enc_last_hidden[i])
                    pre_cell_array.append(enc_last_cell[i])
                    #pre_hidden_array.append( layers.fill_constant( [1, hidden_size], dtype='float32', value=0)  )
                    #pre_cell_array.append( layers.fill_constant( [1, hidden_size], dtype='float32', value=0) )

                step_idx = layers.fill_constant(shape=[1],
                                                dtype='int32',
                                                value=0)
                cond = layers.less_than(x=step_idx,
                                        y=max_length)  # default force_cpu=True
                while_op = layers.While(cond)

                with while_op.block():

                    dec_step_emb = layers.embedding(
                        input=pre_ids,
                        size=[self.tar_vocab_size, self.hidden_size],
                        dtype='float32',
                        is_sparse=False,
                        param_attr=fluid.ParamAttr(
                            name='target_embedding',
                            initializer=fluid.initializer.UniformInitializer(
                                low=-self.init_scale, high=self.init_scale)))

                    dec_att_out, new_hidden_array, new_cell_array = decoder_step(
                        dec_step_emb, pre_hidden_array, pre_cell_array)

                    projection = layers.matmul(dec_att_out, softmax_weight)

                    logits = layers.softmax(projection)
                    logits = layers.log(logits)

                    current_log = layers.elementwise_add(logits, score, axis=0)

                    topk_score, topk_indices = layers.topk(input=current_log,
                                                           k=1)

                    new_ids = layers.concat([full_ids, topk_indices])
                    layers.assign(new_ids, full_ids)
                    #layers.Print( full_ids, message="ful ids")
                    layers.assign(topk_score, score)
                    layers.assign(topk_indices, pre_ids)
                    layers.assign(dec_att_out, pre_feed)
                    for i in range(self.num_layers):
                        layers.assign(new_hidden_array[i], pre_hidden_array[i])
                        layers.assign(new_cell_array[i], pre_cell_array[i])

                    layers.increment(x=step_idx, value=1.0, in_place=True)

                    eos_met = layers.not_equal(topk_indices, eos_ids)
                    length_cond = layers.less_than(x=step_idx, y=max_length)
                    layers.logical_and(x=length_cond, y=eos_met, out=cond)

                return full_ids

            raise Exception("error")
        else:
            print("mode not supported:", mode)
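Both grow_top_k and is_finished above normalize beam scores with the GNMT-style length penalty ((5 + length) / 6) ** alpha, where length is step_idx + 1 and alpha is defined elsewhere in the original code. A standalone sketch of that formula (the default alpha below is illustrative only):

    def gnmt_length_penalty(length, alpha=0.6):
        # longer hypotheses are divided by a larger factor:
        #     normalized_score = log_prob / gnmt_length_penalty(length, alpha)
        return ((5.0 + length) / 6.0) ** alpha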
Example #7
    def net(self, inputs, is_infer=False):
        if is_infer:
            bs = self.evaluate_batch_size
        else:
            bs = self.train_batch_size

        stdv = 1.0 / math.sqrt(self.hidden_size)

        def embedding_layer(input,
                            table_name,
                            emb_dim,
                            initializer_instance=None):
            emb = fluid.embedding(input=input,
                                  size=[self.dict_size, emb_dim],
                                  param_attr=fluid.ParamAttr(
                                      name=table_name,
                                      initializer=initializer_instance))
            return emb

        sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv)
        items_emb = embedding_layer(inputs[0], "emb", self.hidden_size,
                                    sparse_initializer)
        pre_state = items_emb
        for i in range(self.step):
            pre_state = layers.reshape(x=pre_state,
                                       shape=[bs, -1, self.hidden_size])
            state_in = layers.fc(
                input=pre_state,
                name="state_in",
                size=self.hidden_size,
                act=None,
                num_flatten_dims=2,
                param_attr=fluid.ParamAttr(initializer=fluid.initializer.
                                           Uniform(low=-stdv, high=stdv)),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]
            state_out = layers.fc(
                input=pre_state,
                name="state_out",
                size=self.hidden_size,
                act=None,
                num_flatten_dims=2,
                param_attr=fluid.ParamAttr(initializer=fluid.initializer.
                                           Uniform(low=-stdv, high=stdv)),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]

            state_adj_in = layers.matmul(inputs[3],
                                         state_in)  # [batch_size, uniq_max, h]
            state_adj_out = layers.matmul(
                inputs[4], state_out)  # [batch_size, uniq_max, h]

            gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)

            gru_input = layers.reshape(x=gru_input,
                                       shape=[-1, self.hidden_size * 2])
            gru_fc = layers.fc(input=gru_input,
                               name="gru_fc",
                               size=3 * self.hidden_size,
                               bias_attr=False)
            pre_state, _, _ = fluid.layers.gru_unit(
                input=gru_fc,
                hidden=layers.reshape(x=pre_state,
                                      shape=[-1, self.hidden_size]),
                size=3 * self.hidden_size)

        final_state = layers.reshape(pre_state,
                                     shape=[bs, -1, self.hidden_size])
        seq = layers.gather_nd(final_state, inputs[1])
        last = layers.gather_nd(final_state, inputs[2])

        seq_fc = layers.fc(
            input=seq,
            name="seq_fc",
            size=self.hidden_size,
            bias_attr=False,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, seq_max, h]
        last_fc = layers.fc(
            input=last,
            name="last_fc",
            size=self.hidden_size,
            bias_attr=False,
            act=None,
            num_flatten_dims=1,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, h]

        seq_fc_t = layers.transpose(seq_fc,
                                    perm=[1, 0, 2])  # [seq_max, batch_size, h]
        add = layers.elementwise_add(seq_fc_t,
                                     last_fc)  # [seq_max, batch_size, h]
        b = layers.create_parameter(
            shape=[self.hidden_size],
            dtype='float32',
            default_initializer=fluid.initializer.Constant(value=0.0))  # [h]
        add = layers.elementwise_add(add, b)  # [seq_max, batch_size, h]

        add_sigmoid = layers.sigmoid(add)  # [seq_max, batch_size, h]
        add_sigmoid = layers.transpose(add_sigmoid,
                                       perm=[1, 0,
                                             2])  # [batch_size, seq_max, h]

        weight = layers.fc(
            input=add_sigmoid,
            name="weight_fc",
            size=1,
            act=None,
            num_flatten_dims=2,
            bias_attr=False,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, seq_max, 1]
        weight *= inputs[5]
        weight_mask = layers.elementwise_mul(
            seq, weight, axis=0)  # [batch_size, seq_max, h]
        global_attention = layers.reduce_sum(weight_mask,
                                             dim=1)  # [batch_size, h]

        final_attention = layers.concat([global_attention, last],
                                        axis=1)  # [batch_size, 2*h]
        final_attention_fc = layers.fc(
            input=final_attention,
            name="final_attention_fc",
            size=self.hidden_size,
            bias_attr=False,
            act=None,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, h]

        # all_vocab = layers.create_global_var(
        #     shape=[items_num - 1],
        #     value=0,
        #     dtype="int64",
        #     persistable=True,
        #     name="all_vocab")
        all_vocab = np.arange(1, self.dict_size).reshape((-1)).astype('int32')
        all_vocab = fluid.layers.cast(x=fluid.layers.assign(all_vocab),
                                      dtype='int64')

        all_emb = fluid.embedding(
            input=all_vocab,
            param_attr=fluid.ParamAttr(name="emb",
                                       initializer=fluid.initializer.Uniform(
                                           low=-stdv, high=stdv)),
            size=[self.dict_size, self.hidden_size])  # [all_vocab, h]

        logits = layers.matmul(x=final_attention_fc,
                               y=all_emb,
                               transpose_y=True)  # [batch_size, all_vocab]
        softmax = layers.softmax_with_cross_entropy(
            logits=logits, label=inputs[6])  # [batch_size, 1]
        self.loss = layers.reduce_mean(softmax)  # [1]
        acc = RecallK(input=logits, label=inputs[6], k=20)
        self._cost = self.loss

        if is_infer:
            self._infer_results['P@20'] = acc
            self._infer_results['LOSS'] = self.loss
            return

        self._metrics["LOSS"] = self.loss
        self._metrics["Train_P@20"] = acc
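The readout above is a soft attention over all session states conditioned on the last-clicked state: alpha_i = w^T sigmoid(W1 seq_i + W2 last + b), followed by a weighted sum. A compact NumPy sketch of that computation (random weights; the bias and the padding mask inputs[5] are omitted for brevity):

    import numpy as np

    bs, seq_max, h = 2, 6, 8
    seq = np.random.rand(bs, seq_max, h)            # per-step hidden states
    last = np.random.rand(bs, h)                    # last-click hidden state
    W1, W2, w = np.random.rand(h, h), np.random.rand(h, h), np.random.rand(h, 1)

    sigmoid = lambda x: 1. / (1. + np.exp(-x))
    alpha = sigmoid(seq @ W1 + (last @ W2)[:, None, :]) @ w   # [bs, seq_max, 1]
    global_attention = (alpha * seq).sum(axis=1)              # [bs, h]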
Example #8
    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter([hidden_size * 2, hidden_size*4], dtype="float32", name="fc_weight1_"+str(i), \
                    default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(init_hidden,
                                      axes=[0],
                                      starts=[i],
                                      ends=[i + 1])
            pre_cell = layers.slice(init_cell,
                                    axes=[0],
                                    starts=[i],
                                    ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
        rnn = PaddingRNN()

        with rnn.step():
            input = rnn.step_input(input_embedding)
            for k in range(num_layers):
                pre_hidden = rnn.memory(init=hidden_array[k])
                pre_cell = rnn.memory(init=cell_array[k])
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                #i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
                i = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[0],
                                 ends=[hidden_size])
                j = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size],
                                 ends=[hidden_size * 2])
                f = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size * 2],
                                 ends=[hidden_size * 3])
                o = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size * 3],
                                 ends=[hidden_size * 4])

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                rnn.update_memory(pre_hidden, m)
                rnn.update_memory(pre_cell, c)

                rnn.step_output(m)
                rnn.step_output(c)

                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            rnn.step_output(input)
        #real_res = layers.concat(res, 0)
        rnnout = rnn()

        last_hidden_array = []
        last_cell_array = []
        real_res = rnnout[-1]
        for i in range(num_layers):
            m = rnnout[i * 2]
            c = rnnout[i * 2 + 1]
            m.stop_gradient = True
            c.stop_gradient = True
            last_h = layers.slice(m,
                                  axes=[0],
                                  starts=[num_steps - 1],
                                  ends=[num_steps])
            last_hidden_array.append(last_h)
            last_c = layers.slice(c,
                                  axes=[0],
                                  starts=[num_steps - 1],
                                  ends=[num_steps])
            last_cell_array.append(last_c)
        '''
        else:
            real_res = rnnout[-1]
            for i in range( num_layers ):

            m1, c1, m2, c2 = rnnout
            real_res = m2
            m1.stop_gradient = True
            c1.stop_gradient = True
            c2.stop_gradient = True
        '''

        #layers.Print( first_hidden, message="22", summarize=10)
        #layers.Print( rnnout[1], message="11", summarize=10)
        #real_res = ( rnnout[1] + rnnout[2] + rnnout[3] + rnnout[4]) / 4.0
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
        last_hidden = layers.concat(last_hidden_array, 0)
        last_cell = layers.concat(last_cell_array, 0)
        '''
        last_hidden = layers.concat( hidden_array, 1 )
        last_hidden = layers.reshape( last_hidden, shape=[-1, num_layers, hidden_size])
        last_hidden = layers.transpose( x = last_hidden, perm = [1, 0, 2])
        last_cell = layers.concat( cell_array, 1)
        last_cell = layers.reshape( last_cell, shape=[ -1, num_layers, hidden_size])
        last_cell = layers.transpose( x = last_cell, perm = [1, 0, 2])
        '''

        return real_res, last_hidden, last_cell
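Inside rnn.step() above, the fused gate projection is sliced into i, j, f, o and the standard LSTM cell update is applied. A self-contained NumPy sketch of one such step (names follow the variables in the code):

    import numpy as np

    def lstm_step(x_h, pre_cell, weight, bias):
        # x_h is concat([input, pre_hidden], -1); weight is [2*hidden, 4*hidden]
        gates = x_h @ weight + bias
        i, j, f, o = np.split(gates, 4, axis=-1)
        sigmoid = lambda v: 1. / (1. + np.exp(-v))
        c = pre_cell * sigmoid(f) + sigmoid(i) * np.tanh(j)   # same as c in the code
        m = np.tanh(c) * sigmoid(o)                           # same as m in the code
        return m, c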
Example #9
    def forward(self):
        """ forward
        """
        src, dst = L.read_file(self.pyreader)

        src_id = L.slice(src, [0, 1, 2, 3], [0, 0, 0, 0],
                         [int(math.pow(2, 30)) - 1, 1, 1, 1])
        dst_id = L.slice(dst, [0, 1, 2, 3], [0, 0, 0, 0],
                         [int(math.pow(2, 30)) - 1, self.neg_num + 1, 1, 1])

        if self.is_sparse:
            # sparse mode uses 2-D input.
            src = L.reshape(src, [-1, 1])
            dst = L.reshape(dst, [-1, 1])

        # [b, 1, f, h]
        src_embed = split_embedding(src, self.num_nodes, self.hidden_size,
                                    self.embed_init, "weight", self.num_part,
                                    self.is_sparse)

        # [b, n+1, f, h]
        dst_embed = split_embedding(dst, self.num_nodes, self.hidden_size,
                                    self.embed_init, "weight", self.num_part,
                                    self.is_sparse)

        if self.is_sparse:
            src_embed = L.reshape(src_embed,
                                  [-1, 1, self.num_featuers, self.hidden_size])
            dst_embed = L.reshape(
                dst_embed,
                [-1, self.neg_num + 1, self.num_featuers, self.hidden_size])

        # [b, 1, 1, f]
        src_weight = L.softmax(
            L.embedding(src_id, [self.num_nodes, self.num_featuers],
                        param_attr=F.ParamAttr(name="alpha")))
        # [b, n+1, 1, f]
        dst_weight = L.softmax(
            L.embedding(dst_id, [self.num_nodes, self.num_featuers],
                        param_attr=F.ParamAttr(name="alpha")))

        # [b, 1, h]
        src_sum = L.squeeze(L.matmul(src_weight, src_embed), axes=[2])
        # [b, n+1, h]
        dst_sum = L.squeeze(L.matmul(dst_weight, dst_embed), axes=[2])

        logits = L.matmul(src_sum, dst_sum,
                          transpose_y=True)  # [batch_size, 1, neg_num+1]

        pos_label = L.fill_constant_batch_size_like(logits, [-1, 1, 1],
                                                    "float32", 1)
        neg_label = L.fill_constant_batch_size_like(logits,
                                                    [-1, 1, self.neg_num],
                                                    "float32", 0)
        label = L.concat([pos_label, neg_label], -1)

        pos_weight = L.fill_constant_batch_size_like(logits, [-1, 1, 1],
                                                     "float32", self.neg_num)
        neg_weight = L.fill_constant_batch_size_like(logits,
                                                     [-1, 1, self.neg_num],
                                                     "float32", 1)
        weight = L.concat([pos_weight, neg_weight], -1)

        weight.stop_gradient = True
        label.stop_gradient = True

        loss = L.sigmoid_cross_entropy_with_logits(logits, label)
        loss = loss * weight
        loss = L.reduce_mean(loss)
        loss = loss * ((self.neg_num + 1) / 2 / self.neg_num)
        loss.persistable = True
        self.loss = loss
        return loss
Example #10
    def forward(self,
                q,
                k,
                v,
                lengths,
                speaker_embed,
                start_index,
                force_monotonic=False,
                prev_coeffs=None,
                window=None):
        # add position encoding as an inductive bias
        if self.has_bias:  # multi-speaker model
            omega_q = 2 * F.sigmoid(
                F.squeeze(self.q_pos_affine(speaker_embed), axes=[-1]))
            omega_k = 2 * self.omega_initial * F.sigmoid(
                F.squeeze(self.k_pos_affine(speaker_embed), axes=[-1]))
        else:  # single-speaker case
            batch_size = q.shape[0]
            omega_q = F.ones((batch_size, ), dtype="float32")
            omega_k = F.ones(
                (batch_size, ), dtype="float32") * self.omega_default
        q += self.position_encoding_weight * positional_encoding(
            q, start_index, omega_q)
        k += self.position_encoding_weight * positional_encoding(k, 0, omega_k)

        q, k, v = self.q_affine(q), self.k_affine(k), self.v_affine(v)
        activations = F.matmul(q, k, transpose_y=True)
        activations /= np.sqrt(self.attention_dim)

        if self.training:
            # mask the <pad> parts from the encoder
            mask = F.sequence_mask(lengths, dtype="float32")
            attn_bias = F.scale(1. - mask, -1000)
            activations += F.unsqueeze(attn_bias, [1])
        elif force_monotonic:
            assert window is not None
            backward_step, forward_step = window
            T_enc = k.shape[1]
            batch_size, T_dec, _ = q.shape

            # actually T_dec = 1 here
            alpha = F.fill_constant((batch_size, T_dec), value=0, dtype="int64") \
                   if prev_coeffs is None \
                   else F.argmax(prev_coeffs, axis=-1)
            backward = F.sequence_mask(alpha - backward_step,
                                       maxlen=T_enc,
                                       dtype="bool")
            forward = F.sequence_mask(alpha + forward_step,
                                      maxlen=T_enc,
                                      dtype="bool")
            mask = F.cast(F.logical_xor(backward, forward), "float32")
            # print("mask's shape:", mask.shape)
            attn_bias = F.scale(1. - mask, -1000)
            activations += attn_bias

        # softmax
        coefficients = F.softmax(activations, axis=-1)
        # context vector
        coefficients = F.dropout(coefficients,
                                 1. - self.keep_prob,
                                 dropout_implementation='upscale_in_train')
        contexts = F.matmul(coefficients, v)
        # context normalization
        enc_lengths = F.cast(F.unsqueeze(lengths, axes=[1, 2]), "float32")
        contexts *= F.sqrt(enc_lengths)
        # out affine
        contexts = self.out_affine(contexts)
        return contexts, coefficients
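With force_monotonic at inference time, the window mask is the XOR of two sequence masks, which keeps only encoder positions in [alpha - backward_step, alpha + forward_step) around the previously attended index alpha. A standalone NumPy illustration of that band construction (not part of the model code):

    import numpy as np

    T_enc, alpha = 8, 3                       # encoder length, previously attended index
    backward_step, forward_step = 1, 3
    idx = np.arange(T_enc)
    backward = idx < (alpha - backward_step)  # role of F.sequence_mask(alpha - backward_step)
    forward = idx < (alpha + forward_step)    # role of F.sequence_mask(alpha + forward_step)
    window = np.logical_xor(backward, forward).astype('float32')
    print(window)                             # [0. 0. 1. 1. 1. 1. 0. 0.]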
Example #11
    def edge_aware_self_attention(q, k, v, edges_k, edges_v, attn_bias, d_key,
                                  dropout_rate):
        """
        Edge-aware Self-Attention.

        Scalar dimensions referenced here:
            B = batch_size
            M = max_sequence_length
            N = num_attention_heads
            H = hidden_size_per_head

        Args:
            q: reshaped queries [B, N, M, H]
            k: reshaped keys    [B, N, M, H]
            v: reshaped values  [B, N, M, H]
            edges_k: edge representations between input tokens (keys)   [M, M, H]
            edges_v: edge representations between input tokens (values) [M, M, H]
            attn_bias: attention mask [B, N, M, M]
        """
        if not (len(q.shape) == len(k.shape) == len(v.shape) == 4):
            raise ValueError("Input q, k, v should be 4-D Tensors.")
        if not (len(edges_k.shape) == len(edges_v.shape) == 3):
            raise ValueError(
                "Input edges_k and edges_v should be 3-D Tensors.")

        # regular self-attention
        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)

        # edge-aware self-attention
        if edges_k and edges_v:
            # 1. transpose scaled_q from [B, N, M, H] to [M, B, N, H]
            scaled_q = layers.transpose(x=scaled_q, perm=[2, 0, 1, 3])
            # 2. reshape scaled_q from [M, B, N, H] to [M, B*N, H]
            scaled_q = layers.reshape(x=scaled_q,
                                      shape=[0, -1, scaled_q.shape[3]],
                                      inplace=True)
            # 3. multiply scaled_q with transpose(edges_k)
            #      scaled_q:  [M, B*N, H]
            #      edges_k:   [M, M, H]
            #      edge_bias: [M, B*N, M]
            edge_bias = layers.matmul(x=scaled_q, y=edges_k, transpose_y=True)
            # 4. reshape edge_bias from [M, B*N, M] to [M, B, N, M]
            edge_bias = layers.reshape(x=edge_bias,
                                       shape=[0, -1, q.shape[1], q.shape[2]],
                                       inplace=True)
            # 5. transpose edge_bias from [M, B, N, M] to [B, N, M, M]
            edge_bias = layers.transpose(x=edge_bias, perm=[1, 2, 0, 3])
            # 6. add edge_bias to product
            product += edge_bias

        # add attention bias
        if attn_bias:
            product += attn_bias

        # softmax attention weights
        weights = layers.softmax(product)
        if dropout_rate:
            weights = layers.dropout(weights,
                                     dropout_prob=dropout_rate,
                                     dropout_implementation="upscale_in_train",
                                     is_test=False)

        # edge-aware self-attention
        out = layers.matmul(weights, v)
        if edges_k and edges_v:
            # 1. transpose weights from [B, N, M, M] to [M, B, N, M]
            reshaped_weights = layers.transpose(x=weights, perm=[2, 0, 1, 3])
            # 2. reshape weights from [M, B, N, M] to [M, B*N, M]
            reshaped_weights = layers.reshape(
                x=reshaped_weights,
                shape=[0, -1, reshaped_weights.shape[3]],
                inplace=True)
            # 3. multiply reshaped_weights with edges_v
            #      reshaped_weights: [M, B*N, M]
            #      edges_v:          [M, M, H]
            #      edge_bias:        [M, B*N, H]
            edge_bias = layers.matmul(x=reshaped_weights, y=edges_v)
            # 4. reshape edge_bias from [M, B*N, H] to [M, B, N, H]
            edge_bias = layers.reshape(x=edge_bias,
                                       shape=[0, -1, q.shape[1], q.shape[3]],
                                       inplace=True)
            # 5. transpose edge_bias from [M, B, N, H] to [B, N, M, H]
            edge_bias = layers.transpose(x=edge_bias, perm=[1, 2, 0, 3])
            out += edge_bias

        return out
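The transpose/reshape bookkeeping in steps 1-5 (and likewise in the value branch) amounts to a per-position contraction. Below is a minimal NumPy sketch of what the key branch computes; the shapes are illustrative and not from the original source:

import numpy as np

B, N, M, H = 2, 4, 5, 8
scaled_q = np.random.rand(B, N, M, H)
edges_k = np.random.rand(M, M, H)

# steps 1-5 above: edge_bias[b, n, i, j] = sum_h scaled_q[b, n, i, h] * edges_k[i, j, h]
edge_bias = np.einsum('bnih,ijh->bnij', scaled_q, edges_k)

# the same result via the explicit transpose/reshape route used in the example
q_t = scaled_q.transpose(2, 0, 1, 3).reshape(M, B * N, H)   # [M, B*N, H]
bias = np.matmul(q_t, edges_k.transpose(0, 2, 1))           # [M, B*N, M]
bias = bias.reshape(M, B, N, M).transpose(1, 2, 0, 3)       # [B, N, M, M]
print(np.allclose(edge_bias, bias))  # True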
Example #12
        def dot_attention(query, memory):
            attn = layers.matmul(query, memory, transpose_y=True)
            weight = layers.softmax(attn)
            weight_memory = layers.matmul(weight, memory)

            return weight_memory, weight
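For reference, a minimal NumPy sketch of the same dot-product attention; names and shapes are illustrative, not from the original source:

import numpy as np

def dot_attention_np(query, memory):
    # query: [batch, q_len, hidden], memory: [batch, m_len, hidden]
    attn = query @ memory.transpose(0, 2, 1)       # [batch, q_len, m_len]
    attn -= attn.max(axis=-1, keepdims=True)       # numerical stability
    weight = np.exp(attn)
    weight /= weight.sum(axis=-1, keepdims=True)   # softmax over memory positions
    return weight @ memory, weight                 # weighted memory, attention weights

out, w = dot_attention_np(np.random.rand(2, 3, 8), np.random.rand(2, 5, 8))
print(out.shape, w.shape)  # (2, 3, 8) (2, 3, 5)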
Example #13
    def build_model(self, model_configs):
        self.update_params(model_configs)
        features = fluid.layers.data(name="features",
                                     shape=[None, self.seq_len_],
                                     dtype='int64')
        labels = fluid.layers.data(name="labels",
                                   shape=[None, self.seq_len_],
                                   dtype='int64')
        sequence_length_ph = fluid.layers.data(name="seq_len_ph",
                                               shape=[None],
                                               dtype='int64')
        sequence_mask_ph = fluid.layers.data(name="seq_mask_ph",
                                             shape=[None],
                                             dtype='float32')

        init_hidden = fluid.layers.data(
            name="init_hidden",
            shape=[None, self.num_layers_, self.n_hidden_],
            dtype='float32')
        init_cell = fluid.layers.data(
            name="init_cell",
            shape=[None, self.num_layers_, self.n_hidden_],
            dtype='float32')

        init_hidden = layers.transpose(init_hidden, perm=[1, 0, 2])
        init_cell = layers.transpose(init_cell, perm=[1, 0, 2])

        init_hidden_reshape = layers.reshape(
            init_hidden, shape=[self.num_layers_, -1, self.n_hidden_])
        init_cell_reshape = layers.reshape(
            init_cell, shape=[self.num_layers_, -1, self.n_hidden_])

        features = layers.reshape(features, shape=[-1, self.seq_len_, 1])

        # word embedding
        inputs = layers.embedding(
            input=features,
            size=[self.vocab_size_, self.n_hidden_],
            dtype='float32',
            is_sparse=False,
            param_attr=fluid.ParamAttr(
                name='embedding_para',
                initializer=fluid.initializer.UniformInitializer(
                    low=-self.init_scale_, high=self.init_scale_)))

        # LSTM
        output, last_hidden, last_cell = self._build_rnn_graph(
            inputs, init_hidden, init_cell, sequence_length_ph)

        output = layers.reshape(output,
                                shape=[-1, self.seq_len_, self.n_hidden_],
                                inplace=True)
        self.last_hidden_ = layers.reshape(
            last_hidden, [-1, self.num_layers_, self.n_hidden_])
        self.last_cell_ = layers.reshape(
            last_cell, [-1, self.num_layers_, self.n_hidden_])

        # softmax
        softmax_w = layers.create_parameter(
            [self.n_hidden_, self.vocab_size_],
            dtype="float32",
            name="softmax_w",
            default_initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale_, high=self.init_scale_))
        softmax_b = layers.create_parameter(
            [self.vocab_size_],
            dtype="float32",
            name='softmax_b',
            default_initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale_, high=self.init_scale_))

        logits = layers.matmul(output, softmax_w)
        logits = layers.elementwise_add(logits, softmax_b)
        logits = layers.reshape(logits,
                                shape=[-1, self.vocab_size_],
                                inplace=True)

        # correct predictions
        labels_reshaped = layers.reshape(labels, [-1])
        pred = layers.cast(layers.argmax(logits, 1), dtype="int64")
        correct_pred = layers.cast(layers.equal(pred, labels_reshaped),
                                   dtype="int64")
        self.pred_ = pred

        # predicting unknown is always considered wrong
        # only in paddle 1.8
        unk_tensor = layers.fill_constant(layers.shape(labels_reshaped),
                                          value=self.unk_symbol_,
                                          dtype='int64')
        pred_unk = layers.cast(layers.equal(pred, unk_tensor), dtype="int64")
        correct_unk = layers.elementwise_mul(pred_unk, correct_pred)

        # predicting padding is always considered wrong
        pad_tensor = layers.fill_constant(layers.shape(labels_reshaped),
                                          value=self.pad_symbol_,
                                          dtype='int64')
        pred_pad = layers.cast(layers.equal(pred, pad_tensor), dtype="int64")
        correct_pad = layers.elementwise_mul(pred_pad, correct_pred)

        # Reshape logits to be a 3-D tensor for sequence loss
        logits = layers.reshape(logits, [-1, self.seq_len_, self.vocab_size_])

        labels = layers.reshape(labels, [-1, self.seq_len_, 1])
        loss = layers.softmax_with_cross_entropy(logits=logits,
                                                 label=labels,
                                                 soft_label=False,
                                                 return_softmax=False)
        sequence_mask = layers.reshape(sequence_mask_ph,
                                       [-1, self.seq_len_, 1])
        loss = layers.reduce_mean(layers.elementwise_mul(loss, sequence_mask))

        eval_metric_ops = fluid.layers.reduce_sum(correct_pred) \
                - fluid.layers.reduce_sum(correct_unk) \
                - fluid.layers.reduce_sum(correct_pad)

        self.loss_ = loss
        self.correct_ = eval_metric_ops
        self.input_name_list_ = [
            'features', 'labels', 'seq_len_ph', 'seq_mask_ph', 'init_hidden',
            'init_cell'
        ]
        self.target_var_names_ = [
            self.loss_, self.last_hidden_, self.last_cell_, self.correct_
        ]

        self.program_ = fluid.default_main_program()
        self.startup_program_ = fluid.default_startup_program()
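The correct-prediction count above deliberately excludes hits on the unknown and padding symbols. A small NumPy sketch of the same bookkeeping, with made-up symbol ids:

import numpy as np

unk_id, pad_id = 1, 0                     # assumed ids; the model reads them from its config
pred   = np.array([3, 1, 0, 5, 5])
labels = np.array([3, 1, 0, 5, 2])

correct_pred = (pred == labels).astype(np.int64)
correct_unk  = ((pred == unk_id) & (pred == labels)).astype(np.int64)
correct_pad  = ((pred == pad_id) & (pred == labels)).astype(np.int64)

# correct predictions, not counting <unk> or padding hits
num_correct = correct_pred.sum() - correct_unk.sum() - correct_pad.sum()
print(num_correct)  # 2 (positions 0 and 3)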
Example #14
    def forward(self, input_ids, position_ids):
        if _global_parallel_strategy == "dp":
            auto.shard_tensor(input_ids,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(input_ids,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })

        input_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.word_embeddings.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.word_embeddings.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [1, -1]
                              })

        embeddings = input_embeddings + position_embeddings
        embeddings = self.dropout1(embeddings)

        # Pre-norm
        target = self.norm1(embeddings)

        # The following is the attention part
        q = self.q_proj(target)
        q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
        q = tensor.transpose(x=q, perm=[0, 2, 1, 3])

        k = self.k_proj(target)
        v = self.v_proj(target)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.q_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
            auto.shard_tensor(self.k_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
            auto.shard_tensor(self.v_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.q_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.k_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.v_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })

        k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
        k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
        v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
        v = tensor.transpose(x=v, perm=[0, 2, 1, 3])

        # scaled dot-product attention
        product = layers.matmul(x=q,
                                y=k,
                                transpose_y=True,
                                alpha=self.head_dim**-0.5)

        if self.attn_mask is not None:
            product = product + self.attn_mask

        weights = F.softmax(product)

        if self.dropout_ratio:
            weights = F.dropout(weights,
                                self.dropout_ratio,
                                training=self.training,
                                mode="upscale_in_train")

        out = tensor.matmul(weights, v)

        # combine heads
        out = tensor.transpose(out, perm=[0, 2, 1, 3])
        out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

        # project to output
        out = self.out_proj(out)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.out_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.out_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [1, -1]
                              })

        # Add residual
        residual = embeddings + self.dropout2(out)

        # Pre-norm
        out0 = self.norm2(residual)

        # The following is the MLP part
        out1 = self.linear0(out0)
        out2 = F.gelu(out1, approximate=True)
        out3 = self.linear1(out2)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.linear0.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
            auto.shard_tensor(self.linear1.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.linear0.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.linear1.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [1, -1]
                              })

        # Add residual
        final = residual + self.dropout3(out3)
        return final
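The reshape/transpose pair applied to q, k and v above only splits the hidden dimension into attention heads, and the "combine heads" step undoes it. A NumPy sketch with illustrative sizes:

import numpy as np

B, S, num_heads, head_dim = 2, 6, 4, 8
x = np.random.rand(B, S, num_heads * head_dim)

# split heads: [B, S, num_heads * head_dim] -> [B, num_heads, S, head_dim]
heads = x.reshape(B, S, num_heads, head_dim).transpose(0, 2, 1, 3)

# combine heads: transpose back and merge the last two dims
merged = heads.transpose(0, 2, 1, 3).reshape(B, S, num_heads * head_dim)
print(np.allclose(x, merged))  # True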
Example #15
    def forward(self, input):
        if _global_parallel_strategy == "dp":
            auto.shard_tensor(input,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(input,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1, -1]
                              })

        q = self.q_proj(input)
        q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
        q = tensor.transpose(x=q, perm=[0, 2, 1, 3])

        k = self.k_proj(input)
        v = self.v_proj(input)

        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.q_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
            auto.shard_tensor(self.k_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
            auto.shard_tensor(self.v_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 0]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.q_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.k_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })
            auto.shard_tensor(self.v_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [-1, 1]
                              })

        k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
        k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
        v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
        v = tensor.transpose(x=v, perm=[0, 2, 1, 3])

        # scaled dot-product attention
        product = layers.matmul(x=q,
                                y=k,
                                transpose_y=True,
                                alpha=self.head_dim**-0.5)

        if self.attn_mask is not None:
            product = product + self.attn_mask

        weights = F.softmax(product)

        if self.dropout_ratio:
            weights = F.dropout(weights,
                                self.dropout_ratio,
                                training=self.training,
                                mode="upscale_in_train")

        out = tensor.matmul(weights, v)

        # combine heads
        out = tensor.transpose(out, perm=[0, 2, 1, 3])
        out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

        # project to output
        out = self.out_proj(out)
        if _global_parallel_strategy == "mp":
            auto.shard_tensor(self.out_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [0, -1]
                              })
        elif _global_parallel_strategy == "dp_mp":
            auto.shard_tensor(self.out_proj.weight,
                              dist_attr={
                                  "process_mesh": _global_process_mesh,
                                  "dims_mapping": [1, -1]
                              })

        return out
Example #16
def lm_model(hidden_size,
             vocab_size,
             batch_size,
             num_layers=2,
             num_steps=20,
             init_scale=0.1,
             dropout=None,
             rnn_model='static'):
    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter([hidden_size * 2, hidden_size*4], dtype="float32", name="fc_weight1_"+str(i), \
                    default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(init_hidden,
                                      axes=[0],
                                      starts=[i],
                                      ends=[i + 1])
            pre_cell = layers.slice(init_cell,
                                    axes=[0],
                                    starts=[i],
                                    ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
        rnn = PaddingRNN()

        with rnn.step():
            input = rnn.step_input(input_embedding)
            for k in range(num_layers):
                pre_hidden = rnn.memory(init=hidden_array[k])
                pre_cell = rnn.memory(init=cell_array[k])
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                #i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
                i = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[0],
                                 ends=[hidden_size])
                j = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size],
                                 ends=[hidden_size * 2])
                f = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size * 2],
                                 ends=[hidden_size * 3])
                o = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size * 3],
                                 ends=[hidden_size * 4])

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                rnn.update_memory(pre_hidden, m)
                rnn.update_memory(pre_cell, c)

                rnn.step_output(m)
                rnn.step_output(c)

                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            rnn.step_output(input)
        #real_res = layers.concat(res, 0)
        rnnout = rnn()

        last_hidden_array = []
        last_cell_array = []
        real_res = rnnout[-1]
        for i in range(num_layers):
            m = rnnout[i * 2]
            c = rnnout[i * 2 + 1]
            m.stop_gradient = True
            c.stop_gradient = True
            last_h = layers.slice(m,
                                  axes=[0],
                                  starts=[num_steps - 1],
                                  ends=[num_steps])
            last_hidden_array.append(last_h)
            last_c = layers.slice(c,
                                  axes=[0],
                                  starts=[num_steps - 1],
                                  ends=[num_steps])
            last_cell_array.append(last_c)
        '''
        else:
            real_res = rnnout[-1]
            for i in range( num_layers ):

            m1, c1, m2, c2 = rnnout
            real_res = m2
            m1.stop_gradient = True
            c1.stop_gradient = True
            c2.stop_gradient = True
        '''

        #layers.Print( first_hidden, message="22", summarize=10)
        #layers.Print( rnnout[1], message="11", summarize=10)
        #real_res = ( rnnout[1] + rnnout[2] + rnnout[3] + rnnout[4]) / 4.0
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
        last_hidden = layers.concat(last_hidden_array, 0)
        last_cell = layers.concat(last_cell_array, 0)
        '''
        last_hidden = layers.concat( hidden_array, 1 )
        last_hidden = layers.reshape( last_hidden, shape=[-1, num_layers, hidden_size])
        last_hidden = layers.transpose( x = last_hidden, perm = [1, 0, 2])
        last_cell = layers.concat( cell_array, 1)
        last_cell = layers.reshape( last_cell, shape=[ -1, num_layers, hidden_size])
        last_cell = layers.transpose( x = last_cell, perm = [1, 0, 2])
        '''

        return real_res, last_hidden, last_cell

    def encoder_static(input_embedding,
                       len=3,
                       init_hidden=None,
                       init_cell=None):

        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter([hidden_size * 2, hidden_size*4], dtype="float32", name="fc_weight1_"+str(i), \
                    default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(init_hidden,
                                      axes=[0],
                                      starts=[i],
                                      ends=[i + 1])
            pre_cell = layers.slice(init_cell,
                                    axes=[0],
                                    starts=[i],
                                    ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden,
                                        shape=[-1, hidden_size],
                                        inplace=True)
            pre_cell = layers.reshape(pre_cell,
                                      shape=[-1, hidden_size],
                                      inplace=True)
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = layers.split(input_embedding,
                                     num_or_sections=len,
                                     dim=1)

        for index in range(len):
            input = sliced_inputs[index]
            input = layers.reshape(input,
                                   shape=[-1, hidden_size],
                                   inplace=True)
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i, j, f, o = layers.split(gate_input,
                                          num_or_sections=4,
                                          dim=-1)

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            res.append(input)

        last_hidden = layers.concat(hidden_array, 1)
        last_hidden = layers.reshape(last_hidden,
                                     shape=[-1, num_layers, hidden_size],
                                     inplace=True)
        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = layers.concat(cell_array, 1)
        last_cell = layers.reshape(last_cell,
                                   shape=[-1, num_layers, hidden_size])
        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = layers.concat(res, 0)
        real_res = layers.reshape(real_res,
                                  shape=[len, -1, hidden_size],
                                  inplace=True)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell

    x = layers.data(name="x",
                    shape=[batch_size, num_steps, 1],
                    dtype='int64',
                    append_batch_size=False)
    y = layers.data(name="y",
                    shape=[batch_size * num_steps, 1],
                    dtype='int64',
                    append_batch_size=False)

    init_hidden = layers.data(name="init_hidden",
                              shape=[num_layers, batch_size, hidden_size],
                              dtype='float32',
                              append_batch_size=False)
    init_cell = layers.data(name="init_cell",
                            shape=[num_layers, batch_size, hidden_size],
                            dtype='float32',
                            append_batch_size=False)

    init_hidden = layers.reshape(init_hidden,
                                 shape=[num_layers, -1, hidden_size])
    init_cell = layers.reshape(init_cell, shape=[num_layers, -1, hidden_size])

    x_emb = layers.embedding(
        input=x,
        size=[vocab_size, hidden_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(
            name='embedding_para',
            initializer=fluid.initializer.UniformInitializer(low=-init_scale,
                                                             high=init_scale)))

    x_emb = layers.reshape(x_emb,
                           shape=[-1, num_steps, hidden_size],
                           inplace=True)
    if dropout is not None and dropout > 0.0:
        x_emb = layers.dropout(x_emb,
                               dropout_prob=dropout,
                               dropout_implementation='upscale_in_train')

    if rnn_model == "padding":
        rnn_out, last_hidden, last_cell = padding_rnn(x_emb,
                                                      len=num_steps,
                                                      init_hidden=init_hidden,
                                                      init_cell=init_cell)
    elif rnn_model == "static":
        rnn_out, last_hidden, last_cell = encoder_static(
            x_emb, len=num_steps, init_hidden=init_hidden, init_cell=init_cell)
    elif rnn_model == "cudnn":
        x_emb = layers.transpose(x_emb, perm=[1, 0, 2])
        rnn_out, last_hidden, last_cell = layers.lstm( x_emb, init_hidden, init_cell,  num_steps, hidden_size, num_layers, \
                is_bidirec=False, \
                default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale) )
        rnn_out = layers.transpose(rnn_out, perm=[1, 0, 2])
    else:
        print("type not supported")
        return
    rnn_out = layers.reshape(rnn_out,
                             shape=[-1, num_steps, hidden_size],
                             inplace=True)


    softmax_weight = layers.create_parameter([hidden_size, vocab_size], dtype="float32", name="softmax_weight", \
            default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))
    softmax_bias = layers.create_parameter([vocab_size], dtype="float32", name='softmax_bias', \
            default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))

    projection = layers.matmul(rnn_out, softmax_weight)
    projection = layers.elementwise_add(projection, softmax_bias)

    projection = layers.reshape(projection,
                                shape=[-1, vocab_size],
                                inplace=True)

    loss = layers.softmax_with_cross_entropy(logits=projection,
                                             label=y,
                                             soft_label=False)

    loss = layers.reshape(loss, shape=[-1, num_steps], inplace=True)
    loss = layers.reduce_mean(loss, dim=[0])
    loss = layers.reduce_sum(loss)

    loss.persistable = True
    last_cell.persistable = True
    last_hidden.persistable = True

    feeding_list = ['x', 'y', 'init_hidden', 'init_cell']
    return loss, last_hidden, last_cell, feeding_list
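The gate arithmetic inside both padding_rnn and encoder_static is a standard LSTM cell. A minimal NumPy sketch of a single step under assumed shapes (not the original implementation):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(inp, pre_hidden, pre_cell, weight, bias):
    # inp, pre_hidden, pre_cell: [batch, hidden_size]
    # weight: [2 * hidden_size, 4 * hidden_size], bias: [4 * hidden_size]
    gate_input = np.concatenate([inp, pre_hidden], axis=1) @ weight + bias
    i, j, f, o = np.split(gate_input, 4, axis=1)
    c = pre_cell * sigmoid(f) + sigmoid(i) * np.tanh(j)
    m = np.tanh(c) * sigmoid(o)
    return m, c

batch, hidden_size = 3, 5
m, c = lstm_step(np.random.rand(batch, hidden_size),
                 np.random.rand(batch, hidden_size),
                 np.random.rand(batch, hidden_size),
                 np.random.rand(2 * hidden_size, 4 * hidden_size),
                 np.zeros(4 * hidden_size))
print(m.shape, c.shape)  # (3, 5) (3, 5)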
    def forward(self, q, v, mask=None):
        """forward

        Args:
            q (Variable): shape = [batch_size, seq_len1, hidden_size] or [batch_size, hidden_size].
                          dtype = float32
            v (Variable): shape = [batch_size, seq_len2, hidden_size]. dtype = float32
            mask (Variable): shape = [batch_size, seq_len2]. dtype = v.dtype. Default is None

        Returns: Variable
            shape = [batch_size, seq_len1, seq_len2], or [batch_size, seq_len2]
            when q is 2-D. dtype = float32.

        Raises:
            RuntimeError: while giving unsupported score_type.
        """
        input_dim = len(q.shape)
        if input_dim == 2:
            q = layers.unsqueeze(q, [1])

        if self._score_type == 'dot_prod':
            ptr_score = layers.matmul(q, v, transpose_y=True)
        elif self._score_type == 'affine':
            q_tmp = layers.fc(q,
                              size=v.shape[2],
                              num_flatten_dims=2,
                              **nn_utils.param_attr(self._name,
                                                    self._init_scale,
                                                    need_bias=True))
            ptr_score = layers.matmul(q_tmp, v, transpose_y=True)
        elif self._score_type == 'std':
            if self._hidden_size <= 0:
                raise ValueError("hidden_size should be greater than 0")
            q_tmp = layers.fc(q,
                              size=self._hidden_size,
                              num_flatten_dims=2,
                              **nn_utils.param_attr(self._name + '_q',
                                                    self._init_scale,
                                                    need_bias=True))
            v_tmp = layers.fc(v,
                              size=self._hidden_size,
                              num_flatten_dims=2,
                              **nn_utils.param_attr(self._name + '_k',
                                                    self._init_scale,
                                                    need_bias=True))

            # shape = [batch_size, seq_len1, seq_len2, hidden_size]
            q_tmp_expand = layers.expand(layers.unsqueeze(q_tmp, [2]),
                                         [1, 1, v_tmp.shape[1], 1])
            # shape = [batch_size, 1, seq_len2, hidden_size]
            v_tmp_expand = layers.unsqueeze(v_tmp, [1])
            ptr_score = layers.fc(layers.elementwise_add(q_tmp_expand,
                                                         v_tmp_expand,
                                                         act='tanh'),
                                  size=1,
                                  num_flatten_dims=3,
                                  **nn_utils.param_attr(self._name + '_w',
                                                        self._init_scale,
                                                        need_bias=True))
            ptr_score = layers.squeeze(ptr_score, [3])
        else:
            raise RuntimeError(
                'Supported score types: dot_prod/affine/std, but got %s' %
                (self._score_type))

        if mask is not None:
            score_for_mask = layers.transpose(ptr_score, [1, 0, 2])
            ptr_score_masked = layers.elementwise_add(score_for_mask,
                                                      (mask - 1.0) * INF,
                                                      axis=-1)
            ptr_score = layers.transpose(ptr_score_masked, [1, 0, 2])

        if input_dim == 2:
            ptr_score = layers.squeeze(ptr_score, [1])
        return ptr_score
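The masking branch above converts a 0/1 padding mask into a large negative additive bias so that padded positions vanish after softmax. A NumPy sketch; INF here is an assumed large constant standing in for the module-level INF:

import numpy as np

INF = 1e9
ptr_score = np.random.rand(2, 3, 4)            # [batch, seq_len1, seq_len2]
mask = np.array([[1., 1., 1., 0.],
                 [1., 1., 0., 0.]])            # [batch, seq_len2]; 0 marks padding

masked = ptr_score + (mask - 1.0)[:, None, :] * INF    # padded positions get -INF
weights = np.exp(masked - masked.max(-1, keepdims=True))
weights /= weights.sum(-1, keepdims=True)
print(weights[1, 0])  # the two padded positions are ~0 after softmax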
Example #18
def matmul(a, b):
    if isinstance(a, PTensor) or isinstance(b, PTensor):
        return layers.matmul(a, b)
    else:
        return np.matmul(a, b)
x = paddle.randn((N, in_C, H, W))
w = paddle.randn((C, in_C, 1, 1))

y = F.conv2d(x, w)

x_in = L.reshape(x, (N, 1, in_C, H, W))
w_r = L.reshape(w, (1, C, in_C, 1, 1))
y2 = x_in * w_r  # [N, C, in_C, H, W]
y2 = L.reduce_sum(y2, dim=[
    2,
])

x_in2 = L.transpose(x, [0, 2, 3, 1])  # [N, H, W, in_C]
w_r2 = L.reshape(w, (C, in_C))
w_r2 = L.transpose(w_r2, [1, 0])  # [in_C, C]
y3 = L.matmul(x_in2, w_r2)  # [N, H, W, C]
y3 = L.transpose(y3, [0, 3, 1, 2])  # [N, C, H, W]

y = y.numpy()
y2 = y2.numpy()
y3 = y3.numpy()
d = np.sum((y - y2)**2)
print(d)
d = np.sum((y - y3)**2)
print(d)
'''
Therefore, two tensors shaped
(N, 1, in_C, H, W)
(1, C, in_C, 1, 1)
or, more generally, shaped
(A, 1, in_C, B, C)
Example #20
                def grow_top_k(step_idx, alive_seq, alive_log_prob,
                               parant_idx):
                    pre_ids = alive_seq

                    dec_step_emb = layers.embedding(
                        input=pre_ids,
                        size=[self.tar_vocab_size, self.hidden_size],
                        dtype='float32',
                        is_sparse=False,
                        param_attr=fluid.ParamAttr(
                            name='target_embedding',
                            initializer=fluid.initializer.UniformInitializer(
                                low=-self.init_scale, high=self.init_scale)))

                    dec_att_out, new_hidden_array, new_cell_array = decoder_step(
                        dec_step_emb, pre_hidden_array, pre_cell_array)

                    projection = layers.matmul(dec_att_out, softmax_weight)

                    logits = layers.softmax(projection)
                    current_log = layers.elementwise_add(x=layers.log(logits),
                                                         y=alive_log_prob,
                                                         axis=0)
                    base_1 = layers.cast(step_idx, 'float32') + 6.0
                    base_1 /= 6.0
                    length_penalty = layers.pow(base_1, alpha)

                    len_pen = layers.pow(
                        ((5. + layers.cast(step_idx + 1, 'float32')) / 6.),
                        alpha)

                    current_log = layers.reshape(current_log, shape=[1, -1])

                    current_log = current_log / length_penalty
                    topk_scores, topk_indices = layers.topk(input=current_log,
                                                            k=beam_size)

                    topk_scores = layers.reshape(topk_scores, shape=[-1])

                    topk_log_probs = topk_scores * length_penalty

                    generate_id = layers.reshape(
                        topk_indices, shape=[-1]) % self.tar_vocab_size

                    selected_beam = layers.reshape(
                        topk_indices, shape=[-1]) // self.tar_vocab_size

                    topk_finished = layers.equal(generate_id, eos_ids)

                    topk_finished = layers.cast(topk_finished, 'float32')

                    generate_id = layers.reshape(generate_id, shape=[-1, 1])

                    pre_tokens_list = layers.gather(tokens, selected_beam)

                    full_tokens_list = layers.concat(
                        [pre_tokens_list, generate_id], axis=1)


                    return full_tokens_list, topk_log_probs, topk_scores, topk_finished, selected_beam, generate_id, \
                            dec_att_out, new_hidden_array, new_cell_array
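Both length_penalty and len_pen above evaluate the same GNMT-style penalty ((5 + (t + 1)) / 6) ** alpha, by which the accumulated log-probabilities are divided. A short NumPy check with an assumed alpha:

import numpy as np

alpha = 0.6                          # assumed value; the original reads it from its config
step_idx = np.arange(0, 5)
length_penalty = ((5. + (step_idx + 1)) / 6.) ** alpha
print(length_penalty)                # grows with step, normalizing beam scores by length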
Example #21
def network(batch_size, items_num, hidden_size, step, rate):
    stdv = 1.0 / math.sqrt(hidden_size)

    items = layers.data(
        name="items",
        shape=[batch_size, -1, 1],
        dtype="int64",
        append_batch_size=False)  #[bs, uniq_max, 1]
    seq_index = layers.data(
        name="seq_index",
        shape=[batch_size, -1],
        dtype="int64",
        append_batch_size=False)  #[-1(seq_max)*batch_size, 1]
    last_index = layers.data(
        name="last_index",
        shape=[batch_size],
        dtype="int64",
        append_batch_size=False)  #[batch_size, 1]
    adj_in = layers.data(
        name="adj_in",
        shape=[batch_size, -1, -1],
        dtype="float32",
        append_batch_size=False)
    adj_out = layers.data(
        name="adj_out",
        shape=[batch_size, -1, -1],
        dtype="float32",
        append_batch_size=False)
    mask = layers.data(
        name="mask",
        shape=[batch_size, -1, 1],
        dtype="float32",
        append_batch_size=False)
    label = layers.data(
        name="label",
        shape=[batch_size, 1],
        dtype="int64",
        append_batch_size=False)

    items_emb = layers.embedding(
        input=items,
        is_sparse=True,
        param_attr=fluid.ParamAttr(
            name="emb",
            learning_rate=rate,
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  #[batch_size, uniq_max, h]
    data_feed = [items, seq_index, last_index, adj_in, adj_out, mask, label]

    pre_state = items_emb
    for i in range(step):
        pre_state = layers.reshape(
            x=pre_state, shape=[batch_size, -1, hidden_size])
        state_in = layers.fc(
            input=pre_state,
            name="state_in",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  #[batch_size, uniq_max, h]
        state_out = layers.fc(
            input=pre_state,
            name="state_out",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  #[batch_size, uniq_max, h]

        state_adj_in = layers.matmul(adj_in,
                                     state_in)  #[batch_size, uniq_max, h]
        state_adj_out = layers.matmul(adj_out,
                                      state_out)  #[batch_size, uniq_max, h]

        gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)

        gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2])
        gru_fc = layers.fc(input=gru_input,
                           name="gru_fc",
                           size=3 * hidden_size,
                           bias_attr=False)
        pre_state, _, _ = fluid.layers.gru_unit(
            input=gru_fc,
            hidden=layers.reshape(
                x=pre_state, shape=[-1, hidden_size]),
            size=3 * hidden_size)

    final_state = pre_state
    seq_index = layers.reshape(seq_index, shape=[-1])
    seq = layers.gather(final_state, seq_index)  #[batch_size*-1(seq_max), h]
    last = layers.gather(final_state, last_index)  #[batch_size, h]

    seq = layers.reshape(
        seq, shape=[batch_size, -1, hidden_size])  #[batch_size, -1(seq_max), h]
    last = layers.reshape(
        last, shape=[batch_size, hidden_size])  #[batch_size, h]

    seq_fc = layers.fc(
        input=seq,
        name="seq_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, -1(seq_max), h]
    last_fc = layers.fc(input=last,
                        name="last_fc",
                        size=hidden_size,
                        bias_attr=False,
                        act=None,
                        num_flatten_dims=1,
                        param_attr=fluid.ParamAttr(
                            initializer=fluid.initializer.Uniform(
                                low=-stdv, high=stdv)))  #[batch_size, h]

    seq_fc_t = layers.transpose(
        seq_fc, perm=[1, 0, 2])  #[-1(seq_max), batch_size, h]
    add = layers.elementwise_add(seq_fc_t,
                                 last_fc)  #[-1(seq_max), batch_size, h]
    b = layers.create_parameter(
        shape=[hidden_size],
        dtype='float32',
        default_initializer=fluid.initializer.Constant(value=0.0))  #[h]
    add = layers.elementwise_add(add, b)  #[-1(seq_max), batch_size, h]

    add_sigmoid = layers.sigmoid(add)  #[-1(seq_max), batch_size, h] 
    add_sigmoid = layers.transpose(
        add_sigmoid, perm=[1, 0, 2])  #[batch_size, -1(seq_max), h]

    weight = layers.fc(input=add_sigmoid,
                       name="weight_fc",
                       size=1,
                       act=None,
                       num_flatten_dims=2,
                       bias_attr=False,
                       param_attr=fluid.ParamAttr(
                           initializer=fluid.initializer.Uniform(
                               low=-stdv, high=stdv)))  #[batch_size, -1, 1]
    weight *= mask
    weight_mask = layers.elementwise_mul(seq, weight, axis=0)
    global_attention = layers.reduce_sum(weight_mask, dim=1)

    final_attention = layers.concat(
        [global_attention, last], axis=1)  #[batch_size, 2*h]
    final_attention_fc = layers.fc(
        input=final_attention,
        name="fina_attention_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, h]

    all_vocab = layers.create_global_var(
        shape=[items_num - 1, 1],
        value=0,
        dtype="int64",
        persistable=True,
        name="all_vocab")

    all_emb = layers.embedding(
        input=all_vocab,
        is_sparse=True,
        param_attr=fluid.ParamAttr(
            name="emb",
            learning_rate=rate,
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  #[all_vocab, h]

    logits = layers.matmul(
        x=final_attention_fc, y=all_emb,
        transpose_y=True)  #[batch_size, all_vocab]
    softmax = layers.softmax_with_cross_entropy(
        logits=logits, label=label)  #[batch_size, 1]
    loss = layers.reduce_mean(softmax)  # [1]
    #fluid.layers.Print(loss)
    acc = layers.accuracy(input=logits, label=label, k=20)
    return loss, acc, data_feed, [items_emb, all_emb]
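The last part of the network computes a masked, attention-weighted sum over the session sequence and concatenates it with the last item's state. A NumPy sketch of that readout with illustrative shapes; the last-item indices are made up:

import numpy as np

batch_size, seq_max, h = 2, 4, 8
seq = np.random.rand(batch_size, seq_max, h)
weight = np.random.rand(batch_size, seq_max, 1)
mask = np.array([[1., 1., 1., 0.],
                 [1., 1., 0., 0.]])[:, :, None]    # 0 marks padded positions

weight = weight * mask                             # drop padded positions
global_attention = (seq * weight).sum(axis=1)      # [batch_size, h]
last = seq[np.arange(batch_size), [2, 1]]          # last real item per session (assumed indices)
final_attention = np.concatenate([global_attention, last], axis=1)  # [batch_size, 2*h]
print(final_attention.shape)  # (2, 16)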
Example #22
    def decode(self,
               dec_input,
               enc_words_output,
               enc_sents_output,
               caches=None,
               gather_idx=None):
        """Decoding to generate output text"""

        trg_word, trg_pos, trg_slf_attn_bias, trg_src_words_attn_bias, \
        trg_src_sents_attn_bias, graph_attn_bias = dec_input

        dec_res = self._gen_dec_input(trg_word, trg_pos, trg_slf_attn_bias,
                                      trg_src_words_attn_bias,
                                      trg_src_sents_attn_bias, graph_attn_bias)

        emb_out, trg_slf_attn_bias, trg_src_words_attn_bias, trg_src_sents_attn_bias, graph_attn_bias = \
            dec_res.emb_out, dec_res.trg_slf_attn_bias, dec_res.trg_src_words_attn_bias, \
            dec_res.trg_src_sents_attn_bias, dec_res.graph_attn_bias

        # (batch_size, tgt_len, emb_dim)
        dec_output = graph_decoder(
            dec_input=emb_out,  # (batch_size, tgt_len, emb_dim)
            enc_words_output=
            enc_words_output,  # (batch_size, n_blocks, n_tokens, emb_dim)
            enc_sents_output=enc_sents_output,  # (batch_size, n_blocks, emb_dim)
            dec_slf_attn_bias=
            trg_slf_attn_bias,  # (batch_size, n_head, tgt_len, tgt_len)
            dec_enc_words_attn_bias=
            trg_src_words_attn_bias,  # (batch_size, n_blocks, n_head, tgt_len, n_tokens)
            dec_enc_sents_attn_bias=
            trg_src_sents_attn_bias,  # (batch_size, n_head, tgt_len, n_blocks)
            graph_attn_bias=
            graph_attn_bias,  # (batch_size, n_head, n_blocks, n_blocks)
            pos_win=self.pos_win,
            n_layer=self._dec_n_layer,
            n_head=self._n_head,
            d_key=self._emb_size // self._n_head,
            d_value=self._emb_size // self._n_head,
            d_model=self._emb_size,
            d_inner_hid=self._emb_size * 4,
            prepostprocess_dropout=self._prepostprocess_dropout,
            attention_dropout=self._attention_dropout,
            relu_dropout=self._prepostprocess_dropout,
            hidden_act=self._hidden_act,
            preprocess_cmd=self._preprocess_command,
            postprocess_cmd=self._postprocess_command,
            param_initializer=self._param_initializer,
            caches=caches,
            gather_idx=gather_idx,
            name='graph_decoder')

        # Reshape to 2D tensor to use GEMM instead of BatchedGEMM
        # (batch_size*tgt_len, emb_dim)
        dec_output = layers.reshape(dec_output,
                                    shape=[-1, self._emb_size],
                                    inplace=True)

        if self._dtype == "float16":
            dec_output = fluid.layers.cast(x=dec_output, dtype=self._emb_dtype)

        if self._weight_sharing:
            out = layers.matmul(
                x=dec_output,
                y=fluid.default_main_program().global_block().var(
                    self._word_emb_name),
                transpose_y=True)
            bias = layers.create_parameter(
                shape=[self.voc_size],
                dtype=self._emb_dtype,
                attr=fluid.ParamAttr(
                    name='generator.bias',
                    initializer=fluid.initializer.Constant(value=0.0)),
                is_bias=True)
            predict = layers.elementwise_add(x=out, y=bias, axis=-1)
        else:
            predict = layers.fc(
                input=dec_output,
                size=self.voc_size,
                param_attr=fluid.ParamAttr(
                    name="generator.w",
                    initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
                bias_attr=fluid.ParamAttr(
                    name='generator.bias',
                    initializer=fluid.initializer.Constant(value=0.0)))

        return predict
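When weight sharing is enabled, the decoder output is projected back to the vocabulary with the transposed word-embedding matrix rather than a separate FC layer. A small NumPy sketch of the tied projection with illustrative sizes:

import numpy as np

num_tokens, emb_size, voc_size = 6, 8, 100       # (batch_size * tgt_len), emb_dim, vocab
dec_output = np.random.rand(num_tokens, emb_size)
word_emb = np.random.rand(voc_size, emb_size)    # the same matrix used for input embeddings
bias = np.zeros(voc_size)

predict = dec_output @ word_emb.T + bias         # [num_tokens, voc_size]
print(predict.shape)  # (6, 100)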
Example #23
    def encoder_static(input_embedding,
                       len=3,
                       init_hidden=None,
                       init_cell=None):

        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(init_hidden,
                                      axes=[0],
                                      starts=[i],
                                      ends=[i + 1])
            pre_cell = layers.slice(init_cell,
                                    axes=[0],
                                    starts=[i],
                                    ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden,
                                        shape=[-1, hidden_size],
                                        inplace=True)
            pre_cell = layers.reshape(pre_cell,
                                      shape=[-1, hidden_size],
                                      inplace=True)
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = layers.split(input_embedding,
                                     num_or_sections=len,
                                     dim=1)

        for index in range(len):
            input = sliced_inputs[index]
            input = layers.reshape(input,
                                   shape=[-1, hidden_size],
                                   inplace=True)
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i, j, f, o = layers.split(gate_input,
                                          num_or_sections=4,
                                          dim=-1)

                try:
                    from paddle.fluid.contrib.layers import fused_elemwise_activation
                    # fluid.contrib.layers.fused_elemwise_activation can do a fused
                    # operation, like:
                    # 1) x + sigmoid(y); x + tanh(y)
                    # 2) tanh(x + y)
                    # Currently the set of unary operations supported in this fused op
                    # is limited; we will extend it to support more unary operations and
                    # do this kind of fusion automatically in future versions of paddle.fluid.
                    # layers.sigmoid(i) * layers.tanh(j)
                    tmp0 = fused_elemwise_activation(
                        x=layers.tanh(j),
                        y=i,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                    # pre_cell * layers.sigmoid(f)
                    tmp1 = fused_elemwise_activation(
                        x=pre_cell,
                        y=f,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                    c = tmp0 + tmp1
                    # layers.tanh(c) * layers.sigmoid(o)
                    m = fused_elemwise_activation(
                        x=layers.tanh(c),
                        y=o,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                except ImportError:
                    c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                        i) * layers.tanh(j)
                    m = layers.tanh(c) * layers.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            res.append(input)

        last_hidden = layers.concat(hidden_array, 1)
        last_hidden = layers.reshape(last_hidden,
                                     shape=[-1, num_layers, hidden_size],
                                     inplace=True)
        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = layers.concat(cell_array, 1)
        last_cell = layers.reshape(last_cell,
                                   shape=[-1, num_layers, hidden_size])
        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = layers.concat(res, 0)
        real_res = layers.reshape(real_res,
                                  shape=[len, -1, hidden_size],
                                  inplace=True)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell
Example #24
def encoder(x,
            y,
            vocab_size,
            emb_size,
            init_hidden=None,
            init_cell=None,
            para_name='',
            custom_samples=None,
            custom_probabilities=None,
            test_mode=False,
            args=None):
    x_emb = layers.embedding(input=x,
                             size=[vocab_size, emb_size],
                             dtype='float32',
                             is_sparse=False,
                             param_attr=fluid.ParamAttr(name='embedding_para'))
    rnn_input = x_emb
    rnn_outs = []
    rnn_outs_ori = []
    cells = []
    projs = []
    for i in range(args.num_layers):
        rnn_input = dropout(rnn_input, test_mode, args)
        if init_hidden is not None and init_cell is not None:
            h0 = layers.squeeze(layers.slice(init_hidden,
                                             axes=[0],
                                             starts=[i],
                                             ends=[i + 1]),
                                axes=[0])
            c0 = layers.squeeze(layers.slice(init_cell,
                                             axes=[0],
                                             starts=[i],
                                             ends=[i + 1]),
                                axes=[0])
        else:
            h0 = c0 = None
        rnn_out, cell, input_proj = lstmp_encoder(
            rnn_input, args.hidden_size, h0, c0,
            para_name + 'layer{}'.format(i + 1), emb_size, test_mode, args)
        rnn_out_ori = rnn_out
        if i > 0:
            rnn_out = rnn_out + rnn_input
        rnn_out = dropout(rnn_out, test_mode, args)
        cell = dropout(cell, test_mode, args)
        rnn_outs.append(rnn_out)
        rnn_outs_ori.append(rnn_out_ori)
        rnn_input = rnn_out
        cells.append(cell)
        projs.append(input_proj)

    softmax_weight = layers.create_parameter([vocab_size, emb_size],
                                             dtype="float32",
                                             name="softmax_weight")
    softmax_bias = layers.create_parameter([vocab_size],
                                           dtype="float32",
                                           name='softmax_bias')
    projection = layers.matmul(rnn_outs[-1], softmax_weight, transpose_y=True)
    projection = layers.elementwise_add(projection, softmax_bias)

    projection = layers.reshape(projection, shape=[-1, vocab_size])

    if args.sample_softmax and (not test_mode):
        loss = layers.sampled_softmax_with_cross_entropy(
            logits=projection,
            label=y,
            num_samples=args.n_negative_samples_batch,
            seed=args.random_seed)
    else:
        label = layers.one_hot(input=y, depth=vocab_size)
        loss = layers.softmax_with_cross_entropy(logits=projection,
                                                 label=label,
                                                 soft_label=True)
    return [x_emb, projection, loss], rnn_outs, rnn_outs_ori, cells, projs
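
The non-sampled branch above projects the top-layer output through a [vocab_size, emb_size] weight used in transposed form and then applies softmax cross-entropy. A minimal NumPy sketch of that computation (names are illustrative only):

import numpy as np

def full_softmax_loss(hidden, softmax_weight, softmax_bias, label_ids):
    # hidden: [batch, emb_size]; softmax_weight: [vocab_size, emb_size] (used transposed, as above)
    logits = hidden @ softmax_weight.T + softmax_bias           # [batch, vocab_size]
    logits = logits - logits.max(axis=1, keepdims=True)         # numerical stability
    probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
    # cross-entropy against integer labels (implicitly one-hot)
    return -np.log(probs[np.arange(len(label_ids)), label_ids] + 1e-9)
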
示例#25
0
def lm_model(hidden_size,
             vocab_size,
             batch_size,
             num_layers=2,
             num_steps=20,
             init_scale=0.1,
             dropout=None,
             rnn_model='static',
             use_dataloader=False):
    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(init_hidden,
                                      axes=[0],
                                      starts=[i],
                                      ends=[i + 1])
            pre_cell = layers.slice(init_cell,
                                    axes=[0],
                                    starts=[i],
                                    ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
        rnn = PaddingRNN()

        with rnn.step():
            input = rnn.step_input(input_embedding)
            for k in range(num_layers):
                pre_hidden = rnn.memory(init=hidden_array[k])
                pre_cell = rnn.memory(init=cell_array[k])
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[0],
                                 ends=[hidden_size])
                j = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size],
                                 ends=[hidden_size * 2])
                f = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size * 2],
                                 ends=[hidden_size * 3])
                o = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size * 3],
                                 ends=[hidden_size * 4])

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                rnn.update_memory(pre_hidden, m)
                rnn.update_memory(pre_cell, c)

                rnn.step_output(m)
                rnn.step_output(c)

                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            rnn.step_output(input)
        rnnout = rnn()

        last_hidden_array = []
        last_cell_array = []
        real_res = rnnout[-1]
        for i in range(num_layers):
            m = rnnout[i * 2]
            c = rnnout[i * 2 + 1]
            m.stop_gradient = True
            c.stop_gradient = True
            last_h = layers.slice(m,
                                  axes=[0],
                                  starts=[num_steps - 1],
                                  ends=[num_steps])
            last_hidden_array.append(last_h)
            last_c = layers.slice(c,
                                  axes=[0],
                                  starts=[num_steps - 1],
                                  ends=[num_steps])
            last_cell_array.append(last_c)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
        last_hidden = layers.concat(last_hidden_array, 0)
        last_cell = layers.concat(last_cell_array, 0)

        return real_res, last_hidden, last_cell

    def encoder_static(input_embedding,
                       len=3,
                       init_hidden=None,
                       init_cell=None):

        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(init_hidden,
                                      axes=[0],
                                      starts=[i],
                                      ends=[i + 1])
            pre_cell = layers.slice(init_cell,
                                    axes=[0],
                                    starts=[i],
                                    ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden,
                                        shape=[-1, hidden_size],
                                        inplace=True)
            pre_cell = layers.reshape(pre_cell,
                                      shape=[-1, hidden_size],
                                      inplace=True)
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = layers.split(input_embedding,
                                     num_or_sections=len,
                                     dim=1)

        for index in range(len):
            input = sliced_inputs[index]
            input = layers.reshape(input,
                                   shape=[-1, hidden_size],
                                   inplace=True)
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i, j, f, o = layers.split(gate_input,
                                          num_or_sections=4,
                                          dim=-1)

                try:
                    from paddle.fluid.contrib.layers import fused_elemwise_activation
                    # fluid.contrib.layers.fused_elemwise_activation can do a fused
                    # operation, like:
                    # 1) x + sigmoid(y); x + tanh(y)
                    # 2) tanh(x + y)
                    # For now, the set of unary operations supported by this fused op is limited,
                    # and we will extend it to support more unary operations and perform this kind
                    # of fusion automatically in a future version of paddle.fluid.
                    # layers.sigmoid(i) * layers.tanh(j)
                    tmp0 = fused_elemwise_activation(
                        x=layers.tanh(j),
                        y=i,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                    # pre_cell * layers.sigmoid(f)
                    tmp1 = fused_elemwise_activation(
                        x=pre_cell,
                        y=f,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                    c = tmp0 + tmp1
                    # layers.tanh(c) * layers.sigmoid(o)
                    m = fused_elemwise_activation(
                        x=layers.tanh(c),
                        y=o,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                except ImportError:
                    c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                        i) * layers.tanh(j)
                    m = layers.tanh(c) * layers.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            res.append(input)

        last_hidden = layers.concat(hidden_array, 1)
        last_hidden = layers.reshape(last_hidden,
                                     shape=[-1, num_layers, hidden_size],
                                     inplace=True)
        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = layers.concat(cell_array, 1)
        last_cell = layers.reshape(last_cell,
                                   shape=[-1, num_layers, hidden_size])
        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = layers.concat(res, 0)
        real_res = layers.reshape(real_res,
                                  shape=[len, -1, hidden_size],
                                  inplace=True)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell

    batch_size_each = batch_size // fluid.core.get_cuda_device_count()
    x = fluid.data(name="x",
                   shape=[batch_size_each, num_steps, 1],
                   dtype='int64')
    y = fluid.data(name="y",
                   shape=[batch_size_each * num_steps, 1],
                   dtype='int64')

    if use_dataloader:
        dataloader = fluid.io.DataLoader.from_generator(feed_list=[x, y],
                                                        capacity=16,
                                                        iterable=False,
                                                        use_double_buffer=True)

    init_hidden = fluid.data(name="init_hidden",
                             shape=[num_layers, batch_size_each, hidden_size],
                             dtype='float32')
    init_cell = fluid.data(name="init_cell",
                           shape=[num_layers, batch_size_each, hidden_size],
                           dtype='float32')

    init_cell.persistable = True
    init_hidden.persistable = True

    init_hidden_reshape = layers.reshape(init_hidden,
                                         shape=[num_layers, -1, hidden_size])
    init_cell_reshape = layers.reshape(init_cell,
                                       shape=[num_layers, -1, hidden_size])

    x_emb = layers.embedding(
        input=x,
        size=[vocab_size, hidden_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(
            name='embedding_para',
            initializer=fluid.initializer.UniformInitializer(low=-init_scale,
                                                             high=init_scale)))

    x_emb = layers.reshape(x_emb,
                           shape=[-1, num_steps, hidden_size],
                           inplace=True)
    if dropout is not None and dropout > 0.0:
        x_emb = layers.dropout(x_emb,
                               dropout_prob=dropout,
                               dropout_implementation='upscale_in_train')

    if rnn_model == "padding":
        rnn_out, last_hidden, last_cell = padding_rnn(
            x_emb,
            len=num_steps,
            init_hidden=init_hidden_reshape,
            init_cell=init_cell_reshape)
    elif rnn_model == "static":
        rnn_out, last_hidden, last_cell = encoder_static(
            x_emb,
            len=num_steps,
            init_hidden=init_hidden_reshape,
            init_cell=init_cell_reshape)
    elif rnn_model == "cudnn":
        x_emb = layers.transpose(x_emb, perm=[1, 0, 2])
        rnn_out, last_hidden, last_cell = layers.lstm(
            x_emb,
            init_hidden_reshape,
            init_cell_reshape,
            num_steps,
            hidden_size,
            num_layers,
            is_bidirec=False,
            default_initializer=fluid.initializer.UniformInitializer(
                low=-init_scale, high=init_scale))
        rnn_out = layers.transpose(rnn_out, perm=[1, 0, 2])
    elif rnn_model == "basic_lstm":
        rnn_out, last_hidden, last_cell = basic_lstm( x_emb, init_hidden, init_cell, hidden_size, \
                num_layers=num_layers, batch_first=True, dropout_prob=dropout, \
                param_attr = ParamAttr( initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale) ), \
                bias_attr = ParamAttr( initializer = fluid.initializer.Constant(0.0) ), \
                forget_bias = 0.0)
    else:
        print("type not support")
        return

    rnn_out = layers.reshape(rnn_out,
                             shape=[-1, num_steps, hidden_size],
                             inplace=True)

    softmax_weight = layers.create_parameter(
        [hidden_size, vocab_size],
        dtype="float32",
        name="softmax_weight",
        default_initializer=fluid.initializer.UniformInitializer(
            low=-init_scale, high=init_scale))
    softmax_bias = layers.create_parameter(
        [vocab_size],
        dtype="float32",
        name='softmax_bias',
        default_initializer=fluid.initializer.UniformInitializer(
            low=-init_scale, high=init_scale))

    projection = layers.matmul(rnn_out, softmax_weight)
    projection = layers.elementwise_add(projection, softmax_bias)
    projection = layers.reshape(projection,
                                shape=[-1, vocab_size],
                                inplace=True)

    loss = layers.softmax_with_cross_entropy(logits=projection,
                                             label=y,
                                             soft_label=False)

    loss = layers.reshape(loss, shape=[-1, num_steps], inplace=True)
    loss = layers.reduce_mean(loss, dim=[0])
    loss = layers.reduce_sum(loss)

    loss.persistable = True
    last_cell.persistable = True
    last_hidden.persistable = True

    # This will feed last_hidden, last_cell to init_hidden, init_cell, which
    # can be used directly in next batch. This can avoid the fetching of
    # last_hidden and last_cell and feeding of init_hidden and init_cell in
    # each training step.
    layers.assign(input=last_cell, output=init_cell)
    layers.assign(input=last_hidden, output=init_hidden)

    feeding_list = ['x', 'y', 'init_hidden', 'init_cell']
    if use_dataloader:
        return loss, last_hidden, last_cell, feeding_list, dataloader
    else:
        return loss, last_hidden, last_cell, feeding_list
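
The loss at the end of lm_model is reshaped to [batch, num_steps], averaged over the batch dimension, and summed over the time steps. A minimal NumPy sketch of that reduction (names are illustrative):

import numpy as np

def lm_loss_reduce(per_token_loss, num_steps):
    # per_token_loss: [batch * num_steps, 1] cross-entropy values, as produced above
    loss = per_token_loss.reshape(-1, num_steps)   # [batch, num_steps]
    loss = loss.mean(axis=0)                       # average over the batch, per step
    return loss.sum()                              # sum over the time steps
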
示例#26
0
    def __call__(
            self,
            predictions,
            labels_pos_mask,  # Shape: [batch_size, 19248, 1]
            labels_neg_mask,  # Shape: [batch_size, 19248, 1]
            labels_allboxes_vector,  # Shape: [batch_size, 19248, 8]
            segment_t,  # list  Shape: [batch_size, 19248, 1]
            label_masks,
            labels_best_truth_idx,
            labels_pos_index,
            labels_pos_cid,  #  Shape: [batch_size, 19248]
            labels_pos_cid2,  #  Shape: [batch_size, 19248]
            priors,
            class_vectors,
            batch_size,
            use_maskiou=True,
            use_ce_loss=True,
            use_ghm_c_loss=False,
            use_focal_loss=False,
            use_ohem_loss=False):

        pred_allboxes_encode_x0y0x1y1 = predictions['loc']  # Shape: [batch_size, 19248, 4]
        pred_allboxes_conf = predictions['conf']  # Shape: [batch_size, 19248, 1+80]
        pred_allboxes_mask_coef = predictions['mask']  # Shape: [batch_size, 19248, num_prototypes=32]
        pred_proto = predictions['proto']  # Shape: [batch_size, s4=138, s4=138, num_prototypes=32]
        pred_segm = predictions['segm']  # Shape: [batch_size, num_classes=80, s8=69, s8=69]

        labels_allboxes_x0y0x1y1 = labels_allboxes_vector[:, :, 0:4]  # Shape: [batch_size, 19248, 4]
        labels_allboxes_decode_x0y0x1y1 = labels_allboxes_vector[:, :, 4:8]  # Shape: [batch_size, 19248, 4]

        losses = {}

        # 1. bbox_loss, computed only for positive samples.
        # bbox_alpha = 1.5
        # bbox_loss = P.smooth_l1(P.reshape(pred_allboxes_encode_x0y0x1y1, (-1, 4)), P.reshape(labels_allboxes_x0y0x1y1, (-1, 4)))
        # bbox_loss = P.reshape(labels_pos_mask, (-1, 1)) * bbox_loss
        # bbox_loss = P.reduce_sum(bbox_loss) * bbox_alpha
        # losses['B'] = bbox_loss

        # 1. bbox_loss, implemented as ciou_loss
        pred_x0y0x1y1 = []
        for idx in range(batch_size):
            temp = decode(pred_allboxes_encode_x0y0x1y1[idx], priors)
            pred_x0y0x1y1.append(temp)
        pred_x0y0x1y1 = P.concat(pred_x0y0x1y1,
                                 axis=0)  # Shape: [batch_size*num_priors, 4]
        pred_x0y0x1y1 = P.reshape(
            pred_x0y0x1y1,
            (batch_size, -1, 4))  # Shape: [batch_size, num_priors, 4]

        ciou = P.reshape(
            self.bbox_ciou(pred_x0y0x1y1, labels_allboxes_decode_x0y0x1y1),
            (batch_size, -1, 1))  # (batch_size, num_priors, 1)

        # weight of each predicted box's ciou_loss = 2 - (ground-truth area / image area)
        gt_area = (labels_allboxes_decode_x0y0x1y1[:, :, 2:3] - labels_allboxes_decode_x0y0x1y1[:, :, 0:1]) * \
                  (labels_allboxes_decode_x0y0x1y1[:, :, 3:4] - labels_allboxes_decode_x0y0x1y1[:, :, 1:2])
        bbox_loss_scale = 2.0 - gt_area
        ciou_loss = labels_pos_mask * bbox_loss_scale * (1 - ciou)
        bbox_alpha = 1.5
        ciou_loss = P.reduce_sum(ciou_loss) * bbox_alpha
        losses['B'] = ciou_loss

        # 2. mask_loss, computed only for positive samples
        mask_h = P.shape(pred_proto)[1]
        mask_w = P.shape(pred_proto)[2]
        loss_m = 0
        maskiou_t_list = []
        maskiou_net_input_list = []
        label_t_list = []
        for idx in range(batch_size):
            # e.g. [[0], [0], [0], [0], [0], [0], [0], [0]]: pick out, for the 8 positive samples,
            # the index of their best-matching gt (the index into label_x0y0x1y1cid[idx]).
            # Since there is only one gt, all indices are 0.
            labels_pos_index[idx].stop_gradient = True
            cur_gt = P.gather(labels_best_truth_idx[idx],
                              labels_pos_index[idx])  # (?, 1)
            cur_gt.stop_gradient = True
            cur_x0y0x1y1 = P.gather(labels_allboxes_decode_x0y0x1y1[idx],
                                    labels_pos_index[idx])  # (?, 4)

            proto_masks = pred_proto[idx]  # (138, 138, 32)
            # pred_mask_coef (batch_size, 19248, 32): gather the mask coefficients predicted for the 8 positive samples.
            proto_coef = P.gather(pred_allboxes_mask_coef[idx],
                                  labels_pos_index[idx])  # (?, 32)

            # (?, 138, 138): gather the ground-truth masks of the gt matched by the 8 positive samples.
            # Since they all match the same gt, it is the same mask repeated 8 times.
            mask_t = P.gather(label_masks[idx], cur_gt)  # (?, 138, 138)
            # (?, ): gather the ground-truth cid of the gt matched by the 8 positive samples.
            # Since they all match the same gt, it is the same cid repeated 8 times.
            label_t = P.gather(labels_pos_cid[idx],
                               labels_pos_index[idx])  # (?, )

            # Size: (138, 138, ?) = prototypes * coefficients^T
            pred_masks = P.matmul(proto_masks, proto_coef, transpose_y=True)
            pred_masks = P.sigmoid(pred_masks)  # sigmoid activation

            pred_masks = crop(pred_masks, cur_x0y0x1y1)
            pred_masks = P.transpose(pred_masks, perm=[2, 0, 1])

            # binary cross-entropy; a tiny constant is added to avoid nan
            masks_pos_loss = mask_t * (0 - P.log(pred_masks + 1e-9))
            masks_neg_loss = (1 - mask_t) * (0 - P.log(1 - pred_masks + 1e-9))
            pre_loss = (masks_pos_loss + masks_neg_loss)
            pre_loss = P.reduce_sum(pre_loss, dim=[1, 2])

            # the smaller the gt box area, the larger the weight of the corresponding mask loss
            cur_cxcywh = center_size(cur_x0y0x1y1)
            gt_box_width = cur_cxcywh[:, 2]
            gt_box_height = cur_cxcywh[:, 3]
            pre_loss = pre_loss / (gt_box_width * gt_box_height)
            loss_m += P.reduce_sum(pre_loss)

            if use_maskiou:
                # masks in mask_t with area <= 5*5 would normally be discarded
                # discard_mask_area = 5*5
                '''
                The GPU build of paddlepaddle 1.6.2 has a problem: if `select` is [None] and is then used
                inside gather(), it fails with "cudaGetLastError  invalid configuration argument errno: 9".
                The CPU build runs fine. To avoid this, `select` must not be [None], so no area filtering
                is done here and all of mask_t is kept.
                '''
                discard_mask_area = -1
                gt_mask_area = P.reduce_sum(mask_t, dim=[1, 2])
                gt_mask_area.stop_gradient = True
                select = P.where(gt_mask_area > discard_mask_area)
                select.stop_gradient = True
                pred_masks = P.gather(pred_masks, select)
                mask_t = P.gather(mask_t, select)
                label_t = P.gather(label_t, select)
                label_t.stop_gradient = True

                maskiou_net_input = P.reshape(
                    pred_masks, (P.shape(pred_masks)[0], 1, mask_h, mask_w))
                pred_masks = P.cast(pred_masks > 0.5, 'float32')  # binarize (round to 0/1)
                maskiou_t = self._mask_iou(pred_masks, mask_t)  # (8, )
                maskiou_net_input_list.append(
                    maskiou_net_input)  # (8, 1, 138, 138)
                maskiou_t_list.append(maskiou_t)  # (8, )
                label_t_list.append(label_t)  # (8, )
        mask_alpha = 6.125
        losses['M'] = loss_m * mask_alpha / mask_h / mask_w

        # remaining part
        if use_maskiou:
            maskiou_net_input = P.concat(maskiou_net_input_list, axis=0)  # (21, 1, 138, 138)  masks predicted for the 21 positive samples
            maskiou_t = P.concat(maskiou_t_list, axis=0)  # (21, )  iou between the predicted and ground-truth masks of the 21 positives
            label_t = P.concat(label_t_list, axis=0)  # (21, )  cid of the 21 positive samples
            label_t.stop_gradient = True  # because it is an integer tensor?
            maskiou_targets = [maskiou_net_input, maskiou_t, label_t]

        # 3. conf_loss.
        conf_alpha = 1.0
        if use_ce_loss:
            conf_loss = self.ce_conf_loss(pred_allboxes_conf, labels_pos_mask,
                                          labels_neg_mask, class_vectors,
                                          labels_pos_cid2, gt_area)
        elif use_ghm_c_loss:
            conf_loss = self.ghm_c_loss(pred_allboxes_conf, labels_pos_mask,
                                        labels_neg_mask, class_vectors,
                                        labels_pos_cid2)
        elif use_focal_loss:
            conf_loss = self.focal_conf_loss(pred_allboxes_conf,
                                             labels_pos_mask, labels_neg_mask,
                                             class_vectors, labels_pos_cid2)
        elif use_ohem_loss:
            conf_loss = self.ohem_conf_loss(pred_allboxes_conf, batch_size,
                                            labels_neg_mask, labels_pos_mask,
                                            labels_pos_index, class_vectors,
                                            labels_pos_cid)
        losses['C'] = conf_loss * conf_alpha

        # 4. mask_iou_loss, computed only for positive samples.
        if use_maskiou:
            # maskiou_net_input  (21, 1, 138, 138)  masks predicted for the 21 positive samples
            # maskiou_t          (21, )             iou between the predicted and ground-truth masks of the 21 positives
            # label_t            (21, )             cid of the 21 positive samples
            maskiou_net_input, maskiou_t, label_t = maskiou_targets
            maskiou_p = maskiou_net(maskiou_net_input, self.num_classes - 1)
            maskiou_p = P.reduce_max(maskiou_p, dim=[2, 3])  # max pooling  (21, 80)
            temp_mask = P.gather(class_vectors, label_t)  # mask  (21, 81)
            temp_mask = temp_mask[:, 1:]  # mask  (21, 80)
            maskiou_p = temp_mask * maskiou_p  # keep only the channel of the true class  (21, 80)
            maskiou_p = P.reduce_sum(maskiou_p, dim=1,
                                     keep_dim=True)  # (21, 1)
            loss_i = P.smooth_l1(
                maskiou_p, P.reshape(maskiou_t, (P.shape(maskiou_t)[0], 1)))
            maskiou_alpha = 25.0
            losses['I'] = maskiou_alpha * P.reduce_sum(loss_i)

        # 5. semantic_segmentation_loss, computed only for positive samples
        mask_h = P.shape(pred_segm)[2]
        mask_w = P.shape(pred_segm)[3]
        loss_s = 0.0
        for idx in range(batch_size):
            cur_segment = pred_segm[idx]  # (80, 69, 69)
            l = P.sigmoid_cross_entropy_with_logits(cur_segment,
                                                    segment_t[idx])
            loss_s += P.reduce_sum(l)

        semantic_segmentation_alpha = 1.0
        losses['S'] = loss_s / mask_h / mask_w * semantic_segmentation_alpha

        total_num_pos = P.cast(P.reduce_sum(labels_pos_mask), 'float32')
        for k in losses:
            if k not in ('S', ):
                losses[k] /= total_num_pos
            else:
                losses[k] /= batch_size
        total_loss = 0.0
        for k in losses:
            total_loss += losses[k]

        # Loss Key:
        #  - B: Box Localization Loss
        #  - M: Mask Loss
        #  - C: Class Confidence Loss
        #  - I: MaskIou Loss
        #  - S: Semantic Segmentation Loss
        # return losses['M'], losses['C']
        return losses, total_loss
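
The per-pixel mask loss above is a binary cross-entropy (with a small epsilon for numerical safety) whose per-instance sum is divided by the ground-truth box area, so smaller objects get larger weight. A minimal NumPy sketch of that step (names are illustrative):

import numpy as np

def mask_bce(pred_masks, mask_t, gt_box_width, gt_box_height, eps=1e-9):
    # pred_masks, mask_t: [num_pos, H, W]; pred_masks are already sigmoid-activated
    pos = mask_t * (0.0 - np.log(pred_masks + eps))                 # positive-pixel term
    neg = (1.0 - mask_t) * (0.0 - np.log(1.0 - pred_masks + eps))   # negative-pixel term
    per_instance = (pos + neg).sum(axis=(1, 2))
    # divide by the gt box area, as in the loss above
    return (per_instance / (gt_box_width * gt_box_height)).sum()
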
    def _forward(self, inputs, is_training):
        """ Real forward process of model in different mode(train/test). """
        outputs = {}

        src_token = inputs["src_token"]
        src_mask = inputs["src_mask"]
        src_pos = inputs["src_pos"]
        src_type = inputs["src_type"]
        src_turn = inputs["src_turn"]

        tgt_token = inputs["tgt_token"][:, :-1]
        tgt_mask = inputs["tgt_mask"][:, :-1]
        tgt_pos = inputs["tgt_pos"][:, :-1]
        tgt_type = inputs["tgt_type"][:, :-1]
        tgt_turn = inputs["tgt_turn"][:, :-1]

        input_mask = layers.concat([src_mask, tgt_mask], axis=1)
        input_mask.stop_gradient = True
        src_embed = self.embedder(src_token, src_pos, src_type, src_turn)
        tgt_embed = self.embedder(tgt_token, tgt_pos, tgt_type, tgt_turn)
        embed = layers.concat([src_embed, tgt_embed], axis=1)
        embed = self.embed_layer_norm(embed)

        batch_size = src_token.shape[0]
        src_len = src_token.shape[1]
        tgt_len = tgt_token.shape[1]

        if self.num_latent > 0:
            post_embed, post_probs, post_logits = self._posteriori_network(
                input_mask, embed, batch_size, src_len, tgt_len)
            outputs["post_logits"] = post_logits

            if self.use_discriminator:
                pos_probs, neg_probs = self._discriminator_network(
                    input_mask, embed, batch_size, src_len, tgt_len, post_embed)
                outputs["pos_probs"] = pos_probs
                outputs["neg_probs"] = neg_probs

            if is_training:
                z = F.gumbel_softmax(post_logits, self.tau)
            else:
                indices = layers.argmax(post_logits, axis=1)
                z = layers.one_hot(F.unsqueeze(indices, [1]), self.num_latent)
            latent_embeddings = self.latent_embeddings
            latent_embed = layers.matmul(z, latent_embeddings)
            outputs["latent_embed"] = latent_embed
        else:
            latent_embed = None

        latent_embed, dec_probs = self._generation_network(
            input_mask, embed, batch_size, src_len, tgt_len, latent_embed)
        outputs["dec_probs"] = dec_probs

        if self.num_latent > 0 and self.with_bow:
            if self.two_layer_predictor:
                latent_embed = self.pre_bow_predictor(latent_embed)
            bow_logits = self.bow_predictor(latent_embed)
            bow_probs = layers.softmax(bow_logits)
            outputs["bow_probs"] = bow_probs

        return outputs
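
At training time the latent code z is drawn with a Gumbel-Softmax relaxation of the posterior logits, while at inference the argmax category is one-hot encoded. A minimal NumPy sketch of one common form of the relaxation (an assumption about what F.gumbel_softmax does, not its actual implementation):

import numpy as np

def gumbel_softmax(logits, tau=1.0, eps=1e-9):
    # Sample Gumbel(0, 1) noise and apply the temperature-scaled softmax relaxation.
    u = np.random.uniform(eps, 1.0, size=logits.shape)
    gumbel = -np.log(-np.log(u))
    y = (logits + gumbel) / tau
    y = y - y.max(axis=-1, keepdims=True)          # numerical stability
    return np.exp(y) / np.exp(y).sum(axis=-1, keepdims=True)
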
示例#28
0
def wrap_decoder(trg_vocab_size,
                 max_length,
                 n_layer,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd,
                 postprocess_cmd,
                 weight_sharing,
                 dec_inputs=None,
                 enc_output=None,
                 caches=None,
                 gather_idx=None,
                 bos_idx=0):
    """
    The wrapper assembles together all needed layers for the decoder.
    """
    if dec_inputs is None:
        # This is used to implement independent decoder program in inference.
        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, enc_output = \
            make_all_inputs(decoder_data_input_fields)
    else:
        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs

    dec_input = prepare_decoder(trg_word,
                                trg_pos,
                                trg_vocab_size,
                                d_model,
                                max_length,
                                prepostprocess_dropout,
                                bos_idx=bos_idx,
                                word_emb_param_name="src_word_emb_table"
                                if weight_sharing else "trg_word_emb_table")
    dec_output = decoder(dec_input,
                         enc_output,
                         trg_slf_attn_bias,
                         trg_src_attn_bias,
                         n_layer,
                         n_head,
                         d_key,
                         d_value,
                         d_model,
                         d_inner_hid,
                         prepostprocess_dropout,
                         attention_dropout,
                         relu_dropout,
                         preprocess_cmd,
                         postprocess_cmd,
                         caches=caches,
                         gather_idx=gather_idx)
    # Reshape to 2D tensor to use GEMM instead of BatchedGEMM
    dec_output = layers.reshape(dec_output,
                                shape=[-1, dec_output.shape[-1]],
                                inplace=True)
    if weight_sharing:
        predict = layers.matmul(
            x=dec_output,
            y=fluid.default_main_program().global_block().var(
                "trg_word_emb_table"),
            transpose_y=True)
    else:
        predict = layers.fc(input=dec_output,
                            size=trg_vocab_size,
                            bias_attr=False)
    if dec_inputs is None:
        # Return probs for independent decoder program.
        predict = layers.softmax(predict)
    return predict
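
With weight_sharing enabled, the decoder output is projected onto the vocabulary by reusing the target word-embedding table in transposed form instead of a separate FC layer. A minimal NumPy sketch of that tied projection (names are illustrative):

import numpy as np

def tied_output_projection(dec_output, word_emb_table):
    # dec_output: [num_tokens, d_model]; word_emb_table: [vocab_size, d_model]
    # Reusing the (transposed) embedding table as the output projection ties
    # the input and output word representations.
    return dec_output @ word_emb_table.T           # [num_tokens, vocab_size]
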
示例#29
0
    def encoder_static(input_embedding,
                       len=3,
                       init_hidden=None,
                       init_cell=None):

        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter([hidden_size * 2, hidden_size*4], dtype="float32", name="fc_weight1_"+str(i), \
                    default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(init_hidden,
                                      axes=[0],
                                      starts=[i],
                                      ends=[i + 1])
            pre_cell = layers.slice(init_cell,
                                    axes=[0],
                                    starts=[i],
                                    ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden,
                                        shape=[-1, hidden_size],
                                        inplace=True)
            pre_cell = layers.reshape(pre_cell,
                                      shape=[-1, hidden_size],
                                      inplace=True)
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = layers.split(input_embedding,
                                     num_or_sections=len,
                                     dim=1)

        for index in range(len):
            input = sliced_inputs[index]
            input = layers.reshape(input,
                                   shape=[-1, hidden_size],
                                   inplace=True)
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i, j, f, o = layers.split(gate_input,
                                          num_or_sections=4,
                                          dim=-1)

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            res.append(input)

        last_hidden = layers.concat(hidden_array, 1)
        last_hidden = layers.reshape(last_hidden,
                                     shape=[-1, num_layers, hidden_size],
                                     inplace=True)
        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = layers.concat(cell_array, 1)
        last_cell = layers.reshape(last_cell,
                                   shape=[-1, num_layers, hidden_size])
        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = layers.concat(res, 0)
        real_res = layers.reshape(real_res,
                                  shape=[len, -1, hidden_size],
                                  inplace=True)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell
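
Each step above concatenates the layer input with the previous hidden state, projects through a single [2H, 4H] weight, and splits the result into the four gate pre-activations. A minimal NumPy sketch (names are illustrative):

import numpy as np

def split_lstm_gates(x, pre_hidden, weight_1, bias):
    # x, pre_hidden: [batch, hidden_size]; weight_1: [2 * hidden_size, 4 * hidden_size]
    nn = np.concatenate([x, pre_hidden], axis=1)   # [batch, 2 * hidden_size]
    gate_input = nn @ weight_1 + bias              # [batch, 4 * hidden_size]
    i, j, f, o = np.split(gate_input, 4, axis=1)   # four [batch, hidden_size] gate pre-activations
    return i, j, f, o
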
示例#30
0
    def forward(self):
        """Build the GATNE net.
        """
        param_attr_init = fluid.initializer.Uniform(
            low=-1.0, high=1.0, seed=np.random.randint(100))
        embed_param_attrs = fluid.ParamAttr(name='Base_node_embed',
                                            initializer=param_attr_init)

        # node_embeddings
        base_node_embed = fl.embedding(
            input=fl.reshape(self.train_inputs, shape=[-1, 1]),
            size=[self.num_nodes, self.embedding_size],
            param_attr=embed_param_attrs)

        node_features = []
        for edge_type in self.edge_types:
            param_attr_init = fluid.initializer.Uniform(
                low=-1.0, high=1.0, seed=np.random.randint(100))
            embed_param_attrs = fluid.ParamAttr(name='%s_node_embed' %
                                                edge_type,
                                                initializer=param_attr_init)

            features = fl.embedding(
                input=self.gw[edge_type].node_feat['index'],
                size=[self.num_nodes, self.embedding_u_size],
                param_attr=embed_param_attrs)

            node_features.append(features)

        # mp_output: list of embedding(self.num_nodes, dim)
        mp_output = self.message_passing(self.gw, self.edge_types,
                                         node_features)

        # U : (num_type[m], num_nodes, dim[s])
        node_type_embed = fl.stack(mp_output, axis=0)

        # U : (num_nodes, num_type[m], dim[s])
        node_type_embed = fl.transpose(node_type_embed, perm=[1, 0, 2])

        # gather node_type_embed from train_inputs
        node_type_embed = fl.gather(node_type_embed, self.train_inputs)

        # M_r
        trans_weights = fl.create_parameter(
            shape=[
                self.edge_type_count, self.embedding_u_size,
                self.embedding_size // self.att_head
            ],
            attr=fluid.initializer.TruncatedNormalInitializer(
                loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)),
            dtype='float32',
            name='trans_w')

        # W_r
        trans_weights_s1 = fl.create_parameter(
            shape=[self.edge_type_count, self.embedding_u_size, self.dim_a],
            attr=fluid.initializer.TruncatedNormalInitializer(
                loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)),
            dtype='float32',
            name='trans_w_s1')

        # w_r
        trans_weights_s2 = fl.create_parameter(
            shape=[self.edge_type_count, self.dim_a, self.att_head],
            attr=fluid.initializer.TruncatedNormalInitializer(
                loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)),
            dtype='float32',
            name='trans_w_s2')

        trans_w = fl.gather(trans_weights, self.train_types)
        trans_w_s1 = fl.gather(trans_weights_s1, self.train_types)
        trans_w_s2 = fl.gather(trans_weights_s2, self.train_types)

        attention = self.attention(node_type_embed, trans_w_s1, trans_w_s2)
        node_type_embed = fl.matmul(attention, node_type_embed)
        node_embed = base_node_embed + fl.reshape(
            fl.matmul(node_type_embed, trans_w), [-1, self.embedding_size])

        self.last_node_embed = fl.l2_normalize(node_embed, axis=1)

        nce_weight_initializer = fluid.initializer.TruncatedNormalInitializer(
            loc=0.0, scale=1.0 / math.sqrt(self.embedding_size))
        nce_weight_attrs = fluid.ParamAttr(name='nce_weight',
                                           initializer=nce_weight_initializer)

        weight_pos = fl.embedding(input=self.train_labels,
                                  size=[self.num_nodes, self.embedding_size],
                                  param_attr=nce_weight_attrs)
        weight_neg = fl.embedding(input=self.train_negs,
                                  size=[self.num_nodes, self.embedding_size],
                                  param_attr=nce_weight_attrs)
        tmp_node_embed = fl.unsqueeze(self.last_node_embed, axes=[1])
        pos_logits = fl.matmul(tmp_node_embed, weight_pos,
                               transpose_y=True)  # [B, 1, 1]

        neg_logits = fl.matmul(tmp_node_embed, weight_neg,
                               transpose_y=True)  # [B, 1, neg_num]

        pos_score = fl.squeeze(pos_logits, axes=[1])
        pos_score = fl.clip(pos_score, min=-10, max=10)
        pos_score = -1.0 * fl.logsigmoid(pos_score)

        neg_score = fl.squeeze(neg_logits, axes=[1])
        neg_score = fl.clip(neg_score, min=-10, max=10)
        neg_score = -1.0 * fl.logsigmoid(-1.0 * neg_score)

        neg_score = fl.reduce_sum(neg_score, dim=1, keep_dim=True)
        self.loss = fl.reduce_mean(pos_score + neg_score)
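
The training objective above is a skip-gram style negative-sampling loss: clipped dot-product scores go through logsigmoid, with positive pairs attracted and sampled negatives repelled. A minimal NumPy sketch (names are illustrative):

import numpy as np

def logsigmoid(x):
    return -np.log1p(np.exp(-x))

def neg_sampling_loss(pos_score, neg_score, clip=10.0):
    # pos_score: [batch, 1]; neg_score: [batch, neg_num]; raw dot-product scores
    pos_score = np.clip(pos_score, -clip, clip)
    neg_score = np.clip(neg_score, -clip, clip)
    pos_loss = -logsigmoid(pos_score)                                # pull positives together
    neg_loss = (-logsigmoid(-neg_score)).sum(axis=1, keepdims=True)  # push negatives apart
    return (pos_loss + neg_loss).mean()
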
 def forward(self,
             query,
             key,
             value,
             attn_mask=None,
             use_cache=False,
             cache=None):
     """
     Applies multi-head attention to map queries and a set of key-value pairs
     to outputs.
     """
     key = query if key is None else key
     value = query if value is None else value
     # compute q ,k ,v
     if use_cache is False:
         if self.fuse:
             q, k, v = self._fuse_prepare_qkv(query)
         else:
             q, k, v = self._prepare_qkv(query, key, value, use_cache,
                                         cache)
     else:
         q, k, v, cache = self._prepare_qkv(query, key, value, use_cache,
                                            cache)
     product = layers.matmul(x=q,
                             y=k,
                             transpose_y=True,
                             alpha=self.head_dim**-0.5)
     if attn_mask is not None:
         product = product + attn_mask
     weights = F.softmax(product)
     if self.dropout:
         weights = F.dropout(weights,
                             self.dropout,
                             training=self.training,
                             mode="upscale_in_train")
     out = tensor.matmul(weights, v)
     # combine heads
     out = tensor.transpose(out, perm=[0, 2, 1, 3])
     out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
     # project to output
     out = self.out_proj(out)
     if _global_parallel_strategy == "mp":
         auto.shard_tensor(self.out_proj.weight,
                           dist_attr={
                               "process_mesh": _global_process_mesh,
                               "dims_mapping": [0, -1]
                           })
     elif _global_parallel_strategy == "dp_mp":
         auto.shard_tensor(self.out_proj.weight,
                           dist_attr={
                               "process_mesh": _global_process_mesh,
                               "dims_mapping": [1, -1]
                           })
     elif _global_parallel_strategy == "mp_pp":
         auto.shard_tensor(self.out_proj.weight,
                           dist_attr={
                               "process_mesh":
                               MPPP_MESH_LIST[self.mesh_idx],
                               "dims_mapping": [0, -1]
                           })
     elif _global_parallel_strategy == "dp_mp_pp":
         auto.shard_tensor(self.out_proj.weight,
                           dist_attr={
                               "process_mesh":
                               DPMPPP_MESH_LIST[self.mesh_idx],
                               "dims_mapping": [1, -1]
                           })
     outs = [out]
     if self.need_weights:
         outs.append(weights)
     if use_cache:
         outs.append(cache)
     return out if len(outs) == 1 else tuple(outs)
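
The "combine heads" step above moves the head dimension next to the per-head feature dimension and merges them back into a single model-dimension axis before the output projection. A minimal NumPy sketch of the same transpose + reshape (names are illustrative):

import numpy as np

def combine_heads(out):
    # out: [batch, num_heads, seq_len, head_dim]
    b, h, t, d = out.shape
    # transpose to [batch, seq_len, num_heads, head_dim], then merge the last two dims
    return out.transpose(0, 2, 1, 3).reshape(b, t, h * d)
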