Example #1
    def create_rnn_op(self):
        x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim],
                        dtype='float32',
                        name='x',
                        append_batch_size=False)
        x.stop_gradient = False
        h_boot1 = layers.data(shape=[self.batch_size, self.input_dim],
                              dtype='float32',
                              name='h_boot1',
                              append_batch_size=False)
        h_boot1.stop_gradient = False
        h_boot2 = layers.data(shape=[self.batch_size, self.input_dim],
                              dtype='float32',
                              name='h_boot2',
                              append_batch_size=False)
        h_boot2.stop_gradient = False

        rnn = layers.StaticRNN()
        with rnn.step():
            h_pre1 = rnn.memory(init=h_boot1)
            h_pre2 = rnn.memory(init=h_boot2)
            x_t = rnn.step_input(x)

            mem1 = layers.scale(x=h_pre1, scale=1.0)
            mem2 = layers.scale(x=h_pre2, scale=1.0)
            out = layers.sums(input=[mem1, x_t, mem2])

            rnn.update_memory(h_pre1, mem1)
            rnn.update_memory(h_pre2, mem2)
            rnn.output(out)

        return rnn()
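
For reference, here is a minimal self-contained sketch of what `layers.scale` computes (assuming a PaddlePaddle 1.x `fluid` installation; the program below is illustrative and not part of the example above). `layers.scale` returns scale * x + bias when bias_after_scale=True (the default) and scale * (x + bias) otherwise:

import numpy as np
import paddle.fluid as fluid
from paddle.fluid import layers

main, startup = fluid.Program(), fluid.Program()
with fluid.program_guard(main, startup):
    x = layers.data(name='x', shape=[3], dtype='float32', append_batch_size=False)
    y = layers.scale(x=x, scale=2.0, bias=1.0, bias_after_scale=False)  # 2 * (x + 1)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup)
out, = exe.run(main,
               feed={'x': np.array([0., 1., 2.], dtype='float32')},
               fetch_list=[y])
# out is approximately [2., 4., 6.]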
Example #2
    def forward(self, query, encoder_out, mask=None, last_attended=None):
        """
        Compute contextualized representation and alignment scores.
        
        Args:
            query (Variable): shape(B, T_dec, C_q), dtype float32, the query tensor, where C_q means the query dim.
            encoder_out (keys, values): 
                keys (Variable): shape(B, T_enc, C_emb), dtype float32, the key representation from an encoder, where C_emb means embed dim.
                values (Variable): shape(B, T_enc, C_emb), dtype float32, the value representation from an encoder, where C_emb means embed dim.
            mask (Variable, optional): shape(B, T_enc), dtype float32, mask generated with valid text lengths. Pad tokens correspond to 1, and valid tokens correspond to 0.
            last_attended (int, optional): the position that received the most attention at the last time step. This is only used at inference.

        Outputs:
            x (Variable): shape(B, T_dec, C_q), dtype float32, the contextualized representation from the attention mechanism.
            attn_scores (Variable): shape(B, T_dec, T_enc), dtype float32, the alignment tensor, where T_dec means the number of decoder time steps and T_enc means the number of encoder time steps.
        """
        keys, values = encoder_out
        residual = query
        if self.value_projection:
            values = self.value_proj(values)
        if self.key_projection:
            keys = self.key_proj(keys)
        x = self.query_proj(query)

        x = F.matmul(x, keys, transpose_y=True)

        # mask generated by sentence length
        neg_inf = -1.e30
        if mask is not None:
            neg_inf_mask = F.scale(F.unsqueeze(mask, [1]), neg_inf)
            x += neg_inf_mask

        # if last_attended is provided, focus only on a window range around it
        # to enforce monotonic attention.
        if last_attended is not None:
            locality_mask = np.ones(shape=x.shape, dtype=np.float32)
            backward, ahead = self.window_range
            backward = last_attended + backward
            ahead = last_attended + ahead
            backward = max(backward, 0)
            ahead = min(ahead, x.shape[-1])
            locality_mask[:, :, backward:ahead] = 0.
            locality_mask = dg.to_variable(locality_mask)
            neg_inf_mask = F.scale(locality_mask, neg_inf)
            x += neg_inf_mask

        x = F.softmax(x)
        attn_scores = x
        x = F.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")
        x = F.matmul(x, values)
        encoder_length = keys.shape[1]

        x = F.scale(x, encoder_length * np.sqrt(1.0 / encoder_length))
        x = self.out_proj(x)
        x = F.scale((x + residual), np.sqrt(0.5))
        return x, attn_scores
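
The padding-mask trick above (adding mask * -1e30 to the raw scores before the softmax) can be checked with a few lines of plain numpy; this is only a sketch of the arithmetic, not part of the attention module:

import numpy as np

scores = np.array([[1.0, 2.0, 3.0]], dtype=np.float32)  # (T_dec=1, T_enc=3)
mask = np.array([[0.0, 0.0, 1.0]], dtype=np.float32)    # 1 marks a pad position
masked = scores + mask * -1.e30
weights = np.exp(masked - masked.max(axis=-1, keepdims=True))
weights /= weights.sum(axis=-1, keepdims=True)
# weights[0, 2] is ~0: the padded position receives no attention.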
Example #3
    def forward(self, q, k, v, lengths, speaker_embed, start_index, 
                force_monotonic=False, prev_coeffs=None, window=None):
        # add position encoding as an inductive bias 
        if self.has_bias: # multi-speaker model
            omega_q = 2 * F.sigmoid(
                F.squeeze(self.q_pos_affine(speaker_embed), axes=[-1]))
            omega_k = 2 * self.omega_initial * F.sigmoid(F.squeeze(
                self.k_pos_affine(speaker_embed), axes=[-1]))
        else: # single-speaker case
            batch_size = q.shape[0]
            omega_q = F.ones((batch_size, ), dtype="float32")
            omega_k = F.ones((batch_size, ), dtype="float32") * self.omega_default
        q += self.position_encoding_weight * positional_encoding(q, start_index, omega_q)
        k += self.position_encoding_weight * positional_encoding(k, 0, omega_k)

        q, k, v = self.q_affine(q), self.k_affine(k), self.v_affine(v)
        activations = F.matmul(q, k, transpose_y=True)
        activations /= np.sqrt(self.attention_dim)

        if self.training:
            # mask the <pad> parts from the encoder
            mask = F.sequence_mask(lengths, dtype="float32")
            attn_bias = F.scale(1. - mask, -1000)
            activations += F.unsqueeze(attn_bias, [1])
        elif force_monotonic:
            assert window is not None
            backward_step, forward_step = window
            T_enc = k.shape[1]
            batch_size, T_dec, _ = q.shape

            # actually T_dec = 1 here
            alpha = F.fill_constant((batch_size, T_dec), value=0, dtype="int64") \
                   if prev_coeffs is None \
                   else F.argmax(prev_coeffs, axis=-1)
            backward = F.sequence_mask(alpha - backward_step, maxlen=T_enc, dtype="bool")
            forward = F.sequence_mask(alpha + forward_step, maxlen=T_enc, dtype="bool")
            mask = F.cast(F.logical_xor(backward, forward), "float32")
            # print("mask's shape:", mask.shape)
            attn_bias = F.scale(1. - mask, -1000)
            activations += attn_bias

        # softmax
        coefficients = F.softmax(activations, axis=-1)
        # context vector
        coefficients = F.dropout(coefficients, 1. - self.keep_prob,
                                 dropout_implementation='upscale_in_train')
        contexts = F.matmul(coefficients, v)
        # context normalization
        enc_lengths = F.cast(F.unsqueeze(lengths, axes=[1, 2]), "float32")
        contexts *= F.sqrt(enc_lengths)
        # out affine
        contexts = self.out_affine(contexts)
        return contexts, coefficients
Example #4
    def setUp(self):
        self.setup_program()
        self.data_field = {"x", "h_boot"}

        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
        self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape)

        with fluid.program_guard(self.main_program, self.startup_program):
            x = layers.data(
                shape=[self.sent_len, self.batch_size, self.input_dim],
                dtype='float32',
                name='x',
                append_batch_size=False)
            x.stop_gradient = False
            h_boot = layers.data(
                shape=[self.input_dim], dtype='float32', name='h_boot')
            h_boot.stop_gradient = False

            forward_only_rnn = layers.StaticRNN()
            with forward_only_rnn.step():
                h_pre = forward_only_rnn.memory(init=h_boot)
                x_t = forward_only_rnn.step_input(x)

                h = layers.scale(
                    x=layers.elementwise_add(
                        x=h_pre, y=x_t),
                    scale=self.py_rnn.scale)

                forward_only_rnn.update_memory(h_pre, h)
                forward_only_rnn.output(h)
            forward_only_output = forward_only_rnn()
            forward_only_output.stop_gradient = True
            self.forward_only_output = layers.mean(forward_only_output)

            rnn = layers.StaticRNN()
            with rnn.step():
                h_pre = rnn.memory(init=h_boot)
                x_t = rnn.step_input(x)

                h = layers.scale(
                    x=layers.elementwise_add(
                        x=h_pre, y=x_t),
                    scale=self.py_rnn.scale)

                rnn.update_memory(h_pre, h)
                rnn.output(h)

            self.output = layers.mean(rnn())
Example #5
    def forward(self, x, speaker_embed=None):
        """
        Args:
            x (Variable): shape(B, C_in, T), dtype float32, the input of Conv1DGLU layer, where B means batch_size, C_in means the input channels, and T means input time steps.
            speaker_embed (Variable): shape(B, C_sp), dtype float32, speaker embed, where C_sp means speaker embedding size.

        Returns:
            x (Variable): shape(B, C_out, T), the output of Conv1DGLU, where
                C_out means the `num_filters`.
        """
        residual = x
        x = F.dropout(x,
                      self.dropout,
                      dropout_implementation="upscale_in_train")
        x = self.conv(x)
        content, gate = F.split(x, num_or_sections=2, dim=1)

        if speaker_embed is not None:
            sp = F.softsign(self.fc(speaker_embed))
            content = F.elementwise_add(content, sp, axis=0)

        # glu
        x = F.sigmoid(gate) * content

        if self.residual:
            x = F.scale(x + residual, np.sqrt(0.5))
        return x
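
The GLU gating at the end of `forward` can be illustrated with a small numpy sketch (shapes are made up; it assumes the convolution outputs 2 * num_filters channels, as the `F.split` above implies):

import numpy as np

h = np.random.randn(4, 2 * 8, 20).astype("float32")  # (B, 2 * C_out, T), conv output
content, gate = np.split(h, 2, axis=1)
out = 1.0 / (1.0 + np.exp(-gate)) * content           # sigmoid(gate) * content
assert out.shape == (4, 8, 20)                        # (B, C_out, T)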
Example #6
    def add_input(self, x_t, speaker_embed=None):
        """
        Takes a step of inputs and returns a step of outputs. It works similarly to the `forward` method, but in a `step-in-step-out` fashion.

        Args:
            x_t (Variable): shape(B, C_in, T=1), dtype float32, the input of Conv1DGLU layer, where B means batch_size, C_in means the input channels.
            speaker_embed (Variable): Shape(B, C_sp), dtype float32, speaker embed, where C_sp means speaker embedding size. 

        Returns:
            x (Variable): shape(B, C_out), the output of Conv1DGLU, where C_out means the `num_filter`.
        """
        residual = x_t
        x_t = F.dropout(x_t,
                        self.dropout,
                        dropout_implementation="upscale_in_train")
        x_t = self.conv.add_input(x_t)
        content_t, gate_t = F.split(x_t, num_or_sections=2, dim=1)

        if speaker_embed is not None:
            sp = F.softsign(self.fc(speaker_embed))
            content_t = F.elementwise_add(content_t, sp, axis=0)

        # glu
        x_t = F.sigmoid(gate_t) * content_t

        if self.residual:
            x_t = F.scale(x_t + residual, np.sqrt(0.5))
        return x_t
Example #7
    def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
        """
        Scaled Dot-Product Attention
        """

        # FIXME(guosheng): Optimize the shape in reshape_op or softmax_op.

        # The current implementation of softmax_op only supports 2D tensors,
        # so it cannot be used directly here. Besides, if reshape_op were used,
        # the shape of the product inferred at compile time is not the actual
        # shape at run time, so it cannot be used to set the attribute of
        # reshape_op. A softmax is therefore defined here as a temporary
        # workaround.

        def __softmax(x, eps=1e-9):
            exp_out = layers.exp(x=x)
            sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
            return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)

        scaled_q = layers.scale(x=q, scale=d_model**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
        weights = __softmax(layers.elementwise_add(x=product, y=attn_bias))
        if dropout_rate:
            weights = layers.dropout(
                weights, dropout_prob=dropout_rate, is_test=False)
        out = layers.matmul(weights, v)
        return out
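
As a shape check, the same scaled dot-product attention (without dropout) can be written as a numpy reference; the (batch, heads, seq, dim) layout below is an assumption about the caller, not taken from this snippet:

import numpy as np

def ref_attention(q, k, v, attn_bias, d_model):
    scores = (q * d_model ** -0.5) @ np.swapaxes(k, -1, -2) + attn_bias
    scores -= scores.max(axis=-1, keepdims=True)      # numerically stable softmax
    weights = np.exp(scores)
    weights /= weights.sum(axis=-1, keepdims=True)
    return weights @ v

q = k = v = np.random.rand(2, 8, 5, 16).astype("float32")
out = ref_attention(q, k, v, attn_bias=0.0, d_model=16)
assert out.shape == (2, 8, 5, 16)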
Example #8
    def create_rnn_op(self):
        x = layers.data(
            shape=[self.sent_len, self.batch_size, self.input_dim],
            dtype='float32',
            name='x',
            append_batch_size=False,
            **self.p_info)
        x.stop_gradient = False
        h_boot = layers.data(
            shape=[self.input_dim],
            dtype='float32',
            name='h_boot',
            **self.p_info)
        h_boot.stop_gradient = False

        rnn = layers.StaticRNN(main_program=self.main_program)
        with rnn.step():
            h_pre = rnn.memory(init=h_boot)
            x_t = rnn.step_input(x)

            h = layers.scale(
                x=layers.elementwise_add(
                    x=h_pre, y=x_t, **self.p_info),
                scale=self.py_rnn.scale,
                **self.p_info)

            rnn.update_memory(h_pre, h)
            rnn.output(h)

        return rnn()
Example #9
def prepare_encoder(
        src_word,  #[b,t,c]
        src_pos,
        src_vocab_size,
        src_emb_dim,
        src_max_len,
        dropout_rate=0.,
        bos_idx=0,
        word_emb_param_name=None,
        pos_enc_param_name=None):
    """Add word embeddings and position encodings.
    The output tensor has a shape of:
    [batch_size, max_src_length_in_batch, d_model].
    This module is used at the bottom of the encoder stacks.
    """

    src_word_emb = src_word  #layers.concat(res,axis=1)
    src_word_emb = layers.cast(src_word_emb, 'float32')
    # print("src_word_emb",src_word_emb)

    src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5)
    src_pos_enc = layers.embedding(src_pos,
                                   size=[src_max_len, src_emb_dim],
                                   param_attr=fluid.ParamAttr(
                                       name=pos_enc_param_name,
                                       trainable=False))
    src_pos_enc.stop_gradient = True
    enc_input = src_word_emb + src_pos_enc
    return layers.dropout(
        enc_input, dropout_prob=dropout_rate, seed=dropout_seed,
        is_test=False) if dropout_rate else enc_input
Example #10
def prepare_encoder(src_word,
                    src_pos,
                    src_vocab_size,
                    src_emb_dim,
                    src_max_len,
                    dropout_rate=0.,
                    src_data_shape=None,
                    word_emb_param_name=None,
                    pos_enc_param_name=None):
    """Add word embeddings and position encodings.
    The output tensor has a shape of:
    [batch_size, max_src_length_in_batch, d_model].
    This module is used at the bottom of the encoder stacks.
    """
    src_word_emb = layers.embedding(src_word,
                                    size=[src_vocab_size, src_emb_dim],
                                    param_attr=fluid.ParamAttr(
                                        name=word_emb_param_name,
                                        initializer=fluid.initializer.Normal(
                                            0., src_emb_dim**-0.5)))
    src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5)
    src_pos_enc = layers.embedding(src_pos,
                                   size=[src_max_len, src_emb_dim],
                                   param_attr=fluid.ParamAttr(
                                       name=pos_enc_param_name,
                                       trainable=False))
    enc_input = src_word_emb + src_pos_enc
    enc_input = layers.reshape(x=enc_input,
                               shape=[batch_size, seq_len, src_emb_dim],
                               actual_shape=src_data_shape)
    return layers.dropout(enc_input, dropout_prob=dropout_rate,
                          is_test=False) if dropout_rate else enc_input
Example #11
        def __call__(self, msg):
            alpha = msg["alpha"]  # lod-tensor (batch_size, num_heads)
            if attn_drop:
                old_h = alpha
                dropout = F.data(name='attn_drop', shape=[1], dtype="int64")
                u = L.uniform_random(shape=L.cast(L.shape(alpha)[:1], 'int64'),
                                     min=0.,
                                     max=1.)
                keeped = L.cast(u > dropout, dtype="float32")
                self_attn_mask = L.scale(x=keeped,
                                         scale=10000.0,
                                         bias=-1.0,
                                         bias_after_scale=False)
                n_head_self_attn_mask = L.stack(x=[self_attn_mask] * num_heads,
                                                axis=1)
                n_head_self_attn_mask.stop_gradient = True
                alpha = n_head_self_attn_mask + alpha
                alpha = L.lod_reset(alpha, old_h)

            h = msg["v"]
            alpha = paddle_helper.sequence_softmax(alpha)

            self.alpha = alpha
            old_h = h
            h = h * alpha
            h = L.lod_reset(h, old_h)
            h = L.sequence_pool(h, "sum")

            if concat:
                h = L.reshape(h, [-1, num_heads * hidden_size])
            else:
                h = L.reduce_mean(h, dim=1)
            return h
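
The attention-dropout branch above turns uniform samples into a 0 / -10000 additive mask; a numpy sketch of the idea (assuming the `attn_drop` feed holds the drop probability) looks like this:

import numpy as np

drop_prob = 0.1
alpha = np.random.randn(6, 4).astype("float32")  # (num_edges, num_heads)
u = np.random.uniform(size=(alpha.shape[0], 1)).astype("float32")
keeped = (u > drop_prob).astype("float32")
# scale=10000.0, bias=-1.0, bias_after_scale=False  =>  10000 * (keeped - 1)
alpha = alpha + 10000.0 * (keeped - 1.0)         # kept edges: +0, dropped edges: -10000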
Example #12
    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
        """
        Scaled Dot-Product Attention

        attn_bias is a (max_len, max_len) matrix: the valid L x L block is 0
        and the padded positions are -inf.
        """
        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
        if attn_bias:
            product += attn_bias
        weights = layers.softmax(product)
        ############################
        # add code
        layers.Print(attn_bias, message="The content of input layer:")

        attn_mask = attn_bias == 0
        attn_mask = layers.cast(attn_mask, 'float64')
        layers.Print(weights)
        weights = layers.elementwise_mul(attn_mask, weights)
        layers.Print(weights)

        #         weights = layers.elementwise_mul(weights, attn_mask)
        ############################
        if dropout_rate:
            weights = layers.dropout(weights,
                                     dropout_prob=dropout_rate,
                                     dropout_implementation="upscale_in_train",
                                     is_test=False)
        out = layers.matmul(weights, v)
        return out
Example #13
    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
        """
        Scaled Dot-Product Attention

        attn_bias is a (max_len, max_len) matrix: the valid L x L block is 0
        and the padded positions are -inf.
        """
        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
        # print("q",q.shape)
        # print("k",k.shape)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
        # print("product",product.shape)
        if attn_bias:
            # print('attn_bias',attn_bias.shape)
            # print(product.shape)
            # print(product.shape)
            # print(attn_bias.shape)
            product += attn_bias
        weights = layers.softmax(product)
        # layers.Print(weights)

        if dropout_rate:
            weights = layers.dropout(weights,
                                     dropout_prob=dropout_rate,
                                     dropout_implementation="upscale_in_train",
                                     is_test=False)
        out = layers.matmul(weights, v)
        return out
Example #14
    def add_input(self, x, condition=None):
        """Add a step input. This method works similarily with `forward` but in a `step-in-step-out` fashion.

        Args:
            x (Variable): shape(B, C_res, T=1), input for a step, dtype float32.
            condition (Variable, optional): shape(B, C_cond, T=1). condition for a step, dtype float32. Defaults to None.

        Returns:
            (residual, skip_connection)
            residual (Variable): shape(B, C_res, T=1), the residual for a step, which is used as the input to the next layer of ResidualBlock.
            skip_connection (Variable): shape(B, C_res, T=1), the skip connection for a step. This output is accumulated with that of other ResidualBlocks. 
        """
        h = x

        # dilated conv
        h = self.conv.add_input(h)

        # condition
        if condition is not None:
            h += self.condition_proj(condition)

        # gated tanh
        content, gate = F.split(h, 2, dim=1)
        z = F.sigmoid(gate) * F.tanh(content)

        # projection
        residual = F.scale(z + x, np.sqrt(0.5))
        skip_connection = z
        return residual, skip_connection
Example #15
    def forward(self, x, condition=None):
        """Conv1D gated-tanh Block.

        Args:
            x (Variable): shape(B, C_res, T), the input. (B stands for batch_size, C_res stands for residual channels, T stands for time steps.) dtype float32.
            condition (Variable, optional): shape(B, C_cond, T), the condition, it has been upsampled in time steps, so it has the same time steps as the input does.(C_cond stands for the condition's channels). Defaults to None.

        Returns:
            (residual, skip_connection)
            residual (Variable): shape(B, C_res, T), the residual, which is used as the input to the next layer of ResidualBlock.
            skip_connection (Variable): shape(B, C_res, T), the skip connection. This output is accumulated with that of other ResidualBlocks. 
        """
        time_steps = x.shape[-1]
        h = x

        # dilated conv
        h = self.conv(h)
        if h.shape[-1] != time_steps:
            h = h[:, :, :time_steps]

        # condition
        if condition is not None:
            h += self.condition_proj(condition)

        # gated tanh
        content, gate = F.split(h, 2, dim=1)
        z = F.sigmoid(gate) * F.tanh(content)

        # projection
        residual = F.scale(z + x, math.sqrt(.5))
        skip_connection = z
        return residual, skip_connection
Example #16
def prepare_decoder(src_word,
                    src_pos,
                    src_vocab_size,
                    src_emb_dim,
                    src_max_len,
                    dropout_rate=0.,
                    bos_idx=0,
                    word_emb_param_name=None,
                    pos_enc_param_name=None):
    """Add word embeddings and position encodings.
        The output tensor has a shape of:
        [batch_size, max_src_length_in_batch, d_model].
        This module is used at the bottom of the encoder stacks.
        """
    src_word_emb = layers.embedding(
        src_word,
        size=[src_vocab_size, src_emb_dim],
        padding_idx=bos_idx,  # set embedding of bos to 0
        param_attr=fluid.ParamAttr(name=word_emb_param_name,
                                   initializer=fluid.initializer.Normal(
                                       0., src_emb_dim**-0.5)))
    # print("target_word_emb",src_word_emb)
    src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5)
    src_pos_enc = layers.embedding(src_pos,
                                   size=[src_max_len, src_emb_dim],
                                   param_attr=fluid.ParamAttr(
                                       name=pos_enc_param_name,
                                       trainable=False))
    src_pos_enc.stop_gradient = True
    enc_input = src_word_emb + src_pos_enc
    return layers.dropout(
        enc_input, dropout_prob=dropout_rate, seed=dropout_seed,
        is_test=False) if dropout_rate else enc_input
Example #17
    def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
        """
        Scaled Dot-Product Attention
        """

        # FIXME(guosheng): Optimize the shape in reshape_op or softmax_op.

        # The current implementation of softmax_op only supports 2D tensors,
        # so it cannot be used directly here. Besides, if reshape_op were used,
        # the shape of the product inferred at compile time is not the actual
        # shape at run time, so it cannot be used to set the attribute of
        # reshape_op. A softmax is therefore defined here as a temporary
        # workaround.

        def __softmax(x, eps=1e-9):
            exp_out = layers.exp(x=x)
            sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
            return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)

        scaled_q = layers.scale(x=q, scale=d_model**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
        weights = __softmax(layers.elementwise_add(x=product, y=attn_bias))
        if dropout_rate:
            weights = layers.dropout(
                weights, dropout_prob=dropout_rate, is_test=False)
        out = layers.matmul(weights, v)
        return out
Example #18
        def forward_attention(indicator, support_x, support_y_embed,
                              support_mask, query_x, query_y, query_mask):
            """
            support_indicator: length = support_len
                if attention(support, query), indicator = 0
                if attention(support, support), indicator = 1
            """
            support_y_embed = support_y_embed * support_mask
            support_xy = layers.concat([support_x, support_y_embed, indicator],
                                       axis=1)

            pad_value = layers.assign(
                input=numpy.array([0.0], dtype=numpy.float32))
            support_pad, support_len = layers.sequence_pad(support_xy,
                                                           pad_value=pad_value)
            query_pad, query_len = layers.sequence_pad(query_x,
                                                       pad_value=pad_value)

            attention = self.attention(query_pad, support_pad, support_pad,
                                       self.hidden_dim, 'meta')
            attention = layers.sequence_unpad(attention, length=query_len)
            pred_input = layers.concat([attention, query_x], axis=1)

            pred = self.prepare_preds_with_name(pred_input, 'out_pred')
            label = layers.cast(query_y, dtype='float32')
            label = layers.scale(label, scale=0.01)

            loss = layers.huber_loss(pred, label, 1.0) * query_mask
            loss = layers.mean(loss)
            return pred, label, loss
Example #19
    def forward(self, queries, keys, values, attn_bias, past_cache):
        assert len(queries.shape) == len(keys.shape) == len(values.shape) == 3
        #bsz, q_len, q_dim = queries.shape
        #bsz, k_len, k_dim = keys.shape
        #bsz, v_len, v_dim = values.shape
        #assert k_len == v_len

        q = self.q(queries)
        k = self.k(keys)
        v = self.v(values)

        cache = (k, v)
        if past_cache is not None:
            cached_k, cached_v = past_cache
            k = L.concat([cached_k, k], 1)
            v = L.concat([cached_v, v], 1)

        q = L.transpose(L.reshape(q, [0, 0, self.n_head, q.shape[-1] // self.n_head]), [0, 2, 1, 3]) #[batch, head, seq, dim]
        k = L.transpose(L.reshape(k, [0, 0, self.n_head, k.shape[-1] // self.n_head]), [0, 2, 1, 3]) #[batch, head, seq, dim]
        v = L.transpose(L.reshape(v, [0, 0, self.n_head, v.shape[-1] // self.n_head]), [0, 2, 1, 3]) #[batch, head, seq, dim]


        q = L.scale(q, scale=self.d_key ** -0.5)
        score = L.matmul(q, k, transpose_y=True)
        if attn_bias is not None:
            score += attn_bias
        score = L.softmax(score, use_cudnn=True)
        score = self.dropout(score)

        out = L.matmul(score, v)
        out = L.transpose(out, [0, 2, 1, 3])
        out = L.reshape(out, [0, 0, out.shape[2] * out.shape[3]])

        out = self.o(out)
        return out, cache
Example #20
    def attention(self, query_feature, key_feature, value_feature, hidden_dim,
                  name):
        """
        attention
        """
        query_fc = layers.fc(input=query_feature,
                             size=hidden_dim,
                             param_attr=fluid.ParamAttr(
                                 name='query_fc_%s' % name,
                                 learning_rate=self.fc_lr),
                             act='relu',
                             num_flatten_dims=2)

        key_fc = layers.fc(input=key_feature,
                           size=hidden_dim,
                           param_attr=fluid.ParamAttr(
                               'key_fc_%s' % name, learning_rate=self.fc_lr),
                           act='relu',
                           num_flatten_dims=2)

        value_fc = layers.fc(input=value_feature,
                             size=hidden_dim,
                             param_attr=fluid.ParamAttr(
                                 'value_fc_%s' % name,
                                 learning_rate=self.fc_lr),
                             act='relu',
                             num_flatten_dims=2)

        query_key_mat = layers.matmul(query_fc, key_fc, False, True)
        query_key_mat = layers.scale(query_key_mat,
                                     scale=1.0 / math.sqrt(hidden_dim))
        matching_score = layers.softmax(query_key_mat, axis=2)
        attention = layers.matmul(matching_score, value_fc)
        return attention
Example #21
    def forward(self, input, bias=None, padding=None):
        """
        input: input feature (B, T, C)
        padding: only used with causal conv; we pad manually
        """
        input_dropped = F.dropout(input,
                                  1. - self.keep_prob,
                                  dropout_implementation="upscale_in_train")
        if self.causal:
            assert padding is not None
            input_dropped = F.concat([padding, input_dropped], axis=1)
        hidden = self.conv(input_dropped)

        if self.has_bias:
            assert bias is not None
            transformed_bias = F.softsign(self.bias_affine(bias))
            hidden_embedded = hidden + F.unsqueeze(transformed_bias, [1])
        else:
            hidden_embedded = hidden

        # glu
        content, gate = F.split(hidden, num_or_sections=2, dim=-1)
        content = hidden_embedded[:, :, :self.in_channel]
        hidden = F.sigmoid(gate) * content

        # residual
        hidden = F.scale(input + hidden, math.sqrt(0.5))
        return hidden
Example #22
 def forward(self, char_embed, speaker_embed=None):
     hidden = self.pre_affine(char_embed, speaker_embed)
     for layer in self.convs:
         hidden = layer(hidden, speaker_embed)
     hidden = self.post_affine(hidden, speaker_embed)
     keys = hidden
     values = F.scale(char_embed + hidden, np.sqrt(0.5))
     return keys, values
Example #23
def inference_program():
    usr_combined_features = get_usr_combined_features()
    mov_combined_features = get_mov_combined_features()

    inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
    scale_infer = layers.scale(x=inference, scale=5.0)

    return scale_infer
Example #24
def inference_program():
    usr_combined_features = get_usr_combined_features()
    mov_combined_features = get_mov_combined_features()

    inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
    scale_infer = layers.scale(x=inference, scale=5.0)

    return scale_infer
Example #25
    def _gen_input(self,
                   token_ids,
                   type_ids,
                   pos_ids,
                   input_mask,
                   aux_emb=None):
        token_emb_out = layers.embedding(
            input=token_ids,
            size=[self.vocab_size, self.emb_size],
            dtype=self.dtype,
            param_attr=fluid.ParamAttr(name=self.token_emb_name,
                                       initializer=self.param_initializer))
        type_emb_out = layers.embedding(
            input=type_ids,
            size=[self.type_size, self.emb_size],
            dtype=self.dtype,
            param_attr=fluid.ParamAttr(name=self.type_emb_name,
                                       initializer=self.param_initializer))
        pos_emb_out = layers.embedding(
            input=pos_ids,
            size=[self.max_position_seq_len, self.emb_size],
            dtype=self.dtype,
            param_attr=fluid.ParamAttr(name=self.pos_emb_name,
                                       initializer=self.param_initializer))
        emb_out = token_emb_out + type_emb_out + pos_emb_out

        # auxiliary memory embeddings
        if aux_emb is not None:
            emb_out = layers.concat([aux_emb, emb_out], axis=1)

        # post process of embedding
        emb_out = pre_process_layer(emb_out,
                                    self.pre_encoder_cmd,
                                    self.prepostprocess_dropout,
                                    name="pre_encoder",
                                    epsilon=self.epsilon)
        if self.emb_mapping_in:
            emb_out = layers.fc(input=emb_out,
                                num_flatten_dims=2,
                                size=self.hidden_size,
                                param_attr=fluid.ParamAttr(
                                    name="emb_hidden_mapping",
                                    initializer=self.param_initializer),
                                bias_attr="emb_hidden_mapping_bias")

        # generate n-head self-attention mask
        self_attn_mask = input_mask
        self_attn_mask = layers.scale(x=self_attn_mask,
                                      scale=1e4,
                                      bias=-1.0,
                                      bias_after_scale=False)
        n_head_self_attn_mask = layers.stack(x=[self_attn_mask] * self.n_head,
                                             axis=1)
        n_head_self_attn_mask.stop_gradient = True

        return emb_out, n_head_self_attn_mask
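
The mask-to-bias conversion at the end of `_gen_input` (scale=1e4, bias=-1.0, bias_after_scale=False) computes 1e4 * (input_mask - 1). Assuming `input_mask` uses 1 for real tokens and 0 for padding, a quick numpy check:

import numpy as np

input_mask = np.array([[1., 1., 0.]], dtype="float32")
attn_bias = 1e4 * (input_mask - 1.0)
# attn_bias == [[0., 0., -10000.]]: padded positions are suppressed after softmax.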
Example #26
    def _gen_dec_input(self, trg_word, trg_pos, trg_slf_attn_bias,
                       trg_src_words_attn_bias, trg_src_sents_attn_bias,
                       graph_attn_bias):
        emb_out = fluid.layers.embedding(
            input=trg_word,
            size=[self.voc_size, self._emb_size],
            padding_idx=self._padding_idx,  # set embedding of pad to 0
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(name=self._word_emb_name,
                                       initializer=self._param_initializer),
            is_sparse=False)
        emb_out = layers.scale(x=emb_out, scale=self._emb_size**0.5)

        position_emb_out = fluid.layers.embedding(
            input=trg_pos,
            size=[self._max_position_seq_len, self._emb_size],
            dtype=self._emb_dtype,
            param_attr=fluid.ParamAttr(name=self._dec_word_pos_emb_name,
                                       trainable=False))
        position_emb_out.stop_gradient = True

        emb_out = emb_out + position_emb_out
        emb_out = layers.dropout(
            emb_out,
            dropout_prob=self._prepostprocess_dropout,
            dropout_implementation="upscale_in_train",
            is_test=False) if self._prepostprocess_dropout else emb_out

        if self._dtype == "float16":
            emb_out = fluid.layers.cast(x=emb_out, dtype=self._dtype)
            if trg_slf_attn_bias is not None:
                trg_slf_attn_bias = fluid.layers.cast(x=trg_slf_attn_bias,
                                                      dtype=self._dtype)

            if trg_src_words_attn_bias is not None:
                trg_src_words_attn_bias = fluid.layers.cast(
                    x=trg_src_words_attn_bias, dtype=self._dtype)

            if trg_src_sents_attn_bias is not None:
                trg_src_sents_attn_bias = fluid.layers.cast(
                    x=trg_src_sents_attn_bias, dtype=self._dtype)

            if graph_attn_bias is not None:
                graph_attn_bias = fluid.layers.cast(x=graph_attn_bias,
                                                    dtype=self._dtype)

        res = namedtuple('results', [
            'emb_out', 'trg_slf_attn_bias', 'trg_src_words_attn_bias',
            'trg_src_sents_attn_bias', 'graph_attn_bias'
        ])

        return res(emb_out=emb_out,
                   trg_slf_attn_bias=trg_slf_attn_bias,
                   trg_src_words_attn_bias=trg_src_words_attn_bias,
                   trg_src_sents_attn_bias=trg_src_sents_attn_bias,
                   graph_attn_bias=graph_attn_bias)
Example #27
 def forward(self, src_word, src_pos, src_slf_attn_bias):
     word_emb = self.word_embedder(src_word)
     word_emb = layers.scale(x=word_emb, scale=self.emb_dim**0.5)
     pos_enc = self.pos_encoder(src_pos)
     pos_enc.stop_gradient = True
     emb = word_emb + pos_enc
     enc_input = layers.dropout(emb,
                                dropout_prob=self.emb_dropout,
                                is_test=False) if self.emb_dropout else emb
     enc_output = self.encoder(enc_input, src_slf_attn_bias)
     return enc_output
Example #28
 def spec_loss(self, decoded, input, num_frames=None):
     if num_frames is None:
         l1_loss = F.reduce_mean(F.abs(decoded - input))
     else:
         # mask the <pad> part of the decoder
         num_channels = decoded.shape[-1]
         l1_loss = F.abs(decoded - input)
         mask = F.sequence_mask(num_frames, dtype="float32")
         l1_loss *= F.unsqueeze(mask, axes=[-1])
         l1_loss = F.reduce_sum(l1_loss) / F.scale(F.reduce_sum(mask), num_channels)
     return l1_loss
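
The masked branch of `spec_loss` normalizes the summed absolute error by num_channels * sum(mask); a numpy equivalent with made-up shapes, for checking the bookkeeping only:

import numpy as np

decoded = np.random.rand(2, 5, 3).astype("float32")  # (B, T, C)
target = np.random.rand(2, 5, 3).astype("float32")
num_frames = np.array([5, 3])                        # valid length per batch item

mask = (np.arange(5)[None, :] < num_frames[:, None]).astype("float32")  # (B, T)
l1 = np.abs(decoded - target) * mask[:, :, None]
loss = l1.sum() / (mask.sum() * decoded.shape[-1])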
Example #29
    def init_serv(self, place):
        main = fluid.Program()

        with fluid.program_guard(main):
            serv = layers.ListenAndServ("127.0.0.1:0", ["X"],
                                        optimizer_mode=False)
            with serv.do():
                out_var = main.global_block().create_var(name="scale_0.tmp_0",
                                                         persistable=True,
                                                         dtype="float32",
                                                         shape=[32, 32])
                x = layers.data(shape=[32, 32],
                                dtype='float32',
                                name="X",
                                append_batch_size=False)
                fluid.initializer.Constant(value=1.0)(x, main.global_block())
                layers.scale(x=x, scale=10.0, out=out_var)

        self.server_exe = fluid.Executor(place)
        self.server_exe.run(main)
Example #30
 def run_local(self, place):
     main = fluid.Program()
     with fluid.program_guard(main):
         x = layers.data(shape=[32, 32],
                         dtype='float32',
                         name='X',
                         append_batch_size=False)
         fluid.initializer.Constant(value=2.3)(x, main.global_block())
         o = layers.scale(x=x, scale=10.0)
     exe = fluid.Executor(place)
     self.local_out = exe.run(main, fetch_list=[o])
Example #31
    def forward(self,
                inputs,
                keys,
                values,
                lengths,
                start_index,
                speaker_embed=None,
                state=None,
                force_monotonic_attention=None,
                coeffs=None,
                window=(0, 4)):
        hidden = inputs
        for layer in self.prenet:
            hidden = layer(hidden, speaker_embed)

        attentions = []  # every layer of (B, T_dec, T_enc) attention
        final_state = []  # layers * (B, (k-1)d, C_dec)
        batch_size = inputs.shape[0]
        causal_padding_shape = (batch_size, self.kernel_size - 1,
                                self.decoder_dim)

        for i in range(len(self.causal_convs)):
            if state is None:
                padding = F.zeros(causal_padding_shape, dtype="float32")
            else:
                padding = state[i]
            new_state = F.concat([padding, hidden],
                                 axis=1)  # => to be used next step
            # causal conv, (B, T, C)
            hidden = self.causal_convs[i](hidden,
                                          speaker_embed,
                                          padding=padding)
            # attn
            prev_coeffs = None if coeffs is None else coeffs[i]
            force_monotonic = False if force_monotonic_attention is None else force_monotonic_attention[
                i]
            context, attention = self.attention_blocks[i](
                hidden, keys, values, lengths, speaker_embed, start_index,
                force_monotonic, prev_coeffs, window)
            # residual connection (B, T_dec, C_dec)
            hidden = F.scale(hidden + context, np.sqrt(0.5))

            attentions.append(attention)  # layers * (B, T_dec, T_enc)
            # new state: shift a step, layers * (B, T, C)
            new_state = new_state[:, -(self.kernel_size - 1):, :]
            final_state.append(new_state)

        # predict mel spectrogram (B, 1, T_dec, r * C_in)
        decoded = self.out_affine(hidden)
        if self.has_bias:
            decoded *= F.sigmoid(
                F.unsqueeze(self.out_sp_affine(speaker_embed), [1]))
        return decoded, hidden, attentions, final_state
Example #32
 def run_local(self, place):
     main = fluid.Program()
     with fluid.program_guard(main):
         x = layers.data(
             shape=[32, 32],
             dtype='float32',
             name='X',
             append_batch_size=False)
         fluid.initializer.Constant(value=2.3)(x, main.global_block())
         o = layers.scale(x=x, scale=10.0)
     exe = fluid.Executor(place)
     self.local_out = exe.run(main, fetch_list=[o])
Example #33
def get_program(layer, input_spec, output_spec, **configs):
    paddle.jit.set_verbosity(0)
    prog_translator = program_translator.ProgramTranslator()
    if not prog_translator.enable_to_static:
        raise RuntimeError(
            "The paddle.jit.save doesn't work when setting ProgramTranslator.enable to False."
        )
    if isinstance(layer, Layer):
        if isinstance(layer.forward, program_translator.StaticFunction):
            concrete_program = layer.forward.concrete_program
        else:
            # transform as in jit.save; if input_spec is incomplete, declarative will throw an error
            layer = paddle.jit.to_static(layer, input_spec=input_spec)
            concrete_program = layer.forward.concrete_program
            # the input_spec has been used in declarative, which is equal to
            # @declarative with input_spec and jit.save without input_spec,
            # avoid needless warning
            input_spec = None
    else:
        raise TypeError(
            "The input Layer should be 'Layer', but received  type is %s." %
            type(layer))
    feed_var_names = paddle.fluid.dygraph.jit._get_input_var_names(
        concrete_program.inputs, input_spec)
    target_vars = paddle.fluid.dygraph.jit._get_output_vars(
        concrete_program.outputs, output_spec)
    main_program = concrete_program.main_program.clone()
    with program_guard(main_program):
        uniq_target_vars = []
        for i, var in enumerate(target_vars):
            if isinstance(var, Variable):
                var = layers.scale(var,
                                   1.,
                                   name="save_infer_model/scale_{}".format(i))
            uniq_target_vars.append(var)
        target_vars = uniq_target_vars
    global_block = main_program.global_block()
    need_to_remove_op_index = []
    for i, op in enumerate(global_block.ops):
        op.desc.set_is_target(False)
        if op.type == "feed" or op.type == "fetch":
            need_to_remove_op_index.append(i)
    for index in need_to_remove_op_index[::-1]:
        global_block._remove_op(index)
    main_program.desc.flush()
    main_program = main_program._prune_with_input(
        feeded_var_names=feed_var_names, targets=target_vars)
    main_program = main_program._inference_optimize(prune_read_op=True)
    fetch_var_names = [v.name for v in target_vars]
    prepend_feed_ops(main_program, feed_var_names)
    append_fetch_ops(main_program, fetch_var_names)
    return main_program, feed_var_names, target_vars
Example #34
def model():
    usr_combined_features = get_usr_combined_features()
    mov_combined_features = get_mov_combined_features()

    # need cos sim
    inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
    scale_infer = layers.scale(x=inference, scale=5.0)

    label = layers.data(name='score', shape=[1], dtype='float32')
    square_cost = layers.square_error_cost(input=scale_infer, label=label)
    avg_cost = layers.mean(square_cost)

    return scale_infer, avg_cost
Example #35
    def init_serv(self, place):
        main = fluid.Program()

        with fluid.program_guard(main):
            serv = layers.ListenAndServ(
                "127.0.0.1:0", ["X"], optimizer_mode=False)
            with serv.do():
                out_var = main.global_block().create_var(
                    name="scale_0.tmp_0",
                    persistable=True,
                    dtype="float32",
                    shape=[32, 32])
                x = layers.data(
                    shape=[32, 32],
                    dtype='float32',
                    name="X",
                    append_batch_size=False)
                fluid.initializer.Constant(value=1.0)(x, main.global_block())
                layers.scale(x=x, scale=10.0, out=out_var)

        self.server_exe = fluid.Executor(place)
        self.server_exe.run(main)
Example #36
    def create_rnn_op(self):
        x = layers.data(
            shape=[self.sent_len, self.batch_size, self.input_dim],
            dtype='float32',
            name='x',
            append_batch_size=False,
            **self.p_info)
        x.stop_gradient = False
        h_boot1 = layers.data(
            shape=[self.batch_size, self.input_dim],
            dtype='float32',
            name='h_boot1',
            append_batch_size=False,
            **self.p_info)
        h_boot1.stop_gradient = False
        h_boot2 = layers.data(
            shape=[self.batch_size, self.input_dim],
            dtype='float32',
            name='h_boot2',
            append_batch_size=False,
            **self.p_info)
        h_boot2.stop_gradient = False

        rnn = layers.StaticRNN(main_program=self.main_program)
        with rnn.step():
            h_pre1 = rnn.memory(init=h_boot1)
            h_pre2 = rnn.memory(init=h_boot2)
            x_t = rnn.step_input(x)

            mem1 = layers.scale(x=h_pre1, scale=1.0, **self.p_info)
            mem2 = layers.scale(x=h_pre2, scale=1.0, **self.p_info)
            out = layers.sums(input=[mem1, x_t, mem2], **self.p_info)

            rnn.update_memory(h_pre1, mem1)
            rnn.update_memory(h_pre2, mem2)
            rnn.output(out)

        return rnn()
Example #37
    def test_read_write(self):
        x = [
            layers.data(
                name='x0', shape=[100]), layers.data(
                    name='x1', shape=[100]), layers.data(
                        name='x2', shape=[100])
        ]

        for each_x in x:
            each_x.stop_gradient = False

        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = False
        arr = layers.array_write(x=x[0], i=i)
        i = layers.increment(x=i)
        arr = layers.array_write(x=x[1], i=i, array=arr)
        i = layers.increment(x=i)
        arr = layers.array_write(x=x[2], i=i, array=arr)

        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = False
        a0 = layers.array_read(array=arr, i=i)
        i = layers.increment(x=i)
        a1 = layers.array_read(array=arr, i=i)
        i = layers.increment(x=i)
        a2 = layers.array_read(array=arr, i=i)

        mean_a0 = layers.mean(a0)
        mean_a1 = layers.mean(a1)
        mean_a2 = layers.mean(a2)

        a_sum = layers.sums(input=[mean_a0, mean_a1, mean_a2])

        mean_x0 = layers.mean(x[0])
        mean_x1 = layers.mean(x[1])
        mean_x2 = layers.mean(x[2])

        x_sum = layers.sums(input=[mean_x0, mean_x1, mean_x2])

        scope = core.Scope()
        cpu = core.CPUPlace()

        exe = Executor(cpu)

        tensor = numpy.random.random(size=(100, 100)).astype('float32')

        outs = exe.run(feed={'x0': tensor,
                             'x1': tensor,
                             'x2': tensor},
                       fetch_list=[a_sum, x_sum],
                       scope=scope)
        self.assertEqual(outs[0], outs[1])

        total_sum = layers.sums(input=[a_sum, x_sum])
        total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)

        append_backward(total_sum_scaled)

        g_vars = list(
            map(default_main_program().global_block().var,
                [each_x.name + "@GRAD" for each_x in x]))
        g_out = [
            item.sum()
            for item in exe.run(
                feed={'x0': tensor,
                      'x1': tensor,
                      'x2': tensor},
                fetch_list=g_vars)
        ]
        g_out_sum = numpy.array(g_out).sum()

        # Since our final gradient is 1 and the network is all linear (mean
        # ops), the input gradient should also be 1.
        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)