Example #1
    def test_masked_fill_fw_bw_job(
            x: oft.Numpy.Placeholder(x_shape, dtype=flow_type),
            mask: oft.Numpy.Placeholder(mask_shape, dtype=flow_type),
    ):
        with flow.scope.placement(device, "0:0"):
            y = flow.get_variable(
                name="vx",
                shape=(1, ),
                dtype=flow.float,
                initializer=flow.zeros_initializer(),
            )
            # add a zero-initialized variable so x participates in the backward pass
            x += flow.cast(y, flow_type)
            mask = flow.cast(mask, dtype=flow.int8)
            # exercise the float16 path by casting in and out around masked_fill
            if type_name == "float16":
                out = flow.cast(
                    flow.masked_fill(flow.cast(x, flow.float16), mask, value),
                    flow.float,
                )
            else:
                out = flow.masked_fill(x, mask, value)
            # attach an SGD optimizer so a backward pass is built and x_diff is populated
            flow.optimizer.SGD(
                flow.optimizer.PiecewiseConstantScheduler([], [1e-4]),
                momentum=0,
            ).minimize(out)

            # record forward tensors and their gradients for later inspection
            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch(out, test_global_storage.Setter("out"))
            flow.watch_diff(out, test_global_storage.Setter("out_diff"))
            return out
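
As a rough reference for what the watch hooks above should observe, here is a hedged numpy sketch (the fill value 5.0 and the 1-D shapes are hypothetical; the test's actual value, shapes, and dtypes come from its parametrization): masked_fill copies the fill value where the mask is set and passes the input through elsewhere, and its gradient is zero at masked positions.

import numpy as np

x = np.array([1.0, 2.0, 3.0], dtype=np.float32)
mask = np.array([0, 1, 0], dtype=np.int8)
out_ref = np.where(mask != 0, 5.0, x)        # forward: [1.0, 5.0, 3.0]
x_diff_ref = np.where(mask != 0, 0.0, 1.0)   # gradient of sum(out) w.r.t. x: [1.0, 0.0, 1.0]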
Example #2
    def forward(self, logits, target, mask=None):
        """LabelSmoothing Function with Mask

        Args:
            logits ([tensor]): logits with shape [batch, length, vocab_size]
            target ([tensor]): target with shape [batch, length]
            mask ([tensor], optional): mask tensor (bool) with shape [batch, length]
        """
        assert logits.dim() == 3 and logits.size(-1) == self.size

        # positions to ignore: padding tokens plus any extra mask supplied by the caller
        pad_mask = target == self.padding_idx
        if mask is not None:
            mask = (pad_mask.int() + mask.int()) > 0
        else:
            mask = pad_mask

        logits = logits.reshape(-1, self.size)
        # build the label-smoothed target distribution; no gradients are needed here
        with flow.no_grad():
            confidence = logits.clone()
            confidence.fill_(self.smoothing / (self.size - 1))
            confidence = flow.scatter(confidence, 1,
                                      target.reshape(-1).unsqueeze(1),
                                      1 - self.smoothing)

        logsoftmax = nn.LogSoftmax(dim=-1)
        KLdiv = nn.KLDivLoss(reduction="none", log_target=False)
        loss = flow.sum(KLdiv(logsoftmax(logits), confidence), dim=-1)

        # normalize by the number of unmasked tokens, or by the batch dimension
        total = flow.sum(mask == 0)
        denom = total if self.normalize_length else logits.size(0)
        # zero the loss at masked positions before reducing
        loss = flow.masked_fill(loss, mask.reshape(-1), 0.0)
        loss = flow.sum(loss) / denom

        return loss
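
To make the smoothing step above concrete, here is a small hedged sketch of the confidence distribution it builds (the vocabulary size 4, smoothing 0.3, and target id 2 are made-up numbers, not values from the example): every class gets smoothing / (size - 1) and the target class gets 1 - smoothing.

import oneflow as flow

size, smoothing = 4, 0.3
confidence = flow.full((1, size), smoothing / (size - 1))        # [0.1, 0.1, 0.1, 0.1]
target = flow.tensor([[2]])
confidence = flow.scatter(confidence, 1, target, 1 - smoothing)  # [0.1, 0.1, 0.7, 0.1]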
Example #3
def mask_finished_preds(pred, flag):
    """
    If a sequence is finished, all of its branches should be </S> (EOS, id 3).
    Args:
        pred: An int array with shape [batch_size * beam_size, beam_size].
        flag: A bool array with shape [batch_size * beam_size, 1].
    Returns:
        An int array with shape [batch_size * beam_size, beam_size].
    """
    beam_width = pred.size(-1)
    finished = flag.repeat([1, beam_width])
    return flow.masked_fill(pred, finished.to(dtype=flow.uint8) == 1, EOS)
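
A hedged eager-mode usage sketch of the helper above (the tensor values, dtypes, and EOS == 3 are assumptions for illustration): the finished row is overwritten with the EOS id, the unfinished row is left alone.

import oneflow as flow

EOS = 3
pred = flow.tensor([[7, 8, 9],
                    [4, 5, 6]])                    # [batch * beam, beam]
flag = flow.tensor([[1], [0]], dtype=flow.int8)    # row 0 is finished
print(mask_finished_preds(pred, flag))
# row 0 -> [3, 3, 3]; row 1 stays [4, 5, 6]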
Example #4
    def forward(self, x, mask):
        """
        Args:
            x: [batch_size, time, channels]
            mask: [batch_size, time]
        """
        # expand the [batch, time] mask to [batch, time, channels] so it can gate every channel
        mask = mask.unsqueeze(2).repeat([1, 1, x.size(-1)])

        x = self.pointwise_conv1(x)
        x = F.glu(x)
        # zero out padded time steps so they do not leak into the convolutions
        x = flow.masked_fill(x, mask == 0, 0.0)

        x = x.transpose(1, 2)
        x = self.depthwise_conv(x)
        x = self.batch_norm(x)
        x = x * flow.sigmoid(x)
        x = x.transpose(1, 2)

        x = self.pointwise_conv2(x)
        x = flow.masked_fill(x, mask == 0, 0.0)

        return x
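
The mask handling above can be checked in isolation with a small hedged sketch (shapes and values are made up): a [batch, time] padding mask is expanded across the channel dimension and then used to zero the padded frames.

import oneflow as flow

x = flow.ones(1, 3, 4)                               # [batch, time, channels]
mask = flow.tensor([[1, 1, 0]])                      # last frame is padding
mask = mask.unsqueeze(2).repeat([1, 1, x.size(-1)])  # [1, 3, 4]
x = flow.masked_fill(x, mask == 0, 0.0)              # frame 2 becomes all zeros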
Example #5
def mask_finished_scores(score, flag):
    """
    If a sequence is finished, we only allow one alive branch: this function gives that branch a score of zero
    and every other branch a score of -inf.
    Args:
        score: A real-valued array with shape [batch_size * beam_size, beam_size].
        flag: A bool array with shape [batch_size * beam_size, 1].
    Returns:
        A real-valued array with shape [batch_size * beam_size, beam_size].
    """
    beam_width = score.size(-1)
    zero_mask = flow.zeros_like(flag).to(dtype=flow.uint8)
    if beam_width > 1:
        unfinished = flow.cat(
            [zero_mask, flag.repeat([1, beam_width - 1])], dim=1)
        finished = flow.cat(
            (flag.to(dtype=flow.uint8), zero_mask.repeat([1, beam_width - 1])),
            dim=1)
    else:
        unfinished = zero_mask
        finished = flag.to(dtype=flow.uint8)
    score = flow.masked_fill(score, unfinished == 1, -float("inf"))
    score = flow.masked_fill(score, finished == 1, 0)
    return score
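
A hedged eager-mode sketch of the scoring rule above (the tensor values and the uint8 flag dtype are assumptions for illustration): in the finished row only the first branch survives with score 0 and the rest are pushed to -inf; the unfinished row is untouched.

import oneflow as flow

score = flow.tensor([[0.1, 0.2, 0.3],
                     [0.4, 0.5, 0.6]])               # [batch * beam, beam]
flag = flow.tensor([[1], [0]], dtype=flow.uint8)     # row 0 is finished
print(mask_finished_scores(score, flag))
# row 0 -> [0.0, -inf, -inf]; row 1 stays [0.4, 0.5, 0.6]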
Example #6
    def compute_context(self, values, scores, mask=None):
        """
        Args:
            values: [b, t2, v] or [b, nh, t2, v]
            scores: [b, t1, t2] or [b, nh, t1, t2]
            mask: [b, t1, t2] or [b, 1/nh, t1, t2]
        """
        assert values.dim() == scores.dim()

        if mask is not None:
            # masked positions get -inf so that softmax assigns them zero weight
            scores = flow.masked_fill(scores, mask == 0, -float("inf"))

        weights = flow.softmax(scores, dim=-1)
        context = flow.matmul(weights, values)

        if context.dim() == 4:
            # merge the heads back: [b, nh, t1, v] -> [b, t1, nh * v]
            b, n, t, v = context.size()
            context = context.transpose(1, 2).reshape(b, t, n * v)

        if self.enable_output_proj:
            context = self.output_proj(context)

        return self.dropout(context), weights
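
The effect of the -inf fill before the softmax can be seen in a tiny hedged sketch (values are made up): the masked position ends up with exactly zero attention weight.

import oneflow as flow

scores = flow.tensor([[0.5, 1.0, 2.0]])
mask = flow.tensor([[1, 1, 0]])                     # last position is masked out
scores = flow.masked_fill(scores, mask == 0, -float("inf"))
print(flow.softmax(scores, dim=-1))                 # third weight is 0.0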
Example #7
def _masked_fill(self, mask, fill_value):
    return flow.masked_fill(self, mask, fill_value)
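
Example #7 is just a thin wrapper, so a minimal hedged eager-mode sketch of the underlying call may help (tensor values are made up): positions where the mask is True receive the fill value, everything else passes through unchanged.

import oneflow as flow

x = flow.tensor([[1.0, 2.0], [3.0, 4.0]])
mask = flow.tensor([[True, False], [False, True]])
print(flow.masked_fill(x, mask, -1.0))
# [[-1.,  2.],
#  [ 3., -1.]]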
Example #8
# the outer x is a numpy array defined before this job; its shape sets the placeholder shape
def masked_fill_Job(x: tp.Numpy.Placeholder(x.shape),
                    mask: tp.Numpy.Placeholder((4, ),
                                               dtype=flow.int8)) -> tp.Numpy:
    out = flow.masked_fill(x, mask, value=5)
    return out
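
A hedged driver sketch for Example #8, assuming OneFlow 0.x lazy mode: x is a numpy array defined before the job (its shape fixes the placeholder), the job is decorated with @flow.global_function(), and calling it with numpy inputs returns a numpy result.

import numpy as np

# defined before the job so that tp.Numpy.Placeholder(x.shape) can read its shape
x = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
mask = np.array([1, 0, 1, 0], dtype=np.int8)

out = masked_fill_Job(x, mask)
# positions where mask != 0 are replaced by 5: [5., 2., 5., 4.]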
Example #9
def Causal_Self_Attention(x, config, name='csa'):
    """
    Input:: 
        x : Eembedded words input[B, T, C]
            -- B is the batch size
            -- T is the sequence length(block_size)
            -- C is the dimension of the embedding (n_embd)
               C/head_number = dimension of each head(d_k)
        config: class object defined with models.GPTConfig
    Output::
        y : output of x, which can be used as new x in next interation
    
 
    Description::
        This functions is the causl_sefl_attention core, which is a part of multiple head attention
        schema.
        Code refered from: https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
        Theory refered from: http://jalammar.github.io/illustrated-gpt2/
        Related paper: 
    """
    assert config.n_embd % config.n_head == 0

    # unpack input dimensions
    B, T, C = x.shape
    # Kaiming initializer for the (C, C) projection weights
    kaiming_init_C = flow.kaiming_initializer(shape=(C, C))
    ## calculate query, key, values for all heads in batch and move the head dim forward to be a batch-like dim
    # define: key, query and value projections for all heads
    # process: query + key ----> value
    # dimension: (B, T, C) -> (B, nh, T, hs), with nh * hs = C

    # query: the query is a representation of the current word used to score against all the other words (using their keys)
    query = flow.layers.dense(x,
                              units=config.n_embd,
                              kernel_initializer=kaiming_init_C,
                              name=(name + '_query'))
    query = flow.reshape(query, [B, T, config.n_head, C // config.n_head])
    query = flow.transpose(query, [0, 2, 1, 3])
    # key: key vectors are like labels for all the words in the segment
    key = flow.layers.dense(x,
                            units=config.n_embd,
                            kernel_initializer=kaiming_init_C,
                            name=(name + '_key'))
    key = flow.reshape(key, [B, T, config.n_head, C // config.n_head])
    key = flow.transpose(key, [0, 2, 1, 3])
    # value: value vectors are the actual word representations
    value = flow.layers.dense(x,
                              units=config.n_embd,
                              kernel_initializer=kaiming_init_C,
                              name=(name + '_value'))
    value = flow.reshape(value, [B, T, config.n_head, C // config.n_head])
    value = flow.transpose(value, [0, 2, 1, 3])

    ## causal self-attention; self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
    att = flow.matmul(query, flow.transpose(
        key, [0, 1, 3, 2])) * (1.0 / math.sqrt(key.shape[-1]))
    # build the causal mask: tril(-1) + ones leaves 1 strictly above the diagonal
    # (future positions) and 0 elsewhere, so masked_fill pushes future scores to -inf
    att_tril = flow.math.tril(
        flow.constant(value=int(-1),
                      dtype=flow.int32,
                      shape=(B, config.n_head, T, T),
                      name=name + "_ConstantLike_tril"))
    att_tril = att_tril + flow.ones_like(like=att_tril, dtype=flow.int32)
    att = flow.masked_fill(att, att_tril, float('-inf'))
    att = flow.nn.softmax(att, name=name + 'att')
    att = flow.nn.dropout(att, config.attn_pdrop)
    ## QK*V: (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
    y = flow.matmul(att, value)
    y = flow.transpose(y, [0, 2, 1, 3])
    y = flow.reshape(y, [B, T, C])
    y = flow.nn.dropout(y, config.resid_pdrop)
    return y
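
The mask trick in Example #9 can be sanity-checked on its own with a small hedged eager-mode sketch (T = 3 and the float dtype are assumptions; the example itself builds the mask as an int32 constant in lazy mode):

import oneflow as flow

T = 3
tril = flow.tril(flow.full((T, T), -1.0))   # -1 on and below the diagonal, 0 above
mask = tril + flow.ones_like(tril)          # 1 strictly above the diagonal, 0 elsewhere
print(mask)
# [[0., 1., 1.],
#  [0., 0., 1.],
#  [0., 0., 0.]]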