def test_masked_fill_fw_bw_job(
    x: oft.Numpy.Placeholder(x_shape, dtype=flow_type),
    mask: oft.Numpy.Placeholder(mask_shape, dtype=flow_type),
):
    with flow.scope.placement(device, "0:0"):
        y = flow.get_variable(
            name="vx",
            shape=(1,),
            dtype=flow.float,
            initializer=flow.zeros_initializer(),
        )
        x += flow.cast(y, flow_type)
        mask = flow.cast(mask, dtype=flow.int8)
        if type_name == "float16":
            out = flow.cast(
                flow.masked_fill(flow.cast(x, flow.float16), mask, value),
                flow.float,
            )
        else:
            out = flow.masked_fill(x, mask, value)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [1e-4]), momentum=0
        ).minimize(out)

        flow.watch(x, test_global_storage.Setter("x"))
        flow.watch_diff(x, test_global_storage.Setter("x_diff"))
        flow.watch(out, test_global_storage.Setter("out"))
        flow.watch_diff(out, test_global_storage.Setter("out_diff"))

        return out
def forward(self, logits, target, mask=None):
    """LabelSmoothing Function with Mask

    Args:
        logits ([tensor]): logits with shape [batch, length, vocab_size]
        target ([tensor]): target with shape [batch, length]
        mask ([tensor], optional): mask tensor (bool) with shape [batch, length]
    """
    assert logits.dim() == 3 and logits.size(-1) == self.size
    pad_mask = target == self.padding_idx
    if mask is not None:
        mask = (pad_mask.int() + mask.int()) > 0
    else:
        mask = pad_mask

    logits = logits.reshape(-1, self.size)
    with flow.no_grad():
        confidence = logits.clone()
        confidence.fill_(self.smoothing / (self.size - 1))
        confidence = flow.scatter(
            confidence, 1, target.reshape(-1).unsqueeze(1), 1 - self.smoothing
        )

    logsoftmax = nn.LogSoftmax(dim=-1)
    KLdiv = nn.KLDivLoss(reduction="none", log_target=False)
    loss = flow.sum(KLdiv(logsoftmax(logits), confidence), dim=-1)

    total = flow.sum(mask == 0)
    denom = total if self.normalize_length else logits.size(0)
    loss = flow.masked_fill(loss, mask.reshape(-1), 0.0)
    loss = flow.sum(loss) / denom

    return loss
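# --- Hypothetical usage sketch for the label-smoothing loss above ---
# A minimal sketch only: the class name `LabelSmoothingLoss`, its constructor
# signature, and the concrete sizes are assumptions; the tensor shapes follow
# the docstring ([batch, length, vocab_size] / [batch, length]).
import oneflow as flow

vocab_size = 1000
criterion = LabelSmoothingLoss(size=vocab_size, padding_idx=0, smoothing=0.1)
logits = flow.randn(8, 20, vocab_size)           # [batch, length, vocab_size]
target = flow.randint(0, vocab_size, (8, 20))    # [batch, length]
loss = criterion(logits, target)                 # an optional bool mask may also be passed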
def mask_finished_preds(pred, flag):
    """
    If a sequence is finished, all of its branches should be </S> (3).

    Args:
        pred: An int array with shape [batch_size * beam_size, beam_size].
        flag: A bool array with shape [batch_size * beam_size, 1].

    Returns:
        An int array with shape [batch_size * beam_size].
    """
    beam_width = pred.size(-1)
    finished = flag.repeat([1, beam_width])
    return flow.masked_fill(pred, finished.to(dtype=flow.uint8) == 1, EOS)
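# --- Hypothetical call of mask_finished_preds above ---
# A minimal sketch: shapes follow the docstring, and EOS is assumed to be the
# module-level constant (3, i.e. </S>) mentioned there.
import oneflow as flow

EOS = 3
pred = flow.randint(0, 100, (4, 2))                          # [batch_size * beam_size, beam_size]
flag = flow.tensor([[1], [0], [1], [0]], dtype=flow.uint8)   # 1 = sequence already finished
pred = mask_finished_preds(pred, flag)                       # finished rows are forced to EOS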
def forward(self, x, mask):
    """
    Args:
        x: [batch_size, time, channels]
        mask: [batch_size, time]
    """
    mask = mask.unsqueeze(2).repeat([1, 1, x.size(-1)])

    x = self.pointwise_conv1(x)
    x = F.glu(x)
    x = flow.masked_fill(x, mask == 0, 0.0)

    x = x.transpose(1, 2)
    x = self.depthwise_conv(x)
    x = self.batch_norm(x)
    x = x * flow.sigmoid(x)
    x = x.transpose(1, 2)

    x = self.pointwise_conv2(x)
    x = flow.masked_fill(x, mask == 0, 0.0)

    return x
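# --- Standalone sketch of the padding-mask pattern used above ---
# Assumed shapes: x is [batch, time, channels] and mask is [batch, time] with
# 1 marking valid frames; padded frames are zeroed with masked_fill. Written
# against the eager oneflow API (an assumption).
import oneflow as flow

x = flow.randn(2, 5, 8)
mask = flow.tensor([[1, 1, 1, 0, 0],
                    [1, 1, 0, 0, 0]])
mask3d = mask.unsqueeze(2).repeat([1, 1, x.size(-1)])   # [batch, time, channels]
x = flow.masked_fill(x, mask3d == 0, 0.0)               # zero out the padded frames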
def mask_finished_scores(score, flag):
    """
    If a sequence is finished, we only allow one alive branch. This function
    aims to give one branch a zero score and the rest -inf score.

    Args:
        score: A real value array with shape [batch_size * beam_size, beam_size].
        flag: A bool array with shape [batch_size * beam_size, 1].

    Returns:
        A real value array with shape [batch_size * beam_size, beam_size].
    """
    beam_width = score.size(-1)
    zero_mask = flow.zeros_like(flag).to(dtype=flow.uint8)
    if beam_width > 1:
        unfinished = flow.cat(
            [zero_mask, flag.repeat([1, beam_width - 1])], dim=1)
        finished = flow.cat(
            (flag.to(dtype=flow.uint8), zero_mask.repeat([1, beam_width - 1])),
            dim=1)
    else:
        unfinished = zero_mask
        finished = flag.to(dtype=flow.uint8)
    score = flow.masked_fill(score, unfinished == 1, -float("inf"))
    score = flow.masked_fill(score, finished == 1, 0)
    return score
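# --- Hypothetical call of mask_finished_scores above ---
# A minimal sketch with beam_width = 3 and illustrative flags: for rows whose
# flag is 1, column 0 is forced to 0 and the remaining columns to -inf, so only
# one branch of a finished hypothesis stays alive.
import oneflow as flow

score = flow.randn(4, 3)                                     # [batch_size * beam_size, beam_size]
flag = flow.tensor([[1], [0], [0], [1]], dtype=flow.uint8)   # 1 = finished hypothesis
score = mask_finished_scores(score, flag)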
def compute_context(self, values, scores, mask=None):
    """
    Args:
        values: [b, t2, v] or [b, nh, t2, v]
        scores: [b, t1, t2] or [b, nh, t1, t2]
        mask: [b, t1, t2] or [b, 1/nh, t1, t2]
    """
    assert values.dim() == scores.dim()

    if mask is not None:
        scores = flow.masked_fill(scores, mask == 0, -float("inf"))

    weights = flow.softmax(scores, dim=-1)
    context = flow.matmul(weights, values)

    if context.dim() == 4:
        b, n, t, v = context.size()
        context = context.transpose(1, 2).reshape(b, t, n * v)

    if self.enable_output_proj:
        context = self.output_proj(context)

    return self.dropout(context), weights
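# --- Standalone sketch of the masked attention weighting done above ---
# Assumed shapes: values [b, t2, v], scores [b, t1, t2], mask [b, t1, t2] with
# 1 = keep; masked positions get -inf before the softmax so they receive zero
# attention weight. Written against the eager oneflow API (an assumption).
import oneflow as flow

values = flow.randn(2, 6, 16)
scores = flow.randn(2, 4, 6)
mask = flow.ones(2, 4, 6, dtype=flow.int8)
mask[:, :, 4:] = 0                                   # pretend the last two key frames are padding
scores = flow.masked_fill(scores, mask == 0, -float("inf"))
weights = flow.softmax(scores, dim=-1)
context = flow.matmul(weights, values)               # [b, t1, v]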
def _masked_fill(self, mask, fill_value):
    return flow.masked_fill(self, mask, fill_value)
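# --- Hypothetical tensor-method usage enabled by the registration above ---
# A minimal sketch; the exact registration mechanism (e.g. register_tensor_op)
# is not shown in the snippet and is assumed.
import oneflow as flow

x = flow.randn(2, 3)
mask = x > 0
y = x.masked_fill(mask, 0.0)   # equivalent to flow.masked_fill(x, mask, 0.0)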
def masked_fill_Job(x: tp.Numpy.Placeholder(x.shape),
                    mask: tp.Numpy.Placeholder((4,), dtype=flow.int8)) -> tp.Numpy:
    out = flow.masked_fill(x, mask, value=5)
    return out
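# --- Hypothetical driver for a job like masked_fill_Job above ---
# A minimal, self-contained sketch assuming a legacy OneFlow release where the
# lazy job-function API (flow.global_function, oneflow.typing) is available;
# the input array and mask values are illustrative.
import numpy as np
import oneflow as flow
import oneflow.typing as tp

x = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)

@flow.global_function()
def masked_fill_demo_job(x: tp.Numpy.Placeholder(x.shape),
                         mask: tp.Numpy.Placeholder((4,), dtype=flow.int8)) -> tp.Numpy:
    return flow.masked_fill(x, mask, value=5)

mask = np.array([0, 0, 0, 1], dtype=np.int8)
out = masked_fill_demo_job(x, mask)   # expected: [1., 2., 3., 5.]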
def Causal_Self_Attention(x, config, name='csa'):
    """
    Input::
        x : embedded word input [B, T, C]
            -- B is the batch size
            -- T is the sequence length (block_size)
            -- C is the dimension of the embedding (n_embd)
               C / head_number = dimension of each head (d_k)
        config: class object defined with models.GPTConfig
    Output::
        y : output of x, which can be used as the new x in the next iteration
    Description::
        This function is the causal self-attention core, which is part of the
        multi-head attention scheme.
        Code referred from: https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
        Theory referred from: http://jalammar.github.io/illustrated-gpt2/
        Related paper:
    """
    assert config.n_embd % config.n_head == 0

    B, T, C = x.shape

    # Kaiming initializer
    kaiming_init_C = flow.kaiming_initializer(shape=(C, C))

    ## calculate query, key, values for all heads in batch and move head forward to be the batch dim
    # define: key, query and value projections for all heads
    # process: query + key ----> value
    # dimension: (B, T, C) -> (B, nh, T, hs), nh * hs = C

    # query: the query is a representation of the current word used to score
    # against all the other words (using their keys).
    query = flow.layers.dense(x,
                              units=config.n_embd,
                              kernel_initializer=kaiming_init_C,
                              name=(name + '_query'))
    query = flow.reshape(query, [B, T, config.n_head, C // config.n_head])
    query = flow.transpose(query, [0, 2, 1, 3])

    # key: key vectors are like labels for all the words in the segment.
    key = flow.layers.dense(x,
                            units=config.n_embd,
                            kernel_initializer=kaiming_init_C,
                            name=(name + '_key'))
    key = flow.reshape(key, [B, T, config.n_head, C // config.n_head])
    key = flow.transpose(key, [0, 2, 1, 3])

    # value: value vectors are actual word representations.
    value = flow.layers.dense(x,
                              units=config.n_embd,
                              kernel_initializer=kaiming_init_C,
                              name=(name + '_value'))
    value = flow.reshape(value, [B, T, config.n_head, C // config.n_head])
    value = flow.transpose(value, [0, 2, 1, 3])

    ## causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
    att = flow.matmul(query, flow.transpose(
        key, [0, 1, 3, 2])) * (1.0 / math.sqrt(key.shape[-1]))
    att_tril = flow.math.tril(
        flow.constant(value=int(-1),
                      dtype=flow.int32,
                      shape=(B, config.n_head, T, T),
                      name=name + "_ConstantLike_tril"))
    att_tril = att_tril + flow.ones_like(like=att_tril, dtype=flow.int32)
    att = flow.masked_fill(att, att_tril, float('-inf'))
    att = flow.nn.softmax(att, name=name + 'att')
    att = flow.nn.dropout(att, config.attn_pdrop)

    ## QK*V: (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
    y = flow.matmul(att, value)
    y = flow.transpose(y, [0, 2, 1, 3])
    y = flow.reshape(y, [B, T, C])
    y = flow.nn.dropout(y, config.resid_pdrop)

    return y
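# --- Standalone sketch of the causal-mask trick used above ---
# Written against the newer eager oneflow API (an assumption; the function
# itself targets the lazy job-function API): entries above the diagonal are
# filled with -inf before the softmax, so each position attends only to itself
# and to earlier positions.
import oneflow as flow

T = 4
att = flow.randn(T, T)                                              # scores for one head
causal = flow.triu(flow.ones(T, T, dtype=flow.int8), diagonal=1)    # 1 above the diagonal
att = flow.masked_fill(att, causal == 1, float("-inf"))
att = flow.softmax(att, dim=-1)                                     # lower-triangular weights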