def _in_projection(
    q: Tensor,
    k: Tensor,
    v: Tensor,
    w_q: Tensor,
    w_k: Tensor,
    w_v: Tensor,
    b_q: Optional[Tensor] = None,
    b_k: Optional[Tensor] = None,
    b_v: Optional[Tensor] = None,
) -> Tuple[Tensor, Tensor, Tensor]:
    Eq, Ek, Ev = q.size(-1), k.size(-1), v.size(-1)
    assert w_q.shape == (Eq, Eq), \
        f"expecting query weights shape of {(Eq, Eq)}, but got {w_q.shape}"
    assert w_k.shape == (Eq, Ek), \
        f"expecting key weights shape of {(Eq, Ek)}, but got {w_k.shape}"
    assert w_v.shape == (Eq, Ev), \
        f"expecting value weights shape of {(Eq, Ev)}, but got {w_v.shape}"
    assert b_q is None or b_q.shape == (Eq,), \
        f"expecting query bias shape of {(Eq,)}, but got {b_q.shape}"
    assert b_k is None or b_k.shape == (Eq,), \
        f"expecting key bias shape of {(Eq,)}, but got {b_k.shape}"
    assert b_v is None or b_v.shape == (Eq,), \
        f"expecting value bias shape of {(Eq,)}, but got {b_v.shape}"
    return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
def forward(
    self,
    query: flow.Tensor,
    key: flow.Tensor,
    value: flow.Tensor,
    mask: Optional[flow.Tensor] = None,
) -> Tuple[flow.Tensor, flow.Tensor]:
    batch_size = query.size(0)

    query = self.query(query)
    key = self.key(key)
    value = self.value(value)

    # multi head
    query = query.view(batch_size, -1, self.num_attention_heads,
                       self.dims_per_head).transpose(1, 2)
    key = key.view(batch_size, -1, self.num_attention_heads,
                   self.dims_per_head).transpose(1, 2)
    value = value.view(batch_size, -1, self.num_attention_heads,
                       self.dims_per_head).transpose(1, 2)

    # self attention
    context, attention = self.attention(query, key, value, attn_mask=mask)

    # concat heads
    context = context.transpose(1, 2).contiguous().view(
        batch_size, -1, self.hidden_size)
    output = self.dense(context)
    return output, attention
def topk_accuracy(output: Tensor,
                  target: Tensor,
                  topk: Sequence[int] = (1, )) -> List[Tensor]:
    """
    https://github.com/pytorch/examples/blob/master/imagenet/main.py#L411

    Args:
        output: [B, C], for C way classification
        target: [B]
    """
    maxk = max(topk)
    batch_size = target.size(0)

    if target.ndim == 2:
        # Possibly onehot target
        target = target.max(dim=1).values

    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=False)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
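# A small usage sketch for topk_accuracy (the tensors below are illustrative,
# not from the original repo): logits for 4 samples over 10 classes and integer
# class targets; each returned value is a percentage in [0, 100].
logits = flow.randn(4, 10)
targets = flow.tensor([3, 7, 1, 0])
top1, top5 = topk_accuracy(logits, targets, topk=(1, 5))
# top5 >= top1 by construction, since a top-1 hit is also a top-5 hit.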
def forward(self, cosine: flow.Tensor, label):
    # indices of samples with a valid label (-1 marks samples to skip)
    index = flow.where(label != -1)[0]
    # one-hot margin: put the additive angular margin m on the target-class column
    m_hot = flow.zeros(index.size()[0],
                       cosine.size()[1],
                       device=cosine.device)
    m_hot.scatter_(1, label[index, None], self.m)
    # cos(theta) -> theta, add the margin, back to cos(theta + m), then scale by s
    cosine.acos_()
    cosine[index] += m_hot
    cosine.cos_().mul_(self.s)
    return cosine
def forward(
    self,
    input_ids: flow.Tensor,
    token_type_ids: Optional[flow.Tensor] = None,
    position_ids: Optional[flow.Tensor] = None,
) -> flow.Tensor:
    input_shape = input_ids.size()
    seq_length = input_shape[1]

    if token_type_ids is None:
        token_type_ids = flow.zeros(input_shape,
                                    dtype=flow.long,
                                    device=input_ids.device)
    if position_ids is None:
        position_ids = flow.arange(seq_length,
                                   dtype=flow.long,
                                   device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand(input_shape)

    input_embeddings = self.token_embeddings(input_ids)
    token_type_embeddings = self.token_type_embeddings(token_type_ids)
    position_embeddings = self.position_embeddings(position_ids)

    embeddings = input_embeddings + position_embeddings + token_type_embeddings
    embeddings = self.layer_norm(embeddings)
    embeddings = self.dropout(embeddings)
    return embeddings
def get_extended_attention_mask(self, attention_mask: flow.Tensor,
                                input_shape: Tuple[int],
                                device: flow.device) -> flow.Tensor:
    # We can provide a self-attention mask of dimensions
    # [batch_size, from_seq_length, to_seq_length] ourselves, in which case
    # we just need to make it broadcastable to all heads.
    if attention_mask.dim() == 3:
        extended_attention_mask = attention_mask[:, None, :, :]
    elif attention_mask.dim() == 2:
        # Provided a padding mask of dimensions [batch_size, seq_length]
        # - if the model is a decoder, apply a causal mask in addition to the padding mask
        # - if the model is an encoder, make the mask broadcastable to
        #   [batch_size, num_heads, seq_length, seq_length]
        if self.is_decoder:
            batch_size, seq_length = input_shape
            seq_ids = flow.arange(seq_length, device=device)
            causal_mask = (seq_ids[None, None, :].repeat(
                batch_size, seq_length, 1) <= seq_ids[None, :, None])
            # in case past_key_values are used we need to add a prefix ones mask to the causal mask
            causal_mask = causal_mask.to(attention_mask.dtype)

            if causal_mask.shape[1] < attention_mask.shape[1]:
                prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
                causal_mask = flow.cat(
                    [
                        flow.ones(
                            (batch_size, seq_length, prefix_seq_len),
                            device=device,
                            dtype=causal_mask.dtype,
                        ),
                        causal_mask,
                    ],
                    axis=-1,
                )

            extended_attention_mask = (causal_mask[:, None, :, :] *
                                       attention_mask[:, None, None, :])
        else:
            extended_attention_mask = attention_mask[:, None, None, :]
    else:
        raise ValueError(
            f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
        )

    extended_attention_mask = extended_attention_mask.to(dtype=flow.float)
    extended_attention_mask = (1.0 - extended_attention_mask) * -1e9
    return extended_attention_mask
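# Sketch of the causal-mask construction used in the decoder branch above, for an
# illustrative seq_length of 4: position i may attend only to positions j <= i.
seq_ids = flow.arange(4)
causal_mask = (seq_ids[None, None, :].repeat(1, 4, 1) <= seq_ids[None, :, None])
# causal_mask[0]:
# [[1, 0, 0, 0],
#  [1, 1, 0, 0],
#  [1, 1, 1, 0],
#  [1, 1, 1, 1]]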
def numel_in_bucket(tensor: flow.Tensor):
    def align(x: int, unit_size: int):
        return (x + (unit_size - 1)) // unit_size * unit_size

    # tensor memory should be aligned to 512 bytes for cuda operations,
    # and a float takes 4 bytes
    # TODO(jianhao): expose the `kCudaMemAllocAlignSize` from C++ to
    # avoid this hardcoded "512"
    return align(tensor.numel(), 512 // 4)
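# Quick check of the alignment arithmetic in numel_in_bucket (a sketch, assuming
# float32 tensors): 512 bytes / 4 bytes per element = 128 elements per unit, so
# numel is rounded up to the next multiple of 128.
assert numel_in_bucket(flow.zeros(1)) == 128         # 1 -> 128
assert numel_in_bucket(flow.zeros(128)) == 128       # already aligned
assert numel_in_bucket(flow.zeros(129)) == 256       # 129 -> 256
assert numel_in_bucket(flow.zeros(10, 100)) == 1024  # 1000 -> 1024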
def _forward_impl(self, x: Tensor) -> Tensor:
    x = self.conv1(x)
    x = self.maxpool(x)
    x = self.stage2(x)
    x = self.stage3(x)
    x = self.stage4(x)
    x = self.conv5(x)
    x = x.mean([2, 3])  # globalpool
    x = self.fc(x)
    return x
def get_extended_attention_mask(
    self,
    attention_mask: flow.Tensor,
    input_ids: flow.Tensor,
):
    if attention_mask.dim() == 3:
        extended_attention_mask = attention_mask[:, None, :, :]
    elif attention_mask.dim() == 2:
        extended_attention_mask = attention_mask[:, None, None, :]
    else:
        raise ValueError("Wrong shape for input_ids (shape {}) "
                         "or attention_mask (shape {})".format(
                             input_ids.shape, attention_mask.shape))

    extended_attention_mask = extended_attention_mask.to(
        dtype=next(self.parameters()).dtype)  # fp16 compatibility
    extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
    return extended_attention_mask
def forward(
    self,
    src: Tensor,
    tgt: Tensor,
    src_mask: Optional[Tensor] = None,
    tgt_mask: Optional[Tensor] = None,
    memory_mask: Optional[Tensor] = None,
    src_key_padding_mask: Optional[Tensor] = None,
    tgt_key_padding_mask: Optional[Tensor] = None,
    memory_key_padding_mask: Optional[Tensor] = None,
) -> Tensor:
    if not self.batch_first and src.size(1) != tgt.size(1):
        raise RuntimeError("the batch number of src and tgt must be equal")
    elif self.batch_first and src.size(0) != tgt.size(0):
        raise RuntimeError("the batch number of src and tgt must be equal")

    if src.size(2) != self.d_model or tgt.size(2) != self.d_model:
        raise RuntimeError(
            "the feature number of src and tgt must be equal to d_model")

    memory = self.encoder(src, src_mask, src_key_padding_mask)
    output = self.decoder(
        tgt,
        memory,
        tgt_mask,
        memory_mask,
        tgt_key_padding_mask,
        memory_key_padding_mask,
    )
    return output
def forward(
    self,
    query: flow.Tensor,
    key: flow.Tensor,
    value: flow.Tensor,
    attn_mask: Optional[flow.Tensor] = None,
) -> Tuple[flow.Tensor, flow.Tensor]:
    r"""
    Args:
        query: [batch, num_attention_heads, len_query, dim_query]
        key: [batch, num_attention_heads, len_key, dim_key]
        value: [batch, num_attention_heads, len_value, dim_value]
        attn_mask: [batch, num_attention_heads, len_query, len_key]
    """
    attention = flow.matmul(query, key.transpose(-1, -2))
    attention = attention / math.sqrt(query.size(-1))
    if attn_mask is not None:
        attention = attention + attn_mask
    attention = nn.Softmax(dim=-1)(attention)
    attention = self.dropout(attention)
    context = flow.matmul(attention, value)
    return context, attention
def channel_shuffle(x: Tensor, groups: int) -> Tensor:
    batchsize, num_channels, height, width = x.size()
    channels_per_group = num_channels // groups

    # reshape
    x = flow.reshape(x, [batchsize, groups, channels_per_group, height, width])
    x = flow.transpose(x, 1, 2)

    # flatten
    x = flow.reshape(x, [batchsize, -1, height, width])
    return x
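# Usage sketch of channel_shuffle (illustrative values): with 4 channels and
# 2 groups, channels [0, 1, 2, 3] come back in the order [0, 2, 1, 3], while the
# batch and spatial dimensions are untouched.
x = flow.arange(4, dtype=flow.float32).reshape(1, 4, 1, 1)
y = channel_shuffle(x, groups=2)
# y.flatten() -> [0., 2., 1., 3.]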
def _in_projection_packed(
    q: Tensor,
    k: Tensor,
    v: Tensor,
    w: Tensor,
    b: Optional[Tensor] = None,
) -> List[Tensor]:
    E = q.size(-1)
    if k is v:
        if q is k:
            # self-attention
            # chunk does not work with dim=-1 here, so compute the last dim explicitly
            res = linear(q, w, b)
            chunk_dim = len(res.shape)
            return res.chunk(3, dim=chunk_dim - 1)
        else:
            # encoder-decoder attention
            # w_q, w_kv = w.split([E, E * 2])
            w_q, w_k, w_v = w.chunk(3, dim=0)
            w_kv = flow.cat([w_k, w_v])
            if b is None:
                b_q = b_kv = None
            else:
                # b_q, b_kv = b.split([E, E * 2])
                b_q, b_k, b_v = b.chunk(3, dim=0)
                b_kv = flow.cat([b_k, b_v])
            res = linear(k, w_kv, b_kv)
            chunk_dim = len(res.shape)
            # seems equivalent to:
            # return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(k, w_v, b_v)
            return [linear(q, w_q, b_q)] + res.chunk(2, dim=chunk_dim - 1)
    else:
        w_q, w_k, w_v = w.chunk(3, dim=0)
        if b is None:
            b_q = b_k = b_v = None
        else:
            b_q, b_k, b_v = b.chunk(3, dim=0)
        return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
def _expand_mask(mask: flow.Tensor,
                 dtype: flow.dtype,
                 tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len,
                                                  src_len).to(dtype)
    inverted_mask = 1.0 - expanded_mask
    return inverted_mask.masked_fill(inverted_mask.to(flow.int32), -1e9)
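# Behaviour sketch for _expand_mask (the mask values are illustrative; 1 = keep,
# 0 = padding): a [2, 3] padding mask becomes a [2, 1, 3, 3] additive mask with
# -1e9 at padded key positions and 0 elsewhere, ready to be added to attention scores.
mask = flow.tensor([[1, 1, 0],
                    [1, 0, 0]], dtype=flow.float32)
extended = _expand_mask(mask, dtype=flow.float32)
# extended.shape -> (2, 1, 3, 3)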
def forward(self, x: flow.Tensor):
    """Add positional encoding.

    Args:
        x (flow.Tensor): Input. Its shape is (batch, time, ...)

    Returns:
        Tuple[flow.Tensor, flow.Tensor]: Encoded tensor of shape (batch, time, ...)
        and the positional embedding of shape (1, time, emb_dim).
    """
    pos = flow.arange(0, x.size(1), device=x.device).reshape(1, -1)  # [1, t]
    posemb = self._embedding_from_positions(pos)  # [1, t, emb_dim]

    if self.scale_learnable:
        x = x + self.alpha * posemb
    else:
        x = x * self.xscale + posemb
    return self.dropout(x), posemb
def _scaled_dot_product_attention(
    q: Tensor,
    k: Tensor,
    v: Tensor,
    attn_mask: Optional[Tensor] = None,
    dropout_p: float = 0.0,
) -> Tuple[Tensor, Tensor]:
    B, Nt, E = q.shape
    q = q / math.sqrt(E)

    # (B, Nt, E) x (B, E, Ns) -> (B, Nt, Ns)
    attn = flow.bmm(q, k.transpose(-2, -1))
    if attn_mask is not None:
        attn += attn_mask
    attn = flow.softmax(attn, dim=-1)
    if dropout_p > 0.0:
        attn = flow.nn.functional.dropout(attn, p=dropout_p)

    # (B, Nt, Ns) x (B, Ns, E) -> (B, Nt, E)
    output = flow.bmm(attn, v)
    return output, attn
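# Minimal shape check for _scaled_dot_product_attention (sizes are illustrative:
# batch 2, 5 query positions, 7 key/value positions, embedding size 8).
q = flow.randn(2, 5, 8)
k = flow.randn(2, 7, 8)
v = flow.randn(2, 7, 8)
out, attn = _scaled_dot_product_attention(q, k, v)
# out: (2, 5, 8); attn: (2, 5, 7), with each attention row summing to 1.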
def prune_linear_layer(layer: nn.Linear,
                       index: flow.Tensor,
                       dim: int = 0) -> nn.Linear:
    index = index.to(layer.weight.device)
    # keep only the selected rows (dim=0, output units) or columns (dim=1, inputs)
    W = layer.weight.index_select(dim, index).clone().detach()
    if layer.bias is not None:
        if dim == 1:
            b = layer.bias.clone().detach()
        else:
            b = layer.bias[index].clone().detach()

    new_size = list(layer.weight.size())
    new_size[dim] = len(index)
    new_layer = nn.Linear(new_size[1],
                          new_size[0],
                          bias=layer.bias is not None).to(layer.weight.device)

    # copy the pruned parameters; gradients are disabled only during the copy
    new_layer.weight.requires_grad = False
    new_layer.weight.copy_(W.contiguous())
    new_layer.weight.requires_grad = True
    if layer.bias is not None:
        new_layer.bias.requires_grad = False
        new_layer.bias.copy_(b.contiguous())
        new_layer.bias.requires_grad = True
    return new_layer
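# Usage sketch for prune_linear_layer (the kept indices are illustrative): keep
# output units 0, 2 and 3 of a Linear(8, 4), yielding a new Linear(8, 3) whose
# weights and bias are copied from the original layer.
layer = nn.Linear(8, 4)
kept = flow.tensor([0, 2, 3], dtype=flow.int64)
pruned = prune_linear_layer(layer, kept, dim=0)
# pruned.weight.shape -> (3, 8)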