def __init__(
    self,
    input_sz,
    output_sz,
    d_model,
    nhead,
    num_encoder_layers,
    num_decoder_layers,
    dim_feedforward,
    dropout,
):
    super(TransformerModel, self).__init__()
    self.transformer = Transformer(
        d_model=d_model,
        nhead=nhead,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        dim_feedforward=dim_feedforward,
        dropout=dropout,
        batch_first=False,
    )
    self.softmax = nn.Softmax(dim=2)
    self.linear = nn.Linear(d_model, output_sz)
    self.pos_encoder = PositionalEncoding(d_model, dropout)
    self.pos_decoder = PositionalEncoding(d_model, dropout)
    self.src_embedding = Embeddings(input_sz, d_model)
    self.tgt_embedding = Embeddings(output_sz, d_model)
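# Hedged sketch (assumption, not part of the source): a forward pass matching the
# modules defined above. With batch_first=False, src/tgt are expected as
# [seq_len, batch]; self.softmax (dim=2) can be applied to the returned logits of
# shape [seq_len, batch, output_sz] if probabilities are needed.
def forward(self, src, tgt, tgt_mask=None):
    src = self.pos_encoder(self.src_embedding(src))
    tgt = self.pos_decoder(self.tgt_embedding(tgt))
    out = self.transformer(src, tgt, tgt_mask=tgt_mask)
    return self.linear(out)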
def __init__(self, emb_sz, emb_dim, hidden_size, nfc, n_classes, num_layers=1):
    super(LSTMText, self).__init__()
    self.emb_sz = emb_sz
    self.emb_dim = emb_dim
    self.n_classes = n_classes
    self.hidden_size = hidden_size
    self.nfc = nfc
    self.num_layers = num_layers
    self.bilstm = BiLSTM(emb_dim, hidden_size, num_layers)
    self.embedding = nn.Embedding(self.emb_sz, self.emb_dim)
    self.linear = nn.Linear(hidden_size * 2 * nfc, n_classes)
    self.softmax = nn.Softmax(dim=1)
def _attn(self, query, key, value):
    attn_weights = flow.matmul(query, key.transpose(-2, -1))
    if self.scale_attn_weights:
        attn_weights = attn_weights / (float(value.size(-1)) ** 0.5)

    query_length, key_length = query.size(-2), key.size(-2)
    causal_mask = self.bias[:, :, key_length - query_length:key_length, :key_length]
    attn_weights = flow.where(causal_mask, attn_weights, self.masked_bias.to(attn_weights.dtype))

    attn_weights = nn.Softmax(dim=-1)(attn_weights)
    attn_weights = self.attn_dropout(attn_weights)
    attn_output = flow.matmul(attn_weights, value)
    return attn_output, attn_weights
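# Hedged note (assumption): `self.bias` and `self.masked_bias` are not shown here;
# in GPT-style attention they are typically registered in __init__ as a
# lower-triangular causal mask and a large negative fill value, roughly:
#
#     self.register_buffer(
#         "bias",
#         flow.tril(flow.ones((max_positions, max_positions), dtype=flow.int8))
#             .view(1, 1, max_positions, max_positions)
#             .to(flow.bool),
#     )
#     self.register_buffer("masked_bias", flow.tensor(-1e4))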
def __init__(
    self,
    dim,
    window_size,
    num_heads,
    qkv_bias=True,
    qk_scale=None,
    attn_drop=0.0,
    proj_drop=0.0,
):
    super().__init__()
    self.dim = dim
    self.window_size = window_size  # Wh, Ww
    self.num_heads = num_heads
    head_dim = dim // num_heads
    self.scale = qk_scale or head_dim ** -0.5

    # define a parameter table of relative position bias
    # Author zzk: we add trunc normal here!
    self.relative_position_bias_table = nn.Parameter(
        flow.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
    )  # 2*Wh-1 * 2*Ww-1, nH
    self.relative_position_bias_table.trunc_normal_(std=0.02)

    # get pair-wise relative position index for each token inside the window
    coords_h = flow.arange(self.window_size[0])
    coords_w = flow.arange(self.window_size[1])
    coords = flow.stack(flow.meshgrid(*[coords_h, coords_w]))  # 2, Wh, Ww
    coords_flatten = flow.flatten(coords, 1)  # 2, Wh*Ww
    relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
    relative_coords = relative_coords.permute(1, 2, 0)  # Wh*Ww, Wh*Ww, 2
    relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
    relative_coords[:, :, 1] += self.window_size[1] - 1
    relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
    relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
    self.register_buffer("relative_position_index", relative_position_index)

    self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
    self.attn_drop = nn.Dropout(attn_drop)
    self.proj = nn.Linear(dim, dim)
    self.proj_drop = nn.Dropout(proj_drop)
    self.softmax = nn.Softmax(dim=-1)
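# Hedged note (assumption): in the Swin-style forward pass, the index buffer built
# above is used to gather the learned bias and add it to the attention scores
# before softmax, roughly:
#
#     relative_position_bias = self.relative_position_bias_table[
#         self.relative_position_index.view(-1)
#     ].view(
#         self.window_size[0] * self.window_size[1],
#         self.window_size[0] * self.window_size[1],
#         -1,
#     ).permute(2, 0, 1)  # nH, Wh*Ww, Wh*Ww
#     attn = attn + relative_position_bias.unsqueeze(0)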
def forward(self, x):
    b, n, _ = x.shape
    q = self.query(x)
    k = self.key(x)
    v = self.value(x)
    q = self.transpose_for_scores(q)
    k = self.transpose_for_scores(k)
    v = self.transpose_for_scores(v)
    attn_weights = flow.matmul(q, k.transpose(-2, -1)) / self.scale
    attn_weights = nn.Softmax(dim=-1)(attn_weights)
    out = flow.matmul(attn_weights, v)
    out = out.permute(0, 2, 1, 3)
    new_out_shape = tuple(out.size()[:-2]) + (self.heads * self.head_dim,)
    out = out.view(*new_out_shape)
    out = self.out(out)
    return out
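# Hedged sketch (assumption): the `transpose_for_scores` helper referenced above is
# not shown; the conventional implementation reshapes
# [batch, seq, heads * head_dim] -> [batch, heads, seq, head_dim]:
def transpose_for_scores(self, x):
    new_shape = tuple(x.size()[:-1]) + (self.heads, self.head_dim)
    x = x.view(*new_shape)
    return x.permute(0, 2, 1, 3)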
def forward(
    self,
    query: flow.Tensor,
    key: flow.Tensor,
    value: flow.Tensor,
    attn_mask: Optional[flow.Tensor] = None,
) -> Tuple[flow.Tensor, flow.Tensor]:
    r"""
    Args:
        query: [batch, num_attention_heads, len_query, dim_query]
        key: [batch, num_attention_heads, len_key, dim_key]
        value: [batch, num_attention_heads, len_value, dim_value]
        attn_mask: [batch, num_attention_heads, len_query, len_key]
    """
    attention = flow.matmul(query, key.transpose(-1, -2))
    attention = attention / math.sqrt(query.size(-1))
    if attn_mask is not None:
        attention = attention + attn_mask
    attention = nn.Softmax(dim=-1)(attention)
    attention = self.dropout(attention)
    context = flow.matmul(attention, value)
    return context, attention
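# Hedged usage sketch (assumption): exercising the scaled dot-product attention
# above with random inputs; the owning module's name is not shown, so `module`
# and the shapes below are illustrative only.
#
#     batch, heads, seq_len, head_dim = 2, 8, 16, 64
#     q = flow.randn(batch, heads, seq_len, head_dim)
#     k = flow.randn(batch, heads, seq_len, head_dim)
#     v = flow.randn(batch, heads, seq_len, head_dim)
#     mask = flow.zeros(batch, heads, seq_len, seq_len)  # additive mask, 0 = keep
#     context, attention = module(q, k, v, attn_mask=mask)
#     # context: [batch, heads, seq_len, head_dim]; attention sums to 1 over the last dim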
def __init__(
    self, model, input_size, output_size, num_experts, noisy_gating=True, k=4
):
    super(MoE, self).__init__()
    self.noisy_gating = noisy_gating
    self.num_experts = num_experts
    self.output_size = output_size
    self.input_size = input_size
    self.k = k
    # instantiate experts
    self.experts = nn.ModuleList([model for i in range(self.num_experts)])
    self.w_gate = nn.Parameter(
        flow.zeros(input_size, num_experts), requires_grad=True
    )
    self.w_noise = nn.Parameter(
        flow.zeros(input_size, num_experts), requires_grad=True
    )
    self.softplus = nn.Softplus()
    self.softmax = nn.Softmax(1)
    assert self.k <= self.num_experts
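# Hedged note (assumption): because the same `model` instance is repeated in the
# nn.ModuleList above, all experts share parameters; the usual MoE recipe
# re-instantiates or deep-copies the expert per slot. Below is an illustrative
# noisy top-k gating sketch using the parameters defined above; it is not the
# source's implementation.
def noisy_gate_logits(self, x, train=True, noise_epsilon=1e-2):
    # x: [batch, input_size] -> logits: [batch, num_experts]
    clean_logits = flow.matmul(x, self.w_gate)
    if self.noisy_gating and train:
        raw_noise_stddev = flow.matmul(x, self.w_noise)
        noise_stddev = self.softplus(raw_noise_stddev) + noise_epsilon
        noise = flow.randn(clean_logits.size(0), clean_logits.size(1), device=clean_logits.device)
        return clean_logits + noise * noise_stddev
    return clean_logits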
def forward(
    self,
    hidden_states,
    attention_mask=None,
    head_mask=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    past_key_value=None,
    output_attentions=False,
):
    mixed_query_layer = self.query(hidden_states)

    # If this is instantiated as a cross-attention module, the keys
    # and values come from an encoder; the attention mask needs to be
    # such that the encoder's padding tokens are not attended to.
    is_cross_attention = encoder_hidden_states is not None

    if is_cross_attention and past_key_value is not None:
        # reuse k, v, cross_attentions
        key_layer = past_key_value[0]
        value_layer = past_key_value[1]
        attention_mask = encoder_attention_mask
    elif is_cross_attention:
        key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
        value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
        attention_mask = encoder_attention_mask
    elif past_key_value is not None:
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        key_layer = flow.cat([past_key_value[0], key_layer], dim=2)
        value_layer = flow.cat([past_key_value[1], value_layer], dim=2)
    else:
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

    query_layer = self.transpose_for_scores(mixed_query_layer)

    if self.is_decoder:
        # if cross_attention: save Tuple(flow.Tensor, flow.Tensor) of all cross attention key/value_states.
        # Further calls to the cross_attention layer can then reuse all cross-attention
        # key/value_states (first "if" case).
        # if uni-directional self-attention (decoder): save Tuple(flow.Tensor, flow.Tensor) of
        # all previous decoder key/value_states. Further calls to uni-directional self-attention
        # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case).
        # if encoder bi-directional self-attention: `past_key_value` is always `None`.
        past_key_value = (key_layer, value_layer)

    # Take the dot product between "query" and "key" to get the raw attention scores.
    attention_scores = flow.matmul(query_layer, key_layer.transpose(-1, -2))

    if (self.position_embedding_type == "relative_key"
            or self.position_embedding_type == "relative_key_query"):
        seq_length = hidden_states.size()[1]
        position_ids_l = flow.arange(
            seq_length, dtype=flow.int64, device=hidden_states.device
        ).view(-1, 1)
        position_ids_r = flow.arange(
            seq_length, dtype=flow.int64, device=hidden_states.device
        ).view(1, -1)
        distance = position_ids_l - position_ids_r
        positional_embedding = self.distance_embedding(
            distance + self.max_position_embeddings - 1
        )
        positional_embedding = positional_embedding.to(
            dtype=query_layer.dtype
        )  # fp16 compatibility

        if self.position_embedding_type == "relative_key":
            relative_position_scores = position_scores(query_layer, positional_embedding)
            attention_scores = attention_scores + relative_position_scores
        elif self.position_embedding_type == "relative_key_query":
            relative_position_scores_query = position_scores(query_layer, positional_embedding)
            relative_position_scores_key = position_scores(key_layer, positional_embedding)
            attention_scores = (attention_scores
                                + relative_position_scores_query
                                + relative_position_scores_key)

    attention_scores = attention_scores / math.sqrt(self.attention_head_size)
    if attention_mask is not None:
        # Apply the attention mask (precomputed for all layers in the BertModel forward() function)
        attention_scores = attention_scores + attention_mask

    # Normalize the attention scores to probabilities.
    attention_probs = nn.Softmax(dim=-1)(attention_scores)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = self.dropout(attention_probs)

    # Mask heads if we want to
    if head_mask is not None:
        attention_probs = attention_probs * head_mask

    context_layer = flow.matmul(attention_probs, value_layer)
    # oneflow does not support contiguous()
    context_layer = context_layer.permute(0, 2, 1, 3)  # .contiguous()
    new_context_layer_shape = tuple(context_layer.size()[:-2]) + (self.all_head_size,)
    context_layer = context_layer.view(*new_context_layer_shape)

    outputs = ((context_layer, attention_probs) if output_attentions else (context_layer,))

    if self.is_decoder:
        outputs = outputs + (past_key_value,)
    return outputs
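# Hedged sketch (assumption): the `position_scores` helper used above is not shown;
# in the reference BERT implementation the relative-position terms are computed
# with an einsum between the activations and the distance embeddings, roughly:
#
#     def position_scores(layer, positional_embedding):
#         # layer: [batch, heads, seq, head_dim]; positional_embedding: [seq, seq, head_dim]
#         return flow.einsum("bhld,lrd->bhlr", layer, positional_embedding)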
def __init__(self, temperature, attn_dropout=0.1):
    super().__init__()
    self.temperature = temperature
    self.dropout = nn.Dropout(attn_dropout)
    self.softmax = nn.Softmax(dim=2)
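# Hedged sketch (assumption, not part of the source): a forward pass consistent
# with the attributes above, assuming scores of shape [batch, len_q, len_k] so
# that softmax over dim=2 normalizes across the key dimension.
def forward(self, q, k, v, mask=None):
    attn = flow.matmul(q, k.transpose(1, 2)) / self.temperature
    if mask is not None:
        attn = attn.masked_fill(mask, -1e9)  # suppress disallowed positions
    attn = self.softmax(attn)
    attn = self.dropout(attn)
    output = flow.matmul(attn, v)
    return output, attn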