def __init__(self, d_model: int, num_heads: int, feedforward_dimension: int = 2048, dropout: float = 0.1):
    super(TransformerDecoderLayer, self).__init__()
    # Masked Multi-Head Self-Attention
    self.masked_self_attention = MultiheadAttention(d_model, num_heads, dropout=dropout)
    self.dropout_a1 = Dropout(dropout)
    # Normalization after Self-Attention
    self.norm1 = LayerNorm(d_model)
    # Encoder-Decoder Attention
    self.self_attention = MultiheadAttention(d_model, num_heads, dropout=dropout)
    self.dropout_a2 = Dropout(dropout)
    # Normalization after Attention
    self.norm2 = LayerNorm(d_model)
    # Position-Wise Feed-Forward NN
    self.linear1 = Linear(d_model, feedforward_dimension)
    self.relu = ReLU()
    self.dropout1 = Dropout(dropout)
    self.linear2 = Linear(feedforward_dimension, d_model)
    self.dropout2 = Dropout(dropout)
    # Normalization after PW-FFNN
    self.norm3 = LayerNorm(d_model)
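# The forward pass is not part of this snippet; below is a minimal post-norm
# sketch of the three sublayers the constructor above sets up. The signature
# and mask names are assumptions, not from the original code.
def forward(self, target, memory, target_mask=None):
    # 1) masked self-attention over the decoder input
    attn, _ = self.masked_self_attention(target, target, target, attn_mask=target_mask)
    target = self.norm1(target + self.dropout_a1(attn))
    # 2) encoder-decoder attention (queries from the decoder, keys/values from the encoder)
    attn, _ = self.self_attention(target, memory, memory)
    target = self.norm2(target + self.dropout_a2(attn))
    # 3) position-wise feed-forward network
    ff = self.linear2(self.dropout1(self.relu(self.linear1(target))))
    return self.norm3(target + self.dropout2(ff))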
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", d_global2=None): super(TransformerEncoderLayerImproved, self).__init__() self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) if d_global2 is not None: self.linear_global2 = Linear(d_global2, d_model) # Implementation of Feedforward model self.linear1 = Linear(d_model, dim_feedforward) self.dropout = Dropout(dropout) self.linear2 = Linear(dim_feedforward, d_model) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.dropout1 = Dropout(dropout) self.dropout2_2 = Dropout(dropout) self.dropout2 = Dropout(dropout) self.activation = _get_activation_fn(activation)
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"): super(TransformerDecoderLayer, self).__init__() self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout) # Implementation of Feedforward model if activation == "glu": self.linear1 = Linear(d_model, 2 * dim_feedforward) else: self.linear1 = Linear(d_model, dim_feedforward) self.dropout = Dropout(dropout) self.linear2 = Linear(dim_feedforward, d_model) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.norm3 = LayerNorm(d_model) self.dropout1 = Dropout(dropout) self.dropout2 = Dropout(dropout) self.dropout3 = Dropout(dropout) self.activation = _get_activation_fn(activation)
def __init__(self, d_model, nhead, dim_feedforward=256, dropout=0, activation="relu"):
    from torch.nn.modules.activation import MultiheadAttention
    from torch.nn.modules.normalization import LayerNorm
    from torch.nn.modules.dropout import Dropout
    from torch.nn.modules.rnn import LSTM
    from torch.nn.modules.linear import Linear
    super(DPTNetBlock, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    # Feedforward model: a bidirectional LSTM replaces the usual first linear layer
    # self.linear1 = Linear(d_model, dim_feedforward)
    self.rnn = LSTM(d_model, d_model * 2, 1, bidirectional=True)
    self.dropout = Dropout(dropout)
    # self.linear2 = Linear(dim_feedforward, d_model)
    self.linear2 = Linear(d_model * 2 * 2, d_model)  # 2*d_model hidden x 2 directions
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.activation = _get_activation_fn(activation)
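# Hedged sketch of how the LSTM-based feed-forward sublayer lines up
# dimensionally. The actual forward is not in the snippet; the signature and
# post-norm residual flow below are assumptions:
def forward(self, src):
    attn, _ = self.self_attn(src, src, src)
    src = self.norm1(src + self.dropout1(attn))
    rnn_out, _ = self.rnn(src)  # (..., d_model * 4): 2*d_model hidden, 2 directions
    ff = self.linear2(self.dropout(self.activation(rnn_out)))
    return self.norm2(src + self.dropout2(ff))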
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"): super(TransformerEncoderLayer, self).__init__() # global countz # countz += 1 # self.count = countz # print("enc", countz) self.self_attn = MultiheadAttentionZSelf(d_model, nhead, dropout=dropout, name="EncoderSelfAttn") # Implementation of Feedforward model self.linear1 = Linear(d_model, dim_feedforward) self.dropout = Dropout(dropout) self.linear2 = Linear(dim_feedforward, d_model) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.dropout1 = Dropout(dropout) self.dropout2 = Dropout(dropout) self.activation = _get_activation_fn(activation)
def __init__(self, num_features=22, nhead=3, dim_feedforward=2048, dropout=0.1, activation="relu",
             use_LayerNorm=True, init_resweight=0, resweight_trainable=True):
    super(ReZeroEncoderLayer, self).__init__()
    self.self_attn = MultiheadAttention(num_features, nhead, dropout=dropout)
    # Define the residual weight for ReZero
    self.resweight = torch.nn.Parameter(torch.Tensor([init_resweight]), requires_grad=resweight_trainable)
    # Implementation of Feedforward model
    self.linear1 = Linear(num_features, dim_feedforward)
    self.dropout = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, num_features)
    self.use_LayerNorm = use_LayerNorm
    if self.use_LayerNorm:
        self.norm1 = LayerNorm(num_features)
        self.norm2 = LayerNorm(num_features)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    if activation == "relu":
        self.activation = F.relu
    elif activation == "gelu":
        self.activation = F.gelu
    elif activation == "tanh":
        self.activation = torch.tanh
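# A minimal sketch of the ReZero residual update (the forward is not shown
# in the snippet; this follows the ReZero formulation, where init_resweight=0
# makes every sublayer start out as the identity map). The norm placement for
# the use_LayerNorm case is an assumption:
def forward(self, src):
    attn, _ = self.self_attn(src, src, src)
    src = src + self.resweight * self.dropout1(attn)  # x + alpha * F(x)
    if self.use_LayerNorm:
        src = self.norm1(src)
    ff = self.linear2(self.dropout(self.activation(self.linear1(src))))
    src = src + self.resweight * self.dropout2(ff)
    if self.use_LayerNorm:
        src = self.norm2(src)
    return src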
def __init__(self, embed_dim, dropout=0.1, dim_feedforward=128, cycles=3, passthrough_mode=False, q_k_sim='dot'):
    '''
    Hierarchical attention is a way to put keys in long sequences into
    slots/buckets to improve sparsity and time/space efficiency from
    O(n^2) to O(n log n). This is achieved in two passes: first we
    populate the slots with representative samples of the tokens below;
    then, when computing token-level attention, the queries are compared
    to the slots first, and the derived attention scores weigh the tokens
    and lower-level attention scores under that slot.
    '''
    super().__init__()
    self.embed_dim = embed_dim
    self.cycles = cycles
    self.passthrough_mode = passthrough_mode
    self.q_k_sim = q_k_sim
    self.scaling = float(embed_dim) ** -0.5
    self.slot_Wq = Linear(embed_dim, embed_dim, bias=False)
    self.slot_Wk = Linear(embed_dim, embed_dim, bias=False)
    self.slot_Wv = Linear(embed_dim, embed_dim, bias=False)
    self.Wq = Linear(embed_dim, embed_dim, bias=False)
    self.Wk = Linear(embed_dim, embed_dim, bias=False)
    self.Wv = Linear(embed_dim, embed_dim, bias=False)
    self.linear1 = Linear(embed_dim, dim_feedforward)
    self.linear2 = Linear(dim_feedforward, embed_dim)
    if passthrough_mode:
        # Identity-initialize every projection and disable dropout, norms,
        # and scaling so the layer passes inputs through unchanged
        dropout = 0
        self.slot_Wq.weight.data = torch.eye(embed_dim, embed_dim)
        self.slot_Wk.weight.data = torch.eye(embed_dim, embed_dim)
        self.slot_Wv.weight.data = torch.eye(embed_dim, embed_dim)
        self.Wq.weight.data = torch.eye(embed_dim, embed_dim)
        self.Wk.weight.data = torch.eye(embed_dim, embed_dim)
        self.Wv.weight.data = torch.eye(embed_dim, embed_dim)
        self.linear1.weight.data = torch.eye(dim_feedforward, embed_dim)
        self.linear2.weight.data = torch.eye(embed_dim, dim_feedforward)
        self.linear1.bias.data = torch.zeros((dim_feedforward,))
        self.linear2.bias.data = torch.zeros((embed_dim,))
        self.norm1 = lambda x: x
        self.norm2 = lambda x: x
        self.scaling = 1.0
    else:
        self.norm1 = LayerNorm(embed_dim)
        self.norm2 = LayerNorm(embed_dim)
    self.dropout = Dropout(dropout)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
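# A quick sense of the complexity claim in the docstring (illustrative
# numbers, not from the original code): for a 4096-token sequence, dense
# attention compares every query with every key, while the slotted scheme
# touches on the order of n * log2(n) pairs.
import math

n = 4096
print(n ** 2)                 # 16777216 dense query-key comparisons
print(int(n * math.log2(n)))  # 49152, the O(n log n) ballpark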
def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, dropout=0.1) -> None:
    super(TransformerDecoderLayerCustom3, self).__init__(d_model, nhead, dim_feedforward, dropout)
    self.self_attn = MultiheadAttentionCustom2(d_model, nhead, dropout=dropout)
    self.multihead_attn = MultiheadAttentionCustom2(d_model, nhead, dropout=dropout)
    self.linear1 = Linear(d_model, dim_feedforward)
    self.dropout = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, d_model)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.norm3 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.dropout3 = Dropout(dropout)
def __init__(self, d_model, n_cat_embeddings, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"): super().__init__() self.self_attn = MultiheadAttention(d_model, n_cat_embeddings, nhead, dropout=dropout) # Implementation of Feedforward model self.linear1 = Linear(d_model, dim_feedforward) self.dropout = Dropout(dropout) self.linear2 = Linear(dim_feedforward, d_model) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.dropout1 = Dropout(dropout) self.dropout2 = Dropout(dropout) self.activation = _get_activation_fn(activation)
def __init__(self, d_model, heads, dropout=0.3):
    super().__init__()
    self.norm_1 = Norm(d_model)
    self.norm_2 = Norm(d_model)
    self.attn = MultiHeadAttention(heads, d_model)
    self.ff = FeedForward(d_model)
    self.dropout_1 = Dropout(dropout)
    self.dropout_2 = Dropout(dropout)
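# Hedged sketch of the pre-norm forward pass these modules suggest
# (normalize, apply the sublayer, then add the residual; the mask argument
# is an assumption, and the forward is not in the original snippet):
def forward(self, x, mask=None):
    x2 = self.norm_1(x)
    x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
    x2 = self.norm_2(x)
    x = x + self.dropout_2(self.ff(x2))
    return x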
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='relu',
             factor_ff=False, adapter_finetune=False, adapter_d_ff=2048):
    super().__init__()
    self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
    # Implementation of Feedforward model
    self.factor_ff = factor_ff
    if self.factor_ff:
        # Factorized feed-forward: a 100-dim bottleneck on either side of the
        # wide layer replaces the two dense d_model x dim_feedforward maps.
        # An earlier variant used in_ff = dim_feedforward // 4 throughout:
        # self.linear1 = nn.Linear(d_model, in_ff)
        # self.fac_linear1 = nn.Linear(in_ff, in_ff)
        # self.fac_linear2 = nn.Linear(in_ff, in_ff)
        # self.linear2 = nn.Linear(in_ff, d_model)
        self.linear1 = nn.Linear(d_model, 100)
        self.fac_linear1 = nn.Linear(100, dim_feedforward)
        self.fac_linear2 = nn.Linear(dim_feedforward, 100)
        self.linear2 = nn.Linear(100, d_model)
    else:
        self.linear1 = Linear(d_model, dim_feedforward)
        self.linear2 = Linear(dim_feedforward, d_model)
    self.dropout = Dropout(dropout)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.resweight = nn.Parameter(torch.Tensor([0]))
    self.pre_norm = nn.LayerNorm(d_model)
    self.adapter_finetune = adapter_finetune
    if self.adapter_finetune:
        # Adapter fine-tuning: train only the small adapter MLP and freeze
        # the pretrained attention and feed-forward weights
        self.ada_linear1 = Linear(d_model, adapter_d_ff)
        self.ada_dropout1 = Dropout(dropout)
        self.ada_linear2 = Linear(adapter_d_ff, d_model)
        self.ada_dropout2 = Dropout(dropout)
        self.self_attn.requires_grad_(False)
        self.linear1.requires_grad_(False)
        self.linear2.requires_grad_(False)
    if activation == "relu":
        self.activation = F.relu
    elif activation == "gelu":
        self.activation = F.gelu
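# Rough parameter accounting for the factorized path (illustrative, with
# d_model=512, dim_feedforward=2048, bottleneck=100; biases ignored):
d, ff, b = 512, 2048, 100
dense = d * ff + ff * d                      # 2,097,152 weights
factored = d * b + b * ff + ff * b + b * d   # 512,000 weights, ~4x smaller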
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
    super(TransformerEncoderLayer, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    self.linear1 = Linear(d_model, dim_feedforward)
    self.dropout = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, d_model)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
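# For reference, a minimal post-norm forward pass matching this layout. No
# activation attribute is stored above, so F.relu is applied inline; the
# signature follows the usual convention for such layers and is an assumption:
def forward(self, src, src_mask=None, src_key_padding_mask=None):
    attn, _ = self.self_attn(src, src, src, attn_mask=src_mask,
                             key_padding_mask=src_key_padding_mask)
    src = self.norm1(src + self.dropout1(attn))
    ff = self.linear2(self.dropout(F.relu(self.linear1(src))))
    return self.norm2(src + self.dropout2(ff))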
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, er_len=None):
    super(TransformerEncoderLayerRPR, self).__init__()
    self.self_attn = MultiheadAttentionRPR(d_model, nhead, dropout=dropout, er_len=er_len)
    # Implementation of Feedforward model
    self.linear1 = Linear(d_model, dim_feedforward)
    self.dropout = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, d_model)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
def __init__(self, d_model, nhead, min_dist, max_dist, dim_feedforward=2048, dropout=0.1, activation="relu"):
    super(TransformerEncoderLayerWithRelativePositionalEncoding, self).__init__()
    self.self_attn = MultiheadAttentionRelativePositionalEncoding(d_model, nhead, dropout=dropout,
                                                                  min_dist=min_dist, max_dist=max_dist)
    # Implementation of Feedforward model
    self.linear1 = Linear(d_model, dim_feedforward)
    self.dropout = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, d_model)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.activation = _get_activation_fn(activation)
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
    super(TransformerEncoderLayer, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    # Implementation of Feedforward model
    self.linear1 = Linear(d_model, dim_feedforward)
    self.dropout = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, d_model)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.activation = F.relu  # hard-coded; was get_activation_fn(activation)
def __init__(self, encoder, dropout_rate=0.5) -> None: super().__init__() self.encoder = encoder_params[encoder]["init_op"]() self.avg_pool = AdaptiveAvgPool2d((1, 1)) self.srm_conv = setup_srm_layer(3) self.dropout = Dropout(dropout_rate) self.fc = Linear(encoder_params[encoder]["features"], 1)
def __init__(self, src_dim, dest_dim, edge_dim, hidden_size, nhead=4, position_encoding=True):
    super().__init__()
    self.src_dim = src_dim
    self.dest_dim = dest_dim
    self.edge_dim = edge_dim
    self.hidden_size = hidden_size
    self.nhead = nhead
    # Project source nodes (concatenated with edge features) and destination
    # nodes into a shared hidden space before attention
    src_layers = [nn.Linear(src_dim + edge_dim, hidden_size), GeLU()]
    self.src_pre_layer = nn.Sequential(*src_layers)
    dest_layers = [nn.Linear(dest_dim, hidden_size), GeLU()]
    self.dest_pre_layer = nn.Sequential(*dest_layers)
    self.att = MultiheadAttention(embed_dim=hidden_size, num_heads=nhead)
    self.att_dropout = Dropout(0.1)
    self.att_norm = LayerNorm(hidden_size)
    self.zero_padding_template = torch.zeros((1, src_dim), dtype=torch.float)
def __init__(self, encoder, name='', dropout_rate=0.0) -> None:
    super().__init__()
    self.name = name
    self.encoder = encoder_params[encoder]["init_op"]()
    self.avg_pool = AdaptiveAvgPool2d((1, 1))
    self.dropout = Dropout(dropout_rate)
    self.fc = Linear(encoder_params[encoder]["features"], 1)
def __init__(self, encoder, dropout_rate=0.0, out_dim=56) -> None:
    super().__init__()
    self.encoder = encoder_params[encoder]["init_op"]()
    self.avg_pool = AdaptiveAvgPool2d((1, 1))
    self.dropout = Dropout(dropout_rate)
    self.out_dim = out_dim
    self.fc = Linear(encoder_params[encoder]["features"], out_dim)
def __init__(self, model_name: str, output_dim: int) -> None:
    super(SentimentAnalysisModel, self).__init__()
    config = AutoConfig.from_pretrained(model_name)
    self.transformer = AutoModel.from_pretrained(model_name)
    # Freeze all but the last layer of the transformer
    layers_to_freeze = None
    frozen_params = 0
    if type(self.transformer) is GPT2Model:
        layers_to_freeze = self.transformer.h[:-1]
        layers_to_freeze.extend([self.transformer.h[-1].mlp])
        layers_to_freeze.extend([self.transformer.h[-1].attn])
        layers_to_freeze.extend([self.transformer.wte])
        layers_to_freeze.extend([self.transformer.wpe])
    elif type(self.transformer) is DistilBertModel:
        layers_to_freeze = self.transformer.transformer.layer[:-1]
    elif type(self.transformer) is T5Model:
        layers_to_freeze = self.transformer.encoder.block[:-1]
        layers_to_freeze.extend(self.transformer.decoder.block[:-1])
        layers_to_freeze.extend([self.transformer.shared])
    if layers_to_freeze is not None:  # unrecognized architectures stay fully trainable
        for layer in layers_to_freeze:
            for param in layer.parameters():
                param.requires_grad = False
                frozen_params += param.numel()
    print(f'Init model: frozen {frozen_params} params.')
    self.pre_classifier = Linear(config.hidden_size, config.hidden_size)
    self.dropout = Dropout(0.3)
    self.classifier = Linear(config.hidden_size, output_dim)
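# Hedged usage sketch (the model name and label count are illustrative, not
# from the original code):
model = SentimentAnalysisModel("distilbert-base-uncased", output_dim=3)
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"{trainable}/{total} parameters remain trainable")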
def _construct_x_t(self, layer, synset):
    # Find x_t: at a stacked layer, the input is the previous layer's hidden
    # states (concatenated for the bidirectional case); otherwise x_t comes
    # from the sense embedding.
    if layer > 0:
        '''
        # If the previous step did not calculate the hyper: this is designed
        # for the stacked version, for when the recursion order left some of
        # the hypers uncalculated.
        if synset not in self.hidden_state[layer - 1]['up']:
            self.hidden_state[layer - 1]['up'][synset] = self._upward_downward((layer - 1), 'up', synset)[0]
        '''
        x_t = torch.cat([
            self.hidden_state[layer - 1]['up'][synset],
            self.hidden_state[layer - 1]['down'][synset]
        ])
    else:
        # Get the synset (sense) embedding
        synset_idx = self.synset_vocab(synset)
        lookup_tensor = torch.tensor([synset_idx], dtype=torch.long).to(device)
        # Optionally apply dropout to the embedding
        if self.dropout:
            dropout = Dropout(p=self.dropout)
            x_t = dropout(self.embedding(lookup_tensor).squeeze(0))
        else:
            x_t = self.embedding(lookup_tensor).squeeze(0)
    return x_t
def __init__(self, encoder, dropout_rate=0.5) -> None: super().__init__() self.encoder = encoder_params[encoder]["init_op"]() self.avg_pool = GlobalWeightedAvgPool2d( encoder_params[encoder]["features"]) self.dropout = Dropout(dropout_rate) self.fc = Linear(encoder_params[encoder]["features"], 1)
def __init__(self, encoder, nclasses, dropout_rate=0.0, infer=False) -> None:
    super().__init__()
    self.encoder = encoder_params[encoder]["init_op"]()
    self.avg_pool = AdaptiveAvgPool2d((1, 1))
    self.dropout = Dropout(dropout_rate)
    self.fc = Linear(encoder_params[encoder]["features"], nclasses)
    self.infer = infer
def __init__(self, d_model, nhead, bidirectional=True, dropout=0, activation="relu"):
    super(TransformerEncoderLayer, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    # Feedforward model: a GRU replaces the usual first linear layer
    # self.linear1 = Linear(d_model, dim_feedforward)
    self.gru = GRU(d_model, d_model * 2, 1, bidirectional=bidirectional)
    self.dropout = Dropout(dropout)
    # self.linear2 = Linear(dim_feedforward, d_model)
    if bidirectional:
        self.linear2 = Linear(d_model * 2 * 2, d_model)  # 2*d_model hidden x 2 directions
    else:
        self.linear2 = Linear(d_model * 2, d_model)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.activation = _get_activation_fn(activation)
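# Runnable shape check for the GRU sizing above (values are illustrative):
import torch

d_model = 64
gru = torch.nn.GRU(d_model, d_model * 2, 1, bidirectional=True)
out, _ = gru(torch.randn(10, 8, d_model))  # (seq_len, batch, d_model)
print(out.shape)  # torch.Size([10, 8, 256]) == (seq, batch, 4 * d_model)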
def __init__(self, encoder, dropout_rate=0.5) -> None:
    super().__init__()
    self.decoder = Decoder(decoder_filters=encoder_params[encoder]["decoder_filters"],
                           filters=encoder_params[encoder]["filters"])
    self.avg_pool = AdaptiveAvgPool2d((1, 1))
    self.dropout = Dropout(dropout_rate)
    self.fc = Linear(encoder_params[encoder]["features"], 1)
    self.final = Conv2d(encoder_params[encoder]["decoder_filters"][0], out_channels=1,
                        kernel_size=1, bias=False)
    _initialize_weights(self)
    # Note: the encoder is created after _initialize_weights, so its
    # (typically pretrained) weights are not re-initialized
    self.encoder = encoder_params[encoder]["init_op"]()
def __init__(self, nmemory, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"):
    super(GenericTransformerDecoderLayer, self).__init__()
    self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
    self.self_norm = LayerNorm(d_model)
    self.self_dropout = Dropout(dropout)
    self.self_highway = Highway(2 * d_model, d_model)
    # One attention block per external memory
    self.memory_attns = nn.ModuleList([nn.MultiheadAttention(d_model, nhead, dropout=dropout) for _ in range(nmemory)])
    self.memory_norms = nn.ModuleList([LayerNorm(d_model) for _ in range(nmemory)])
    self.memory_dropouts = nn.ModuleList([Dropout(dropout) for _ in range(nmemory)])
    self.memory_highways = nn.ModuleList([Highway(2 * d_model, d_model) for _ in range(nmemory)])
    # Implementation of Feedforward model
    self.linear1 = Linear(d_model, dim_feedforward)
    self.dropout1 = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, d_model)
    self.dropout2 = Dropout(dropout)
    self.activation = _get_activation_fn(activation)
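# Hedged reading of the Highway sizing (the forward pass is not in the
# snippet): each Highway takes 2*d_model features, which suggests the
# residual stream and the attention output are concatenated and gated back
# down to d_model, e.g.:
h, _ = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask)
tgt = self.self_norm(self.self_highway(torch.cat([tgt, self.self_dropout(h)], dim=-1)))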
def __init__(self, d_model, dropout=0.1, max_len=5000):
    super(AbsolutePositionalEncoding, self).__init__()
    self.dropout = Dropout(p=dropout)
    # Precompute the sinusoidal table: sin on even indices, cos on odd
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    pe = pe.unsqueeze(0).transpose(0, 1)  # (max_len, 1, d_model)
    self.register_buffer('pe', pe)
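# The buffer shape (max_len, 1, d_model) broadcasts over the batch axis of a
# (seq_len, batch, d_model) input; a minimal forward sketch (not in the
# snippet, following the standard sinusoidal-encoding usage):
def forward(self, x):
    x = x + self.pe[:x.size(0), :]
    return self.dropout(x)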
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"): super(TransformerDecoderLayer, self).__init__() self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) self.slot_attn = Hierarchical_Attention(d_model, cycles=1) # Implementation of Feedforward model self.linear1 = Linear(d_model, dim_feedforward) self.dropout = Dropout(dropout) self.linear2 = Linear(dim_feedforward, d_model) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.norm3 = LayerNorm(d_model) self.dropout1 = Dropout(dropout) self.dropout2 = Dropout(dropout) self.dropout3 = Dropout(dropout)
def __init__(self, d_model, n_cat_embeddings, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"): super(TransformerClass.TransformerEncoderLayer_modified, self).__init__() try_import_torch() from torch.nn.modules.normalization import LayerNorm from torch.nn.modules.dropout import Dropout self.self_attn = TransformerClass.MultiheadAttention(d_model, n_cat_embeddings, nhead, dropout=dropout) # Implementation of Feedforward model self.linear1 = TransformerClass.Linear(d_model, dim_feedforward) self.dropout = Dropout(dropout) self.linear2 = TransformerClass.Linear(dim_feedforward, d_model) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.dropout1 = Dropout(dropout) self.dropout2 = Dropout(dropout) self.activation = self._get_activation_fn(activation)
def __init__(self, heads, d_model, dropout=0.5):
    super().__init__()
    self.d_model = d_model
    self.d_k = d_model // heads  # per-head dimensionality
    self.h = heads
    self.q_linear = torch.nn.Linear(d_model, d_model)
    self.v_linear = torch.nn.Linear(d_model, d_model)
    self.k_linear = torch.nn.Linear(d_model, d_model)
    self.dropout = Dropout(dropout)
    self.out = torch.nn.Linear(d_model, d_model)
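# Hedged sketch of the scaled dot-product forward these projections support
# (only __init__ appears in the source; the mask handling and batch-first
# layout below are assumptions):
import math

def forward(self, q, k, v, mask=None):
    bs = q.size(0)
    # Project, then split d_model into h heads of size d_k
    q = self.q_linear(q).view(bs, -1, self.h, self.d_k).transpose(1, 2)
    k = self.k_linear(k).view(bs, -1, self.h, self.d_k).transpose(1, 2)
    v = self.v_linear(v).view(bs, -1, self.h, self.d_k).transpose(1, 2)
    scores = q @ k.transpose(-2, -1) / math.sqrt(self.d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, float('-inf'))
    attn = self.dropout(torch.softmax(scores, dim=-1))
    out = (attn @ v).transpose(1, 2).contiguous().view(bs, -1, self.d_model)
    return self.out(out)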