def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, pre_norm: bool = False,
             att_dropout: float = 0.1, ffn_dropout: float = 0.1, activation: str = "relu") -> None:
    super(TransformerDecoderLayer, self).__init__()
    self.pre_norm = pre_norm
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=att_dropout)
    self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=att_dropout)
    self.feedforward = nn.Sequential(
        nn.Linear(d_model, dim_feedforward),
        _get_activation_fn(activation),
        nn.Dropout(ffn_dropout),
        nn.Linear(dim_feedforward, d_model),
        nn.Dropout(ffn_dropout),
    )
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.norm3 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(ffn_dropout)
    self.dropout2 = nn.Dropout(ffn_dropout)
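The snippets in this section only show construction. As an aside (not taken from any of these repositories), every one of them wraps torch.nn.MultiheadAttention, which by default expects (seq_len, batch, embed_dim) tensors unless batch_first=True and returns a pair (attn_output, attn_weights). A minimal, self-contained sketch of that calling convention, with made-up shapes:

import torch
from torch.nn import MultiheadAttention

# Cross-attention in the style of self.multihead_attn above: queries typically come
# from the decoder stream, keys/values from the encoder memory.
attn = MultiheadAttention(embed_dim=256, num_heads=8, dropout=0.1)
query = torch.randn(10, 4, 256)    # (tgt_len, batch, embed_dim)
memory = torch.randn(20, 4, 256)   # (src_len, batch, embed_dim)
out, weights = attn(query, memory, memory)
print(out.shape, weights.shape)    # torch.Size([10, 4, 256]) torch.Size([4, 10, 20])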
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", p_net=None): super(TransformerDecoderLayer, self).__init__() self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout) #Adding layer for BERT self.multihead_attn_bert = MultiheadAttention(d_model, nhead, dropout=dropout) self.p_net = p_net # Implementation of Feedforward model self.linear1 = Linear(d_model, dim_feedforward) self.dropout = Dropout(dropout) self.linear2 = Linear(dim_feedforward, d_model) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.norm3 = LayerNorm(d_model) self.dropout1 = Dropout(dropout) self.dropout2 = Dropout(dropout) self.dropout3 = Dropout(dropout) self.activation = _get_activation_fn(activation)
def __init__(self, dim_model: int, num_heads: int, dim_feedforward: int = 2048, dropout: float = 0.1,
             activation: str = "relu", pre_ln: bool = False) -> None:
    super(TransformerDecoderLayer, self).__init__()
    self.self_attn = MultiheadAttention(dim_model, num_heads, dropout=dropout)
    self.multihead_attn = MultiheadAttention(dim_model, num_heads, dropout=dropout)
    # Implementation of Feedforward model
    self.linear1 = Linear(dim_model, dim_feedforward)
    self.dropout = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, dim_model)
    self.norm1 = LayerNorm(dim_model)
    self.norm2 = LayerNorm(dim_model)
    self.norm3 = LayerNorm(dim_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.dropout3 = Dropout(dropout)
    self.activation = _get_activation_fn(activation)
    self.pre_ln = pre_ln
def __init__(
    self,
    embed_dim,
    ffn_dim,
    num_heads,
    attn_dropout=0.,
    act_dropout=0.,
    dropout=0.,
    layernorm_before=False,
):
    super().__init__()
    self.embed_dim = embed_dim
    self.layernorm_before = layernorm_before
    self.act_dropout = act_dropout
    self.dropout = dropout
    # self-attention part
    self.self_attn = MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, dropout=attn_dropout)
    self.attn_layernorm = nn.LayerNorm(embed_dim, eps=1e-5)
    # enc-dec attention part
    self.enc_dec_attention = MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, dropout=attn_dropout)
    self.enc_dec_layernorm = nn.LayerNorm(embed_dim, eps=1e-5)
    # point-wise ffn
    self.ffn1 = nn.Linear(embed_dim, ffn_dim)
    self.ffn2 = nn.Linear(ffn_dim, embed_dim)
    self.ffn_layernorm = nn.LayerNorm(embed_dim, eps=1e-5)
    self.reset_parameters()
def __init__(self, d_model, n_head, d_inner, dropout=0.1):
    super(DecoderLayer, self).__init__()
    self.slf_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
    self.enc_attn = MultiheadAttention(d_model, n_head, dropout=dropout)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner)
    self.connector = nn.ModuleList(
        [SublayerConnection(d_model, dropout) for _ in range(3)])
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, kdim=None, vdim=None):
    super(TransformerDecoderLayer, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout, kdim=kdim, vdim=vdim)
    # Implementation of Feedforward model
    self.linear1 = nn.Linear(d_model, dim_feedforward)
    self.dropout = nn.Dropout(dropout)
    self.linear2 = nn.Linear(dim_feedforward, d_model)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.norm3 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
    self.dropout3 = nn.Dropout(dropout)
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu"): super(DecoderLayer, self).__init__() self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout) # Implementation of Feedforward model self.linear1 = Linear(d_model, dim_feedforward) self.dropout = Dropout(dropout) self.linear2 = Linear(dim_feedforward, d_model) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.dropout1 = Dropout(dropout) self.dropout2 = Dropout(dropout) self.activation = _get_activation_fn(activation)
def __init__(self, in_channels, out_channels, nn, aggr='add', bias=True, **kwargs):
    super().__init__(aggr=aggr, **kwargs)
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.nn = nn
    self.aggr = aggr
    self.lin0 = Linear(self.out_channels, self.out_channels, bias=False)
    self.lin1 = Linear(self.out_channels, self.out_channels, bias=True)
    self.attn = MultiheadAttention(self.out_channels, 4)
    self.register_parameter('root', None)
    if bias:
        self.bias = Parameter(torch.Tensor(out_channels))
    else:
        self.register_parameter('bias', None)
    self.reset_parameters()
def __init__(self, d_model, subtokens_per_token, pointer_attention_type: AttentionType, n_attention_heads=8):
    super(PointerNetwork, self).__init__()
    self.d_model = d_model
    self.pointer_attention_type = pointer_attention_type
    # "Embedding" of the sentinel used for computing the logit of the gate
    self.sentinel = nn.Parameter(torch.Tensor(self.d_model, 1))
    # Linear transformation for computing the query from the LSTM hidden state
    self.query_linear = nn.Linear(self.d_model, self.d_model)
    # Linear transformation for getting n subtokens out of the representations of the final encoder layer
    self.subtoken_extractor_linear = nn.Linear(
        self.d_model, subtokens_per_token * self.d_model)
    if self.pointer_attention_type == AttentionType.ADDITIVE:
        self.additive_attention_W = nn.Linear(
            self.d_model * 2, self.d_model)  # bidirectional
        self.additive_attention_tanh = nn.Tanh()
        self.additive_attention_v = nn.Parameter(
            torch.Tensor(self.d_model, 1))  # context vector
    elif self.pointer_attention_type == AttentionType.MULTIHEAD:
        self.multihead_attention = MultiheadAttention(
            self.d_model, n_attention_heads)
    self._reset_parameters()
def __init__(self, d_model, n_head, head_dropout=0.1):
    super(AudioVideoInter, self).__init__()
    self.dropout = nn.Dropout(0.1)
    self.video_multihead = MultiheadAttention(d_model, num_heads=n_head, dropout=head_dropout)
    self.norm1 = nn.LayerNorm(d_model)
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, use_gate=False):
    # Reordering of operations as done in GTrXL (https://arxiv.org/pdf/1910.06764.pdf)
    # d_model: dimension of the embedding for each input
    super(StableTransformerLayer, self).__init__()
    self.use_gate = use_gate
    self.gate_mha = GRUGate(d_model)
    self.gate_mlp = GRUGate(d_model)
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    self.linear1 = Linear(d_model, dim_feedforward)
    self.dropout = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, d_model)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.activation = F.relu
def __init__(self, d_model, n_heads, dropout=0.0):
    super(SelfAttentionLayer, self).__init__()
    self.multihead_attn = MultiheadAttention(d_model, n_heads, dropout=dropout)
    self.norm1 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
def __init__(self, num_heads: int, features_dim: int, dropout: float):
    """
    :param num_heads: number of attention heads
    :param features_dim: total feature dimension (embedding size)
    :param dropout: dropout probability applied to the attention weights
    """
    super().__init__()
    self.model = MultiheadAttention(num_heads=num_heads, embed_dim=features_dim, dropout=dropout)
def __init__(self, d_model, nhead, dim_feedforward=1024, dropout=0.1):
    super(TransLayer, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    self.linear1 = Linear(d_model, dim_feedforward)
    self.dropout = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, d_model)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
def __init__(self, d_model: int, nhead: int, d_hid: int, dropout=0.1):
    super(Smoother, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    self.conv1 = Conv1d(d_model, d_hid, 9, padding=4)
    self.conv2 = Conv1d(d_hid, d_model, 1, padding=0)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation=F.relu,
             add_bias_kv=False, add_norm=False) -> None:
    super(TFDecorder, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True,
                                        add_bias_kv=add_bias_kv)
    self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True,
                                             add_bias_kv=add_bias_kv)
    # Implementation of Feedforward model
    self.linear1 = nn.Linear(d_model, dim_feedforward)
    self.linear2 = nn.Linear(dim_feedforward, d_model)
    self.dropout = nn.Dropout(dropout)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
    self.dropout3 = nn.Dropout(dropout)
    if add_norm:
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
    else:
        self.norm1 = self.norm2 = self.norm3 = nn.Identity()
    # Legacy string support for the activation function.
    if isinstance(activation, str):
        self.activation = _get_activation_fn(activation)
    else:
        self.activation = activation
    # Only instantiate when a module class (e.g. nn.ReLU) was passed; calling a plain
    # function such as the default F.relu without arguments would raise a TypeError.
    if isinstance(self.activation, type):
        self.activation = self.activation()
def __init__(self, d_model, max_seq_len, max_docs, batch_size, nhead, mmr=False, query_doc_attn=False,
             head_pooling=False, dim_feedforward=2048, dropout=0.1, activation="relu"):
    super().__init__()
    # Definitions from the pytorch encoder layer
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    if activation == "relu":
        self.activation = nn.functional.relu
    else:
        raise IOError("Please specify 'relu' activation")
    # Implementation of Feedforward model
    self.linear1 = nn.Linear(d_model, dim_feedforward)
    self.dropout = nn.Dropout(dropout)
    self.linear2 = nn.Linear(dim_feedforward, d_model)
    self.pooling_norm = nn.LayerNorm(d_model)
    self.doc_attn_norm = nn.LayerNorm(d_model)
    self.query_doc_norm = nn.LayerNorm(d_model)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)
    self.d_model = d_model
    # New variables
    self.max_seq_len = max_seq_len
    self.max_docs = max_docs
    self.batch_size = batch_size
    self.head_pooling = head_pooling
    self.mmr = mmr
    self.query_doc_attn = query_doc_attn
    if mmr:
        self.mmr_attention = MMR(d_model, max_seq_len)
    # TODO: Add option for using cls token as doc representation
    if head_pooling:
        self.head_pooling = MultiHeadPooling(max_seq_len, max_docs, batch_size, d_model, nhead,
                                             dropout=dropout)
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
    super(TransformerEncoderLayer, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    # Implementation of Feedforward model
    self.linear1 = Linear(d_model, dim_feedforward)
    self.dropout = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, d_model)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
def __init__(self, d_model: int, n_heads: int, dropout: float):
    """Uses self-attention to combine the meta-data and the temporal data.

    :param d_model: The dimension of the meta-data
    :type d_model: int
    :param n_heads: The number of heads to use in the multi-head mechanism
    :type n_heads: int
    :param dropout: The dropout rate as a float
    :type dropout: float
    """
    super().__init__()
    self.main_layer = MultiheadAttention(d_model, n_heads, dropout)
def __init__(self, d_model=256, nhead=8, dim_feedforward=2048, dropout=0.1):
    super(PDSLayer, self).__init__()
    self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    # Implementation of Feedforward model
    self.linear1 = Linear(d_model, dim_feedforward)
    self.dropout = Dropout(dropout)
    # GLU halves the feature dimension, so linear2 takes dim_feedforward // 2 as input
    self.linear2 = Linear(dim_feedforward // 2, d_model)
    self.norm2 = LayerNorm(d_model)
    self.norm3 = LayerNorm(d_model)
    self.dropout2 = Dropout(dropout)
    self.dropout3 = Dropout(dropout)
    self.activation = F.glu
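A note on the F.glu activation used here and in the PDS module below: a gated linear unit splits its input in half along a dimension and gates one half with the sigmoid of the other, so the feature dimension is halved, which is why linear2 takes dim_feedforward // 2. A quick check (illustrative only, not part of either snippet):

import torch
import torch.nn.functional as F

x = torch.randn(4, 2048)
print(F.glu(x, dim=-1).shape)  # torch.Size([4, 1024])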
def __init__(self, embed_dim, n_head, hidden_dim, inner_dim, dropout, max_len, cross):
    super(ATTNLayer, self).__init__()
    # Meta data of mattn
    self.embed_dim = embed_dim
    self.n_head = n_head
    self.hidden_dim = hidden_dim
    self.inner_dim = inner_dim
    self.max_len = max_len
    self.dropout = dropout
    self.cross = cross
    # Multihead & Positionwise
    self.self_mattn = MultiheadAttention(embed_dim, n_head, dropout)
    self.pos_ff = PositionwiseFeedForward(embed_dim, inner_dim, dropout)
    self.norm = LayerNorm(embed_dim)
    # Cross attention
    if cross:
        self.cross_mattn = MultiheadAttention(embed_dim, n_head, dropout)
def __init__(self, decoder_layer, num_layers=4, norm=None):
    super(PDS, self).__init__()
    self.position_multihead_attn = MultiheadAttention(256, 8, dropout=0.1)
    self.norm1 = LayerNorm(256)
    self.linear1 = Linear(256, 2048)
    self.dropout = Dropout(0.1)
    self.linear2 = Linear(2048 // 2, 256)
    self.activation = F.glu
    self.dropout1 = Dropout(0.1)
    self.dropout2 = Dropout(0.1)
    self.layers = _get_clones(decoder_layer, num_layers - 1)
    self.num_layers = num_layers - 1
    self.norm = norm
def __init__(self, d_model: int, nhead: int, d_hid: int, dropout=0.1, no_residual=False):
    super(Extractor, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    self.cross_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    self.conv1 = Conv1d(d_model, d_hid, 9, padding=4)
    self.conv2 = Conv1d(d_hid, d_model, 1, padding=0)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.norm3 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.dropout3 = Dropout(dropout)
    self.no_residual = no_residual
def __init__(self, args):
    super(TransformerMIL, self).__init__()
    encoder_layer = TransformerEncoderLayer(d_model=args.feature_depth, nhead=8, dim_feedforward=2048,
                                            dropout=args.dropout, activation="relu")
    encoder_norm = LayerNorm(args.feature_depth)
    self.attention = TransformerEncoder(encoder_layer, args.ntrans, encoder_norm)
    # self.attention1 = MultiheadAttention(args.feature_depth, 8)
    self.attention2 = MultiheadAttention(args.feature_depth, 8)
    self.classifier = Sequential(Linear(args.feature_depth, 1), Sigmoid())
    self.mil = AttentionMILFeatures(args)
def __init__(self, n_head=2, d_head=2, embed_dim=100, N_en=6, N_de=6, classes=2, ff_dim=2048,
             do_rate=0.1, max_len=256, activation="relu", custom_encoder=None, custom_decoder=None,
             masks=[False, False, False], kmasks=[False, False, False]):
    super(Transformer, self).__init__()
    # === Base model (attn, enc, dec, ff)
    mhattn = MultiheadAttention(embed_dim, n_head)
    selfattn = MultiheadAttention(embed_dim, n_head)
    ff_1 = nn.Linear(embed_dim, ff_dim)
    ff_2 = nn.Linear(ff_dim, embed_dim)
    position = PositionalEncoding(embed_dim, do_rate)
    # === Masked attention (for seqs/keys): src, tgt, memory
    self.masks = masks
    self.kmasks = kmasks
    # === Main architecture (enc, dec)
    self.encoder = Encoder(
        EncoderLayer(embed_dim, deepcopy(mhattn), deepcopy(ff_1), deepcopy(ff_2), do_rate), N_en)
    self.decoder = Decoder(
        DecoderLayer(embed_dim, deepcopy(selfattn), deepcopy(mhattn), deepcopy(ff_1), deepcopy(ff_2),
                     do_rate), N_de)
    # === Embedding setting (src, tgt)
    self.src_embed = nn.Sequential(nn.Embedding(10000, embed_dim), deepcopy(position))
    self.tgt_embed = nn.Sequential(nn.Embedding(10000, embed_dim), deepcopy(position))
    # === Final FC
    self.final = nn.Linear(embed_dim * max_len, classes)
    # === Loss function definition
    self.loss = nn.CrossEntropyLoss()
    # === Parameters
    self.embed_dim = embed_dim
    self.max_len = max_len
def __init__(self, d_model, nhead, dim_feedforward=1024, dropout=0.1):
    super(TransformerDecoderLayer_BN, self).__init__()
    self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    # Implementation of Feedforward model
    self.linear1 = Linear(d_model, dim_feedforward)
    self.dropout = Dropout(dropout)
    self.linear2 = Linear(dim_feedforward, d_model)
    self.norm2 = BatchNorm1d(d_model)
    self.norm3 = BatchNorm1d(d_model)
    self.dropout2 = Dropout(dropout)
    self.dropout3 = Dropout(dropout)
def load_model(self, run_id, snapshot_iteration, gpu=True):
    model_params = self.load_parameters(run_id, snapshot_iteration, gpu=gpu)
    config = self.load_config(run_id)
    model_config = self._prepare_model_config(config)
    language = config['data_setup']['language']
    data_manager = CTPreprocessedDataManager(DATA_PATH_STAGE_2, language)
    decoder_config = model_config['lm_decoder']
    word_vocab, token_type_vocab, node_type_vocab = data_manager.load_vocabularies()
    transformer_encoder_config = model_config['lm_encoder']
    transformer_encoder_config['num_token_types'] = len(token_type_vocab)
    transformer_encoder_config['vocab_size'] = len(word_vocab)
    decoder_config['sos_id'] = word_vocab[SOS_TOKEN]
    if 'num_subtokens_output' in config['data_setup']:
        decoder_config['output_subtokens_per_token'] = config['data_setup']['num_subtokens_output']
    else:
        decoder_config['output_subtokens_per_token'] = NUM_SUB_TOKENS
    if 'use_pointer_network' in config['data_setup']:
        decoder_config['use_pointer_network'] = config['data_setup']['use_pointer_network']
    decoder_config['lm_encoder'] = transformer_encoder_config
    decoder_config['loss_fct'] = model_config['loss_fct']
    model = XLNetTransformerDecoder(TransformerLMDecoderConfig(**decoder_config))
    try:
        model.load_state_dict(model_params)
    except RuntimeError:
        # In most cases, this is due to the legacy issue with encoder_self_attention
        model.add_module('encoder_self_attention',
                         MultiheadAttention(model.d_model, decoder_config['decoder_nhead'],
                                            dropout=decoder_config['decoder_dropout']))
        try:
            model.load_state_dict(model_params)
        except RuntimeError:
            decoder_config['concat_query_and_pointer'] = False
            model = CodeTransformerDecoder(TransformerLMDecoderConfig(**decoder_config))
            model.load_state_dict(model_params)
    return model
def __init__(self, model_size: int, num_heads: int, **kwargs):
    super().__init__(**kwargs)
    self.input_size = self.output_size = model_size
    self.num_heads = num_heads
    self.multihead_attention = MultiheadAttention(
        embed_dim=self.input_size, num_heads=num_heads
    )
    self.attention_norm = AddAndNormLayer(model_size=self.input_size)
    self.linear_layer = nn.Sequential(
        nn.Linear(in_features=self.input_size, out_features=4 * self.input_size),
        nn.ReLU(),
        nn.Linear(in_features=4 * self.input_size, out_features=self.input_size),
    )
    self.linear_layer_norm = AddAndNormLayer(model_size=self.input_size)
def __init__(self, questions_size=CONTENT_ID_VOCAB_SIZE, responses_size=RESPONSE_VOCAB_SIZE,
             part_size=PART_VOCAB_SIZE, task_container_id_size=CONTAINER_VOCAB_SIZE,
             user_id_size=USER_VOCAB_SIZE, day_size=DAYS_VOCAB_SIZE, maxlength=NDAY_LENGTH,
             num_heads=NUM_HEADS, embedding_size=EMBEDDING_DIM, dropout=DROPOUT):
    super(Encoder, self).__init__()
    self.input_length = maxlength
    # Embedding layers for question, response, user, part, task_container_id and position
    self.embedding_ques = Embedding(num_embeddings=questions_size, embedding_dim=embedding_size)
    self.embedding_response = Embedding(num_embeddings=responses_size, embedding_dim=embedding_size)
    self.embedding_user = Embedding(num_embeddings=user_id_size, embedding_dim=embedding_size)
    self.embedding_part = Embedding(num_embeddings=part_size, embedding_dim=embedding_size)
    self.embedding_task = Embedding(num_embeddings=task_container_id_size, embedding_dim=embedding_size)
    self.embedding_pos = Embedding(num_embeddings=maxlength + day_size + DAY_VOCAB_SIZE,
                                   embedding_dim=embedding_size)
    # Linear layers for day and days
    self.linear_day = Linear(maxlength, embedding_size)
    self.linear_days = Linear(maxlength, embedding_size)
    # Multihead attention
    self.attention = MultiheadAttention(embed_dim=embedding_size, num_heads=num_heads, dropout=dropout)
    self.dropout1 = Dropout(dropout)
def __init__(self, trip_emb, embed_dim, num_heads=8, dropout=0.1, filter_inner=64):
    super(AttNet, self).__init__()
    # emb
    self.emb = Conv1d(trip_emb, embed_dim, 1)
    self.emb_bn = BatchNorm1d(embed_dim)
    # mha
    self.mha = MultiheadAttention(embed_dim, num_heads, dropout)
    self.mha_bn = BatchNorm1d(embed_dim)
    # ff
    self.inner = Conv1d(embed_dim, filter_inner, 1)
    self.outer = Conv1d(filter_inner, embed_dim, 1)
    self.ff_bn = BatchNorm1d(embed_dim)
    self.reset_parameters()