def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1, share_qk=False, swish_activation=False):
    super(DecoderLayer, self).__init__()
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.enc_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.pos_ffn = PositionwiseFeedForward(
        d_model, d_inner, dropout=dropout, swish_activation=swish_activation)
def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1, keyword_module=None):
    super(DecoderLayer, self).__init__()
    self.key_module = keyword_module
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.enc_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    if self.key_module is not None:
        self.key_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
    super(DecoderLayer, self).__init__()
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    # The decoder carries one extra attention sub-layer compared with the encoder: enc_attn
    self.enc_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
    from pathlib import Path
    # Resolve the vocab_pair file as an absolute path relative to this source file
    output_filedir = Path(__file__).resolve().parent.parent / 'vocab_pair'
    dic2 = {}
    with open(output_filedir, encoding='utf-8') as f:
        tmp = f.readlines()
    for i in tmp:
        i = i.strip('\n').split(':')
        dic2[i[0]] = i[1]
    self.check_dic = dic2
    tmmm = 1  # leftover placeholder assignment
def __init__(self, d_model, d_inner, n_head, dropout=0.1):
    super(DecoderLayer, self).__init__()
    self.slf_attn = MultiHeadAttention(n_head, d_model, dropout=dropout)
    self.enc_attn = MultiHeadAttention(n_head, d_model, dropout=dropout)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1): "d_model: dimension of model, eg: input is NCHW, W is the same as d_model" "d_inner: dimension of model" super(EncoderLayer, self).__init__() self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout) self.pos_ffn = PositionwiseForward(d_model, d_inner, dropout=dropout)
def __init__(self, hps, embed):
    """
    :param hps: hyperparameters for the model
    :param embed: word embedding
    """
    super(SummarizationModel, self).__init__()
    self._hps = hps
    self.Train = (hps.mode == 'train')
    # sentence encoder
    self.encoder = Encoder(hps, embed)
    # Multi-layer highway LSTM
    self.num_layers = hps.n_layers
    self.sent_embedding_size = (hps.max_kernel_size - hps.min_kernel_size + 1) * hps.output_channel
    self.lstm_hidden_size = hps.lstm_hidden_size
    self.recurrent_dropout = hps.recurrent_dropout_prob
    self.deep_lstm = DeepLSTM(self.sent_embedding_size, self.lstm_hidden_size, self.num_layers,
                              self.recurrent_dropout, hps.use_orthnormal_init, hps.fix_mask, hps.cuda)
    # Multi-head attention
    self.n_head = hps.n_head
    self.d_v = self.d_k = int(self.lstm_hidden_size / hps.n_head)
    self.d_inner = hps.ffn_inner_hidden_size
    self.slf_attn = MultiHeadAttention(hps.n_head, self.lstm_hidden_size, self.d_k, self.d_v,
                                       dropout=hps.atten_dropout_prob)
    self.pos_ffn = PositionwiseFeedForward(self.d_v, self.d_inner, dropout=hps.ffn_dropout_prob)
    self.wh = nn.Linear(self.d_v, 2)
def __init__(self, d_out, d_inner, n_head, d_k, d_v, dropout=0.1, type_='same', skip_connect=False, d_in=None):
    super(UNetEncoderLayer, self).__init__()
    d_in = d_in if d_in is not None else d_out  # size of the input to the UNet layer
    self.slf_attn = MultiHeadAttention(n_head, d_out, d_k, d_v, dropout=dropout, d_in=d_in)
    self.pos_ffn = PositionwiseFeedForward(d_out, d_inner, dropout=dropout)
    self.norm = nn.LayerNorm(d_out)
    self.skip_connect = skip_connect
    # TODO add depthwise-separable convolutions
    self.maxpool = None
    self.type = type_
    if type_ == 'down':
        # halve the output length
        self.conv = nn.Conv1d(d_in, d_in, kernel_size=3, padding=1, groups=d_in)
        self.maxpool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
    elif type_ == 'same':
        # keep the output length unchanged
        self.conv = nn.Conv1d(d_in, d_in, kernel_size=3, padding=1, groups=d_in)
    elif type_ == 'up':
        # double the output length
        self.conv = nn.ConvTranspose1d(d_in, d_in, kernel_size=3, stride=2, padding=1, groups=d_in)
    elif type_ == 'none':
        self.conv = None
    else:
        raise RuntimeError('Did not specify an appropriate convolution type')
    self.conv_out = nn.Linear(d_in, d_out)
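# A quick shape check (a standalone sketch with assumed dummy sizes, not from the source)
# showing why the 'down' branch roughly halves the sequence length and the 'up' branch
# roughly doubles it, while 'same' keeps it unchanged.
import torch
import torch.nn as nn

x = torch.randn(2, 16, 50)  # (batch, channels = d_in, length)
down = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
up = nn.ConvTranspose1d(16, 16, kernel_size=3, stride=2, padding=1, groups=16)
same = nn.Conv1d(16, 16, kernel_size=3, padding=1, groups=16)
print(down(x).shape)  # torch.Size([2, 16, 25]) -- about half
print(up(x).shape)    # torch.Size([2, 16, 99]) -- about double (2*L - 1 here)
print(same(x).shape)  # torch.Size([2, 16, 50]) -- unchanged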
def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=0.1, n_max_seq=None, use_attentions=None):
    super(EncoderLayer, self).__init__()
    self.d_model = d_model
    self.n_head = n_head
    self.use_attentions = use_attentions
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner_hid, dropout=dropout)
    if self.use_attentions is not None:
        self.n_max_seq = n_max_seq
        both_mult = 2 if self.use_attentions == 'both' else 1
        self.attn_fc = nn.Linear(d_model + both_mult * n_head * n_max_seq, d_model)
def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=0.1, layer_index=-1, layer_manager=None, length_adjuster=None):
    super(FunnelEncoderLayer, self).__init__()
    assert layer_manager is None or layer_index != -1, "layer index must be defined"
    if layer_manager is None:
        layer_manager = ConstantDimLayerManager(d_model)
    d_model = layer_manager.get_input_dim(layer_index=layer_index)
    d_out = layer_manager.get_output_dim(layer_index=layer_index)
    print("encoder, layer_index={} d_model: {} d_out: {}".format(layer_index, d_model, d_out))
    self.d_model_input = d_model
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.pos_ffn = PositionwiseFeedForwardFunnel(
        d_model, d_out, layer_manager.get_hidden_dim(layer_index=layer_index), dropout=dropout)
    self.length_adjuster = length_adjuster
    self.reduction_rate = 0.75
def __init__(self, attention_type, d_model, d_inner, n_head, d_k, d_v, dropout=0.1, normalize_before=True):
    super(EncoderLayer, self).__init__()
    if attention_type == 'softmax':
        self.slf_attn = MultiHeadAttention(
            n_head, d_model, d_k, d_v, dropout=dropout, normalize_before=normalize_before)
    elif attention_type == 'performer':
        self.slf_attn = PerformerAttention(
            n_head, d_model, d_k, d_v, dropout=dropout, normalize_before=normalize_before)
    self.pos_ffn = PositionwiseFeedForward(
        d_model, d_inner, dropout=dropout, normalize_before=normalize_before)
def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
    super(EncoderLayer, self).__init__()
    # instantiate the multi-head attention module
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    # define the position-wise feed-forward layer
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
def __init__(self, input_size, hidden_size, kernel_size, stride_size, activation, dropout_prob=1.0):
    super(PCNNEncoder, self).__init__()
    self.cnn_encoder = nn.Conv1d(input_size, hidden_size, kernel_size, stride_size, padding=1)
    nn.init.xavier_uniform_(self.cnn_encoder.weight, gain=1)
    activations = {"relu": nn.ReLU(), "tanh": nn.Tanh()}
    self.activation = activations[activation]
    self.dropout = nn.Dropout(dropout_prob)
    self.mask_embedding = nn.Embedding.from_pretrained(
        torch.tensor([[0, 0, 0], [100, 0, 0], [0, 100, 0], [0, 0, 100]], dtype=torch.float32))
    self.self_attention = MultiHeadAttention(n_head=8, d_model=256, d_k=32, d_v=32, dropout=0.1)
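# The 0/100 mask embedding above is the usual device for piecewise max pooling in PCNN:
# adding a large constant to exactly one of three channels lets a single max over the
# sequence recover a per-piece maximum. A sketch of that pooling step, with tensor names
# and shapes assumed rather than taken from this snippet.
import torch

def piecewise_max_pool(hidden, mask_emb, offset=100.0):
    # hidden:   (batch, seq_len, hidden_size) convolution output
    # mask_emb: (batch, seq_len, 3) rows looked up from the mask embedding above
    shifted = hidden.unsqueeze(-1) + mask_emb.unsqueeze(2)   # (batch, seq, hidden, 3)
    pooled = shifted.max(dim=1).values - offset              # (batch, hidden, 3)
    return pooled.reshape(hidden.size(0), -1)                # (batch, hidden * 3)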
def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
    super(GuidedAttention, self).__init__()
    self.slf_attn1 = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.slf_attn2 = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
    super(DecoderLayer, self).__init__()
    # The first decoder sub-layer is masked multi-head self-attention
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    # The second sub-layer is multi-head attention over the encoder output
    self.enc_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    # The third sub-layer is a position-wise feed-forward network
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
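# A minimal sketch (not from this file) of how these three sub-layers are typically wired
# in the forward pass of a Transformer decoder layer; the argument names, the mask keywords,
# and the (output, attention) return convention of MultiHeadAttention are assumptions based
# on the common reference implementation.
def forward(self, dec_input, enc_output, slf_attn_mask=None, dec_enc_attn_mask=None):
    # 1) masked self-attention over the decoder input
    dec_output, dec_slf_attn = self.slf_attn(dec_input, dec_input, dec_input, mask=slf_attn_mask)
    # 2) encoder-decoder attention: queries from the decoder, keys/values from the encoder
    dec_output, dec_enc_attn = self.enc_attn(dec_output, enc_output, enc_output, mask=dec_enc_attn_mask)
    # 3) position-wise feed-forward network
    dec_output = self.pos_ffn(dec_output)
    return dec_output, dec_slf_attn, dec_enc_attn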
def __init__(self, d_model, d_inner, n_head, d_k, d_v, rel_pos_op=None, dropout=0.1):
    super(EncoderLayer, self).__init__()
    self.rel_pos_op = rel_pos_op
    if rel_pos_op is None:
        self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    else:
        self.slf_attn = RelMultiHeadAttention(n_head, d_model, d_k, d_v,
                                              rel_pos_op=rel_pos_op, dropout=dropout)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=0.1, layer_index=-1,
             layer_manager=None, is_first_decoder_layer=False, length_adjuster=None):
    super(FunnelDecoderLayer, self).__init__()
    if layer_manager is None:
        layer_manager = ConstantDimLayerManager(constant_dimension=d_model)
    self.is_first_decoder_layer = is_first_decoder_layer
    d_model_in = layer_manager.get_input_dim(layer_index)
    d_model_out = layer_manager.get_output_dim(layer_index)
    self.layer_index = layer_index
    self.d_model = d_model_in
    d_out = layer_manager.get_output_dim(layer_index)
    self.d_model_out = d_model_out
    print("decoder, layer_index={} d_model: {} d_out: {}".format(layer_index, d_model_in, d_out))
    # self-attention works on initial-level encodings
    self.slf_attn = MultiHeadAttention(n_head, d_model_in, d_k, d_v, dropout=dropout)
    self.enc_attn = MultiHeadAttention(n_head, d_model_in, d_k, d_v, dropout=dropout)
    self.pos_ffn = PositionwiseFeedForwardFunnel(
        d_input=d_model_in,
        d_output=d_model_out,
        d_inner_hid=layer_manager.get_hidden_dim(layer_index),
        dropout=dropout)
    self.length_adjuster = length_adjuster
    self.reduction_rate = 1 / 0.75
def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
    super(TargetLayer, self).__init__()
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.ctx_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.src_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
    super(EncoderLayer, self).__init__()
    # Multi-head attention with n_head = 8 heads, including the Add & Norm step.
    # MultiHeadAttention.forward expects Q, K, V and a mask;
    # the module itself is parameterised by n_head, d_model, d_k and d_v.
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
    super(EncoderLayer, self).__init__()
    # The encoder layer starts with multi-head self-attention
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    # The second part is a position-wise feed-forward layer
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
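# A minimal sketch (not from this file) of the matching encoder forward pass: self-attention
# with Q = K = V = enc_input followed by the feed-forward sub-layer. The mask keyword and the
# (output, attention) return convention of MultiHeadAttention are assumptions.
def forward(self, enc_input, slf_attn_mask=None):
    enc_output, enc_slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask)
    enc_output = self.pos_ffn(enc_output)
    return enc_output, enc_slf_attn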
def __init__(self, vocab_size, embedding_dim, output_size, dropout=0.5):
    super(LSTMFeatureExtractor, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.output_sz = output_size
    self.encoder = nn.LSTM(input_size=embedding_dim, hidden_size=output_size,
                           bias=True, batch_first=True, bidirectional=True)
    self.slf_attn = MultiHeadAttention(1, output_size * 2, output_size * 2, output_size * 2, dropout=dropout)
def add_attr(obj, mod, dim, dropout):
    # setattr(obj, 'norm1_%s' % mod, Norm(dim))
    # setattr(obj, 'norm2_%s' % mod, Norm(dim))
    n_heads = gc.config['n_head']
    setattr(obj, 'attn_%s' % mod, MultiHeadAttention(n_heads, dim, dropout=dropout))
    if mod in ['l', 'a', 'v']:
        for head in range(n_heads):
            conv_in_d = 4
            for i, conv_d in enumerate(gc.config['conv_dims']):
                setattr(obj, 'conv_%s_head_%d_%d' % (mod, head, i), nn.Linear(conv_in_d, conv_d))
                conv_in_d = conv_d
            setattr(obj, 'conv_%s_out_%d' % (mod, head), nn.Linear(conv_in_d, 1))
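# A hypothetical companion to add_attr (assuming the same gc.config and naming scheme, not
# taken from the source): the per-head stack of linear layers is read back with getattr
# using the attribute names written above.
def apply_head_convs(obj, mod, head, x):
    # x: (..., 4) input matching conv_in_d = 4 in add_attr
    for i, _ in enumerate(gc.config['conv_dims']):
        x = getattr(obj, 'conv_%s_head_%d_%d' % (mod, head, i))(x)
    return getattr(obj, 'conv_%s_out_%d' % (mod, head))(x)  # project down to a single value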
def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1, tt_params={}):
    super(EncoderLayer, self).__init__()
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout, tt_params=tt_params)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout, tt_params=tt_params)
def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1, normalize_before=True):
    super(EncoderLayer, self).__init__()
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v,
                                       dropout=dropout, normalize_before=normalize_before)
    self.pos_ffn = PositionwiseFeedForward(
        d_model, d_inner, dropout=dropout, normalize_before=normalize_before)
def __init__(self, n_tgt_vocab, d_model, d_k=64, d_v=64, n_head=8, dropout=0.1):
    super().__init__()
    print('Using GRU!!!!')
    self.tgt_word_emb = nn.Embedding(n_tgt_vocab, d_model, padding_idx=Constants.PAD)
    self.embedding = self.tgt_word_emb
    self.hidden_enc = nn.Linear(d_model, d_model)
    self.enc_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.gru = nn.GRUCell(d_model * 2, d_model)
def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
    '''
    :param d_model: model input dimension
    :param d_inner: hidden dimension of the feed-forward layer
    :param n_head: number of attention heads
    :param d_k: key vector dimension
    :param d_v: value vector dimension
    :param dropout: dropout probability
    '''
    super(EncoderLayer, self).__init__()
    self.sef_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1,
             kernel='linear', kernel_size_tcn=3, kernel_size_scn=2):
    super(EncoderLayer, self).__init__()
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout,
                                       kernel=kernel, kernel_size_tcn=kernel_size_tcn,
                                       kernel_size_scn=kernel_size_scn)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):  # e.g. (512, 2048, 8, 64, 64, 0.1)
    super(EncoderLayer, self).__init__()
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)  # e.g. (8, 512, 64, 64)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)  # e.g. (512, 2048, 0.1)
def __init__(self, d_model, d_inner_hid, n_head):
    super(EncoderLayer, self).__init__()
    self.slf_attn = MultiHeadAttention(d_model, n_head)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner_hid)
def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1, d_enc=None):
    super(DecoderLayer, self).__init__()
    d_enc = d_model if d_enc is None else d_enc
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.enc_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout, d_in=d_enc)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=0.1):
    super(DecoderLayer, self).__init__()
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)