def __init__(self, d_model, heads, d_ff=1024, dropout=0.1, attention_type="Baseline",
             relative_time_pitch=False, max_relative_position=512):
    super().__init__()
    self.norm_1 = Norm(d_model)
    self.norm_2 = Norm(d_model)
    self.norm_3 = Norm(d_model)
    self.attention_type = attention_type
    self.relative_time_pitch = relative_time_pitch
    self.dropout_1 = nn.Dropout(dropout)
    self.dropout_2 = nn.Dropout(dropout)
    self.dropout_3 = nn.Dropout(dropout)
    # attn_1: self-attention over the target sequence;
    # attn_2: attention over the encoder outputs
    self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout,
                                     attention_type=self.attention_type,
                                     relative_time_pitch=self.relative_time_pitch,
                                     max_relative_position=max_relative_position)
    self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout,
                                     attention_type=self.attention_type,
                                     relative_time_pitch=self.relative_time_pitch,
                                     max_relative_position=max_relative_position)
    self.ff = FeedForward(d_model, d_ff, dropout)
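# A hedged sketch of the forward pass that would pair with the decoder layer
# above. The pre-norm residual wiring (norm -> sublayer -> dropout -> add) is
# an assumption read off the norm_1/norm_2/norm_3 fields, not confirmed by the
# source; e_outputs, src_mask and trg_mask are the conventional names, also assumed.
def forward(self, x, e_outputs, src_mask, trg_mask):
    x2 = self.norm_1(x)
    x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))                # masked self-attention
    x2 = self.norm_2(x)
    x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))  # encoder-decoder attention
    x2 = self.norm_3(x)
    x = x + self.dropout_3(self.ff(x2))                                      # position-wise feed-forward
    return x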
def __init__(self, d_model, heads, dropout=0.1):
    super().__init__()
    self.norm_1 = Norm(d_model)
    self.norm_2 = Norm(d_model)
    self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
    self.ff = FeedForward(d_model, dropout=dropout)
    self.dropout_1 = nn.Dropout(dropout)
    self.dropout_2 = nn.Dropout(dropout)
def __init__(self, d_model, heads, decoder_extra_layers, dropout=0.1):
    super().__init__()
    self.decoder_extra_layers = decoder_extra_layers
    self.norm_1 = Norm(d_model)
    self.norm_2 = Norm(d_model)
    self.norm_3 = Norm(d_model)
    self.dropout_1 = nn.Dropout(dropout)
    self.dropout_2 = nn.Dropout(dropout)
    self.dropout_3 = nn.Dropout(dropout)
    self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
    self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
    self.ff = FeedForward(d_model, dropout=dropout)
def __init__(self, vocab_size, d_model, N, heads, dropout):
    super().__init__()
    self.N = N
    self.embed = Embedder(vocab_size, d_model)
    self.pe = PositionalEncoder(d_model, dropout=dropout)
    self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
    self.norm = Norm(d_model)
def __init__(self, bert, hidden_size, num_hidden_layers, num_attention_heads, dropout):
    super().__init__()
    self.N = num_hidden_layers
    self.bert = bert
    self.pe = PositionalEncoder(hidden_size, dropout=dropout)
    self.layers = get_clones(DecoderLayer(hidden_size, num_attention_heads, dropout),
                             num_hidden_layers)
    self.norm = Norm(hidden_size)
def __init__(self, d_model, heads, dropout=0.1):
    super(EncoderLayer, self).__init__()
    self.norm = Norm(d_model)
    self.dropout = nn.Dropout(dropout)
    self.attention_layer = MultiHeadedSelfAttention(heads, d_model, dropout=dropout)
    self.ffnn_layer = FeedForward(d_model, dropout=dropout)
def __init__(self, vocab_size, opt):
    super().__init__()
    self.N = opt.n_layers
    self.embed = Embedder(vocab_size, opt.d_model)
    if opt.concat_pos_sinusoid:
        # concatenating (rather than adding) the sinusoid doubles the model width
        self.pe = PositionalEncoderConcat(opt.d_model, opt.dropout, opt.max_seq_len)
        self.d_model = 2 * opt.d_model
    else:
        self.pe = PositionalEncoder(opt.d_model, opt.dropout, opt.max_seq_len)
        self.d_model = opt.d_model
    if opt.relative_time_pitch:
        # layer 0 is built without the relative time/pitch terms;
        # the remaining n_layers - 1 layers include them
        self.layers = get_clones(
            DecoderLayer(self.d_model, opt.heads, opt.d_ff, opt.dropout,
                         opt.attention_type, opt.relative_time_pitch,
                         max_relative_position=opt.max_relative_position),
            opt.n_layers - 1)
        self.layers.insert(
            0,
            DecoderLayer(self.d_model, opt.heads, opt.d_ff, opt.dropout,
                         opt.attention_type, relative_time_pitch=False,
                         max_relative_position=opt.max_relative_position))
    else:
        self.layers = get_clones(
            DecoderLayer(self.d_model, opt.heads, opt.d_ff, opt.dropout,
                         opt.attention_type, opt.relative_time_pitch,
                         max_relative_position=opt.max_relative_position),
            opt.n_layers)
    self.norm = Norm(self.d_model)
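# Hypothetical usage of the constructor above, assuming the enclosing class is
# named Decoder and that opt is a plain attribute namespace; every value here
# is illustrative, not taken from the original configuration.
from types import SimpleNamespace

opt = SimpleNamespace(n_layers=6, d_model=512, heads=8, d_ff=1024, dropout=0.1,
                      max_seq_len=2048, max_relative_position=512,
                      attention_type="Baseline", relative_time_pitch=True,
                      concat_pos_sinusoid=False)
decoder = Decoder(vocab_size=389, opt=opt)
# With relative_time_pitch=True, layer 0 lacks the relative time/pitch terms
# and the remaining n_layers - 1 layers include them.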
def __init__(self, d_model, N_layers, heads, dropout):
    super().__init__()
    self.N_layers = N_layers
    # self.embed = Embedder(vocab_size, d_model)
    # self.pe = PositionalEncoder(d_model, dropout=dropout)
    # self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
    self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N_layers)
    self.norm = Norm(d_model)
def __init__(self, vocab_size, d_model, N, heads, dropout, field, word_emb, opt):
    super().__init__()
    self.N = N
    self.word_emb = word_emb
    self.opt = opt  # unused, just for querying
    self.embed = Embedder(vocab_size, d_model, word_emb, field)
    self.pe = PositionalEncoder(d_model, dropout=dropout)
    self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)  # attention
    self.norm = Norm(d_model)
def __init__(self, vocab_size, d_model, N, heads, dropout, device):
    super().__init__()
    self.N = N
    # We need to use the embedder
    # self.embed = Embedder(vocab_size, d_model)
    # self.embed = nn.Linear(vocab_size, d_model)
    self.pe = PositionalEncoder(d_model, dropout=dropout, device=device)
    self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
    self.norm = Norm(d_model)
def __init__(self, d_model, heads, dropout=0.1):
    super(DecoderLayer, self).__init__()
    self.norm = Norm(d_model)
    self.dropout = nn.Dropout(dropout)
    # In the decoder, the self-attention layer may only attend to earlier
    # positions in the output sequence, unlike its encoder counterpart.
    self.attention_layer = MultiHeadedSelfAttention(heads, d_model, dropout=dropout)
    self.encoder_decoder_attention_layer = MultiHeadedSelfAttention(heads, d_model, dropout=dropout)
    self.ffnn_layer = FeedForward(d_model, dropout=dropout)
def __init__(self, src_vocab, n_classes, d_model, N, heads, dropout):
    super().__init__()
    self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
    self.classifier = Classifier(d_model)
    self.out = nn.Linear(d_model, n_classes)
    self.norm = Norm(n_classes)
def __init__(self, d_model):
    super().__init__()
    self.layer = ClassifierLayer(d_model)
    self.norm = Norm(d_model)
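# The constructors above all assume a few shared helpers. Below is a minimal
# sketch of two of them, get_clones and Norm, following the common
# "Transformer from scratch" pattern these snippets echo; the exact epsilon
# and parameter initialisation are assumptions, not taken from the source.
import copy

import torch
import torch.nn as nn

def get_clones(module, N):
    # N independent deep copies of a layer, registered as submodules
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

class Norm(nn.Module):
    # layer normalisation with a learnable gain (alpha) and bias
    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.size = d_model
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps

    def forward(self, x):
        return (self.alpha * (x - x.mean(dim=-1, keepdim=True))
                / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias)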