def __init__(self, model_dim=512, num_heads=8, ffn_dim=204, dropout=0.0):
    super(DecoderLayer, self).__init__()
    # Masked self-attention over the decoder inputs
    self.self_attention = MultiHeadAttention(model_dim, num_heads, dropout)
    # Attention over the encoder outputs ("joint" / encoder-decoder attention)
    self.joint_attention = MultiHeadAttention(model_dim, num_heads, dropout)
    # Position-wise feed-forward sub-layer
    self.feed_forward = PositionWiseFeedForward(model_dim, ffn_dim, dropout)
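# The __init__ above only registers the sub-layers. Below is a minimal forward()
# sketch, not the author's code: it assumes MultiHeadAttention is called as
# attention(query, key, value, mask) and that residual connections and
# normalization are handled inside the sub-layers.
def forward(self, dec_inputs, enc_outputs, self_mask=None, context_mask=None):
    # Masked self-attention over the decoder inputs
    x = self.self_attention(dec_inputs, dec_inputs, dec_inputs, self_mask)
    # Encoder-decoder ("joint") attention over the encoder outputs
    x = self.joint_attention(x, enc_outputs, enc_outputs, context_mask)
    # Position-wise feed-forward network
    return self.feed_forward(x)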
def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    # Masked self-attention and encoder-decoder attention sub-layers
    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    # Position-wise feed-forward network
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    # One LayerNormalization and one Dropout per sub-layer
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
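# A hedged sketch of the matching call() for the decoder layer above, following the
# standard TensorFlow tutorial wiring. It assumes this custom MultiHeadAttention
# returns (output, attention_weights) and is invoked as mha(value, key, query, mask).
def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
    # 1. Masked self-attention + dropout + residual + layer norm
    attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
    attn1 = self.dropout1(attn1, training=training)
    out1 = self.layernorm1(attn1 + x)

    # 2. Encoder-decoder attention + dropout + residual + layer norm
    attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)
    attn2 = self.dropout2(attn2, training=training)
    out2 = self.layernorm2(attn2 + out1)

    # 3. Feed-forward + dropout + residual + layer norm
    ffn_output = self.ffn(out2)
    ffn_output = self.dropout3(ffn_output, training=training)
    out3 = self.layernorm3(ffn_output + out2)

    return out3, attn_weights_block1, attn_weights_block2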
def _attention_builder(x):
    return MultiHeadAttention(
        head_num=head_num,
        activation=activation,
        history_only=history_only,
        trainable=trainable,
        name=name,
    )(x)
def build(self, input_shape):
    # Shared token embedding matrix
    self.embeddings = self.add_weight(
        shape=(self._vocab_size, self._model_dim),
        initializer='glorot_uniform',
        trainable=True,
        name="embeddings")

    # Encoder stack: positional encoding, then self-attention + feed-forward blocks
    self.EncoderPositionEncoding = PositionEncoding(self._model_dim)
    self.EncoderMultiHeadAttetions = [
        MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads)
        for _ in range(self._encoder_stack)
    ]
    self.EncoderLayerNorms0 = [
        LayerNormalization()
        for _ in range(self._encoder_stack)
    ]
    self.EncoderPositionWiseFeedForwards = [
        PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
        for _ in range(self._encoder_stack)
    ]
    self.EncoderLayerNorms1 = [
        LayerNormalization()
        for _ in range(self._encoder_stack)
    ]

    # Decoder stack: masked self-attention (future=True), encoder-decoder attention, feed-forward
    self.DecoderPositionEncoding = PositionEncoding(self._model_dim)
    self.DecoderMultiHeadAttetions0 = [
        MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads, future=True)
        for _ in range(self._decoder_stack)
    ]
    self.DecoderLayerNorms0 = [
        LayerNormalization()
        for _ in range(self._decoder_stack)
    ]
    self.DecoderMultiHeadAttetions1 = [
        MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads)
        for _ in range(self._decoder_stack)
    ]
    self.DecoderLayerNorms1 = [
        LayerNormalization()
        for _ in range(self._decoder_stack)
    ]
    self.DecoderPositionWiseFeedForwards = [
        PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
        for _ in range(self._decoder_stack)
    ]
    self.DecoderLayerNorms2 = [
        LayerNormalization()
        for _ in range(self._decoder_stack)
    ]

    super(Transformer, self).build(input_shape)
def build(self, input_shape):
    self.d_model = input_shape[-1]

    # Self multi-head attention
    self.multi_head_attention_1 = MultiHeadAttention(self.nb_proj)
    self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
    self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

    # Multi-head attention combined with the encoder output
    self.multi_head_attention_2 = MultiHeadAttention(self.nb_proj)
    self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
    self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

    # Feed-forward
    self.dense_1 = layers.Dense(units=self.FFN_units, activation="relu")
    self.dense_2 = layers.Dense(units=self.d_model)
    self.dropout_3 = layers.Dropout(rate=self.dropout_rate)
    self.norm_3 = layers.LayerNormalization(epsilon=1e-6)
def build(self, input_shape):
    self.d_model = input_shape[-1]

    # Self multi-head attention sub-layer
    self.multi_head_attention = MultiHeadAttention(self.nb_proj)
    self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
    self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

    # Two-layer feed-forward sub-layer
    self.dense_1 = layers.Dense(units=self.FFN_units, activation="relu")
    self.dense_2 = layers.Dense(units=self.d_model)
    self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
    self.norm_2 = layers.LayerNormalization(epsilon=1e-6)
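# A hedged call() sketch for the encoder block above. It assumes the custom
# MultiHeadAttention is invoked as attention(queries, keys, values, mask); the
# exact argument order and normalization placement may differ in the original code.
def call(self, inputs, mask, training):
    # Self-attention + dropout + residual + layer norm
    attention = self.multi_head_attention(inputs, inputs, inputs, mask)
    attention = self.dropout_1(attention, training=training)
    attention = self.norm_1(attention + inputs)

    # Feed-forward + dropout + residual + layer norm
    outputs = self.dense_1(attention)
    outputs = self.dense_2(outputs)
    outputs = self.dropout_2(outputs, training=training)
    outputs = self.norm_2(outputs + attention)
    return outputs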
def sub_layer_multi_head_attention(self, layer_index, Q, K_s, type, mask=None,
                                   is_training=None, dropout_keep_prob=None):  # COMMON FUNCTION
    """
    Multi-head attention as a sub-layer.
    :param layer_index: index of the layer
    :param Q: shape should be [batch_size, sequence_length, embed_size]
    :param K_s: shape should be [batch_size, sequence_length, embed_size]
    :param type: 'encoder', 'decoder' or 'encoder_decoder_attention'
    :param mask: when a mask is used, illegal connections are set to a large negative value,
                 so their attention probability becomes (near) zero.
    :return: output of multi-head attention, shape [batch_size, sequence_length, d_model]
    """
    with tf.variable_scope("base_mode_sub_layer_multi_head_attention_" + type + str(layer_index)):
        # Below is to handle attention for encoder and decoder with different lengths:
        # length = self.decoder_sent_length if (type != 'encoder' and self.sequence_length != self.decoder_sent_length) else self.sequence_length  # TODO this may be useful
        length = self.sequence_length
        # 1. get V as learned parameters
        V_s = tf.get_variable("V_s", shape=(self.batch_size, length, self.d_model),
                              initializer=self.initializer)
        # 2. call multi-head attention to get the result
        multi_head_attention_class = MultiHeadAttention(
            Q, K_s, V_s, self.d_model, self.d_k, self.d_v, self.sequence_length, self.h,
            type=type, is_training=is_training, mask=mask,
            dropout_rate=(1.0 - dropout_keep_prob))
        sub_layer_multi_head_attention_output = multi_head_attention_class.multi_head_attention_fn()
    return sub_layer_multi_head_attention_output  # [batch_size, sequence_length, d_model]
def __init__(self, dim, src_n_vocab, n_encod_layer, tgt_n_vocab, n_decode_layer, max_len=512):
    # Source side uses fixed (sinusoidal) positions, target side uses learned positions
    self.src_emb = EmbeddingWithPositionalEncoding(dim, src_n_vocab, max_len)
    self.tgt_emb = EmbeddingWithLearnedPositionalEncoding(dim, tgt_n_vocab, max_len)

    # Encoder layer: self-attention only (no cross-attention)
    enc_layer = TransformerLayer(dim, MultiHeadAttention(6, dim, 0.1), None,
                                 nn.Linear(dim, dim), 0.1)
    self.encoder = Encoder(enc_layer, n_encod_layer)

    # Decoder layer: self-attention plus encoder-decoder attention
    dec_layer = TransformerLayer(dim, MultiHeadAttention(6, dim, 0.1),
                                 MultiHeadAttention(6, dim, 0.1),
                                 nn.Linear(dim, dim), 0.1)
    self.decoder = Decoder(dec_layer, n_decode_layer)

    self.encoder_decoder = EncoderDecoder(self.encoder, self.decoder, self.src_emb, self.tgt_emb)
def __init__(
    self,
    dim_model: int = 512,
    num_heads: int = 6,
    dim_feedforward: int = 2048,
    dropout: float = 0.1,
):
    super().__init__()
    dim_k = dim_v = dim_model // num_heads

    # Self-attention, wrapped in a residual connection with dropout and normalization
    self.attention_1 = Residual(
        MultiHeadAttention(num_heads, dim_model, dim_k, dim_v),
        dimension=dim_model,
        dropout=dropout,
    )
    # Encoder-decoder attention
    self.attention_2 = Residual(
        MultiHeadAttention(num_heads, dim_model, dim_k, dim_v),
        dimension=dim_model,
        dropout=dropout,
    )
    # Position-wise feed-forward sub-layer
    self.feed_forward = Residual(
        feed_forward(dim_model, dim_feedforward),
        dimension=dim_model,
        dropout=dropout,
    )
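# A hedged forward() sketch for the decoder layer above (Tensor = torch.Tensor).
# It assumes the Residual wrapper adds its first tensor argument as the skip
# connection and that MultiHeadAttention is called as attention(query, key, value);
# masking is omitted here.
def forward(self, tgt: Tensor, memory: Tensor) -> Tensor:
    # Self-attention over the target sequence
    tgt = self.attention_1(tgt, tgt, tgt)
    # Attention over the encoder memory
    tgt = self.attention_2(tgt, memory, memory)
    # Position-wise feed-forward
    return self.feed_forward(tgt)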
def make_attention_cell(dec_cell, rnn_size, enc_output, bias_output, lengths,
                        att_type, att_type_bias, bias_lengths, args):
    """Wraps the given cell with attention.

    Args:
        dec_cell: the RNNCell for the decoder.
        rnn_size: Integer. Number of hidden units to use for the rnn cell.
        enc_output: encoder outputs at every step.
        bias_output: bias representations.
        lengths: Array of integers. Sequence lengths of the input points.
        att_type: attention type for the encoder.
        att_type_bias: attention type for the bias.
        bias_lengths: number of the bias words.
        args: hyper-parameters (expects args.num_heads).

    Returns:
        a new cell wrapped with attention.
    """
    if att_type == 'BahdanauAttention':
        # If the attention type is BahdanauAttention, the bias has not been implemented.
        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
            num_units=rnn_size,
            memory=enc_output,
            memory_sequence_length=lengths,
            name='BahdanauAttention')
        return tf.contrib.seq2seq.AttentionWrapper(cell=dec_cell,
                                                   attention_mechanism=attention_mechanism,
                                                   attention_layer_size=None,
                                                   output_attention=False)

    elif att_type == 'MultiHeadAttention' and att_type_bias == 'MultiHeadAttention':
        # Multi-head attention over both the encoder output and the bias memory.
        size_per_head = int(rnn_size / args.num_heads)
        my_attention_mechanism = MyAttentionMechanism(num_heads=args.num_heads,
                                                      size_per_head=size_per_head,
                                                      memory=enc_output,
                                                      memory_sequence_length=lengths,
                                                      name='MultiHeadAttention')
        my_attention_mechanism_bias = MyAttentionMechanism(num_heads=args.num_heads,
                                                           size_per_head=size_per_head,
                                                           memory=bias_output,
                                                           memory_sequence_length=bias_lengths,
                                                           name='MultiHeadAttentionBias')
        attention_mechanisms = []
        attention_mechanisms_for_bias = []
        for i in range(args.num_heads):
            attention_mechanism = MultiHeadAttention(num_units=rnn_size,
                                                     memory=enc_output,
                                                     memory_sequence_length=lengths,
                                                     name='MultiHeadAttention')
            attention_mechanism_for_bias = MultiHeadAttention(num_units=rnn_size,
                                                              memory=bias_output,
                                                              memory_sequence_length=bias_lengths,
                                                              name='MultiHeadAttentionBias')
            attention_mechanisms.append(attention_mechanism)
            attention_mechanisms_for_bias.append(attention_mechanism_for_bias)
        return AttentionWrapper(cell=dec_cell,
                                attention_mechanism=attention_mechanisms,
                                attention_mechanism_for_bias=attention_mechanisms_for_bias,
                                my_attention_mechanism=my_attention_mechanism,
                                my_attention_mechanism_bias=my_attention_mechanism_bias,
                                attention_layer_size=None,
                                output_attention=False)

    elif att_type == 'MultiHeadAttention' and att_type_bias == 'BahdanauAttention':
        if args.num_heads > 1:
            raise ValueError("num_heads > 1 is not allowed when att_type_bias is "
                             "BahdanauAttention; set num_heads to 1 or set "
                             "att_type_bias to MultiHeadAttention")
        size_per_head = int(rnn_size / args.num_heads)
        my_attention_mechanism = MyAttentionMechanism(num_heads=args.num_heads,
                                                      size_per_head=size_per_head,
                                                      memory=enc_output,
                                                      memory_sequence_length=lengths,
                                                      name='MultiHeadAttention')
        attention_mechanisms_for_bias = tf.contrib.seq2seq.BahdanauAttention(
            num_units=rnn_size,
            memory=bias_output,
            memory_sequence_length=bias_lengths,
            name='BahdanauAttention')
        attention_mechanisms = []
        for i in range(args.num_heads):
            attention_mechanism = MultiHeadAttention(num_units=rnn_size,
                                                     memory=enc_output,
                                                     memory_sequence_length=lengths,
                                                     name='MultiHeadAttention')
            attention_mechanisms.append(attention_mechanism)
        return AttentionWrapper(cell=dec_cell,
                                attention_mechanism=attention_mechanisms,
                                attention_mechanism_for_bias=attention_mechanisms_for_bias,
                                my_attention_mechanism=my_attention_mechanism,
                                my_attention_mechanism_bias=None,
                                attention_layer_size=None,
                                output_attention=False)
def build_Model(input_shape, n_states=2, n_speaker=40):
    '''
    architecture:
        1. 4-layer CNN (dilated convolution)
        2. multi-head attention (head num: 2)
        3. FC
    '''
    # Input layer
    inputs = Input(name='the_input', shape=input_shape, dtype='float32')

    # Convolution layers (VGG-style, dilated)
    inner = Conv2D(32, (3, 3), padding='same', name='conv1', dilation_rate=2,
                   kernel_initializer='he_normal')(inputs)
    inner = BatchNormalization()(inner)
    inner = Activation('relu')(inner)
    inner = MaxPooling2D(pool_size=(2, 2), name='max1')(inner)

    inner = Conv2D(64, (3, 3), padding='same', name='conv2', dilation_rate=2,
                   kernel_initializer='he_normal')(inner)
    inner = BatchNormalization()(inner)
    inner = Activation('relu')(inner)
    inner = MaxPooling2D(pool_size=(2, 2), name='max2')(inner)

    inner = Conv2D(256, (3, 3), padding='same', name='conv3', dilation_rate=2,
                   kernel_initializer='he_normal')(inner)
    inner = BatchNormalization()(inner)
    inner = Activation('relu')(inner)

    inner = Conv2D(256, (3, 3), padding='same', name='conv4', dilation_rate=2,
                   kernel_initializer='he_normal')(inner)
    inner = BatchNormalization()(inner)
    inner = Activation('relu')(inner)

    # CNN reshape: flatten the frequency and channel axes into one feature vector per time step
    inner = Reshape(target_shape=(125, 2560), name='reshape')(inner)
    inner = Dense(256, activation='relu', kernel_initializer='he_normal', name='dense1')(inner)

    # Multi-head attention layer, then sum pooling over the time axis
    inner = MultiHeadAttention(head_num=2, name='Multi-Head')(inner)
    inner = Lambda(lambda xin: K.sum(xin, axis=1))(inner)

    # Gradient Reversal Layer for the adversarial speaker branch
    Flip = GradientReversal(hp_lambda=0.31)
    dann_in = Flip(inner)
    dann_out = Dense(units=n_speaker, activation='softmax', name='gradient_reversal')(dann_in)

    # Main classification head over the attention-pooled features
    predictions = Dense(units=n_states, activation='softmax', name='output_layer')(inner)  # (None, n_states)

    model = Model(inputs=inputs, outputs=[predictions, dann_out])
    adam = optimizers.Adam(lr=0.00001)
    model.compile(optimizer=adam,
                  loss={
                      'output_layer': 'categorical_crossentropy',
                      'gradient_reversal': 'categorical_crossentropy'
                  },
                  loss_weights={
                      'output_layer': 0.997,
                      'gradient_reversal': 0.003
                  },
                  metrics=['accuracy'])
    model.summary()
    return model
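# Usage sketch (illustrative, not from the original code). The hard-coded Reshape target
# of (125, 2560) implies an input of 500 frames x 40 features x 1 channel: two 2x2
# max-pools give 125 x 10 x 256, and 10 * 256 = 2560. The random arrays below are
# placeholder data just to show the expected shapes of the two outputs.
import numpy as np

model = build_Model(input_shape=(500, 40, 1), n_states=2, n_speaker=40)
x = np.random.rand(8, 500, 40, 1).astype('float32')
y_states = np.eye(2)[np.random.randint(0, 2, size=8)]       # one-hot state labels
y_speaker = np.eye(40)[np.random.randint(0, 40, size=8)]    # one-hot speaker labels
model.fit(x, {'output_layer': y_states, 'gradient_reversal': y_speaker}, epochs=1)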