from tensorflow.keras import Model
from tensorflow.keras.layers import AdditiveAttention, Input
from keras_flops import get_flops  # assumed helper, e.g. from the keras-flops package


def test_additive_attention():
    """
    Bahdanau-style attention. query (batch, Tq, dim), key (batch, Tv, dim)
    and value (batch, Tv, dim) are the inputs. The following computation
    is performed:

    1. reshape query to shape [batch, Tq, 1, dim] and key to shape [batch, 1, Tv, dim]
    2. broadcast-add the two, apply tanh, and multiply by the scale weight,
       giving output shape [batch, Tq, Tv, dim]
    3. reduce_sum over the dim axis, giving output shape [batch, Tq, Tv]
    4. softmax of the above
    5. MatMul between 4. and value, giving output shape [batch, Tq, dim]
    """
    Tq = 10
    Tv = 10
    dim = 16
    q_shape = (Tq, dim)
    k_shape = (Tv, dim)
    v_shape = (Tv, dim)
    q = Input(q_shape)
    k = Input(k_shape)
    v = Input(v_shape)
    # Keras expects [query, value, key]; the shapes here are symmetric,
    # so the FLOPs count is the same either way.
    x = AdditiveAttention()([q, k, v])
    model = Model([q, k, v], x)
    flops = get_flops(model, batch_size=1)
    assert (
        flops
        == Tq * Tv * dim  # No.2 (multiply)
        + Tq * Tv * dim  # No.2 (add)
        + Tq * Tv * (dim - 1)  # No.3 (reduce_sum)
        + 5 * Tq * Tv  # No.4 (softmax)
        + 2 * Tv * Tq * dim  # No.5 (MatMul)
    )
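# A minimal sketch (not part of the original test) that reproduces the
# computation described in the docstring above with raw TensorFlow ops and
# checks it against the layer. Assumes TensorFlow 2.x; with the default
# use_scale=True the learned scale weight enters at step 2.
import numpy as np
import tensorflow as tf


def manual_additive_attention(q, k, v, scale):
    q_e = q[:, :, tf.newaxis, :]  # 1. [batch, Tq, 1, dim]
    k_e = k[:, tf.newaxis, :, :]  # 1. [batch, 1, Tv, dim]
    scores = tf.reduce_sum(scale * tf.tanh(q_e + k_e), axis=-1)  # 2.-3.
    weights = tf.nn.softmax(scores, axis=-1)                     # 4.
    return tf.matmul(weights, v)                                 # 5.


q = tf.random.normal((1, 10, 16))
k = tf.random.normal((1, 10, 16))
v = tf.random.normal((1, 10, 16))
layer = tf.keras.layers.AdditiveAttention()
out = layer([q, v, k])  # Keras argument order: [query, value, key]
np.testing.assert_allclose(
    out.numpy(),
    manual_additive_attention(q, k, v, layer.scale).numpy(),
    rtol=1e-5, atol=1e-5)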
def build_decoder(params):
    # decoder inputs
    de_inputs = Input(shape=(params.de_max_len,), name='de_inputs')
    de_init_state_h = Input(shape=(params.hidden_units,), name='de_init_state_h')
    de_init_state_c = Input(shape=(params.hidden_units,), name='de_init_state_c')
    de_en_outputs = Input(shape=(params.en_max_len, params.hidden_units),
                          name='de_en_outputs')

    # decoder layers
    de_embedding_layer = Embedding(params.de_vocab_size,
                                   params.embedding_dim,
                                   mask_zero=True)
    de_lstm_layer = LSTM(params.hidden_units, return_sequences=True)
    attention_layer = AdditiveAttention()
    con_layer = Concatenate()
    fc_layer = Dense(params.de_vocab_size, activation='softmax')

    # forward pass
    de_embedding = de_embedding_layer(de_inputs)
    de_lstm_outputs = de_lstm_layer(
        de_embedding, initial_state=[de_init_state_h, de_init_state_c])
    # query = decoder LSTM outputs, value = encoder outputs
    attention_vec = attention_layer([de_lstm_outputs, de_en_outputs])
    fc_inputs = con_layer([attention_vec, de_lstm_outputs])
    fc_outputs = fc_layer(fc_inputs)

    # decoder definition
    decoder = Model(
        inputs=[de_inputs, de_init_state_h, de_init_state_c, de_en_outputs],
        outputs=fc_outputs,
        name='decoder')
    decoder.summary()
    return decoder
def attention_lstm(self):
    input_x = Input(shape=self.input_shape, name='input')
    X = input_x
    for i in range(self.lstm_blocks):
        query = Dense(10, name='query_' + str(i))(X)
        key = Dense(10, name='key_' + str(i))(X)
        # AdditiveAttention takes [query, value, key]; here X is the value
        attention_weights = AdditiveAttention(
            use_scale=False, name='attention_' + str(i))([query, X, key])
        # note: softmax over a single unit always yields 1.0 per timestep
        attention_weights = Dense(
            1, activation='softmax',
            name='attention_weights_' + str(i))(attention_weights)
        context = Multiply(name='context_' + str(i))([attention_weights, X])
        X = LSTM(self.n_units,
                 return_sequences=True,
                 recurrent_dropout=self.recurrent_dropout,
                 kernel_regularizer=l1_l2(self.lstm_l1, self.lstm_l2),
                 activity_regularizer=l1_l2(self.lstm_l1, self.lstm_l2),
                 name='lstm_' + str(i))(context)
        if self.dropout_rate > 0:
            X = Dropout(self.dropout_rate, name='dropout_' + str(i))(X)
    X = LSTM(self.n_units,
             return_sequences=False,
             recurrent_dropout=self.recurrent_dropout,
             kernel_regularizer=l1_l2(self.lstm_l1, self.lstm_l2),
             activity_regularizer=l1_l2(self.lstm_l1, self.lstm_l2),
             name='lstm_last')(X)
    if self.dropout_rate > 0:
        X = Dropout(self.dropout_rate, name='dropout_last')(X)
    X = Dense(self.n_outputs, activation=self.activation, name='output')(X)
    return Model(inputs=input_x, outputs=X, name='attention_lstm')
def build_decoder(params):
    # decoder inputs
    de_inputs = Input(shape=(1,))
    de_init_state_h = Input(shape=(params.hidden_units,))
    de_init_state_c = Input(shape=(params.hidden_units,))
    de_en_outputs = Input(shape=(params.en_max_len, params.hidden_units))

    # decoder layers
    de_embedding_layer = Embedding(params.de_vocab_size,
                                   params.embedding_dim,
                                   mask_zero=True)
    de_lstm_layer = LSTM(params.hidden_units,
                         return_sequences=True,
                         return_state=True)
    attention_layer = AdditiveAttention()
    add_layer = Add()
    fc_layer = Dense(params.de_vocab_size, activation='softmax')

    # forward pass; the initial-state inputs must be wired into the LSTM,
    # otherwise the model graph is disconnected
    de_embedding = de_embedding_layer(de_inputs)
    de_lstm_outputs, de_output_state_h, de_output_state_c = de_lstm_layer(
        de_embedding, initial_state=[de_init_state_h, de_init_state_c])
    attention_vec = attention_layer([de_lstm_outputs, de_en_outputs])
    fc_inputs = add_layer([attention_vec, de_lstm_outputs])
    fc_outputs = fc_layer(fc_inputs)

    # decoder definition
    decoder = Model(
        inputs=[de_inputs, de_init_state_h, de_init_state_c, de_en_outputs],
        outputs=[fc_outputs, de_output_state_h, de_output_state_c])
    return decoder
def get_document_model(self, section_model, question_model):
    document_input = Input(shape=(None, None, None), name="document_input")
    document_encoded = TimeDistributed(section_model)(document_input)
    cnn_1d = Conv1D(128, 4, padding="same", activation="relu",
                    strides=1)(document_encoded)
    attention = AdditiveAttention()([cnn_1d, question_model])
    output = GlobalAveragePooling1D()(attention)
    model = Model(document_input, output)
    return model
def CRNN_Attention(img_width, img_height, img_channels, len_characters,
                   trainable=True,
                   cnn_backbone_name='resnet_attention',
                   rnn_backbone_name=None):
    """Instantiate a CRNN architecture with an attention mechanism.

    Parameters:
        img_width: int, the width of the image.
        img_height: int, the height of the image.
        img_channels: int, the number of image channels.
        len_characters: int, the size of the character set.
        trainable: bool, default=True.
            If True the model is built for training, otherwise for inference.
        cnn_backbone_name: str, the name of the convolutional part of the CRNN model.
        rnn_backbone_name: str, the name of the recurrent part of the CRNN model.

    Returns:
        A Keras model instance.
    """
    input_image = Input(shape=(img_width, img_height, img_channels),
                        name='Input-Image', dtype='float32')
    input_label = Input(shape=(None,), name='Input-Label', dtype='float32')

    # CNN backbone (resolve the backbone by name, not by a pre-built model)
    cnn_backbone = get_cnn_backbone(cnn_backbone_name)
    cnn_layer = cnn_backbone(input_image)

    # RNN backbone
    rnn_backbone = get_rnn_backbone(rnn_backbone_name)
    rnn_layer = rnn_backbone(cnn_layer)

    # Add attention: query = CNN features, value = RNN features
    attention_layer = AdditiveAttention(name='Attention')([cnn_layer, rnn_layer])
    concatenate_layer = Concatenate(name='Concatenate')([cnn_layer, attention_layer])

    # Fully-connected output layer (+1 for the CTC blank token)
    dense_layer = Dense(units=len_characters + 1, activation='softmax',
                        name='Output-Dense')(concatenate_layer)
    ctc_layer = CTCLayer(name='ctc_loss')(input_label, dense_layer)

    if trainable is True:
        model = Model(inputs=[input_image, input_label], outputs=ctc_layer,
                      name='ocr_model_train')
    else:
        model = Model(inputs=input_image, outputs=dense_layer,
                      name='ocr_model_inference')
    return model
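# CTCLayer is not defined in this snippet. A minimal sketch in the style of
# the Keras OCR example (keras.io), assuming y_true holds dense integer
# labels and y_pred is the per-timestep softmax output of the model above:
import tensorflow as tf
from tensorflow import keras


class CTCLayer(keras.layers.Layer):
    """Computes the CTC loss at train time and passes predictions through."""

    def __init__(self, name=None):
        super().__init__(name=name)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred):
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_len = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_len = tf.cast(tf.shape(y_true)[1], dtype="int64")
        input_length = input_len * tf.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_len * tf.ones(shape=(batch_len, 1), dtype="int64")
        # add the CTC loss to the model and return the predictions unchanged
        self.add_loss(self.loss_fn(y_true, y_pred, input_length, label_length))
        return y_pred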
def get_text_model(self, embedding, use_attention=False, question_model=None):
    text_input = Input(shape=(None,), name="text_input")
    text_embedding = embedding(text_input)
    output = Conv1D(128, 4, padding="same", activation="relu",
                    strides=1)(text_embedding)
    if use_attention:
        attention = AdditiveAttention()([output, question_model])
        output = GlobalAveragePooling1D()(attention)
    model = Model(text_input, output)
    return model
def attention_lstm_residual(self):
    input_x = Input(shape=self.input_shape, name='input')
    X = input_x
    for i in range(self.lstm_blocks):
        query = Dense(10, name='query_' + str(i))(X)
        key = Dense(10, name='key_' + str(i))(X)
        attention_weights = AdditiveAttention(
            use_scale=False, name='attention_' + str(i))([query, X, key])
        # note: softmax over a single unit always yields 1.0 per timestep
        attention_weights = Dense(
            1, activation='softmax',
            name='attention_weights_' + str(i))(attention_weights)
        context = Multiply(name='context_' + str(i))([attention_weights, X])
        X = LSTM(self.n_units,
                 return_sequences=True,
                 recurrent_dropout=self.recurrent_dropout,
                 kernel_regularizer=l1_l2(self.lstm_l1, self.lstm_l2),
                 activity_regularizer=l1_l2(self.lstm_l1, self.lstm_l2),
                 name='lstm_' + str(i))(context)
        if self.dropout_rate > 0:
            X = Dropout(self.dropout_rate, name='dropout_' + str(i))(X)
    X = LSTM(self.n_units,
             return_sequences=False,
             recurrent_dropout=self.recurrent_dropout,
             kernel_regularizer=l1_l2(self.lstm_l1, self.lstm_l2),
             activity_regularizer=l1_l2(self.lstm_l1, self.lstm_l2),
             name='lstm_last')(X)
    if self.dropout_rate > 0:
        X = Dropout(self.dropout_rate, name='dropout_last')(X)

    # residual branch: attend over the first input timestep only
    crop_input = Cropping1D(cropping=(0, self.input_shape[0] - 1),
                            name='crop_input')(input_x)
    if self.dropout_rate > 0:
        crop_input = Dropout(self.dropout_rate,
                             name='dropout_crop_input')(crop_input)
    flatten_crop = Flatten(name='flatten_crop_input')(crop_input)
    query_input = Dense(10, name='query_input')(flatten_crop)
    key_input = Dense(10, name='key_input')(flatten_crop)
    attention_weights_input = AdditiveAttention(
        use_scale=False,
        name='attention_input')([query_input, flatten_crop, key_input])
    attention_weights_input = Dense(
        1, activation='softmax',
        name='attention_weights_input')(attention_weights_input)
    context_input = Multiply(name='context_input')([attention_weights_input,
                                                    flatten_crop])
    concat = Concatenate(name='concat_output')([X, context_input])
    X = Dense(self.n_outputs, activation=self.activation, name='output')(concat)
    return Model(inputs=input_x, outputs=X, name='attention_lstm_residual')
def get_section_model(self, sentence_model, question_output=None,
                      question_input=None):
    section_input = Input(shape=(section_max_size, sentence_max_size),
                          name="section_input")
    section_encoded = TimeDistributed(sentence_model)(
        [section_input, question_input])
    section_encoded = Conv1D(128, 4, padding="same", activation="relu",
                             strides=1)(section_encoded)
    attention = AdditiveAttention()([section_encoded, question_output])
    output = GlobalAveragePooling1D()(attention)
    model = Model(section_input, output)
    return model
def __init__(self, vocab_size, embedding_dim, hidden_units, de_max_len,
             en_max_len):
    super(Decoder, self).__init__()
    # parameter initialization
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.hidden_units = hidden_units
    self.de_max_len = de_max_len
    self.en_max_len = en_max_len
    # layer initialization
    self.embedding = Embedding(self.vocab_size, self.embedding_dim,
                               input_length=self.de_max_len)
    self.lstm = LSTM(self.hidden_units,
                     return_sequences=True,
                     return_state=True,
                     input_shape=(self.de_max_len, self.embedding_dim))
    self.attention = AdditiveAttention()
    self.fc = Dense(self.vocab_size, activation='softmax')
    self.add_layer = Add()
    self.encoder_outputs_layer = Input(shape=(self.en_max_len, embedding_dim))
    self.init_states_layer = Input(shape=(2, hidden_units))
def get_model(self):
    inputs = Input(shape=(self.max_len,))
    embedding = Embedding(self.max_features, self.embedding_dims,
                          input_length=self.max_len,
                          trainable=True)(inputs)
    embedding = SpatialDropout1D(self.dropout_rate)(embedding)
    lstm_forward = LSTM(128, return_sequences=True)(embedding)
    lstm_backward = LSTM(128, return_sequences=True,
                         go_backwards=True)(embedding)
    x = Concatenate()([lstm_forward, embedding, lstm_backward])
    # self-attention: the sequence attends over itself
    attn = AdditiveAttention()([x, x])
    x = [GlobalAveragePooling1D()(x),
         GlobalMaxPooling1D()(x),
         GlobalAveragePooling1D()(attn),
         GlobalMaxPooling1D()(attn)]
    x = Concatenate()(x)
    output = Dense(self.class_num, activation=self.last_activation)(x)
    model = Model(inputs=inputs, outputs=output)
    return model
def _create_model(self):
    '''Creates the GRU architecture described in the paper.'''
    input_shape = (self.window_size, 1)
    inputs = Input(input_shape)
    cnn_1 = Conv1D(16, 4, activation="relu", padding="same",
                   strides=1)(inputs)
    # causal expects a boolean, not the string 'True'
    att = AdditiveAttention(causal=True)([cnn_1, cnn_1])
    b1 = Bidirectional(GRU(64, return_sequences=False, stateful=False),
                       merge_mode='concat')(att)
    x = Dense(64, activation='relu')(b1)
    outputs = Dense(1, activation='linear')(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='mse', optimizer='adam')
    print(model.summary())
    plot_model(model, to_file='model.png', show_shapes=True,
               show_layer_names=False)
    return model
def attention_lstm_dropout_input(self):
    dropout_input = Input(shape=(self.seq_len, self.droput_input_cols),
                          name='dropout_input')
    remain_input = Input(shape=(self.seq_len, self.remain_input_cols),
                         name='remain_input')
    dropout_x = Dropout(self.dropout_rate)(dropout_input)
    X = Concatenate(axis=-1)([remain_input, dropout_x])
    for i in range(self.lstm_blocks):
        query = Dense(10)(X)
        key = Dense(10)(X)
        context = AdditiveAttention()([query, X, key])
        # context = one_step_attention(a)
        X = LSTM(self.n_units, return_sequences=True,
                 recurrent_dropout=self.recurrent_dropout)(context)
        if self.dropout_rate > 0:
            X = Dropout(self.dropout_rate)(X)
    X = LSTM(self.n_units, return_sequences=False,
             recurrent_dropout=self.recurrent_dropout)(X)
    if self.dropout_rate > 0:
        X = Dropout(self.dropout_rate)(X)
    X = Dense(self.n_outputs)(X)
    X = Activation(self.activation, name='output')(X)
    return Model(inputs=[dropout_input, remain_input], outputs=X)
if fr_timesteps:
    decoder_inputs = Input(shape=(dec_timesteps - 1, dec_vsize),
                           name='decoder_inputs')
else:
    decoder_inputs = Input(shape=(None, dec_vsize), name='decoder_inputs')

# Encoder GRU
encoder_gru = GRU(hidden_size, return_sequences=True, return_state=True,
                  name='encoder_gru')
encoder_out, encoder_state = encoder_gru(encoder_inputs)

# Set up the decoder GRU, using `encoder_state` as the initial state.
decoder_gru = GRU(hidden_size, return_sequences=True, return_state=True,
                  name='decoder_gru')
decoder_out, decoder_state = decoder_gru(decoder_inputs,
                                         initial_state=encoder_state)

# Attention layer
# attn_layer = AttentionLayer(name='attention_layer')
attn_layer = AdditiveAttention(name="attention_layer")
# AdditiveAttention takes [query, value] and returns a tensor with the
# query's shape; this differs from the AttentionLayer developed by Thushan.
# attn_out, attn_states = attn_layer([encoder_out, decoder_out])
attn_out, attn_states = attn_layer([decoder_out, encoder_out],
                                   return_attention_scores=True)

# Concatenate the attention output and the decoder GRU output
decoder_concat_input = Concatenate(axis=-1,
                                   name='concat_layer')([decoder_out, attn_out])

# Dense layer
dense = Dense(dec_vsize, activation='softmax', name='softmax_layer')
dense_time = TimeDistributed(dense, name='time_distributed_layer')
decoder_pred = dense_time(decoder_concat_input)
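# Shape check for the contract noted above (assumes TF >= 2.4, where
# return_attention_scores is available): the context output matches the
# query's shape, and the score matrix is [batch, Tq, Tv].
import tensorflow as tf

dec = tf.random.normal((2, 7, 32))   # decoder_out: Tq = 7
enc = tf.random.normal((2, 11, 32))  # encoder_out: Tv = 11
ctx, scores = tf.keras.layers.AdditiveAttention()(
    [dec, enc], return_attention_scores=True)
print(ctx.shape)     # (2, 7, 32)  -- same shape as the query
print(scores.shape)  # (2, 7, 11)  -- one weight per (query step, key step)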
class SequenceAttentionLayer(tf.keras.layers.Layer):
    """Sequence model (LSTM) with an attention-weighted mean of hidden states.

    Relies on the module-level globals `word_vectors`, `maxlen` and
    `max_sentences` when `embedding=True`.
    """

    def __init__(self, num_hidden, embedding=False, embedding_len=50):
        """
        :param num_hidden: number of LSTM units
        :param embedding: whether to embed integer token inputs first
        :param embedding_len: embedding dimension
        """
        super(SequenceAttentionLayer, self).__init__()
        self.num_hidden = num_hidden
        self.if_embedding = embedding
        self.embedding_len = embedding_len
        if embedding:
            self.embed = Embedding(input_dim=len(word_vectors),
                                   output_dim=self.embedding_len,
                                   weights=[word_vectors],
                                   mask_zero=True,
                                   trainable=False)
            self.embed_mask = Masking(mask_value=0)
        self.lstm = LSTM(self.num_hidden, activation="tanh",
                         return_sequences=True)
        # pass the type of attention
        self.attention_layer = AdditiveAttention(use_scale=True)
        self.attention_layer.supports_masking = True

    def get_config(self):
        config = super(SequenceAttentionLayer, self).get_config()
        config['num_hidden'] = self.num_hidden
        config["embedding"] = self.if_embedding
        config["embedding_len"] = self.embedding_len
        return config

    def compute_mask(self, inputs, mask=None):
        # A sentence is masked out when all of its token ids are zero.
        if not self.if_embedding:
            return None
        embed_mask = tf.math.not_equal(inputs, 0)
        return tf.math.not_equal(
            tf.reduce_sum(tf.cast(embed_mask, tf.int32), axis=-1), 0)

    def call(self, inputs, mask=None):
        # TODO include mask as input and make sure it keeps flowing
        if self.if_embedding:
            inputs = self.embed(inputs)
            inputs = self.embed_mask(inputs)
            # put every sentence on a single axis:
            # (batch, sentences, words, dim) -> (batch * sentences, words, dim)
            inputs_mask = inputs._keras_mask
            inputs = tf.reshape(inputs, shape=(-1, maxlen, self.embedding_len))
            mask = tf.reshape(inputs_mask, shape=(-1, maxlen))
        lstm_out = self.lstm(inputs, mask=mask)
        lstm_mask = lstm_out._keras_mask
        # self-attention over the LSTM hidden states
        h = self.attention_layer([lstm_out, lstm_out],
                                 mask=[lstm_mask, lstm_mask])
        out = tf.reduce_mean(h, axis=-2)
        if self.if_embedding:
            # reshape back to (batch_size, max_sentences, num_hidden)
            out = tf.reshape(out, shape=(-1, max_sentences, self.num_hidden))
        return out
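# Hypothetical usage sketch for the layer above (not from the original
# source). It stubs out the module-level globals the layer relies on:
# word_vectors (a pretrained embedding matrix), maxlen (words per sentence)
# and max_sentences. With embedding=True, integer inputs of shape
# (batch, max_sentences, maxlen) map to (batch, max_sentences, num_hidden).
import numpy as np
import tensorflow as tf

word_vectors = np.random.rand(1000, 50).astype("float32")  # assumed global
maxlen, max_sentences = 20, 4                              # assumed globals

layer = SequenceAttentionLayer(num_hidden=64, embedding=True, embedding_len=50)
tokens = tf.random.uniform((2, max_sentences, maxlen),
                           minval=1, maxval=1000, dtype=tf.int32)
print(layer(tokens).shape)  # expected: (2, 4, 64)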
def get_model(self):
    print("Vocabulary Size:", vectorizer.get_vocabulary_size())
    overview_input = Input(shape=(None, None), dtype='int64',
                           name="OverviewInput")
    plot_input = Input(shape=(None, None), dtype='int64', name="PlotInput")
    subtitles_input = Input(shape=(None, None), dtype='int64',
                            name="SubtitlesInput")

    # Sentence-level encoder
    sentence_input = Input(shape=(None,), dtype='int64', name="SentenceInput")
    embedded_sentence = Embedding(vectorizer.get_vocabulary_size(), 300,
                                  trainable=True,
                                  name="Embedding")(sentence_input)
    spatial_dropout_sentence = SpatialDropout1D(
        0.20, name="SpatialDropoutSentence")(embedded_sentence)
    cnn_sentence = Conv1D(64, 4, padding="same", activation="relu", strides=1,
                          name="Conv1DSentence")(spatial_dropout_sentence)
    max_pool_sentence = MaxPooling1D(
        pool_size=3, name="MaxPooling1DSentence")(cnn_sentence)
    sentence_encoding = Bidirectional(LSTM(500))(max_pool_sentence)
    sentence_model = Model(sentence_input, sentence_encoding)

    # Shared segment-level layers
    segment_time_distributed = TimeDistributed(sentence_model,
                                               name="TimeDistributedSegment")
    segment_cnn = Conv1D(172, 2, padding="same", activation="relu",
                         name="SegmentConv1D")
    segment_max_pool = MaxPooling1D(pool_size=3, name="SegmentMaxPool1D")
    segment_cnn_2 = Conv1D(172, 5, padding="same", activation="relu",
                           name="Segment2Conv1D")
    segment_max_pool_2 = MaxPooling1D(pool_size=3, name="Segment2MaxPool1D")

    overview_time_distributed = segment_time_distributed(overview_input)
    overview_cnn = segment_cnn(overview_time_distributed)
    overview_maxpool = segment_max_pool(overview_cnn)

    plot_time_distributed = segment_time_distributed(plot_input)
    plot_cnn = segment_cnn(plot_time_distributed)
    plot_maxpool = segment_max_pool(plot_cnn)

    subtitles_timedistributed = segment_time_distributed(subtitles_input)
    subtitles_cnn = segment_cnn_2(subtitles_timedistributed)
    subtitles_maxpool = segment_max_pool_2(subtitles_cnn)

    overview_dropout = SpatialDropout1D(0.40)(overview_maxpool)
    overview_pre_attention_output = Dense(
        172, name="OverviewPreAttnOutput")(overview_dropout)
    plot_dropout = SpatialDropout1D(0.40)(plot_maxpool)
    plot_pre_attention_output = Dense(
        172, name="PlotPreAttnOutput")(plot_dropout)
    subtitles_dropout = SpatialDropout1D(
        0.40, name="SubtitlesDropout")(subtitles_maxpool)
    subtitles_pre_attention_output = Dense(
        172, name="SubtitlesPreAttnOutput")(subtitles_dropout)

    attention_overview = AdditiveAttention(name="OverviewAttention")(
        [overview_pre_attention_output, overview_maxpool])
    attention_plot = AdditiveAttention(name="PlotAttention")(
        [plot_pre_attention_output, plot_maxpool])
    attention_subtitles = AdditiveAttention(name="SubtitlesAttention")(
        [subtitles_pre_attention_output, subtitles_maxpool])

    overview_output = GlobalAveragePooling1D(
        name="GlobalAvgPoolOverview")(attention_overview)
    plot_output = GlobalAveragePooling1D(
        name="GlobalAvgPoolPlot")(attention_plot)
    subtitles_output = GlobalAveragePooling1D(
        name="GlobalAvgPoolSubtitles")(attention_subtitles)

    concat_output = Concatenate(axis=-1, name="OutputConcatenate")(
        [overview_output, plot_output, subtitles_output])
    dropout_output = Dropout(0.40)(concat_output)
    output = Dense(172, activation="sigmoid", name="Output")(dropout_output)

    model = Model([overview_input, plot_input, subtitles_input], output)
    model.compile(loss='binary_crossentropy', optimizer='adamax',
                  metrics=self.METRICS)
    print(sentence_model.summary())
    print(model.summary())

    self.sentence_model = sentence_model
    self.model = model
    if self.load_weights:
        self.sentence_model.load_weights("data/weights/sentence_model.h5")
        self.model.load_weights("data/weights/model.h5")
        self.vectorizer.load("data/weights/vectorizer.dat")
    return sentence_model, model
def LSTM_model_veracity(x_train_embeddings, x_train_metafeatures, y_train,
                        x_test_embeddings, x_test_metafeatures, params,
                        eval=False, use_embeddings=True, use_metafeatures=True,
                        Early_Stopping=True, log_path=""):
    # Parameter search
    log_dir = log_path + datetime.datetime.now().strftime("%d%m%Y-%H%M%S")
    num_lstm_units = int(params['num_lstm_units'])
    num_lstm_layers = int(params['num_lstm_layers'])
    num_dense_layers = int(params['num_dense_layers'])
    num_dense_units = int(params['num_dense_units'])
    num_epochs = params['num_epochs']
    learn_rate = params['learn_rate']
    mb_size = params['mb_size']
    l2reg = params['l2reg']
    dropout = params['dropout']
    attention = params['attention']

    # Defining input shapes
    if use_embeddings:
        emb_shape = x_train_embeddings[0].shape
    if use_metafeatures:
        metafeatures_shape = x_train_metafeatures[0].shape

    # Creating the two inputs
    if use_embeddings:
        emb_input = Input(shape=emb_shape, name='Embeddings')
    if use_metafeatures:
        metafeatures_input = Input(shape=metafeatures_shape,
                                   name='Metafeatures')

    # Adding masks to account for zero padding (the input shape is already
    # fixed by the Input layers above)
    if use_embeddings:
        emb_mask = Masking(mask_value=0)(emb_input)
    if use_metafeatures:
        metafeatures_mask = Masking(mask_value=0)(metafeatures_input)

    # Adding attention and LSTM layers with varying layers and units using
    # parameter search
    if attention == 1:
        for nl in range(num_lstm_layers):
            if use_embeddings:
                emb_LSTM_query = Bidirectional(
                    LSTM(num_lstm_units, dropout=dropout,
                         recurrent_dropout=0.2,
                         return_sequences=True))(emb_mask)
                emb_LSTM_value = Bidirectional(
                    LSTM(num_lstm_units, dropout=dropout,
                         recurrent_dropout=0.2,
                         return_sequences=True))(emb_mask)
            if use_metafeatures:
                metafeatures_LSTM_query = Bidirectional(
                    LSTM(num_lstm_units, dropout=dropout,
                         recurrent_dropout=0.2,
                         return_sequences=True))(metafeatures_mask)
                metafeatures_LSTM_value = Bidirectional(
                    LSTM(num_lstm_units, dropout=dropout,
                         recurrent_dropout=0.2,
                         return_sequences=True))(metafeatures_mask)
        # the attention layers sit outside the loop: their names are fixed,
        # so only one instance of each can exist in the graph
        if use_embeddings:
            emb_LSTM = AdditiveAttention(name='Attention_Embeddings')(
                [emb_LSTM_query, emb_LSTM_value])
        if use_metafeatures:
            metafeatures_LSTM = AdditiveAttention(
                name='Attention_Metafeatures')(
                    [metafeatures_LSTM_query, metafeatures_LSTM_value])
    else:
        if use_embeddings:
            emb_LSTM = Bidirectional(
                LSTM(num_lstm_units, dropout=dropout,
                     recurrent_dropout=dropout,
                     return_sequences=True))(emb_mask)
        if use_metafeatures:
            metafeatures_LSTM = Bidirectional(
                LSTM(num_lstm_units, dropout=dropout,
                     recurrent_dropout=dropout,
                     return_sequences=True))(metafeatures_mask)

    if use_embeddings and use_metafeatures:
        # Concatenating the two inputs
        model = Concatenate()([emb_LSTM, metafeatures_LSTM])
    elif use_metafeatures:
        model = metafeatures_LSTM

    # Adding attention and another LSTM on top of the concatenated layers
    if attention == 1:
        model_query = Bidirectional(
            LSTM(num_lstm_units, dropout=dropout, recurrent_dropout=0.2,
                 return_sequences=False))(model)
        model_value = Bidirectional(
            LSTM(num_lstm_units, dropout=dropout, recurrent_dropout=0.2,
                 return_sequences=False))(model)
        model = AdditiveAttention(name='Attention_Model')(
            [model_query, model_value])
    else:
        model = Bidirectional(
            LSTM(num_lstm_units, dropout=dropout, recurrent_dropout=dropout,
                 return_sequences=False))(model)

    # Adding dense layers with varying layers and units using parameter search
    for nl in range(num_dense_layers):
        model = Dense(num_dense_units)(model)
        model = LeakyReLU()(model)

    # Adding dropout to the model
    model = Dropout(dropout)(model)

    # Adding a softmax dense layer with varying l2 regularizers using
    # parameter search
    output = Dense(3, activation='softmax',
                   activity_regularizer=regularizers.l2(l2reg),
                   name='labels')(model)

    # Model output
    if use_embeddings and use_metafeatures:
        model = Model(inputs=[emb_input, metafeatures_input], outputs=output)
    elif use_metafeatures:
        model = Model(inputs=metafeatures_input, outputs=output)
    # model = Model(inputs=emb_input, outputs=output)

    # Plotting the model
    # plot_model(model, to_file='model_plot.png', show_shapes=True,
    #            show_layer_names=True)

    # Adding Adam optimizer with varying learning rate using parameter search
    adam = optimizers.Adam(lr=learn_rate, beta_1=0.9, beta_2=0.999,
                           epsilon=1e-08, decay=0.0)

    # Compiling model
    model.compile(optimizer=adam, loss='categorical_crossentropy',
                  metrics=['accuracy'])

    callback_list = []
    # TensorBoard
    tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
    callback_list.append(tensorboard_callback)
    # Early stopping
    if Early_Stopping:
        earlystop_callback = EarlyStopping(monitor='val_accuracy',
                                           min_delta=0.0001, patience=5)
        callback_list.append(earlystop_callback)
    # plot_model(model, "model.png")

    if Early_Stopping:
        # Fitting the model with varying batch sizes and epochs using
        # parameter search
        if use_embeddings and use_metafeatures:
            model.fit({'Embeddings': x_train_embeddings,
                       'Metafeatures': x_train_metafeatures},
                      y_train, batch_size=mb_size, epochs=num_epochs,
                      shuffle=True, class_weight=None, verbose=1,
                      callbacks=callback_list, validation_split=.1)
        elif use_metafeatures:
            model.fit(x_train_metafeatures, y_train, batch_size=mb_size,
                      epochs=num_epochs, shuffle=True, class_weight=None,
                      verbose=1, callbacks=callback_list,
                      validation_split=.1)
    else:
        if use_embeddings and use_metafeatures:
            model.fit({'Embeddings': x_train_embeddings,
                       'Metafeatures': x_train_metafeatures},
                      y_train, batch_size=mb_size, epochs=num_epochs,
                      shuffle=True, class_weight=None, verbose=1,
                      callbacks=callback_list)
        elif use_metafeatures:
            model.fit(x_train_metafeatures, y_train, batch_size=mb_size,
                      epochs=num_epochs, shuffle=True, class_weight=None,
                      verbose=1, callbacks=callback_list)

    # Evaluation time: persist the model and its architecture
    if eval == True:
        model.save('output\\model_veracity.h5')
        json_string = model.to_json()
        with open('output\\model_architecture_veracity.json', 'w') as fout:
            json.dump(json_string, fout)
        model.save_weights('output\\model_veracity_weights.h5')

    # Getting confidence of the model
    if use_embeddings and use_metafeatures:
        pred_probabilities = model.predict(
            [x_test_embeddings, x_test_metafeatures],
            batch_size=mb_size, verbose=0)
        confidence = np.max(pred_probabilities, axis=1)
        # Getting predictions of the model
        y_prob = model.predict([x_test_embeddings, x_test_metafeatures],
                               batch_size=mb_size)
        Y_pred = y_prob.argmax(axis=-1)
    elif use_metafeatures:
        pred_probabilities = model.predict(x_test_metafeatures,
                                           batch_size=mb_size, verbose=0)
        confidence = np.max(pred_probabilities, axis=1)
        # Getting predictions of the model
        y_prob = model.predict(x_test_metafeatures, batch_size=mb_size)
        Y_pred = y_prob.argmax(axis=-1)

    return Y_pred, confidence
def compile(self, learning_rate=None, initial_step=0):
    """
    Build the models (train, encoder and decoder).

    Architecture based on the Bahdanau and Transformer model approach.
    Reference:
        Dzmitry Bahdanau, Kyunghyun Cho and Yoshua Bengio
        Neural Machine Translation by Jointly Learning to Align and Translate, 2014
        arXiv, URL: https://arxiv.org/abs/1409.0473

    Architecture based on the Luong and Transformer model approach.
    Reference:
        Minh-Thang Luong, Hieu Pham and Christopher D. Manning
        Effective Approaches to Attention-based Neural Machine Translation, 2015
        arXiv, URL: https://arxiv.org/abs/1508.04025

    More references:
        Thushan Ganegedara
        Attention in Deep Networks with Keras
        Medium: https://towardsdatascience.com/light-on-math-ml-attention-with-keras-dc8dbc1fad39
        Github: https://github.com/thushv89/attention_keras

        Trung Tran
        Neural Machine Translation With Attention Mechanism
        Machine Talk: https://machinetalk.org/2019/03/29/neural-machine-translation-with-attention-mechanism/
        Github: https://github.com/ChunML/NLP/tree/master/machine_translation
    """
    # Encoder and decoder inputs
    encoder_inputs = Input(shape=(None, self.tokenizer.vocab_size),
                           name="encoder_inputs")
    decoder_inputs = Input(shape=(None, self.tokenizer.vocab_size),
                           name="decoder_inputs")

    # Encoder BGRU; a Bidirectional GRU returns the forward and backward
    # states (named state_h and state_c here, although both are GRU states)
    encoder_bgru = Bidirectional(GRU(self.units,
                                     return_sequences=True,
                                     return_state=True,
                                     dropout=self.dropout),
                                 name="encoder_bgru")
    encoder_out, state_h, state_c = encoder_bgru(encoder_inputs)

    # Set up the decoder GRU, using the concatenated encoder states as the
    # initial state
    decoder_gru = GRU(self.units * 2,
                      return_sequences=True,
                      return_state=True,
                      dropout=self.dropout,
                      name="decoder_gru")
    decoder_out, _ = decoder_gru(
        decoder_inputs,
        initial_state=Concatenate(axis=-1)([state_h, state_c]))

    # Attention layer
    if self.mode == "bahdanau":
        attn_layer = AdditiveAttention(use_scale=False, name="attention_layer")
    else:
        attn_layer = Attention(use_scale=False, name="attention_layer")
    attn_out = attn_layer([decoder_out, encoder_out])

    # Normalization layer
    norm_layer = LayerNormalization(name="normalization")
    decoder_concat_input = norm_layer(
        Concatenate(axis=-1)([decoder_out, attn_out]))

    # Dense layer
    dense = Dense(self.tokenizer.vocab_size, activation="softmax",
                  name="softmax_layer")
    dense_time_distributed = TimeDistributed(dense,
                                             name="time_distributed_layer")
    decoder_pred = dense_time_distributed(decoder_concat_input)

    """ Train model """
    if learning_rate is None:
        learning_rate = CustomSchedule(d_model=self.tokenizer.vocab_size,
                                       initial_step=initial_step)
        self.learning_schedule = True
    else:
        self.learning_schedule = False

    optimizer = Adam(learning_rate=learning_rate, clipnorm=1.0,
                     clipvalue=0.5, epsilon=1e-8)

    self.model = Model(inputs=[encoder_inputs, decoder_inputs],
                       outputs=decoder_pred, name="seq2seq")
    self.model.compile(optimizer=optimizer, loss=self.loss_func,
                       metrics=["accuracy"])

    """ Inference model """

    """ Encoder (inference) model """
    self.encoder = Model(inputs=encoder_inputs,
                         outputs=[encoder_out, state_h, state_c])

    """ Decoder (inference) model """
    # Decoder inputs (states); shape must be a tuple
    encoder_inf_states = Input(shape=(self.tokenizer.maxlen, self.units * 2),
                               name="encoder_inf_states")
    decoder_init_states = Input(shape=(self.units * 2,),
                                name="decoder_init")
    decoder_inf_inputs = Input(shape=(1, self.tokenizer.vocab_size),
                               name="decoder_inf_inputs")

    # Decoder GRU
    decoder_inf_out, decoder_inf_states = decoder_gru(
        decoder_inf_inputs, initial_state=decoder_init_states)

    # Attention layer
    attn_inf_out = attn_layer([decoder_inf_out, encoder_inf_states])

    # Normalization layer
    decoder_inf_concat = norm_layer(
        Concatenate(axis=-1)([decoder_inf_out, attn_inf_out]))

    # Dense layer
    decoder_inf_pred = dense_time_distributed(decoder_inf_concat)

    # Decoder model
    self.decoder = Model(
        inputs=[encoder_inf_states, decoder_init_states, decoder_inf_inputs],
        outputs=[decoder_inf_pred, decoder_inf_states])
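# The mode switch above selects between two score functions. A minimal
# side-by-side sketch (assumes TF 2.x; use_scale=False as in the code above):
# Bahdanau/additive:  score = reduce_sum(tanh(q + k), axis=-1)
# Luong/dot-product:  score = q . k^T
import tensorflow as tf

q = tf.random.normal((1, 5, 8))   # decoder_out (query)
k = tf.random.normal((1, 9, 8))   # encoder_out (value/key)

additive = tf.reduce_sum(tf.tanh(q[:, :, None, :] + k[:, None, :, :]), axis=-1)
dot = tf.matmul(q, k, transpose_b=True)
# Both score tensors are [batch, Tq, Tv]; AdditiveAttention softmaxes the
# first, Attention the second, before the weighted sum over the values.
print(additive.shape, dot.shape)  # (1, 5, 9) (1, 5, 9)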