def __init__(self,
             rnn_dim,
             rnn_unit='gru',
             input_shape=(0, ),
             dropout=0.0,
             highway=False,
             return_sequences=False,
             dense_dim=0):
    if rnn_unit == 'gru':
        rnn = GRU
    else:
        rnn = LSTM
    self.model = Sequential()
    self.model.add(
        Bidirectional(rnn(rnn_dim,
                          dropout=dropout,
                          recurrent_dropout=dropout,
                          return_sequences=return_sequences),
                      input_shape=input_shape))
    # self.model.add(rnn(rnn_dim,
    #                    dropout=dropout,
    #                    recurrent_dropout=dropout,
    #                    return_sequences=return_sequences,
    #                    input_shape=input_shape))
    if highway:
        if return_sequences:
            self.model.add(TimeDistributed(Highway(activation='tanh')))
        else:
            self.model.add(Highway(activation='tanh'))
    if dense_dim > 0:
        self.model.add(
            TimeDistributed(Dense(dense_dim, activation='relu')))
        self.model.add(TimeDistributed(Dropout(dropout)))
        self.model.add(TimeDistributed(BatchNormalization()))
def build_word_feature_char(vocab_size=5,
                            char_emb_dim=CHAR_EMB_DIM,
                            mode="padding",
                            cnn_encoder=True,
                            highway=True):
    # Build the CNN feature for each word in the sentence; the output feeds the next RNN.
    # Expected input: each int indexes one character.
    # mode:
    #   "average": average-pool the input embeddings, i.e. output the average of the
    #              indexed embeddings of a character
    #   "padding": convolve over the character embeddings
    # The real vocab_size for UCS is 2481, including padding/blank, unknown, punctuation and kana.
    init_width = 0.5 / char_emb_dim
    init_weight = numpy.random.uniform(low=-init_width,
                                       high=init_width,
                                       size=(vocab_size, char_emb_dim))
    init_weight[0] = 0  # maybe the padding should not be zero
    # print(init_weight)
    # First layer embeds every character.
    word_input = Input(shape=(MAX_WORD_LENGTH, ))
    char_embedding = Embedding(input_dim=vocab_size,
                               output_dim=char_emb_dim,
                               weights=[init_weight],
                               trainable=True)(word_input)
    # print("char_embedding:", char_embedding._keras_shape)
    if cnn_encoder:
        if mode == "padding":
            # print(char_embedding._keras_shape)
            # Convolutions with kernel sizes 1, 2, 3 and 100/200/300 feature maps.
            feature1 = Conv1D(filters=100, kernel_size=1,
                              activation='relu')(char_embedding)
            feature1 = MaxPooling1D(
                pool_size=MAX_WORD_LENGTH - 1 + 1)(feature1)
            feature2 = Conv1D(filters=200, kernel_size=2,
                              activation='relu')(char_embedding)
            feature2 = MaxPooling1D(
                pool_size=MAX_WORD_LENGTH - 2 + 1)(feature2)
            feature3 = Conv1D(filters=300, kernel_size=3,
                              activation='relu')(char_embedding)
            feature3 = MaxPooling1D(
                pool_size=MAX_WORD_LENGTH - 3 + 1)(feature3)
            feature = concatenate([feature1, feature2, feature3])
            feature = Flatten()(feature)
            # print(feature._keras_shape)
            if highway:
                feature = Highway(activation="relu")(feature)
    else:
        feature = Flatten()(char_embedding)
    word_feature_encoder = Model(word_input, feature)
    return word_feature_encoder
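# --- Illustrative usage sketch (not part of the original source) ---
# Shows how the per-word character encoder is typically wrapped in TimeDistributed so
# the same CNN is shared across all words of a sentence. It assumes the module
# constants MAX_WORD_LENGTH and MAX_SENTENCE_LENGTH and is wrapped in a helper so
# nothing runs at import time.
def _demo_word_feature_char():
    word_encoder = build_word_feature_char(vocab_size=2481,
                                           cnn_encoder=True,
                                           highway=True)
    # One row of MAX_WORD_LENGTH character ids per word.
    sentence_input = Input(shape=(MAX_SENTENCE_LENGTH, MAX_WORD_LENGTH),
                           dtype='int32')
    word_features = TimeDistributed(word_encoder)(sentence_input)
    demo = Model(sentence_input, word_features)
    demo.summary()  # (batch, MAX_SENTENCE_LENGTH, 600) with the default filter sizes
    return demo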
def __init__(self,
             rnn,
             rnn_dim,
             input_dim,
             dropout_W=0.0,
             dropout_U=0.0,
             cnn_border_mode='same'):
    if rnn == 'lstm':
        from keras.layers import CuDNNLSTM as RNN
    elif rnn == 'sru':
        from nea.cell import SRU as RNN
    elif rnn == 'nlstm':
        from nea.cell import NestedLSTM as RNN
    elif rnn == 'gru':
        from keras.layers import CuDNNGRU as RNN
    elif rnn == 'simple':
        from keras.layers.recurrent import SimpleRNN as RNN
    elif rnn == 'indrnn':
        from nea.cell import IndRNN as RNN
    self.model = Sequential()
    self.model.add(
        Conv1D(filters=100,
               kernel_size=3,
               padding=cnn_border_mode,
               strides=1,
               input_shape=input_dim))
    for i in range(MC.DEPTH):
        self.model.add(
            Bidirectional(
                RNN(rnn_dim,
                    # dropout=dropout_W,
                    # recurrent_dropout=dropout_U,
                    return_sequences=True)))
    if MC.HIGHWAY:
        self.model.add(TimeDistributed(Highway(activation='tanh')))
    # self.model.add(TimeDistributed(Dense(MC.DENSE_DIM, activation='relu')))
    self.model.add(Dropout(MC.DROPOUT))
    self.model.add(Attention())
def build_sentence_rnn(real_vocab_number,
                       word_vocab_size=10,
                       char_vocab_size=10,
                       classes=2,
                       attention=False,
                       dropout=0,
                       word=True,
                       char=False,
                       char_shape=True,
                       model="rnn",
                       cnn_encoder=True,
                       highway=None,
                       nohighway=None,
                       shape_filter=True,
                       char_filter=True):
    # Build the word-level RNN; the output of build_word_feature_* is the feature of each word.
    if char_shape:
        word_feature_encoder = build_word_feature_shape(
            vocab_size=real_vocab_number,
            cnn_encoder=cnn_encoder,
            highway=highway,
            nohighway=nohighway,
            shape_filter=shape_filter,
            char_filter=char_filter)
        sentence_input = Input(shape=(MAX_SENTENCE_LENGTH,
                                      COMP_WIDTH * MAX_WORD_LENGTH),
                               dtype='int32')
        word_feature_sequence = TimeDistributed(word_feature_encoder)(
            sentence_input)
        # print(word_feature_sequence._keras_shape)
    if word:
        sentence_word_input = Input(shape=(MAX_SENTENCE_LENGTH, ),
                                    dtype='int32')
        word_embedding_sequence = Embedding(
            input_dim=word_vocab_size,
            output_dim=WORD_DIM)(sentence_word_input)
    if char:
        word_feature_encoder = build_word_feature_char(
            vocab_size=char_vocab_size,
            cnn_encoder=cnn_encoder,
            highway=highway)
        char_input = Input(shape=(MAX_SENTENCE_LENGTH, MAX_WORD_LENGTH),
                           dtype='int32')
        word_feature_sequence = TimeDistributed(word_feature_encoder)(
            char_input)
    if char_shape and word and not char:
        word_feature_sequence = concatenate(
            [word_feature_sequence, word_embedding_sequence], axis=2)
    if word and not char_shape and not char:
        word_feature_sequence = word_embedding_sequence
    # print(word_feature_sequence._keras_shape)
    if model == "rnn":
        if attention:
            lstm_rnn = Bidirectional(
                LSTM(150, dropout=dropout,
                     return_sequences=True))(word_feature_sequence)
            if highway:
                lstm_rnn = TimeDistributed(
                    Highway(activation=highway))(lstm_rnn)
            elif nohighway:
                lstm_rnn = TimeDistributed(
                    Dense(units=300, activation=nohighway))(lstm_rnn)
            lstm_rnn = AttentionWithContext()(lstm_rnn)
        else:
            lstm_rnn = Bidirectional(
                LSTM(150, dropout=dropout,
                     return_sequences=False))(word_feature_sequence)
        x = lstm_rnn
    if classes < 2:
        print("class number cannot be less than 2")
        exit(1)
    else:
        preds = Dense(classes, activation='softmax')(x)
    if char_shape and not word and not char:
        sentence_model = Model(sentence_input, preds)
    if word and not char_shape and not char:
        sentence_model = Model(sentence_word_input, preds)
    if word and char_shape and not char:
        sentence_model = Model([sentence_input, sentence_word_input], preds)
    if char and not word and not char_shape:
        sentence_model = Model(char_input, preds)
    sentence_model.summary()
    return sentence_model
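# --- Illustrative training sketch (not part of the original source) ---
# Builds the char-shape-only variant of the sentence classifier and fits it on random
# data; shapes assume the module constants MAX_SENTENCE_LENGTH, COMP_WIDTH and
# MAX_WORD_LENGTH, and the vocabulary size 2481 mentioned in the comments above.
def _demo_sentence_rnn():
    model = build_sentence_rnn(real_vocab_number=2481,
                               classes=2,
                               word=False,
                               char_shape=True)
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    # Dummy batch: 8 sentences of component indices with one-hot labels.
    x = numpy.random.randint(
        0, 2481, size=(8, MAX_SENTENCE_LENGTH, COMP_WIDTH * MAX_WORD_LENGTH))
    y = numpy.eye(2)[numpy.random.randint(0, 2, size=8)]
    model.fit(x, y, batch_size=4, epochs=1)
    return model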
def build_word_feature_shape(vocab_size=5,
                             char_emb_dim=CHAR_EMB_DIM,
                             comp_width=COMP_WIDTH,
                             mode="padding",
                             cnn_encoder=True,
                             highway="linear",
                             nohighway=None,
                             shape_filter=True,
                             char_filter=True):
    # Build the CNN feature for each word in the sentence; the output feeds the next RNN.
    # Expected input: every #comp_width ints express one character.
    # mode:
    #   "average": average-pool every #comp_width input embeddings, i.e. output the
    #              average of the indexed embeddings of a character
    #   "padding": convolve over every #comp_width embeddings
    # The real vocab_size for UCS is 2481, including padding/blank, unknown, punctuation and kana.
    assert shape_filter or char_filter
    init_width = 0.5 / char_emb_dim
    init_weight = numpy.random.uniform(low=-init_width,
                                       high=init_width,
                                       size=(vocab_size, char_emb_dim))
    init_weight[0] = 0  # maybe the padding should not be zero
    # print(init_weight)
    # First layer embeds every component.
    word_input = Input(shape=(COMP_WIDTH * MAX_WORD_LENGTH, ))
    char_embedding = Embedding(input_dim=vocab_size,
                               output_dim=char_emb_dim,
                               weights=[init_weight],
                               trainable=True)(word_input)
    # print("char_embedding:", char_embedding._keras_shape)
    if cnn_encoder:
        if mode == "padding":
            # print(char_embedding._keras_shape)
            # print(comp_width)
            if shape_filter and char_filter:
                filter_sizes = [50, 100, 150]
            else:
                filter_sizes = [100, 200, 300]
            if shape_filter:
                feature_s1 = Conv1D(filters=filter_sizes[0],
                                    kernel_size=1,
                                    activation='relu')(char_embedding)
                feature_s1 = MaxPooling1D(
                    pool_size=MAX_WORD_LENGTH * COMP_WIDTH)(feature_s1)
                feature_s2 = Conv1D(filters=filter_sizes[1],
                                    kernel_size=2,
                                    activation='relu')(char_embedding)
                feature_s2 = MaxPooling1D(
                    pool_size=MAX_WORD_LENGTH * COMP_WIDTH - 1)(feature_s2)
                feature_s3 = Conv1D(filters=filter_sizes[2],
                                    kernel_size=3,
                                    activation='relu')(char_embedding)
                feature_s3 = MaxPooling1D(
                    pool_size=MAX_WORD_LENGTH * COMP_WIDTH - 2)(feature_s3)
            if char_filter:
                feature1 = Conv1D(filters=filter_sizes[0],
                                  kernel_size=1 * comp_width,
                                  strides=comp_width,
                                  activation='relu')(char_embedding)
                feature1 = MaxPooling1D(
                    pool_size=MAX_WORD_LENGTH - 1 + 1)(feature1)
                feature2 = Conv1D(filters=filter_sizes[1],
                                  kernel_size=2 * comp_width,
                                  strides=comp_width,
                                  activation='relu')(char_embedding)
                feature2 = MaxPooling1D(
                    pool_size=MAX_WORD_LENGTH - 2 + 1)(feature2)
                feature3 = Conv1D(filters=filter_sizes[2],
                                  kernel_size=3 * comp_width,
                                  strides=comp_width,
                                  activation='relu')(char_embedding)
                feature3 = MaxPooling1D(
                    pool_size=MAX_WORD_LENGTH - 3 + 1)(feature3)
            if shape_filter and char_filter:
                feature = concatenate([
                    feature_s1, feature_s2, feature_s3, feature1, feature2,
                    feature3
                ])
            elif shape_filter and not char_filter:
                feature = concatenate([feature_s1, feature_s2, feature_s3])
            elif char_filter and not shape_filter:
                feature = concatenate([feature1, feature2, feature3])
            else:
                feature = None
            feature = Flatten()(feature)
            # print(feature._keras_shape)
            if highway:
                if isinstance(highway, str):
                    feature = Highway(activation=highway)(feature)
                else:
                    feature = Highway(activation='relu')(feature)
            else:
                if nohighway:
                    feature = Dense(units=600, activation=nohighway)(feature)
                else:
                    pass
    else:
        feature = Flatten()(char_embedding)
    word_feature_encoder = Model(word_input, feature)
    return word_feature_encoder
def get_darnn(nb_words,
              embedding_dim,
              embedding_matrix,
              max_sequence_length,
              out_size,
              projection_dim=50,
              projection_hidden=0,
              projection_dropout=0.2,
              compare_dim=288,
              compare_dropout=0.2,
              dense_dim=50,
              dense_dropout=0.2,
              lr=1e-3,
              activation='relu'):
    q1 = Input(shape=(max_sequence_length, ), name='first_sentences')
    q2 = Input(shape=(max_sequence_length, ), name='second_sentences')
    q1_exact_match = Input(shape=(max_sequence_length, ),
                           name='first_exact_match')
    q2_exact_match = Input(shape=(max_sequence_length, ),
                           name='second_exact_match')
    input_layer_3 = Input(shape=(36, ), name='mata-features', dtype="float32")

    embedding = Embedding(nb_words,
                          embedding_dim,
                          weights=[embedding_matrix],
                          input_length=max_sequence_length,
                          trainable=False)
    # Note: em_embeddings, the exact-match inputs and the meta-feature input are
    # declared here but not wired into the output graph below.
    em_embeddings = Embedding(2,
                              1,
                              input_length=max_sequence_length,
                              trainable=True)

    q1_embed = embedding(q1)
    q1_embed = SpatialDropout1D(0.1)(q1_embed)
    q2_embed = embedding(q2)
    q2_embed = SpatialDropout1D(0.1)(q2_embed)

    th = TimeDistributed(Highway(activation='relu'))
    q1_embed = Dropout(0.1)(th(q1_embed))
    q2_embed = Dropout(0.1)(th(q2_embed))

    rnns = [
        Bidirectional(CuDNNGRU(42, return_sequences=True)) for i in range(3)
    ]

    q1_res = []
    q2_res = []
    for idx, rnn in enumerate(rnns):
        q1_seq = rnn(q1_embed)
        q1_seq = Dropout(0.15)(q1_seq)
        q2_seq = rnn(q2_embed)
        q2_seq = Dropout(0.15)(q2_seq)
        q1_aligned, q2_aligned = soft_attention_alignment(q1_seq, q2_seq)

        q1_res.append(q2_aligned)
        q1_res.append(q1_seq)
        q2_res.append(q1_aligned)
        q2_res.append(q2_seq)

        q1_embed = Concatenate()([q1_embed, q1_seq, q2_aligned])
        q2_embed = Concatenate()([q2_embed, q2_seq, q1_aligned])

    q1_res = Concatenate()(q1_res)
    q2_res = Concatenate()(q2_res)

    attn = AttentionWeightedAverage()
    q1_rep = apply_multiple(q1_embed,
                            [GlobalAvgPool1D(), GlobalMaxPool1D(), attn])
    q2_rep = apply_multiple(q2_embed,
                            [GlobalAvgPool1D(), GlobalMaxPool1D(), attn])

    # Classifier
    q_diff = substract(q1_rep, q2_rep)
    q_multi = Multiply()([q1_rep, q2_rep])
    h_all = Concatenate()([q1_rep, q2_rep, q_diff, q_multi])
    h_all = Dropout(0.35)(h_all)
    h_all = Dense(300, activation='relu')(h_all)
    out_ = Dense(3, activation='softmax')(h_all)

    model = Model(
        inputs=[q1, q2, input_layer_3, q1_exact_match, q2_exact_match],
        outputs=out_)
    model.compile(optimizer=Adam(lr=lr, decay=1e-6, clipvalue=1.5),
                  loss='categorical_crossentropy',
                  metrics=['accuracy', weighted_accuracy])
    model.summary()
    return model
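# --- Reference sketch (assumption, not the project's actual code) ---
# The models above rely on project helpers soft_attention_alignment, apply_multiple,
# time_distributed and substract defined elsewhere in this repository. Below is a
# minimal Keras 2.x sketch of what such helpers conventionally look like in
# decomposable-attention implementations; the real implementations may differ.
from keras.layers import Add, Concatenate, Dot, Lambda, Permute, TimeDistributed
from keras.activations import softmax


def unchanged_shape(input_shape):
    # Identity shape function for Lambda layers.
    return input_shape


def substract(input_1, input_2):
    # Element-wise difference expressed with existing layers.
    neg = Lambda(lambda x: -x, output_shape=unchanged_shape)(input_2)
    return Add()([input_1, neg])


def apply_multiple(input_, layers):
    # Apply several pooling layers to one tensor and concatenate the results.
    return Concatenate()([layer(input_) for layer in layers])


def time_distributed(input_, layers):
    # Apply a stack of layers to every timestep of a sequence.
    out_ = input_
    for layer in layers:
        out_ = TimeDistributed(layer)(out_)
    return out_


def soft_attention_alignment(input_1, input_2):
    # Decomposable-attention style soft alignment between two sequences.
    attention = Dot(axes=-1)([input_1, input_2])
    w_att_1 = Lambda(lambda x: softmax(x, axis=1),
                     output_shape=unchanged_shape)(attention)
    w_att_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2),
                                     output_shape=unchanged_shape)(attention))
    in1_aligned = Dot(axes=1)([w_att_1, input_1])
    in2_aligned = Dot(axes=1)([w_att_2, input_2])
    return in1_aligned, in2_aligned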
def get_multiwindow_cnn(nb_words,
                        embedding_dim,
                        embedding_matrix,
                        max_sequence_length,
                        out_size,
                        projection_dim=50,
                        projection_hidden=0,
                        projection_dropout=0.2,
                        compare_dim=288,
                        compare_dropout=0.2,
                        dense_dim=50,
                        dense_dropout=0.2,
                        lr=1e-3,
                        activation='relu'):
    q1 = Input(shape=(max_sequence_length, ), name='first_sentences')
    q2 = Input(shape=(max_sequence_length, ), name='second_sentences')
    meta_features_input = Input(shape=(36, ), name='mata-features')

    embedding = Embedding(nb_words,
                          embedding_dim,
                          weights=[embedding_matrix],
                          input_length=max_sequence_length,
                          trainable=False)

    q1_embed = embedding(q1)
    q1_embed = SpatialDropout1D(0.2)(q1_embed)
    q2_embed = embedding(q2)
    q2_embed = SpatialDropout1D(0.2)(q2_embed)

    th = TimeDistributed(Highway(activation='relu'))
    q1_encoded = th(q1_embed)
    q2_encoded = th(q2_embed)

    q1_in = q1_encoded
    q2_in = q2_encoded
    nb_filters = 64
    for i in range(1, 5):
        tanh_conv = Conv1D(nb_filters, i, padding='same', activation='tanh')
        sigm_conv = Conv1D(nb_filters, i, padding='same',
                           activation='sigmoid')
        res_conv = Conv1D(nb_filters, i, padding='same', activation='relu')
        drop = Dropout(0.1)

        q1_t = tanh_conv(q1_in)
        q1_s = sigm_conv(q1_in)
        q1_x = Multiply()([q1_t, q1_s])
        res_q1 = res_conv(q1_x)
        res_q1 = drop(res_q1)
        q1_encoded = Concatenate()([q1_encoded, q1_x])

        q2_t = tanh_conv(q2_in)
        q2_s = sigm_conv(q2_in)
        q2_x = Multiply()([q2_t, q2_s])
        res_q2 = res_conv(q2_x)
        res_q2 = drop(res_q2)
        q2_encoded = Concatenate()([q2_encoded, q2_x])

    # Align after align
    q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded)
    q1_encoded = Concatenate()([q1_encoded, q2_aligned])
    q2_encoded = Concatenate()([q2_encoded, q1_aligned])

    attn = AttentionWeightedAverage()
    q1_rep = apply_multiple(q1_encoded,
                            [GlobalAvgPool1D(), GlobalMaxPool1D(), attn])
    q2_rep = apply_multiple(q2_encoded,
                            [GlobalAvgPool1D(), GlobalMaxPool1D(), attn])

    # Classifier
    q_diff = substract(q1_rep, q2_rep)
    q_multi = Multiply()([q1_rep, q2_rep])
    h_all = Concatenate()([q1_rep, q2_rep, q_diff, q_multi])
    h_all = Dropout(0.2)(h_all)
    out_ = Dense(3, activation='softmax')(h_all)

    model = Model(inputs=[q1, q2, meta_features_input], outputs=out_)
    model.compile(optimizer=Adam(lr=lr, decay=1e-6, clipnorm=1.5),
                  loss='categorical_crossentropy',
                  metrics=['accuracy', weighted_accuracy])
    model.summary()
    return model
def get_dense_cnn(nb_words,
                  embedding_dim,
                  embedding_matrix,
                  max_sequence_length,
                  out_size,
                  projection_dim=50,
                  projection_hidden=0,
                  projection_dropout=0.2,
                  compare_dim=288,
                  compare_dropout=0.2,
                  dense_dim=50,
                  dense_dropout=0.2,
                  lr=1e-3,
                  activation='relu'):
    q1 = Input(shape=(max_sequence_length, ), name='first_sentences')
    q2 = Input(shape=(max_sequence_length, ), name='second_sentences')
    meta_features_input = Input(shape=(36, ), name='mata-features')

    embedding = Embedding(nb_words,
                          embedding_dim,
                          weights=[embedding_matrix],
                          input_length=max_sequence_length,
                          trainable=False)

    q1_embed = embedding(q1)
    q1_embed = SpatialDropout1D(0.2)(q1_embed)
    q2_embed = embedding(q2)
    q2_embed = SpatialDropout1D(0.2)(q2_embed)

    th = TimeDistributed(Highway(activation='relu'))
    q1_encoded = th(q1_embed)
    q2_encoded = th(q2_embed)

    q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded)
    q1_encoded = Concatenate()([q2_aligned, q1_encoded])
    q2_encoded = Concatenate()([q1_aligned, q2_encoded])

    cnn_init = Conv1D(42, 1, strides=1, padding='same', activation='relu')
    q1_seq = cnn_init(q1_encoded)
    q2_seq = cnn_init(q2_encoded)

    cnns = [
        Conv1D(42, 3, strides=1, padding='same', activation='relu')
        for i in range(3)
    ]
    trans = [
        Conv1D(32, 1, strides=1, padding='same', activation='relu')
        for i in range(3)
    ]
    for idx, cnn in enumerate(cnns):
        q1_aligned, q2_aligned = soft_attention_alignment(q1_seq, q2_seq)
        q1_encoded = Concatenate()([q1_seq, q2_aligned, q1_encoded])
        q2_encoded = Concatenate()([q2_seq, q1_aligned, q2_encoded])
        q1_seq = cnn(q1_encoded)
        q2_seq = cnn(q2_encoded)

    attn = AttentionWeightedAverage()
    q1_rep = apply_multiple(q1_encoded,
                            [GlobalAvgPool1D(), GlobalMaxPool1D(), attn])
    q2_rep = apply_multiple(q2_encoded,
                            [GlobalAvgPool1D(), GlobalMaxPool1D(), attn])

    # Classifier
    q_diff = substract(q1_rep, q2_rep)
    q_multi = Multiply()([q1_rep, q2_rep])
    h_all = Concatenate()([q1_rep, q2_rep, q_diff, q_multi])
    h_all = Dropout(0.5)(h_all)
    h_all = Dense(128, activation='relu')(h_all)
    out_ = Dense(3, activation='softmax')(h_all)

    model = Model(inputs=[q1, q2, meta_features_input], outputs=out_)
    model.compile(optimizer=Adam(lr=lr, decay=1e-6, clipnorm=1),
                  loss='categorical_crossentropy',
                  metrics=['accuracy', weighted_accuracy])
    model.summary()
    return model
def get_char_decomposable_attention(nb_words,
                                    embedding_dim,
                                    embedding_matrix,
                                    max_sequence_length,
                                    out_size,
                                    projection_dim=50,
                                    projection_hidden=0,
                                    projection_dropout=0.2,
                                    compare_dim=288,
                                    compare_dropout=0.2,
                                    dense_dim=50,
                                    dense_dropout=0.2,
                                    lr=1e-3,
                                    activation='relu'):
    q1 = Input(shape=(max_sequence_length, ), name='first_sentences')
    q2 = Input(shape=(max_sequence_length, ), name='second_sentences')
    q1_exact_match = Input(shape=(max_sequence_length, ),
                           name='first_exact_match')
    q2_exact_match = Input(shape=(max_sequence_length, ),
                           name='second_exact_match')
    input_layer_3 = Input(shape=(36, ), name='mata-features', dtype="float32")

    # input_encoded = BatchNormalization()(input_layer_3)
    input_encoded = Dense(2016, activation='elu')(input_layer_3)
    input_encoded = Dropout(0.25)(input_encoded)

    embedding = Embedding(nb_words,
                          150,
                          weights=[embedding_matrix],
                          input_length=max_sequence_length,
                          trainable=False)
    em_embeddings = Embedding(2,
                              1,
                              input_length=max_sequence_length,
                              trainable=True)

    # q1_embed = Concatenate()([embedding(q1), em_embeddings(q1_exact_match)])
    q1_embed = embedding(q1)
    q1_embed = SpatialDropout1D(0.1)(q1_embed)
    # q2_embed = Concatenate()([embedding(q2), em_embeddings(q2_exact_match)])
    q2_embed = embedding(q2)
    q2_embed = SpatialDropout1D(0.1)(q2_embed)

    th = TimeDistributed(Highway(activation='relu'))
    q1_embed = th(q1_embed)
    q2_embed = th(q2_embed)

    q1_aligned, q2_aligned = soft_attention_alignment(q1_embed, q2_embed)
    q1_vec = Concatenate()([
        q1_embed, q2_aligned,
        substract(q1_embed, q2_aligned),
        Multiply()([q1_embed, q2_aligned])
    ])
    q2_vec = Concatenate()([
        q2_embed, q1_aligned,
        substract(q2_embed, q1_aligned),
        Multiply()([q2_embed, q1_aligned])
    ])

    dense_compares = [
        Dense(300, activation='elu'),
        Dropout(0.2),
        Dense(200, activation='elu'),
        Dropout(0.2),
    ]
    q1_compared = time_distributed(q1_vec, dense_compares)
    q2_compared = time_distributed(q2_vec, dense_compares)

    q1_rep = apply_multiple(q1_compared,
                            [GlobalAvgPool1D(), GlobalMaxPool1D()])
    q2_rep = apply_multiple(q2_compared,
                            [GlobalAvgPool1D(), GlobalMaxPool1D()])

    h_all = Concatenate()([q1_rep, q2_rep])
    h_all = BatchNormalization()(h_all)
    h_all = Dense(256, activation='elu')(h_all)
    h_all = Dropout(0.2)(h_all)
    h_all = BatchNormalization()(h_all)
    h_all = Dense(256, activation='elu')(h_all)
    h_all = Dropout(0.2)(h_all)
    h_all = BatchNormalization()(h_all)
    out_ = Dense(3, activation='softmax')(h_all)

    model = Model(
        inputs=[q1, q2, input_layer_3, q1_exact_match, q2_exact_match],
        outputs=out_)
    model.compile(optimizer=Adam(lr=lr, decay=1e-6, clipnorm=1.5,
                                 amsgrad=True),
                  loss='categorical_crossentropy',
                  metrics=['accuracy', weighted_accuracy])
    model.summary()
    return model
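# --- Illustrative usage sketch (not part of the original source) ---
# Shows how the sentence-pair classifiers above are invoked; all sizes are placeholders.
# get_char_decomposable_attention embeds with output_dim=150, so the embedding matrix
# must have 150 columns, and the model takes five inputs even though the meta-feature
# and exact-match branches are not wired into the output, as in the original code.
def _demo_char_decomposable_attention():
    import numpy as np
    nb_words, max_len, n = 20000, 40, 16
    embedding_matrix = np.zeros((nb_words, 150), dtype='float32')
    model = get_char_decomposable_attention(nb_words,
                                            150,
                                            embedding_matrix,
                                            max_sequence_length=max_len,
                                            out_size=3)
    q1 = np.random.randint(1, nb_words, size=(n, max_len))
    q2 = np.random.randint(1, nb_words, size=(n, max_len))
    meta = np.random.rand(n, 36).astype('float32')
    em1 = np.random.randint(0, 2, size=(n, max_len))
    em2 = np.random.randint(0, 2, size=(n, max_len))
    y = np.eye(3)[np.random.randint(0, 3, size=n)]
    model.fit([q1, q2, meta, em1, em2], y, batch_size=8, epochs=1)
    return model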
def carnn(embedding_matrix,
          config,
          compare_out_size=CARNN_COMPARE_LAYER_OUTSIZE,
          rnn_size=CARNN_RNN_SIZE,
          rnn_dropout=CARNN_AGGREATION_DROPOUT):
    q1 = Input(shape=(config['max_length'], ), dtype='int32', name='q1_input')
    q2 = Input((config['max_length'], ), dtype='int32', name='q2_input')
    activation = 'elu'
    compare_dim = 500
    compare_dropout = 0.2

    embedding_layer = Embedding(embedding_matrix.shape[0],
                                embedding_matrix.shape[1],
                                trainable=config['embed_trainable'],
                                weights=[embedding_matrix]
                                # mask_zero=True
                                )

    q1_embed = embedding_layer(q1)
    q2_embed = embedding_layer(q2)  # bsz, 1, emb_dims
    q1_embed = BatchNormalization(axis=2)(q1_embed)
    q2_embed = BatchNormalization(axis=2)(q2_embed)
    q1_embed = SpatialDropout1D(config['spatial_dropout_rate'])(q1_embed)
    q2_embed = SpatialDropout1D(config['spatial_dropout_rate'])(q2_embed)

    highway_encoder = TimeDistributed(Highway(activation='relu'))
    self_attention = SelfAttention(d_model=embedding_matrix.shape[1])

    q1_encoded = highway_encoder(q1_embed)
    q2_encoded = highway_encoder(q2_embed)
    s1_encoded = self_attention(q1, q1_encoded)
    s2_encoded = self_attention(q2, q2_encoded)

    # Attention
    q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded)

    # Compare
    q1_combined1 = Concatenate()(
        [q1_encoded, q2_aligned, interaction(q1_encoded, q2_aligned)])
    q1_combined2 = Concatenate()(
        [q2_aligned, q1_encoded, interaction(q1_encoded, q2_aligned)])
    q2_combined1 = Concatenate()(
        [q2_encoded, q1_aligned, interaction(q2_encoded, q1_aligned)])
    q2_combined2 = Concatenate()(
        [q1_aligned, q2_encoded, interaction(q2_encoded, q1_aligned)])
    s1_combined1 = Concatenate()(
        [q1_encoded, s1_encoded, interaction(q1_encoded, s1_encoded)])
    s1_combined2 = Concatenate()(
        [s1_encoded, q1_encoded, interaction(q1_encoded, s1_encoded)])
    s2_combined1 = Concatenate()(
        [q2_encoded, s2_encoded, interaction(q2_encoded, s2_encoded)])
    s2_combined2 = Concatenate()(
        [s2_encoded, q2_encoded, interaction(q2_encoded, s2_encoded)])

    compare_layers_d = [
        Dense(compare_dim, activation=activation),
        Dropout(compare_dropout),
        Dense(compare_out_size, activation=activation),
        Dropout(compare_dropout),
    ]
    compare_layers_g = [
        Dense(compare_dim, activation=activation),
        Dropout(compare_dropout),
        Dense(compare_out_size, activation=activation),
        Dropout(compare_dropout),
    ]

    # NOTE these can be optimized
    q1_compare1 = time_distributed(q1_combined1, compare_layers_d)
    q1_compare2 = time_distributed(q1_combined2, compare_layers_d)
    q1_compare = Average()([q1_compare1, q1_compare2])
    q2_compare1 = time_distributed(q2_combined1, compare_layers_d)
    q2_compare2 = time_distributed(q2_combined2, compare_layers_d)
    q2_compare = Average()([q2_compare1, q2_compare2])
    s1_compare1 = time_distributed(s1_combined1, compare_layers_g)
    s1_compare2 = time_distributed(s1_combined2, compare_layers_g)
    s1_compare = Average()([s1_compare1, s1_compare2])
    s2_compare1 = time_distributed(s2_combined1, compare_layers_g)
    s2_compare2 = time_distributed(s2_combined2, compare_layers_g)
    s2_compare = Average()([s2_compare1, s2_compare2])

    # Aggregate
    q1_encoded = Concatenate()([q1_encoded, q1_compare, s1_compare])
    q2_encoded = Concatenate()([q2_encoded, q2_compare, s2_compare])
    aggreate_rnn = CuDNNGRU(rnn_size, return_sequences=True)
    q1_aggreated = aggreate_rnn(q1_encoded)
    q1_aggreated = Dropout(rnn_dropout)(q1_aggreated)
    q2_aggreated = aggreate_rnn(q2_encoded)
    q2_aggreated = Dropout(rnn_dropout)(q2_aggreated)

    # Pooling
    q1_rep = apply_multiple(q1_aggreated,
                            [GlobalAvgPool1D(), GlobalMaxPool1D()])
    q2_rep = apply_multiple(q2_aggreated,
                            [GlobalAvgPool1D(), GlobalMaxPool1D()])
    q_diff = Lambda(lambda x: K.abs(x[0] - x[1]))([q1_rep, q2_rep])
    q_multi = Lambda(lambda x: x[0] * x[1])([q1_rep, q2_rep])

    feature_input = Input(shape=(config['feature_length'], ))
    feature_dense = BatchNormalization()(feature_input)
    feature_dense = Dense(config['dense_dim'],
                          activation='relu')(feature_dense)

    h_all1 = Concatenate()([q1_rep, q2_rep, q_diff, q_multi, feature_dense])
    h_all2 = Concatenate()([q2_rep, q1_rep, q_diff, q_multi, feature_dense])
    h_all1 = Dropout(0.5)(h_all1)
    h_all2 = Dropout(0.5)(h_all2)

    dense = Dense(256, activation='relu')
    h_all1 = dense(h_all1)
    h_all2 = dense(h_all2)
    h_all = Average()([h_all1, h_all2])

    predictions = Dense(1, activation='sigmoid')(h_all)
    model = Model(inputs=[q1, q2, feature_input], outputs=predictions)
    opt = optimizers.get(config['optimizer'])
    K.set_value(opt.lr, config['learning_rate'])
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=[f1])
    return model
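# --- Hypothetical invocation sketch (not from the original repository) ---
# The config keys are inferred from the lookups inside carnn(); the values below are
# placeholders, and the embedding matrix is random instead of pretrained vectors.
def _demo_carnn():
    import numpy as np
    embedding_matrix = np.random.uniform(
        -0.05, 0.05, size=(50000, 300)).astype('float32')
    config = {
        'max_length': 30,
        'embed_trainable': False,
        'spatial_dropout_rate': 0.2,
        'feature_length': 36,
        'dense_dim': 128,
        'optimizer': 'adam',
        'learning_rate': 1e-3,
    }
    model = carnn(embedding_matrix, config)
    model.summary()
    return model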
def decom(embedding_matrix, config):
    q1 = Input(shape=(config['max_length'], ), dtype='int32', name='q1_input')
    q2 = Input((config['max_length'], ), dtype='int32', name='q2_input')

    projection_hidden = 300
    activation = 'elu'
    projection_dropout = 0.2
    projection_dim = 300
    compare_dim = 500  # 300
    compare_dropout = 0.2

    embedding_layer = Embedding(embedding_matrix.shape[0],
                                embedding_matrix.shape[1],
                                trainable=config['embed_trainable'],
                                weights=[embedding_matrix]
                                # mask_zero=True
                                )

    q1_embed = embedding_layer(q1)
    q2_embed = embedding_layer(q2)  # bsz, 1, emb_dims
    q1_embed = BatchNormalization(axis=2)(q1_embed)
    q2_embed = BatchNormalization(axis=2)(q2_embed)
    q1_embed = SpatialDropout1D(config['spatial_dropout_rate'])(q1_embed)
    q2_embed = SpatialDropout1D(config['spatial_dropout_rate'])(q2_embed)

    highway_encoder = TimeDistributed(Highway(activation='relu'))
    q1_encoded = highway_encoder(q1_embed)
    q2_encoded = highway_encoder(q2_embed)

    # Attention
    q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded)

    # Compare
    q1_combined = Concatenate()(
        [q1_encoded, q2_aligned, interaction(q1_encoded, q2_aligned)])
    q2_combined = Concatenate()(
        [q2_encoded, q1_aligned, interaction(q2_encoded, q1_aligned)])
    compare_layers = [
        Dense(compare_dim, activation=activation),
        Dropout(compare_dropout),
        Dense(compare_dim, activation=activation),
        Dropout(compare_dropout),
    ]
    q1_compare = time_distributed(q1_combined, compare_layers)
    q2_compare = time_distributed(q2_combined, compare_layers)

    # Aggregate
    q1_rep = apply_multiple(q1_compare,
                            [GlobalAvgPool1D(), GlobalMaxPool1D()])
    q2_rep = apply_multiple(q2_compare,
                            [GlobalAvgPool1D(), GlobalMaxPool1D()])
    sub_rep = Lambda(lambda x: K.abs(x[0] - x[1]))([q1_rep, q2_rep])
    mul_rep = Lambda(lambda x: x[0] * x[1])([q1_rep, q2_rep])

    # Dense meta features
    # meta_densed = BatchNormalization()(meta_features)
    # meta_densed = Highway(activation='relu')(meta_densed)
    # meta_densed = Dropout(0.2)(meta_densed)

    # Classifier
    merged = Concatenate()([q1_rep, q2_rep, sub_rep, mul_rep])
    dense = BatchNormalization()(merged)
    dense = Dense(config['dense_dim'], activation='elu')(dense)
    dense = BatchNormalization()(dense)
    dense = Dropout(config['dense_dropout'])(dense)
    dense = Dense(config['dense_dim'], activation='elu')(dense)
    dense = BatchNormalization()(dense)
    dense = Dropout(config['dense_dropout'])(dense)
    predictions = Dense(1, activation='sigmoid')(dense)

    model = Model(inputs=[q1, q2], outputs=predictions)
    opt = optimizers.get(config['optimizer'])
    K.set_value(opt.lr, config['learning_rate'])
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=[f1])
    return model
def get_decomposable_attention(nb_words,
                               embedding_size,
                               embedding_matrix,
                               max_sequence_length,
                               out_size,
                               compare_dim=300,
                               compare_dropout=0.2,
                               dense_dim=256,
                               dense_dropout=0.2,
                               lr=1e-3,
                               activation='relu',
                               with_meta_features=False,
                               word_level=True):
    q1, q1_c, q2, q2_c, meta_features = get_input_layers()
    if word_level:
        q1_embedded, q2_embedded = get_word_embeddings(
            q1,
            q2,
            nb_words,
            embedding_size,
            embedding_matrix,
            max_sequence_length,
            trainable=False,
            embedding_dropout=model_config.EMBEDDING_DROPOUT)
    else:
        q1_embedded, q2_embedded = get_char_embeddings(
            q1_c,
            q2_c,
            max_sequence_length,
            model_config.CHAR_EMBEDDING_SIZE,
            feature_map_nums=model_config.CHAR_EMBEDDING_FEATURE_MAP_NUMS,
            window_sizes=model_config.CHAR_EMBEDDING_WINDOW_SIZES,
            embedding_dropout=model_config.EMBEDDING_DROPOUT)

    # Context encoder
    highway_encoder = TimeDistributed(Highway(activation='relu'))
    q1_encoded = highway_encoder(q1_embedded)
    q2_encoded = highway_encoder(q2_embedded)

    q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded)

    # Compare deep views
    q1_combined = Concatenate()(
        [q1_encoded, q2_aligned, interaction(q1_encoded, q2_aligned)])
    q2_combined = Concatenate()(
        [q2_encoded, q1_aligned, interaction(q2_encoded, q1_aligned)])
    compare_layers_d = [
        Dense(compare_dim, activation=activation),
        Dropout(compare_dropout),
        Dense(compare_dim, activation=activation),
        Dropout(compare_dropout),
    ]
    q1_compare = time_distributed(q1_combined, compare_layers_d)
    q2_compare = time_distributed(q2_combined, compare_layers_d)

    # Aggregate
    q1_rep = apply_multiple(q1_compare,
                            [GlobalAvgPool1D(), GlobalMaxPool1D()])
    q2_rep = apply_multiple(q2_compare,
                            [GlobalAvgPool1D(), GlobalMaxPool1D()])

    # Dense meta features
    meta_densed = BatchNormalization()(meta_features)
    meta_densed = Highway(activation='relu')(meta_densed)
    meta_densed = Dropout(0.2)(meta_densed)

    # Classifier
    q_diff = substract(q1_rep, q2_rep)
    q_multi = Multiply()([q1_rep, q2_rep])
    q_rep = Concatenate()([q1_rep, q2_rep])
    if with_meta_features:
        h_all = Concatenate()([q_diff, q_multi, q_rep, meta_densed])
    else:
        h_all = Concatenate()([q_diff, q_multi, q_rep])
    h_all = Dropout(0.5)(h_all)

    dense = Dense(dense_dim, activation=activation)(h_all)
    dense = BatchNormalization()(dense)
    dense = Dropout(dense_dropout)(dense)
    dense = Dense(dense_dim, activation=activation)(dense)
    dense = BatchNormalization()(dense)
    dense = Dropout(dense_dropout)(dense)
    out_ = Dense(1, activation='sigmoid')(dense)

    model = Model(inputs=[q1, q2, q1_c, q2_c, meta_features], outputs=out_)
    model.compile(optimizer=Adam(lr=lr),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model
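# --- Illustrative usage sketch (not part of the original source) ---
# Assumes the project helpers get_input_layers / get_word_embeddings and the
# model_config constants referenced above are available; sizes are placeholders and
# the embedding matrix is a zero stand-in for pretrained vectors.
def _demo_decomposable_attention():
    import numpy as np
    nb_words, embedding_size = 50000, 300
    embedding_matrix = np.zeros((nb_words, embedding_size), dtype='float32')
    model = get_decomposable_attention(nb_words,
                                       embedding_size,
                                       embedding_matrix,
                                       max_sequence_length=30,
                                       out_size=1,
                                       with_meta_features=False,
                                       word_level=True)
    model.summary()
    return model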
def get_CARNN(nb_words,
              embedding_size,
              embedding_matrix,
              max_sequence_length,
              out_size=1,
              compare_dim=model_config.CARNN_COMPARE_LAYER_HIDDEN_SIZE,
              compare_out_size=model_config.CARNN_COMPARE_LAYER_OUTSIZE,
              compare_dropout=model_config.COMPARE_LAYER_DROPOUT,
              meta_features_dropout=model_config.META_FEATURES_DROPOUT,
              rnn_size=model_config.CARNN_RNN_SIZE,
              rnn_dropout=model_config.CARNN_AGGREATION_DROPOUT,
              with_meta_features=False,
              word_level=True,
              lr=1e-3,
              activation='relu'):
    q1, q1_c, q2, q2_c, meta_features = get_input_layers()
    if word_level:
        q1_embedded, q2_embedded = get_word_embeddings(
            q1,
            q2,
            nb_words,
            embedding_size,
            embedding_matrix,
            max_sequence_length,
            trainable=False,
            embedding_dropout=model_config.EMBEDDING_DROPOUT)
        embedding_size = model_config.WORD_EMBEDDING_SIZE
    else:
        q1_embedded, q2_embedded = get_char_embeddings(
            q1_c,
            q2_c,
            max_sequence_length,
            model_config.CHAR_EMBEDDING_SIZE,
            feature_map_nums=model_config.CHAR_EMBEDDING_FEATURE_MAP_NUMS,
            window_sizes=model_config.CHAR_EMBEDDING_WINDOW_SIZES,
            embedding_dropout=model_config.EMBEDDING_DROPOUT)
        embedding_size = model_config.CHAR_CNN_OUT_SIZE

    self_attention = SelfAttention(d_model=embedding_size)

    # Context encoder
    highway_encoder = TimeDistributed(Highway(activation='selu'))
    q1_encoded = highway_encoder(q1_embedded)
    q2_encoded = highway_encoder(q2_embedded)
    s1_encoded = self_attention(q1, q1_encoded)
    s2_encoded = self_attention(q2, q2_encoded)

    # Attention
    q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded)

    # Compare deep views
    q1_combined1 = Concatenate()(
        [q1_encoded, q2_aligned, interaction(q1_encoded, q2_aligned)])
    q1_combined2 = Concatenate()(
        [q2_aligned, q1_encoded, interaction(q1_encoded, q2_aligned)])
    q2_combined1 = Concatenate()(
        [q2_encoded, q1_aligned, interaction(q2_encoded, q1_aligned)])
    q2_combined2 = Concatenate()(
        [q1_aligned, q2_encoded, interaction(q2_encoded, q1_aligned)])
    s1_combined1 = Concatenate()(
        [q1_encoded, s1_encoded, interaction(q1_encoded, s1_encoded)])
    s1_combined2 = Concatenate()(
        [s1_encoded, q1_encoded, interaction(q1_encoded, s1_encoded)])
    s2_combined1 = Concatenate()(
        [q2_encoded, s2_encoded, interaction(q2_encoded, s2_encoded)])
    s2_combined2 = Concatenate()(
        [s2_encoded, q2_encoded, interaction(q2_encoded, s2_encoded)])

    compare_layers_d = [
        Dense(compare_dim, activation=activation),
        Dropout(compare_dropout),
        Dense(compare_out_size, activation=activation),
        Dropout(compare_dropout),
    ]
    compare_layers_g = [
        Dense(compare_dim, activation=activation),
        Dropout(compare_dropout),
        Dense(compare_out_size, activation=activation),
        Dropout(compare_dropout),
    ]

    # NOTE these can be optimized
    q1_compare1 = time_distributed(q1_combined1, compare_layers_d)
    q1_compare2 = time_distributed(q1_combined2, compare_layers_d)
    q1_compare = Average()([q1_compare1, q1_compare2])
    q2_compare1 = time_distributed(q2_combined1, compare_layers_d)
    q2_compare2 = time_distributed(q2_combined2, compare_layers_d)
    q2_compare = Average()([q2_compare1, q2_compare2])
    s1_compare1 = time_distributed(s1_combined1, compare_layers_g)
    s1_compare2 = time_distributed(s1_combined2, compare_layers_g)
    s1_compare = Average()([s1_compare1, s1_compare2])
    s2_compare1 = time_distributed(s2_combined1, compare_layers_g)
    s2_compare2 = time_distributed(s2_combined2, compare_layers_g)
    s2_compare = Average()([s2_compare1, s2_compare2])

    # Aggregate
    q1_encoded = Concatenate()([q1_encoded, q1_compare, s1_compare])
    q2_encoded = Concatenate()([q2_encoded, q2_compare, s2_compare])
    aggreate_rnn = CuDNNGRU(rnn_size, return_sequences=True)
    q1_aggreated = aggreate_rnn(q1_encoded)
    q1_aggreated = Dropout(rnn_dropout)(q1_aggreated)
    q2_aggreated = aggreate_rnn(q2_encoded)
    q2_aggreated = Dropout(rnn_dropout)(q2_aggreated)

    # Pooling
    q1_rep = apply_multiple(q1_aggreated,
                            [GlobalAvgPool1D(), GlobalMaxPool1D()])
    q2_rep = apply_multiple(q2_aggreated,
                            [GlobalAvgPool1D(), GlobalMaxPool1D()])

    # Dense meta features
    meta_densed = Highway(activation='relu')(meta_features)
    meta_densed = Dropout(model_config.META_FEATURES_DROPOUT)(meta_densed)

    # Classifier
    q_diff = substract(q1_rep, q2_rep)
    q_multi = Multiply()([q1_rep, q2_rep])
    if with_meta_features:
        h_all1 = Concatenate()([q1_rep, q2_rep, q_diff, q_multi, meta_densed])
        h_all2 = Concatenate()([q2_rep, q1_rep, q_diff, q_multi, meta_densed])
    else:
        h_all1 = Concatenate()([q1_rep, q2_rep, q_diff, q_multi])
        h_all2 = Concatenate()([q2_rep, q1_rep, q_diff, q_multi])
    h_all1 = Dropout(0.5)(h_all1)
    h_all2 = Dropout(0.5)(h_all2)

    dense = Dense(256, activation='relu')
    h_all1 = dense(h_all1)
    h_all2 = dense(h_all2)
    h_all = Average()([h_all1, h_all2])

    out = Dense(out_size, activation='sigmoid')(h_all)
    model = Model(inputs=[q1, q2, q1_c, q2_c, meta_features], outputs=out)
    model.compile(optimizer=Adam(lr=lr),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model