def build_attention2_model(opts, vocab_size=0, maxnum=50, maxlen=50, embedd_dim=50,
                           embedding_weights=None, verbose=False, init_mean_value=None):
    N = maxnum
    L = maxlen
    logger = get_logger('Build attention pooling model')
    logger.info("Model parameters: max_sentnum = %d, max_sentlen = %d, embedding dim = %s, "
                "lstm_units = %s, drop rate = %s, l2 = %s" %
                (N, L, embedd_dim, opts.lstm_units, opts.dropout, opts.l2_value))

    model = Sequential()
    model.add(Embedding(output_dim=embedd_dim, input_dim=vocab_size, input_length=N * L,
                        weights=embedding_weights, name='x'))
    model.add(Dropout(opts.dropout, name='drop_x'))
    model.add(Reshape((N, L, embedd_dim), name='resh_W'))
    model.add(TimeDistributed(LSTM(opts.lstm_units, return_sequences=True), name='z'))
    model.add(TimeDistributed(Attention(name='att_z')))
    model.add(LSTM(opts.lstm_units, return_sequences=True, name='hz'))
    model.add(Attention(name='attent_hz'))
    model.add(Dense(output_dim=1, activation='sigmoid', name='output'))

    if opts.init_bias and init_mean_value:
        logger.info("Initialise output layer bias with log(y_mean/1-y_mean)")
        bias_value = (np.log(init_mean_value) - np.log(1 - init_mean_value)).astype(K.floatx())
        model.layers[-1].b.set_value(bias_value)

    if verbose:
        model.summary()

    start_time = time.time()
    model.compile(loss='mse', optimizer='rmsprop')
    total_time = time.time() - start_time
    logger.info("Model compiled in %.4f s" % total_time)

    return model
def build_attention2_model(opts, vocab_size=0, maxnum=50, maxlen=50, embedd_dim=50,
                           embedding_weights=None, verbose=False, init_mean_value=None):
    N = maxnum
    L = maxlen
    logger = get_logger('Build attention pooling model')
    logger.info("Model parameters: max_sentnum = %d, max_sentlen = %d, embedding dim = %s, "
                "lstm_units = %s, drop rate = %s, l2 = %s" %
                (N, L, embedd_dim, opts.lstm_units, opts.dropout, opts.l2_value))

    word_input = Input(shape=(N * L, ), dtype='int32', name='word_input')
    x = Embedding(output_dim=embedd_dim, input_dim=vocab_size, input_length=N * L,
                  weights=embedding_weights, name='x')(word_input)
    drop_x = Dropout(opts.dropout, name='drop_x')(x)
    resh_W = Reshape((N, L, embedd_dim), name='resh_W')(drop_x)

    z = TimeDistributed(LSTM(opts.lstm_units, return_sequences=True), name='z')(resh_W)
    att_z = TimeDistributed(Attention(name='att_z'))(z)
    hz = LSTM(opts.lstm_units, return_sequences=True, name='hz')(att_z)
    # avg_h = MeanOverTime(mask_zero=True, name='avg_h')(hz)
    # avg_hz = GlobalAveragePooling1D(name='avg_hz')(hz)
    attent_hz = Attention(name='attent_hz')(hz)
    y = Dense(output_dim=1, activation='sigmoid', name='output')(attent_hz)

    model = Model(input=word_input, output=y)

    if opts.init_bias and init_mean_value:
        logger.info("Initialise output layer bias with log(y_mean/1-y_mean)")
        bias_value = (np.log(init_mean_value) - np.log(1 - init_mean_value)).astype(K.floatx())
        model.layers[-1].b.set_value(bias_value)

    if verbose:
        model.summary()

    start_time = time.time()
    model.compile(loss='mse', optimizer='rmsprop')
    total_time = time.time() - start_time
    logger.info("Model compiled in %.4f s" % total_time)

    return model
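# Minimal usage sketch (illustrative, not part of the original source): build the attention
# pooling model with an argparse-style `opts` namespace and a random embedding matrix.
# The option names mirror the attributes read above; all values and the helper name are
# placeholders, not project defaults.
def _demo_attention2_model():
    from argparse import Namespace
    opts = Namespace(lstm_units=100, dropout=0.5, l2_value=0.0, init_bias=False)
    vocab_size, N, L, dim = 4000, 50, 50, 50
    emb = [np.random.uniform(-0.05, 0.05, (vocab_size, dim)).astype('float32')]
    model = build_attention2_model(opts, vocab_size=vocab_size, maxnum=N, maxlen=L,
                                   embedd_dim=dim, embedding_weights=emb, verbose=True)
    # Training input: int32 word ids of shape (batch, N * L); targets: scores scaled to [0, 1].
    return model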
def build_model(opts, vocab_size=0, maxnum=50, maxlen=50, embedd_dim=50,
                embedding_weights=None, verbose=False, init_mean_value=None):
    N = maxnum
    L = maxlen

    p = Input(shape=(4, 2048), dtype='float32', name='p')
    # img_vector = Dense(name='img_vector', units=128)(p)

    word_input = Input(shape=(N * L, ), dtype='int32', name='word_input')
    x = Embedding(output_dim=embedd_dim, input_dim=vocab_size, input_length=N * L,
                  weights=embedding_weights, mask_zero=True, trainable=False, name='x')(word_input)
    x_maskedout = ZeroMaskedEntries(name='x_maskedout')(x)
    drop_x = Dropout(opts.dropout, name='drop_x')(x_maskedout)
    resh_W = Reshape((N, L, embedd_dim), name='resh_W')(drop_x)

    cnn_e = TimeDistributed(Conv1D(opts.nbfilters, opts.filter1_len, border_mode='valid'),
                            name='cnn_e')(resh_W)
    att_cnn_e = TimeDistributed(Attention(), name='att_cnn_e')(cnn_e)
    lstm_e = LSTM(opts.lstm_units, return_sequences=True, name='lstm_e')(att_cnn_e)

    G = CoAttention(name='essay')([lstm_e, p])
    avg = GlobalAveragePooling1D()(G)
    final_vec_drop = Dropout(rate=0.5, name='final_vec_drop')(avg)

    if opts.l2_value:
        logger.info("Use l2 regularizers, l2 value = %s" % opts.l2_value)
        y = Dense(units=1, activation='sigmoid', name='output',
                  W_regularizer=l2(opts.l2_value))(final_vec_drop)
    else:
        y = Dense(units=1, activation='sigmoid', name='output')(final_vec_drop)

    model = Model(input=[word_input, p], output=y)

    if opts.init_bias and init_mean_value:
        logger.info("Initialise output layer bias with log(y_mean/1-y_mean)")
        bias_value = (np.log(init_mean_value) - np.log(1 - init_mean_value)).astype(K.floatx())
        model.layers[-1].b.set_value(bias_value)

    if verbose:
        model.summary()

    start_time = time.time()
    model.compile(loss='mse', optimizer='adam')
    total_time = time.time() - start_time
    logger.info("Model compiled in %.4f s" % total_time)

    return model
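# Usage sketch (illustrative, not part of the original source): this co-attention model takes
# two inputs, the flattened essay word ids of shape (batch, N * L) and pre-extracted image
# features of shape (batch, 4, 2048). Option values and the helper name are placeholders.
def _demo_coattention_model():
    from argparse import Namespace
    opts = Namespace(nbfilters=100, filter1_len=5, lstm_units=100, dropout=0.5,
                     l2_value=0.0, init_bias=False)
    vocab_size, N, L, dim = 4000, 50, 50, 50
    emb = [np.random.uniform(-0.05, 0.05, (vocab_size, dim)).astype('float32')]
    model = build_model(opts, vocab_size=vocab_size, maxnum=N, maxlen=L,
                        embedd_dim=dim, embedding_weights=emb)
    # model.fit([word_ids, image_features], scores, ...) once the data is prepared.
    return model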
def build_hrcnn_model(opts, vocab_size=0, char_vocabsize=0, maxnum=50, maxlen=50, maxcharlen=20,
                      embedd_dim=50, embedding_weights=None, verbose=False, init_mean_value=None):
    # LSTM stacked over CNN based on sentence level
    N = maxnum
    L = maxlen
    logger.info("Model parameters: max_sentnum = %d, max_sentlen = %d, embedding dim = %s, "
                "nbfilters = %s, filter1_len = %s, drop rate = %s" %
                (N, L, embedd_dim, opts.nbfilters, opts.filter1_len, opts.dropout))

    word_input = Input(shape=(N * L, ), dtype='int32', name='word_input')
    x = Embedding(output_dim=embedd_dim, input_dim=vocab_size, input_length=N * L,
                  weights=embedding_weights, mask_zero=True, name='x')(word_input)
    x_maskedout = ZeroMaskedEntries(name='x_maskedout')(x)
    drop_x = Dropout(opts.dropout, name='drop_x')(x_maskedout)
    resh_W = Reshape((N, L, embedd_dim), name='resh_W')(drop_x)

    # add char-based CNN, concatenating with word embedding to compose word representation
    if opts.use_char:
        char_input = Input(shape=(N * L * maxcharlen, ), dtype='int32', name='char_input')
        xc = Embedding(output_dim=opts.char_embedd_dim, input_dim=char_vocabsize,
                       input_length=N * L * maxcharlen, mask_zero=True, name='xc')(char_input)
        xc_masked = ZeroMaskedEntries(name='xc_masked')(xc)
        drop_xc = Dropout(opts.dropout, name='drop_xc')(xc_masked)
        res_xc = Reshape((N * L, maxcharlen, opts.char_embedd_dim), name='res_xc')(drop_xc)
        cnn_xc = TimeDistributed(Conv1D(opts.char_nbfilters, opts.filter2_len, padding='valid'),
                                 name='cnn_xc')(res_xc)
        max_xc = TimeDistributed(GlobalMaxPooling1D(), name='avg_xc')(cnn_xc)
        res_xc2 = Reshape((N, L, opts.char_nbfilters), name='res_xc2')(max_xc)

        w_repr = merge([resh_W, res_xc2], mode='concat', name='w_repr')
        zcnn = TimeDistributed(Conv1D(opts.nbfilters, opts.filter1_len, padding='valid'),
                               name='zcnn')(w_repr)
    else:
        zcnn = TimeDistributed(Conv1D(opts.nbfilters, opts.filter1_len, padding='valid'),
                               name='zcnn')(resh_W)

    # pooling mode
    if opts.mode == 'mot':
        logger.info("Use mean-over-time pooling on sentence")
        avg_zcnn = TimeDistributed(GlobalAveragePooling1D(), name='avg_zcnn')(zcnn)
    elif opts.mode == 'att':
        logger.info('Use attention-pooling on sentence')
        avg_zcnn = TimeDistributed(Attention(), name='avg_zcnn')(zcnn)
    elif opts.mode == 'merged':
        logger.info('Use mean-over-time and attention-pooling together on sentence')
        avg_zcnn1 = TimeDistributed(GlobalAveragePooling1D(), name='avg_zcnn1')(zcnn)
        avg_zcnn2 = TimeDistributed(Attention(), name='avg_zcnn2')(zcnn)
        avg_zcnn = merge([avg_zcnn1, avg_zcnn2], mode='concat', name='avg_zcnn')
    else:
        raise NotImplementedError

    hz_lstm = LSTM(opts.lstm_units, return_sequences=True, name='hz_lstm')(avg_zcnn)

    if opts.mode == 'mot':
        logger.info('Use mean-over-time pooling on text')
        avg_hz_lstm = GlobalAveragePooling1D(name='avg_hz_lstm')(hz_lstm)
    elif opts.mode == 'att':
        logger.info('Use attention-pooling on text')
        avg_hz_lstm = Attention(name='avg_hz_lstm')(hz_lstm)
    elif opts.mode == 'merged':
        logger.info('Use mean-over-time and attention-pooling together on text')
        avg_hz_lstm1 = GlobalAveragePooling1D(name='avg_hz_lstm1')(hz_lstm)
        avg_hz_lstm2 = Attention(name='avg_hz_lstm2')(hz_lstm)
        avg_hz_lstm = merge([avg_hz_lstm1, avg_hz_lstm2], mode='concat', name='avg_hz_lstm')
    else:
        raise NotImplementedError

    if opts.l2_value:
        logger.info("Use l2 regularizers, l2 value = %s" % opts.l2_value)
        y = Dense(units=1, activation='sigmoid', name='output',
                  W_regularizer=l2(opts.l2_value))(avg_hz_lstm)
    else:
        y = Dense(units=1, activation='sigmoid', name='output')(avg_hz_lstm)

    if opts.use_char:
        model = Model(inputs=[word_input, char_input], outputs=y)
    else:
        model = Model(inputs=word_input, outputs=y)

    if opts.init_bias and init_mean_value:
        logger.info("Initialise output layer bias with log(y_mean/1-y_mean)")
        bias_value = (np.log(init_mean_value) - np.log(1 - init_mean_value)).astype(K.floatx())
        model.layers[-1].b.set_value(bias_value)

    if verbose:
        model.summary()

    start_time = time.time()
    model.compile(loss='mse', optimizer='rmsprop')
    total_time = time.time() - start_time
    logger.info("Model compiled in %.4f s" % total_time)

    return model
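# Usage sketch (illustrative, not part of the original source): the hierarchical CNN-LSTM model
# takes word ids of shape (batch, N * L) and, when opts.use_char is set, additionally char ids
# of shape (batch, N * L * maxcharlen). Option values and the helper name are placeholders;
# the extra mode1/mode2/use_mask fields are included only so the same namespace also fits the
# later build_hrcnn_model variant defined further down.
def _demo_hrcnn_model():
    from argparse import Namespace
    opts = Namespace(nbfilters=100, filter1_len=5, lstm_units=100, dropout=0.5,
                     l2_value=0.001, init_bias=False, mode='att', use_char=0,
                     char_embedd_dim=30, char_nbfilters=20, filter2_len=3,
                     mode1='att', mode2='att', use_mask=1)
    vocab_size, N, L, dim = 4000, 50, 50, 50
    emb = [np.random.uniform(-0.05, 0.05, (vocab_size, dim)).astype('float32')]
    return build_hrcnn_model(opts, vocab_size=vocab_size, maxnum=N, maxlen=L,
                             embedd_dim=dim, embedding_weights=emb, verbose=True)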
def build_model_fusion(opts, vocab_size=0, maxnum=50, maxlen=50, embedd_dim=50,
                       embedding_weights=None, verbose=False, init_mean_value=None):
    # p_input1 = Input(shape=(256, 256, 3), dtype='float32', name='p_input1')
    # p_input2 = Input(shape=(256, 256, 3), dtype='float32', name='p_input2')
    # p_input3 = Input(shape=(256, 256, 3), dtype='float32', name='p_input3')
    # p_input4 = Input(shape=(256, 256, 3), dtype='float32', name='p_input4')
    p = Input(shape=(256, 256, 3), dtype='float32', name='p')

    cnn_model = cnn()
    img = cnn_model(p)
    img = Reshape([6 * 6, 100])(img)
    # img1 = cnn_model(p_input1)
    # img2 = cnn_model(p_input2)
    # img3 = cnn_model(p_input3)
    # img4 = cnn_model(p_input4)
    # img1 = GlobalMaxPooling2D()(img1)
    # img2 = GlobalMaxPooling2D()(img2)
    # img3 = GlobalMaxPooling2D()(img3)
    # img4 = GlobalMaxPooling2D()(img4)
    # img = concatenate([img1, img2, img3, img4], axis=1)
    # img = Reshape((4, 100))(img)

    N = maxnum
    L = maxlen
    word_input = Input(shape=(N * L, ), dtype='int32', name='word_input')
    x = Embedding(output_dim=embedd_dim, input_dim=vocab_size, input_length=N * L,
                  weights=embedding_weights, mask_zero=True, name='x')(word_input)
    x_maskedout = ZeroMaskedEntries(name='x_maskedout')(x)
    drop_x = Dropout(opts.dropout, name='drop_x')(x_maskedout)
    resh_W = Reshape((N, L, embedd_dim), name='resh_W')(drop_x)

    cnn_e = TimeDistributed(Conv1D(opts.nbfilters, opts.filter1_len, border_mode='valid',
                                   activation='tanh'), name='cnn_e')(resh_W)
    cnn_e = Dropout(rate=0.5)(cnn_e)
    att_cnn_e = TimeDistributed(Attention(), name='att_cnn_e')(cnn_e)
    att_cnn_e = Dropout(rate=0.5)(att_cnn_e)
    lstm_e = LSTM(opts.lstm_units, return_sequences=True, name='lstm_e')(att_cnn_e)
    lstm_e = Dropout(rate=0.5)(lstm_e)

    G = CoAttention(name='essay')([lstm_e, img])
    avg = GlobalAveragePooling1D()(G)
    final_vec_drop = Dropout(rate=0.5, name='final_vec_drop')(avg)

    if opts.l2_value:
        logger.info("Use l2 regularizers, l2 value = %s" % opts.l2_value)
        y = Dense(units=1, activation='sigmoid', name='output',
                  W_regularizer=l2(opts.l2_value))(final_vec_drop)
    else:
        y = Dense(units=1, activation='sigmoid', name='output')(final_vec_drop)

    # model = Model(input=[word_input, p_input1, p_input2, p_input3, p_input4], output=y)
    model = Model(input=[word_input, p], output=y)

    if opts.init_bias and init_mean_value:
        logger.info("Initialise output layer bias with log(y_mean/1-y_mean)")
        bias_value = (np.log(init_mean_value) - np.log(1 - init_mean_value)).astype(K.floatx())
        model.layers[-1].b.set_value(bias_value)

    if verbose:
        model.summary()

    start_time = time.time()
    model.compile(loss='mse', optimizer='adam')
    total_time = time.time() - start_time
    logger.info("Model compiled in %.4f s" % total_time)

    return model
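# Usage sketch (illustrative, not part of the original source): the fusion model takes the
# flattened word ids plus one raw 256x256x3 image per essay, and relies on the helper `cnn()`
# defined elsewhere in this codebase to produce a 6x6x100 feature map. Values are placeholders.
def _demo_fusion_model():
    from argparse import Namespace
    opts = Namespace(nbfilters=100, filter1_len=5, lstm_units=100, dropout=0.5,
                     l2_value=0.0, init_bias=False)
    vocab_size, N, L, dim = 4000, 50, 50, 50
    emb = [np.random.uniform(-0.05, 0.05, (vocab_size, dim)).astype('float32')]
    model = build_model_fusion(opts, vocab_size=vocab_size, maxnum=N, maxlen=L,
                               embedd_dim=dim, embedding_weights=emb)
    # model.fit([word_ids, images], scores, ...) with images of shape (batch, 256, 256, 3).
    return model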
X_train = E.reshape(E.shape[0], 68, 178 * 50, 1).astype('float32')
print(np.shape(X_train))
labeled_data = zip(E, resolved_scores)

from keras.models import Sequential, Model
from keras.layers import (Bidirectional, Conv1D, Conv2D, Input, Flatten, MaxPooling1D,
                          MaxPooling2D, TimeDistributed, LSTM, Dense,
                          GlobalAveragePooling1D, GlobalAveragePooling2D)
from softattention import Attention

# Frames, flattened height*width, channel of the image-like essay tensor.
cnn_input = Input(shape=(68, 178 * 50, 1))
conv1 = TimeDistributed(Conv1D(64, 3, activation='relu'))(cnn_input)
#conv2 = TimeDistributed(Conv2D(64, (3,3), activation='relu'))(conv1)
pool1 = TimeDistributed(MaxPooling1D(pool_size=4))(conv1)
att = TimeDistributed(Attention())(pool1)
flat = TimeDistributed(Flatten())(att)
#cnn_op = TimeDistributed(Dense(output_dim=3))(flat)
lstm = Bidirectional(LSTM(100, return_sequences=True, activation='tanh'))(flat)
bb = Flatten()(lstm)
op = Dense(1, activation='sigmoid')(bb)

fun_model = Model(inputs=[cnn_input], outputs=op)
fun_model.compile(loss='mse', optimizer='rmsprop')

y_train = resolved_scores
print(y_train)
print(np.shape(y_train))

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
X_train = E.reshape(E.shape[0], 68, 178, 50, 1).astype('float32')
print(np.shape(X_train))
labeled_data = zip(E, resolved_scores)

from keras.models import Sequential, Model
from keras.layers import (Bidirectional, Conv1D, Conv2D, Input, Flatten, MaxPooling2D,
                          TimeDistributed, LSTM, Dense,
                          GlobalAveragePooling1D, GlobalAveragePooling2D)
from softattention import Attention

# Frames, height, width, channel of the image-like essay tensor.
cnn_input = Input(shape=(68, 178, 50, 1))
conv1 = TimeDistributed(Conv2D(100, (3, 3), activation='relu'))(cnn_input)
#conv2 = TimeDistributed(Conv2D(64, (3,3), activation='relu'))(conv1)
pool1 = TimeDistributed(TimeDistributed(Attention()))(conv1)
flat = TimeDistributed(Flatten())(pool1)
#cnn_op = TimeDistributed(Dense(output_dim=3))(flat)
lstm = Bidirectional(LSTM(128, return_sequences=True, activation='tanh'))(flat)
bb = Flatten()(lstm)
op = Dense(1, activation='sigmoid')(bb)

fun_model = Model(inputs=[cnn_input], outputs=op)

from keras.utils.np_utils import to_categorical
#model = Sequential()
#model.add(Dropout(0.5,input_shape=(178,50,1)))
#model.add(TimeDistributed(Conv2D(64, kernel_size=13, activation='relu')))
#model.add(TimeDistributed(GlobalAveragePooling1D()))
#model.add(LSTM())
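# Usage sketch (assumption, not in the original source): compile the functional model above and
# fit it on a simple train/validation split. Batch size, epochs, and the helper name are
# illustrative placeholders; `X_train` and `resolved_scores` come from the earlier cells.
def _demo_train_fun_model():
    from sklearn.model_selection import train_test_split
    fun_model.compile(loss='mse', optimizer='rmsprop')
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train, np.asarray(resolved_scores), test_size=0.2, random_state=42)
    fun_model.fit(X_tr, y_tr, validation_data=(X_val, y_val), batch_size=8, epochs=5)
    return fun_model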
def build_shrcnn_model(opts, vocab_size=0, char_vocabsize=0, maxnum=50, maxlen=50, maxcnum=50,
                       maxclen=50, maxcharlen=20, embedd_dim=50, embedding_weights=None,
                       verbose=False, init_mean_value=None):
    # LSTM stacked over CNN based on sentence level
    N = maxnum
    L = maxlen
    cN = maxcnum
    cL = maxclen
    logger.info("Model parameters: max_sentnum = %d, max_sentlen = %d, embedding dim = %s, "
                "nbfilters = %s, filter1_len = %s, drop rate = %s" %
                (N, L, embedd_dim, opts.nbfilters, opts.filter1_len, opts.dropout))

    word_input = Input(shape=(N * L, ), dtype='int32', name='word_input')
    context_input = Input(shape=(cN * cL, ), dtype='int32', name='context_input')

    emb = Embedding(output_dim=embedd_dim, input_dim=vocab_size,
                    weights=embedding_weights, mask_zero=True, name='cx')
    cx = emb(context_input)
    cx_maskedout = ZeroMaskedEntries(name='cx_maskedout')(cx)
    drop_cx = Dropout(opts.dropout, name='drop_cx')(cx_maskedout)
    resh_C = Reshape((cN, cL, embedd_dim), name='resh_C')(drop_cx)
    czcnn = TimeDistributed(Conv1D(opts.nbfilters, opts.filter1_len, padding='valid'),
                            name='czcnn')(resh_C)

    x = emb(word_input)
    x_maskedout = ZeroMaskedEntries(name='x_maskedout')(x)
    drop_x = Dropout(opts.dropout, name='drop_x')(x_maskedout)
    resh_W = Reshape((N, L, embedd_dim), name='resh_W')(drop_x)

    # add char-based CNN, concatenating with word embedding to compose word representation
    zcnn = TimeDistributed(Conv1D(opts.nbfilters, opts.filter1_len, padding='valid'),
                           name='zcnn')(resh_W)

    '''
    encoded_essay = Reshape((zcnn.shape[1].value*zcnn.shape[2].value, opts.nbfilters))(zcnn)
    encoded_context = Reshape((czcnn.shape[1].value*czcnn.shape[2].value, opts.nbfilters))(czcnn)

    # bidaf
    # Now we compute a similarity between the passage words and the question words, and
    # normalize the matrix in a couple of different ways for input into some more layers.
    matrix_attention_layer = MatrixAttention(name='essay_context_similarity')
    # matrix_attention_layer = LinearMatrixAttention(name='passage_question_similarity')
    # Shape: (batch_size, num_passage_words, num_question_words)
    essay_context_similarity = matrix_attention_layer([encoded_essay, encoded_context])

    # Shape: (batch_size, num_passage_words, num_question_words), normalized over question
    # words for each passage word.
    essay_context_attention = MaskedSoftmax()(essay_context_similarity)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    weighted_sum_layer = WeightedSum(name="essay_context_vectors", use_masking=False)
    essay_context_vectors = weighted_sum_layer([encoded_context, essay_context_attention])

    # Min's paper finds, for each document word, the most similar question word to it, and
    # computes a single attention over the whole document using these max similarities.
    # Shape: (batch_size, num_passage_words)
    context_essay_similarity = Max(axis=-1)(essay_context_similarity)
    # Shape: (batch_size, num_passage_words)
    context_essay_attention = MaskedSoftmax()(context_essay_similarity)
    # Shape: (batch_size, embedding_dim * 2)
    weighted_sum_layer = WeightedSum(name="question_passage_vector", use_masking=False)
    context_essay_vector = weighted_sum_layer([encoded_essay, context_essay_attention])

    # Then he repeats this question/passage vector for every word in the passage, and uses it
    # as an additional input to the hidden layers above.
    repeat_layer = RepeatLike(axis=1, copy_from_axis=1)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    tiled_context_essay_vector = repeat_layer([context_essay_vector, encoded_essay])

    complex_concat_layer = ComplexConcat(combination='1*2,1*3', name='final_merged_passage')
    final_merged_passage = complex_concat_layer([encoded_essay, essay_context_vectors,
                                                 tiled_context_essay_vector])

    complex_concat_layer = ComplexConcat(combination='1*2', name='final_merged_passage')
    final_merged_passage = complex_concat_layer([encoded_essay, essay_context_vectors])

    mcnn = Reshape((zcnn.shape[1].value, zcnn.shape[2].value, opts.nbfilters),
                   name='mcnn')(final_merged_passage)
    '''

    # pooling mode
    if opts.mode == 'mot':
        logger.info("Use mean-over-time pooling on sentence")
        avg_zcnn = TimeDistributed(GlobalAveragePooling1D(), name='avg_zcnn')(zcnn)
    elif opts.mode == 'att':
        logger.info('Use attention-pooling on sentence')
        avg_zcnn = TimeDistributed(Attention(), name='avg_zcnn')(zcnn)
        avg_czcnn = TimeDistributed(Attention(), name='avg_czcnn')(czcnn)
    elif opts.mode == 'merged':
        logger.info('Use mean-over-time and attention-pooling together on sentence')
        avg_zcnn1 = TimeDistributed(GlobalAveragePooling1D(), name='avg_zcnn1')(zcnn)
        avg_zcnn2 = TimeDistributed(Attention(), name='avg_zcnn2')(zcnn)
        avg_zcnn = merge([avg_zcnn1, avg_zcnn2], mode='concat', name='avg_zcnn')
    else:
        raise NotImplementedError

    hz_lstm = LSTM(opts.lstm_units, return_sequences=True, name='hz_lstm')(avg_zcnn)
    # Note: avg_czcnn (and hence chz_lstm) is only defined in the 'att' branch above, so this
    # model is expected to be built with opts.mode == 'att'.
    chz_lstm = LSTM(opts.lstm_units, return_sequences=True, name='chz_lstm')(avg_czcnn)

    if opts.mode == 'mot':
        logger.info('Use mean-over-time pooling on text')
        avg_hz_lstm = GlobalAveragePooling1D(name='avg_hz_lstm')(hz_lstm)
    elif opts.mode == 'att':
        logger.info('Use co-attention on text')
        # PART 2:
        # Now we compute a similarity between the passage words and the question words, and
        # normalize the matrix in a couple of different ways for input into some more layers.
        matrix_attention_layer = MatrixAttention(name='essay_context_similarity')
        # Shape: (batch_size, num_passage_words, num_question_words)
        essay_context_similarity = matrix_attention_layer([hz_lstm, chz_lstm])

        # Shape: (batch_size, num_passage_words, num_question_words), normalized over question
        # words for each passage word.
        essay_context_attention = MaskedSoftmax()(essay_context_similarity)
        weighted_sum_layer = WeightedSum(name="essay_context_vectors", use_masking=False)
        # Shape: (batch_size, num_passage_words, embedding_dim * 2)
        weighted_hz_lstm = weighted_sum_layer([chz_lstm, essay_context_attention])

        # Min's paper finds, for each document word, the most similar question word to it, and
        # computes a single attention over the whole document using these max similarities.
        # Shape: (batch_size, num_passage_words)
        context_essay_similarity = Max(axis=-1)(essay_context_similarity)
        # Shape: (batch_size, num_passage_words)
        context_essay_attention = MaskedSoftmax()(context_essay_similarity)
        # Shape: (batch_size, embedding_dim * 2)
        weighted_sum_layer = WeightedSum(name="context_essay_vector", use_masking=False)
        context_essay_vector = weighted_sum_layer([hz_lstm, context_essay_attention])

        # Then he repeats this question/passage vector for every word in the passage, and uses it
        # as an additional input to the hidden layers above.
        repeat_layer = RepeatLike(axis=1, copy_from_axis=1)
        # Shape: (batch_size, num_passage_words, embedding_dim * 2)
        tiled_context_essay_vector = repeat_layer([context_essay_vector, hz_lstm])

        complex_concat_layer = ComplexConcat(combination='1,2,1*2,1*3',
                                             name='final_merged_passage')
        final_merged_passage = complex_concat_layer([hz_lstm, weighted_hz_lstm,
                                                     tiled_context_essay_vector])

        avg_hz_lstm = LSTM(opts.lstm_units, return_sequences=False,
                           name='avg_hz_lstm')(final_merged_passage)
        # avg_hz_lstm = CoAttentionWithoutBi(name='avg_hz_lstm')([hz_lstm, weighted_hz_lstm])
        # avg_hz_lstm = Attention(name='avg_hz_lstm')(hz_lstm)
    elif opts.mode == 'merged':
        logger.info('Use mean-over-time and attention-pooling together on text')
        avg_hz_lstm1 = GlobalAveragePooling1D(name='avg_hz_lstm1')(hz_lstm)
        avg_hz_lstm2 = Attention(name='avg_hz_lstm2')(hz_lstm)
        avg_hz_lstm = merge([avg_hz_lstm1, avg_hz_lstm2], mode='concat', name='avg_hz_lstm')
    else:
        raise NotImplementedError

    if opts.l2_value:
        logger.info("Use l2 regularizers, l2 value = %s" % opts.l2_value)
        y = Dense(units=1, activation='sigmoid', name='output',
                  W_regularizer=l2(opts.l2_value))(avg_hz_lstm)
    else:
        y = Dense(units=1, activation='sigmoid', name='output')(avg_hz_lstm)

    model = Model(inputs=[word_input, context_input], outputs=y)

    if opts.init_bias and init_mean_value:
        logger.info("Initialise output layer bias with log(y_mean/1-y_mean)")
        bias_value = (np.log(init_mean_value) - np.log(1 - init_mean_value)).astype(K.floatx())
        model.layers[-1].b.set_value(bias_value)

    if verbose:
        model.summary()

    start_time = time.time()
    model.compile(loss='mse', optimizer='rmsprop')
    total_time = time.time() - start_time
    logger.info("Model compiled in %.4f s" % total_time)

    return model
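# Usage sketch (illustrative, not part of the original source): the source-aware model takes
# essay word ids of shape (batch, N * L) and context (source article) word ids of shape
# (batch, cN * cL); as written above it expects opts.mode == 'att' for the co-attention path.
# Option values and the helper name are placeholders.
def _demo_shrcnn_model():
    from argparse import Namespace
    opts = Namespace(nbfilters=100, filter1_len=5, lstm_units=100, dropout=0.5,
                     l2_value=0.001, init_bias=False, mode='att')
    vocab_size, N, L, cN, cL, dim = 4000, 50, 50, 30, 50, 50
    emb = [np.random.uniform(-0.05, 0.05, (vocab_size, dim)).astype('float32')]
    return build_shrcnn_model(opts, vocab_size=vocab_size, maxnum=N, maxlen=L,
                              maxcnum=cN, maxclen=cL, embedd_dim=dim,
                              embedding_weights=emb, verbose=True)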
def build_hrcnn_model(opts, vocab_size=0, maxnum=50, maxlen=50, embedd_dim=50,
                      embedding_weights=None, verbose=False, init_mean_value=None):
    # LSTM stacked over CNN based on sentence level
    N = maxnum
    L = maxlen
    print(opts)
    logger.info("Model parameters: max_sentnum = %d, max_sentlen = %d, embedding dim = %s, "
                "nbfilters = %s, filter1_len = %s, drop rate = %s" %
                (N, L, embedd_dim, opts.nbfilters, opts.filter1_len, opts.dropout))

    word_input = Input(shape=(N * L, ), dtype='int32', name='word_input')

    # embedding layer
    if opts.use_mask == 0:
        x = Embedding(output_dim=embedd_dim, input_dim=vocab_size, input_length=N * L,
                      weights=embedding_weights, mask_zero=False, name='x')(word_input)
        x_maskedout = x
    elif opts.use_mask == 1:
        x = Embedding(output_dim=embedd_dim, input_dim=vocab_size, input_length=N * L,
                      weights=embedding_weights, mask_zero=True, name='x')(word_input)
        x_maskedout = ZeroMaskedEntries(name='x_maskedout')(x)

    # drop out
    drop_x = Dropout(opts.dropout, name='drop_x')(x_maskedout)
    # reshape
    resh_W = Reshape((N, L, embedd_dim), name='resh_W')(drop_x)

    # CNN layer
    zcnn = TimeDistributed(Convolution1D(opts.nbfilters, opts.filter1_len, border_mode='valid'),
                           name='zcnn')(resh_W)

    # pooling mode1 on CNN
    if opts.mode1 == 'mot':
        logger.info("Use mean-over-time pooling on sentence")
        avg_zcnn = TimeDistributed(GlobalAveragePooling1D(), name='avg_zcnn')(zcnn)
    elif opts.mode1 == 'att':
        logger.info('Use attention-pooling on sentence')
        avg_zcnn = TimeDistributed(Attention(), name='avg_zcnn')(zcnn)
    elif opts.mode1 == 'merged':
        logger.info('Use mean-over-time and attention-pooling together on sentence')
        avg_zcnn1 = TimeDistributed(GlobalAveragePooling1D(),
                                    input_shape=(K.int_shape(zcnn)[2], K.int_shape(zcnn)[3]),
                                    name='avg_zcnn1')(zcnn)
        avg_zcnn2 = TimeDistributed(Attention(), name='avg_zcnn2')(zcnn)
        avg_zcnn = merge([avg_zcnn1, avg_zcnn2], mode='concat', name='avg_zcnn')
    else:
        raise NotImplementedError

    hz_lstm = LSTM(opts.lstm_units, return_sequences=True, name='hz_lstm')(avg_zcnn)

    # pooling mode2 on LSTM
    if opts.mode2 == 'mot':
        logger.info('Use mean-over-time pooling on text')
        avg_hz_lstm = GlobalAveragePooling1D(name='avg_hz_lstm')(hz_lstm)
    elif opts.mode2 == 'att':
        logger.info('Use attention-pooling on text')
        avg_hz_lstm = Attention(name='avg_hz_lstm')(hz_lstm)
    elif opts.mode2 == 'merged':
        logger.info('Use mean-over-time and attention-pooling together on text')
        avg_hz_lstm1 = GlobalAveragePooling1D(name='avg_hz_lstm1')(hz_lstm)
        avg_hz_lstm2 = Attention(name='avg_hz_lstm2')(hz_lstm)
        avg_hz_lstm = merge([avg_hz_lstm1, avg_hz_lstm2], mode='concat', name='avg_hz_lstm')
    else:
        raise NotImplementedError

    # l2 regularization
    if opts.l2_value:
        logger.info("Use l2 regularizers, l2 value = %s" % opts.l2_value)
        y = Dense(output_dim=1, activation='sigmoid', name='output',
                  kernel_regularizer=regularizers.l2(opts.l2_value))(avg_hz_lstm)
    else:
        y = Dense(output_dim=1, activation='sigmoid', name='output')(avg_hz_lstm)

    model = Model(input=word_input, output=y)

    if opts.init_bias and init_mean_value:
        logger.info("Initialise output layer bias with log(y_mean/1-y_mean)")
        bias_value = (np.log(init_mean_value) - np.log(1 - init_mean_value)).astype(K.floatx())
        model.layers[-1].b.set_value(bias_value)

    if verbose:
        model.summary()

    start_time = time.time()
    model.compile(loss='mse', optimizer='rmsprop')
    total_time = time.time() - start_time
    logger.info("Model compiled in %.4f s" % total_time)

    return model