def test_call_handles_masking_properly(self):
    sentence_length = 4
    vocab_size = 4
    embedding_dim = 3
    embedding_weights = numpy.asarray([[0, 0, 0], [1, 1, 1], [-1, 0, 1], [-1, -1, 0]])
    embedding = Embedding(vocab_size, embedding_dim,
                          weights=[embedding_weights], mask_zero=True)
    sentence_1_input = Input(shape=(sentence_length,), dtype='int32')
    sentence_2_input = Input(shape=(sentence_length,), dtype='int32')
    sentence_1_embedding = embedding(sentence_1_input)
    sentence_2_embedding = embedding(sentence_2_input)
    attention_layer = MatrixAttention()
    attention = attention_layer([sentence_1_embedding, sentence_2_embedding])
    attention_mask = OutputMask()(attention)
    model = Model(inputs=[sentence_1_input, sentence_2_input],
                  outputs=[attention, attention_mask])
    sentence_1_tensor = numpy.asarray([[0, 0, 1, 3]])
    sentence_2_tensor = numpy.asarray([[0, 1, 0, 2]])
    attention_tensor, attention_mask = model.predict([sentence_1_tensor, sentence_2_tensor])
    expected_attention = numpy.asarray([[[0, 0, 0, 0],
                                         [0, 0, 0, 0],
                                         [0, 3, 0, 0],
                                         [0, -2, 0, 1]]])
    expected_mask = numpy.asarray([[[0, 0, 0, 0],
                                    [0, 0, 0, 0],
                                    [0, 1, 0, 1],
                                    [0, 1, 0, 1]]])
    assert_allclose(attention_tensor, expected_attention)
    assert_allclose(attention_mask, expected_mask)
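# A small illustration, not part of the original test: with mask_zero=True the embedding
# masks index 0, and the attention mask asserted above is just the per-batch outer AND of
# the two sentence masks. The helper below reproduces expected_mask with plain numpy.
def _expected_attention_mask(sentence_1_ids, sentence_2_ids):
    mask_1 = (sentence_1_ids != 0).astype('int32')  # (batch_size, sentence_1_length)
    mask_2 = (sentence_2_ids != 0).astype('int32')  # (batch_size, sentence_2_length)
    return numpy.einsum('bi,bj->bij', mask_1, mask_2)

# _expected_attention_mask(numpy.asarray([[0, 0, 1, 3]]), numpy.asarray([[0, 1, 0, 2]]))
# -> [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 1, 0, 1], [0, 1, 0, 1]]]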
def test_model_loads_correctly(self):
    sentence_1_length = 2
    sentence_2_length = 3
    embedding_dim = 3
    sentence_1_embedding = Input(shape=(sentence_1_length, embedding_dim), dtype='float32')
    sentence_2_embedding = Input(shape=(sentence_2_length, embedding_dim), dtype='float32')
    similarity_function_params = {'type': 'linear', 'combination': 'x,y,x*y'}
    attention_layer = MatrixAttention(similarity_function=similarity_function_params)
    attention = attention_layer([sentence_1_embedding, sentence_2_embedding])
    attention = Dense(2)(attention)
    model = Model(inputs=[sentence_1_embedding, sentence_2_embedding], outputs=[attention])
    sentence_1_tensor = numpy.asarray([[[1, 1, 1], [-1, 0, 1]]])
    sentence_2_tensor = numpy.asarray([[[1, 1, 1], [-1, 0, 1], [-1, -1, -1]]])
    model_file = self.TEST_DIR + "model.tmp"
    before_loading = model.predict([sentence_1_tensor, sentence_2_tensor])
    model.save(model_file)
    model = load_model(model_file,  # pylint: disable=redefined-variable-type
                       custom_objects={'MatrixAttention': MatrixAttention})
    after_loading = model.predict([sentence_1_tensor, sentence_2_tensor])
    assert_allclose(before_loading, after_loading)
def test_call_works_on_simple_input(self):
    sentence_1_length = 2
    sentence_2_length = 3
    embedding_dim = 3
    sentence_1_embedding = Input(shape=(sentence_1_length, embedding_dim), dtype='float32')
    sentence_2_embedding = Input(shape=(sentence_2_length, embedding_dim), dtype='float32')
    attention_layer = MatrixAttention()
    attention = attention_layer([sentence_1_embedding, sentence_2_embedding])
    model = Model(inputs=[sentence_1_embedding, sentence_2_embedding], outputs=[attention])
    sentence_1_tensor = numpy.asarray([[[1, 1, 1], [-1, 0, 1]]])
    sentence_2_tensor = numpy.asarray([[[1, 1, 1], [-1, 0, 1], [-1, -1, -1]]])
    attention_tensor = model.predict([sentence_1_tensor, sentence_2_tensor])
    assert attention_tensor.shape == (1, sentence_1_length, sentence_2_length)
    assert_allclose(attention_tensor, [[[3, 0, -3], [0, 2, 0]]])
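# A small illustration, not part of the original tests: with its default similarity function,
# MatrixAttention reduces to a batched dot product between every pair of word vectors, so the
# expected values above can be reproduced directly with numpy.
def _expected_dot_product_attention(sentence_1, sentence_2):
    # sentence_1: (batch_size, length_1, dim), sentence_2: (batch_size, length_2, dim)
    return numpy.einsum('bid,bjd->bij', sentence_1, sentence_2)

# _expected_dot_product_attention(sentence_1_tensor, sentence_2_tensor)
# -> [[[3, 0, -3], [0, 2, 0]]], matching the assertion above.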
def run_biDAF():
    # Create word-level embeddings for both the question and the passage (news) text.
    question_input = Input(shape=(max_len_Q,), dtype='int32', name="question_input")
    passage_input = Input(shape=(max_len_P,), dtype='int32', name="passage_input")
    # Input holding the candidate-option token indices, used later to map the predicted
    # probabilities onto the options only.
    options_input = Input(shape=(max_num_options,), dtype='int32', name="options_input")
    embedding_layer_P = Embedding(em_len, emb_dim,
                                  weights=[embeddings],
                                  input_length=max_len_P,
                                  batch_input_shape=(batch_size, max_len_P),
                                  trainable=False)
    embedding_layer_Q = Embedding(em_len, emb_dim,
                                  weights=[embeddings],
                                  input_length=max_len_Q,
                                  batch_input_shape=(batch_size, max_len_Q),
                                  trainable=False)
    passage_embedding = embedding_layer_P(passage_input)
    question_embedding = embedding_layer_Q(question_input)
    bi_lstm_Q = Bidirectional(LSTM(256, return_sequences=True),
                              batch_input_shape=(batch_size, max_len_Q, emb_dim))(question_embedding)
    bi_lstm_Q1 = Bidirectional(LSTM(256),
                               batch_input_shape=(batch_size, max_len_Q, emb_dim))(question_embedding)  # NOTE: not used below
    bi_lstm_P = Bidirectional(LSTM(256, return_sequences=True),
                              batch_input_shape=(batch_size, max_len_P, emb_dim))(passage_embedding)

    ##### Create Attention Layer
    similarity_function_params = {'type': 'linear', 'combination': 'x,y,x*y'}
    matrix_attention_layer = MatrixAttention(similarity_function=similarity_function_params,
                                             name='matrix_attention_layer')
    # Shape: (batch_size, num_passage_words, num_question_words)
    passage_question_similarity = matrix_attention_layer([bi_lstm_P, bi_lstm_Q])
    # Shape: (batch_size, num_passage_words, num_question_words), normalized over question
    # words for each passage word.
    passage_question_attention = MaskedSoftmax()(passage_question_similarity)
    weighted_sum_layer = WeightedSum(name="passage_question_vectors", use_masking=False)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    passage_question_vectors = weighted_sum_layer([bi_lstm_Q, passage_question_attention])  # sum at(U~:t)=1

    ## Query - Passage: 2d * max_len_Q
    # Find the most important context words by taking max() over passage_question_similarity.
    question_passage_similarity = Max(axis=-1)(passage_question_similarity)  # Shape: (batch_size, num_passage_words)
    # Softmax for b (column-wise max of the similarity matrix, normalized).
    question_passage_attention = MaskedSoftmax()(question_passage_similarity)  # Shape: (batch_size, num_passage_words)
    weighted_sum_layer = WeightedSum(name="question_passage_vector", use_masking=False)
    # h~ = sum(weighted_bt * H:t), 2 * embed_dim
    question_passage_vector = weighted_sum_layer([bi_lstm_P, question_passage_attention])  # sum bt(H~:t)=1
    repeat_layer = RepeatLike(axis=1, copy_from_axis=1)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    tiled_question_passage_vector = repeat_layer([question_passage_vector, bi_lstm_P])
    # Shape: (batch_size, num_passage_words, embedding_dim * 8)
    complex_concat_layer = ComplexConcat(combination='1,2,1*2,1*3', name='final_merged_passage')
    final_merged_passage = complex_concat_layer([bi_lstm_P,
                                                 passage_question_vectors,
                                                 tiled_question_passage_vector])  # denote G
    # Modelling layer: takes the (?, ?, emb * 8) input and applies a bidirectional LSTM with
    # d units per direction, giving a 2d-dimensional output at each of the max_len_P steps.
    bi_model_passage = Bidirectional(LSTM(256, return_sequences=True),
                                     batch_input_shape=(batch_size, max_len_P, emb_dim))(final_merged_passage)  # denote M
    # The span-begin output is calculated from the attention weights and the LSTM output:
    # softmax(Wp1 * [G; M])
    span_begin_input = Concatenate()([final_merged_passage, bi_model_passage])
    span_begin_weights = TimeDistributed(Dense(units=1))(span_begin_input)  # Wp1
    # Shape: (batch_size, num_passage_words)
    span_begin_probabilities = MaskedSoftmax(name="span_begin_softmax")(span_begin_weights)  # (700,)
    # As in Minjoon's BiDAF: after obtaining p1 (the span-start probabilities), sum the
    # probabilities over all instances of each entity, masking out non-entity values; the
    # loss is applied without p2.
    multiword_option_mode = 'mean'
    options_sum_layer_minj = OptionAttentionSum(multiword_option_mode,
                                                name="options_probability_sum_minj")
    options_probabilities_minj = options_sum_layer_minj([passage_input,
                                                         span_begin_probabilities,
                                                         options_input])
    l1_norm_layer = L1Normalize()
    option_normalized_probabilities_cnn = l1_norm_layer(options_probabilities_minj)
    # dense = Dense(377, activation='sigmoid')(option_normalized_probabilities_cnn)
    biDAF = Model(inputs=[question_input, passage_input, options_input],
                  outputs=option_normalized_probabilities_cnn)
    biDAF.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    return biDAF
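# A minimal smoke-test sketch, not part of the original script. It assumes the module-level
# globals used by run_biDAF (max_len_Q, max_len_P, max_num_options, em_len, emb_dim,
# embeddings, batch_size) are already defined, and that max_num_options <= max_len_P.
def _smoke_test_run_biDAF():
    model = run_biDAF()
    dummy_question = numpy.random.randint(1, em_len, size=(batch_size, max_len_Q))
    dummy_passage = numpy.random.randint(1, em_len, size=(batch_size, max_len_P))
    # Draw the options from the passage so OptionAttentionSum has probabilities to gather.
    dummy_options = dummy_passage[:, :max_num_options]
    option_probabilities = model.predict([dummy_question, dummy_passage, dummy_options])
    # One (L1-normalized) probability per option for each example in the batch.
    assert option_probabilities.shape == (batch_size, max_num_options)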
def build_model(embedding_layer):
    # An earlier version of this model is kept below for reference; it is disabled by being
    # wrapped in a string literal, and the active implementation starts after the closing quotes.
    """
    l2_lambda = 0.0001
    question_input = layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')  # * 2 since doubling the question and passage
    answer_input = layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')  # * 2 since doubling the question and passage
    question_embedding = embedding_layer(question_input)
    answer_embedding = embedding_layer(answer_input)

    # Min's model has some highway layers here, with relu activations. Note that highway
    # layers don't change the tensor's shape. We need to have two different `TimeDistributed`
    # layers instantiated here, because Keras doesn't like it if a single `TimeDistributed`
    # layer gets applied to two inputs with different numbers of time steps.
    highway_layers = 2
    for i in range(highway_layers):
        highway_layer = highway.Highway(activation='relu', name='highway_{}'.format(i))
        question_layer = layers.TimeDistributed(highway_layer, name=highway_layer.name + "_qtd")
        question_embedding = question_layer(question_embedding)
        passage_layer = layers.TimeDistributed(highway_layer, name=highway_layer.name + "_ptd")
        answer_embedding = passage_layer(answer_embedding)

    # Then we pass the question and passage through a seq2seq encoder (like a biLSTM). This
    # essentially pushes phrase-level information into the embeddings of each word.
    phrase_layer = Bidirectional(layers.GRU(return_sequences=True,
                                            units=500,
                                            activation='relu',
                                            recurrent_dropout=0.2,
                                            dropout=0.2))
    # , kernel_regularizer=l2(l2_lambda), kernel_initializer='he_uniform'
    # , **(params["encoder_params"]), **(params["wrapper_params"])
    # Shape: (batch_size, num_question_words, embedding_dim * 2)
    encoded_question = phrase_layer(question_embedding)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    encoded_answer = phrase_layer(answer_embedding)
    # encoded_question = layers.Dropout(0.2)(encoded_question)
    # encoded_answer = layers.Dropout(0.2)(encoded_answer)

    # PART 2:
    # Now we compute a similarity between the passage words and the question words, and
    # normalize the matrix in a couple of different ways for input into some more layers.
    matrix_attention_layer = MatrixAttention(similarity_function={'type': 'linear', 'combination': 'x,y,x*y'},
                                             name='passage_question_similarity')
    # Shape: (batch_size, num_passage_words, num_question_words)
    answer_question_similarity = matrix_attention_layer([encoded_answer, encoded_question])
    # Shape: (batch_size, num_passage_words, num_question_words), normalized over question
    # words for each passage word.
    answer_question_attention = MaskedSoftmax()(answer_question_similarity)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    weighted_sum_layer = WeightedSum(name="answer_question_vectors", use_masking=False)
    answer_question_vectors = weighted_sum_layer([encoded_question, answer_question_attention])

    # Min's paper finds, for each document word, the most similar question word to it, and
    # computes a single attention over the whole document using these max similarities.
    # Shape: (batch_size, num_passage_words)
    question_answer_similarity = Max(axis=-1)(answer_question_similarity)
    # Shape: (batch_size, num_passage_words)
    question_answer_attention = MaskedSoftmax()(question_answer_similarity)
    # Shape: (batch_size, embedding_dim * 2)
    weighted_sum_layer = WeightedSum(name="question_passage_vector", use_masking=False)
    question_answer_vector = weighted_sum_layer([encoded_answer, question_answer_attention])

    # Then he repeats this question/passage vector for every word in the passage, and uses it
    # as an additional input to the hidden layers above.
    repeat_layer = RepeatLike(axis=1, copy_from_axis=1)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    tiled_question_answer_vector = repeat_layer([question_answer_vector, encoded_answer])
    # Shape: (batch_size, num_passage_words, embedding_dim * 8)
    complex_concat_layer = complex_concat.ComplexConcat(combination='1,2,1*2,1*3',
                                                        name='final_merged_passage')
    final_merged_answer = complex_concat_layer([encoded_answer,
                                                answer_question_vectors,
                                                tiled_question_answer_vector])

    # PART 3:
    # Having computed a combined representation of the document that includes attended question
    # vectors, we'll pass this through a few more bi-directional encoder layers, then predict
    # the span_begin word. Hard to find a good name for this; Min calls this part of the
    # network the "modeling layer", so we'll call this the `modeled_passage`.
    modeled_answer = final_merged_answer
    for i in range(1):
        hidden_layer = Bidirectional(layers.GRU(return_sequences=True,
                                                units=300,
                                                activation='relu',
                                                recurrent_dropout=0.2))
        # , kernel_regularizer=l2(l2_lambda), kernel_initializer='he_uniform'
        # , **(params["encoder_params"]), **(params["wrapper_params"])
        modeled_answer = hidden_layer(modeled_answer)

    # PART 4: BY HELEN
    # get the maximum for each word
    max_answer = Max(axis=-1)(modeled_answer)
    preds = layers.Dense(1, activation='sigmoid', name='prediction')(max_answer)
    model = models.Model(inputs=[question_input, answer_input], outputs=preds)
    return model
    """
    question_input = layers.Input(shape=(MAX_SEQUENCE_LENGTH,),
                                  dtype='int32')  # * 2 since doubling the question and passage
    answer_input = layers.Input(shape=(MAX_SEQUENCE_LENGTH,),
                                dtype='int32')  # * 2 since doubling the question and passage
    question_embedding = embedding_layer(question_input)
    answer_embedding = embedding_layer(answer_input)

    # Min's model has some highway layers here, with relu activations. Note that highway
    # layers don't change the tensor's shape. We need to have two different `TimeDistributed`
    # layers instantiated here, because Keras doesn't like it if a single `TimeDistributed`
    # layer gets applied to two inputs with different numbers of time steps.
    highway_layers = 2
    for i in range(highway_layers):
        highway_layer = highway.Highway(activation='relu', name='highway_{}'.format(i))
        question_layer = layers.TimeDistributed(highway_layer, name=highway_layer.name + "_qtd")
        question_embedding = question_layer(question_embedding)
        passage_layer = layers.TimeDistributed(highway_layer, name=highway_layer.name + "_ptd")
        answer_embedding = passage_layer(answer_embedding)

    # Then we pass the question and passage through a seq2seq encoder (like a biLSTM). This
    # essentially pushes phrase-level information into the embeddings of each word.
    phrase_layer = Bidirectional(layers.GRU(return_sequences=True,
                                            units=500,
                                            activation='relu',
                                            recurrent_dropout=0.2,
                                            dropout=0.3,
                                            kernel_regularizer=l2(0.0001),
                                            kernel_initializer='he_uniform'))
    # , **(params["encoder_params"]), **(params["wrapper_params"])
    # Shape: (batch_size, num_question_words, embedding_dim * 2)
    encoded_question = phrase_layer(question_embedding)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    encoded_answer = phrase_layer(answer_embedding)

    # PART 2:
    # Now we compute a similarity between the passage words and the question words, and
    # normalize the matrix in a couple of different ways for input into some more layers.
    matrix_attention_layer = MatrixAttention(similarity_function={'type': 'linear', 'combination': 'x,y,x*y'},
                                             name='passage_question_similarity')
    # Shape: (batch_size, num_passage_words, num_question_words)
    answer_question_similarity = matrix_attention_layer([encoded_answer, encoded_question])
    # Shape: (batch_size, num_passage_words, num_question_words), normalized over question
    # words for each passage word.
    answer_question_attention = MaskedSoftmax()(answer_question_similarity)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    weighted_sum_layer = WeightedSum(name="answer_question_vectors", use_masking=False)
    answer_question_vectors = weighted_sum_layer([encoded_question, answer_question_attention])

    # Min's paper finds, for each document word, the most similar question word to it, and
    # computes a single attention over the whole document using these max similarities.
    # Shape: (batch_size, num_passage_words)
    question_answer_similarity = Max(axis=-1)(answer_question_similarity)
    # Shape: (batch_size, num_passage_words)
    question_answer_attention = MaskedSoftmax()(question_answer_similarity)
    # Shape: (batch_size, embedding_dim * 2)
    weighted_sum_layer = WeightedSum(name="question_passage_vector", use_masking=False)
    question_answer_vector = weighted_sum_layer([encoded_answer, question_answer_attention])

    # Then he repeats this question/passage vector for every word in the passage, and uses it
    # as an additional input to the hidden layers above.
    repeat_layer = RepeatLike(axis=1, copy_from_axis=1)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    tiled_question_answer_vector = repeat_layer([question_answer_vector, encoded_answer])
    # Shape: (batch_size, num_passage_words, embedding_dim * 8)
    complex_concat_layer = complex_concat.ComplexConcat(combination='1,2,1*2,1*3',
                                                        name='final_merged_passage')
    final_merged_answer = complex_concat_layer([encoded_answer,
                                                answer_question_vectors,
                                                tiled_question_answer_vector])

    # PART 3:
    # Having computed a combined representation of the document that includes attended question
    # vectors, we'll pass this through a few more bi-directional encoder layers, then predict
    # the span_begin word. Hard to find a good name for this; Min calls this part of the
    # network the "modeling layer", so we'll call this the `modeled_passage`.
    modeled_answer = final_merged_answer
    for i in range(1):
        hidden_layer = Bidirectional(layers.GRU(return_sequences=True,
                                                units=300,
                                                activation='relu',
                                                recurrent_dropout=0.2,
                                                dropout=0.3))
        # , **(params["encoder_params"]), **(params["wrapper_params"])
        modeled_answer = hidden_layer(modeled_answer)

    # PART 4: BY HELEN
    # get the maximum for each word
    max_answer = Max(axis=-1)(modeled_answer)
    print("max answer shape", max_answer.shape)
    print("modeled_answer shape", modeled_answer.shape)
    preds = layers.Dense(1,
                         activation='sigmoid',
                         name='prediction',
                         kernel_regularizer=l2(0.0001),
                         kernel_initializer='he_uniform')(max_answer)
    print("pred shape", preds.shape)
    model = models.Model(inputs=[question_input, answer_input], outputs=preds)
    return model
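# A minimal usage sketch, not part of the original script. It assumes MAX_SEQUENCE_LENGTH is
# defined and that embedding_layer is a Keras Embedding built elsewhere; the label array is
# binary because the model ends in a single sigmoid unit.
def _smoke_test_build_model(embedding_layer, vocab_size=1000):
    model = build_model(embedding_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    questions = numpy.random.randint(1, vocab_size, size=(8, MAX_SEQUENCE_LENGTH))
    answers = numpy.random.randint(1, vocab_size, size=(8, MAX_SEQUENCE_LENGTH))
    labels = numpy.random.randint(0, 2, size=(8, 1))
    model.fit([questions, answers], labels, epochs=1, batch_size=4)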