Example No. 1
    def test_call_handles_masking_properly(self):
        sentence_length = 4
        vocab_size = 4
        embedding_dim = 3
        embedding_weights = numpy.asarray([[0, 0, 0], [1, 1, 1], [-1, 0, 1],
                                           [-1, -1, 0]])
        embedding = Embedding(vocab_size,
                              embedding_dim,
                              weights=[embedding_weights],
                              mask_zero=True)
        sentence_1_input = Input(shape=(sentence_length,), dtype='int32')
        sentence_2_input = Input(shape=(sentence_length,), dtype='int32')
        sentence_1_embedding = embedding(sentence_1_input)
        sentence_2_embedding = embedding(sentence_2_input)
        attention_layer = MatrixAttention()
        attention = attention_layer(
            [sentence_1_embedding, sentence_2_embedding])
        attention_mask = OutputMask()(attention)
        model = Model(inputs=[sentence_1_input, sentence_2_input],
                      outputs=[attention, attention_mask])
        sentence_1_tensor = numpy.asarray([[0, 0, 1, 3]])
        sentence_2_tensor = numpy.asarray([[0, 1, 0, 2]])
        attention_tensor, attention_mask = model.predict(
            [sentence_1_tensor, sentence_2_tensor])
        expected_attention = numpy.asarray([[[0, 0, 0, 0], [0, 0, 0, 0],
                                             [0, 3, 0, 0], [0, -2, 0, 1]]])
        expected_mask = numpy.asarray([[[0, 0, 0, 0], [0, 0, 0, 0],
                                        [0, 1, 0, 1], [0, 1, 0, 1]]])
        assert_allclose(attention_tensor, expected_attention)
        assert_allclose(attention_mask, expected_mask)
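
The asserted values can be checked with plain NumPy: the similarity used here behaves as a dot product between every pair of word embeddings (as the expected matrix shows), and with mask_zero=True the propagated mask is the outer product of the two sentence padding masks. A minimal sketch of that check, reusing only the arrays defined in the test:

    embedded_1 = embedding_weights[sentence_1_tensor]   # shape (1, 4, 3)
    embedded_2 = embedding_weights[sentence_2_tensor]   # shape (1, 4, 3)
    # Dot product between every (word_1, word_2) pair -> shape (1, 4, 4).
    assert_allclose(numpy.einsum('bid,bjd->bij', embedded_1, embedded_2),
                    expected_attention)
    # Outer product of the padding masks (1 where the word id is non-zero).
    mask_1 = (sentence_1_tensor != 0).astype(int)
    mask_2 = (sentence_2_tensor != 0).astype(int)
    assert_allclose(mask_1[:, :, None] * mask_2[:, None, :], expected_mask)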
Example No. 2
    def test_model_loads_correctly(self):
        sentence_1_length = 2
        sentence_2_length = 3
        embedding_dim = 3
        sentence_1_embedding = Input(shape=(sentence_1_length, embedding_dim),
                                     dtype='float32')
        sentence_2_embedding = Input(shape=(sentence_2_length, embedding_dim),
                                     dtype='float32')
        similarity_function_params = {
            'type': 'linear',
            'combination': 'x,y,x*y'
        }
        attention_layer = MatrixAttention(
            similarity_function=similarity_function_params)
        attention = attention_layer(
            [sentence_1_embedding, sentence_2_embedding])
        attention = Dense(2)(attention)
        model = Model(inputs=[sentence_1_embedding, sentence_2_embedding],
                      outputs=[attention])

        sentence_1_tensor = numpy.asarray([[[1, 1, 1], [-1, 0, 1]]])
        sentence_2_tensor = numpy.asarray([[[1, 1, 1], [-1, 0, 1],
                                            [-1, -1, -1]]])
        model_file = self.TEST_DIR + "model.tmp"
        before_loading = model.predict([sentence_1_tensor, sentence_2_tensor])

        model.save(model_file)
        model = load_model(
            model_file,  # pylint: disable=redefined-variable-type
            custom_objects={'MatrixAttention': MatrixAttention})
        after_loading = model.predict([sentence_1_tensor, sentence_2_tensor])

        assert_allclose(before_loading, after_loading)
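
Here MatrixAttention is parameterised with a 'linear' similarity function and the combination 'x,y,x*y'. As a rough sketch of what such a similarity computes for one pair of word vectors (an assumption made for illustration, not code taken from the library), the listed combinations are concatenated and scored with a learned weight vector and bias; those learned parameters are what model.save() / load_model() must round-trip for the test to pass:

    import numpy

    def linear_similarity(x, y, w, b):
        # x, y: word vectors of length embedding_dim; w: learned weights of
        # length 3 * embedding_dim; b: learned scalar bias (names illustrative).
        combined = numpy.concatenate([x, y, x * y])
        return combined.dot(w) + b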
Example No. 3
    def test_call_works_on_simple_input(self):
        sentence_1_length = 2
        sentence_2_length = 3
        embedding_dim = 3
        sentence_1_embedding = Input(shape=(sentence_1_length, embedding_dim),
                                     dtype='float32')
        sentence_2_embedding = Input(shape=(sentence_2_length, embedding_dim),
                                     dtype='float32')
        attention_layer = MatrixAttention()
        attention = attention_layer(
            [sentence_1_embedding, sentence_2_embedding])
        model = Model(inputs=[sentence_1_embedding, sentence_2_embedding],
                      outputs=[attention])
        sentence_1_tensor = numpy.asarray([[[1, 1, 1], [-1, 0, 1]]])
        sentence_2_tensor = numpy.asarray([[[1, 1, 1], [-1, 0, 1],
                                            [-1, -1, -1]]])
        attention_tensor = model.predict(
            [sentence_1_tensor, sentence_2_tensor])
        assert attention_tensor.shape == (1, sentence_1_length,
                                          sentence_2_length)
        assert_allclose(attention_tensor, [[[3, 0, -3], [0, 2, 0]]])
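
The expected output again follows from plain dot products between the two embedding tensors; a one-line NumPy check with the same arrays:

    assert_allclose(numpy.einsum('bid,bjd->bij', sentence_1_tensor, sentence_2_tensor),
                    [[[3, 0, -3], [0, 2, 0]]])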
Example No. 4
def run_biDAF():
    # Create embeddings for both the question and the news passage,
    # at both the word level and the char level
    question_input = Input(shape=(max_len_Q,),
                           dtype='int32', name="question_input")
    passage_input = Input(shape=(max_len_P,),
                          dtype='int32', name="passage_input")
    # Input listing the answer options, used so that only option tokens
    # contribute to the output
    options_input = Input(shape=(max_num_options,),
                          dtype='int32', name="options_input")
    embedding_layer_P = Embedding(em_len,
                                  emb_dim,
                                  weights=[embeddings],
                                  input_length=max_len_P,
                                  batch_input_shape=(batch_size, max_len_P),
                                  trainable=False)
    embedding_layer_Q = Embedding(em_len,
                                  emb_dim,
                                  weights=[embeddings],
                                  input_length=max_len_Q,
                                  batch_input_shape=(batch_size, max_len_Q),
                                  trainable=False)

    passage_embedding = embedding_layer_P(passage_input)
    question_embedding = embedding_layer_Q(question_input)



    bi_lstm_Q = Bidirectional(LSTM(256, return_sequences=True), batch_input_shape=(batch_size, max_len_Q, emb_dim))(
        question_embedding)
    bi_lstm_Q1 = Bidirectional(LSTM(256), batch_input_shape=(batch_size, max_len_Q, emb_dim))(question_embedding)
    bi_lstm_P = Bidirectional(LSTM(256, return_sequences=True), batch_input_shape=(batch_size, max_len_P, emb_dim))(
        passage_embedding)
    ##### Create Attention Layer

    similarity_function_params = {'type': 'linear', 'combination': 'x,y,x*y'}
    matrix_attention_layer = MatrixAttention(similarity_function=similarity_function_params,
                                             name='matrix_attention_layer')
    # Shape: (batch_size, num_passage_words, num_question_words)
    passage_question_similarity = matrix_attention_layer([bi_lstm_P, bi_lstm_Q])

    # Shape: (batch_size, num_passage_words, num_question_words), normalized over question words for each passage word.
    passage_question_attention = MaskedSoftmax()(passage_question_similarity)

    weighted_sum_layer = WeightedSum(name="passage_question_vectors",
                                     use_masking=False)  # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    passage_question_vectors = weighted_sum_layer([bi_lstm_Q, passage_question_attention])  # sum at(U~:t)=1
    ## Query - Passage 2d * max_len_Q
    # find most important context words by max() passage_question_similarity

    question_passage_similarity = Max(axis=-1)(passage_question_similarity)  # Shape: (batch_size, num_passage_words)
    # use softmax for b (max softmax value for similarity matrix column wise)
    question_passage_attention = MaskedSoftmax()(question_passage_similarity)  # Shape: (batch_size, num_passage_words)

    weighted_sum_layer = WeightedSum(name="question_passage_vector",
                                     use_masking=False)  # h~ = sum(weighted_bt * H:t) 2*embed_dim
    question_passage_vector = weighted_sum_layer([bi_lstm_P, question_passage_attention])  # sum bt(H~:t)=1

    repeat_layer = RepeatLike(axis=1, copy_from_axis=1)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    tiled_question_passage_vector = repeat_layer([question_passage_vector, bi_lstm_P])

    # Shape: (batch_size, num_passage_words, embedding_dim * 8)
    complex_concat_layer = ComplexConcat(combination='1,2,1*2,1*3', name='final_merged_passage')
    final_merged_passage = complex_concat_layer([bi_lstm_P,
                                                 passage_question_vectors,
                                                 tiled_question_passage_vector])  # Denote G
    # Modelling layer: takes the (batch_size, max_len_P, embedding_dim * 8) tensor G and applies a
    # bidirectional LSTM with d units per direction, giving a 2d-dimensional vector per passage word.
    bi_model_passage = Bidirectional(LSTM(256, return_sequences=True),
                                     batch_input_shape=(batch_size, max_len_P, emb_dim))(final_merged_passage)
    # denote M

    # span begin output is calculated by Attention weight & LSTM softmax(Wp1 * [G;M])
    span_begin_input = Concatenate()([final_merged_passage, bi_model_passage])
    span_begin_weights = TimeDistributed(Dense(units=1))(span_begin_input)  # Wp1
    # Shape: (batch_size, num_passage_words)
    span_begin_probabilities = MaskedSoftmax(name="span_begin_softmax")(span_begin_weights)  # (700,)

    # As in Minjoon Seo's BiDAF code, after obtaining p1 (the span-start probabilities), we sum the
    # probability values over the instances of each entity option, masking out all non-entity tokens,
    # and the loss is applied without p2.
    multiword_option_mode = 'mean'
    options_sum_layer_minj = OptionAttentionSum(multiword_option_mode, name="options_probability_sum_minj")
    options_probabilities_minj = options_sum_layer_minj([passage_input, span_begin_probabilities, options_input])
    l1_norm_layer = L1Normalize()
    option_normalized_probabilities_cnn = l1_norm_layer(options_probabilities_minj)
    # dense = Dense(377, activation='sigmoid')(option_normalized_probabilities_cnn)

    biDAF = Model(inputs=[question_input, passage_input, options_input],
                  outputs=option_normalized_probabilities_cnn)
    biDAF.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return biDAF
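
The comments in run_biDAF describe the two BiDAF attention flows. A minimal, unmasked NumPy sketch of the same arithmetic for a single example may make the shapes easier to follow; the function and variable names below are illustrative, and it assumes ComplexConcat's '1,2,1*2,1*3' combination indexes its three inputs in the order they are passed:

    import numpy

    def softmax(x, axis=-1):
        e = numpy.exp(x - x.max(axis=axis, keepdims=True))
        return e / e.sum(axis=axis, keepdims=True)

    def bidaf_attention(S, U, H):
        # S: (num_passage_words, num_question_words) similarity matrix,
        # U: (num_question_words, 2 * d) question encodings (bi_lstm_Q),
        # H: (num_passage_words, 2 * d) passage encodings (bi_lstm_P).
        # Passage-to-question: MaskedSoftmax over question words + WeightedSum.
        passage_question_vectors = softmax(S, axis=-1) @ U
        # Question-to-passage: Max over question words, softmax over passage
        # words, WeightedSum of passage encodings, then RepeatLike tiling.
        b = softmax(S.max(axis=-1))
        tiled = numpy.tile(b @ H, (H.shape[0], 1))
        # ComplexConcat '1,2,1*2,1*3' -> G of shape (num_passage_words, 8 * d).
        return numpy.concatenate(
            [H, passage_question_vectors, H * passage_question_vectors, H * tiled],
            axis=-1)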
Example No. 5
def build_model(embedding_layer):
    """
    l2_lambda = 0.0001
    question_input = layers.Input(shape=(MAX_SEQUENCE_LENGTH,),
                                  dtype='int32')  # * 2 since doubling the question and passage
    answer_input = layers.Input(shape=(MAX_SEQUENCE_LENGTH,),
                                dtype='int32')  # * 2 since doubling the question and passage

    question_embedding = embedding_layer(question_input)
    answer_embedding = embedding_layer(answer_input)

    # Min's model has some highway layers here, with relu activations.  Note that highway
    # layers don't change the tensor's shape.  We need to have two different `TimeDistributed`
    # layers instantiated here, because Keras doesn't like it if a single `TimeDistributed`
    # layer gets applied to two inputs with different numbers of time steps.

    highway_layers = 2
    for i in range(highway_layers):
        highway_layer = highway.Highway(activation='relu', name='highway_{}'.format(i))
        question_layer = layers.TimeDistributed(highway_layer, name=highway_layer.name + "_qtd")
        question_embedding = question_layer(question_embedding)
        passage_layer = layers.TimeDistributed(highway_layer, name=highway_layer.name + "_ptd")
        answer_embedding = passage_layer(answer_embedding)

    # Then we pass the question and passage through a seq2seq encoder (like a biLSTM).  This
    # essentially pushes phrase-level information into the embeddings of each word.
    phrase_layer = Bidirectional(layers.GRU(return_sequences=True, units=500, activation='relu', recurrent_dropout= 0.2, dropout=0.2))#, kernel_regularizer=l2(l2_lambda),kernel_initializer='he_uniform' ))#, **(params["encoder_params"]), **(params["wrapper_params"])))

    # Shape: (batch_size, num_question_words, embedding_dim * 2)
    encoded_question = phrase_layer(question_embedding)

    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    encoded_answer = phrase_layer(answer_embedding)

    #encoded_question = layers.Dropout(0.2)(encoded_question)
    #encoded_answer = layers.Dropout(0.2)(encoded_answer)

    # PART 2:
    # Now we compute a similarity between the passage words and the question words, and
    # normalize the matrix in a couple of different ways for input into some more layers.
    matrix_attention_layer = MatrixAttention(similarity_function={'type': 'linear', 'combination': 'x,y,x*y'},
                                             name='passage_question_similarity')

    # Shape: (batch_size, num_passage_words, num_question_words)
    answer_question_similarity = matrix_attention_layer([encoded_answer, encoded_question])

    # Shape: (batch_size, num_passage_words, num_question_words), normalized over question
    # words for each passage word.
    answer_question_attention = MaskedSoftmax()(answer_question_similarity)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    weighted_sum_layer = WeightedSum(name="answer_question_vectors", use_masking=False)
    answer_question_vectors = weighted_sum_layer([encoded_question, answer_question_attention])

    # Min's paper finds, for each document word, the most similar question word to it, and
    # computes a single attention over the whole document using these max similarities.
    # Shape: (batch_size, num_passage_words)
    question_answer_similarity = Max(axis=-1)(answer_question_similarity)
    # Shape: (batch_size, num_passage_words)
    question_answer_attention = MaskedSoftmax()(question_answer_similarity)
    # Shape: (batch_size, embedding_dim * 2)
    weighted_sum_layer = WeightedSum(name="question_passage_vector", use_masking=False)
    question_answer_vector = weighted_sum_layer([encoded_answer, question_answer_attention])

    # Then he repeats this question/passage vector for every word in the passage, and uses it
    # as an additional input to the hidden layers above.
    repeat_layer = RepeatLike(axis=1, copy_from_axis=1)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    tiled_question_answer_vector = repeat_layer([question_answer_vector, encoded_answer])

    # Shape: (batch_size, num_passage_words, embedding_dim * 8)
    complex_concat_layer = complex_concat.ComplexConcat(combination='1,2,1*2,1*3', name='final_merged_passage')
    final_merged_answer = complex_concat_layer([encoded_answer,
                                                 answer_question_vectors,
                                                 tiled_question_answer_vector])

    # PART 3:
    # Having computed a combined representation of the document that includes attended question
    # vectors, we'll pass this through a few more bi-directional encoder layers, then predict
    # the span_begin word.  Hard to find a good name for this; Min calls this part of the
    # network the "modeling layer", so we'll call this the `modeled_passage`.
    modeled_answer = final_merged_answer
    for i in range(1):
        hidden_layer = Bidirectional(layers.GRU(return_sequences=True, units=300, activation='relu', recurrent_dropout= 0.2))#, kernel_regularizer=l2(l2_lambda), kernel_initializer='he_uniform' ))#, **(params["encoder_params"]), **(params["wrapper_params"])))
        modeled_answer = hidden_layer(modeled_answer)


    #PART 4: BY HELEN
    #get the maximum for each word
    max_answer = Max(axis=-1)(modeled_answer)
    preds = layers.Dense(1, activation = 'sigmoid', name = 'prediction')(max_answer)

    model = models.Model(inputs=[question_input, answer_input], outputs=preds)

    return model
    """

    question_input = layers.Input(
        shape=(MAX_SEQUENCE_LENGTH, ),
        dtype='int32')  # * 2 since doubling the question and passage
    answer_input = layers.Input(
        shape=(MAX_SEQUENCE_LENGTH, ),
        dtype='int32')  # * 2 since doubling the question and passage

    question_embedding = embedding_layer(question_input)
    answer_embedding = embedding_layer(answer_input)

    # Min's model has some highway layers here, with relu activations.  Note that highway
    # layers don't change the tensor's shape.  We need to have two different `TimeDistributed`
    # layers instantiated here, because Keras doesn't like it if a single `TimeDistributed`
    # layer gets applied to two inputs with different numbers of time steps.
    highway_layers = 2
    for i in range(highway_layers):
        highway_layer = highway.Highway(activation='relu',
                                        name='highway_{}'.format(i))
        question_layer = layers.TimeDistributed(highway_layer,
                                                name=highway_layer.name +
                                                "_qtd")
        question_embedding = question_layer(question_embedding)
        passage_layer = layers.TimeDistributed(highway_layer,
                                               name=highway_layer.name +
                                               "_ptd")
        answer_embedding = passage_layer(answer_embedding)

    # Then we pass the question and passage through a seq2seq encoder (like a biLSTM).  This
    # essentially pushes phrase-level information into the embeddings of each word.
    phrase_layer = Bidirectional(
        layers.GRU(return_sequences=True,
                   units=500,
                   activation='relu',
                   recurrent_dropout=0.2,
                   dropout=0.3,
                   kernel_regularizer=l2(0.0001),
                   kernel_initializer='he_uniform')
    )  #, **(params["encoder_params"]), **(params["wrapper_params"])))

    # Shape: (batch_size, num_question_words, embedding_dim * 2)
    encoded_question = phrase_layer(question_embedding)

    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    encoded_answer = phrase_layer(answer_embedding)

    # PART 2:
    # Now we compute a similarity between the passage words and the question words, and
    # normalize the matrix in a couple of different ways for input into some more layers.
    matrix_attention_layer = MatrixAttention(
        similarity_function={
            'type': 'linear',
            'combination': 'x,y,x*y'
        },
        name='passage_question_similarity')

    # Shape: (batch_size, num_passage_words, num_question_words)
    answer_question_similarity = matrix_attention_layer(
        [encoded_answer, encoded_question])

    # Shape: (batch_size, num_passage_words, num_question_words), normalized over question
    # words for each passage word.
    answer_question_attention = MaskedSoftmax()(answer_question_similarity)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    weighted_sum_layer = WeightedSum(name="answer_question_vectors",
                                     use_masking=False)
    answer_question_vectors = weighted_sum_layer(
        [encoded_question, answer_question_attention])

    # Min's paper finds, for each document word, the most similar question word to it, and
    # computes a single attention over the whole document using these max similarities.
    # Shape: (batch_size, num_passage_words)
    question_answer_similarity = Max(axis=-1)(answer_question_similarity)
    # Shape: (batch_size, num_passage_words)
    question_answer_attention = MaskedSoftmax()(question_answer_similarity)
    # Shape: (batch_size, embedding_dim * 2)
    weighted_sum_layer = WeightedSum(name="question_passage_vector",
                                     use_masking=False)
    question_answer_vector = weighted_sum_layer(
        [encoded_answer, question_answer_attention])

    # Then he repeats this question/passage vector for every word in the passage, and uses it
    # as an additional input to the hidden layers above.
    repeat_layer = RepeatLike(axis=1, copy_from_axis=1)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    tiled_question_answer_vector = repeat_layer(
        [question_answer_vector, encoded_answer])

    # Shape: (batch_size, num_passage_words, embedding_dim * 8)
    complex_concat_layer = complex_concat.ComplexConcat(
        combination='1,2,1*2,1*3', name='final_merged_passage')
    final_merged_answer = complex_concat_layer([
        encoded_answer, answer_question_vectors, tiled_question_answer_vector
    ])

    # PART 3:
    # Having computed a combined representation of the document that includes attended question
    # vectors, we'll pass this through a few more bi-directional encoder layers, then predict
    # the span_begin word.  Hard to find a good name for this; Min calls this part of the
    # network the "modeling layer", so we'll call this the `modeled_passage`.
    modeled_answer = final_merged_answer
    for i in range(1):
        hidden_layer = Bidirectional(
            layers.GRU(
                return_sequences=True,
                units=300,
                activation='relu',
                recurrent_dropout=0.2,
                dropout=0.3,
            ))  #, **(params["encoder_params"]), **(params["wrapper_params"])))
        modeled_answer = hidden_layer(modeled_answer)

    # PART 4: BY HELEN
    # Get the maximum for each word
    max_answer = Max(axis=-1)(modeled_answer)
    print("max answer shape", max_answer.shape)
    print("modeled_answer shape", modeled_answer.shape)

    preds = layers.Dense(1,
                         activation='sigmoid',
                         name='prediction',
                         kernel_regularizer=l2(0.0001),
                         kernel_initializer='he_uniform')(max_answer)

    print("pred shape", preds.shape)

    model = models.Model(inputs=[question_input, answer_input], outputs=preds)

    return model
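
A minimal usage sketch for the model above, assuming `embedding_layer` and MAX_SEQUENCE_LENGTH are defined elsewhere in the project; binary cross-entropy matches the single sigmoid output, while the optimizer, batch size, and the input array names are illustrative assumptions:

    model = build_model(embedding_layer)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    # question_ids / answer_ids: int32 matrices of shape
    # (num_examples, MAX_SEQUENCE_LENGTH); labels: binary vector of shape
    # (num_examples,).  All three are hypothetical placeholder names.
    model.fit([question_ids, answer_ids], labels, batch_size=32, epochs=5)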