Example #1
def bigru_with_attention(max_len=74, emb_dim=32, max_vocab_len=40, W_reg=regularizers.l2(1e-4)):
    # """Bidirectional GRU with Attention model with the Keras Sequential API"""

    # Input
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')
    # Embedding layer
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len, embeddings_regularizer=W_reg)(main_input)
    emb = Dropout(0.2)(emb)

    # Bi-directional GRU layer
    lstm = Bidirectional(GRU(units=128, return_sequences=True))(emb)
    lstm = Dropout(0.2)(lstm)

    att_layer, att_score = ScaledDotProductAttention(history_only=True,
                                                     return_attention=True)([lstm, lstm, lstm])
    att = Flatten()(att_layer)

    hidden1 = Dense(9472)(att)
    hidden1 = Dropout(0.5)(hidden1)

    # Output layer (last fully connected layer)
    output = Dense(21, activation='softmax', name='output')(hidden1)

    # Compile model and define optimizer
    model = Model(inputs=[main_input], outputs=[output])
    adam = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

    model.compile(optimizer=adam, loss='categorical_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.CategoricalAccuracy(),
                           Evaluator.precision, Evaluator.recall, Evaluator.fmeasure])
    return model
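A minimal usage sketch for the builder above, assuming the imports of the original project (Input, Embedding, Bidirectional, GRU, Dropout, Flatten, Dense, Model, Adam, regularizers, Evaluator) resolve; the input below is random and only illustrates the expected shapes.

import numpy as np

# Build the model with the default sequence length and vocabulary size.
model = bigru_with_attention(max_len=74, max_vocab_len=40)

# Eight dummy sequences of 74 token ids drawn from the 40-word vocabulary (illustrative only).
x_dummy = np.random.randint(0, 40, size=(8, 74))
preds = model.predict(x_dummy)   # shape (8, 21): class probabilities from the softmax output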
Example #2
 def test_sample(self):
     input_layer = keras.layers.Input(
         shape=(5, ),
         name='Input',
     )
     embed_layer = keras.layers.Embedding(
         input_dim=4,
         output_dim=5,
         mask_zero=True,
         weights=[
             np.array([
                 [0.1, 0.2, 0.3, 0.4, 0.5],
                 [0.2, 0.3, 0.4, 0.6, 0.5],
                 [0.4, 0.7, 0.2, 0.6, 0.9],
                 [0.3, 0.5, 0.8, 0.9, 0.1],
             ]),
         ],
         name='Embedding',
     )(input_layer)
     att_layer = ScaledDotProductAttention(name='Attention')(embed_layer)
     model = keras.models.Model(inputs=input_layer, outputs=att_layer)
     model.compile(optimizer='adam', loss='mse')
     model.summary()
     inputs = np.array([[1, 2, 3, 1, 0]])
     predict = model.predict(inputs)[0]
     self.assertTrue(np.allclose(predict[0], predict[3]))
     self.assertTrue(
         np.allclose(
             np.asarray([
                 0.27883747, 0.45767492, 0.47448885, 0.69199574, 0.47368336
             ]),
             predict[2],
         ), predict[2])
Example #3
 def test_save_load(self):
     input_q = keras.layers.Input(shape=(5, 3), name='Input-Q')
     input_k = keras.layers.Input(shape=(4, 3), name='Input-K')
     input_v = keras.layers.Input(shape=(4, 6), name='Input-V')
     attention, weights = ScaledDotProductAttention(
         return_attention=True,
         history_only=True,
         name='Attention',
     )([input_q, input_k, input_v])
     model = keras.models.Model(inputs=[input_q, input_k, input_v],
                                outputs=[attention, weights])
     model.compile(
         optimizer='adam',
         loss='mse',
         metrics={},
     )
     model_path = os.path.join(
         tempfile.gettempdir(),
         'keras_self_att_test_sl_%f.h5' % random.random())
     model.save(model_path)
     model = keras.models.load_model(
         model_path,
         custom_objects={
             'ScaledDotProductAttention': ScaledDotProductAttention,
         },
     )
     model.summary(line_length=120)
     self.assertTrue(model is not None)
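Loading a saved model that contains this layer can also be done with a custom object scope instead of the custom_objects dict; a minimal sketch, assuming the same model_path and imports as the test above (Keras 2.x API):

# Alternative to passing custom_objects explicitly: register the layer for the
# duration of the load call via a custom object scope.
with keras.utils.CustomObjectScope({'ScaledDotProductAttention': ScaledDotProductAttention}):
    restored = keras.models.load_model(model_path)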
Example #4
 def call(self, inputs, mask=None):
     if isinstance(inputs, list):
         q, k, v = inputs
     else:
         q = k = v = inputs
     if isinstance(mask, list):
         q_mask, k_mask, v_mask = mask
     else:
         q_mask = k_mask = v_mask = mask
     q = K.dot(q, self.Wq)
     k = K.dot(k, self.Wk)
     v = K.dot(v, self.Wv)
     if self.use_bias:
         q += self.bq
         k += self.bk
         v += self.bv
     if self.activation is not None:
         q = self.activation(q)
         k = self.activation(k)
         v = self.activation(v)
     scaled_dot_product_attention = ScaledDotProductAttention(
         history_only=self.history_only,
         name='%s-Attention' % self.name,
     )
     y = scaled_dot_product_attention(
         inputs=[
             self._reshape_to_batches(q, self.head_num),
             self._reshape_to_batches(k, self.head_num),
             self._reshape_to_batches(v, self.head_num),
         ],
         mask=[
             self._reshape_mask(q_mask, self.head_num),
             self._reshape_mask(k_mask, self.head_num),
             self._reshape_mask(v_mask, self.head_num),
         ],
     )
     self.intensity = self._reshape_attention_from_batches(
         scaled_dot_product_attention.intensity, self.head_num)
     self.attention = self._reshape_attention_from_batches(
         scaled_dot_product_attention.attention, self.head_num)
     y = self._reshape_from_batches(y, self.head_num)
     y = K.dot(y, self.Wo)
     if self.use_bias:
         y += self.bo
     if self.activation is not None:
         y = self.activation(y)
     if TF_KERAS:
         # Add shape information to tensor when using `tf.keras`
         input_shape = [K.int_shape(q), K.int_shape(k), K.int_shape(v)]
         output_shape = self.compute_output_shape(input_shape)
         if output_shape[1] is not None:
             output_shape = (-1, ) + output_shape[1:]
             y = K.reshape(y, output_shape)
     return y
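The call above delegates the head splitting to _reshape_to_batches / _reshape_from_batches / _reshape_mask helpers that are not shown here. An illustrative sketch (an assumption about their behaviour, not the library's exact code) of the forward reshape: the feature axis is split into head_num slices and the head axis is folded into the batch axis, so a single ScaledDotProductAttention call covers every head.

# Hypothetical stand-in for _reshape_to_batches:
# (batch, seq, feature) -> (batch * head_num, seq, feature // head_num).
def reshape_to_batches_sketch(x, head_num):
    shape = K.shape(x)
    batch_size, seq_len, feature_dim = shape[0], shape[1], shape[2]
    head_dim = feature_dim // head_num
    x = K.reshape(x, (batch_size, seq_len, head_num, head_dim))
    x = K.permute_dimensions(x, [0, 2, 1, 3])   # (batch, head, seq, head_dim)
    return K.reshape(x, (batch_size * head_num, seq_len, head_dim))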
Example #5
    def call(self, inputs, mask=None):
        if isinstance(inputs, list):
            q, k, v = inputs
        else:
            q = k = v = inputs
        if isinstance(mask, list):
            q_mask, k_mask, v_mask = mask
        else:
            q_mask = k_mask = v_mask = mask
        q = K.dot(q, self.Wq)
        k = K.dot(k, self.Wk)
        v = K.dot(v, self.Wv)
        if self.use_bias:
            q += self.bq
            k += self.bk
            v += self.bv
        if self.activation is not None:
            q = self.activation(q)
            k = self.activation(k)
            v = self.activation(v)
        y, a = ScaledDotProductAttention(
            history_only=self.history_only,
            return_attention=True,
            name='%s-Attention' % self.name,
        )(
            inputs=[
                self._reshape_to_batches(q, self.head_num),
                self._reshape_to_batches(k, self.head_num),
                self._reshape_to_batches(v, self.head_num),
            ],
            mask=[
                self._reshape_mask(q_mask, self.head_num),
                self._reshape_mask(k_mask, self.head_num),
                self._reshape_mask(v_mask, self.head_num),
            ],
        )
        self.a = a
        y = self._reshape_from_batches(y, self.head_num)
        y = K.dot(y, self.Wo)
        if self.use_bias:
            y += self.bo
        if self.activation is not None:
            y = self.activation(y)

        input_shape = [K.int_shape(q), K.int_shape(k), K.int_shape(v)]
        output_shape = self.compute_output_shape(input_shape)
        if output_shape[1] is not None:
            output_shape = (-1, ) + output_shape[1:]
            y = K.reshape(y, output_shape)
        return y
Example #6
 def test_history(self):
     input_layer = keras.layers.Input(
         shape=(5, ),
         name='Input',
     )
     embed_layer = keras.layers.Embedding(
         input_dim=4,
         output_dim=5,
         mask_zero=True,
         weights=[
             np.asarray([
                 [0.1, 0.2, 0.3, 0.4, 0.5],
                 [0.2, 0.3, 0.4, 0.6, 0.5],
                 [0.4, 0.7, 0.2, 0.6, 0.9],
                 [0.3, 0.5, 0.8, 0.9, 0.1],
             ]),
         ],
         name='Embedding',
     )(input_layer)
     att_layer, att_weights = ScaledDotProductAttention(
         history_only=True,
         return_attention=True,
         name='Attention',
     )([embed_layer, embed_layer, embed_layer])
     model = keras.models.Model(inputs=input_layer,
                                outputs=[att_layer, att_weights])
     model.compile(
         optimizer='adam',
         loss='mse',
         metrics={},
     )
     model.summary()
     inputs = np.array([[1, 2, 3, 1, 0]])
     predicts = model.predict(inputs)
     results, weights = predicts[0][0], predicts[1][0]
     self.assertFalse(np.allclose(results[0], results[3]))
     self.assertTrue(
         np.allclose(
             np.asarray([0.2, 0.3, 0.4, 0.6, 0.5]),
             results[0],
         ), results[0])
     for i in range(4):
         for j in range(5):
             if j > i:
                 self.assertEqual(0.0, weights[i][j])
             else:
                 self.assertLess(0.0, weights[i][j])
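The nested loop above spells out the causal constraint that history_only=True imposes: position i may only attend to positions j <= i, so everything above the diagonal of the weight matrix must be zero. A hedged numpy restatement of the same check on the predicted weights (rows are softmax distributions, so the unmasked rows are also expected to sum to one):

# Equivalent vectorised checks over the first four (non-padded) query positions.
causal_ok = np.allclose(np.triu(weights[:4], k=1), 0.0)
rows_sum_to_one = np.allclose(weights[:4].sum(axis=-1), 1.0)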
Example #7
 def call(self, inputs, mask=None):
     if isinstance(inputs, list):
         q, k, v = inputs
     else:
         q = k = v = inputs
     if isinstance(mask, list):
         q_mask, k_mask, v_mask = mask
     else:
         q_mask = k_mask = v_mask = mask
     feature_dim = K.shape(v)[-1]
     head_dim = feature_dim // self.head_num
     q = K.dot(q, self.Wq)
     k = K.dot(k, self.Wk)
     v = K.dot(v, self.Wv)
     if self.use_bias:
         q += self.bq
         k += self.bk
         v += self.bv
     if self.activation is not None:
         q = self.activation(q)
         k = self.activation(k)
         v = self.activation(v)
     outputs = []
     for i in range(self.head_num):
         begin, end = i * head_dim, (i + 1) * head_dim
         outputs.append(
             ScaledDotProductAttention(
                 history_only=self.history_only,
                 name='%s-Att-%d' % (self.name, i + 1),
             )(
                 inputs=[
                     q[:, :, begin:end],
                     k[:, :, begin:end],
                     v[:, :, begin:end],
                 ],
                 mask=[q_mask, k_mask, v_mask],
             ))
     y = K.dot(K.concatenate(outputs), self.Wo)
     if self.use_bias:
         y += self.bo
     if self.activation is not None:
         y = self.activation(y)
     return y
Example #8
 def call(self, inputs, mask=None):
     if isinstance(inputs, list):
         q, k, v = inputs
     else:
         q = k = v = inputs
     if isinstance(mask, list):
         q_mask, k_mask, v_mask = mask
     else:
         q_mask = k_mask = v_mask = mask
     q = K.dot(q, self.Wq)
     k = K.dot(k, self.Wk)
     v = K.dot(v, self.Wv)
     if self.use_bias:
         q += self.bq
         k += self.bk
         v += self.bv
     if self.activation is not None:
         q = self.activation(q)
         k = self.activation(k)
         v = self.activation(v)
     y = ScaledDotProductAttention(
         history_only=self.history_only,
         name='%s-Attention' % self.name,
     )(
         inputs=[
             self._reshape_to_batches(q, self.head_num),
             self._reshape_to_batches(k, self.head_num),
             self._reshape_to_batches(v, self.head_num),
         ],
         mask=[
             self._reshape_mask(q_mask, self.head_num),
             self._reshape_mask(k_mask, self.head_num),
             self._reshape_mask(v_mask, self.head_num),
         ],
     )
     y = self._reshape_from_batches(y, self.head_num)
     y = K.dot(y, self.Wo)
     if self.use_bias:
         y += self.bo
     if self.activation is not None:
         y = self.activation(y)
     y = K.reshape(y, (-1, 512, 768))
     return y
Example #9
def build_model(max_length,
                loaded_model=None,
                fine_tune_model=False,
                embedding_matrix=None,
                transformer_depth=8,
                transformer_heads=8,
                l2_penalty=None,
                embedding_dropout=0.6,
                transformer_dropout=0.1,
                classifier_dropout=0.1,
                transformer_output_handling="flatten",
                print_info=False,
                train_lm=True):

    original_model = None
    if loaded_model:
        # load the specified model
        original_model = load_model(loaded_model,
                                    custom_objects={
                                        "perplexity":
                                        perplexity,
                                        "lm_accuracy":
                                        lm_accuracy,
                                        "SeqSelfAttention":
                                        SeqSelfAttention,
                                        "ScaledDotProductAttention":
                                        ScaledDotProductAttention
                                    })

    # regularizer for embedding layer
    l2_regularizer = l2(l2_penalty) if l2_penalty else None

    # input encoded as integers
    raw_input = Input(shape=(max_length, ), name="input")

    # embedding layer, initialised with embedding matrix weights for now
    embedding_weights = [
        original_model.get_layer(name="word_embedding").get_weights()[0]
        if loaded_model else embedding_matrix
    ]
    embedding_layer = ReusableEmbedding(
        input_dim=(embedding_matrix[0] if type(embedding_matrix) == tuple else
                   embedding_matrix.shape[0]),
        output_dim=(embedding_matrix[1] if type(embedding_matrix) == tuple else
                    embedding_matrix.shape[1]),
        input_length=max_length,
        name="word_embedding",
        weights=(None if type(embedding_matrix) == tuple and not loaded_model
                 else embedding_weights),
        embeddings_regularizer=l2_regularizer)

    # "transpose" of embedding matrix to map back to vocabulary
    if loaded_model:
        output_weights = original_model.get_layer(
            name="word_prediction_logits").get_weights()
        output_layer = TiedOutputEmbedding(
            projection_regularizer=l2_regularizer,
            projection_dropout=embedding_dropout,
            name="word_prediction_logits",
            weights=output_weights)
    else:
        output_layer = TiedOutputEmbedding(
            projection_regularizer=l2_regularizer,
            projection_dropout=embedding_dropout,
            name="word_prediction_logits")

    # transformer as taken from here: https://github.com/kpot/keras-transformer/blob/master/example/models.py
    if loaded_model:
        position_weights = original_model.get_layer(
            name="position_embedding").get_weights()
        position_embedding = TransformerCoordinateEmbedding(
            max_transformer_depth=1,
            name="position_embedding",
            weights=position_weights)
    else:
        position_embedding = TransformerCoordinateEmbedding(
            max_transformer_depth=1, name="position_embedding")

    transformer_input, embedding_matrix = embedding_layer(raw_input)
    transformer_output = position_embedding(transformer_input, step=0)
    for i in range(transformer_depth):
        block_name = "transformer" + str(i)

        # define transformer block
        transformer_block = TransformerBlock(
            name=block_name,
            num_heads=transformer_heads,
            residual_dropout=transformer_dropout,
            attention_dropout=transformer_dropout,
            use_masking=True,
            vanilla_wiring=True)

        # build the layers in the block because apparently you have to do that
        if loaded_model:
            if i == 0:
                transformer_block.attention_layer.build(
                    original_model.get_layer(
                        "position_embedding").output_shape)
            else:
                transformer_block.attention_layer.build(
                    original_model.get_layer(
                        "transformer{}_normalization2".format(i -
                                                              1)).output_shape)
            transformer_block.norm1_layer.build(
                original_model.get_layer(block_name +
                                         "_self_attention").output_shape)
            transformer_block.norm2_layer.build(
                original_model.get_layer(block_name +
                                         "_normalization1").output_shape)
            transformer_block.transition_layer.build(
                original_model.get_layer(block_name +
                                         "_normalization1").output_shape)

            # set weights for all the contained layers manually
            transformer_block.attention_layer.set_weights(
                original_model.get_layer(
                    name=(block_name + "_self_attention")).get_weights())
            transformer_block.norm1_layer.set_weights(
                original_model.get_layer(
                    name=(block_name + "_normalization1")).get_weights())
            transformer_block.norm2_layer.set_weights(
                original_model.get_layer(
                    name=(block_name + "_normalization2")).get_weights())
            transformer_block.transition_layer.set_weights(
                original_model.get_layer(name=(block_name +
                                               "_transition")).get_weights())

        # pass output of last layer through transformer
        transformer_output = transformer_block(transformer_output)

    if print_info:
        logger.debug("transformer_output shape: {}".format(
            K.int_shape(transformer_output[0]
                        if fine_tune_model else transformer_output)))

    # nothing special to load for softmax
    softmax_layer = Softmax(name="word_predictions")
    lm_output_logits = output_layer([transformer_output, embedding_matrix])
    lm_output = softmax_layer(lm_output_logits)
    if print_info:
        logger.debug("lm_output_logits shape: {}".format(
            K.int_shape(lm_output_logits)))
        logger.debug("output shape: {}".format(K.int_shape(lm_output)))

    if not fine_tune_model:
        m = Model(inputs=raw_input, outputs=lm_output)
        return m

    loaded_layer_names = []
    if loaded_model:
        loaded_layer_names = [layer.name for layer in original_model.layers]

    # for concatenation transformer outputs early
    flatten = Flatten(name="flatten_transformer_output")
    max_pooling = Lambda(lambda x: K.max(x, axis=1), name="max_pooling")
    mean_pooling = Lambda(lambda x: K.mean(x, axis=1), name="mean_pooling")
    self_attention = SeqSelfAttention(name="self_attention")
    scaled_dot_attention = ScaledDotProductAttention(
        name="scaled_dot_attention")
    dropout = Dropout(rate=classifier_dropout, name="classifier_dropout")
    options = {
        "flatten": flatten,
        "max_pooling": max_pooling,
        "mean_pooling": mean_pooling,
        "self_attention": self_attention,
        "scaled_dot_attention": scaled_dot_attention
    }

    dense = Dense(2, activation=None, name="dense")
    if loaded_model and "dense" in loaded_layer_names:
        layer = original_model.get_layer(name="dense")
        dense.build(layer.input_shape)
        dense.set_weights(layer.get_weights())

    pooling_layer = options[transformer_output_handling]
    if loaded_model and transformer_output_handling in loaded_layer_names:
        layer = original_model.get_layer(name=transformer_output_handling)
        pooling_layer.build(layer.input_shape)
        pooling_layer.set_weights(layer.get_weights())

    if "attention" in transformer_output_handling:
        handled_output = flatten(pooling_layer(transformer_output))
    else:
        handled_output = pooling_layer(transformer_output)

    classifier_logits = dense(dropout(handled_output))
    classifier_output = Softmax(
        name="classifier_prediction")(classifier_logits)

    if train_lm:
        m = Model(inputs=raw_input, outputs=[lm_output, classifier_output])
    else:
        m = Model(inputs=raw_input, outputs=classifier_output)
    # m = Model(inputs=raw_input, outputs=lm_output)
    return m
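A minimal sketch of invoking the builder above for language-model pre-training, assuming the module-level imports it relies on (numpy as np, plus the keras-transformer layers ReusableEmbedding, TiedOutputEmbedding, TransformerCoordinateEmbedding and TransformerBlock) are available; the embedding matrix is random and only illustrates the expected (vocab_size, embedding_dim) shape.

# Random stand-in for a pre-trained embedding matrix: 10000-word vocabulary, 128-dim vectors.
dummy_embeddings = np.random.normal(size=(10000, 128)).astype("float32")

lm_model = build_model(max_length=64,
                       embedding_matrix=dummy_embeddings,
                       transformer_depth=2,
                       transformer_heads=4,
                       fine_tune_model=False,
                       print_info=False)
lm_model.summary()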