Example #1
    def train_step(self, input_tensor, target):
        encoder_input = input_tensor
        decoder_input = tf.concat(
            [tf.fill((input_tensor.shape[0], 1, 1), -1.), target[:, :-1, :]],
            axis=1)

        encoder_padding_mask = create_padding_mask(encoder_input[:, :, 0])
        decoder_padding_mask = create_padding_mask(decoder_input[:, :, 0])

        look_ahead_mask = create_look_ahead_mask(self.SEQUENCE_LENGTH)

        with tf.GradientTape() as tape:
            direction_pred, regression_pred = self.nn(encoder_input,
                                                      decoder_input, True,
                                                      encoder_padding_mask,
                                                      look_ahead_mask,
                                                      decoder_padding_mask)

            loss = self.loss_function(target, direction_pred, regression_pred)

        gradients = tape.gradient(loss, self.nn.trainable_variables)
        self.opt.apply_gradients(zip(gradients, self.nn.trainable_variables))

        logits = tf.concat([regression_pred, direction_pred], axis=-1)

        return loss, logits
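
The create_padding_mask and create_look_ahead_mask helpers used above are not shown in this example. A minimal sketch of what they are assumed to look like, modeled on the standard TensorFlow transformer tutorial (the pad value of 0 is an assumption; this snippet's time-series data may use a different convention):

import tensorflow as tf

def create_padding_mask(seq):
    # 1.0 at padded positions (value == 0); broadcastable over attention logits
    mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]  # (batch, 1, 1, seq_len)

def create_look_ahead_mask(size):
    # upper triangle of 1s: position i must not attend to positions j > i
    return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)  # (size, size)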
Example #2

    def create_masks(self, inp, tar):

        # Create the look-ahead mask: future-token positions are set to 1.
        # When no padding is present, the padding mask is all zeros, so
        # combining it with the look-ahead mask leaves the latter unchanged.
        # For an NLP transformer the input has shape (x, y); for this modified
        # time-series transformer it is (x, y, 1), so the padding mask gains
        # an extra dimension that has to be sliced away below.

        # Encoder padding mask
        enc_padding_mask = transformer.create_padding_mask(inp)
        enc_padding_mask = enc_padding_mask[:, :, :, :, 0]  # ensure consistent dimension

        # Used in the 2nd attention block in the decoder.
        # This padding mask is used to mask the encoder outputs.
        dec_padding_mask = transformer.create_padding_mask(inp)
        dec_padding_mask = dec_padding_mask[:, :, :, :, 0]  # ensure consistent dimension

        # Used in the 1st attention block in the decoder.
        # It is used to pad and mask future tokens in the input received by the decoder.
        look_ahead_mask = transformer.create_look_ahead_mask(tf.shape(tar)[1])
        dec_target_padding_mask = transformer.create_padding_mask(tar)
        dec_target_padding_mask = dec_target_padding_mask[:, :, :, :, 0]  # ensure consistent dimension

        combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

        return enc_padding_mask, combined_mask, dec_padding_mask
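
As a toy illustration of the tf.maximum combination at the end of create_masks (using the tutorial-style helpers sketched under Example #1; this repo's masks have an extra dimension before slicing, but the combination step is the same):

tar = tf.constant([[7, 6, 0]])          # one target sequence; last token is padding
look_ahead = create_look_ahead_mask(3)  # (3, 3), 1s above the diagonal
pad = create_padding_mask(tar)          # (1, 1, 1, 3), 1 at the padded position
combined = tf.maximum(pad, look_ahead)  # (1, 1, 3, 3): masked if future OR padding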
Example #3
def create_model(seq_len, vocab_size, pad_id, N, d_model, d_ff, h, dropout):
    inp = Input((seq_len, ))
    embedding = Embedding(vocab_size, d_model, pad_id)(inp)
    encoding = PositionalEncoding(d_model)(inp)
    net = Add()([embedding, encoding])
    net = Dropout(dropout)(net)
    mask = Lambda(lambda t: create_padding_mask(t, pad_id),
                  name="input_mask")(inp)
    net = Encoder(N=N, d_model=d_model, d_ff=d_ff, h=h,
                  dropout=dropout)([net, mask])
    net = Flatten()(net)
    net = Dense(2, activation="softmax")(net)

    model = Model(inp, net)

    # NOTE: Keras optimizers cannot be saved together with their optimizer
    # state; an optimizer from `tf.train` has to be used instead.
    # NOTE: this seems to be a TF 1.x quirk; in 2.0 the `tf.train` optimizers
    # are dropped and the Keras versions are the only implementations.
    # NOTE: this fixed rate is not recommended for training; the paper's
    # authors describe a variable learning-rate schedule that still needs to
    # be implemented (see the sketch after this example).
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001,
                                       beta1=0.9,
                                       beta2=0.98,
                                       epsilon=1e-9)

    model.compile(optimizer=optimizer,
                  loss="categorical_crossentropy",
                  metrics=["acc"])

    return model
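
The last NOTE refers to the schedule from "Attention Is All You Need": lrate = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5). A minimal sketch of it as a plain function (the defaults are the paper's base configuration; wiring it into the tf.train optimizer is left out):

def transformer_learning_rate(step, d_model=512, warmup_steps=4000):
    # linear warmup for warmup_steps steps, then inverse-square-root decay
    step = max(step, 1)
    return d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)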
Example #4
    def eval_step(self, input_tensor, target):
        encoder_input = input_tensor
        decoder_input = tf.concat(
            [tf.fill((input_tensor.shape[0], 1, 1), -1.), target[:, :-1, :]],
            axis=1)

        encoder_padding_mask = create_padding_mask(encoder_input[:, :, 0])
        decoder_padding_mask = create_padding_mask(decoder_input[:, :, 0])

        look_ahead_mask = create_look_ahead_mask(self.SEQUENCE_LENGTH)

        direction_pred, regression_pred = self.nn(encoder_input, decoder_input,
                                                  False, encoder_padding_mask,
                                                  look_ahead_mask, None)

        loss = self.loss_function(target, direction_pred, regression_pred)

        logits = tf.concat([regression_pred, direction_pred], axis=-1)

        return loss, logits
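
A hypothetical evaluation loop around eval_step; the names model and eval_dataset are illustrative and not from the original, and eval_dataset is assumed to yield (input_tensor, target) batches:

losses = []
for input_tensor, target in eval_dataset:
    loss, logits = model.eval_step(input_tensor, target)
    losses.append(float(loss))
print("mean eval loss:", sum(losses) / len(losses))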
Example #5
 def test_create_padding_mask(self):
     """
     https://github.com/lilianweng/transformer-tensorflow/blob/master/transformer_test.py#L56
     """
     with self.test_session() as sess:
         mask = sess.run(create_padding_mask(self.raw_input, 0),
                         feed_dict={self.raw_input: self.fake_data})
         expected = np.array([
             [[1., 1., 1., 1., 1.]] * self.seq_len,
             [[1., 1., 0., 0., 0.]] * self.seq_len,
             [[1., 1., 1., 1., 0.]] * self.seq_len,
             [[1., 1., 1., 0., 0.]] * self.seq_len,
         ])
         np.testing.assert_array_equal(mask, expected)
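
For context, a hypothetical setUp consistent with the expected mask above (this repo's convention is 1.0 = keep, 0.0 = pad, tiled across the seq_len query positions); the actual fixture in the linked test file may differ:

 def setUp(self):
     self.seq_len = 5
     self.raw_input = tf.placeholder(tf.int32, shape=[None, self.seq_len])
     # zeros mark padding; the first row has none, matching the all-ones row
     self.fake_data = np.array([
         [1, 2, 3, 4, 5],
         [1, 2, 0, 0, 0],
         [1, 2, 3, 4, 0],
         [1, 2, 3, 0, 0],
     ])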
Example #6
img_shape = (224, 224, 3)
imgL = Input(shape=img_shape, name="imgL", dtype="float32")
imgR = Input(shape=img_shape, name="imgR", dtype="float32")
sent = Input(shape=(40, ), name="sent", dtype="int32")

# embedding images
fcnn = ResnetV1_FCNN(img_shape, 20)
em_imgL = fcnn(imgL)
em_imgR = fcnn(imgR)
em_imgs = tf.keras.layers.Concatenate(axis=2)([em_imgL, em_imgR])

# embedding sentence
print("creating transformer encoder")
GloVe_embeddings = np.load("word_embeddings/embedding.npy")
print(GloVe_embeddings.shape)
enc_mask = create_padding_mask(sent)
encoder = Encoder(
    num_layers=4,
    d_model=300,  # also the word embedding dim
    num_heads=12,
    dff=512,
    input_vocab_size=GloVe_embeddings.shape[0],
    embeddings_initializer=Constant(GloVe_embeddings),
)
em_sent = encoder(sent, training=True, mask=enc_mask)

# getting prediction from the Relational Neural Network
print("creating relational network")
relation_matrix = RelationalProduct()([em_sent, em_imgs])
g = ConvolutionalPerceptron(relation_matrix.shape[1:], [256, 256])
em_relations = g(relation_matrix)
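
The snippet stops at the relational embedding. A hypothetical head turning em_relations into a prediction (the layer sizes and two-way softmax are illustrative; the original model's head is not shown):

# hypothetical answer head on top of the relational features
f = tf.keras.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(256, activation="relu"),
    tf.keras.layers.Dense(2, activation="softmax"),
])
pred = f(em_relations)
model = tf.keras.Model(inputs=[imgL, imgR, sent], outputs=pred)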
Example #7

def create_input_mask(input):
    # Encoder padding mask
    enc_padding_mask = create_padding_mask(input)
    return enc_padding_mask
Example #8

    def greedy_decode(self, examples, is_train=False, tgt_seq_len=None):
        # at each step, decode with whole output prefix
        src_token_ids = examples["src_token_ids"]

        if not tgt_seq_len:
            tgt_seq_len = self.tgt_seq_len

        enc_padding_mask = create_padding_mask(
            src_token_ids, self.src_vocab.token2idx[self.src_vocab.PAD])
        dec_padding_mask = create_padding_mask(
            src_token_ids, self.src_vocab.token2idx[self.src_vocab.PAD])
        # (batch_size, inp_seq_len, d_model)
        enc_output = self.encoder(src_token_ids, is_train, enc_padding_mask)
        batch_size = tf.shape(enc_output)[0]
        start_token = tf.reshape(
            tf.cast(tf.repeat(self.tgt_vocab.token2idx[self.tgt_vocab.BOS],
                              repeats=batch_size),
                    dtype=tf.int64), [-1, 1])
        tgt_inputs = start_token
        tgt_edges = tf.zeros([batch_size, 1, tgt_seq_len], dtype=tf.int64) - 1

        start_token_onehot = tf.one_hot(start_token,
                                        depth=(self.tgt_vocab_size +
                                               self.src_seq_len))
        start_token_logits = start_token_onehot + (start_token_onehot -
                                                   1) * 1e9
        output = [start_token_logits]
        edge_output = []

        mem = {}

        for t in range(1, tgt_seq_len):
            look_ahead_mask = create_look_ahead_mask(t)[tf.newaxis,
                                                        tf.newaxis, :, :]
            # dec_output.shape == (batch_sz, t, tgt_vocab_size+src_seq_len)
            dec_output, _, edge_scores = self.decoder(tgt_inputs,
                                                      enc_output,
                                                      is_train,
                                                      look_ahead_mask,
                                                      dec_padding_mask,
                                                      mem=mem,
                                                      tgt_edges=tgt_edges)

            # (batch_sz, tgt_vocab_size+src_seq_len)
            last_step_output = dec_output[:, -1, :]
            last_step_output_idx = tf.expand_dims(tf.argmax(last_step_output,
                                                            axis=1),
                                                  axis=-1)
            tgt_inputs = tf.concat([tgt_inputs, last_step_output_idx], axis=-1)

            last_step_score = tf.expand_dims(edge_scores[:, -1, :], 1)
            last_step_score_idx = tf.cast(last_step_score > 0, tf.int64)
            pad = tf.zeros([batch_size, 1, tgt_seq_len - t], dtype=tf.int64)
            last_step_score_idx = tf.concat([last_step_score_idx, pad],
                                            axis=-1)
            tgt_edges = tf.concat([tgt_edges, last_step_score_idx], axis=1)

            # (batch_sz, t+1)
            output.append(dec_output)

            edge_output.append(
                tf.concat([
                    edge_scores,
                    tf.fill([batch_size, 1, tgt_seq_len - t], -1e9)
                ],
                          axis=2))

        dec_output = tf.concat(output, axis=1)
        edge_output.append(tf.fill([batch_size, 1, tgt_seq_len], -1e9))
        edge_output = tf.concat(edge_output, axis=1)
        return dec_output, edge_output
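
A hypothetical call to greedy_decode, assuming a trained model instance and a batch of source token ids (all names here are illustrative):

examples = {"src_token_ids": src_token_ids}  # (batch_size, src_seq_len), int ids
dec_output, edge_output = model.greedy_decode(examples, is_train=False)
predicted_ids = tf.argmax(dec_output, axis=-1)  # greedy token choice per step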