def train_step(self, input_tensor, target):
    encoder_input = input_tensor
    # Teacher forcing: prepend a start value of -1 and drop the last target step.
    decoder_input = tf.concat(
        [tf.fill((input_tensor.shape[0], 1, 1), -1.), target[:, :-1, :]],
        axis=1)
    encoder_padding_mask = create_padding_mask(encoder_input[:, :, 0])
    decoder_padding_mask = create_padding_mask(decoder_input[:, 0])
    look_ahead_mask = create_look_ahead_mask(self.SEQUENCE_LENGTH)
    with tf.GradientTape() as tape:
        direction_pred, regression_pred = self.nn(encoder_input, decoder_input,
                                                  True, encoder_padding_mask,
                                                  look_ahead_mask,
                                                  decoder_padding_mask)
        loss = self.loss_function(target, direction_pred, regression_pred)
    gradients = tape.gradient(loss, self.nn.trainable_variables)
    self.opt.apply_gradients(zip(gradients, self.nn.trainable_variables))
    logits = tf.concat([regression_pred, direction_pred], axis=-1)
    return loss, logits
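# A minimal sketch of the two mask helpers assumed by train_step above, following
# the shape conventions of the TensorFlow transformer tutorial (1.0 marks a padded
# or future position). The project's actual helpers may differ, e.g. in the padding
# value or in the rank of the returned tensor.
import tensorflow as tf


def create_padding_mask(seq):
    # 1.0 where the token equals the padding value 0; shaped (batch, 1, 1, seq_len)
    # so it broadcasts over the attention logits.
    mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]


def create_look_ahead_mask(size):
    # Strictly upper-triangular matrix of 1.0s: position i cannot attend to j > i.
    return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)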
def create_masks(self, inp, tar):
    # Create the look-ahead mask: positions of future tokens are set to 1.
    # If no padding were present, the padding-mask part of the output would be all zeros.
    # The look-ahead and padding masks are then combined.
    # For the NLP transformer the input is (x, y); for this modified time-series
    # transformer the input is (x, y, 1), so the dimensions differ when creating
    # the padding masks and an extra slice is required.

    # Encoder padding mask
    enc_padding_mask = transformer.create_padding_mask(inp)
    enc_padding_mask = enc_padding_mask[:, :, :, :, 0]  # ensure consistent dimension

    # Used in the 2nd attention block in the decoder.
    # This padding mask is used to mask the encoder outputs.
    dec_padding_mask = transformer.create_padding_mask(inp)
    dec_padding_mask = dec_padding_mask[:, :, :, :, 0]  # ensure consistent dimension

    # Used in the 1st attention block in the decoder.
    # It is used to pad and mask future tokens in the input received by the decoder.
    look_ahead_mask = transformer.create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = transformer.create_padding_mask(tar)
    dec_target_padding_mask = dec_target_padding_mask[:, :, :, :, 0]  # ensure consistent dimension
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask
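# Toy illustration (using the tutorial-style helpers sketched earlier, an
# assumption) of how a target padding mask and a look-ahead mask combine via
# tf.maximum, as done for combined_mask above.
tar = tf.constant([[5, 7, 0]])                # last position is padding
tar_pad = create_padding_mask(tar)            # (1, 1, 1, 3)
look_ahead = create_look_ahead_mask(3)        # (3, 3)
combined = tf.maximum(tar_pad, look_ahead)    # broadcasts to (1, 1, 3, 3)
# combined[0, 0] ==
# [[0., 1., 1.],
#  [0., 0., 1.],
#  [0., 0., 1.]]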
def create_model(seq_len, vocab_size, pad_id, N, d_model, d_ff, h, dropout):
    inp = Input((seq_len, ))
    embedding = Embedding(vocab_size, d_model, pad_id)(inp)
    encoding = PositionalEncoding(d_model)(inp)
    net = Add()([embedding, encoding])
    net = Dropout(dropout)(net)
    mask = Lambda(lambda t: create_padding_mask(t, pad_id),
                  name="input_mask")(inp)
    net = Encoder(N=N, d_model=d_model, d_ff=d_ff, h=h, dropout=dropout)([net, mask])
    net = Flatten()(net)
    net = Dense(2, activation="softmax")(net)
    model = Model(inp, net)

    # NOTE: Keras optimizers cannot be saved with optimizer state,
    # so an optimizer from `tf.train` is needed.
    # NOTE: this seems to be a TF 1.x issue; in 2.0 all tf.train optimizers are
    # dropped and the Keras versions are the only implementations.
    # NOTE: this is not recommended for training; the paper authors describe
    # a variable learning-rate schedule that still needs to be implemented.
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.9,
                                       beta2=0.98, epsilon=1e-9)

    model.compile(optimizer=optimizer,
                  loss="categorical_crossentropy",
                  metrics=["acc"])
    return model
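# A hedged usage sketch for create_model; the hyperparameter values below are
# illustrative placeholders, not the project's configuration.
model = create_model(seq_len=128, vocab_size=32000, pad_id=0,
                     N=6, d_model=512, d_ff=2048, h=8, dropout=0.1)
model.summary()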
def eval_step(self, input_tensor, target):
    encoder_input = input_tensor
    # Same teacher-forcing shift as in train_step.
    decoder_input = tf.concat(
        [tf.fill((input_tensor.shape[0], 1, 1), -1.), target[:, :-1, :]],
        axis=1)
    encoder_padding_mask = create_padding_mask(encoder_input[:, :, 0])
    decoder_padding_mask = create_padding_mask(decoder_input[:, 0])
    look_ahead_mask = create_look_ahead_mask(self.SEQUENCE_LENGTH)
    # No decoder padding mask is passed at evaluation time (training=False).
    direction_pred, regression_pred = self.nn(encoder_input, decoder_input,
                                              False, encoder_padding_mask,
                                              look_ahead_mask, None)
    loss = self.loss_function(target, direction_pred, regression_pred)
    logits = tf.concat([regression_pred, direction_pred], axis=-1)
    return loss, logits
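# A minimal training/evaluation loop sketch. `trainer` (an instance of the class
# defining train_step/eval_step above), `train_ds`, and `val_ds` (tf.data.Dataset
# objects yielding (input_tensor, target) batches) are hypothetical placeholders;
# eager TF 2.x execution is assumed for .numpy().
import numpy as np

EPOCHS = 10  # illustrative value
for epoch in range(EPOCHS):
    train_losses, val_losses = [], []
    for input_tensor, target in train_ds:
        loss, _ = trainer.train_step(input_tensor, target)
        train_losses.append(loss.numpy())
    for input_tensor, target in val_ds:
        loss, _ = trainer.eval_step(input_tensor, target)
        val_losses.append(loss.numpy())
    print(f"epoch {epoch}: train loss {np.mean(train_losses):.4f}, "
          f"val loss {np.mean(val_losses):.4f}")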
def test_create_padding_mask(self):
    """
    https://github.com/lilianweng/transformer-tensorflow/blob/master/transformer_test.py#L56
    """
    with self.test_session() as sess:
        mask = sess.run(create_padding_mask(self.raw_input, 0),
                        feed_dict={self.raw_input: self.fake_data})
        expected = np.array([
            [[1., 1., 1., 1., 1.]] * self.seq_len,
            [[1., 1., 0., 0., 0.]] * self.seq_len,
            [[1., 1., 1., 1., 0.]] * self.seq_len,
            [[1., 1., 1., 0., 0.]] * self.seq_len,
        ])
        np.testing.assert_array_equal(mask, expected)
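# A sketch of the create_padding_mask variant this test expects (the convention in
# lilianweng/transformer-tensorflow): 1.0 for real tokens, 0.0 for pad_id, tiled to
# (batch, seq_len, seq_len) so every query row shares the same key mask. This is
# inferred from the expected array above, not copied from the project.
def create_padding_mask(inp, pad_id=0):
    valid = tf.cast(tf.not_equal(inp, pad_id), tf.float32)    # (batch, seq_len)
    return tf.tile(tf.expand_dims(valid, 1),
                   [1, tf.shape(inp)[1], 1])                  # (batch, seq_len, seq_len)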
img_shape = (224, 224, 3)
imgL = Input(shape=img_shape, name="imgL", dtype="float32")
imgR = Input(shape=img_shape, name="imgR", dtype="float32")
sent = Input(shape=(40, ), name="sent", dtype="int32")

# embedding images
fcnn = ResnetV1_FCNN(img_shape, 20)
em_imgL = fcnn(imgL)
em_imgR = fcnn(imgR)
em_imgs = tf.keras.layers.Concatenate(axis=2)([em_imgL, em_imgR])

# embedding sentence
print("creating transformer encoder")
GloVe_embeddings = np.load("word_embeddings/embedding.npy")
print(GloVe_embeddings.shape)
enc_mask = create_padding_mask(sent)
encoder = Encoder(
    num_layers=4,
    d_model=300,  # also the word embedding dim
    num_heads=12,
    dff=512,
    input_vocab_size=GloVe_embeddings.shape[0],
    embeddings_initializer=Constant(GloVe_embeddings),
)
em_sent = encoder(sent, training=True, mask=enc_mask)

# getting prediction from the Relational Neural Network
print("creating relational network")
relation_matrix = RelationalProduct()([em_sent, em_imgs])
g = ConvolutionalPerceptron(relation_matrix.shape[1:], [256, 256])
em_relations = g(relation_matrix)
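# A hedged sketch of a possible classification head over em_relations: Relation
# Network style setups usually aggregate the relation features and pass them
# through a small MLP. The layer sizes, output shape, and Model wiring below are
# assumptions, not the project's code.
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Model

net = Flatten()(em_relations)
net = Dense(256, activation="relu")(net)
pred = Dense(1, activation="sigmoid", name="pred")(net)
model = Model(inputs=[imgL, imgR, sent], outputs=pred)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])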
def create_input_mask(input):
    # Encoder padding mask
    enc_padding_mask = create_padding_mask(input)
    return enc_padding_mask
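# Hedged usage sketch: with a tutorial-style create_padding_mask (pad id 0, as
# assumed in the sketch near the top), the encoder padding mask for a toy batch:
example = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0]])
print(create_input_mask(example))
# -> shape (2, 1, 1, 5), with 1.0 at the padded positions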
def greedy_decode(self, examples, is_train=False, tgt_seq_len=None):
    # at each step, decode with whole output prefix
    src_token_ids = examples["src_token_ids"]
    if not tgt_seq_len:
        tgt_seq_len = self.tgt_seq_len
    enc_padding_mask = create_padding_mask(
        src_token_ids, self.src_vocab.token2idx[self.src_vocab.PAD])
    dec_padding_mask = create_padding_mask(
        src_token_ids, self.src_vocab.token2idx[self.src_vocab.PAD])
    # (batch_size, inp_seq_len, d_model)
    enc_output = self.encoder(src_token_ids, is_train, enc_padding_mask)
    batch_size = tf.shape(enc_output)[0]
    start_token = tf.reshape(
        tf.cast(tf.repeat(self.tgt_vocab.token2idx[self.tgt_vocab.BOS],
                          repeats=batch_size),
                dtype=tf.int64), [-1, 1])
    tgt_inputs = start_token
    tgt_edges = tf.zeros([batch_size, 1, tgt_seq_len], dtype=tf.int64) - 1
    start_token_onehot = tf.one_hot(start_token,
                                    depth=(self.tgt_vocab_size + self.src_seq_len))
    start_token_logits = start_token_onehot + (start_token_onehot - 1) * 1e9
    output = [start_token_logits]
    edge_output = []
    mem = {}
    for t in range(1, tgt_seq_len):
        look_ahead_mask = create_look_ahead_mask(t)[tf.newaxis, tf.newaxis, :, :]
        # dec_output.shape == (batch_sz, t, tgt_vocab_size+src_seq_len)
        dec_output, _, edge_scores = self.decoder(tgt_inputs,
                                                  enc_output,
                                                  is_train,
                                                  look_ahead_mask,
                                                  dec_padding_mask,
                                                  mem=mem,
                                                  tgt_edges=tgt_edges)
        # (batch_sz, tgt_vocab_size+src_seq_len)
        last_step_output = dec_output[:, -1, :]
        last_step_output_idx = tf.expand_dims(tf.argmax(last_step_output, axis=1),
                                              axis=-1)
        tgt_inputs = tf.concat([tgt_inputs, last_step_output_idx], axis=-1)
        last_step_score = tf.expand_dims(edge_scores[:, -1, :], 1)
        last_step_score_idx = tf.cast(last_step_score > 0, tf.int64)
        pad = tf.zeros([batch_size, 1, tgt_seq_len - t], dtype=tf.int64)
        last_step_score_idx = tf.concat([last_step_score_idx, pad], axis=-1)
        tgt_edges = tf.concat([tgt_edges, last_step_score_idx], axis=1)
        # (batch_sz, t+1)
        output.append(dec_output)
        edge_output.append(
            tf.concat([
                edge_scores,
                tf.fill([batch_size, 1, tgt_seq_len - t], -1e9)
            ], axis=2))
    dec_output = tf.concat(output, axis=1)
    edge_output.append(tf.fill([batch_size, 1, tgt_seq_len], -1e9))
    edge_output = tf.concat(edge_output, axis=1)
    return dec_output, edge_output
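# Hedged usage sketch for greedy_decode; `model` (an instance of the class above)
# and `batch` (a dict holding "src_token_ids") are hypothetical placeholders, as is
# reusing the > 0 threshold from the loop body for edge predictions.
dec_output, edge_output = model.greedy_decode(batch, is_train=False)
predicted_token_ids = tf.argmax(dec_output, axis=-1)    # per-position token ids
predicted_edges = tf.cast(edge_output > 0, tf.int64)    # binary edge decisions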