def get_vectorize_layer(texts, vocab_size, max_seq, special_tokens=["[MASK]"]):
    """Build Text vectorization layer.

    Args:
        texts (list): List of strings, i.e. input texts.
        vocab_size (int): Vocabulary size.
        max_seq (int): Maximum sequence length.
        special_tokens (list, optional): List of special tokens. Defaults to `["[MASK]"]`.

    Returns:
        layers.Layer: The adapted `TextVectorization` Keras layer.
    """
    vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        standardize=custom_standardization,
        output_sequence_length=max_seq,
    )
    vectorize_layer.adapt(texts)

    # Insert the mask token into the vocabulary
    vocab = vectorize_layer.get_vocabulary()
    vocab = vocab[2 : vocab_size - len(special_tokens)] + ["[mask]"]
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer
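# A usage sketch (assuming eager execution; `texts`, `VOCAB_SIZE` and `MAX_LEN` are
# illustrative placeholders, not names defined above):
vectorize_layer = get_vectorize_layer(texts, VOCAB_SIZE, MAX_LEN, special_tokens=["[mask]"])

# Look up the id of the "[mask]" token, which is needed when building masked inputs
mask_token_id = vectorize_layer(["[mask]"]).numpy()[0][0]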
caption_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE), loss=cross_entropy
)

# Fit the model
caption_model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=valid_dataset,
    callbacks=[early_stopping],
)

"""
## Check sample predictions
"""

vocab = vectorization.get_vocabulary()
index_lookup = dict(zip(range(len(vocab)), vocab))
max_decoded_sentence_length = SEQ_LENGTH - 1
valid_images = list(valid_data.keys())


def generate_caption():
    # Select a random image from the validation dataset
    sample_img = np.random.choice(valid_images)

    # Read the image from the disk
    sample_img = read_image(sample_img)
    img = sample_img.numpy().astype(np.uint8)
    plt.imshow(img)
    plt.show()
transformer.summary()
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(train_ds, epochs=epochs, validation_data=val_ds)

"""
## Decoding test sentences

Finally, let's demonstrate how to translate brand new English sentences.
We simply feed the vectorized English sentence into the model together with the
target token `"[start]"`, then repeatedly generate the next token until we hit
the token `"[end]"`.
"""

spa_vocab = spa_vectorization.get_vocabulary()
spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))
max_decoded_sentence_length = 20


def decode_sequence(input_sentence):
    tokenized_input_sentence = eng_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = spa_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer(
            [tokenized_input_sentence, tokenized_target_sentence]
        )
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = spa_index_lookup[sampled_token_index]
Our layer will only consider the top 20,000 words, and will truncate or pad sequences
to be exactly 200 tokens long.
"""

from tensorflow.keras.layers import TextVectorization

vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
vectorizer.adapt(text_ds)

"""
You can retrieve the computed vocabulary via `vectorizer.get_vocabulary()`.
Let's print the top 5 words:
"""

vectorizer.get_vocabulary()[:5]

"""
Let's vectorize a test sentence:
"""

output = vectorizer([["the cat sat on the mat"]])
output.numpy()[0, :6]

"""
As you can see, "the" gets represented as "2". Why not 0, given that "the" was the
first word in the vocabulary? That's because index 0 is reserved for padding and
index 1 is reserved for "out of vocabulary" tokens.

Here's a dict mapping words to their indices:
"""

voc = vectorizer.get_vocabulary()
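"""
As a minimal sketch, one way to build such a mapping (the `word_index` name below is
just our choice) is to zip the vocabulary with its positions:
"""

word_index = dict(zip(voc, range(len(voc))))

"""
With this, `word_index["the"]` gives back 2, matching the vectorized output above.
"""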
""" Remove html line-break tags and handle punctuation """ lowercased = tf.strings.lower(input_string) stripped_html = tf.strings.regex_replace(lowercased, "<br />", " ") return tf.strings.regex_replace(stripped_html, f"([{string.punctuation}])", r" \1") # Create a vectorization layer and adapt it to the text vectorize_layer = TextVectorization( standardize=custom_standardization, max_tokens=vocab_size - 1, output_mode="int", output_sequence_length=maxlen + 1, ) vectorize_layer.adapt(text_ds) vocab = vectorize_layer.get_vocabulary( ) # To get words back from token indices def prepare_lm_inputs_labels(text): """ Shift word sequences by 1 position so that the target for position (i) is word at position (i+1). The model will use all words up till position (i) to predict the next word. """ text = tf.expand_dims(text, -1) tokenized_sentences = vectorize_layer(text) x = tokenized_sentences[:, :-1] y = tokenized_sentences[:, 1:] return x, y