Example #1
def build(self, hp, inputs=None):
    input_node = nest.flatten(inputs)[0]
    if self.ngrams is not None:
        ngrams = self.ngrams
    else:
        ngrams = hp.Int("ngrams", min_value=1, max_value=2, default=2)
    return layers.TextVectorization(
        max_tokens=self.max_tokens,
        ngrams=ngrams,
        output_mode="tf-idf",
        pad_to_max_tokens=True,
    )(input_node)
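
For reference, a minimal standalone sketch of the layer this block builds, outside any hyperparameter search and with a toy corpus (the corpus is an assumption, not part of the original block):

import tensorflow as tf
from tensorflow.keras import layers

corpus = tf.constant(["the cat sat", "the dog sat", "the cat ran"])  # toy data
vectorizer = layers.TextVectorization(max_tokens=20, ngrams=2, output_mode="tf_idf")
vectorizer.adapt(corpus)          # learns the vocabulary and IDF weights
print(vectorizer(corpus).shape)   # (3, vocab_size): one TF-IDF vector per sentence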
Example #2
def build(self, hp, inputs=None):
    input_node = nest.flatten(inputs)[0]
    if self.output_sequence_length is not None:
        output_sequence_length = self.output_sequence_length
    else:
        output_sequence_length = hp.Choice(
            "output_sequence_length", [64, 128, 256, 512], default=64
        )
    output_node = layers.TextVectorization(
        max_tokens=self.max_tokens,
        output_mode="int",
        output_sequence_length=output_sequence_length,
    )(input_node)
    return output_node
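
The same pattern with `output_mode="int"`: the built layer maps each sentence to a fixed-length sequence of token ids, padding with zeros or truncating to `output_sequence_length`. A small illustrative sketch with assumed toy data:

import tensorflow as tf
from tensorflow.keras import layers

vectorizer = layers.TextVectorization(max_tokens=100, output_mode="int", output_sequence_length=8)
vectorizer.adapt(tf.constant(["the quick brown fox", "jumps over the lazy dog"]))
print(vectorizer(tf.constant(["the fox jumps"])))  # shape (1, 8): zero-padded token ids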
Example #3
def get_e2e():
    biLSTM = get_biLSTM()
    biLSTM.load_weights('./weights/biLSTM')
    vectorizer = layers.TextVectorization(max_tokens=20000,
                                          output_sequence_length=150,
                                          standardize=None)
    vectorizer_weights = pickle.load(open("./weights/vectorizer.pkl",
                                          "rb"))['weights']
    vectorizer.set_weights(vectorizer_weights)

    inp = keras.Input(shape=(1, ), dtype="string")
    x = vectorizer(inp)
    out = biLSTM(x)
    e2e = keras.Model(inp, out)
    e2e.compile(loss="binary_crossentropy", optimizer='adam', metrics=["acc"])

    return e2e
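
This end-to-end model assumes `vectorizer.pkl` was written from a layer with the same configuration. A minimal sketch of how such a pickle could have been produced (the corpus is a placeholder, not from the original code):

import pickle
import tensorflow as tf
from tensorflow.keras import layers

train_texts = tf.constant(["first training sentence", "second training sentence"])  # placeholder corpus
vectorizer = layers.TextVectorization(max_tokens=20000, output_sequence_length=150, standardize=None)
vectorizer.adapt(train_texts)  # build the vocabulary from the raw texts
with open("./weights/vectorizer.pkl", "wb") as f:  # assumes the ./weights/ directory exists
    pickle.dump({"weights": vectorizer.get_weights()}, f)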
Example #4
def main():
    # Defining hyperparameters.
    VOCAB_SIZE = 8192
    MAX_SAMPLES = 50000
    BUFFER_SIZE = 20000
    MAX_LENGTH = 40
    EMBED_DIM = 256
    LATENT_DIM = 512
    NUM_HEADS = 8
    BATCH_SIZE = 64

    # Loading the data.
    # Parse the movie conversations into questions and answers sets.
    path_to_zip = keras.utils.get_file(
        "cornell_movie_dialogs.zip",
        origin=
        "http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip",
        extract=True,
    )

    path_to_dataset = os.path.join(os.path.dirname(path_to_zip),
                                   "cornell movie-dialogs corpus")
    path_to_movie_lines = os.path.join(path_to_dataset, "movie_lines.txt")
    path_to_movie_conversions = os.path.join(path_to_dataset,
                                             "movie_conversations.txt")

    def load_conversations():
        # Helper function for loading the conversation splits.
        id2line = {}
        with open(path_to_movie_lines, errors='ignore') as file:
            lines = file.readlines()
        for line in lines:
            parts = line.replace("\n", "").split(" +++$+++ ")
            id2line[parts[0]] = parts[4]

        inputs, outputs = [], []
        with open(path_to_movie_conversions, "r") as file:
            lines = file.readlines()
        for line in lines:
            parts = line.replace("\n", "").split(" +++$+++ ")

            # Get conversation in a list of the ID.
            conversation = [line[1:-1] for line in parts[3][1:-1].split(", ")]
            for i in range(len(conversation) - 1):
                inputs.append(id2line[conversation[i]])
                outputs.append(id2line[conversation[i + 1]])
                if len(inputs) >= MAX_SAMPLES:
                    return inputs, outputs
        return inputs, outputs

    questions, answers = load_conversations()

    # Splitting training and validation sets.
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (questions[:40000], answers[:40000]))
    val_dataset = tf.data.Dataset.from_tensor_slices(
        (questions[40000:], answers[40000:]))

    # Preprocessing and Tokenization
    def preprocess_text(sentence):
        sentence = tf.strings.lower(sentence)

        # Adding a space between the punctuation and the last word to
        # allow better tokenization.
        sentence = tf.strings.regex_replace(sentence, r"([?.!,])", r" \1 ")

        # Replacing multiple continuous spaces with a single space.
        sentence = tf.strings.regex_replace(sentence, r"\s\s+", " ")

        # Replacing non-English characters with spaces.
        sentence = tf.strings.regex_replace(sentence, r"[^a-z?.!,]+", " ")
        sentence = tf.strings.strip(sentence)
        sentence = tf.strings.join(["[start]", sentence, "[end]"],
                                   separator=" ")
        return sentence
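
    # Illustration: preprocess_text("Hello, how are you?!") yields a string
    # tensor roughly equal to b"[start] hello , how are you ? ! [end]":
    # lowercased, punctuation separated by spaces, other non-alphabetic
    # characters replaced with spaces, and wrapped in the start/end markers.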

    vectorizer = layers.TextVectorization(
        VOCAB_SIZE,
        standardize=preprocess_text,
        output_mode="int",
        output_sequence_length=MAX_LENGTH,
    )

    # Adapt the vectorizer to both the questions and answers. This
    # dataset is batched to parallelize and speed up the process.
    vectorizer.adapt(
        tf.data.Dataset.from_tensor_slices((questions + answers)).batch(128))

    # Tokenizing and padding sentences using TextVectorization
    def vectorize_text(inputs, outputs):
        inputs, outputs = vectorizer(inputs), vectorizer(outputs)

        # One extra padding token to the right to match the output
        # shape.
        outputs = tf.pad(outputs, [[0, 1]])
        return (
            {
                "encoder_inputs": inputs,
                "decoder_inputs": outputs[:-1]
            },
            {
                "outputs": outputs[1:]
            },
        )

    train_dataset = train_dataset.map(vectorize_text,
                                      num_parallel_calls=tf.data.AUTOTUNE)
    val_dataset = val_dataset.map(vectorize_text,
                                  num_parallel_calls=tf.data.AUTOTUNE)

    train_dataset = (
        train_dataset.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(
            tf.data.AUTOTUNE))
    val_dataset = (val_dataset.cache().batch(BATCH_SIZE).prefetch(
        tf.data.AUTOTUNE))

    # Creating the FNet Encoder
    # The FNet paper proposes a replacement for the standard attention
    # mechanism used by the Transformer architecture (Vaswani et al.,
    # 2017).
    # The outputs of the FFT layer are complex numbers. To avoid
    # dealing with complex layers, only the real part of the output
    # is kept.
    # The dense layers that follow the Fourier transformation act as
    # convolutions applied on the frequency domain.
    class FNetEncoder(layers.Layer):
        def __init__(self, embed_dim, dense_dim, **kwargs):
            super(FNetEncoder, self).__init__(**kwargs)
            self.embed_dim = embed_dim
            self.dense_dim = dense_dim
            self.dense_proj = keras.Sequential([
                layers.Dense(dense_dim, activation="relu"),
                layers.Dense(embed_dim),
            ])
            self.layernorm_1 = layers.LayerNormalization()
            self.layernorm_2 = layers.LayerNormalization()

        def call(self, inputs):
            # Casting the inputs to complex64.
            inp_complex = tf.cast(inputs, tf.complex64)

            # Projecting the inputs to the frequency domain using FFT2D
            # and extracting the real part of the output.
            fft = tf.math.real(tf.signal.fft2d(inp_complex))
            proj_input = self.layernorm_1(inputs + fft)
            proj_output = self.dense_proj(proj_input)
            return self.layernorm_2(proj_input + proj_output)
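
    # Quick shape check (illustrative, commented out so it does not run as
    # part of training): the encoder block preserves the
    # (batch, sequence, embed_dim) shape while mixing token information
    # through the 2D FFT.
    # x = tf.random.uniform((2, MAX_LENGTH, EMBED_DIM))
    # print(FNetEncoder(EMBED_DIM, LATENT_DIM)(x).shape)  # (2, 40, 256)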

    # Creating the Decoder
    # The decoder architecture remains the same as the one proposed by
    # (Vaswani et al., 2017) in the original transformer architecture,
    # consisting of an embedding, positional encoding, two masked
    # multihead attention layers and finally the dense output layers.
    # The architecture that follows is taken from Deep Learning with
    # Python, second edition, chapter 11.
    class PositionalEmbedding(layers.Layer):
        def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
            super(PositionalEmbedding, self).__init__(**kwargs)
            self.token_embeddings = layers.Embedding(input_dim=vocab_size,
                                                     output_dim=embed_dim)
            self.position_embeddings = layers.Embedding(
                input_dim=sequence_length, output_dim=embed_dim)
            self.sequence_length = sequence_length
            self.vocab_size = vocab_size
            self.embed_dim = embed_dim

        def call(self, inputs):
            length = tf.shape(inputs)[-1]
            positions = tf.range(start=0, limit=length, delta=1)
            embedded_tokens = self.token_embeddings(inputs)
            embedded_positions = self.position_embeddings(positions)
            return embedded_tokens + embedded_positions

        def compute_mask(self, inputs, mask=None):
            return tf.math.not_equal(inputs, 0)

    class FNetDecoder(layers.Layer):
        def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
            super(FNetDecoder, self).__init__(**kwargs)
            self.embed_dim = embed_dim
            self.latent_dim = latent_dim
            self.num_heads = num_heads
            self.attention_1 = layers.MultiHeadAttention(num_heads=num_heads,
                                                         key_dim=embed_dim)
            self.attention_2 = layers.MultiHeadAttention(num_heads=num_heads,
                                                         key_dim=embed_dim)
            self.dense_proj = keras.Sequential([
                layers.Dense(latent_dim, activation="relu"),
                layers.Dense(embed_dim),
            ])
            self.layernorm_1 = layers.LayerNormalization()
            self.layernorm_2 = layers.LayerNormalization()
            self.layernorm_3 = layers.LayerNormalization()
            self.supports_masking = True

        def call(self, inputs, encoder_outputs, mask=None):
            causal_mask = self.get_causal_attention_mask(inputs)
            if mask is not None:
                padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
                padding_mask = tf.minimum(padding_mask, causal_mask)

            attention_output_1 = self.attention_1(query=inputs,
                                                  value=inputs,
                                                  key=inputs,
                                                  attention_mask=causal_mask)
            out_1 = self.layernorm_1(inputs + attention_output_1)

            attention_output_2 = self.attention_2(
                query=out_1,
                value=encoder_outputs,
                key=encoder_outputs,
                attention_mask=padding_mask,
            )
            out_2 = self.layernorm_2(out_1 + attention_output_2)

            proj_output = self.dense_proj(out_2)
            return self.layernorm_3(out_2 + proj_output)

        def get_causal_attention_mask(self, inputs):
            input_shape = tf.shape(inputs)
            batch_size, sequence_length = input_shape[0], input_shape[1]
            i = tf.range(sequence_length)[:, tf.newaxis]
            j = tf.range(sequence_length)
            mask = tf.cast(i >= j, dtype="int32")
            mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
            mult = tf.concat(
                [
                    tf.expand_dims(batch_size, -1),
                    tf.constant([1, 1], dtype=tf.int32)
                ],
                axis=0,
            )
            return tf.tile(mask, mult)
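
    # Worked example: for a length-3 sequence, get_causal_attention_mask
    # yields (per batch element) the lower-triangular matrix
    #   [[1, 0, 0],
    #    [1, 1, 0],
    #    [1, 1, 1]]
    # so position i can only attend to positions j <= i.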

    def create_model():
        encoder_inputs = keras.Input(shape=(None, ),
                                     dtype="int32",
                                     name="encoder_inputs")
        x = PositionalEmbedding(MAX_LENGTH, VOCAB_SIZE,
                                EMBED_DIM)(encoder_inputs)
        encoder_outputs = FNetEncoder(EMBED_DIM, LATENT_DIM)(x)
        encoder = keras.Model(encoder_inputs, encoder_outputs)
        decoder_inputs = keras.Input(shape=(None, ),
                                     dtype="int32",
                                     name="decoder_inputs")
        encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM),
                                         name="decoder_state_inputs")
        x = PositionalEmbedding(MAX_LENGTH, VOCAB_SIZE,
                                EMBED_DIM)(decoder_inputs)
        x = FNetDecoder(EMBED_DIM, LATENT_DIM, NUM_HEADS)(x,
                                                          encoded_seq_inputs)
        x = layers.Dropout(0.5)(x)
        decoder_outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
        decoder = keras.Model([decoder_inputs, encoded_seq_inputs],
                              decoder_outputs,
                              name="outputs")
        decoder_outputs = decoder([decoder_inputs, encoder_outputs])
        fnet = keras.Model([encoder_inputs, decoder_inputs],
                           decoder_outputs,
                           name="fnet")
        return fnet

    # Creating and Training the model
    # Here, the epochs parameter is set to a single epoch, but in
    # practice the model will take around 20-30 epochs of training to
    # start outputting comprehensible sentences. Although accuracy is
    # not a good measure for this task, it is used in this example to
    # get a hint of the improvement of the network.
    fnet = create_model()
    fnet.compile("adam",
                 loss="sparse_categorical_crossentropy",
                 metrics=["accuracy"])

    fnet.fit(train_dataset, epochs=1, validation_data=val_dataset)

    # Performing inference
    VOCAB = vectorizer.get_vocabulary()

    def decode_sentence(input_sentence):
        # Mapping the input sentence to tokens. The vectorizer's
        # `standardize` callback (preprocess_text) lowercases and cleans
        # the text and adds the [start] / [end] tokens.
        tokenized_input_sentence = vectorizer(tf.constant(input_sentence))

        # Initializing the initial sentence consisting of only the start
        # token.
        tokenized_target_sentence = tf.expand_dims(VOCAB.index("[start]"), 0)
        decoded_sentence = ""

        for i in range(MAX_LENGTH):
            # Get the predictions.
            predictions = fnet.predict({
                "encoder_inputs":
                tf.expand_dims(tokenized_input_sentence, 0),
                "decoder_inputs":
                tf.expand_dims(
                    tf.pad(
                        tokenized_target_sentence,
                        [[
                            0,
                            MAX_LENGTH - tf.shape(tokenized_target_sentence)[0]
                        ]],
                    ),
                    0,
                ),
            })

            # Calculating the token with maximum probability and
            # getting the corresponding word.
            sampled_token_index = tf.argmax(predictions[0, i, :])
            sampled_token = VOCAB[sampled_token_index.numpy()]

            # If sampled token is the end token then stop generating
            # and return the sentence.
            if tf.equal(sampled_token_index, VOCAB.index("[end]")):
                break
            decoded_sentence += sampled_token + " "
            tokenized_target_sentence = tf.concat(
                [tokenized_target_sentence, [sampled_token_index]], 0)

        return decoded_sentence

    prompt = "Where have you been all this time?"
    print("PROMPT > ")
    print("RESPONSE > " + decode_sentence(prompt))

    # Conclusion
    # This example shows how to train and perform inference using the
    # FNet model. For getting insight into the architecture or for
    # further reading, refer to:
    # 1) FNet: Mixing Tokens with Fourier Transforms (Lee-Thorp et al.,
    #    2021)
    # 2) Attention Is All You Need (Vaswani et al., 2017)
    # Thanks to François Chollet for his Keras example on English-to-
    # Spanish translation with a sequence-to-sequence Transformer, from
    # which the decoder implementation was extracted.

    # Exit the program.
    exit(0)
Example #5
strip_chars = strip_chars.replace("]", "")

vocab_size = 15000
sequence_length = 20
batch_size = 64


def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars),
                                    "")


eng_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)
spa_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)
train_eng_texts = [pair[0] for pair in train_pairs]
train_spa_texts = [pair[1] for pair in train_pairs]
eng_vectorization.adapt(train_eng_texts)
spa_vectorization.adapt(train_spa_texts)


def format_dataset(eng, spa):
    # Vectorize both languages; the Spanish target is shifted by one token
    # so the decoder learns to predict the next token.
    eng = eng_vectorization(eng)
    spa = spa_vectorization(spa)
    return ({"encoder_inputs": eng, "decoder_inputs": spa[:, :-1]}, spa[:, 1:])

Example #6
The `TextVectorization` layer can operate as part of your main model so that the model is
excluded from the core preprocessing logic. This greatly reduces the chances of training/serving
skew during inference.

We first calculate the number of unique words present in the abstracts.
"""
train_df["total_words"] = train_df["summaries"].str.split().str.len()
vocabulary_size = train_df["total_words"].max()
print(f"Vocabulary size: {vocabulary_size}")

"""
We now create our vectorization layer and `map()` it to the `tf.data.Dataset`s created
earlier.
"""

text_vectorizer = layers.TextVectorization(
    max_tokens=vocabulary_size, ngrams=2, output_mode="tf_idf"
)

# `TextVectorization` layer needs to be adapted as per the vocabulary from our
# training set.
with tf.device("/CPU:0"):
    text_vectorizer.adapt(train_dataset.map(lambda text, label: text))

train_dataset = train_dataset.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto
).prefetch(auto)
validation_dataset = validation_dataset.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto
).prefetch(auto)
test_dataset = test_dataset.map(
    lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto
).prefetch(auto)
Example #7
def preprocess_text(sentence):
    sentence = tf.strings.lower(sentence)
    # Adding a space between the punctuation and the last word to allow better tokenization
    sentence = tf.strings.regex_replace(sentence, r"([?.!,])", r" \1 ")
    # Replacing multiple continuous spaces with a single space
    sentence = tf.strings.regex_replace(sentence, r"\s\s+", " ")
    # Replacing non-English characters with spaces
    sentence = tf.strings.regex_replace(sentence, r"[^a-z?.!,]+", " ")
    sentence = tf.strings.strip(sentence)
    sentence = tf.strings.join(["[start]", sentence, "[end]"], separator=" ")
    return sentence


vectorizer = layers.TextVectorization(
    VOCAB_SIZE,
    standardize=preprocess_text,
    output_mode="int",
    output_sequence_length=MAX_LENGTH,
)

# We will adapt the vectorizer to both the questions and answers
# This dataset is batched to parallelize and speed up the process
vectorizer.adapt(
    tf.data.Dataset.from_tensor_slices((questions + answers)).batch(128))
"""
### Tokenizing and padding sentences using `TextVectorization`
"""


def vectorize_text(inputs, outputs):
    inputs, outputs = vectorizer(inputs), vectorizer(outputs)
    # One extra padding token to the right to match the output shape
    outputs = tf.pad(outputs, [[0, 1]])
    return (
        {"encoder_inputs": inputs, "decoder_inputs": outputs[:-1]},
        {"outputs": outputs[1:]},
    )
Example #8
# Standardization function for the dataset: lowercase, strip HTML tags,
# and remove punctuation.
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html,
                                    '[%s]' % re.escape(string.punctuation), '')


# create layer for vectorizing dataset (turn tokens into integers)
max_features = 10000
sequence_length = 250

vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

# Make a text-only dataset (without labels), then call adapt
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)


def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label


# retrieve a batch (of 32 reviews and labels) from the dataset
text_batch, label_batch = next(iter(raw_train_ds))
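
To see what the pipeline produces, the retrieved batch can be pushed through `vectorize_text`. A short continuation sketch, assuming `raw_train_ds` was built with `text_dataset_from_directory` and therefore exposes `class_names`:

first_review, first_label = text_batch[0], label_batch[0]
print("Review:", first_review)
print("Label:", raw_train_ds.class_names[first_label])
print("Vectorized review:", vectorize_text(first_review, first_label))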
Example #9
"""
The `adapt()` method takes either a Numpy array or a `tf.data.Dataset` object. In the
case of `StringLookup` and `TextVectorization`, you can also pass a list of strings:
"""

data = [
    "ξεῖν᾽, ἦ τοι μὲν ὄνειροι ἀμήχανοι ἀκριτόμυθοι",
    "γίγνοντ᾽, οὐδέ τι πάντα τελείεται ἀνθρώποισι.",
    "δοιαὶ γάρ τε πύλαι ἀμενηνῶν εἰσὶν ὀνείρων:",
    "αἱ μὲν γὰρ κεράεσσι τετεύχαται, αἱ δ᾽ ἐλέφαντι:",
    "τῶν οἳ μέν κ᾽ ἔλθωσι διὰ πριστοῦ ἐλέφαντος,",
    "οἵ ῥ᾽ ἐλεφαίρονται, ἔπε᾽ ἀκράαντα φέροντες:",
    "οἱ δὲ διὰ ξεστῶν κεράων ἔλθωσι θύραζε,",
    "οἵ ῥ᾽ ἔτυμα κραίνουσι, βροτῶν ὅτε κέν τις ἴδηται.",
]
layer = layers.TextVectorization()
layer.adapt(data)
vectorized_text = layer(data)
print(vectorized_text)
"""
In addition, adaptable layers always expose an option to directly set state via
constructor arguments or weight assignment. If the intended state values are known at
layer construction time, or are calculated outside of the `adapt()` call, they can be set
without relying on the layer's internal computation. For instance, if external vocabulary
files for the `TextVectorization`, `StringLookup`, or `IntegerLookup` layers already
exist, those can be loaded directly into the lookup tables by passing a path to the
vocabulary file in the layer's constructor arguments.

Here's an example where we instantiate a `StringLookup` layer with precomputed vocabulary:
"""
Example #10
def main():
	# Perform exploratory data analysis
	# In this section, first load the dataset into a pandas dataframe
	# and then perform some basic exploratory data analysis (EDA).
	arxiv_data = pd.read_csv(
		"https://github.com/soumik12345/multi-label-text-classification/releases/download/v0.2/arxiv_data.csv"
	)
	arxiv_data.head()

	# The text features are present in the summaries column and their
	# corresponding labels are in terms. There are multiple categories
	# associated with a particular entry.
	print(f"Therea rea {len(arxiv_data)} rows in the dataset.")

	# Real-world data is noisy. One of the most commonly observed sources
	# of noise is data duplication. Here, notice that the initial dataset
	# has about 13k duplicate entries.
	total_duplicate_titles = sum(arxiv_data["titles"].duplicated())
	print(f"There are {total_duplicate_titles} duplicate titles.")

	# Before proceeding further, drop these entries.
	arxiv_data = arxiv_data[~arxiv_data["titles"].duplicated()]
	print(f"There are {len(arxiv_data)} rows in the deduplicated dataset.")

	# There are some terms with an occurrence count as low as 1.
	print(sum(arxiv_data["terms"].value_counts() == 1))

	# How many unique term combinations are there?
	print(arxiv_data["terms"].nunique())

	# As observed above, out of 3,157 unique combinations of terms,
	# 2,321 entries have the lowest occurrence. To prepare the train,
	# validation, and test sets with stratification, these terms must be
	# dropped.
	# Filtering the rare terms.
	arxiv_data_filtered = arxiv_data.groupby("terms").filter(
		lambda x: len(x) > 1
	)
	print(arxiv_data_filtered.shape)

	# Convert the string labels to a list of strings.
	# The initial labels are represented as raw strings. Here, make
	# them List[str] for a more compact representation.
	arxiv_data_filtered["terms"] = arxiv_data_filtered["terms"].apply(
		lambda x: literal_eval(x)
	)
	print(arxiv_data_filtered["terms"].values[:5])

	# Use stratified splits because of class imbalance.
	# The dataset has a class imbalance problem. So, to have a fair
	# evaluation result, ensure the datasets are sampled with
	# stratification. To know more about different strategies to deal
	# with the class imbalance problem, follow this tutorial. For an
	# end-to-end demonstration of classification with imbalanced data,
	# refer to Imbalanced classification: credit card fraud detection.
	test_split = 0.1

	# Initial train and test split.
	train_df, test_df = train_test_split(
		arxiv_data_filtered, test_size=test_split,
		stratify=arxiv_data_filtered["terms"].values,
	)

	# Splitting the test set further into validation and new test sets.
	val_df = test_df.sample(frac=0.5)
	test_df.drop(val_df.index, inplace=True)

	print(f"Number of rows in training set: {len(train_df)}")
	print(f"Number of rows in validation set: {len(val_df)}")
	print(f"Number of rows in test set: {len(test_df)}")

	# Multi-label binarization.
	# Preprocess the labels using the StringLookup layer.
	terms = tf.ragged.constant(train_df["terms"].values)
	lookup = tf.keras.layers.StringLookup(output_mode="multi_hot")
	lookup.adapt(terms)
	vocab = lookup.get_vocabulary()

	def invert_multi_hot(encoded_labels):
		# Reverse a single multi-hot encoded label to a tuple of vocab
		# terms.
		hot_indices = np.argwhere(encoded_labels == 1.0)[..., 0]
		return np.take(vocab, hot_indices)

	print("Vocabulary:\n")
	print(vocab)

	# Here, separate the individual unique classes available from the
	# label pool and then use this information to represent a given
	# label set with 0's and 1's.
	sample_label = train_df["terms"].iloc[0]
	print(f"Original label: {sample_label}")

	label_binarized = lookup([sample_label])
	print(f"Label-binarized representation: {label_binarized}")

	# Data preprocessing and tf.data.Dataset objects.
	# First, get the percentile estimates of the sequence lengths. The
	# purpose will be clear in a moment.
	train_df["summaries"].apply(lambda x: len(x.split(" "))).describe()

	# Notice that 50% of the abstracts have a length of 154 (This
	# number may be different based upon the split). So, any number
	# close to that value is a good enough approximate for the
	# maximum sequence length.
	# Now, implement utilities to prepare the datasets that would go
	# straight to the text classifier model.
	max_seqlen = 150
	batch_size = 128
	padding_token = "<pad>"
	auto = tf.data.AUTOTUNE

	def unify_text_length(text, label):
		# Split the given abstract and calculate its length.
		word_splits = tf.strings.split(text, sep=" ")
		sequence_length = tf.shape(word_splits)[0]

		# Calculate the padding amount.
		padding_amount = max_seqlen - sequence_length

		# Check if there needs to be a pad or truncate.
		if padding_amount > 0:
			unified_text = tf.pad(
				[text], [[0, padding_amount]], constant_values="<pad>"
			)
			unified_text = tf.strings.reduce_join(
				unified_text, separator=""
			)
		else:
			unified_text = tf.strings.reduce_join(
				word_splits[:max_seqlen], separator=" "
			)

		# The expansion is needed for subsequent vectorization.
		return tf.expand_dims(unified_text, -1), label

	def make_dataset(dataframe, is_train=True):
		labels = tf.ragged.constant(dataframe["terms"].values)
		label_binarized = lookup(labels)
		dataset = tf.data.Dataset.from_tensor_slices(
			(dataframe["summaries"].values, label_binarized)
		)
		dataset = dataset.shuffle(batch_size * 10) if is_train else dataset
		dataset = dataset.map(unify_text_length, num_parallel_calls=auto).cache()
		return dataset.batch(batch_size)

	# Can now prepare the tf.data.Dataset objects.
	train_dataset = make_dataset(train_df, is_train=True)
	validation_dataset = make_dataset(val_df, is_train=False)
	test_dataset = make_dataset(test_df, is_train=False)

	# Dataset preview.
	text_batch, label_batch = next(iter(train_dataset))

	for i, text in enumerate(text_batch[:5]):
		label = label_batch[i].numpy()[None, ...]
		print(f"Abstract: {text[0]}")
		print(f"Label(s): {invert_multi_hot(label[0])}")
		print(" ")

	# Vectorization.
	# Before feeding the data to the model, it must be vectorized
	# (represented in a numerical form). For that purpose, use the
	# TextVectorization layer. It can operate as part of the main model
	# so that the model is excluded from the core processing logic.
	# This greatly reduces the chances of training/serving skew during
	# inference.
	# First, calculate the number of unique words present in the
	# abstracts.
	train_df["total_words"] = train_df["summaries"].str.split().str.len()
	vocabulary_size = train_df["total_words"].max()
	print("Vocabulary size: {vocabulary_size}")

	# Now create the vectorization layer and map() to the
	# tf.data.Datasets created earlier.
	text_vectorizer = layers.TextVectorization(
		max_tokens=vocabulary_size, ngrams=2, output_mode="tf_idf"
	)

	# "TextVectorization" layer needs to be adapted as per the
	# vocabulary from the training set.
	with tf.device("/CPU:0"):
		text_vectorizer.adapt(
			train_dataset.map(lambda text, label: text)
		)

	train_dataset = train_dataset.map(
		lambda text, label: (text_vectorizer(text), label), 
		num_parallel_calls=auto
	).prefetch(auto)
	validation_dataset = validation_dataset.map(
		lambda text, label: (text_vectorizer(text), label), 
		num_parallel_calls=auto
	).prefetch(auto)
	test_dataset = test_dataset.map(
		lambda text, label: (text_vectorizer(text), label), 
		num_parallel_calls=auto
	).prefetch(auto)

	# A batch of raw text will first go through the TextVectorization
	# layer and it will generate their integer representations.
	# Internally, the TextVectorization layer will first create
	# bi-grams out of the sequences and then represent them using 
	# TF-IDF. The output representation will then be passed to the
	# shallow model responsible for text classification.
	# To learn more about other possible configurations of
	# TextVectorization, please consult the official documentation.
	# Note: Setting the max_tokens argument to a pre-calculated
	# vocabulary size is not a requirement.
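
	# Illustration: after the mapping above, each element of train_dataset
	# is a pair (tfidf_batch, label_batch), where tfidf_batch is a dense
	# matrix of shape (batch_size, adapted vocabulary size) holding one
	# TF-IDF weighted bag-of-bigrams vector per abstract.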

	# Create a text classification model.
	# Keep the model simple -- it will be a small stack of fully-
	# connected layers with ReLU as the non-linearity.
	def make_model():
		shallow_mlp_model = keras.Sequential(
			[
				layers.Dense(512, activation="relu"),
				layers.Dense(256, activation="relu"),
				layers.Dense(
					lookup.vocabulary_size(), activation="sigmoid"
				), # More on why "sigmoid" has been used here in a moment.
			]
		)
		return shallow_mlp_model

	# Train the model.
	# Train the model using binary crossentropy loss. This is because
	# the labels are not disjoint. For a given abstract, there may be
	# multiple categories. So, the prediction task will be divided into
	# a series of multiple binary classification problems. This is also
	# why the activation function of the classification layer was kept
	# to sigmoid. Researchers have used other combinations of loss
	# functions and activation functions as well. For example, in
	# Exploring the Limits of Weakly Supervised Pretraining, Mahajan et
	# al. used the softmax activation function and cross-entropy loss
	# to train their models.
	epochs = 20

	shallow_mlp_model = make_model()
	shallow_mlp_model.compile(
		loss="binary_crossentropy", optimizer="adam",
		metrics=["categorical_accuracy"]
	)

	history = shallow_mlp_model.fit(
		train_dataset, validation_data=validation_dataset, epochs=epochs
	)

	'''
	def plot_results(item):
		plt.plot(history.history[item], label=item)
		plt.plot(history.history["val_" + item], label="val_" + item)
		plt.xlabel("Epochs")
		plt.ylabel(item)
		plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14)
		plt.legend()
		plt.grid()
		plt.show()

	plot_results("loss")
	plot_results("categorical_accuracy")
	# While training, notice an initial sharp fall in the loss
	# followed by a gradual decay.
	'''

	# Evaluate the model.
	_, categorical_acc = shallow_mlp_model.evaluate(test_dataset)
	print(f"Categorical accuracy on the test set: {round(categorical_acc * 100, 2)}%.")

	# The trained model gives an evaluation accuracy of ~87%.

	# Inference.
	# An important feature of the preprocessing layers provided by
	# Keras is that they can be included inside a tf.keras.Model. This
	# example will export an inference model by including the
	# text_vectorization layer on top of shallow_mlp_model. This will
	# allow the inference model to directly operate on raw strings.
	# Note that during training, it is always preferable to use these
	# preprocessing layers as part of the data input pipeline rather
	# than the model to avoid surfacing bottlenecks for the hardware
	# accelerators. This also allows for asynchronous data processing.
	# Create a model for inference.
	model_for_inference = keras.Sequential(
		[text_vectorizer, shallow_mlp_model]
	)

	# Create a small dataset just for demoing inference.
	inference_dataset = make_dataset(
		test_df.sample(100), is_train=False
	)
	text_batch, label_batch = next(iter(inference_dataset))
	predicted_probabilities = model_for_inference.predict(text_batch)

	# Perform inference.
	for i, text in enumerate(text_batch[:5]):
		label = label_batch[i].numpy()[None, ...]
		print(f"Abstract: {text[0]}")
		print(f"Label(s): {invert_multi_hot(label[0])}")
		predicted_probs = [probs for probs in predicted_probabilities[i]]
		top_3_labels = [
			x
			for _, x in sorted(
				zip(predicted_probabilities[i], lookup.get_vocabulary()),
				key=lambda pair: pair[0],
				reverse=True
			)
		][:3]
		print(f"Predicted Label(s): ({', '.join([label for label in top_3_labels])})")
		print(" ")

	# The prediction results are not that great, but not below par for a
	# simple model like the one above. This performance can be improved
	# with models that take word order into account, such as LSTMs, or
	# even models that use Transformers (Vaswani et al.).

	# Exit the program.
	exit(0)
Example #11
Since we are working with text data, we will need to encode the text strings as vectors which
would then be passed through an `Embedding` layer. To make this tokenization process
faster, we use the `map()` function with its parallelization functionality.
"""


def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    return tf.strings.regex_replace(
        stripped_html, f"[{re.escape(string.punctuation)}]", ""
    )


vectorizer = layers.TextVectorization(
    3000, standardize=custom_standardization, output_sequence_length=150
)
# Adapting the dataset
vectorizer.adapt(
    train_dataset.map(lambda x, y: x, num_parallel_calls=tf.data.AUTOTUNE).batch(256)
)


def vectorize_text(text, label):
    text = vectorizer(text)
    return text, label


train_dataset = train_dataset.map(
    vectorize_text, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)