def __init__(self, all_text):
    # self.paragraphs = all_text
    tokenizer = text.UnicodeScriptTokenizer()
    (self.tokens, self.offset_starts,
     self.offset_limits) = tokenizer.tokenize_with_offsets(all_text)
    self.bigrams = text.ngrams(self.tokens,
                               2,
                               reduction_type=text.Reduction.STRING_JOIN)
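
# A minimal, self-contained sketch of the two ops the constructor above relies
# on (tokenize_with_offsets and ngrams). The imports and the sample input
# string are illustrative assumptions, not part of the original snippet.
import tensorflow as tf
import tensorflow_text as text

all_text = tf.constant(['Everything not saved will be lost.'])
tokenizer = text.UnicodeScriptTokenizer()
tokens, starts, limits = tokenizer.tokenize_with_offsets(all_text)
bigrams = text.ngrams(tokens, 2, reduction_type=text.Reduction.STRING_JOIN)
print(tokens.to_list())   # per-string tokens
print(starts.to_list())   # byte offset where each token begins
print(limits.to_list())   # byte offset just past each token
print(bigrams.to_list())  # space-joined bigrams per string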
Example No. 2
import tensorflow as tf
import tensorflow_text as text

docs = tf.constant([
    u'Everything not saved will be lost.'.encode('UTF-16-BE'),
    u'Sad☹'.encode('UTF-16-BE')
])
utf8_docs = tf.strings.unicode_transcode(docs,
                                         input_encoding='UTF-16-BE',
                                         output_encoding='UTF-8')

tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(
    ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
print(tokens.to_list())

tokenizer = text.UnicodeScriptTokenizer()
tokens = tokenizer.tokenize(
    ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
print(tokens.to_list())

tokens = tf.strings.unicode_split([u"仅今年前".encode('UTF-8')], 'UTF-8')
print(tokens.to_list())

tokenizer = text.UnicodeScriptTokenizer()
(tokens, offset_starts, offset_limits) = tokenizer.tokenize_with_offsets(
    ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])
print(tokens.to_list())
print(offset_starts.to_list())
print(offset_limits.to_list())

docs = tf.data.Dataset.from_tensor_slices([['Never tell me the odds.'],
                                           ["It's a trap!"]])
tokenizer = text.WhitespaceTokenizer()
tokenized_docs = docs.map(lambda x: tokenizer.tokenize(x))
Example No. 3
    def __call__(self, x):
        # Constrained sequence
        cs_scores = np.array([[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0,
                                                       10.0]])
        cs_input = np.array([cs_scores, cs_scores, cs_scores],
                            dtype=np.float32)
        cs_transition_weights = np.array(
            [[-1.0, 1.0, -2.0, 2.0, 0.0], [3.0, -3.0, 4.0, -4.0, 0.0],
             [5.0, 1.0, 10.0, 1.0, 1.0], [-7.0, 7.0, -8.0, 8.0, 0.0],
             [0.0, 1.0, 2.0, 3.0, 0.0]],
            dtype=np.float32)
        cs_allowed_transitions = np.array([[True, True, True, True, True],
                                           [True, True, True, True, True],
                                           [True, False, True, False, False],
                                           [True, True, True, True, True],
                                           [True, False, True, True, True]])
        constrained_sequence = text.viterbi_constrained_sequence(
            cs_input, [2, 2, 2],
            allowed_transitions=cs_allowed_transitions,
            transition_weights=cs_transition_weights,
            use_log_space=True,
            use_start_and_end_states=True)
        # Max Spanning Tree
        mst_num_nodes = tf.constant([4, 3], tf.int32)
        mst_scores = tf.constant(
            [[[0, 0, 0, 0], [1, 0, 0, 0], [1, 2, 0, 0], [1, 2, 3, 4]],
             [[4, 3, 2, 9], [0, 0, 2, 9], [0, 0, 0, 9], [9, 9, 9, 9]]],
            tf.int32)  # pyformat: disable
        (max_spanning_tree,
         _) = text.max_spanning_tree(mst_num_nodes, mst_scores)
        # Normalize
        normalized = text.case_fold_utf8(['A String'])
        normalized = text.normalize_utf8(normalized)
        # Regex split
        regex_split = text.regex_split(input=['Yo dawg!'],
                                       delim_regex_pattern=r'\s')
        # Rouge-L
        rl_hypotheses = tf.ragged.constant(
            [['captain', 'of', 'the', 'delta', 'flight'],
             ['the', '1990', 'transcript']])
        rl_references = tf.ragged.constant(
            [['delta', 'air', 'lines', 'flight'],
             ['this', 'concludes', 'the', 'transcript']])
        (rouge_l, _, _) = text.metrics.rouge_l(rl_hypotheses, rl_references)
        # Sentence breaking version 1 (token dependent)
        sb_token_word = [['Welcome', 'to', 'the', 'U.S.', '!', 'Harry'],
                         ['Wu', 'Tang', 'Clan', ';', 'ain\'t', 'nothing']]
        sb_token_properties = [[0, 0, 0, 256, 0, 0], [0, 0, 0, 0, 0, 0]]
        sb_token_starts = []
        sb_token_ends = []
        for sentence in sb_token_word:
            sentence_string = ''
            sentence_start = []
            sentence_end = []
            for word in sentence:
                sentence_start.append(len(sentence_string))
                sentence_string += word
                sentence_end.append(len(sentence_string))
                sentence_string += ' '
            sb_token_starts.append(sentence_start)
            sb_token_ends.append(sentence_end)
        sb_token_starts = tf.constant(sb_token_starts, dtype=tf.int64)
        sb_token_ends = tf.constant(sb_token_ends, dtype=tf.int64)
        sb_token_properties = tf.ragged.constant(sb_token_properties,
                                                 dtype=tf.int64)
        (sentence_breaking, _, _,
         _) = text.sentence_fragments(sb_token_word, sb_token_starts,
                                      sb_token_ends, sb_token_properties)
        # Sentence breaking version 2 (StateBasedSentenceBreaker)
        sbv2_text_input = [['Welcome to the U.S.! Harry'],
                           ['Wu Tang Clan; ain\'t nothing']]
        sentence_breaker_v2 = text.StateBasedSentenceBreaker()
        sbv2_fragment_text, _, _ = (
            sentence_breaker_v2.break_sentences_with_offsets(sbv2_text_input))
        # Sentencepiece tokenizer
        sp_model_file = (
            'third_party/tensorflow_text/python/ops/test_data/test_oss_model.model'
        )
        sp_model = open(sp_model_file, 'rb').read()
        sp_tokenizer = text.SentencepieceTokenizer(sp_model)
        sentencepiece = sp_tokenizer.tokenize(['A sentence of things.'])
        sentencepiece = sp_tokenizer.detokenize(sentencepiece)
        (sentencepiece, _,
         _) = sp_tokenizer.tokenize_with_offsets(sentencepiece)
        sentencepiece_size = sp_tokenizer.vocab_size()
        sentencepiece_id = sp_tokenizer.id_to_string(1)
        # Split merge tokenizer
        sm_tokenizer = text.SplitMergeTokenizer()
        split_merge = sm_tokenizer.tokenize(b'IloveFlume!',
                                            [0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0])
        # Split merge from logits tokenizer
        smfl_tokenizer = text.SplitMergeFromLogitsTokenizer()
        split_merge_from_logits = smfl_tokenizer.tokenize(
            b'IloveFlume!',
            # One pair of logits for each Unicode character from the text.  Each
            # pair indicates a "split" action if the first component is greater than
            # the second one, and a "merge" otherwise.
            [
                [2.7, -0.3],  # I: split
                [4.1, 0.82],  # l: split
                [-2.3, 4.3],  # o: merge
                [3.1, 12.2],  # v: merge
                [-3.0, 4.7],  # e: merge
                [2.7, -0.7],  # F: split
                [0.7, 15.0],  # l: merge
                [1.6, 23.0],  # u: merge
                [2.1, 11.0],  # m: merge
                [0.0, 20.0],  # e: merge
                [18.0, 0.7],  # !: split
            ])
        # Confirm TF unicode_script op that requires ICU works
        tf_unicode_script = tf.strings.unicode_script(
            [ord('a'), 0x0411, 0x82b8, ord(',')])
        # Unicode script tokenizer
        us_tokenizer = text.UnicodeScriptTokenizer()
        unicode_script = us_tokenizer.tokenize(['a string'])
        # Whitespace tokenizer
        ws_tokenizer = text.WhitespaceTokenizer()
        whitespace = ws_tokenizer.tokenize(['a string'])
        # Wordpiece tokenizer
        wp_initializer = tf.lookup.KeyValueTensorInitializer(
            ['i'], [1], key_dtype=tf.string, value_dtype=tf.int64)
        self.wp_vocab_table = tf.lookup.StaticHashTable(wp_initializer,
                                                        default_value=-1)
        wp_tokenizer = text.WordpieceTokenizer(self.wp_vocab_table)
        wordpiece = wp_tokenizer.tokenize(['i am'])
        # Wordshape
        wordshapes = text.wordshape([u'a-b', u'a\u2010b'.encode('utf-8')],
                                    text.WordShape.HAS_PUNCTUATION_DASH)

        # Assertion method
        def assert_check(tensor):
            return tf.assert_equal(tensor, tf.identity(tensor))

        # Assertions
        constrained_sequence_assert = assert_check(
            constrained_sequence.to_tensor())
        max_spanning_tree_assert = assert_check(max_spanning_tree)
        normalized_assert = assert_check(normalized)
        regex_split_assert = assert_check(regex_split.to_tensor())
        rouge_l_assert = assert_check(rouge_l)
        sentence_breaking_assert = assert_check(sentence_breaking.to_tensor())
        sentence_breaking_v2_assert = assert_check(
            sbv2_fragment_text.to_tensor())
        sentencepiece_assert = assert_check(sentencepiece.to_tensor())
        sentencepiece_id_assert = assert_check(sentencepiece_id)
        sentencepiece_size_assert = assert_check(sentencepiece_size)
        split_merge_assert = assert_check(split_merge)
        split_merge_from_logits_assert = assert_check(split_merge_from_logits)
        tf_unicode_script_assert = assert_check(tf_unicode_script)
        unicode_script_assert = assert_check(unicode_script.to_tensor())
        whitespace_assert = assert_check(whitespace.to_tensor())
        wordpiece_assert = assert_check(wordpiece.to_tensor())
        wordshapes_assert = assert_check(wordshapes)

        with tf.control_dependencies([
                constrained_sequence_assert, max_spanning_tree_assert,
                normalized_assert, regex_split_assert, rouge_l_assert,
                sentence_breaking_assert, sentence_breaking_v2_assert,
                sentencepiece_assert, sentencepiece_id_assert,
                sentencepiece_size_assert, split_merge_assert,
                split_merge_from_logits_assert, tf_unicode_script_assert,
                unicode_script_assert, whitespace_assert, wordpiece_assert,
                wordshapes_assert
        ]):
            y = tf.add(x, [1])
        return {'y': y}
Example No. 4
def _create_or_get_tokenizer():
    if DEFAULT_TOKENIZER_TYPE not in _tokenizers:
        _tokenizers[DEFAULT_TOKENIZER_TYPE] = tf_text.UnicodeScriptTokenizer()
    return _tokenizers[DEFAULT_TOKENIZER_TYPE]
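
# A minimal sketch of the module-level state the helper above assumes: the
# `_tokenizers` cache and the `DEFAULT_TOKENIZER_TYPE` key come from the
# snippet, but the key's value below is hypothetical. A usage example follows.
import tensorflow_text as tf_text

_tokenizers = {}
DEFAULT_TOKENIZER_TYPE = 'unicode_script'  # assumed value, not from the source

tokenizer = _create_or_get_tokenizer()  # builds once, then reuses the cache
tokens = tokenizer.tokenize(['Everything not saved will be lost.'])
print(tokens.to_list())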
Example No. 5
import tensorflow as tf
import tensorflow_text as text


def main():
    # Unicode
    docs = tf.constant([
        u'Everything not saved will be lost.'.encode('UTF-16-BE'),
        u'Sad☹'.encode('UTF-16-BE')
    ])
    _ = tf.strings.unicode_transcode(docs,
                                     input_encoding='UTF-16-BE',
                                     output_encoding='UTF-8')

    # Tokenization
    # UnicodeScriptTokenizer
    tokenizer = text.UnicodeScriptTokenizer()
    tokens = tokenizer.tokenize(
        ['everything not saved will be lost', u'Sad☹'.encode('UTF-8')])
    print(f'Tokens: {tokens.to_list()}')

    # Unicode split
    tokens = tf.strings.unicode_split([u"仅今年前".encode('UTF-8')], 'UTF-8')
    print(f'Tokens: {tokens.to_list()}')

    # Offsets
    tokenizer = text.UnicodeScriptTokenizer()
    (tokens, _, end_offsets) = tokenizer.tokenize_with_offsets(
        ['everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

    print(f'Tokens: {tokens.to_list()}')
    print(f'Offsets: {end_offsets.to_list()}')

    # TF.Data Example
    docs = tf.data.Dataset.from_tensor_slices([['Never tell me the odds.'],
                                               ["It's a trap!"]])
    tokenizer = text.WhitespaceTokenizer()
    tokenized_docs = docs.map(lambda x: tokenizer.tokenize(x))
    iterator = iter(tokenized_docs)
    print(f'First sentence tokens: {next(iterator).to_list()}')
    print(f'Second sentence tokens: {next(iterator).to_list()}')

    # Other Text Ops
    # Wordshape
    tokenizer = text.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(
        ['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

    # Is capitalized?
    f1 = text.wordshape(tokens, text.WordShape.HAS_TITLE_CASE)
    # Are all letters uppercased
    f2 = text.wordshape(tokens, text.WordShape.IS_UPPERCASE)
    # Does the token contain punctuation?
    f3 = text.wordshape(tokens, text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL)
    # Is the token a number?
    f4 = text.wordshape(tokens, text.WordShape.IS_NUMERIC_VALUE)

    print(f'Is capitalized? {f1.to_list()}')
    print(f'Are all letters uppercased? {f2.to_list()}')
    print(f'Does the token contain punctuation? {f3.to_list()}')
    print(f'Is the token a number? {f4.to_list()}')

    # N-grams & Sliding Window
    tokenizer = text.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(
        ['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

    # Ngrams, in this case bi-gram (n = 2)
    bigrams = text.ngrams(tokens, 2, reduction_type=text.Reduction.STRING_JOIN)

    print(f'Bi-grams: {bigrams.to_list()}')
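
    # The heading above also mentions a sliding window; a minimal illustrative
    # sketch of tensorflow_text's sliding_window op over the same `tokens`:
    windows = text.sliding_window(data=tokens, width=3, axis=-1)
    print(f'Sliding windows of width 3: {windows.to_list()}')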
Example No. 6
    def __call__(self, x):
        # Constrained sequence
        cs_scores = np.array([[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0,
                                                       10.0]])
        cs_input = np.array([cs_scores, cs_scores, cs_scores],
                            dtype=np.float32)
        cs_transition_weights = np.array(
            [[-1.0, 1.0, -2.0, 2.0, 0.0], [3.0, -3.0, 4.0, -4.0, 0.0],
             [5.0, 1.0, 10.0, 1.0, 1.0], [-7.0, 7.0, -8.0, 8.0, 0.0],
             [0.0, 1.0, 2.0, 3.0, 0.0]],
            dtype=np.float32)
        cs_allowed_transitions = np.array([[True, True, True, True, True],
                                           [True, True, True, True, True],
                                           [True, False, True, False, False],
                                           [True, True, True, True, True],
                                           [True, False, True, True, True]])
        constrained_sequence = text.viterbi_constrained_sequence(
            cs_input, [2, 2, 2],
            allowed_transitions=cs_allowed_transitions,
            transition_weights=cs_transition_weights,
            use_log_space=True,
            use_start_and_end_states=True)
        # Max Spanning Tree
        mst_num_nodes = tf.constant([4, 3], tf.int32)
        mst_scores = tf.constant(
            [[[0, 0, 0, 0], [1, 0, 0, 0], [1, 2, 0, 0], [1, 2, 3, 4]],
             [[4, 3, 2, 9], [0, 0, 2, 9], [0, 0, 0, 9], [9, 9, 9, 9]]],
            tf.int32)  # pyformat: disable
        (max_spanning_tree,
         _) = text.max_spanning_tree(mst_num_nodes, mst_scores)
        # Normalize
        normalized = text.case_fold_utf8(['A String'])
        normalized = text.normalize_utf8(normalized)
        # Regex split
        regex_split = text.regex_split(input=['Yo dawg!'],
                                       delim_regex_pattern=r'\s')
        # Rouge-L
        rl_hypotheses = tf.ragged.constant(
            [['captain', 'of', 'the', 'delta', 'flight'],
             ['the', '1990', 'transcript']])
        rl_references = tf.ragged.constant(
            [['delta', 'air', 'lines', 'flight'],
             ['this', 'concludes', 'the', 'transcript']])
        (rouge_l, _, _) = text.metrics.rouge_l(rl_hypotheses, rl_references)
        # Sentence breaking
        sb_token_word = [['Welcome', 'to', 'the', 'U.S.', '!', 'Harry'],
                         ['Wu', 'Tang', 'Clan', ';', 'ain\'t', 'nothing']]
        sb_token_properties = [[0, 0, 0, 256, 0, 0], [0, 0, 0, 0, 0, 0]]
        sb_token_starts = []
        sb_token_ends = []
        for sentence in sb_token_word:
            sentence_string = ''
            sentence_start = []
            sentence_end = []
            for word in sentence:
                sentence_start.append(len(sentence_string))
                sentence_string += word
                sentence_end.append(len(sentence_string))
                sentence_string += ' '
            sb_token_starts.append(sentence_start)
            sb_token_ends.append(sentence_end)
        sb_token_starts = tf.constant(sb_token_starts, dtype=tf.int64)
        sb_token_ends = tf.constant(sb_token_ends, dtype=tf.int64)
        sb_token_properties = tf.ragged.constant(sb_token_properties,
                                                 dtype=tf.int64)
        (sentence_breaking, _, _,
         _) = text.sentence_fragments(sb_token_word, sb_token_starts,
                                      sb_token_ends, sb_token_properties)
        # Sentencepiece tokenizer
        sp_model_file = (
            'third_party/tensorflow_text/python/ops/test_data/test_oss_model.model'
        )
        sp_model = open(sp_model_file, 'rb').read()
        sp_tokenizer = text.SentencepieceTokenizer(sp_model)
        sentencepiece = sp_tokenizer.tokenize(['A sentence of things.'])
        sentencepiece = sp_tokenizer.detokenize(sentencepiece)
        (sentencepiece, _,
         _) = sp_tokenizer.tokenize_with_offsets(sentencepiece)
        sentencepiece_size = sp_tokenizer.vocab_size()
        sentencepiece_id = sp_tokenizer.id_to_string(1)
        # Split merge tokenizer (the from-logits variant is not in this version)
        sm_tokenizer = text.SplitMergeTokenizer()
        split_merge = sm_tokenizer.tokenize(b'IloveFlume!',
                                            [0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0])
        # Unicode script tokenizer
        us_tokenizer = text.UnicodeScriptTokenizer()
        unicode_script = us_tokenizer.tokenize(['a string'])
        # Whitespace tokenizer
        ws_tokenizer = text.WhitespaceTokenizer()
        whitespace = ws_tokenizer.tokenize(['a string'])
        # Wordpiece tokenizer
        wp_initializer = tf.lookup.KeyValueTensorInitializer(
            ['i'], [1], key_dtype=tf.string, value_dtype=tf.int64)
        self.wp_vocab_table = tf.lookup.StaticHashTable(wp_initializer,
                                                        default_value=-1)
        wp_tokenizer = text.WordpieceTokenizer(self.wp_vocab_table)
        wordpiece = wp_tokenizer.tokenize(['i am'])
        # Wordshape
        wordshapes = text.wordshape([u'a-b', u'a\u2010b'.encode('utf-8')],
                                    text.WordShape.HAS_PUNCTUATION_DASH)

        with tf.control_dependencies([
                constrained_sequence, max_spanning_tree, normalized,
                regex_split, rouge_l, sentence_breaking, sentencepiece,
                sentencepiece_id, sentencepiece_size, split_merge,
                unicode_script, whitespace, wordpiece, wordshapes
        ]):
            y = tf.add(x, [1])
        return {'y': y}
Example No. 7
# Imports assumed for this example (TensorFlow 2.8+ Keras import paths).
import collections
import pathlib

import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_text as tf_text
from tensorflow.keras import layers, losses, preprocessing, utils
from tensorflow.keras.layers import TextVectorization


def main():
    # Example 1: Predict the tag for a Stack Overflow question
    # Download and explore the dataset
    url = 'https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz'
    dataset = utils.get_file('stack_overflow_16k.tar.gz',
                             url,
                             untar=True,
                             cache_dir='stack_overflow',
                             cache_subdir='')
    dataset_dir = pathlib.Path(dataset).parent

    print(f'Paths: {list(dataset_dir.iterdir())}')

    train_dir = dataset_dir / 'train'
    print(f'Train paths: {list(train_dir.iterdir())}')

    sample_file = train_dir / 'python/1755.txt'
    with open(sample_file) as f:
        print(f.read())

    # Load the dataset
    batch_size = 32
    seed = 42

    raw_train_ds = preprocessing.text_dataset_from_directory(
        train_dir,
        batch_size=batch_size,
        validation_split=0.2,
        subset='training',
        seed=seed)

    for text_batch, label_batch in raw_train_ds.take(1):
        for i in range(10):
            print(f'Question: {text_batch.numpy()[i]}')
            print(f'Label: {label_batch.numpy()[i]}')

    for i, label in enumerate(raw_train_ds.class_names):
        print(f'Label {i} corresponds to {label}')

    raw_val_ds = preprocessing.text_dataset_from_directory(
        train_dir,
        batch_size=batch_size,
        validation_split=0.2,
        subset='validation',
        seed=seed)

    test_dir = dataset_dir / 'test'
    raw_test_ds = preprocessing.text_dataset_from_directory(
        test_dir, batch_size=batch_size)

    # Prepare the dataset for training
    vocab_size = 10000
    binary_vectorize_layer = TextVectorization(max_tokens=vocab_size,
                                               output_mode='binary')

    max_sequence_length = 250
    int_vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode='int',
        output_sequence_length=max_sequence_length)

    # Make a text-only dataset (without labels), then call adapt
    train_text = raw_train_ds.map(lambda text, labels: text)
    binary_vectorize_layer.adapt(train_text)
    int_vectorize_layer.adapt(train_text)

    def binary_vectorize_text(text, label):
        text = tf.expand_dims(text, -1)
        return binary_vectorize_layer(text), label

    def int_vectorize_text(text, label):
        text = tf.expand_dims(text, -1)
        return int_vectorize_layer(text), label

    # Retrieve a batch (of 32 reviews and labels) from the dataset
    text_batch, label_batch = next(iter(raw_train_ds))
    first_question, first_label = text_batch[0], label_batch[0]
    print(f'Question: {first_question}')
    print(f'Label: {first_label}')

    print(
        f'"binary" vectorized question: {binary_vectorize_text(first_question, first_label)[0]}'
    )
    print(
        f'"int" vectorized question: {int_vectorize_text(first_question, first_label)[0]}'
    )

    print(f'1289 ---> {int_vectorize_layer.get_vocabulary()[1289]}')
    print(f' 313 ---> {int_vectorize_layer.get_vocabulary()[313]}')
    print(f'Vocabulary size: {len(int_vectorize_layer.get_vocabulary())}')

    binary_train_ds = raw_train_ds.map(binary_vectorize_text)
    binary_val_ds = raw_val_ds.map(binary_vectorize_text)
    binary_test_ds = raw_test_ds.map(binary_vectorize_text)

    int_train_ds = raw_train_ds.map(int_vectorize_text)
    int_val_ds = raw_val_ds.map(int_vectorize_text)
    int_test_ds = raw_test_ds.map(int_vectorize_text)

    # Configure the dataset for performance
    AUTOTUNE = tf.data.AUTOTUNE

    def configure_dataset(dataset):
        return dataset.cache().prefetch(buffer_size=AUTOTUNE)

    binary_train_ds = configure_dataset(binary_train_ds)
    binary_val_ds = configure_dataset(binary_val_ds)
    binary_test_ds = configure_dataset(binary_test_ds)

    int_train_ds = configure_dataset(int_train_ds)
    int_val_ds = configure_dataset(int_val_ds)
    int_test_ds = configure_dataset(int_test_ds)

    # Train the model
    binary_model = tf.keras.Sequential([layers.Dense(4)])

    binary_model.compile(
        optimizer='adam',
        loss=losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'])

    _ = binary_model.fit(binary_train_ds,
                         validation_data=binary_val_ds,
                         epochs=10)

    def create_model(vocab_size, num_labels):
        model = tf.keras.Sequential([
            layers.Embedding(vocab_size, 64, mask_zero=True),
            layers.Conv1D(64, 5, padding='valid', activation='relu',
                          strides=2),
            layers.GlobalMaxPool1D(),
            layers.Dense(num_labels)
        ])

        return model

    # vocab_size is vocab_size + 1 since 0 is used additionally for padding.
    int_model = create_model(vocab_size=vocab_size + 1, num_labels=4)
    int_model.compile(
        optimizer='adam',
        loss=losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'])
    _ = int_model.fit(int_train_ds, validation_data=int_val_ds, epochs=5)

    print('Linear model on binary vectorized data:')
    binary_model.summary()

    print('ConvNet model on int vectorized data:')
    int_model.summary()

    _, binary_accuracy = binary_model.evaluate(binary_test_ds)
    _, int_accuracy = int_model.evaluate(int_test_ds)

    print(f'Binary model accuracy {binary_accuracy:2.2%}')
    print(f'Int model accuracy: {int_accuracy:2.2%}')

    # Export the model
    export_model = tf.keras.Sequential(
        [binary_vectorize_layer, binary_model,
         layers.Activation('sigmoid')])

    export_model.compile(
        optimizer='adam',
        loss=losses.SparseCategoricalCrossentropy(from_logits=False),
        metrics=['accuracy'])

    # Test it with `raw_test_ds`, which yields raw strings
    loss, accuracy = export_model.evaluate(raw_test_ds)
    print(f'Accuracy: {accuracy:2.2%}')

    def get_string_labels(predicted_scores_batch):
        predicted_int_labels = tf.argmax(predicted_scores_batch, axis=1)
        predicted_labels = tf.gather(raw_train_ds.class_names,
                                     predicted_int_labels)
        return predicted_labels

    # Run inference on new data
    inputs = [
        'how do I extract keys from a dict into a list?',  # python
        'debug public static void main(string[] args) {...}',  # java
    ]

    predicted_scores = export_model.predict(inputs)
    predicted_labels = get_string_labels(predicted_scores)
    for input, label in zip(inputs, predicted_labels):
        print(f'Question: {input}')
        print(f'Predicted label: {label.numpy()}')

    # Example 2: Predict the author of Iliad translations
    url = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
    file_names = ['cowper.txt', 'derby.txt', 'butler.txt']

    for name in file_names:
        text_dir = utils.get_file(name, origin=url + name)

    parent_dir = pathlib.Path(text_dir).parent
    print(f'Paths: {list(parent_dir.iterdir())}')

    # Load the dataset
    def labeler(example, index):
        return example, tf.cast(index, tf.int64)

    labeled_data_sets = []

    for i, file_name in enumerate(file_names):
        lines_dataset = tf.data.TextLineDataset(str(parent_dir / file_name))
        labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
        labeled_data_sets.append(labeled_dataset)

    buffer_size = 50000
    batch_size = 64
    validation_size = 5000

    all_labeled_data = labeled_data_sets[0]
    for labeled_dataset in labeled_data_sets[1:]:
        all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

    all_labeled_data = all_labeled_data.shuffle(buffer_size,
                                                reshuffle_each_iteration=False)

    for text, label in all_labeled_data.take(10):
        print(f'Sentence: {text.numpy()}')
        print(f'Label: {label.numpy()}')

    # Prepare the dataset for training
    tokenizer = tf_text.UnicodeScriptTokenizer()

    def tokenize(text, unused_label):
        lower_case = tf_text.case_fold_utf8(text)
        return tokenizer.tokenize(lower_case)

    tokenized_ds = all_labeled_data.map(tokenize)

    for text_batch in tokenized_ds.take(5):
        print(f'Tokens: {text_batch.numpy()}')

    tokenized_ds = configure_dataset(tokenized_ds)

    vocab_dict = collections.defaultdict(lambda: 0)
    for toks in tokenized_ds.as_numpy_iterator():
        for tok in toks:
            vocab_dict[tok] += 1

    vocab = sorted(vocab_dict.items(), key=lambda x: x[1], reverse=True)
    vocab = [token for token, count in vocab]
    vocab = vocab[:vocab_size]
    vocab_size = len(vocab)

    print(f'Vocab size: {vocab_size}')
    print(f'First five vocab entries: {vocab[:5]}')

    keys = vocab
    values = range(2, len(vocab) + 2)  # reserve 0 for padding, 1 for OOV

    init = tf.lookup.KeyValueTensorInitializer(keys,
                                               values,
                                               key_dtype=tf.string,
                                               value_dtype=tf.int64)

    num_oov_buckets = 1
    vocab_table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets)

    def preprocess_text(text, label):
        standardized = tf_text.case_fold_utf8(text)
        tokenized = tokenizer.tokenize(standardized)
        vectorized = vocab_table.lookup(tokenized)
        return vectorized, label

    example_text, example_label = next(iter(all_labeled_data))
    print(f'Sentence: {example_text.numpy()}')
    vectorized_text, example_label = preprocess_text(example_text,
                                                     example_label)
    print(f'Vectorized sentence: {vectorized_text.numpy()}')

    all_encoded_data = all_labeled_data.map(preprocess_text)

    # Split the dataset into train and test
    train_data = all_encoded_data.skip(validation_size).shuffle(buffer_size)
    validation_data = all_encoded_data.take(validation_size)

    train_data = train_data.padded_batch(batch_size)
    validation_data = validation_data.padded_batch(batch_size)

    sample_text, sample_labels = next(iter(validation_data))
    print(f'Text batch shape: {sample_text.shape}')
    print(f'Label batch shape: {sample_labels.shape}')
    print(f'First text example: {sample_text[0]}')
    print(f'First label example: {sample_labels[0]}')

    vocab_size += 2

    # Train the model
    model = create_model(vocab_size=vocab_size, num_labels=3)
    model.compile(optimizer='adam',
                  loss=losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    _ = model.fit(train_data, validation_data=validation_data, epochs=3)

    loss, accuracy = model.evaluate(validation_data)

    print(f'Loss: {loss}')
    print(f'Accuracy: {accuracy:2.2%}')

    # Export the model
    preprocess_layer = TextVectorization(
        max_tokens=vocab_size,
        standardize=tf_text.case_fold_utf8,
        split=tokenizer.tokenize,
        output_mode='int',
        output_sequence_length=max_sequence_length)

    preprocess_layer.set_vocabulary(vocab)

    export_model = tf.keras.Sequential(
        [preprocess_layer, model,
         layers.Activation('sigmoid')])

    export_model.compile(
        optimizer='adam',
        loss=losses.SparseCategoricalCrossentropy(from_logits=False),
        metrics=['accuracy'])

    # Create a test dataset of raw strings
    test_ds = all_labeled_data.take(validation_size).batch(batch_size)
    test_ds = configure_dataset(test_ds)
    loss, accuracy = export_model.evaluate(test_ds)

    print(f'Loss: {loss}')
    print(f'Accuracy: {accuracy:2.2%}')

    # Run inference on new data
    inputs = [
        "Join'd to th' Ionians with their flowing robes,",  # Label: 1
        "the allies, and his armour flashed about him so that he seemed to all",  # Label: 2
        "And with loud clangor of his arms he fell.",  # Label: 0
    ]
    predicted_scores = export_model.predict(inputs)
    predicted_labels = tf.argmax(predicted_scores, axis=1)
    for input, label in zip(inputs, predicted_labels):
        print(f'Sentence: {input}')
        print(f'Predicted label: {label.numpy()}')

    # Download more datasets using TensorFlow Datasets (TFDS)
    train_ds = tfds.load('imdb_reviews',
                         split='train[:80%]',
                         batch_size=batch_size,
                         shuffle_files=True,
                         as_supervised=True)

    val_ds = tfds.load('imdb_reviews',
                       split='train[80%:]',
                       batch_size=batch_size,
                       shuffle_files=True,
                       as_supervised=True)

    for review_batch, label_batch in val_ds.take(1):
        for i in range(5):
            print(f'Review: {review_batch[i].numpy()}')
            print(f'Label: {label_batch[i].numpy()}')

    # Prepare the dataset for training
    vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode='int',
        output_sequence_length=max_sequence_length)

    # Make a text-only dataset (without labels), then call adapt
    train_text = train_ds.map(lambda text, labels: text)
    vectorize_layer.adapt(train_text)

    def vectorize_text(text, label):
        text = tf.expand_dims(text, -1)
        return vectorize_layer(text), label

    train_ds = train_ds.map(vectorize_text)
    val_ds = val_ds.map(vectorize_text)

    train_ds = configure_dataset(train_ds)
    val_ds = configure_dataset(val_ds)

    # Train the model
    model = create_model(vocab_size=vocab_size + 1, num_labels=1)
    model.summary()

    model.compile(optimizer='adam',
                  loss=losses.BinaryCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    _ = model.fit(train_ds, validation_data=val_ds, epochs=3)

    loss, accuracy = model.evaluate(val_ds)
    print(f'Loss: {loss}')
    print(f'Accuracy: {accuracy:2.2%}')

    # Export the model
    export_model = tf.keras.Sequential(
        [vectorize_layer, model,
         layers.Activation('sigmoid')])

    export_model.compile(
        optimizer='adam',
        loss=losses.BinaryCrossentropy(from_logits=False),
        metrics=['accuracy'])

    # 0 ---> negative review
    # 1 ---> positive review
    inputs = [
        'This is a fantastic movie.', 'This is a bad movie.',
        'This movie was bad that it was good.',
        'I will never say yes to watching this movie.'
    ]
    predicted_scores = export_model.predict(inputs)
    predicted_labels = [int(round(x[0])) for x in predicted_scores]
    for input, label in zip(inputs, predicted_labels):
        print(f'Review: {input}')
        print(f'Predicted label: {label}')
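
# Hypothetical entry point (not in the original snippet) so the tutorial's
# main() above can be executed directly.
if __name__ == '__main__':
    main()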