Example #1
def translate(sentence, transformer):
    tokenizer_en = tf_text.BertTokenizer('vocabs/vocab_en.txt')
    tokenizer_es = tf_text.BertTokenizer('vocabs/vocab_es.txt')

    sentence = tf.convert_to_tensor([sentence])
    encoder_input = tokenizer_en.tokenize(sentence)
    encoder_input = encoder_input.merge_dims(-2, -1)
    encoder_input = add_start_end(encoder_input).to_tensor()

    output = tf.convert_to_tensor([START])
    output = tf.expand_dims(output, 0)

    for i in range(MAX_LENGTH):
        predictions = transformer(encoder_input, output, False)
        predictions = predictions[:, -1:, :]
        predicted_id = tf.argmax(predictions, axis=-1)

        output = tf.concat([output, predicted_id], axis=-1)

        if predicted_id == END:
            break

    text = tokenizer_es.detokenize(output)[0].numpy()
    text = tf.strings.reduce_join(text, separator=' ', axis=-1)

    # strip the leading '[START] ' and trailing ' [END]' markers
    return text.numpy().decode('utf-8')[8:-6]
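
The function above relies on helpers that are not shown here. Below is a minimal sketch of what `add_start_end`, `START`, `END`, and `MAX_LENGTH` might look like, assuming the vocab files reserve `[START]`/`[END]` tokens; the reserved-token list and the decoding limit are assumptions, not part of the original snippet.

import tensorflow as tf

# Assumed reserved tokens at the start of the vocab files.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]
START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")
MAX_LENGTH = 64  # assumed upper bound on decoded tokens

def add_start_end(ragged):
    # Prepend [START] and append [END] to every sequence in the ragged batch.
    count = ragged.bounding_shape()[0]
    starts = tf.fill([count, 1], START)
    ends = tf.fill([count, 1], END)
    return tf.concat([starts, ragged, ends], axis=1)
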
Example #2
    def __init__(self, reserved_tokens, vocab_path):
        self.tokenizer = text.BertTokenizer(vocab_path, lower_case=False)
        self._reserved_tokens = reserved_tokens
        self._vocab_path = tf.saved_model.Asset(vocab_path)

        vocab = pathlib.Path(vocab_path).read_text().splitlines()
        self.vocab = tf.Variable(vocab)

        ## Create the signatures for export:

        # Include a tokenize signature for a batch of strings.
        self.tokenize.get_concrete_function(
            tf.TensorSpec(shape=[None], dtype=tf.string))

        # Include `detokenize` and `lookup` signatures for:
        #   * `Tensors` with shapes [tokens] and [batch, tokens]
        #   * `RaggedTensors` with shape [batch, tokens]
        self.detokenize.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.detokenize.get_concrete_function(
            tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

        self.lookup.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.lookup.get_concrete_function(
            tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

        # These `get_*` methods take no arguments
        self.get_vocab_size.get_concrete_function()
        self.get_vocab_path.get_concrete_function()
        self.get_reserved_tokens.get_concrete_function()
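
For context, a class like this (it follows the TensorFlow subword-tokenizer tutorial) is usually attached to a `tf.Module` and exported so the registered concrete functions become callable signatures of the reloaded SavedModel. A minimal sketch, assuming the class is named `CustomTokenizer`, its methods are `@tf.function`-decorated, and the vocab path, export directory, and reserved-token list shown here are placeholders:

import tensorflow as tf

reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]  # assumed

tokenizers = tf.Module()
tokenizers.en = CustomTokenizer(reserved_tokens, 'vocab_en.txt')
tf.saved_model.save(tokenizers, 'tokenizer_export')

reloaded = tf.saved_model.load('tokenizer_export')
tokens = reloaded.en.tokenize(tf.constant(['Hello TensorFlow!']))
words = reloaded.en.detokenize(tokens)
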
Example #3
    def show_html(self, token_impact: np.ndarray, initial_tokens: np.ndarray, label_index: int = 0):
        # sp = spm.SentencePieceProcessor(model_file='../inputs/embd/sentencepiece_bpe.model')
        tokenizer = text.BertTokenizer("../inputs/bert_tokens.model")

        arr = np.zeros(99757)
        color_map = cm.get_cmap("Reds")

        token_impact /= np.max(token_impact)  # normalization to range [0; 1]
        with open("../outputs/text_{}.html".format(self.model_name), "a") as file:
            file.write("<div><h2>Author #{}</h2>\n".format(label_index))
            for token, impact in zip(initial_tokens, token_impact):
                # if impact > 0.5:
                arr[int(token)] += 1
                local_impact = self.get_color(color_map, impact)
                word = tokenizer.detokenize([[int(token)]]).to_list()[0][0].decode("utf-8")
                # special tokens
                if word == "TAB":
                    word = "&nbsp&nbsp&nbsp&nbsp"
                elif word == "SPC":
                    word = "&nbsp"
                elif word == "NLN":
                    file.write("<br>")
                    continue

                file.write("<span style='background-color: rgba({}, {}, {}, {})'>{}</span>"
                           .format(*local_impact, word))

            file.write("</div>")
        return arr
Example #4
def get_tf_tokenizer(module_handle, tokenization_info=None):
    """Creates a preprocessing function."""
    LOGGER.debug("(get_tf_tokenizer): get_tokenization_info")
    # Get tokenization info to know where the vocab file is and whether the
    # model is lower-cased.
    if tokenization_info is None:
        tokenization_info = get_tokenization_info(module_handle=module_handle)

    LOGGER.debug("(get_tf_tokenizer): tf.lookup.TextFileInitializer")
    # Create a lookup table initializer from a text file (the vocab file)
    table_initializer = tf.lookup.TextFileInitializer(
        filename=tokenization_info["vocab_file"],
        key_dtype=tf.string,
        key_index=tf.lookup.TextFileIndex.WHOLE_LINE,
        value_dtype=tf.int64,
        value_index=tf.lookup.TextFileIndex.LINE_NUMBER)

    LOGGER.debug("(get_tf_tokenizer): tf.lookup.StaticVocabularyTable")
    # Make the table itself
    vocab_lookup_table = tf.lookup.StaticVocabularyTable(
        initializer=table_initializer,
        num_oov_buckets=1,
        lookup_key_dtype=tf.string)

    LOGGER.debug("(get_tf_tokenizer): tf_text.BertTokenizer")
    # Build the tokenizer
    tokenizer = tf_text.BertTokenizer(
        vocab_lookup_table=vocab_lookup_table,
        lower_case=tokenization_info["do_lower_case"])

    LOGGER.debug("(get_tf_tokenizer): Done")
    return tokenizer, vocab_lookup_table
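
A hedged usage sketch for `get_tf_tokenizer`; the TF Hub handle below is only illustrative, and `get_tokenization_info` is assumed to resolve it to a vocab file and a `do_lower_case` flag:

import tensorflow as tf

tokenizer, vocab_table = get_tf_tokenizer(
    module_handle="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1")

# BertTokenizer returns a RaggedTensor of shape (batch, words, wordpieces);
# merging the last two axes yields one id sequence per input string.
token_ids = tokenizer.tokenize(tf.constant(["hello world"]))
token_ids = token_ids.merge_dims(-2, -1)
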
Example #5
    def tokenize_single_sentence(self,
                                 sequence,
                                 max_len=128,
                                 addCLS=True,
                                 addSEP=True):
        """Tokenize a single sentence to ID according to the vocab.txt provided.
        Add special tokens according to config."""

        tokenizer = text.BertTokenizer(self.vocab_dir, token_out_type=tf.int64)
        word_id = tokenizer.tokenize(sequence)
        word_id = word_id.merge_dims(1, 2)[:, :max_len]
        word_id = word_id.to_tensor(default_value=self.PAD_ID)
        if addCLS:
            CLSToken = tf.fill([tf.shape(sequence)[0], 1], self.CLS_ID)
            word_id = word_id[:, :max_len - 1]
            word_id = tf.concat([CLSToken, word_id], axis=1)

        if addSEP:
            SEPToken = tf.fill([tf.shape(sequence)[0], 1], self.SEP_ID)
            word_id = word_id[:, :max_len - 1]
            word_id = tf.concat([word_id, SEPToken], axis=1)

        word_id = tf.pad(word_id, [[0, 0], [0, max_len]],
                         constant_values=self.PAD_ID)

        word_id = tf.slice(word_id, [0, 0], [-1, max_len])

        # Mask to distinguish padded values.
        input_mask = tf.cast(word_id > 0, tf.int64)
        # Mask to distinguish two sentences. In this case, just one sentence.
        segment_id = tf.fill(tf.shape(input_mask),
                             tf.constant(0, dtype=tf.int64))

        return word_id, input_mask, segment_id
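
A hedged example of calling the method above; `preprocessor` is a hypothetical stand-in for whatever object owns it, with `vocab_dir`, `CLS_ID`, `SEP_ID`, and `PAD_ID` already set:

import tensorflow as tf

# `preprocessor` is hypothetical; it owns tokenize_single_sentence().
sentences = tf.constant(["the quick brown fox", "hello world"])
word_id, input_mask, segment_id = preprocessor.tokenize_single_sentence(
    sentences, max_len=16, addCLS=True, addSEP=True)
# word_id:    [2, 16] int64 token ids, padded with PAD_ID
# input_mask: [2, 16] ones where a real (non-pad) token is present
# segment_id: [2, 16] zeros, since there is only one sentence per example
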
Example #6
    def __init__(self,
                 *,
                 vocab_file: str,
                 lower_case: bool,
                 tokenize_with_offsets: bool = False,
                 **kwargs):
        """Initialize a BertTokenizer layer.

    Args:
      vocab_file: A Python string with the path of the vocabulary file.
        This is a text file with newline-separated wordpiece tokens.
        This layer initializes a lookup table from it that gets used with
        text.BertTokenizer.
      lower_case: A Python boolean forwarded to text.BertTokenizer.
        If true, input text is converted to lower case (where applicable)
        before tokenization. This must be set to match the way in which
        the vocab_file was created.
      tokenize_with_offsets: A Python boolean. If true, this layer calls
         BertTokenizer.tokenize_with_offsets() instead of plain .tokenize()
         and outputs a triple of (tokens, start_offsets, limit_offsets)
         instead of just tokens.
      **kwargs: standard arguments to Layer().

    Raises:
      ImportError: if importing tensorflow_text failed.
    """
        _check_if_tf_text_installed()

        self.tokenize_with_offsets = tokenize_with_offsets
        self._vocab_table = self._create_vocab_table(vocab_file)
        self._special_tokens_dict = self._create_special_tokens_dict(
            self._vocab_table, vocab_file)
        super().__init__(**kwargs)
        self._bert_tokenizer = text.BertTokenizer(self._vocab_table,
                                                  lower_case=lower_case)
Example #7
 def __init__(self, params):
     super().__init__(params)
     self._tokenizer = tf_text.BertTokenizer(
         params.vocab_path,
         lower_case=True,
         max_bytes_per_word=200,
         token_out_type=tf.int32,
     )
Example #8
def _tokenize(stringA):
    """Tokenize the two sentences and insert appropriate tokens"""
    tokenizer = text.BertTokenizer(
        "vocab.txt",
        token_out_type=tf.string,
    )

    stringA = tf.squeeze(stringA)
    idA = tokenizer.tokenize(stringA)
    #idB = tokenizer.tokenize(stringB)
    return idA.merge_dims(-2, -1).to_sparse()
Example #9
    def __post_init__(self):
        tokenizer = tensorflow_text.BertTokenizer(self.vocab_path,
                                                  token_out_type=tf.int32,
                                                  lower_case=True)
        with tf.io.gfile.GFile(self.vocab_path) as f:
            vocab = f.read().split('\n')
        cls_token = vocab.index('[CLS]')

        # Work-around for frozen dataclasses:
        # https://stackoverflow.com/questions/53756788
        object.__setattr__(self, 'cls_token', cls_token)
        object.__setattr__(self, '_tokenizer', tokenizer)
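
The `object.__setattr__` calls are the usual workaround for assigning attributes in `__post_init__` of a frozen dataclass, as the linked Stack Overflow question explains. A standalone sketch of the same pattern (the class and field names here are hypothetical, and 101 is just an illustrative [CLS] id):

import dataclasses

@dataclasses.dataclass(frozen=True)
class TokenizerConfig:
    vocab_path: str
    cls_token: int = dataclasses.field(init=False)

    def __post_init__(self):
        # `self.cls_token = ...` would raise FrozenInstanceError,
        # so bypass the frozen __setattr__ instead.
        object.__setattr__(self, 'cls_token', 101)

cfg = TokenizerConfig(vocab_path='vocab.txt')
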
Example #10
    def __init__(self,
                 *,
                 vocab_file: str,
                 lower_case: Optional[bool] = None,
                 tokenize_with_offsets: bool = False,
                 tokenizer_kwargs: Optional[Mapping[Text, Any]] = None,
                 **kwargs):
        """Initialize a `BertTokenizer` layer.

    Args:
      vocab_file: A Python string with the path of the vocabulary file.
        This is a text file with newline-separated wordpiece tokens.
        This layer initializes a lookup table from it that gets used with
        `text.BertTokenizer`.
      lower_case: Optional boolean forwarded to `text.BertTokenizer`.
        If true, input text is converted to lower case (where applicable)
        before tokenization. This must be set to match the way in which
        the `vocab_file` was created. If passed, this overrides whatever value
        may have been passed in `tokenizer_kwargs`.
      tokenize_with_offsets: A Python boolean. If true, this layer calls
        `text.BertTokenizer.tokenize_with_offsets()` instead of plain
        `text.BertTokenizer.tokenize()` and outputs a triple of
        `(tokens, start_offsets, limit_offsets)`
        instead of just tokens.
      tokenizer_kwargs: Optional mapping with keyword arguments to forward to
        `text.BertTokenizer`'s constructor.
      **kwargs: Standard arguments to `Layer()`.

    Raises:
      ImportError: If importing `tensorflow_text` failed.
    """
        _check_if_tf_text_installed()

        self.tokenize_with_offsets = tokenize_with_offsets
        # TODO(b/177326279): Stop storing the vocab table initializer as an
        # attribute when https://github.com/tensorflow/tensorflow/issues/46456
        # has been fixed in the TensorFlow versions of the TF Hub users that load
        # a SavedModel created from this layer. Due to that issue, loading such a
        # SavedModel forgets to add .vocab_table._initializer as a trackable
        # dependency of .vocab_table, so that saving it again to a second SavedModel
        # (e.g., the final model built using TF Hub) does not properly track
        # the ._vocab_table._initializer._filename as an Asset.
        self._vocab_table, self._vocab_initializer_donotuse = (
            self._create_vocab_table_and_initializer(vocab_file))
        self._special_tokens_dict = self._create_special_tokens_dict(
            self._vocab_table, vocab_file)
        super().__init__(**kwargs)
        tokenizer_kwargs = dict(tokenizer_kwargs or {})
        if lower_case is not None:
            tokenizer_kwargs["lower_case"] = lower_case
        self._bert_tokenizer = text.BertTokenizer(self._vocab_table,
                                                  **tokenizer_kwargs)
Example #11
 def build(self, input_shape: tf.TensorShape) -> None:
     self.tokenizer = tftext.BertTokenizer(
         tf.lookup.StaticVocabularyTable(
             tf.lookup.KeyValueTensorInitializer(
                 self.vocab,
                 list(range(len(self.vocab))),
                 key_dtype=tf.string,
                 value_dtype=tf.int64,
             ),
             1,
         ),
         max_chars_per_token=self.max_chars_per_token,
     )
     super().build(input_shape)
Example #12
    def initial_preprocess(self, df_path: str, tmp_dataset_filename: str):
        df = self._initial_load(df_path)
        df = df[(df.n_lines > 0)]
        # tokenize. requires time (approx 1h)

        df.flines = df.flines.apply(self._insert_tokens)
        print("updated")
        text_dataset = tf.data.Dataset.from_tensor_slices(df.flines.values)

        vocab = bert_vocab.bert_vocab_from_dataset(
            text_dataset,
            **bert_vocab_args
        )
        self._write_vocab_file("../inputs/bert_tokens.model", vocab)
        print("saved")
        # read the tokenizer
        tokenizer = text.BertTokenizer("../inputs/bert_tokens.model")

        # reduce the size of the dataset according to the n_tokens
        df.index = np.arange(len(df))
        df["n_tokens"] = df.flines.apply(lambda x: tokenizer.tokenize(x).shape[0])
        df = df[df.n_tokens <= self.input_size]
        # reindex
        df.index = np.arange(len(df))
        # reduce size
        df = self._user_selection_and_encoding(df, 50, 450)
        # long saving
        # The issue is that `tokenizer.tokenize()` does not always return a shape of (-1, 1).
        # Some elements of the result can themselves be lists, e.g. [[2929, 8524]].
        # >> tokenizer.detokenize([[2929, 8524]])
        # < tf.RaggedTensor[[b'visdist']] >
        # >> tokenizer.detokenize([[2929]])
        # < tf.RaggedTensor[[b'vis']] >
        # >> tokenizer.detokenize([[8524]])
        # < tf.RaggedTensor[[b'##dist']] >
        # I have decided to flatten these lists.
        df["tokens"] = df.flines.apply(lambda x: tokenizer.tokenize(x).to_list())

        df.tokens = df.tokens.apply(lambda x: list(pd.core.common.flatten(x)))
        dataset = df[["user", "tokens", "task"]]
        # shuffle dataset
        dataset = dataset.sample(frac=1)

        def rsh(x):
            arr = np.array(x)
            arr = np.resize(arr, (self.input_size, 1))
            return arr.tolist()

        dataset.tokens = dataset.tokens.apply(rsh)
        dataset.to_json(tmp_dataset_filename)
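
The comment block above describes why the per-file token lists are flattened: `BertTokenizer.tokenize` nests wordpieces per word. A small sketch of the same behaviour (whether 'visdist' really splits into ids 2929 and 8524 depends on the learned vocab):

import tensorflow_text as text

tokenizer = text.BertTokenizer("../inputs/bert_tokens.model")

ragged = tokenizer.tokenize(["visdist"])
# ragged has shape (batch, words, wordpieces); a single word may map to
# several wordpiece ids, e.g. [[[2929, 8524]]] for 'vis' + '##dist'.
flat = ragged.merge_dims(-2, -1)  # one flat id sequence per input string
print(flat.to_list())
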
Example #13
def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

  Args:
    inputs: map from feature keys to raw not-yet-transformed features.

  Returns:
    Map from string feature key to transformed feature operations.
  """
    test = tf.constant(['test sentence'])
    tokenizer = text.BertTokenizer(
        "vocab.txt",
        token_out_type=tf.string,
    )
    output = tokenizer.tokenize(test)
    return inputs
Example #14
def convert_huggingface_tokenizer(huggingface_tokenizer,
                                  suffix_indicator="##",
                                  max_chars_per_token=None,
                                  split_unknown_characters=True,
                                  lower_case=True,
                                  keep_whitespace=False,
                                  normalization_form=None,
                                  preserve_unused_token=True,
                                  dtype=tf.int32):

    vocab_lookup_table = tf.lookup.StaticHashTable(
        tf.lookup.KeyValueTensorInitializer(
            keys=list(huggingface_tokenizer.vocab.keys()),
            values=tf.constant(list(huggingface_tokenizer.vocab.values()),
                               dtype=tf.int64)),
        default_value=0)

    special_ids_mask_table = tf.lookup.StaticHashTable(
        tf.lookup.KeyValueTensorInitializer(
            keys=tf.constant(huggingface_tokenizer.all_special_ids,
                             dtype=dtype),
            values=tf.constant(1,
                               dtype=dtype,
                               shape=len(
                                   huggingface_tokenizer.all_special_ids)),
            key_dtype=dtype,
            value_dtype=dtype),
        default_value=tf.constant(0, dtype=dtype))

    # Forward the function's arguments instead of re-hardcoding their defaults.
    tokenizer_tf_text = tf_text.BertTokenizer(
        vocab_lookup_table=vocab_lookup_table,
        suffix_indicator=suffix_indicator,
        max_bytes_per_word=huggingface_tokenizer.wordpiece_tokenizer
        .max_input_chars_per_word,
        max_chars_per_token=max_chars_per_token,
        token_out_type=dtype,
        unknown_token=huggingface_tokenizer.unk_token,
        split_unknown_characters=split_unknown_characters,
        lower_case=lower_case,
        keep_whitespace=keep_whitespace,
        normalization_form=normalization_form,
        preserve_unused_token=preserve_unused_token)

    return tokenizer_tf_text, vocab_lookup_table, special_ids_mask_table
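
A hedged usage sketch for the converter above, assuming a slow (pure-Python) `transformers.BertTokenizer`, which exposes the `.vocab`, `.all_special_ids`, `.unk_token`, and `.wordpiece_tokenizer` attributes the function reads:

import tensorflow as tf
from transformers import BertTokenizer

hf_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tf_tokenizer, vocab_table, special_ids_mask = convert_huggingface_tokenizer(
    hf_tokenizer, dtype=tf.int32)

# Both tokenizers should now produce the same wordpiece ids.
token_ids = tf_tokenizer.tokenize(["hello world"]).merge_dims(-2, -1)
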
Example #15
    def __init__(self,
                 *,
                 vocab_file: str,
                 lower_case: bool,
                 tokenize_with_offsets: bool = False,
                 **kwargs):
        """Initialize a BertTokenizer layer.

    Args:
      vocab_file: A Python string with the path of the vocabulary file.
        This is a text file with newline-separated wordpiece tokens.
        This layer initializes a lookup table from it that gets used with
        text.BertTokenizer.
      lower_case: A Python boolean forwarded to text.BertTokenizer.
        If true, input text is converted to lower case (where applicable)
        before tokenization. This must be set to match the way in which
        the vocab_file was created.
      tokenize_with_offsets: A Python boolean. If true, this layer calls
         BertTokenizer.tokenize_with_offsets() instead of plain .tokenize()
         and outputs a triple of (tokens, start_offsets, limit_offsets)
         instead of just tokens.
      **kwargs: standard arguments to Layer().

    Raises:
      ImportError: if importing tensorflow_text failed.
    """
        _check_if_tf_text_installed()

        self.tokenize_with_offsets = tokenize_with_offsets
        # TODO(b/177326279): Stop storing the vocab table initializer as an
        # attribute when https://github.com/tensorflow/tensorflow/issues/46293
        # has been fixed in the TensorFlow versions of the TF Hub users that load
        # a SavedModel created from this layer. Due to that issue, loading such a
        # SavedModel forgets to add .vocab_table._initializer as a trackable
        # dependency of .vocab_table, so that saving it again to a second SavedModel
        # (e.g., the final model built using TF Hub) does not properly track
        # the ._vocab_table._initializer._filename as an Asset.
        self._vocab_table, self._vocab_initializer_donotuse = (
            self._create_vocab_table_and_initializer(vocab_file))
        self._special_tokens_dict = self._create_special_tokens_dict(
            self._vocab_table, vocab_file)
        super().__init__(**kwargs)
        self._bert_tokenizer = text.BertTokenizer(self._vocab_table,
                                                  lower_case=lower_case)
Example #16
    def __init__(self, reserved_tokens, vocab_path):
        self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
        self._reserved_tokens = reserved_tokens
        self._vocab_path = tf.saved_model.Asset(vocab_path)

        vocab = pathlib.Path(vocab_path).read_text().splitlines()
        self.vocab = tf.Variable(vocab)

        self.tokenize.get_concrete_function(tf.TensorSpec(shape=[None], dtype=tf.string))

        self.detokenize.get_concrete_function(tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.detokenize.get_concrete_function(tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

        self.lookup.get_concrete_function(tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.lookup.get_concrete_function(tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

        self.get_reserved_tokens.get_concrete_function()
        self.get_vocab_path.get_concrete_function()
        self.get_vocab_size.get_concrete_function()
Example #17
def classify(tweet, model):

    tokenizer = tf_text.BertTokenizer('vocab.txt')

    input = tokenizer.tokenize([tweet])
    input = input.merge_dims(-2, -1)
    input = tf.keras.preprocessing.sequence.pad_sequences(input.to_list(),
                                                          padding="post",
                                                          maxlen=params['MAX_LEN'])
    prediction = model(input, training=False).numpy()

    if prediction[0][0] > .5:
        sentiment = 'positivo'
        value = prediction[0][0] * 100
    else:
        sentiment = 'negativo'
        value = (1 - prediction[0][0]) * 100

    return value, sentiment
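
A hedged call example; `model` is assumed to be a trained Keras binary sentiment classifier and `params['MAX_LEN']` the padding length used during training:

# The tweet here is just an illustrative Spanish sentence.
value, sentiment = classify('me encanta este producto', model)
print('{}: {:.1f}%'.format(sentiment, value))
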
Example #18
def get_tf_tokenizer(module_handle):
  """Creates a preprocessing function."""
  tokenization_info = get_tokenization_info(module_handle)

  table_initializer = tf.lookup.TextFileInitializer(
      filename=tokenization_info["vocab_file"],
      key_dtype=tf.string,
      key_index=tf.lookup.TextFileIndex.WHOLE_LINE,
      value_dtype=tf.int64,
      value_index=tf.lookup.TextFileIndex.LINE_NUMBER)
  vocab_lookup_table = tf.lookup.StaticVocabularyTable(
      initializer=table_initializer,
      num_oov_buckets=1,
      lookup_key_dtype=tf.string)

  tokenizer = tf_text.BertTokenizer(
      vocab_lookup_table=vocab_lookup_table,
      lower_case=tokenization_info["do_lower_case"])

  return tokenizer, vocab_lookup_table
Example #19
  def tokenize_single_sentence_unpad(self,
                                     sequence: tf.Tensor,
                                     max_len: int = 128,
                                     add_cls: bool = True,
                                     add_sep: bool = True):
    """Tokenize a sentence with the BERT model vocab file and without padding.

    Add special tokens according to config.

    Args:
      sequence: Tensor of shape [batch_size, 1].
      max_len: The number of tokens after padding and truncating.
      add_cls: Whether to add CLS token at the front of each sequence.
      add_sep: Whether to add SEP token at the end of each sequence.

    Returns:
      word_ids: Ragged tokenized sequences [batch_size, None].
    """
    vocab_file_path = self._model.resolved_object.vocab_file.asset_path
    tokenizer = text.BertTokenizer(
        vocab_file_path,
        lower_case=self._do_lower_case,
        token_out_type=tf.int64)
    word_ids = tokenizer.tokenize(sequence)
    # The tokenizer nests wordpieces in an extra dimension; merge_dims flattens it.
    word_ids = word_ids.merge_dims(-2, -1)
    if add_cls:
      cls_token = tf.fill([tf.shape(sequence)[0], 1],
                          tf.constant(self._cls_id, dtype=tf.int64))

      word_ids = tf.concat([cls_token, word_ids], 1)

    if add_sep:
      sep_token = tf.fill([tf.shape(sequence)[0], 1],
                          tf.constant(self._sep_id, dtype=tf.int64))

      word_ids = word_ids[:, :max_len - 1]
      word_ids = tf.concat([word_ids, sep_token], 1)

    return word_ids
Example #20
# Commented out IPython magic to ensure Python compatibility.
# %%time
pada_vocab = bert_vocab.bert_vocab_from_dataset(
    train_pada.batch(1000).prefetch(2), **bert_vocab_args)

print(pada_vocab[:10])
print(pada_vocab[100:110])
print(pada_vocab[1000:1010])
print(pada_vocab[-10:])

write_vocab_file(work_dir + 'pada_vocab.txt', pada_vocab)

# !ls *.txt

samh_tokenizer = text.BertTokenizer(work_dir + 'samh_vocab.txt',
                                    **bert_tokenizer_params)
pada_tokenizer = text.BertTokenizer(work_dir + 'pada_vocab.txt',
                                    **bert_tokenizer_params)

for samh_examples, pada_examples in train_examples.batch(3).take(1):
    for ex in pada_examples:
        print(ex.numpy())

# Tokenize the examples -> (batch, word, word-piece)
token_batch = pada_tokenizer.tokenize(pada_examples)
# Merge the word and word-piece axes -> (batch, tokens)
token_batch = token_batch.merge_dims(-2, -1)

# for ex in token_batch.to_list():
#   print(ex)
Example #21
def preprocessing_fn(inputs):
    """Preprocess input column of text into transformed columns of.
        * input token ids
        * input mask
        * input type ids
    """

    CLS_ID = tf.constant(101, dtype=tf.int64)
    SEP_ID = tf.constant(102, dtype=tf.int64)
    PAD_ID = tf.constant(0, dtype=tf.int64)

    vocab_file_path = load_bert_layer().resolved_object.vocab_file.asset_path

    bert_tokenizer = text.BertTokenizer(vocab_lookup_table=vocab_file_path,
                                        token_out_type=tf.int64,
                                        lower_case=do_lower_case)

    def tokenize_text(text, sequence_length=MAX_SEQ_LEN):
        """
        Perform the BERT preprocessing from text -> input token ids
        """

        # convert text into token ids
        tokens = bert_tokenizer.tokenize(text)

        # flatten the output ragged tensors
        tokens = tokens.merge_dims(1, 2)[:, :sequence_length]

        # Add start and end token ids to the id sequence
        start_tokens = tf.fill([tf.shape(text)[0], 1], CLS_ID)
        end_tokens = tf.fill([tf.shape(text)[0], 1], SEP_ID)
        tokens = tokens[:, :sequence_length - 2]
        tokens = tf.concat([start_tokens, tokens, end_tokens], axis=1)

        # truncate sequences greater than MAX_SEQ_LEN
        tokens = tokens[:, :sequence_length]

        # pad shorter sequences with the pad token id
        tokens = tokens.to_tensor(default_value=PAD_ID)
        pad = sequence_length - tf.shape(tokens)[1]
        tokens = tf.pad(tokens, [[0, 0], [0, pad]], constant_values=PAD_ID)

        # and finally reshape the word token ids to fit the output
        # data structure of TFT
        return tf.reshape(tokens, [-1, sequence_length])

    def preprocess_bert_input(text):
        """
        Convert input text into the input_word_ids, input_mask, input_type_ids
        """
        input_word_ids = tokenize_text(text)
        input_mask = tf.cast(input_word_ids > 0, tf.int64)
        input_mask = tf.reshape(input_mask, [-1, MAX_SEQ_LEN])

        zeros_dims = tf.stack(tf.shape(input_mask))
        input_type_ids = tf.fill(zeros_dims, 0)
        input_type_ids = tf.cast(input_type_ids, tf.int64)

        return (input_word_ids, input_mask, input_type_ids)

    input_word_ids, input_mask, input_type_ids = \
        preprocess_bert_input(tf.squeeze(inputs['text'], axis=1))

    return {
        'input_word_ids': input_word_ids,
        'input_mask': input_mask,
        'input_type_ids': input_type_ids,
        'label': inputs['label']
    }
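
A hedged sketch of how a `preprocessing_fn` like this one is typically applied with the Beam implementation of `tensorflow_transform`; `raw_dataset` and `raw_metadata` are assumed to come from an upstream reader:

import apache_beam as beam
import tensorflow_transform.beam as tft_beam

# raw_dataset / raw_metadata are assumed to be produced upstream.
with beam.Pipeline() as pipeline:
    with tft_beam.Context(temp_dir='/tmp/tft'):
        transformed_dataset, transform_fn = (
            (raw_dataset, raw_metadata)
            | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
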
Example #22
# load the vocabulary we have created

bert_tokenizer_params = dict(lower_case=True)
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]
bert_vocab_args = dict(
    # maximum vocabulary size
    vocab_size=8000 * 7,
    # reserved wordpieces that must be included
    reserved_tokens=reserved_tokens,
    # additional arguments
    bert_tokenizer_params=bert_tokenizer_params,
    learn_params={},
)

# lager en "tokenizer", som deler tekst opp i orddeler
tokenizer = text.BertTokenizer('vocab.txt', **bert_tokenizer_params)

tokenlist = open('vocab.txt', 'r', encoding="utf-8").readlines()

# the ID of the padding token
PAD_ID = 0
# maximum length of the vector;
# if the vector is shorter, padding is added
max_seq_len = 20
langs = 7


def preprocess_bert_input(text):
    # find the IDs of all the wordpieces in the input
    ids = tokenize_text(text, max_seq_len)
    # create a mask, which in this case represents the length of our vector
Example #23
    def __init__(self,
                 bert_layer,
                 max_len,
                 min_len=1,
                 CLS='[CLS]',
                 SEP='[SEP]',
                 PAD='[PAD]',
                 UNK='[UNK]'):
        """ Initializes the layer

        :param CLS Token that represents the start of a sentence
        :param SEP Token that represents the end of a segment
        :param PAD Token that represents padding
        :param UNK Token that represents unknown tokens
        :param bert_layer Keras layer that loaded from pretrained BERT
        """
        super().__init__()
        self._CLS = CLS
        self._SEP = SEP
        self._PAD = PAD
        self._min_len = min_len
        self._max_len = max_len

        resolved_object = bert_layer.resolved_object
        self.do_lower_case = resolved_object.do_lower_case.numpy()
        if hasattr(resolved_object, "tokenizer_type"):
            tokenizer_type_file = resolved_object.tokenizer_type.asset_path.numpy(
            ).decode("utf-8")
            with tf.io.gfile.GFile(tokenizer_type_file, 'r') as f_handler:
                self._tokenizer_type = f_handler.read().strip()
            tokenizer_file = resolved_object.tokenizer_file.asset_path.numpy(
            ).decode("utf-8")
            if self._tokenizer_type == SENTENCEPIECE:
                with tf.io.gfile.GFile(tokenizer_file, 'rb') as f_handler:
                    sp_model = f_handler.read()
                self._tokenizer = tf_text.SentencepieceTokenizer(
                    model=sp_model, out_type=tf.int32)
                self.vocab_table = create_tf_vocab_from_sp_tokenizer(
                    self._tokenizer, num_oov_buckets=1)
            else:
                assert (self._tokenizer_type == SPACE)
                _, self.vocab_table = read_tf_vocab(tokenizer_file, UNK)
        else:
            vocab_file = resolved_object.vocab_file.asset_path.numpy().decode(
                "utf-8")
            _, self.vocab_table = create_tf_vocab_from_wp_tokenizer(
                vocab_file, num_oov_buckets=1)
            self._tokenizer = tf_text.BertTokenizer(
                self.vocab_table,
                token_out_type=tf.int64,
                lower_case=self.do_lower_case,
                unknown_token=UNK)
            self._tokenizer_type = WORDPIECE

        self._pad_id = self.vocab_table.lookup(tf.constant(PAD)) if PAD else -1
        self._cls_id = self.vocab_table.lookup(tf.constant(CLS)) if CLS else -1
        self._sep_id = self.vocab_table.lookup(tf.constant(SEP)) if SEP else -1

        if self._tokenizer_type == SENTENCEPIECE:
            self._pad_id = tf.cast(self._pad_id, tf.int32)
            self._cls_id = tf.cast(self._cls_id, tf.int32)
            self._sep_id = tf.cast(self._sep_id, tf.int32)
Example #24
start = time.time()
for _ in range(epochs):
    for batch_data in batches:
        input_ids = py_tokenizer(
            batch_data)  #, padding=True, truncation=True)
end = time.time()
print(
    "The throughput of huggingface python tokenizer: {:,.2f} tokens/s".format(
        (total_tokens / (end - start))))

# BERT Tokenizer using TensorFlow Text
vocab_list = list(py_tokenizer.vocab.token_to_idx.keys())
lookup_table = tf.lookup.StaticVocabularyTable(
    tf.lookup.KeyValueTensorInitializer(keys=vocab_list,
                                        key_dtype=tf.string,
                                        values=tf.range(tf.size(
                                            vocab_list, out_type=tf.int64),
                                                        dtype=tf.int64),
                                        value_dtype=tf.int64),
    num_oov_buckets=1)

tf_tokenizer = tf_text.BertTokenizer(lookup_table)

for batch_data in batches:
    input_ids = tf_tokenizer.tokenize(batch_data)

start = time.time()
for _ in range(epochs):
    for batch_data in batches:
        input_ids = tf_tokenizer.tokenize(batch_data)
end = time.time()
print(
    "The throughput of TensorFlow Text BertTokenizer: {:,.2f} tokens/s".format(
        (total_tokens / (end - start))))