Example no. 1
    def _tokenize_example(document, summary):
        tokenizer = tft.SentencepieceTokenizer(
            model=tf.io.gfile.GFile(vocab_model_file, "rb").read())
        if substitute_newline:
            document = tf.strings.regex_replace(document, "\n",
                                                substitute_newline)
        # Remove space before special tokens.
        document = tf.strings.regex_replace(document, r" ([<\[]\S+[>\]])",
                                            b"\\1")
        document_ids = tokenizer.tokenize(document)
        if isinstance(document_ids, tf.RaggedTensor):
            document_ids = document_ids.to_tensor(0)
        document_ids = document_ids[:max_encoder_length]

        # Substitute newline characters, if configured.
        if substitute_newline:
            summary = tf.strings.regex_replace(summary, "\n",
                                               substitute_newline)
        # Remove space before special tokens.
        summary = tf.strings.regex_replace(summary, r" ([<\[]\S+[>\]])",
                                           b"\\1")
        summary_ids = tokenizer.tokenize(summary)
        # Add [EOS] (1) special tokens.
        suffix = tf.constant([1])
        summary_ids = tf.concat([summary_ids, suffix], axis=0)
        if isinstance(summary_ids, tf.RaggedTensor):
            summary_ids = summary_ids.to_tensor(0)
        summary_ids = summary_ids[:max_decoder_length]

        return document_ids, summary_ids
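
A minimal usage sketch for the mapper above (hypothetical wiring: it assumes the closure variables vocab_model_file, substitute_newline, max_encoder_length and max_decoder_length are already defined, and that vocab_model_file points to a serialized SentencePiece model):

    # Map the tokenizer over a toy dataset of (document, summary) string pairs.
    raw_ds = tf.data.Dataset.from_tensor_slices(
        (["First document.\nSecond line."], ["Short summary."]))
    tokenized_ds = raw_ds.map(_tokenize_example,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)
    for document_ids, summary_ids in tokenized_ds.take(1):
        print(document_ids.numpy(), summary_ids.numpy())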
Example no. 2
    def input_fn():
        # text input
        text = tf.compat.v1.placeholder(tf.string, [batch_size],
                                        name="input_text")

        # text tokenize
        tokenizer = tft.SentencepieceTokenizer(
            model=tf.io.gfile.GFile(vocab_model_file, "rb").read())
        if substitute_newline:
            text = tf.strings.regex_replace(text, "\n", substitute_newline)
        # Remove space before special tokens.
        text = tf.strings.regex_replace(text, r" ([<\[]\S+[>\]])", b"\\1")
        ids = tokenizer.tokenize(text)
        if isinstance(ids, tf.RaggedTensor):
            ids = ids.to_tensor(0)

        # text padding: Pad only if necessary and reshape properly
        padded_ids = dynamic_padding(ids, max_encoder_length)
        ids = tf.slice(padded_ids, [0, 0], [batch_size, max_encoder_length])

        receiver_tensors = {"input": text}
        features = {"input_ids": tf.cast(ids, tf.int32, name="input_ids")}

        return tf.estimator.export.ServingInputReceiver(
            features=features, receiver_tensors=receiver_tensors)
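
dynamic_padding is not shown on this page; a minimal sketch of what such a helper could look like, assuming it only needs to right-pad the second dimension with zeros so that the tf.slice call above always has at least max_encoder_length columns to take:

    def dynamic_padding(ids, min_len):
        # Zero-pad axis 1 up to min_len; longer inputs are left untouched.
        pad_amount = tf.maximum(0, min_len - tf.shape(ids)[1])
        return tf.pad(ids, [[0, 0], [0, pad_amount]])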
Example no. 3
  def input_fn():
    # text input
    text = tf.compat.v1.placeholder(tf.string, [batch_size], name="input_text")

    # text tokenize
    tokenizer = tft.SentencepieceTokenizer(
        model=tf.io.gfile.GFile(vocab_model_file, "rb").read())
    if substitute_newline:
      text = tf.strings.regex_replace(text, "\n", substitute_newline)
    ids = tokenizer.tokenize(text)
    ids = ids[:, :max_encoder_length - 2]

    # Add [CLS] and [SEP] special tokens.
    prefix = tf.repeat(tf.constant([[65]]), batch_size, axis=0)
    suffix = tf.repeat(tf.constant([[66]]), batch_size, axis=0)
    ids = tf.concat([prefix, ids, suffix], axis=1)
    if isinstance(ids, tf.RaggedTensor):
      ids = ids.to_tensor(0)

    # text padding: Pad only if necessary and reshape properly
    padded_ids = dynamic_padding(ids, max_encoder_length)
    ids = tf.slice(padded_ids, [0, 0], [batch_size, max_encoder_length])

    receiver_tensors = {"input": text}
    features = {"input_ids": tf.cast(ids, tf.int32, name="input_ids")}

    return tf.estimator.export.ServingInputReceiver(
        features=features, receiver_tensors=receiver_tensors)
Example no. 4
  def _tokenize_example(context, question):
    tokenizer = tft.SentencepieceTokenizer(
        model=tf.io.gfile.GFile(vocab_model_file, "rb").read())
    if substitute_newline:
      context = tf.strings.regex_replace(context, "\n", substitute_newline)
    # Remove space before special tokens.
    context = tf.strings.regex_replace(context, r" ([<\[]\S+[>\]])", b"\\1")
    context_ids = tokenizer.tokenize(context)
    if isinstance(context_ids, tf.RaggedTensor):
      context_ids = context_ids.to_tensor(0)
    context_ids = context_ids[:max_encoder_length]

    # Substitute newline characters, if configured.
    if substitute_newline:
      question = tf.strings.regex_replace(question, "\n", substitute_newline)
    # Remove space before special tokens.
    question = tf.strings.regex_replace(question, r" ([<\[]\S+[>\]])", b"\\1")
    question_ids = tokenizer.tokenize(question)
    # Add [EOS] (1) special tokens.
    suffix = tf.constant([1])
    question_ids = tf.concat([question_ids, suffix], axis=0)
    if isinstance(question_ids, tf.RaggedTensor):
      question_ids = question_ids.to_tensor(0)
    question_ids = question_ids[:max_decoder_length]

    return context_ids, question_ids
Example no. 5
def c4_preprocess(dataset, training, max_target_length=-1,
                  tokenization=None, spm_path=None):
  """Pre-processing function for C4 dataset."""
  del training
  def unicode_decode_chars(features, targets):
    targets = tf.strings.unicode_decode(features['text'], 'UTF-8')
    targets = tf.cast(targets, tf.int64)
    features['targets'] = targets
    features['inputs'] = targets
    return (features, targets)

  def spc_tokenize(tokenizer, features, targets):
    del targets
    tokenized_text = tokenizer.tokenize(features['text'])
    features['targets'] = tf.cast(tokenized_text, tf.int64)
    features['inputs'] = features['targets']
    return features, features['targets']

  if tokenization == 'spc':
    spm_path = spm_path or t5_utils.DEFAULT_SPM_PATH
    with tf.compat.v1.gfile.GFile(spm_path, 'rb') as f:
      spc_model = f.read()
    tokenizer = tf_text.SentencepieceTokenizer(model=spc_model)
    dataset = dataset.map(functools.partial(spc_tokenize, tokenizer))
  else:
    dataset = dataset.map(unicode_decode_chars)

  def target_right_length(_, target):
    return tf.less(tf.shape(target)[0], max_target_length + 1)

  if max_target_length > 0:
    dataset = dataset.filter(target_right_length)

  return dataset
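
A minimal usage sketch of c4_preprocess in character mode (tokenization left as None, so no SentencePiece model is needed); the toy in-memory dataset below is an assumption for illustration only:

  toy = tf.data.Dataset.from_tensor_slices({'text': ['hello', 'a longer example']})
  toy = toy.map(lambda features: (features, tf.constant(0, tf.int64)))  # dummy targets
  toy = c4_preprocess(toy, training=True, max_target_length=32)
  for features, targets in toy.take(1):
    print(targets)  # UTF-8 code points of the text, as tf.int64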
Example no. 6
    def do_masking(example):
        if "tfds://" == data_dir[:7]:
            text = example["text"]
        else:
            text = example

        print(text)

        tokenizer = tft.SentencepieceTokenizer(
            model=tf.io.gfile.GFile(vocab_model_file, "rb").read())
        if substitute_newline:
            text = tf.strings.regex_replace(text, "\n", substitute_newline)
        subtokens = tokenizer.tokenize(text)
        (subtokens, masked_lm_positions, masked_lm_ids,
         masked_lm_weights) = tf.compat.v1.py_func(
             numpy_masking, [subtokens],
             [tf.int32, tf.int32, tf.int32, tf.float32],
             stateful=False)
        features = {
            "input_ids": subtokens,
            "segment_ids": tf.zeros_like(subtokens),
            "masked_lm_positions": masked_lm_positions,
            "masked_lm_ids": masked_lm_ids,
            "masked_lm_weights": masked_lm_weights,
            "next_sentence_labels": tf.zeros([1], dtype=tf.int64),
        }
        return features
Example no. 7
 def setup(self):
     self.tokenizer = tft.SentencepieceTokenizer(
         model=tf.io.gfile.GFile(self.vocab_model_file, "rb").read())
     self.sentence_tokenizer = nltk.load(SENTENCE_TOKENIZER_PATH)
     self.delimiter_range_pair = rendering_utils.get_default_delimiter_range_pair(
         task=self.task,
         delimiter_type=self.delimiter_type,
     )
Example no. 8
def load_sentencepiece_tokenizer(
    model_path, add_bos=False, add_eos=True, reverse=False):
  """Load a tf-text SentencePiece tokenizer from given model filepath."""
  with tf.io.gfile.GFile(model_path, 'rb') as model_fp:
    sp_model = model_fp.read()
  sp_tokenizer = tftxt.SentencepieceTokenizer(
      model=sp_model, add_bos=add_bos, add_eos=add_eos, reverse=reverse)
  return sp_tokenizer
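
A minimal usage sketch of the loader above ('spm.model' is a placeholder path, not a file shipped with the snippet):

sp = load_sentencepiece_tokenizer('spm.model', add_bos=True)
ids = sp.tokenize(['hello world'])   # RaggedTensor of ids, EOS appended by default
text = sp.detokenize(ids)            # back to a batch of strings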
Example no. 9
 def __init__(self, params: WMTDataConfig):
     self._params = params
     self._max_seq_length = params.max_seq_length
     self._static_batch = params.static_batch
     self._global_batch_size = params.global_batch_size
     if self._params.transform_and_batch:
         self._tokenizer = tftxt.SentencepieceTokenizer(
             model=tf.io.gfile.GFile(params.sentencepiece_model_path,
                                     'rb').read(),
             add_eos=True)
Example no. 10
    def __post_init__(self):
        tokenizer = tensorflow_text.SentencepieceTokenizer(
            model=tf.io.gfile.GFile(self.vocab_path, 'rb').read(),
            add_eos=True)
        eos_token = tokenizer.string_to_id('</s>')

        # Work-around for frozen dataclasses:
        # https://stackoverflow.com/questions/53756788
        object.__setattr__(self, 'eos_token', eos_token)
        object.__setattr__(self, '_tokenizer', tokenizer)
Example no. 11
 def __init__(self, model=None, model_path=None):
     super(SentencepieceTokenizer, self).__init__()
     if model_path:
         self.proto = open(model_path, "rb").read()
     if model:
         if isinstance(model, str):
             asc_str = model.encode("ascii")
             self.proto = base64.decodebytes(asc_str)
         if isinstance(model, bytes):
             self.proto = model
     self.tokenizer = text.SentencepieceTokenizer(self.proto)
Example no. 12
 def __init__(self, params):
   super().__init__(params)
   p = self.params
   with tf.io.gfile.GFile(p.spm_model, 'rb') as f:
     self._tokenizer = tf_text.SentencepieceTokenizer(
         model=f.read(),
         out_type=tf.int32,
         nbest_size=p.nbest_size,
         alpha=p.alpha,
         reverse=False,
         add_bos=False,
         add_eos=p.append_eos)
Example no. 13
 def __init__(self, params, model: tf.keras.Model, inference_step=None):
     super().__init__(params, model, inference_step)
     self._sp_tokenizer = tf_text.SentencepieceTokenizer(
         model=tf.io.gfile.GFile(params.sentencepiece_model_path,
                                 "rb").read(),
         add_eos=True)
     try:
         empty_str_tokenized = self._sp_tokenizer.tokenize("").numpy()
     except tf.errors.InternalError:
         raise ValueError(
             "EOS token not in tokenizer vocab."
             "Please make sure the tokenizer generates a single token for an "
             "empty string.")
     self._eos_id = empty_str_tokenized.item()
Example no. 14
    def test_tftext_sentencepiece_tokenizer_bos_eos(self):
        """Check that the new tokenizer produces the same result as the tftext one with bos and eos."""
        tftext_sp = tensorflow_text.SentencepieceTokenizer(
            self.sentencepiece_model, add_bos=True, add_eos=True)
        opt_sp = sentencepiece_tokenizer.SentencepieceTokenizer(
            self.sentencepiece_model, add_bos=True, add_eos=True)

        input_text = [
            u" ", u"to be or not to be", u"ignored by length text1",
            u"ignored by length text2"
        ]
        tftext_tokenized = tftext_sp.tokenize(input_text)
        opt_tokenized = opt_sp.tokenize(input_text)
        self.assertAllEqual(tftext_tokenized, opt_tokenized)
Example no. 15
def decode(ids: tf.Tensor, vocab_filename: str, encoder_type: str):
  """DecodeOp."""
  if encoder_type not in ["sentencepiece", "sentencepiece_newline"]:
    raise ValueError("Unsupported encoder type: %s" % encoder_type)
  sp_model = tf.gfile.GFile(vocab_filename, "rb").read()
  tokenizer = tf_text.SentencepieceTokenizer(model=sp_model)
  ids = tf.where(ids > 1 + _SHIFT_RESERVED_TOKENS, ids - _SHIFT_RESERVED_TOKENS,
                 ids)
  ids = tf.cast(ids, tf.int32)
  text = tokenizer.detokenize(ids)
  text = tf.reshape(text, [-1])
  if encoder_type == "sentencepiece_newline":
    text = tf.strings.regex_replace(text, _NEWLINE_SYMBOL, "\n")
  return text
Example no. 16
    def test_tftext_sentencepiece_detokenizer(self):
        """Check that the new tokenizer produces the same result as the tftext one."""
        tftext_sp = tensorflow_text.SentencepieceTokenizer(
            self.sentencepiece_model)
        opt_sp = sentencepiece_tokenizer.SentencepieceTokenizer(
            self.sentencepiece_model)

        input_text = [
            u" ", u"to be or not to be", u"ignored by length text1",
            u"ignored by length text2"
        ]
        tftext_tokenized = tftext_sp.tokenize(input_text)

        # Check detokenizer
        tftext_detokenized = tftext_sp.detokenize(tftext_tokenized)
        opt_detokenized = opt_sp.detokenize(tftext_tokenized)
        self.assertAllEqual(tftext_detokenized, opt_detokenized)
Example no. 17
    def __init__(self, model, nbest_size=0, alpha=1.0):
        """Initializes the tokenizer.

        Args:
          model: Path to the SentencePiece model.
          nbest_size: Number of candidates to sample from (disabled during inference).
          alpha: Smoothing parameter for the sampling.
        """
        super().__init__()
        self._nbest_size = nbest_size
        with tf.io.gfile.GFile(model, "rb") as model_file:
            self._tokenizer = tft.SentencepieceTokenizer(
                model=model_file.read(),
                out_type=tf.string,
                nbest_size=nbest_size,
                alpha=alpha,
            )
Example no. 18
    def __init__(self, vocab_file):

        sentence_model_voca = vocab_file
        sp_model = spm.SentencePieceProcessor()
        sp_proto = tf.io.gfile.GFile(sentence_model_voca, "rb").read()
        sp_model.LoadFromSerializedProto(sp_proto)
        self.vocab_size = sp_model.GetPieceSize()

        word_to_token = [sp_model.IdToPiece(i) for i in range(self.vocab_size)]
        self.word_start_subtoken = np.array(
            [sp_model.IdToPiece(i)[0] == "▁" for i in range(self.vocab_size)])

        self.tf_tokenizer = tft.SentencepieceTokenizer(
            model=tf.io.gfile.GFile(sentence_model_voca, "rb").read())

        self.vocab = ko_bpe_vocab(word_to_token)

        self.inv_vocab = {v: k for k, v in self.vocab.items()}
Example no. 19
  def _tokenize_example(example):
    text, label = example["text"], example["label"]
    tokenizer = tft.SentencepieceTokenizer(
        model=tf.io.gfile.GFile(vocab_model_file, "rb").read())
    if substitute_newline:
      text = tf.strings.regex_replace(text, "\n", substitute_newline)
    ids = tokenizer.tokenize(text)
    ids = ids[:max_encoder_length - 2]
    # Add [CLS] (65) and [SEP] (66) special tokens.
    prefix = tf.constant([65])
    suffix = tf.constant([66])
    ids = tf.concat([prefix, ids, suffix], axis=0)
    if isinstance(ids, tf.RaggedTensor):
      ids = ids.to_tensor(0)

    # tf.Example only supports tf.int64, but the TPU is better with tf.int32.
    label = tf.cast(label, tf.int32)

    return ids, label
Example no. 20
def main(args: argparse.Namespace):
    logger = get_logger("make-tfrecord")

    input_files = glob.glob(args.dataset_paths)
    logger.info(f"[+] Number of Dataset Files: {len(input_files)}")

    # Load Config
    logger.info(f"[+] Load Config From {args.data_config}")
    config = DataConfig.from_yaml(args.data_config)

    # Load Sentencepiece model
    logger.info(f"[+] Load Tokenizer From {args.sp_model_path}")
    with open(args.sp_model_path, "rb") as f:
        tokenizer = text.SentencepieceTokenizer(f.read(),
                                                add_bos=True,
                                                add_eos=True)

    serialize = tf.function(lambda audio, text: tf.io.serialize_tensor(
        tf.stack([tf.io.serialize_tensor(audio),
                  tf.io.serialize_tensor(text)])))

    logger.info("[+] Start Saving Dataset...")
    for file_path in tqdm(input_files):
        output_dir = args.output_dir if args.output_dir else os.path.dirname(
            file_path)
        file_name = os.path.basename(file_path)
        output_path = os.path.join(
            output_dir,
            os.path.splitext(file_name)[0] + ".tfrecord")

        # Write TFRecordFile
        dataset = (get_dataset(
            file_path, config.file_format, config.sample_rate, tokenizer).map(
                config.audio_feature_fn,
                num_parallel_calls=tf.data.experimental.AUTOTUNE).map(
                    serialize,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE))
        writer = tf.data.experimental.TFRecordWriter(output_path, "GZIP")
        writer.write(dataset)

    logger.info("[+] Done")
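
The records written above nest two serialize_tensor calls, so reading them back takes two parse_tensor passes. A minimal sketch (the tf.float32 audio dtype and tf.int32 token dtype are assumptions about what audio_feature_fn and the tokenizer emit):

def deserialize(record):
    # Outer parse recovers the stacked pair of serialized tensors.
    pair = tf.io.parse_tensor(record, tf.string)
    audio = tf.io.parse_tensor(pair[0], tf.float32)     # assumed dtype
    token_ids = tf.io.parse_tensor(pair[1], tf.int32)   # assumed dtype
    return audio, token_ids

dataset = tf.data.TFRecordDataset([output_path], compression_type="GZIP").map(deserialize)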
Example no. 21
def encode(text: tf.Tensor, max_len: int, vocab_filename: str,
           encoder_type: str):
  """EncodeOp."""
  if encoder_type not in ["sentencepiece", "sentencepiece_newline"]:
    raise ValueError("Unsupported encoder type: %s" % encoder_type)
  sp_model = tf.gfile.GFile(vocab_filename, "rb").read()
  tokenizer = tf_text.SentencepieceTokenizer(model=sp_model)
  batch_size = text.shape[0]
  if encoder_type == "sentencepiece_newline":
    text = tf.strings.regex_replace(text, "\n", _NEWLINE_SYMBOL)
  ids = tokenizer.tokenize(text)
  eos = tf.ragged.constant([[1]] * batch_size)
  ids = tf.concat([ids, eos], axis=1)
  ids = ids.to_tensor(default_value=0)
  ids = ids[:, :max_len]
  pad = max_len - tf.shape(ids)[1]
  ids = tf.pad(ids, [[0, 0], [0, pad]])
  ids.set_shape([ids.shape[0], max_len])
  ids = tf.where(ids > 1, ids + _SHIFT_RESERVED_TOKENS, ids)
  ids = tf.cast(ids, tf.int64)
  return ids
Example no. 22
 def __init__(self, params: cfg.TaskConfig, logging_dir=None, name=None):
   super().__init__(params, logging_dir, name=name)
   self._sentencepiece_model_path = params.sentencepiece_model_path
   if params.sentencepiece_model_path:
     self._sp_tokenizer = tftxt.SentencepieceTokenizer(
         model=tf.io.gfile.GFile(params.sentencepiece_model_path, "rb").read(),
         add_eos=True)
     try:
       empty_str_tokenized = self._sp_tokenizer.tokenize("").numpy()
     except tf.errors.InternalError:
       raise ValueError(
           "EOS token not in tokenizer vocab."
           "Please make sure the tokenizer generates a single token for an "
           "empty string.")
     self._eos_id = empty_str_tokenized.item()
     self._vocab_size = self._sp_tokenizer.vocab_size().numpy()
   else:
     raise ValueError("Setencepiece model path not provided.")
   if (params.validation_data.input_path or
       params.validation_data.tfds_name) and self._logging_dir:
     self._references, self._tf_record_input_path = write_test_record(
         params.validation_data, self.logging_dir)
Example no. 23
    def benchmarkTokenizer(self):
        sp_model = _GetSentencepieceModel()
        test_text = [
            "This week we celebrate the casts and creatives who have come together"
            " to bring us our favorite.",
            "More Stacks products demonstrated commitment to excellent support.",
            "Test, test, test."
        ]

        tftext_sp = tensorflow_text.SentencepieceTokenizer(sp_model)
        opt_sp = sentencepiece_tokenizer.SentencepieceTokenizer(sp_model)
        iter_number = 1000
        start = time.time()
        for _ in range(iter_number):
            _ = opt_sp.tokenize(test_text)
        self.report_benchmark(iters=iter_number,
                              wall_time=time.time() - start,
                              name="opt")
        start = time.time()
        for _ in range(iter_number):
            _ = tftext_sp.tokenize(test_text)
        self.report_benchmark(iters=iter_number,
                              wall_time=time.time() - start,
                              name="tf.text")
Example no. 24
    def __init__(self,
                 bert_layer,
                 max_len,
                 min_len=1,
                 CLS='[CLS]',
                 SEP='[SEP]',
                 PAD='[PAD]',
                 UNK='[UNK]'):
        """ Initializes the layer

        :param CLS Token that represents the start of a sentence
        :param SEP Token that represents the end of a segment
        :param PAD Token that represents padding
        :param UNK Token that represents unknown tokens
        :param bert_layer Keras layer that loaded from pretrained BERT
        """
        super().__init__()
        self._CLS = CLS
        self._SEP = SEP
        self._PAD = PAD
        self._min_len = min_len
        self._max_len = max_len

        resolved_object = bert_layer.resolved_object
        self.do_lower_case = resolved_object.do_lower_case.numpy()
        if hasattr(resolved_object, "tokenizer_type"):
            tokenizer_type_file = resolved_object.tokenizer_type.asset_path.numpy(
            ).decode("utf-8")
            with tf.io.gfile.GFile(tokenizer_type_file, 'r') as f_handler:
                self._tokenizer_type = f_handler.read().strip()
            tokenizer_file = resolved_object.tokenizer_file.asset_path.numpy(
            ).decode("utf-8")
            if self._tokenizer_type == SENTENCEPIECE:
                with tf.io.gfile.GFile(tokenizer_file, 'rb') as f_handler:
                    sp_model = f_handler.read()
                self._tokenizer = tf_text.SentencepieceTokenizer(
                    model=sp_model, out_type=tf.int32)
                self.vocab_table = create_tf_vocab_from_sp_tokenizer(
                    self._tokenizer, num_oov_buckets=1)
            else:
                assert (self._tokenizer_type == SPACE)
                _, self.vocab_table = read_tf_vocab(tokenizer_file, UNK)
        else:
            vocab_file = resolved_object.vocab_file.asset_path.numpy().decode(
                "utf-8")
            _, self.vocab_table = create_tf_vocab_from_wp_tokenizer(
                vocab_file, num_oov_buckets=1)
            self._tokenizer = tf_text.BertTokenizer(
                self.vocab_table,
                token_out_type=tf.int64,
                lower_case=self.do_lower_case,
                unknown_token=UNK)
            self._tokenizer_type = WORDPIECE

        self._pad_id = self.vocab_table.lookup(tf.constant(PAD)) if PAD else -1
        self._cls_id = self.vocab_table.lookup(tf.constant(CLS)) if CLS else -1
        self._sep_id = self.vocab_table.lookup(tf.constant(SEP)) if SEP else -1

        if self._tokenizer_type == SENTENCEPIECE:
            self._pad_id = tf.cast(self._pad_id, tf.int32)
            self._cls_id = tf.cast(self._cls_id, tf.int32)
            self._sep_id = tf.cast(self._sep_id, tf.int32)
Example no. 25
 def _create_tokenizer(self):
     return text.SentencepieceTokenizer(model=self._model_serialized_proto,
                                        out_type=tf.int32,
                                        nbest_size=self._nbest_size,
                                        alpha=self._alpha)
Example no. 26
 def tf_tokenizer(self):
     """Instantiate and return a TF tokenizer."""
     return tf_text.SentencepieceTokenizer(model=self.sp_model)
Example no. 27
def load_sentencepiece_model(model_proto):
    proto = tf.io.gfile.GFile(model_proto, 'rb').read()
    return tf_text.SentencepieceTokenizer(model=proto)
Example no. 28
 def __init__(self, vocab_file):
   super().__init__()
   serialized_proto = tf.compat.v1.gfile.GFile(vocab_file, "rb").read()
   self.tokenizer = tf_text.SentencepieceTokenizer(
       model=serialized_proto, add_bos=True, add_eos=True)
Example no. 29
# fmt: on

if __name__ == "__main__":
    args = parser.parse_args()
    strategy = get_device_strategy(args.device)

    logger = get_logger()

    if args.mixed_precision:
        policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
        tf.keras.mixed_precision.experimental.set_policy(policy)
        logger.info("Use Mixed Precision FP16")

    # Construct Dataset
    with tf.io.gfile.GFile(args.sp_model_path, "rb") as f:
        tokenizer = text.SentencepieceTokenizer(f.read(), add_bos=True, add_eos=True)

    dataset_files = tf.io.gfile.glob(args.dataset_path)
    if not dataset_files:
        logger.error("[Error] Dataset path is invalid!")
        sys.exit(1)
    if args.auto_encoding:
        scatter = lambda tokens: (tokens, tokens)
        dataset = (
            tf.data.TextLineDataset(dataset_files, num_parallel_reads=tf.data.experimental.AUTOTUNE)
            .map(tokenizer.tokenize)
            .map(scatter)
        )
    else:
        tokenize = lambda inputs, outputs: ((tokenizer.tokenize(inputs), tokenizer.tokenize(outputs)))
        dataset = tf.data.experimental.CsvDataset(
Example no. 30
    def __call__(self, x):
        # Constrained sequence
        cs_scores = np.array([[10.0, 12.0, 6.0, 4.0], [13.0, 12.0, 11.0,
                                                       10.0]])
        cs_input = np.array([cs_scores, cs_scores, cs_scores],
                            dtype=np.float32)
        cs_transition_weights = np.array(
            [[-1.0, 1.0, -2.0, 2.0, 0.0], [3.0, -3.0, 4.0, -4.0, 0.0],
             [5.0, 1.0, 10.0, 1.0, 1.0], [-7.0, 7.0, -8.0, 8.0, 0.0],
             [0.0, 1.0, 2.0, 3.0, 0.0]],
            dtype=np.float32)
        cs_allowed_transitions = np.array([[True, True, True, True, True],
                                           [True, True, True, True, True],
                                           [True, False, True, False, False],
                                           [True, True, True, True, True],
                                           [True, False, True, True, True]])
        constrained_sequence = text.viterbi_constrained_sequence(
            cs_input, [2, 2, 2],
            allowed_transitions=cs_allowed_transitions,
            transition_weights=cs_transition_weights,
            use_log_space=True,
            use_start_and_end_states=True)
        # Max Spanning Tree
        mst_num_nodes = tf.constant([4, 3], tf.int32)
        mst_scores = tf.constant(
            [[[0, 0, 0, 0], [1, 0, 0, 0], [1, 2, 0, 0], [1, 2, 3, 4]],
             [[4, 3, 2, 9], [0, 0, 2, 9], [0, 0, 0, 9], [9, 9, 9, 9]]],
            tf.int32)  # pyformat: disable
        (max_spanning_tree,
         _) = text.max_spanning_tree(mst_num_nodes, mst_scores)
        # Normalize
        normalized = text.case_fold_utf8(['A String'])
        normalized = text.normalize_utf8(normalized)
        # Regex split
        regex_split = text.regex_split(input=['Yo dawg!'],
                                       delim_regex_pattern=r'\s')
        # Rouge-L
        rl_hypotheses = tf.ragged.constant(
            [['captain', 'of', 'the', 'delta', 'flight'],
             ['the', '1990', 'transcript']])
        rl_references = tf.ragged.constant(
            [['delta', 'air', 'lines', 'flight'],
             ['this', 'concludes', 'the', 'transcript']])
        (rouge_l, _, _) = text.metrics.rouge_l(rl_hypotheses, rl_references)
        # Sentence breaking version 1 (token dependent)
        sb_token_word = [['Welcome', 'to', 'the', 'U.S.', '!', 'Harry'],
                         ['Wu', 'Tang', 'Clan', ';', 'ain\'t', 'nothing']]
        sb_token_properties = [[0, 0, 0, 256, 0, 0], [0, 0, 0, 0, 0, 0]]
        sb_token_starts = []
        sb_token_ends = []
        for sentence in sb_token_word:
            sentence_string = ''
            sentence_start = []
            sentence_end = []
            for word in sentence:
                sentence_start.append(len(sentence_string))
                sentence_string = sentence_string.join([word, ' '])
                sentence_end.append(len(sentence_string))
            sb_token_starts.append(sentence_start)
            sb_token_ends.append(sentence_end)
        sb_token_starts = tf.constant(sb_token_starts, dtype=tf.int64)
        sb_token_ends = tf.constant(sb_token_ends, dtype=tf.int64)
        sb_token_properties = tf.ragged.constant(sb_token_properties,
                                                 dtype=tf.int64)
        (sentence_breaking, _, _,
         _) = text.sentence_fragments(sb_token_word, sb_token_starts,
                                      sb_token_ends, sb_token_properties)
        # Sentence breaking version 2 (StateBasedSentenceBreaker)
        sbv2_text_input = [['Welcome to the U.S.! Harry'],
                           ['Wu Tang Clan; ain\'t nothing']]
        sentence_breaker_v2 = text.StateBasedSentenceBreaker()
        sbv2_fragment_text, _, _ = (
            sentence_breaker_v2.break_sentences_with_offsets(sbv2_text_input))
        # Sentencepiece tokenizer
        sp_model_file = (
            'third_party/tensorflow_text/python/ops/test_data/test_oss_model.model'
        )
        sp_model = open(sp_model_file, 'rb').read()
        sp_tokenizer = text.SentencepieceTokenizer(sp_model)
        sentencepiece = sp_tokenizer.tokenize(['A sentence of things.'])
        sentencepiece = sp_tokenizer.detokenize(sentencepiece)
        (sentencepiece, _,
         _) = sp_tokenizer.tokenize_with_offsets(sentencepiece)
        sentencepiece_size = sp_tokenizer.vocab_size()
        sentencepiece_id = sp_tokenizer.id_to_string(1)
        # Split merge tokenizer
        sm_tokenizer = text.SplitMergeTokenizer()
        split_merge = sm_tokenizer.tokenize(b'IloveFlume!',
                                            [0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0])
        # Split merge from logits tokenizer
        smfl_tokenizer = text.SplitMergeFromLogitsTokenizer()
        split_merge_from_logits = smfl_tokenizer.tokenize(
            b'IloveFlume!',
            # One pair of logits for each Unicode character from the text.  Each
            # pair indicates a "split" action if the first component is greater than
            # the second one, and a "merge" otherwise.
            [
                [2.7, -0.3],  # I: split
                [4.1, 0.82],  # l: split
                [-2.3, 4.3],  # o: merge
                [3.1, 12.2],  # v: merge
                [-3.0, 4.7],  # e: merge
                [2.7, -0.7],  # F: split
                [0.7, 15.0],  # l: merge
                [1.6, 23.0],  # u: merge
                [2.1, 11.0],  # m: merge
                [0.0, 20.0],  # e: merge
                [18.0, 0.7],  # !: split
            ])
        # Confirm TF unicode_script op that requires ICU works
        tf_unicode_script = tf.strings.unicode_script(
            [ord('a'), 0x0411, 0x82b8, ord(',')])
        # Unicode script tokenizer
        us_tokenizer = text.UnicodeScriptTokenizer()
        unicode_script = us_tokenizer.tokenize(['a string'])
        # Whitespace tokenizer
        ws_tokenizer = text.WhitespaceTokenizer()
        whitespace = ws_tokenizer.tokenize(['a string'])
        # Wordpiece tokenizer
        wp_initializer = tf.lookup.KeyValueTensorInitializer(
            ['i'], [1], key_dtype=tf.string, value_dtype=tf.int64)
        self.wp_vocab_table = tf.lookup.StaticHashTable(wp_initializer,
                                                        default_value=-1)
        wp_tokenizer = text.WordpieceTokenizer(self.wp_vocab_table)
        wordpiece = wp_tokenizer.tokenize(['i am'])
        # Wordshape
        wordshapes = text.wordshape([u'a-b', u'a\u2010b'.encode('utf-8')],
                                    text.WordShape.HAS_PUNCTUATION_DASH)

        # Assertion method
        def assert_check(tensor):
            return tf.assert_equal(tensor, tf.identity(tensor))

        # Assertions
        constrained_sequence_assert = assert_check(
            constrained_sequence.to_tensor())
        max_spanning_tree_assert = assert_check(max_spanning_tree)
        normalized_assert = assert_check(normalized)
        regex_split_assert = assert_check(regex_split.to_tensor())
        rouge_l_assert = assert_check(rouge_l)
        sentence_breaking_assert = assert_check(sentence_breaking.to_tensor())
        sentence_breaking_v2_assert = assert_check(
            sbv2_fragment_text.to_tensor())
        sentencepiece_assert = assert_check(sentencepiece.to_tensor())
        sentencepiece_id_assert = assert_check(sentencepiece_id)
        sentencepiece_size_assert = assert_check(sentencepiece_size)
        split_merge_assert = assert_check(split_merge)
        split_merge_from_logits_assert = assert_check(split_merge_from_logits)
        tf_unicode_script_assert = assert_check(tf_unicode_script)
        unicode_script_assert = assert_check(unicode_script.to_tensor())
        whitespace_assert = assert_check(whitespace.to_tensor())
        wordpiece_assert = assert_check(wordpiece.to_tensor())
        wordshapes_assert = assert_check(wordshapes)

        with tf.control_dependencies([
                constrained_sequence_assert, max_spanning_tree_assert,
                normalized_assert, regex_split_assert, rouge_l_assert,
                sentence_breaking_assert, sentence_breaking_v2_assert,
                sentencepiece_assert, sentencepiece_id_assert,
                sentencepiece_size_assert, split_merge_assert,
                split_merge_from_logits_assert, tf_unicode_script_assert,
                unicode_script_assert, whitespace_assert, wordpiece_assert,
                wordshapes_assert
        ]):
            y = tf.add(x, [1])
        return {'y': y}