def __init__(
    self,
    vocab_file,
    delimiter,
    lowercase,
    unk_token,
    eos_token,
    add_eos=False,
    add_double_eos=False,
    normalization: Optional[str] = None,
):
    try:
        tokenizer = WordLevel(vocab_file, unk_token=unk_token)
        tokenizer = Tokenizer(tokenizer)
    except Exception:
        raise ValueError(
            "Unable to parse file {}. Unknown format. "
            "If you tried to load a model saved through TransfoXLTokenizer, "
            "please note they are not compatible.".format(vocab_file)
        )

    # Create the correct normalization path
    normalizer = []

    # Include unicode normalization
    if normalization:
        normalizer += [unicode_normalizer_from_str(normalization)]

    # Include case normalization
    if lowercase:
        normalizer += [Lowercase()]

    # Strip normalizer at the end
    normalizer += [Strip(left=True, right=True)]

    if len(normalizer) > 0:
        tokenizer.normalizer = Sequence(normalizer) if len(normalizer) > 1 else normalizer[0]

    # Setup the splitter
    tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter) if delimiter else WhitespaceSplit()

    if add_double_eos:
        tokenizer.post_processor = BertProcessing(
            (eos_token, tokenizer.token_to_id(eos_token)),
            (eos_token, tokenizer.token_to_id(eos_token)),
        )

    parameters = {
        "model": "TransfoXLModel",
        "add_eos": add_eos,
        "add_double_eos": add_double_eos,
        "unk_token": unk_token,
        "eos_token": eos_token,
        "delimiter": delimiter,
        "lowercase": lowercase,
    }

    super().__init__(tokenizer, parameters)
def __create_tokenizer(self, files):
    # Create, train and save the tokenizer.
    print("Preparing tokenizer...")
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = WhitespaceSplit()
    trainer = WordLevelTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    )
    tokenizer.train(files=files, trainer=trainer)
    return tokenizer
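A minimal usage sketch of the same recipe, inlined so it runs standalone; the training file name "corpus.txt" is hypothetical, standing in for whatever files the helper receives.

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.trainers import WordLevelTrainer

# Train a word-level vocabulary from whitespace-split text, then encode.
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = WhitespaceSplit()
trainer = WordLevelTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train(files=["corpus.txt"], trainer=trainer)  # hypothetical corpus file

print(tokenizer.encode("hello world").tokens)  # e.g. ['hello', 'world'] if both made the vocab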
def __init__(
    self,
    vocab_file,
    delimiter,
    lowercase,
    unk_token,
    eos_token,
    add_eos=False,
    add_double_eos=False,
    normalization: Optional[str] = None,
):
    tokenizer = WordLevel.from_files(vocab_file, unk_token=unk_token)
    tokenizer = Tokenizer(tokenizer)

    # Create the correct normalization path
    normalizer = []

    # Include unicode normalization
    if normalization:
        normalizer += [unicode_normalizer_from_str(normalization)]

    # Include case normalization
    if lowercase:
        normalizer += [Lowercase()]

    if len(normalizer) > 0:
        tokenizer.normalizer = Sequence(normalizer) if len(normalizer) > 1 else normalizer[0]

    # Setup the splitter
    tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter) if delimiter else WhitespaceSplit()

    if add_double_eos:
        tokenizer.post_processor = BertProcessing(
            (eos_token, tokenizer.token_to_id(eos_token)),
            (eos_token, tokenizer.token_to_id(eos_token)),
        )

    parameters = {
        "model": "TransfoXLModel",
        "add_eos": add_eos,
        "add_double_eos": add_double_eos,
        "unk_token": unk_token,
        "eos_token": eos_token,
        "delimiter": delimiter,
        "lowercase": lowercase,
    }

    super().__init__(tokenizer, parameters)
def wordpiece_tokenize(line):
    tokenizer = Tokenizer(WordPiece(wordpiece_dict3))
    tokenizer.enable_padding(length=200)
    tokenizer.enable_truncation(max_length=200)
    tokenizer.pre_tokenizer = WhitespaceSplit()
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    output = tokenizer.encode(line)
    return output.ids
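The vocabulary `wordpiece_dict3` is defined elsewhere; a hedged sketch of a call, using a toy vocab in its place. Note the ids 1 and 2 must line up with the special_tokens wired into TemplateProcessing above.

# Hypothetical toy stand-in for the external `wordpiece_dict3`.
wordpiece_dict3 = {"[UNK]": 0, "[CLS]": 1, "[SEP]": 2, "[PAD]": 3, "hello": 4, "world": 5}

ids = wordpiece_tokenize("hello world")
print(ids[:4])  # e.g. [1, 4, 5, 2], then padding out to length 200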
def configure_tokenizers(self, padding, truncation, max_length, lower):
    # Settings
    pad_length = None
    if padding in {True, "longest"}:
        pass
    elif padding in {"max_length"}:
        pad_length = max_length
    elif padding in {False, "do_not_pad"}:
        pass
    else:
        raise ValueError("Unknown padding type")

    # SRC tokenizer
    tok_normalizers = [NFD(), Strip()]
    if lower:
        tok_normalizers += [Lowercase()]

    self.tokenizer = Tokenizer(tok_model())  # unk_token=... not working
    self.tokenizer.add_special_tokens(self.special_tokens)
    self.tokenizer.pre_tokenizer = pre_tokenizers.Sequence([WhitespaceSplit()])
    self.tokenizer.normalizer = normalizers.Sequence(tok_normalizers)  # StripAccents requires NFD
    self.tokenizer.decoder = tok_decoder()

    # Define template (needed for the sos/eos tokens)
    basic_template = TemplateProcessing(
        single=f"{self.SOS_WORD} $A {self.EOS_WORD}",
        pair=f"{self.SOS_WORD} $A {self.EOS_WORD} {self.SOS_WORD} $B {self.EOS_WORD}",
        special_tokens=[
            (self.SOS_WORD, self.tokenizer.token_to_id(self.SOS_WORD)),
            (self.EOS_WORD, self.tokenizer.token_to_id(self.EOS_WORD)),
        ],
    )
    self.tokenizer.post_processor = basic_template

    if padding:
        self.tokenizer.enable_padding(
            pad_id=self.tokenizer.token_to_id(self.PAD_WORD),
            pad_token=self.PAD_WORD,
            length=pad_length,
        )
    if truncation:
        self.tokenizer.enable_truncation(max_length, stride=0, strategy="longest_first")
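The TemplateProcessing post-processor is what injects the sos/eos markers around every encoded sequence. A minimal standalone sketch of the same pattern, with hypothetical [SOS]/[EOS] tokens and a toy word-level vocab:

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.processors import TemplateProcessing

vocab = {"[UNK]": 0, "[SOS]": 1, "[EOS]": 2, "hello": 3, "world": 4}
tok = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
tok.pre_tokenizer = WhitespaceSplit()
tok.post_processor = TemplateProcessing(
    single="[SOS] $A [EOS]",
    special_tokens=[("[SOS]", 1), ("[EOS]", 2)],
)
print(tok.encode("hello world").tokens)  # ['[SOS]', 'hello', 'world', '[EOS]']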
def test_bert_like(self):
    pre_tokenizer = Sequence([WhitespaceSplit(), Punctuation()])
    assert isinstance(Sequence([]), PreTokenizer)
    assert isinstance(Sequence([]), Sequence)
    assert isinstance(pickle.loads(pickle.dumps(pre_tokenizer)), Sequence)

    result = pre_tokenizer.pre_tokenize_str("Hey friend!     How are you?!?")
    assert result == [
        ("Hey", (0, 3)),
        ("friend", (4, 10)),
        ("!", (10, 11)),
        ("How", (16, 19)),
        ("are", (20, 23)),
        ("you", (24, 27)),
        ("?", (27, 28)),
        ("!", (28, 29)),
        ("?", (29, 30)),
    ]
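For contrast, WhitespaceSplit alone keeps punctuation glued to the preceding word, which is exactly why the test composes it with Punctuation:

from tokenizers.pre_tokenizers import WhitespaceSplit

print(WhitespaceSplit().pre_tokenize_str("Hey friend!"))
# [('Hey', (0, 3)), ('friend!', (4, 11))]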
def converted(self):
    tokenizer = self.tokenizer(self.proto)

    # Tokenizer assemble
    tokenizer.normalizer = self.normalizer(self.proto)

    replacement = "▁"
    add_prefix_space = True
    tokenizer.pre_tokenizer = PSequence([
        WhitespaceSplit(),
        Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
    ])
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space
    )
    post_processor = self.post_processor(tokenizer)
    if post_processor:
        tokenizer.post_processor = post_processor

    # TODO what parameters should we give ?
    parameters = {}

    return BaseTokenizer(tokenizer, parameters)
def test_instantiate(self):
    assert WhitespaceSplit() is not None
    assert isinstance(WhitespaceSplit(), PreTokenizer)
    assert isinstance(WhitespaceSplit(), WhitespaceSplit)
    assert isinstance(pickle.loads(pickle.dumps(WhitespaceSplit())), WhitespaceSplit)
def test_all_tokenizer_on_special_cases(caplog):
    caplog.set_level(logging.CRITICAL)

    lang_names = ["bert-base-cased", "roberta-base", "xlnet-base-cased"]

    tokenizers = []
    for lang_name in lang_names:
        if "roberta" in lang_name:
            add_prefix_space = True
        else:
            add_prefix_space = False
        t = Tokenizer.load(lang_name, lower_case=False, add_prefix_space=add_prefix_space)
        tokenizers.append(t)

    texts = [
        "This is a sentence",
        "Der entscheidende Pass",
        "力加勝北区ᴵᴺᵀᵃছজটডণত",
        "Thiso text is included tolod makelio sure Unicodeel is handled properly:",
        "This is a sentence...",
        "Let's see all on this text and. !23# neverseenwordspossible"
        "This is a sentence      with multiple spaces",
        """This is a sentence.
        With linebreak""",
        """Sentence with multiple


        newlines
        """,
        "and another one\n\n\nwithout space",
        "This is a sentence\t\t\twith multiple tabs",
    ]

    expected_to_fail = [(1, 1), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (2, 1), (2, 5)]

    for i_tok, tokenizer in enumerate(tokenizers):
        for i_text, text in enumerate(texts):
            # Important: we don't assume to preserve whitespaces after tokenization.
            # This means: \t, \n, "  " etc. will all resolve to a single " ".
            # This doesn't make a difference for BERT + XLNet, but it does for RoBERTa.
            test_passed = True

            # 1. Original tokenize function from the transformers repo on the full sentence
            standardized_whitespace_text = " ".join(text.split())  # remove multiple whitespaces
            tokenized = tokenizer.tokenize(standardized_whitespace_text)

            # 2. Our tokenization method using a pre-tokenizer which can normalize multiple whitespaces.
            # This approach is used in NER.
            pre_tokenizer = WhitespaceSplit()
            words_and_spans = pre_tokenizer.pre_tokenize_str(text)
            words = [x[0] for x in words_and_spans]
            word_spans = [x[1] for x in words_and_spans]

            encoded = tokenizer.encode_plus(
                words, is_split_into_words=True, add_special_tokens=False
            ).encodings[0]

            # Verify that tokenization on the full sequence is the same as the one
            # on "whitespace tokenized words".
            if encoded.tokens != tokenized:
                test_passed = False

            # Token offsets are originally relative to the beginning of the word.
            # These lines convert them so they are relative to the beginning of the sentence.
            token_offsets = []
            for (start, end), w_index in zip(encoded.offsets, encoded.words):
                word_start_ch = word_spans[w_index][0]
                token_offsets.append((start + word_start_ch, end + word_start_ch))
            if getattr(tokenizer, "add_prefix_space", None):
                token_offsets = [(start - 1, end) for start, end in token_offsets]

            # Verify that offsets align back to the original text.
            if text == "力加勝北区ᴵᴺᵀᵃছজটডণত":
                # contains [UNK] tokens that are impossible to match back to the original text
                continue
            for tok, (start, end) in zip(encoded.tokens, token_offsets):
                # Subword tokens have special chars depending on the model type.
                # In order to align with the original text we need to get rid of them.
                tok = re.sub(r"^(##|Ġ|▁)", "", tok)
                # tok = tokenizer.decode(tokenizer.convert_tokens_to_ids(tok))
                original_tok = text[start:end]
                if tok != original_tok:
                    test_passed = False

            if (i_tok, i_text) in expected_to_fail:
                assert not test_passed, f"Behaviour of {tokenizer.__class__.__name__} has changed on text '{text}'"
            else:
                assert test_passed, f"Behaviour of {tokenizer.__class__.__name__} has changed on text '{text}'"
def test_instantiate(self):
    assert WhitespaceSplit() is not None
    assert isinstance(WhitespaceSplit(), PreTokenizer)
from itertools import product

from tokenizers import Tokenizer, Regex
from tokenizers.models import WordLevel
from tokenizers.normalizers import NFD
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Split, WhitespaceSplit


# Loop which creates and loads the tokenizer
def stackTraceTokenizer(tokens: tuple, events: tuple, vocab_size=2_000, min_freq=3):
    for norm, event in product(tokens, events):
        print(norm, event)
        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        tokenizer.normalizer = NFD()
        if norm == "white":
            tokenizer.pre_tokenizer = WhitespaceSplit()
        else:
            tokenizer.pre_tokenizer = Split(
                pattern=Regex("[A-Z]+[a-z0-9]+|[.A-Z]+|[a-z0-9]+"),
                behavior="isolated",
            )
        trainer = WordLevelTrainer(
            vocab_size=vocab_size,
            min_frequency=min_freq,
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        )
        tokenizer.train(["vocab-{}.txt".format(event)], trainer)
        print(f"Trained tokenizer for {norm}:{event}")
        yield tokenizer, event, norm
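A hedged driver sketch for the generator above; it assumes the vocab-<event>.txt training files already exist in the working directory, and the output path is hypothetical.

for tokenizer, event, norm in stackTraceTokenizer(("white", "split"), ("crash",)):
    tokenizer.save(f"tokenizer-{norm}-{event}.json")  # hypothetical output file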
def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, non_initial_token="X"):
    self.baskets = []
    self.pre_tokenizer = WhitespaceSplit()

    texts = [x["text"] for x in dicts]
    words_and_spans = [self.pre_tokenizer.pre_tokenize_str(x) for x in texts]
    words = [[x[0] for x in y] for y in words_and_spans]
    word_spans_batch = [[x[1] for x in y] for y in words_and_spans]

    tokenized_batch = self.tokenizer.batch_encode_plus(
        words,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        truncation=True,
        max_length=self.max_seq_len,
        padding="max_length",
        is_split_into_words=True,
    )

    for i in range(len(dicts)):
        tokenized = tokenized_batch[i]
        d = dicts[i]
        id_external = self._id_from_dict(d)
        if indices:
            id_internal = indices[i]
        else:
            id_internal = i

        input_ids = tokenized.ids
        segment_ids = tokenized.type_ids
        initial_mask = self._get_start_of_word(tokenized.words)
        assert len(initial_mask) == len(input_ids)
        padding_mask = tokenized.attention_mask

        if return_baskets:
            token_to_word_map = tokenized.words
            word_spans = word_spans_batch[i]
            tokenized_dict = {
                "tokens": tokenized.tokens,
                "word_spans": word_spans,
                "token_to_word_map": token_to_word_map,
                "start_of_word": initial_mask,
            }
        else:
            tokenized_dict = {}

        feature_dict = {
            "input_ids": input_ids,
            "padding_mask": padding_mask,
            "segment_ids": segment_ids,
            "initial_mask": initial_mask,
        }

        for task_name, task in self.tasks.items():
            try:
                label_name = task["label_name"]
                labels_word = d[label_name]
                label_list = task["label_list"]
                label_tensor_name = task["label_tensor_name"]
                if task["task_type"] == "classification":
                    label_ids = [label_list.index(labels_word)]
                elif task["task_type"] == "ner":
                    labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
                    label_ids = [label_list.index(lt) for lt in labels_token]
            except ValueError:
                label_ids = None
                problematic_labels = set(labels_token).difference(set(label_list))
                print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                      f"\nWe found a problem with labels {str(problematic_labels)}")
            except KeyError:
                label_ids = None
                # print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                #       "\nIf you are running in *inference* mode: Don't worry!"
                #       "\nIf you are running in *training* mode: Verify you are supplying a proper label list to your processor and check that labels in input data are correct.")
            if label_ids:
                feature_dict[label_tensor_name] = label_ids

        curr_sample = Sample(id=None, clear_text=d, tokenized=tokenized_dict, features=[feature_dict])
        curr_basket = SampleBasket(id_internal=id_internal, raw=d, id_external=id_external, samples=[curr_sample])
        self.baskets.append(curr_basket)

    if indices and 0 not in indices:
        pass
    else:
        self._log_samples(1)

    dataset, tensor_names = self._create_dataset()
    ret = [dataset, tensor_names, self.problematic_sample_ids]
    if return_baskets:
        ret.append(self.baskets)
    return tuple(ret)
class MTLProcessor(Processor):
    def __init__(
        self,
        tokenizer,
        max_seq_len,
        data_dir,
        train_filename,
        test_filename,
        delimiter,
        dev_split=0.0,
        dev_filename=None,
        label_list=None,
        metric=None,
        proxies=None,
        **kwargs,
    ):
        self.delimiter = delimiter
        super(MTLProcessor, self).__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            tasks={},
            proxies=proxies,
        )

    def file_to_dicts(self, file: str) -> [dict]:
        dicts = list()
        df = pd.read_csv(file)
        for text, label, tokens in zip(df.sentence.values, df.label.values, df.trigger.values):
            columns = dict()
            text = ast.literal_eval(text)
            tokens = ast.literal_eval(tokens)
            columns["text"] = " ".join(text)
            columns["document_level_task_label"] = label  # Key hard-coded
            columns["token_level_task_label"] = list(map(str, tokens))  # Key hard-coded
            dicts.append(columns)
        return dicts

    @staticmethod
    def _get_start_of_word(word_ids):
        words = np.array(word_ids)
        words[words == None] = -1
        start_of_word_single = [0] + list(np.ediff1d(words) > 0)
        start_of_word_single = [int(x) for x in start_of_word_single]
        return start_of_word_single

    # Most of the code is copied from NERProcessor - dataset_from_dicts()
    def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, non_initial_token="X"):
        self.baskets = []
        self.pre_tokenizer = WhitespaceSplit()

        texts = [x["text"] for x in dicts]
        words_and_spans = [self.pre_tokenizer.pre_tokenize_str(x) for x in texts]
        words = [[x[0] for x in y] for y in words_and_spans]
        word_spans_batch = [[x[1] for x in y] for y in words_and_spans]

        tokenized_batch = self.tokenizer.batch_encode_plus(
            words,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            truncation=True,
            max_length=self.max_seq_len,
            padding="max_length",
            is_split_into_words=True,
        )

        for i in range(len(dicts)):
            tokenized = tokenized_batch[i]
            d = dicts[i]
            id_external = self._id_from_dict(d)
            if indices:
                id_internal = indices[i]
            else:
                id_internal = i

            input_ids = tokenized.ids
            segment_ids = tokenized.type_ids
            initial_mask = self._get_start_of_word(tokenized.words)
            assert len(initial_mask) == len(input_ids)
            padding_mask = tokenized.attention_mask

            if return_baskets:
                token_to_word_map = tokenized.words
                word_spans = word_spans_batch[i]
                tokenized_dict = {
                    "tokens": tokenized.tokens,
                    "word_spans": word_spans,
                    "token_to_word_map": token_to_word_map,
                    "start_of_word": initial_mask,
                }
            else:
                tokenized_dict = {}

            feature_dict = {
                "input_ids": input_ids,
                "padding_mask": padding_mask,
                "segment_ids": segment_ids,
                "initial_mask": initial_mask,
            }

            for task_name, task in self.tasks.items():
                try:
                    label_name = task["label_name"]
                    labels_word = d[label_name]
                    label_list = task["label_list"]
                    label_tensor_name = task["label_tensor_name"]
                    if task["task_type"] == "classification":
                        label_ids = [label_list.index(labels_word)]
                    elif task["task_type"] == "ner":
                        labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
                        label_ids = [label_list.index(lt) for lt in labels_token]
                except ValueError:
                    label_ids = None
                    problematic_labels = set(labels_token).difference(set(label_list))
                    print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                          f"\nWe found a problem with labels {str(problematic_labels)}")
                except KeyError:
                    label_ids = None
                    # print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                    #       "\nIf you are running in *inference* mode: Don't worry!"
                    #       "\nIf you are running in *training* mode: Verify you are supplying a proper label list to your processor and check that labels in input data are correct.")
                if label_ids:
                    feature_dict[label_tensor_name] = label_ids

            curr_sample = Sample(id=None, clear_text=d, tokenized=tokenized_dict, features=[feature_dict])
            curr_basket = SampleBasket(id_internal=id_internal, raw=d, id_external=id_external, samples=[curr_sample])
            self.baskets.append(curr_basket)

        if indices and 0 not in indices:
            pass
        else:
            self._log_samples(1)

        dataset, tensor_names = self._create_dataset()
        ret = [dataset, tensor_names, self.problematic_sample_ids]
        if return_baskets:
            ret.append(self.baskets)
        return tuple(ret)