def __init__(
    self,
    vocab_file,
    delimiter,
    lowercase,
    unk_token,
    eos_token,
    add_eos=False,
    add_double_eos=False,
    normalization: Optional[str] = None,
):
    try:
        tokenizer = WordLevel(vocab_file, unk_token=unk_token)
        tokenizer = Tokenizer(tokenizer)
    except Exception:
        raise ValueError(
            "Unable to parse file {}. Unknown format. "
            "If you tried to load a model saved through TransfoXLTokenizer, "
            "please note they are not compatible.".format(vocab_file)
        )

    # Create the correct normalization path
    normalizer = []

    # Include unicode normalization
    if normalization:
        normalizer += [unicode_normalizer_from_str(normalization)]

    # Include case normalization
    if lowercase:
        normalizer += [Lowercase()]

    # Strip normalizer at the end
    normalizer += [Strip(left=True, right=True)]

    if len(normalizer) > 0:
        tokenizer.normalizer = Sequence(normalizer) if len(normalizer) > 1 else normalizer[0]

    # Setup the splitter
    tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter) if delimiter else WhitespaceSplit()

    if add_double_eos:
        tokenizer.post_processor = BertProcessing(
            (eos_token, tokenizer.token_to_id(eos_token)),
            (eos_token, tokenizer.token_to_id(eos_token)),
        )

    parameters = {
        "model": "TransfoXLModel",
        "add_eos": add_eos,
        "add_double_eos": add_double_eos,
        "unk_token": unk_token,
        "eos_token": eos_token,
        "delimiter": delimiter,
        "lowercase": lowercase,
    }

    super().__init__(tokenizer, parameters)
def __create_tokenizer(self, files):
    # Create, train and save the tokenizer.
    print("Preparing tokenizer...")
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = WhitespaceSplit()
    trainer = WordLevelTrainer(
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    )
    tokenizer.train(files=files, trainer=trainer)
    return tokenizer
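A minimal usage sketch of the same recipe, inlined so it runs standalone; the training file name "corpus.txt" is hypothetical, standing in for whatever files the helper receives.

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.trainers import WordLevelTrainer

# Train a word-level vocabulary from whitespace-split text, then encode.
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = WhitespaceSplit()
trainer = WordLevelTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train(files=["corpus.txt"], trainer=trainer)  # hypothetical corpus file

print(tokenizer.encode("hello world").tokens)  # e.g. ['hello', 'world'] if both made the vocab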
def __init__(
    self,
    vocab_file,
    delimiter,
    lowercase,
    unk_token,
    eos_token,
    add_eos=False,
    add_double_eos=False,
    normalization: Optional[str] = None,
):
    tokenizer = WordLevel.from_files(vocab_file, unk_token=unk_token)
    tokenizer = Tokenizer(tokenizer)

    # Create the correct normalization path
    normalizer = []

    # Include unicode normalization
    if normalization:
        normalizer += [unicode_normalizer_from_str(normalization)]

    # Include case normalization
    if lowercase:
        normalizer += [Lowercase()]

    if len(normalizer) > 0:
        tokenizer.normalizer = Sequence(normalizer) if len(normalizer) > 1 else normalizer[0]

    # Setup the splitter
    tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter) if delimiter else WhitespaceSplit()

    if add_double_eos:
        tokenizer.post_processor = BertProcessing(
            (eos_token, tokenizer.token_to_id(eos_token)),
            (eos_token, tokenizer.token_to_id(eos_token)),
        )

    parameters = {
        "model": "TransfoXLModel",
        "add_eos": add_eos,
        "add_double_eos": add_double_eos,
        "unk_token": unk_token,
        "eos_token": eos_token,
        "delimiter": delimiter,
        "lowercase": lowercase,
    }

    super().__init__(tokenizer, parameters)
def wordpiece_tokenize(line):
    tokenizer = Tokenizer(WordPiece(wordpiece_dict3))
    tokenizer.enable_padding(length=200)
    tokenizer.enable_truncation(max_length=200)
    tokenizer.pre_tokenizer = WhitespaceSplit()
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    output = tokenizer.encode(line)
    return output.ids
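The vocabulary `wordpiece_dict3` is defined elsewhere; a hedged sketch of a call, using a toy vocab in its place. Note the ids 1 and 2 must line up with the special_tokens wired into TemplateProcessing above.

# Hypothetical toy stand-in for the external `wordpiece_dict3`.
wordpiece_dict3 = {"[UNK]": 0, "[CLS]": 1, "[SEP]": 2, "[PAD]": 3, "hello": 4, "world": 5}

ids = wordpiece_tokenize("hello world")
print(ids[:4])  # e.g. [1, 4, 5, 2], then padding out to length 200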
def configure_tokenizers(self, padding, truncation, max_length, lower):
    # Settings
    pad_length = None
    if padding in {True, "longest"}:
        pass
    elif padding in {"max_length"}:
        pad_length = max_length
    elif padding in {False, "do_not_pad"}:
        pass
    else:
        raise ValueError("Unknown padding type")

    # SRC tokenizer
    tok_normalizers = [NFD(), Strip()]
    if lower:
        tok_normalizers += [Lowercase()]

    self.tokenizer = Tokenizer(tok_model())  # unk_token=... not working
    self.tokenizer.add_special_tokens(self.special_tokens)
    self.tokenizer.pre_tokenizer = pre_tokenizers.Sequence([WhitespaceSplit()])
    self.tokenizer.normalizer = normalizers.Sequence(tok_normalizers)  # StripAccents requires NFD
    self.tokenizer.decoder = tok_decoder()

    # Define template (needed for the sos/eos tokens)
    basic_template = TemplateProcessing(
        single=f"{self.SOS_WORD} $A {self.EOS_WORD}",
        pair=f"{self.SOS_WORD} $A {self.EOS_WORD} {self.SOS_WORD} $B {self.EOS_WORD}",
        special_tokens=[
            (self.SOS_WORD, self.tokenizer.token_to_id(self.SOS_WORD)),
            (self.EOS_WORD, self.tokenizer.token_to_id(self.EOS_WORD)),
        ],
    )
    self.tokenizer.post_processor = basic_template

    if padding:
        self.tokenizer.enable_padding(
            pad_id=self.tokenizer.token_to_id(self.PAD_WORD),
            pad_token=self.PAD_WORD,
            length=pad_length,
        )
    if truncation:
        self.tokenizer.enable_truncation(max_length, stride=0, strategy="longest_first")
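The TemplateProcessing post-processor is what injects the sos/eos markers around every encoded sequence. A minimal standalone sketch of the same pattern, with hypothetical [SOS]/[EOS] tokens and a toy word-level vocab:

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.processors import TemplateProcessing

vocab = {"[UNK]": 0, "[SOS]": 1, "[EOS]": 2, "hello": 3, "world": 4}
tok = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
tok.pre_tokenizer = WhitespaceSplit()
tok.post_processor = TemplateProcessing(
    single="[SOS] $A [EOS]",
    special_tokens=[("[SOS]", 1), ("[EOS]", 2)],
)
print(tok.encode("hello world").tokens)  # ['[SOS]', 'hello', 'world', '[EOS]']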
def test_bert_like(self):
    pre_tokenizer = Sequence([WhitespaceSplit(), Punctuation()])
    assert isinstance(Sequence([]), PreTokenizer)
    assert isinstance(Sequence([]), Sequence)
    assert isinstance(pickle.loads(pickle.dumps(pre_tokenizer)), Sequence)

    result = pre_tokenizer.pre_tokenize_str("Hey friend!     How are you?!?")
    assert result == [
        ("Hey", (0, 3)),
        ("friend", (4, 10)),
        ("!", (10, 11)),
        ("How", (16, 19)),
        ("are", (20, 23)),
        ("you", (24, 27)),
        ("?", (27, 28)),
        ("!", (28, 29)),
        ("?", (29, 30)),
    ]
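For contrast, WhitespaceSplit alone keeps punctuation glued to the preceding word, which is exactly why the test composes it with Punctuation:

from tokenizers.pre_tokenizers import WhitespaceSplit

print(WhitespaceSplit().pre_tokenize_str("Hey friend!"))
# [('Hey', (0, 3)), ('friend!', (4, 11))]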
def converted(self):
    tokenizer = self.tokenizer(self.proto)

    # Tokenizer assemble
    tokenizer.normalizer = self.normalizer(self.proto)

    replacement = "▁"
    add_prefix_space = True
    tokenizer.pre_tokenizer = PSequence([
        WhitespaceSplit(),
        Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
    ])
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space
    )
    post_processor = self.post_processor(tokenizer)
    if post_processor:
        tokenizer.post_processor = post_processor

    # TODO what parameters should we give ?
    parameters = {}

    return BaseTokenizer(tokenizer, parameters)
def test_instantiate(self):
    assert WhitespaceSplit() is not None
    assert isinstance(WhitespaceSplit(), PreTokenizer)
    assert isinstance(WhitespaceSplit(), WhitespaceSplit)
    assert isinstance(pickle.loads(pickle.dumps(WhitespaceSplit())), WhitespaceSplit)
def test_all_tokenizer_on_special_cases(caplog):
    caplog.set_level(logging.CRITICAL)

    lang_names = ["bert-base-cased", "roberta-base", "xlnet-base-cased"]

    tokenizers = []
    for lang_name in lang_names:
        if "roberta" in lang_name:
            add_prefix_space = True
        else:
            add_prefix_space = False
        t = Tokenizer.load(lang_name, lower_case=False, add_prefix_space=add_prefix_space)
        tokenizers.append(t)

    texts = [
        "This is a sentence",
        "Der entscheidende Pass",
        "力加勝北区ᴵᴺᵀᵃছজটডণত",
        "Thiso text is included tolod makelio sure Unicodeel is handled properly:",
        "This is a sentence...",
        "Let's see all on this text and. !23# neverseenwordspossible"
        "This is a sentence      with multiple spaces",
        """This is a sentence.
        With linebreak""",
        """Sentence with multiple


        newlines
        """,
        "and another one\n\n\nwithout space",
        "This is a sentence\t\t\twith multiple tabs",
    ]

    expected_to_fail = [(1, 1), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (2, 1), (2, 5)]

    for i_tok, tokenizer in enumerate(tokenizers):
        for i_text, text in enumerate(texts):
            # Important: we don't assume to preserve whitespaces after tokenization.
            # This means: \t, \n, "  " etc. will all resolve to a single " ".
            # This doesn't make a difference for BERT + XLNet, but it does for RoBERTa.
            test_passed = True

            # 1. Original tokenize function from the transformers repo on the full sentence
            standardized_whitespace_text = " ".join(text.split())  # remove multiple whitespaces
            tokenized = tokenizer.tokenize(standardized_whitespace_text)

            # 2. Our tokenization method using a pre-tokenizer which can normalize multiple whitespaces.
            # This approach is used in NER.
            pre_tokenizer = WhitespaceSplit()
            words_and_spans = pre_tokenizer.pre_tokenize_str(text)
            words = [x[0] for x in words_and_spans]
            word_spans = [x[1] for x in words_and_spans]

            encoded = tokenizer.encode_plus(
                words, is_split_into_words=True, add_special_tokens=False
            ).encodings[0]

            # Verify that tokenization on the full sequence is the same as the one
            # on "whitespace tokenized words".
            if encoded.tokens != tokenized:
                test_passed = False

            # Token offsets are originally relative to the beginning of the word.
            # These lines convert them so they are relative to the beginning of the sentence.
            token_offsets = []
            for (start, end), w_index in zip(encoded.offsets, encoded.words):
                word_start_ch = word_spans[w_index][0]
                token_offsets.append((start + word_start_ch, end + word_start_ch))
            if getattr(tokenizer, "add_prefix_space", None):
                token_offsets = [(start - 1, end) for start, end in token_offsets]

            # Verify that offsets align back to the original text.
            if text == "力加勝北区ᴵᴺᵀᵃছজটডণত":
                # contains [UNK] tokens that are impossible to match back to the original text
                continue
            for tok, (start, end) in zip(encoded.tokens, token_offsets):
                # Subword tokens have special chars depending on the model type.
                # In order to align with the original text we need to get rid of them.
                tok = re.sub(r"^(##|Ġ|▁)", "", tok)
                # tok = tokenizer.decode(tokenizer.convert_tokens_to_ids(tok))
                original_tok = text[start:end]
                if tok != original_tok:
                    test_passed = False

            if (i_tok, i_text) in expected_to_fail:
                assert not test_passed, f"Behaviour of {tokenizer.__class__.__name__} has changed on text '{text}'"
            else:
                assert test_passed, f"Behaviour of {tokenizer.__class__.__name__} has changed on text '{text}'"
def test_instantiate(self):
    assert WhitespaceSplit() is not None
    assert isinstance(WhitespaceSplit(), PreTokenizer)
from itertools import product

from tokenizers import Tokenizer, Regex
from tokenizers.models import WordLevel
from tokenizers.normalizers import NFD
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Split, WhitespaceSplit


# Loop which creates and loads the tokenizer
def stackTraceTokenizer(tokens: tuple, events: tuple, vocab_size=2_000, min_freq=3):
    for norm, event in product(tokens, events):
        print(norm, event)
        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        tokenizer.normalizer = NFD()
        if norm == "white":
            tokenizer.pre_tokenizer = WhitespaceSplit()
        else:
            tokenizer.pre_tokenizer = Split(
                pattern=Regex("[A-Z]+[a-z0-9]+|[.A-Z]+|[a-z0-9]+"),
                behavior="isolated",
            )
        trainer = WordLevelTrainer(
            vocab_size=vocab_size,
            min_frequency=min_freq,
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        )
        tokenizer.train(["vocab-{}.txt".format(event)], trainer)
        print(f"Trained tokenizer for {norm}:{event}")
        yield tokenizer, event, norm
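A hedged driver sketch for the generator above; it assumes the vocab-<event>.txt training files already exist in the working directory, and the output path is hypothetical.

for tokenizer, event, norm in stackTraceTokenizer(("white", "split"), ("crash",)):
    tokenizer.save(f"tokenizer-{norm}-{event}.json")  # hypothetical output file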
def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, non_initial_token="X"):
    self.baskets = []
    self.pre_tokenizer = WhitespaceSplit()

    texts = [x["text"] for x in dicts]
    words_and_spans = [self.pre_tokenizer.pre_tokenize_str(x) for x in texts]
    words = [[x[0] for x in y] for y in words_and_spans]
    word_spans_batch = [[x[1] for x in y] for y in words_and_spans]

    tokenized_batch = self.tokenizer.batch_encode_plus(
        words,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        truncation=True,
        max_length=self.max_seq_len,
        padding="max_length",
        is_split_into_words=True,
    )

    for i in range(len(dicts)):
        tokenized = tokenized_batch[i]
        d = dicts[i]
        id_external = self._id_from_dict(d)
        if indices:
            id_internal = indices[i]
        else:
            id_internal = i

        input_ids = tokenized.ids
        segment_ids = tokenized.type_ids
        initial_mask = self._get_start_of_word(tokenized.words)
        assert len(initial_mask) == len(input_ids)
        padding_mask = tokenized.attention_mask

        if return_baskets:
            token_to_word_map = tokenized.words
            word_spans = word_spans_batch[i]
            tokenized_dict = {
                "tokens": tokenized.tokens,
                "word_spans": word_spans,
                "token_to_word_map": token_to_word_map,
                "start_of_word": initial_mask,
            }
        else:
            tokenized_dict = {}

        feature_dict = {
            "input_ids": input_ids,
            "padding_mask": padding_mask,
            "segment_ids": segment_ids,
            "initial_mask": initial_mask,
        }

        for task_name, task in self.tasks.items():
            try:
                label_name = task["label_name"]
                labels_word = d[label_name]
                label_list = task["label_list"]
                label_tensor_name = task["label_tensor_name"]
                if task["task_type"] == "classification":
                    label_ids = [label_list.index(labels_word)]
                elif task["task_type"] == "ner":
                    labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
                    label_ids = [label_list.index(lt) for lt in labels_token]
            except ValueError:
                label_ids = None
                problematic_labels = set(labels_token).difference(set(label_list))
                print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                      f"\nWe found a problem with labels {str(problematic_labels)}")
            except KeyError:
                label_ids = None
                # print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                #       "\nIf you are running in *inference* mode: Don't worry!"
                #       "\nIf you are running in *training* mode: Verify you are supplying a proper label list to your processor and check that labels in input data are correct.")
            if label_ids:
                feature_dict[label_tensor_name] = label_ids

        curr_sample = Sample(id=None, clear_text=d, tokenized=tokenized_dict, features=[feature_dict])
        curr_basket = SampleBasket(id_internal=id_internal, raw=d, id_external=id_external, samples=[curr_sample])
        self.baskets.append(curr_basket)

    if indices and 0 not in indices:
        pass
    else:
        self._log_samples(1)

    dataset, tensor_names = self._create_dataset()
    ret = [dataset, tensor_names, self.problematic_sample_ids]
    if return_baskets:
        ret.append(self.baskets)
    return tuple(ret)
class MTLProcessor(Processor):
    def __init__(
        self,
        tokenizer,
        max_seq_len,
        data_dir,
        train_filename,
        test_filename,
        delimiter,
        dev_split=0.0,
        dev_filename=None,
        label_list=None,
        metric=None,
        proxies=None,
        **kwargs,
    ):
        self.delimiter = delimiter
        super(MTLProcessor, self).__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            tasks={},
            proxies=proxies,
        )

    def file_to_dicts(self, file: str) -> [dict]:
        dicts = list()
        df = pd.read_csv(file)
        for text, label, tokens in zip(df.sentence.values, df.label.values, df.trigger.values):
            columns = dict()
            text = ast.literal_eval(text)
            tokens = ast.literal_eval(tokens)
            columns["text"] = " ".join(text)
            columns["document_level_task_label"] = label  # Key hard-coded
            columns["token_level_task_label"] = list(map(str, tokens))  # Key hard-coded
            dicts.append(columns)
        return dicts

    @staticmethod
    def _get_start_of_word(word_ids):
        words = np.array(word_ids)
        words[words == None] = -1
        start_of_word_single = [0] + list(np.ediff1d(words) > 0)
        start_of_word_single = [int(x) for x in start_of_word_single]
        return start_of_word_single

    # Most of the code is copied from NERProcessor - dataset_from_dicts()
    def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, non_initial_token="X"):
        self.baskets = []
        self.pre_tokenizer = WhitespaceSplit()

        texts = [x["text"] for x in dicts]
        words_and_spans = [self.pre_tokenizer.pre_tokenize_str(x) for x in texts]
        words = [[x[0] for x in y] for y in words_and_spans]
        word_spans_batch = [[x[1] for x in y] for y in words_and_spans]

        tokenized_batch = self.tokenizer.batch_encode_plus(
            words,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            truncation=True,
            max_length=self.max_seq_len,
            padding="max_length",
            is_split_into_words=True,
        )

        for i in range(len(dicts)):
            tokenized = tokenized_batch[i]
            d = dicts[i]
            id_external = self._id_from_dict(d)
            if indices:
                id_internal = indices[i]
            else:
                id_internal = i

            input_ids = tokenized.ids
            segment_ids = tokenized.type_ids
            initial_mask = self._get_start_of_word(tokenized.words)
            assert len(initial_mask) == len(input_ids)
            padding_mask = tokenized.attention_mask

            if return_baskets:
                token_to_word_map = tokenized.words
                word_spans = word_spans_batch[i]
                tokenized_dict = {
                    "tokens": tokenized.tokens,
                    "word_spans": word_spans,
                    "token_to_word_map": token_to_word_map,
                    "start_of_word": initial_mask,
                }
            else:
                tokenized_dict = {}

            feature_dict = {
                "input_ids": input_ids,
                "padding_mask": padding_mask,
                "segment_ids": segment_ids,
                "initial_mask": initial_mask,
            }

            for task_name, task in self.tasks.items():
                try:
                    label_name = task["label_name"]
                    labels_word = d[label_name]
                    label_list = task["label_list"]
                    label_tensor_name = task["label_tensor_name"]
                    if task["task_type"] == "classification":
                        label_ids = [label_list.index(labels_word)]
                    elif task["task_type"] == "ner":
                        labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
                        label_ids = [label_list.index(lt) for lt in labels_token]
                except ValueError:
                    label_ids = None
                    problematic_labels = set(labels_token).difference(set(label_list))
                    print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                          f"\nWe found a problem with labels {str(problematic_labels)}")
                except KeyError:
                    label_ids = None
                    # print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                    #       "\nIf you are running in *inference* mode: Don't worry!"
                    #       "\nIf you are running in *training* mode: Verify you are supplying a proper label list to your processor and check that labels in input data are correct.")
                if label_ids:
                    feature_dict[label_tensor_name] = label_ids

            curr_sample = Sample(id=None, clear_text=d, tokenized=tokenized_dict, features=[feature_dict])
            curr_basket = SampleBasket(id_internal=id_internal, raw=d, id_external=id_external, samples=[curr_sample])
            self.baskets.append(curr_basket)

        if indices and 0 not in indices:
            pass
        else:
            self._log_samples(1)

        dataset, tensor_names = self._create_dataset()
        ret = [dataset, tensor_names, self.problematic_sample_ids]
        if return_baskets:
            ret.append(self.baskets)
        return tuple(ret)