def _tokenize(self, text, never_split=None, **kwargs):
    """Optionally normalize *text*, then tokenize it with the configured scheme.

    Exactly one tokenization branch runs, chosen by the instance flags
    (wordpiece > char > basic); if none is set the raw text is returned
    as a single-element list.
    """
    if self.do_preprocessing:
        if self.do_lower_case:
            text = text.lower()
        text = str(" ".join(text_processor.pre_process_doc(text)))
        # Keep only latin letters (incl. accented), a few markers and emoji.
        text = re.sub(r'[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]', ' ', text)
        # Collapse whitespace runs to a single space.
        text = re.sub(r'\s+', ' ', text)
        # Squeeze 3+ repeated word characters down to two.
        text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
        # Trim a leading / trailing whitespace character.
        text = re.sub(r'^\s', '', text)
        text = re.sub(r'\s$', '', text)

    if self.do_wordpiece_tokenize:
        return WordpieceTokenizer(self.vocab, self.unk_token).tokenize(text)
    if self.do_char_tokenize:
        return CharacterTokenizer(self.vocab, self.unk_token).tokenize(text)
    if self.do_basic_tokenize:
        return self.base_bert_tok.tokenize(text)
    return [text]
def __init__(self, vocab_path, do_lower_case=True, max_len=None, freq_path=None):
    """Constructs a BertTokenizer.

    Args:
        vocab_path: Path to a JSON file mapping wordpiece token -> index.
        do_lower_case: unused here; kept for interface compatibility.
        max_len: An artificial maximum length to truncate tokenized sequences to;
            Effective maximum length is always the minimum of this value
            (if specified) and the underlying BERT model's sequence length.
        freq_path: Optional path to a JSON file mapping token -> frequency.
    """
    # Use `with` so the file handle is closed deterministically; the previous
    # json.load(open(...)) pattern leaked it until GC.
    with open(vocab_path, 'r') as vocab_file:
        self.token_to_idx = json.load(vocab_file, object_pairs_hook=OrderedDict)
    # Reverse mapping for id -> token lookups.
    self.idx_to_token = OrderedDict([
        (idx, tok) for tok, idx in self.token_to_idx.items()
    ])
    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.token_to_idx)
    # Effectively "no limit" when max_len is not given.
    self.max_len = max_len if max_len is not None else int(1e12)
    if freq_path is not None:
        with open(freq_path, 'r') as freq_file:
            self.token_to_freq = json.load(freq_file, object_pairs_hook=OrderedDict)
def __init__(self,
             vocab_file,
             do_lower_case=False,
             do_basic_tokenize=True,
             do_wordpiece_tokenize=True,
             mecab_dict_path=None,
             unk_token='[UNK]',
             sep_token='[SEP]',
             pad_token='[PAD]',
             cls_token='[CLS]',
             mask_token='[MASK]',
             **kwargs):
    """Build a MecabBertTokenizer.

    Args:
        vocab_file: path to a one-wordpiece-per-line vocabulary file.
        do_lower_case: lower-case input during basic tokenization.
        do_basic_tokenize: run MeCab-based basic tokenization before wordpiece.
        do_wordpiece_tokenize: run WordPiece subword tokenization.
        mecab_dict_path: optional path to a MeCab dictionary directory.
    """
    # NOTE: calls the grandparent initializer (MRO past BertTokenizer),
    # not the direct parent's.
    super(BertTokenizer, self).__init__(unk_token=unk_token,
                                        sep_token=sep_token,
                                        pad_token=pad_token,
                                        cls_token=cls_token,
                                        mask_token=mask_token,
                                        **kwargs)
    if not os.path.isfile(vocab_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'.".format(vocab_file))

    self.vocab = load_vocab(vocab_file)
    # Reserve room for special tokens: [CLS]/[SEP] for one sentence,
    # [CLS]/[SEP]/[SEP] for a sentence pair.
    self.max_len_single_sentence = self.max_len - 2
    self.max_len_sentences_pair = self.max_len - 3
    self.ids_to_tokens = collections.OrderedDict(
        (index, token) for token, index in self.vocab.items())

    self.do_basic_tokenize = do_basic_tokenize
    self.do_wordpiece_tokenize = do_wordpiece_tokenize
    if do_basic_tokenize:
        self.basic_tokenizer = MecabBasicTokenizer(
            do_lower_case=do_lower_case, mecab_dict_path=mecab_dict_path)
    if do_wordpiece_tokenize:
        self.wordpiece_tokenizer = WordpieceTokenizer(
            vocab=self.vocab, unk_token=self.unk_token)
def __init__(self,
             vocab_file,
             do_lower_case=True,
             do_basic_tokenize=True,
             do_char_tokenize=False,
             do_wordpiece_tokenize=False,
             do_preprocessing=True,
             unk_token='[UNK]',
             sep_token='[SEP]',
             pad_token='[PAD]',
             cls_token='[CLS]',
             mask_token='[MASK]',
             **kwargs):
    """Build a tokenizer with optional text preprocessing and a choice of
    wordpiece / character / basic (plain BERT) tokenization."""
    super(BertTokenizer, self).__init__(
        unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
        cls_token=cls_token, mask_token=mask_token, **kwargs)

    # Tokenization mode flags consulted by _tokenize.
    self.do_wordpiece_tokenize = do_wordpiece_tokenize
    self.do_lower_case = do_lower_case
    self.vocab_file = vocab_file
    self.do_basic_tokenize = do_basic_tokenize
    self.do_char_tokenize = do_char_tokenize
    self.unk_token = unk_token
    self.do_preprocessing = do_preprocessing

    if not os.path.isfile(vocab_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'.".format(vocab_file))

    self.vocab = load_vocab(vocab_file)
    self.ids_to_tokens = collections.OrderedDict(
        (index, token) for token, index in self.vocab.items())
    if do_wordpiece_tokenize:
        self.wordpiece_tokenizer = WordpieceTokenizer(
            vocab=self.vocab, unk_token=self.unk_token)
    # Plain BertTokenizer used as the fallback basic tokenizer.
    self.base_bert_tok = BertTokenizer(
        vocab_file=self.vocab_file, do_lower_case=do_lower_case,
        unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
        cls_token=cls_token, mask_token=mask_token, **kwargs)
def _wordpiece(self, text, unit="text"):
    """Tokenize into WordPiece subwords.

    ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld']
    """
    if self.subword_tokenizer is None:
        # Lazily build the WordPiece tokenizer from the configured vocab file.
        vocab_path = self.data_handler.read(self.config["vocab_path"],
                                           return_path=True)
        self.subword_tokenizer = WordpieceTokenizer(
            load_vocab(vocab_path),
            unk_token=self.config.get("unk_token", "[UNK]"))

    if unit == "word":
        # Input is already a single word: wordpiece it directly.
        return list(self.subword_tokenizer.tokenize(text))
    # Otherwise word-tokenize first, then wordpiece every word.
    return [
        piece
        for word in self.word_tokenizer.tokenize(text)
        for piece in self.subword_tokenizer.tokenize(word)
    ]
class SubwordTokenizer(Tokenizer):
    """Subword Tokenizer

    text -> [word tokens] -> [[sub word tokens], ...]

    * Args:
        name: tokenizer name [wordpiece]
    """

    def __init__(self, name, word_tokenizer, config=None):
        # NOTE: `config` previously defaulted to a mutable `{}` shared across
        # all instances; use a None sentinel and build a fresh dict instead.
        super(SubwordTokenizer, self).__init__(
            name, f"subword-{name}+{word_tokenizer.cache_name}")
        self.data_handler = DataHandler(CachePath.VOCAB)
        self.config = {} if config is None else config
        self.word_tokenizer = word_tokenizer
        self.subword_tokenizer = None  # built lazily on first use

    """ Tokenizers """

    def _wordpiece(self, text, unit="text"):
        """Tokenize into WordPiece subwords.

        ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld']

        Args:
            text: input string.
            unit: "word" to wordpiece `text` directly; any other value
                word-tokenizes first, then wordpieces each word.
        """
        if self.subword_tokenizer is None:
            # Lazily build the WordPiece tokenizer from the configured vocab.
            vocab_path = self.data_handler.read(self.config["vocab_path"],
                                                return_path=True)
            vocab = load_vocab(vocab_path)
            self.subword_tokenizer = WordpieceTokenizer(
                vocab, unk_token=self.config.get("unk_token", "[UNK]"))

        tokens = []
        if unit == "word":
            for sub_token in self.subword_tokenizer.tokenize(text):
                tokens.append(sub_token)
        else:
            for token in self.word_tokenizer.tokenize(text):
                for sub_token in self.subword_tokenizer.tokenize(token):
                    tokens.append(sub_token)
        return tokens
def make_alignment(tokenizer: transformers.WordpieceTokenizer,
                   tokens: List[str]) -> Tuple[List[str], List[List[int]]]:
    """ Make the alignment between tokens and the subtokens.

    It is useful to interpret results or to understand the model reasoning.
    Returns the flat subtoken list plus, for each input token, the indices
    of its subtokens within that list.
    """
    sub_tokens: List[str] = []
    alignment: List[List[int]] = []
    for token in tokens:
        pieces = tokenizer.tokenize(token)
        # Subtoken indices are consecutive positions in the flat list.
        start = len(sub_tokens)
        sub_tokens.extend(pieces)
        alignment.append(list(range(start, start + len(pieces))))
    return sub_tokens, alignment
def extend_bert_vocab(self, words_to_extend):
    """Add unseen words to the wrapped tokenizer's vocabulary, rebuild its
    id/wordpiece tables, and resize the encoder's embedding matrix."""
    vocab = self.tokenizer.vocab
    init_len = len(vocab)
    cur_ind = init_len
    for word in words_to_extend:
        if word not in vocab:
            vocab[word] = cur_ind
            cur_ind += 1
    print(f"extend bert tokenizer with extra {cur_ind - init_len} words!")
    # Rebuild the reverse mapping and the wordpiece tokenizer over the
    # extended vocabulary.
    self.tokenizer.ids_to_tokens = collections.OrderedDict(
        (ids, tok) for tok, ids in vocab.items())
    self.tokenizer.wordpiece_tokenizer = WordpieceTokenizer(
        vocab=vocab, unk_token=self.tokenizer.unk_token)
    self.encoder._resize_token_embeddings(cur_ind)
def main():
    """Train/evaluate a language model with HuggingFace `Trainer`.

    Parses model/data/training arguments, loads config + tokenizer + model,
    extends the tokenizer vocabulary from 'jobert-vocab.txt', optionally
    trains, and reports eval-set perplexity.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    # Refuse to clobber a non-empty output dir unless explicitly allowed.
    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging (verbose only on the main process)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name"
        )

    # Dirty hack to add NK vocab to our tokenizer
    # From: https://github.com/deepset-ai/FARM/issues/157
    from collections import OrderedDict
    from transformers import BertTokenizer, WordpieceTokenizer

    with open('jobert-vocab.txt', 'r', encoding='utf8') as fp:
        vocab = fp.read().splitlines()
    # Only add tokens the tokenizer doesn't already know.
    tokens_to_add = [token for token in vocab
                     if not (token in tokenizer.vocab or token in tokenizer.all_special_tokens)]
    # Append new tokens after the existing ids, then rebuild the reverse map
    # and the wordpiece tokenizer so they see the extended vocab.
    tokenizer.vocab = OrderedDict([
        *tokenizer.vocab.items(),
        *[
            (token, i + len(tokenizer.vocab))
            for i, token in enumerate(tokens_to_add)
        ]
    ])
    tokenizer.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in tokenizer.vocab.items()])
    tokenizer.wordpiece_tokenizer = WordpieceTokenizer(vocab=tokenizer.vocab, unk_token=tokenizer.unk_token)

    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    # Embedding matrix must match the extended vocabulary.
    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the"
            "--mlm flag (masked language modeling)."
        )

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir) if training_args.do_train else None
    )
    eval_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, evaluate=True, cache_dir=model_args.cache_dir)
        if training_args.do_eval
        else None
    )
    # XLNet uses permutation LM; everything else uses (masked) LM collation.
    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
        )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        eval_output = trainer.evaluate()
        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}
        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        # Only the main process writes the results file.
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info(" %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
        results.update(result)

    return results
def __init__(self, vocab_file, do_lower_case=False,
             do_word_tokenize=True, do_subword_tokenize=True,
             word_tokenizer_type="basic", subword_tokenizer_type="wordpiece",
             never_split=None, unk_token="[UNK]", sep_token="[SEP]",
             pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]",
             mecab_kwargs=None, **kwargs):
    """Constructs a MecabBertTokenizer.

    Args:
        **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
        **do_lower_case**: (`optional`) boolean (default False)
            Whether to lower case the input.
            Only has an effect when do_basic_tokenize=True.
        **do_word_tokenize**: (`optional`) boolean (default True)
            Whether to do word tokenization.
        **do_subword_tokenize**: (`optional`) boolean (default True)
            Whether to do subword tokenization.
        **word_tokenizer_type**: (`optional`) string (default "basic")
            Type of word tokenizer: "basic", "mecab", or "sp".
        **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
            Type of subword tokenizer: "wordpiece" or "character".
        **mecab_kwargs**: (`optional`) dict passed to `MecabTokenizer`
            constructor (default None); also forwarded to the "sp" tokenizer.
    """
    # NOTE(review): the `unk_token` parameter is not forwarded below — the
    # grandparent init always receives '<unk>' (for "sp") or '[UNK]'.
    # Confirm this override is intentional.
    super(BertTokenizer, self).__init__(
        unk_token='<unk>' if word_tokenizer_type == 'sp' else '[UNK]',
        sep_token=sep_token,
        pad_token=pad_token,
        cls_token=cls_token,
        mask_token=mask_token,
        do_lower_case=do_lower_case,
        do_word_tokenize=do_word_tokenize,
        do_subword_tokenize=do_subword_tokenize,
        word_tokenizer_type=word_tokenizer_type,
        subword_tokenizer_type=subword_tokenizer_type,
        never_split=never_split,
        mecab_kwargs=mecab_kwargs,
        **kwargs,
    )
    # ^^ We call the grandparent's init, not the parent's.

    if not os.path.isfile(vocab_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
            "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            .format(vocab_file))
    self.vocab = load_vocab(vocab_file)
    # Reverse mapping for id -> token lookups.
    self.ids_to_tokens = collections.OrderedDict([
        (ids, tok) for tok, ids in self.vocab.items()
    ])
    self.do_word_tokenize = do_word_tokenize
    self.word_tokenizer_type = word_tokenizer_type
    self.lower_case = do_lower_case
    self.never_split = never_split
    # Deep-copy so later mutation by the caller can't affect this tokenizer.
    self.mecab_kwargs = copy.deepcopy(mecab_kwargs)
    if do_word_tokenize:
        if word_tokenizer_type == "basic":
            self.word_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=False)
        elif word_tokenizer_type == "mecab":
            self.word_tokenizer = MecabTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                **(mecab_kwargs or {}))
        elif word_tokenizer_type == "sp":
            # The SentencePiece model is expected beside the vocab file,
            # sharing its stem with a `.model` suffix.
            path_vocab = Path(vocab_file)
            self.word_tokenizer = SentencePiecepTokenizer(
                model_file=str(path_vocab.parent / path_vocab.stem) + '.model',
                do_lower_case=do_lower_case,
                never_split=never_split,
                **(mecab_kwargs or {}))
        else:
            raise ValueError(
                "Invalid word_tokenizer_type '{}' is specified.".format(
                    word_tokenizer_type))
    self.do_subword_tokenize = do_subword_tokenize
    self.subword_tokenizer_type = subword_tokenizer_type
    if do_subword_tokenize:
        if subword_tokenizer_type == "wordpiece":
            self.subword_tokenizer = WordpieceTokenizer(
                vocab=self.vocab, unk_token=self.unk_token)
        elif subword_tokenizer_type == "character":
            self.subword_tokenizer = CharacterTokenizer(
                vocab=self.vocab, unk_token=self.unk_token)
        else:
            raise ValueError(
                "Invalid subword_tokenizer_type '{}' is specified.".format(
                    subword_tokenizer_type))
class WordPieceVocab(object):
    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""

    def __init__(self, vocab_path, do_lower_case=True, max_len=None, freq_path=None):
        """Constructs a BertTokenizer.

        Args:
            vocab_path: Path to a JSON file mapping wordpiece token -> index.
            do_lower_case: unused here; kept for interface compatibility.
            max_len: An artificial maximum length to truncate tokenized sequences to;
                Effective maximum length is always the minimum of this value
                (if specified) and the underlying BERT model's sequence length.
            freq_path: Optional path to a JSON file mapping token -> frequency.
        """
        # Use `with` so the file handles are closed deterministically; the
        # previous json.load(open(...)) pattern leaked them until GC.
        with open(vocab_path, 'r') as vocab_file:
            self.token_to_idx = json.load(vocab_file, object_pairs_hook=OrderedDict)
        # Reverse mapping for id -> token lookups.
        self.idx_to_token = OrderedDict([
            (idx, tok) for tok, idx in self.token_to_idx.items()
        ])
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.token_to_idx)
        # Effectively "no limit" when max_len is not given.
        self.max_len = max_len if max_len is not None else int(1e12)
        if freq_path is not None:
            with open(freq_path, 'r') as freq_file:
                self.token_to_freq = json.load(freq_file, object_pairs_hook=OrderedDict)

    def tokenize(self, text):
        """Split `text` into wordpiece tokens."""
        split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def detokenize(self, tokens):
        """Rejoin wordpiece tokens into a string, merging '##' continuations."""
        text = ' '.join(tokens)
        return text.replace(' ##', '')

    def to_input_tensor(self, sents: List[List[str]], device) -> torch.Tensor:
        """ Convert list of tokens into tensor with necessary padding for shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU
        @returns (sents_var, mask_var): index tensor and 0/1 padding-mask tensor
        """
        sents = [self.convert_tokens_to_idx(sent) for sent in sents]
        sents, mask = self.pad_sentences(sents)
        sents_var = torch.tensor(sents, dtype=torch.long, device=device)
        mask_var = torch.tensor(mask, dtype=torch.long, device=device)
        return sents_var, mask_var

    def from_output_tensor(self, batch_output):
        """ Places batch output on cpu and converts it to tokens ignoring -1's and padding.

        args: batch_output (tensor) (batch_size, max_len)
        """
        # NOTE(review): place_on_cpu is an external helper; it is used here
        # without consuming a return value — confirm it moves in place.
        place_on_cpu(batch_output)
        sents = []
        for output in batch_output:
            sent = []
            for idx in output:
                idx = idx.item()
                if idx == -1:  # skip ignored positions
                    continue
                token = self.idx_to_token[idx]
                if token == "[PAD]":
                    continue
                sent.append(token)
            sents.append(sent)
        return sents

    def pad_sentences(self, sents):
        """Pad index sequences to the batch max length with [PAD].

        args: sents (list(list(int))) — assumed non-empty.
        returns: (padded sequences, 0/1 mask with 0 at [PAD] positions)
        """
        sents_padded = []
        max_len = max(map(len, sents))
        pad_idx = self.token_to_idx['[PAD]']
        for sent in sents:
            sents_padded.append(sent[:] + [pad_idx] * (max_len - len(sent)))
        mask = [[int(token != pad_idx) for token in sent]
                for sent in sents_padded]
        return sents_padded, mask

    def wrap_sentence(self, sent):
        """ Wrap sentences with start and stop tokens.

        args: sent (list[str])
        """
        return ['[CLS]'] + sent + ['[SEP]']

    def unwrap_sentence(self, tokens):
        """Drop [CLS]/[SEP] markers from a token sequence."""
        return [
            token for token in tokens
            if token != '[CLS]' and token != '[SEP]'
        ]

    def convert_tokens_to_idx(self, tokens):
        """Converts a sequence of tokens into ids using the vocab.

        Logs a warning when the result exceeds `self.max_len`; raises
        KeyError for tokens missing from the vocab.
        """
        ids = [self.token_to_idx[token] for token in tokens]
        if len(ids) > self.max_len:
            logging.warning(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this BERT model ({} > {}). Running this"
                " sequence through BERT will result in indexing errors".format(
                    len(ids), self.max_len))
        return ids

    def convert_idxs_to_token(self, ids):
        """Converts a sequence of ids in wordpiece tokens using the vocab."""
        return [self.idx_to_token[i] for i in ids]

    def get_tokens_in_range(self, tokens, text, start, end):
        """ Get all of the tokens in the range (start, end) in original string.

        Returns indices (into `tokens`) of tokens whose character span in
        `text` overlaps [start, end); '##' prefixes are stripped before
        locating a token, and [CLS]/[SEP] are skipped.
        """
        token_idxs = []
        find_start = 0
        for idx, token in enumerate(tokens):
            if token == "[CLS]" or token == "[SEP]":
                continue
            if token.startswith("##"):
                # remove pounds
                token = token[2:]
            token_start = text.find(token, find_start)
            token_end = token_start + len(token)
            find_start = token_end
            if ((token_start >= start and token_start < end)
                    or (token_end >= start and token_end < end)):
                token_idxs.append(idx)
        return token_idxs

    def __len__(self):
        """ Compute number of words in VocabEntry.

        @returns len (int): number of words in VocabEntry
        """
        return len(self.token_to_idx)
class VinaBertTokenizer(BertTokenizer):
    """BERT tokenizer for Vietnamese text; underthesea tokenization + WordPiece"""

    def __init__(self,
                 vocab_file,
                 do_lower_case=False,
                 do_basic_tokenize=True,
                 do_wordpiece_tokenize=True,
                 vina_dict_path=None,
                 unk_token='[UNK]',
                 sep_token='[SEP]',
                 pad_token='[PAD]',
                 cls_token='[CLS]',
                 mask_token='[MASK]',
                 **kwargs):
        """Build an underthesea-backed BertTokenizer.

        Args:
            vocab_file: path to a one-wordpiece-per-line vocabulary file.
            do_lower_case: lower-case input during basic tokenization.
            do_basic_tokenize: run underthesea segmentation before wordpiece.
            do_wordpiece_tokenize: run WordPiece subword tokenization.
            vina_dict_path: optional path to an underthesea dictionary.
        """
        # NOTE: calls the grandparent initializer (MRO past BertTokenizer),
        # not the direct parent's.
        super(BertTokenizer, self).__init__(unk_token=unk_token,
                                            sep_token=sep_token,
                                            pad_token=pad_token,
                                            cls_token=cls_token,
                                            mask_token=mask_token,
                                            **kwargs)
        # Budget for special tokens: [CLS]/[SEP] for one sentence,
        # [CLS]/[SEP]/[SEP] for a pair.
        self.max_len_single_sentence = self.max_len - 2
        self.max_len_sentences_pair = self.max_len - 3
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'.".format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict(
            (index, token) for token, index in self.vocab.items())
        self.do_basic_tokenize = do_basic_tokenize
        self.do_wordpiece_tokenize = do_wordpiece_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = VinaBasicTokenizer(
                do_lower_case=do_lower_case, vina_dict_path=vina_dict_path)
        if do_wordpiece_tokenize:
            self.wordpiece_tokenizer = WordpieceTokenizer(
                vocab=self.vocab, unk_token=self.unk_token)

    def _tokenize(self, text):
        """Word-segment (optionally), then wordpiece each word."""
        if self.do_basic_tokenize:
            words = self.basic_tokenizer.tokenize(
                text, never_split=self.all_special_tokens)
        else:
            words = [text]
        if not self.do_wordpiece_tokenize:
            return words
        pieces = []
        for word in words:
            pieces.extend(self.wordpiece_tokenizer.tokenize(word))
        return pieces