def __init__(self, vocab_path, do_lower_case=True, max_len=None, freq_path=None):
    """Constructs a BertTokenizer.

    Args:
        vocab_path: Path to a JSON vocabulary file mapping tokens to indices.
        max_len: An artificial maximum length to truncate tokenized sequences to;
            the effective maximum length is always the minimum of this value
            (if specified) and the underlying BERT model's sequence length.
    """
    self.token_to_idx = json.load(open(vocab_path, 'r'), object_pairs_hook=OrderedDict)
    self.idx_to_token = OrderedDict(
        [(idx, tok) for tok, idx in self.token_to_idx.items()])
    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.token_to_idx)
    self.max_len = max_len if max_len is not None else int(1e12)
    if freq_path is not None:
        self.token_to_freq = json.load(open(freq_path, 'r'), object_pairs_hook=OrderedDict)
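For reference, this constructor expects a JSON file mapping tokens to indices rather than a one-wordpiece-per-line vocabulary; a minimal illustrative sketch (file name and tokens are made up):

import json
from collections import OrderedDict

# Write a tiny illustrative vocabulary, then load it the same way as the constructor above.
vocab = OrderedDict([("[PAD]", 0), ("[UNK]", 1), ("he", 2), ("##llo", 3)])
with open("toy_vocab.json", "w") as fp:
    json.dump(vocab, fp)

token_to_idx = json.load(open("toy_vocab.json", "r"), object_pairs_hook=OrderedDict)
idx_to_token = OrderedDict([(idx, tok) for tok, idx in token_to_idx.items()])
print(idx_to_token[3])  # '##llo'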
def _tokenize(self, text, never_split=None, **kwargs):
    if self.do_preprocessing:
        if self.do_lower_case:
            text = text.lower()
        # External pre-processing pipeline, then regex clean-up.
        text = str(" ".join(text_processor.pre_process_doc(text)))
        # Keep letters (incl. accented), a few symbols, whitespace and emoji; drop the rest.
        text = re.sub(r'[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]', ' ', text)
        # Collapse runs of whitespace.
        text = re.sub(r'\s+', ' ', text)
        # Squash 3+ repeated characters down to 2 (e.g. "soooo" -> "soo").
        text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
        # Trim leading and trailing whitespace.
        text = re.sub(r'^\s', '', text)
        text = re.sub(r'\s$', '', text)

    split_tokens = [text]
    if self.do_wordpiece_tokenize:
        wordpiece_tokenizer = WordpieceTokenizer(self.vocab, self.unk_token)
        split_tokens = wordpiece_tokenizer.tokenize(text)
    elif self.do_char_tokenize:
        tokenizer = CharacterTokenizer(self.vocab, self.unk_token)
        split_tokens = tokenizer.tokenize(text)
    elif self.do_basic_tokenize:
        # Tokenize the text with the base BERT tokenizer.
        split_tokens = self.base_bert_tok.tokenize(text)

    return split_tokens
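The regex clean-up above can be exercised in isolation; a minimal sketch, assuming the external `text_processor.pre_process_doc` step is skipped (it comes from a separate preprocessing library and is not shown here):

import re

def clean(text, do_lower_case=True):
    # Illustrative stand-alone version of the regex pipeline above.
    if do_lower_case:
        text = text.lower()
    text = re.sub(r'[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]', ' ', text)  # drop digits etc.
    text = re.sub(r'\s+', ' ', text)             # collapse whitespace
    text = re.sub(r'(\w)\1{2,}', r'\1\1', text)  # squash repeated characters
    return text.strip()

print(clean("Sooooo goooood!!! 123"))  # -> "soo good!!!"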
def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True,
             do_char_tokenize=False, do_wordpiece_tokenize=False, do_preprocessing=True,
             unk_token='[UNK]', sep_token='[SEP]', pad_token='[PAD]', cls_token='[CLS]',
             mask_token='[MASK]', **kwargs):
    super(BertTokenizer, self).__init__(
        unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
        cls_token=cls_token, mask_token=mask_token, **kwargs)

    self.do_wordpiece_tokenize = do_wordpiece_tokenize
    self.do_lower_case = do_lower_case
    self.vocab_file = vocab_file
    self.do_basic_tokenize = do_basic_tokenize
    self.do_char_tokenize = do_char_tokenize
    self.unk_token = unk_token
    self.do_preprocessing = do_preprocessing

    if not os.path.isfile(vocab_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'.".format(vocab_file))

    self.vocab = load_vocab(vocab_file)
    self.ids_to_tokens = collections.OrderedDict(
        [(ids, tok) for tok, ids in self.vocab.items()])

    if do_wordpiece_tokenize:
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab,
                                                      unk_token=self.unk_token)

    self.base_bert_tok = BertTokenizer(vocab_file=self.vocab_file, do_lower_case=do_lower_case,
                                       unk_token=unk_token, sep_token=sep_token,
                                       pad_token=pad_token, cls_token=cls_token,
                                       mask_token=mask_token, **kwargs)
def __init__(self, vocab_file, do_lower_case=False, do_basic_tokenize=True,
             do_wordpiece_tokenize=True, mecab_dict_path=None, unk_token='[UNK]',
             sep_token='[SEP]', pad_token='[PAD]', cls_token='[CLS]',
             mask_token='[MASK]', **kwargs):
    """Constructs a MecabBertTokenizer.

    Args:
        **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
        **do_lower_case**: (`optional`) boolean (default False)
            Whether to lower case the input.
            Only has an effect when do_basic_tokenize=True.
        **do_basic_tokenize**: (`optional`) boolean (default True)
            Whether to do basic tokenization with MeCab before wordpiece.
        **mecab_dict_path**: (`optional`) string
            Path to a directory of a MeCab dictionary.
    """
    super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
                                        pad_token=pad_token, cls_token=cls_token,
                                        mask_token=mask_token, **kwargs)

    if not os.path.isfile(vocab_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'.".format(vocab_file))

    self.vocab = load_vocab(vocab_file)
    self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
    self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
    self.ids_to_tokens = collections.OrderedDict(
        [(ids, tok) for tok, ids in self.vocab.items()])
    self.do_basic_tokenize = do_basic_tokenize
    self.do_wordpiece_tokenize = do_wordpiece_tokenize

    if do_basic_tokenize:
        self.basic_tokenizer = MecabBasicTokenizer(do_lower_case=do_lower_case,
                                                   mecab_dict_path=mecab_dict_path)
    if do_wordpiece_tokenize:
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab,
                                                      unk_token=self.unk_token)
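The two `max_len_*` adjustments simply reserve room for the special tokens wrapped around each input; a tiny illustrative check, assuming a typical max_len of 512:

max_len = 512                           # typical BERT sequence length
max_len_single_sentence = max_len - 2   # room for [CLS] ... [SEP]
max_len_sentences_pair = max_len - 3    # room for [CLS] A [SEP] B [SEP]
assert max_len_single_sentence == 510 and max_len_sentences_pair == 509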
def extend_bert_vocab(self, words_to_extend):
    init_len = len(self.tokenizer.vocab)
    cur_ind = init_len
    for word in words_to_extend:
        if word in self.tokenizer.vocab:
            continue
        self.tokenizer.vocab[word] = cur_ind
        cur_ind += 1
    print(f"extended BERT tokenizer with {cur_ind - init_len} extra words!")
    # Rebuild the reverse mapping and the wordpiece tokenizer over the enlarged vocabulary.
    self.tokenizer.ids_to_tokens = collections.OrderedDict(
        [(ids, tok) for tok, ids in self.tokenizer.vocab.items()])
    self.tokenizer.wordpiece_tokenizer = WordpieceTokenizer(
        vocab=self.tokenizer.vocab, unk_token=self.tokenizer.unk_token)
    # Resize the encoder's token embedding matrix to match the new vocabulary size.
    self.encoder._resize_token_embeddings(cur_ind)
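For comparison, roughly the same effect can be obtained through the public transformers API instead of mutating private attributes; a hedged sketch, assuming a standard `BertTokenizer`/`BertModel` pair (model name and words are illustrative):

from transformers import BertTokenizer, BertModel

# Illustrative sketch (not the method above): extend the vocabulary via the public API.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

new_words = ["covid", "hydroxychloroquine"]    # hypothetical domain terms
num_added = tokenizer.add_tokens(new_words)    # skips words already in the vocab
model.resize_token_embeddings(len(tokenizer))  # grow the embedding matrix to match

print(f"added {num_added} tokens; vocab size is now {len(tokenizer)}")

Note that `add_tokens` stores the new entries alongside, rather than inside, the wordpiece vocabulary, which is sufficient for encoding but differs from the in-place extension above.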
def _wordpiece(self, text, unit="text"):
    """ex) Hello World -> ['Hello', 'World'] -> ['He', '##llo', 'Wo', '##rld']"""
    if self.subword_tokenizer is None:
        vocab_path = self.data_handler.read(self.config["vocab_path"], return_path=True)
        vocab = load_vocab(vocab_path)
        self.subword_tokenizer = WordpieceTokenizer(
            vocab, unk_token=self.config.get("unk_token", "[UNK]"))

    tokens = []
    if unit == "word":
        # `text` is already a single word; run wordpiece on it directly.
        for sub_token in self.subword_tokenizer.tokenize(text):
            tokens.append(sub_token)
    else:
        # Split into words first, then run wordpiece on each word.
        for token in self.word_tokenizer.tokenize(text):
            for sub_token in self.subword_tokenizer.tokenize(token):
                tokens.append(sub_token)

    return tokens
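As a standalone illustration of the word-then-wordpiece flow in the `else` branch, using a toy vocabulary (imports mirror the style of the later snippet; exact module paths may vary across transformers versions):

from transformers import BasicTokenizer, WordpieceTokenizer

# Toy vocabulary for illustration only.
toy_vocab = {"[UNK]": 0, "he": 1, "##llo": 2, "wo": 3, "##rld": 4}
basic = BasicTokenizer(do_lower_case=True)
wordpiece = WordpieceTokenizer(vocab=toy_vocab, unk_token="[UNK]")

tokens = []
for word in basic.tokenize("Hello World"):
    tokens.extend(wordpiece.tokenize(word))
print(tokens)  # ['he', '##llo', 'wo', '##rld']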
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. "
            "This is not supported, but you can do it from another script, save it, "
            "and load it from here, using --tokenizer_name"
        )

    # Dirty hack to add NK vocab to our tokenizer
    # From: https://github.com/deepset-ai/FARM/issues/157
    from collections import OrderedDict
    from transformers import BertTokenizer, WordpieceTokenizer

    with open('jobert-vocab.txt', 'r', encoding='utf8') as fp:
        vocab = fp.read().splitlines()
    tokens_to_add = [
        token for token in vocab
        if not (token in tokenizer.vocab or token in tokenizer.all_special_tokens)
    ]
    tokenizer.vocab = OrderedDict([
        *tokenizer.vocab.items(),
        *[(token, i + len(tokenizer.vocab)) for i, token in enumerate(tokens_to_add)],
    ])
    tokenizer.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in tokenizer.vocab.items()])
    tokenizer.wordpiece_tokenizer = WordpieceTokenizer(vocab=tokenizer.vocab, unk_token=tokenizer.unk_token)

    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling)."
        )

    if data_args.block_size <= 0:
        # Our input block size will be the max possible for the model.
        data_args.block_size = tokenizer.max_len
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
        if training_args.do_train
        else None
    )
    eval_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, evaluate=True, cache_dir=model_args.cache_dir)
        if training_args.do_eval
        else None
    )
    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
        )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
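A quick, illustrative way to confirm that this kind of in-place vocabulary extension takes effect, assuming the classic (slow) `BertTokenizer` from the same era of transformers and a made-up out-of-vocabulary word:

from collections import OrderedDict
from transformers import BertTokenizer, WordpieceTokenizer

# Illustrative check of the vocabulary-extension hack above, on a fresh tokenizer.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
new_tokens = ["flumplenook"]  # hypothetical out-of-vocabulary word

before = tokenizer.tokenize("flumplenook")  # split into several wordpieces
tokenizer.vocab = OrderedDict([
    *tokenizer.vocab.items(),
    *[(tok, i + len(tokenizer.vocab)) for i, tok in enumerate(new_tokens)],
])
tokenizer.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in tokenizer.vocab.items()])
tokenizer.wordpiece_tokenizer = WordpieceTokenizer(vocab=tokenizer.vocab, unk_token=tokenizer.unk_token)
after = tokenizer.tokenize("flumplenook")   # now a single token

print(before, "->", after)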
def __init__(self, vocab_file, do_lower_case=False, do_word_tokenize=True,
             do_subword_tokenize=True, word_tokenizer_type="basic",
             subword_tokenizer_type="wordpiece", never_split=None,
             unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]",
             cls_token="[CLS]", mask_token="[MASK]", mecab_kwargs=None, **kwargs):
    """Constructs a MecabBertTokenizer.

    Args:
        **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
        **do_lower_case**: (`optional`) boolean (default False)
            Whether to lower case the input.
            Only has an effect when do_basic_tokenize=True.
        **do_word_tokenize**: (`optional`) boolean (default True)
            Whether to do word tokenization.
        **do_subword_tokenize**: (`optional`) boolean (default True)
            Whether to do subword tokenization.
        **word_tokenizer_type**: (`optional`) string (default "basic")
            Type of word tokenizer.
        **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
            Type of subword tokenizer.
        **mecab_kwargs**: (`optional`) dict passed to `MecabTokenizer` constructor (default None)
    """
    super(BertTokenizer, self).__init__(
        unk_token='<unk>' if word_tokenizer_type == 'sp' else '[UNK]',
        sep_token=sep_token,
        pad_token=pad_token,
        cls_token=cls_token,
        mask_token=mask_token,
        do_lower_case=do_lower_case,
        do_word_tokenize=do_word_tokenize,
        do_subword_tokenize=do_subword_tokenize,
        word_tokenizer_type=word_tokenizer_type,
        subword_tokenizer_type=subword_tokenizer_type,
        never_split=never_split,
        mecab_kwargs=mecab_kwargs,
        **kwargs,
    )
    # ^^ We call the grandparent's init, not the parent's.

    if not os.path.isfile(vocab_file):
        raise ValueError(
            "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
            "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))

    self.vocab = load_vocab(vocab_file)
    self.ids_to_tokens = collections.OrderedDict(
        [(ids, tok) for tok, ids in self.vocab.items()])

    self.do_word_tokenize = do_word_tokenize
    self.word_tokenizer_type = word_tokenizer_type
    self.lower_case = do_lower_case
    self.never_split = never_split
    self.mecab_kwargs = copy.deepcopy(mecab_kwargs)
    if do_word_tokenize:
        if word_tokenizer_type == "basic":
            self.word_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case, never_split=never_split,
                tokenize_chinese_chars=False)
        elif word_tokenizer_type == "mecab":
            self.word_tokenizer = MecabTokenizer(
                do_lower_case=do_lower_case, never_split=never_split,
                **(mecab_kwargs or {}))
        elif word_tokenizer_type == "sp":
            path_vocab = Path(vocab_file)
            self.word_tokenizer = SentencePiecepTokenizer(
                model_file=str(path_vocab.parent / path_vocab.stem) + '.model',
                do_lower_case=do_lower_case, never_split=never_split,
                **(mecab_kwargs or {}))
        else:
            raise ValueError(
                "Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))

    self.do_subword_tokenize = do_subword_tokenize
    self.subword_tokenizer_type = subword_tokenizer_type
    if do_subword_tokenize:
        if subword_tokenizer_type == "wordpiece":
            self.subword_tokenizer = WordpieceTokenizer(
                vocab=self.vocab, unk_token=self.unk_token)
        elif subword_tokenizer_type == "character":
            self.subword_tokenizer = CharacterTokenizer(
                vocab=self.vocab, unk_token=self.unk_token)
        else:
            raise ValueError(
                "Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))