def get_dataset_and_collater(vocab_path, merges_path, data_path, seq_len):
    tokenizer = get_seq_tokenizer(vocab_path, merges_path)
    dataset = transformers.LineByLineTextDataset(tokenizer=tokenizer,
                                                 file_path=data_path,
                                                 block_size=seq_len)
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False)
    return dataset, data_collator
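A minimal usage sketch (not part of the original code): it assumes get_seq_tokenizer and the placeholder file paths exist, and wires the dataset/collator pair returned above into a transformers.Trainer for causal-LM training.

import transformers

# Hypothetical paths; replace with your tokenizer files and training corpus.
dataset, collator = get_dataset_and_collater(
    "tokenizer/vocab.json", "tokenizer/merges.txt", "data/train.txt", seq_len=128)

# Any causal-LM checkpoint works with mlm=False collation; "gpt2" is just an example.
model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
trainer = transformers.Trainer(
    model=model,
    args=transformers.TrainingArguments(output_dir="out/causal_lm",
                                        per_device_train_batch_size=8,
                                        num_train_epochs=1),
    data_collator=collator,
    train_dataset=dataset,
)
trainer.train()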
def train_bert(corpus_path, hebrew_model=False):
    """
    BERT model training.
    :param corpus_path: Corpus to train BERT on
    :param hebrew_model: Whether the model is in Hebrew or not
    :return: The path of the newly trained model
    """
    language = 'hebrew' if hebrew_model else 'english'
    df = pd.read_csv(corpus_path)
    corpus_name = get_corpus_name(corpus_path)

    print("Preprocessing...")
    if hebrew_model:
        model_name, vocab, raw_text_file = preprocess_hebrew(df, corpus_name)
    else:
        model_name, vocab, raw_text_file = preprocess_english(df, corpus_name)

    print("CUDA availability:", torch.cuda.is_available())

    print("Getting tokenizer...")
    tokenizer = transformers.AutoTokenizer.from_pretrained(conf.bert_model[language], use_fast=True)
    model = transformers.AutoModelForMaskedLM.from_pretrained(conf.bert_model[language]).to('cuda')
    tokenizer.add_tokens(vocab)
    model.resize_token_embeddings(len(tokenizer))

    output_dir = conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.mkdir(output_dir)
    tokenizer.save_pretrained(output_dir)

    print("Tokenizing...")
    dataset = transformers.LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=raw_text_file,
        block_size=128,
    )
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
    training_args = transformers.TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=20,
        per_device_train_batch_size=16,
        save_steps=300,
        logging_steps=100,
        save_total_limit=3,
    )
    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    print("Begin training...")
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    trainer.train()
    trainer.save_model(output_dir)
    print('The model has been saved under:', output_dir)
    return output_dir
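A hedged example of invoking train_bert; the CSV path is a placeholder, and the call relies on the conf module and the preprocessing helpers referenced above.

# Hypothetical call; assumes a CSV corpus in the format expected by
# preprocess_hebrew / preprocess_english and a configured `conf` module.
model_dir = train_bert("data/hebrew_corpus.csv", hebrew_model=True)
print("Fine-tuned BERT saved under:", model_dir)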
def build_datasets(self) -> Union[datasets.Dataset, datasets.DatasetDict]:
    column_names = self.raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if self.data_config.max_seq_length is None:
        max_seq_length = self.tokenizer.model_max_length
        if max_seq_length > 1024:
            self.logger.warning(
                "The tokenizer picked seems to have a very large `model_max_length` "
                f"({self.tokenizer.model_max_length}). Using 1024 instead. You can change "
                "that default value by setting max_seq_length in the experiment config."
            )
            max_seq_length = 1024
    else:
        if self.data_config.max_seq_length > self.tokenizer.model_max_length:
            self.logger.warning(
                f"The max_seq_length passed ({self.data_config.max_seq_length}) is larger "
                f"than the maximum length for the model ({self.tokenizer.model_max_length}). "
                f"Using max_seq_length={self.tokenizer.model_max_length}.")
        max_seq_length = min(self.data_config.max_seq_length, self.tokenizer.model_max_length)

    # We cannot use self.tokenizer as a non-local variable in the tokenize_function if we want
    # map to be able to cache the output of the tokenizer. Hence, the tokenize_function takes
    # a tokenizer explicitly as an input and we create a closure using functools.partial.
    if self.data_config.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if self.data_config.pad_to_max_length else False

        def tokenize_function(tokenizer, padding, max_seq_length, examples):
            # Remove empty lines
            examples["text"] = [
                line for line in examples["text"]
                if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(
                examples["text"],
                padding=padding,
                truncation=True,
                max_length=max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is
                # more efficient when it receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
            )

        tokenized_datasets = self.raw_datasets.map(
            functools.partial(tokenize_function, self.tokenizer, padding, max_seq_length),
            batched=True,
            num_proc=self.data_config.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not self.data_config.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting
        # them in smaller parts. We use `return_special_tokens_mask=True` because
        # DataCollatorForLanguageModeling (see below) is more efficient when it receives
        # the `special_tokens_mask`.
        def tokenize_function(tokenizer, examples):
            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

        tokenized_datasets = self.raw_datasets.map(
            functools.partial(tokenize_function, self.tokenizer),
            batched=True,
            num_proc=self.data_config.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not self.data_config.overwrite_cache,
        )

        # Main data processing function that will concatenate all texts from our dataset and
        # generate chunks of max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {
                k: sum(examples[k], []) for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it
            # instead of this drop, you can customize this part to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [
                    t[i:i + max_seq_length]
                    for i in range(0, total_length, max_seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so
        # group_texts throws away a remainder for each of those groups of 1,000 texts.
        # You can adjust that batch_size here but a higher value might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map
        # method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=self.data_config.preprocessing_num_workers,
            load_from_cache_file=not self.data_config.overwrite_cache,
        )

    for _, data in tokenized_datasets.items():
        hf.remove_unused_columns(self.model, data)

    self.collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=self.tokenizer, mlm_probability=self.data_config.mlm_probability)
    return tokenized_datasets
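A sketch of how build_datasets might be consumed, assuming `trial` is an instance of the class that defines it (i.e. it carries the raw_datasets, tokenizer, data_config, model and logger attributes used above); the tokenized splits plus the collator it sets can feed a PyTorch DataLoader to produce masked-LM batches.

import torch

tokenized = trial.build_datasets()
train_loader = torch.utils.data.DataLoader(
    tokenized["train"],
    batch_size=16,
    collate_fn=trial.collator,  # DataCollatorForLanguageModeling set by build_datasets
)
batch = next(iter(train_loader))  # dict with input_ids, attention_mask, labels (masked)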
import argparse

import transformers

parser = argparse.ArgumentParser()
parser.add_argument('--vocab', type=str)
parser.add_argument('--model', type=str)
parser.add_argument('--data', type=str)
args = parser.parse_args()

tokenizer = transformers.BertTokenizer(vocab_file=args.vocab,
                                       do_lower_case=False,
                                       do_basic_tokenize=True)
model = transformers.BertForMaskedLM.from_pretrained(args.model)

dataset = transformers.LineByLineTextDataset(tokenizer=tokenizer,
                                             file_path=args.data,
                                             block_size=128)
data_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

train_args = transformers.TrainingArguments(
    per_device_eval_batch_size=16,
    output_dir=f"/tmp/echau18/{args.model}")
trainer = transformers.Trainer(model=model,
                               eval_dataset=dataset,
                               data_collator=data_collator,
                               prediction_loss_only=True,
                               args=train_args)

eval_output = trainer.evaluate()
print(eval_output)
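Since trainer.evaluate() reports the average loss under the key "eval_loss", a perplexity figure can be derived from the printed output, mirroring the convention used in main() below.

import math

# eval_output is the dict returned by trainer.evaluate() above.
print("perplexity:", math.exp(eval_output["eval_loss"]))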
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = transformers.HfArgumentParser(
        (ModelArguments, DataTrainingArguments, transformers.TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to "
            "--eval_data_file or remove the --do_eval argument.")
    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    transformers.set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = transformers.AutoConfig.from_pretrained(
            model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = transformers.AutoConfig.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = transformers.CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you "
            "can do it from another script, save it, and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = transformers.AutoModelForPreTraining.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = transformers.AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be "
            "run using the --mlm flag (masked language modeling).")
    if data_args.block_size <= 0:
        # Our input block size will be the max possible for the model
        data_args.block_size = tokenizer.max_len
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = get_dataset(
        data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(
        data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
        mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
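A hypothetical programmatic invocation of main(); the script name and file paths are placeholders, and only flags referenced in the code above are passed (--mlm is assumed to be a boolean field of DataTrainingArguments, which is not shown here).

import sys

sys.argv = [
    "run_language_modeling.py",                  # placeholder script name
    "--model_name_or_path", "bert-base-uncased",
    "--mlm",                                     # required for BERT-like models (see check above)
    "--do_eval",
    "--eval_data_file", "data/eval.txt",
    "--output_dir", "out/mlm",
]
results = main()
print(results)  # e.g. {"perplexity": ...}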