def test_plm(self):
    tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
    data_collator = DataCollatorForPermutationLanguageModeling(tokenizer)
    # ^ permutation lm

    dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512)
    examples = [dataset[i] for i in range(len(dataset))]
    batch = data_collator(examples)
    self.assertIsInstance(batch, dict)
    self.assertEqual(batch["input_ids"].shape, torch.Size((31, 112)))
    self.assertEqual(batch["perm_mask"].shape, torch.Size((31, 112, 112)))
    self.assertEqual(batch["target_mapping"].shape, torch.Size((31, 112, 112)))
    self.assertEqual(batch["labels"].shape, torch.Size((31, 112)))

    dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True)
    examples = [dataset[i] for i in range(len(dataset))]
    batch = data_collator(examples)
    self.assertIsInstance(batch, dict)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
    self.assertEqual(batch["perm_mask"].shape, torch.Size((2, 512, 512)))
    self.assertEqual(batch["target_mapping"].shape, torch.Size((2, 512, 512)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 512)))

    example = [torch.randint(5, [5])]
    with self.assertRaises(ValueError):
        # Expect error due to odd sequence length
        data_collator(example)
def test_plm(self):
    tokenizer = BertTokenizer(self.vocab_file)
    no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
    pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]

    data_collator = DataCollatorForPermutationLanguageModeling(tokenizer)

    batch = data_collator(pad_features)
    self.assertIsInstance(batch, dict)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["perm_mask"].shape, torch.Size((2, 10, 10)))
    self.assertEqual(batch["target_mapping"].shape, torch.Size((2, 10, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

    batch = data_collator(no_pad_features)
    self.assertIsInstance(batch, dict)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["perm_mask"].shape, torch.Size((2, 10, 10)))
    self.assertEqual(batch["target_mapping"].shape, torch.Size((2, 10, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

    example = [torch.randint(5, [5])]
    with self.assertRaises(ValueError):
        # Expect error due to odd sequence length
        data_collator(example)
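For reference, a minimal standalone sketch of the collator interface the two tests above exercise. It is a hedged example, not taken from the snippets: the tokenizer checkpoint and feature values are illustrative, and the keyword values shown are the library defaults.

# Minimal usage sketch for DataCollatorForPermutationLanguageModeling (illustrative, not from the snippets above).
from transformers import AutoTokenizer, DataCollatorForPermutationLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
collator = DataCollatorForPermutationLanguageModeling(
    tokenizer=tokenizer,
    plm_probability=1 / 6,   # library default: ratio of masked span length to its surrounding context
    max_span_length=5,       # library default: maximum length of a masked span
)

# Padded sequence lengths must be even, otherwise the collator raises ValueError.
features = [{"input_ids": list(range(10))}, {"input_ids": list(range(8))}]
batch = collator(features)

# The batch is a dict with the four tensors asserted in the tests above.
print(batch["input_ids"].shape)       # (2, 10) after padding
print(batch["perm_mask"].shape)       # (2, 10, 10)
print(batch["target_mapping"].shape)  # (2, 10, 10)
print(batch["labels"].shape)          # (2, 10)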
def main(train_epoch, batch_size, seq_length, lr, corpus_path, vocab_path, config_path,
         pretrain_model_path, output_record_path, model_save_path):
    seed_everything(997)
    num_train_epochs = train_epoch
    pretrain_batch_size = batch_size
    seq_length = seq_length
    lr = lr
    corpus_path = corpus_path
    vocab_path = vocab_path
    config_path = config_path
    pretrain_model_path = pretrain_model_path
    output_record_path = output_record_path
    model_save_path = model_save_path

    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    # train_dataset = LineByLineTextDataset(block_size=128, file_path=corpus_path, tokenizer=tokenizer)
    # data = read_data(corpus_path, tokenizer)
    train_dataset = OppoDataset(train_file_path=corpus_path, tokenizer=tokenizer, maxlen=128)
    data_collator = DataCollatorForPermutationLanguageModeling(tokenizer=tokenizer)

    config = XLNetConfig.from_pretrained(pretrained_model_name_or_path=config_path)
    # model = XLNetForMaskedLM(config=config, name='./xlnet_model/pytorch_model.bin')
    if os.path.exists(pretrain_model_path):
        model = XLNetLMHeadModel.from_pretrained(pretrain_model_path, config=config)
    else:
        model = XLNetLMHeadModel(config=config)
    # data_collator = Collator(max_seq_len=seq_length, tokenizer=tokenizer, mlm_probability=0.15)

    training_args = TrainingArguments(
        output_dir=output_record_path,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        learning_rate=lr,
        dataloader_num_workers=8,
        prediction_loss_only=True,
        fp16=True,
        fp16_backend='amp',
        per_device_train_batch_size=pretrain_batch_size,
        save_strategy='no',
        seed=997,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    trainer.train()
    trainer.save_model(model_save_path)
def loaders(self):
    if self._loaders is None:
        ps = self.params
        c = DataCollatorForPermutationLanguageModeling(
            self.tokenizer,
            plm_probability=ps.plm_probability,
            max_span_length=ps.max_span_length,
        )
        t = DataLoader(
            self.train_ds, shuffle=True, collate_fn=c, batch_size=ps.train_batch_size
        )
        e = DataLoader(self.eval_ds, collate_fn=c, batch_size=ps.eval_batch_size)
        self._loaders = {TRAIN: t, EVAL: e}
    return self._loaders
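The method above caches a pair of DataLoaders that use the collator as `collate_fn`. A hedged sketch of the same pattern outside a class follows; the toy dataset, batch size, and probability values are illustrative assumptions, not taken from the snippet.

# Hedged sketch: using the collator as a DataLoader collate_fn (names and values are illustrative).
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorForPermutationLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
collator = DataCollatorForPermutationLanguageModeling(
    tokenizer, plm_probability=1 / 6, max_span_length=5
)

# Any dataset whose items look like {"input_ids": [...]} with an even padded length will do.
toy_dataset = [{"input_ids": list(range(10))} for _ in range(8)]

loader = DataLoader(toy_dataset, batch_size=4, shuffle=True, collate_fn=collator)
for batch in loader:
    # Each batch carries input_ids, perm_mask, target_mapping and labels, ready for an XLNet LM head.
    print({k: v.shape for k, v in batch.items()})
    break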
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name"
        )

    # Dirty hack to add NK vocab to our tokenizer
    # From: https://github.com/deepset-ai/FARM/issues/157
    from collections import OrderedDict
    from transformers import BertTokenizer, WordpieceTokenizer

    with open('jobert-vocab.txt', 'r', encoding='utf8') as fp:
        vocab = fp.read().splitlines()
    tokens_to_add = [
        token for token in vocab
        if not (token in tokenizer.vocab or token in tokenizer.all_special_tokens)
    ]
    tokenizer.vocab = OrderedDict([
        *tokenizer.vocab.items(),
        *[(token, i + len(tokenizer.vocab)) for i, token in enumerate(tokens_to_add)],
    ])
    tokenizer.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in tokenizer.vocab.items()])
    tokenizer.wordpiece_tokenizer = WordpieceTokenizer(vocab=tokenizer.vocab, unk_token=tokenizer.unk_token)

    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the"
            "--mlm flag (masked language modeling)."
        )

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len  # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir) if training_args.do_train else None
    )
    eval_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, evaluate=True, cache_dir=model_args.cache_dir)
        if training_args.do_eval
        else None
    )
    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
        )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
    else:
        config = XLNetConfig()
        logger.warning("You are instantiating a new config instance from scratch.")

    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = XLNetLMHeadModel.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model from scratch")
        model = XLNetLMHeadModel.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines
            examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
            return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length)

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not data_args.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name])

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_train:
        if "train" not in tokenized_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = tokenized_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))

    if training_args.do_eval:
        if "validation" not in tokenized_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = tokenized_datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))

    # Data collator
    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer,
        plm_probability=data_args.plm_probability,
        max_span_length=data_args.max_span_length,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()

        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
        perplexity = math.exp(metrics["eval_loss"])
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    if training_args.push_to_hub:
        kwargs = {"finetuned_from": model_args.model_name_or_path, "tags": "language-modeling"}
        if data_args.dataset_name is not None:
            kwargs["dataset_tags"] = data_args.dataset_name
            if data_args.dataset_config_name is not None:
                kwargs["dataset_args"] = data_args.dataset_config_name
                kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
            else:
                kwargs["dataset"] = data_args.dataset_name

        trainer.push_to_hub(**kwargs)
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, AdapterArguments))
    model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name"
        )

    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Setup adapters
    if adapter_args.train_adapter:
        language = adapter_args.language
        if not language:
            raise ValueError("--language flag must be set when training an adapter")
        # check if language adapter already exists, otherwise add it
        if language not in model.config.adapters.adapter_list(AdapterType.text_lang):
            # resolve the adapter config
            adapter_config = AdapterConfig.load(
                adapter_args.adapter_config,
                non_linearity=adapter_args.adapter_non_linearity,
                reduction_factor=adapter_args.adapter_reduction_factor,
            )
            # load a pre-trained adapter from the Hub if specified
            if adapter_args.load_adapter:
                model.load_adapter(
                    adapter_args.load_adapter,
                    AdapterType.text_lang,
                    config=adapter_config,
                    load_as=language,
                )
            # otherwise, add a fresh adapter
            else:
                model.add_adapter(language, AdapterType.text_lang, config=adapter_config)
        # Freeze all model weights except those of this adapter & use this adapter in every forward pass
        model.train_adapter([language])

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the"
            "--mlm flag (masked language modeling)."
        )

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len  # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir) if training_args.do_train else None
    )
    eval_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, evaluate=True, cache_dir=model_args.cache_dir)
        if training_args.do_eval
        else None
    )
    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
        )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
        do_save_full_model=not adapter_args.train_adapter,
        do_save_adapters=adapter_args.train_adapter,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
def main():
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    print("Config before overwrite max_position_embeddings:", config)
    config.max_position_embeddings = 4096
    config.num_hidden_layers = 6
    config.num_attention_heads = 8
    config.hidden_size = 512
    config.intermediate_size = 2048
    print("Config after overwrite max_position_embeddings:", config)

    # if model_args.tokenizer_name:
    #     tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    # elif model_args.model_name_or_path:
    #     tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    # else:
    #     raise ValueError(
    #         "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
    #         "and load it from here, using --tokenizer_name"
    #     )
    logging.info("Loading tokenizer")
    if model_args.tokenizer_name:
        tokenizer = BertTokenizerFast(model_args.tokenizer_name, clean_text=True, lowercase=False, strip_accents=True)
    else:
        raise ValueError("Specify tokenizer name")

    logging.info("Loading model")
    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    logging.info("Resizing embeddings")
    model.resize_token_embeddings(len(tokenizer))
    print(len(tokenizer.get_vocab()), len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the"
            "--mlm flag (masked language modeling)."
        )

    # Get datasets
    logging.info("Loading train dataset")
    train_dataset = get_dataset(data_args) if training_args.do_train else None
    logging.info("Loading eval dataset")
    eval_dataset = get_dataset(data_args, evaluate=True) if training_args.do_eval else None

    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=data_args.mlm,
            mlm_probability=data_args.mlm_probability,
        )

    # Initialize our Trainer
    logging.info("Initializing trainer")
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        logging.info("Training")
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
def main():
    from transformers import XLNetConfig

    config = XLNetConfig(
        vocab_size=21_128,
        d_model=768,
        n_head=12,
        n_layer=6,
    )

    from transformers import XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained("./model/spbpe", max_len=512)

    from transformers import XLNetLMHeadModel

    model = XLNetLMHeadModel(config=config)
    model.resize_token_embeddings(len(tokenizer))
    print(model.num_parameters())

    from transformers import LineByLineTextDataset

    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path="./data/data_train.csv",
        block_size=128,
    )

    max_seq_length = 512

    from transformers import DataCollatorForPermutationLanguageModeling

    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer, plm_probability=1.0 / 6, max_span_length=5
    )

    from transformers import Trainer, TrainingArguments

    training_args = TrainingArguments(
        output_dir="./model/xlnet_v1",
        overwrite_output_dir=True,
        num_train_epochs=5,
        per_gpu_train_batch_size=32,
        save_steps=10_000,
        save_total_limit=2,
        tpu_num_cores=8,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=True,
    )
    trainer.train()

    if trainer.is_world_master():
        trainer.save_model("./model/spbpe")
    print('FIN')
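For context, a hedged sketch of how the collator's outputs feed an XLNet LM head in a single forward pass; this is not part of the script above, and the tiny config values, the toy token ids, and the `xlnet-base-cased` tokenizer are illustrative assumptions.

# Hedged sketch: one forward pass of XLNetLMHeadModel on a collated PLM batch (values are illustrative).
from transformers import XLNetConfig, XLNetLMHeadModel, XLNetTokenizer, DataCollatorForPermutationLanguageModeling

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
config = XLNetConfig(vocab_size=tokenizer.vocab_size, d_model=128, n_head=4, n_layer=2)
model = XLNetLMHeadModel(config=config)

collator = DataCollatorForPermutationLanguageModeling(tokenizer=tokenizer, plm_probability=1 / 6, max_span_length=5)
batch = collator([{"input_ids": list(range(10, 26))} for _ in range(2)])  # even-length toy sequences

# perm_mask and target_mapping drive XLNet's two-stream attention; labels (with -100 on unmasked
# positions) give the permutation-LM loss, which is what the Trainer optimizes in the script above.
outputs = model(
    input_ids=batch["input_ids"],
    perm_mask=batch["perm_mask"],
    target_mapping=batch["target_mapping"],
    labels=batch["labels"],
)
print(outputs.loss)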
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )

    if os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train:
        if not training_args.overwrite_output_dir:
            ckt = os.listdir(training_args.output_dir)
            ckt.sort(key=lambda x: int(x.split('-')[-1]))
            model_args.model_name_or_path = os.path.join(training_args.output_dir, ckt[-1])
            logger.info(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Training from checkpoint %s.",
                model_args.model_name_or_path,
            )
        else:
            pass

    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.model_type in ["lecbert"]:
        from lecbert import LecbertConfig as AutoConfig
        from lecbert import LecbertTokenizer as AutoTokenizer
        from lecbert import LecbertForPreTraining as AutoModelForPreTraining
    else:
        from transformers import AutoConfig, AutoTokenizer, AutoModelForPreTraining

    if model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    elif model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    elif model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir, config=config)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name"
        )

    if model_args.model_name_or_path:
        model = AutoModelForPreTraining.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        if model_args.model_type == "lecbert":
            model = AutoModelForPreTraining(config)
        else:
            model = AutoModelForPreTraining.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the"
            "--mlm flag (masked language modeling)."
        )

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.model_max_length  # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.model_max_length)

    # Get datasets
    train_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir) if training_args.do_train else None
    )
    eval_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, evaluate=True, cache_dir=model_args.cache_dir)
        if training_args.do_eval
        else None
    )
    if model_args.model_type == "lecbert":
        data_collator = DataCollatorForLEC(
            tokenizer=tokenizer,
            mlm=data_args.mlm,
            mlm_probability=data_args.mlm_probability,
            block_size=data_args.block_size,
        )
    elif model_args.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
        )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model(training_args.output_dir)
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name"
        )

    if tokenizer.pad_token_id is None:
        if model_args.force_pad_token:
            # See PR 3388. Some tokenizers don't have pad tokens, which causes errors at the encoding step in the collate_fn.
            # We give here the option to force the addition of a pad token. The attention mask is used to ignore this token
            # when feeding to the model.
            tokenizer.add_special_tokens({"pad_token": "<pad>"})
        else:
            logger.warning(
                "Attempting to train a model whose tokenizer has no padding token. This may result in errors in the encoding step. Set the --force_pad_token flag to fix this."
            )

    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    special_tokens_dict = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>'}
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the"
            "--mlm flag (masked language modeling)."
        )

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len  # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None

    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
        )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "fasta":
            FASTA_DATASET = True
            datasets = load_dataset_fasta(data_files, data_args.max_seq_length)
        else:
            if extension == "txt":
                extension = "text"
            datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = XLNetConfig()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    elif model_args.model_name_or_path:
        tokenizer = XLNetTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = XLNetLMHeadModel.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = XLNetLMHeadModel.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    tokenized_datasets = dict()
    for dataset_key, dataset in datasets.items():
        # Tokenize
        encodings = tokenizer(
            dataset['sequences'],
            truncation=True,
            padding='max_length',  # TODO get from args passed in
            max_length=data_args.max_seq_length,
            return_special_tokens_mask=True,
            return_token_type_ids=False,
            return_attention_mask=False,
        )
        torch_dataset = FastaDataset(encodings)
        tokenized_datasets[dataset_key] = torch_dataset

    # Data collator
    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer,
        plm_probability=data_args.plm_probability,
        max_span_length=data_args.max_span_length,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if (model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path))
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_plm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    training_args.do_train = True
    # training_args.do_eval = True
    data_args.mlm = True

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    training_args.output_dir = os.environ["SM_OUTPUT_DATA_DIR"]
    input_path = os.environ["SM_CHANNEL_TRAINING"]
    folder = "wikitext-2-raw-v1"
    cmd = "tar xzf %s -C %s" % (input_path + "/" + folder + ".tar.gz", input_path)
    assert os.system(cmd) == 0
    data_folder = f"{input_path}/{folder}"
    data_args.train_data_file = f"{data_folder}/wikitext-2-raw/{data_args.train_data_file}"
    data_args.eval_data_file = f"{data_folder}/wikitext-2-raw/{data_args.eval_data_file}"
    checkpoint_path = training_args.output_dir + "/checkpoints"

    project_name = "language-modeling"
    os.environ["WANDB_PROJECT"] = project_name
    wandb.init(
        project=project_name
    )  # TODO(tilo): is this really necessary? should be done by ML-library (here transformers)
    assert wandb.api.api_key is not None
    assert is_wandb_available()  # TODO(tilo): somehow I had issues, which seem to have resolved themselves

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name"
        )

    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the"
            "--mlm flag (masked language modeling)."
        )

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len  # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir) if training_args.do_train else None
    )
    eval_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, evaluate=True, cache_dir=model_args.cache_dir)
        if training_args.do_eval
        else None
    )
    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
        )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) # neuralmind/bert-base-portuguese-cased # bert-base-cased # bert-base-multilingual-cased model_name_or_path = "scripts/tmp/bert-base-multilingual-cased-4098" if len(sys.argv) >= 2: model_name_or_path = sys.argv[1] if model_name_or_path == "": model_name_or_path = "bert-base-multilingual-cased" model_args, data_args, training_args = parser.parse_args_into_dataclasses( args=[ "--output_dir", "output", "--model_type", "bert", "--model_name_or_path", model_name_or_path, "--do_eval", "--mlm", "--line_by_line" ]) #train_dataset_path = os.path.join(os.path.dirname(__file__), "wikiportuguese_line_by_line", "wiki.train.raw") #test_dataset_path= os.path.join(os.path.dirname(__file__), "wikiportuguese_line_by_line", "wiki.test.raw") train_dataset_path = os.path.join(os.path.dirname(__file__), "wikiportuguese", "wiki.train.raw") test_dataset_path = os.path.join(os.path.dirname(__file__), "wikiportuguese", "wiki.test.raw") #train_dataset_path = os.path.join(os.path.dirname(__file__), "wiki103", "wiki.train.raw") #test_dataset_path= os.path.join(os.path.dirname(__file__), "wiki103", "wiki.test.raw") data_args.train_data_file = train_dataset_path data_args.eval_data_file = test_dataset_path if data_args.eval_data_file is None and training_args.do_eval: raise ValueError( "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " "or remove the --do_eval argument.") if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: raise ValueError( "You are instantiating a new tokenizer from scratch. 
This is not supported, but you can do it from another script, save it," "and load it from here, using --tokenizer_name") if model_args.model_name_or_path: model = AutoModelWithLMHead.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) else: logger.info("Training new model from scratch") model = AutoModelWithLMHead.from_config(config) model.resize_token_embeddings(len(tokenizer)) if config.model_type in ["bert", "roberta", "distilbert", "camembert" ] and not data_args.mlm: raise ValueError( "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the" "--mlm flag (masked language modeling).") if data_args.block_size <= 0: data_args.block_size = tokenizer.max_len #data_args.block_size = 512 # Our input block size will be the max possible for the model else: data_args.block_size = min(data_args.block_size, tokenizer.max_len) # Get datasets train_dataset = get_dataset( data_args, tokenizer=tokenizer) if training_args.do_train else None eval_dataset = get_dataset( data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None if config.model_type == "xlnet": data_collator = DataCollatorForPermutationLanguageModeling( tokenizer=tokenizer, plm_probability=data_args.plm_probability, max_span_length=data_args.max_span_length, ) else: data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=eval_dataset, prediction_loss_only=True, ) # Training if training_args.do_train: model_path = (model_args.model_name_or_path if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path) else None) trainer.train(model_path=model_path) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") eval_output = trainer.evaluate() eval_loss = eval_output["eval_loss"] perplexity = math.exp(eval_loss) bcp = eval_loss / math.log(2) result = {"perplexity": perplexity, "bpc": bcp} output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) results.update(result) return results
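# --- Illustrative sketch (not part of the script above) ---
# How the two eval metrics written above relate to the Trainer output: "eval_loss"
# is the mean cross-entropy per token in nats, perplexity is its exponential, and
# dividing by ln(2) converts nats to bits. The script stores that last value under
# the key "bpc", but with a subword tokenizer it is really bits per *token*, not
# bits per character.
import math

eval_loss = 2.3                             # hypothetical mean cross-entropy (nats/token)
perplexity = math.exp(eval_loss)            # ~9.97
bits_per_token = eval_loss / math.log(2)    # ~3.32
print("perplexity=%.2f, bits/token=%.2f" % (perplexity, bits_per_token))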
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) hf_logging.set_verbosity_info() if data_args.eval_data_file is None and training_args.do_eval: raise ValueError( "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " "or remove the --do_eval argument.") if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): logger.info( f"Output dir ({training_args.output_dir}) is not empty, will try to reload from there." ) model_args.model_name_or_path = training_args.output_dir # raise ValueError( # f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." # ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: raise ValueError( "You are instantiating a new tokenizer from scratch. 
This is not supported, but you can do it from another script, save it," "and load it from here, using --tokenizer_name") if model_args.model_name_or_path: model = AutoModelWithLMHead.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) else: logger.info("Training new model from scratch") model = AutoModelWithLMHead.from_config(config) logger.info(model) num_params = sum(p.numel() for p in model.parameters()) logger.info('Model has %d parameters' % num_params) num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) logger.info('Model has %d trainable parameters' % num_params) # ADD special tokens tokenizer.pad_token = tokenizer.eos_token special_tokens_dict = { 'additional_special_tokens': ['<STORY>', '<QUERY>', '<PROOF>', '<ANSWER>'] } # NOTE: should also have added "ent_1", "ent_2", ..., "ent_20" :/ num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) logger.info(f'We have added {num_added_toks} tokens') ''' if tokenizer.pad_token_id is None and data_args.line_by_line: # See PR 3388. Some tokenizers don't had pad tokens which causes errors at the encoding step in the collate_fn. # We give here the option to force the addition of a pad token. The attention mask is used to ignore this token # when feeding to the model. # tokenizer.pad_token = tokenizer.eos_token num_added_toks = tokenizer.add_special_tokens({"pad_token": "<pad>"}) ''' model.resize_token_embeddings(len(tokenizer)) if config.model_type in ["bert", "roberta", "distilbert", "camembert" ] and not data_args.mlm: raise ValueError( "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the" "--mlm flag (masked language modeling).") if data_args.block_size <= 0: data_args.block_size = tokenizer.model_max_length # Our input block size will be the max possible for the model else: data_args.block_size = min(data_args.block_size, tokenizer.model_max_length) # Get datasets train_dataset = (get_dataset( data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir) if training_args.do_train else None) eval_dataset = (get_dataset(data_args, tokenizer=tokenizer, evaluate=True, cache_dir=model_args.cache_dir) if (training_args.do_eval or training_args.evaluate_during_training) else None) if config.model_type == "xlnet": data_collator = DataCollatorForPermutationLanguageModeling( tokenizer=tokenizer, plm_probability=data_args.plm_probability, max_span_length=data_args.max_span_length, ) else: data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer, ) # start by saving tokenizer so that we can restart training! # if trainer.is_world_master(): # tokenizer.save_pretrained(training_args.output_dir) results = {} # Training if training_args.do_train: model_path = (model_args.model_name_or_path if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path) else None) logger.info(f"model_path: {model_path}") if model_path is not None: # Grab the most recent checkpoint checkpoints_sorted = trainer._sorted_checkpoints(use_mtime=True) assert len(checkpoints_sorted) > 0 checkpoint_most_recent = checkpoints_sorted[-1] logger.info( f"most recent checkpoint: {checkpoint_most_recent}. 
setting model_path to this." ) # TODO: find a way to set: # - patience_best_eval_loss = None # - patience_evals_without_improvement = 0 # - patience_should_stop = False model_path = checkpoint_most_recent train_results = trainer.train(model_path=model_path, ) results["train_step"] = train_results.global_step results["train_loss"] = train_results.training_loss results["train_ppl"] = math.exp(train_results.training_loss) # trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) # if trainer.is_world_master(): # tokenizer.save_pretrained(training_args.output_dir) # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") eval_output = trainer.evaluate() results["valid_loss"] = eval_output["eval_loss"] results["valid_ppl"] = math.exp(eval_output["eval_loss"]) output_eval_file = os.path.join(training_args.output_dir, "results_lm.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** results *****") for key in sorted(results.keys()): logger.info(" %s = %s", key, str(results[key])) writer.write("%s = %s\n" % (key, str(results[key]))) return results
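# --- Illustrative sketch (not part of the script above) ---
# The resume logic above relies on the private Trainer._sorted_checkpoints helper.
# A standalone equivalent, assuming the Trainer's usual "checkpoint-<global_step>"
# subfolders inside output_dir, could look like this; the helper name is hypothetical.
import os
import re

def most_recent_checkpoint(output_dir):
    pattern = re.compile(r"^checkpoint-(\d+)$")
    found = []
    for name in os.listdir(output_dir):
        match = pattern.match(name)
        if match and os.path.isdir(os.path.join(output_dir, name)):
            found.append((int(match.group(1)), os.path.join(output_dir, name)))
    # Highest global step = newest checkpoint; None if nothing has been saved yet.
    return max(found)[1] if found else None

# Usage in the spirit of the script above:
# model_path = most_recent_checkpoint(training_args.output_dir) or model_args.model_name_or_path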
def main(): parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() training_args.disable_tqdm = False if data_args.eval_data_file is None and training_args.do_eval: raise ValueError( "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " "or remove the --do_eval argument." ) if ( os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir ): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use " f"--overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning("You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: raise ValueError( "You are instantiating a new tokenizer from scratch. 
This is not supported, but you can do it from " "another script, save it," "and load it from here, using --tokenizer_name" ) config._my_arg_tune_mode = model_args.tuning_mode # 0 means the regular token level objective, which is sum / output_len # 1 means the sentence level objective, which is sum # 2 means our buggy version which is sum/max_batch(input_len +output_len) # 3 means our buggy version which is sum/max_batch(output_len) # 4 means our buggy version which is sum/(input_len +output_len) config._objective_mode = model_args.objective_mode config._my_arg_task_mode = data_args.task_mode if model_args.tuning_mode in ['finetune', 'adaptertune', 'finetune-top']: print('objective is 0 because of finetune') elif model_args.tuning_mode == 'prefixtune': print('objective is {}'.format(config._objective_mode)) if model_args.tuning_mode == 'adaptertune': config.adapter_design = model_args.adapter_design config.bottleneck = model_args.adapter_bottleneck if model_args.model_name_or_path: config.return_dict = True model = GPT2LMHeadModelAdapter.from_pretrained( model_args.model_name_or_path, config=config, from_tf=bool(".ckpt" in model_args.model_name_or_path), cache_dir=model_args.cache_dir, ) else: logger.info("Training new model from scratch") model = AutoModelWithLMHead.from_config(config) else: if model_args.model_name_or_path: print(config.return_dict) config.return_dict = True model = GPT2LMHeadModel.from_pretrained( model_args.model_name_or_path, config=config, from_tf=bool(".ckpt" in model_args.model_name_or_path), cache_dir=model_args.cache_dir, ) else: logger.info("Training new model from scratch") model = AutoModelWithLMHead.from_config(config) if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm: raise ValueError( "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the" "--mlm flag (masked language modeling)." 
) if data_args.block_size <= 0: data_args.block_size = tokenizer.max_len # Our input block size will be the max possible for the model else: data_args.block_size = min(data_args.block_size, tokenizer.max_len) print(model_args.tuning_mode) print('adapting the size of the model embedding to include [PAD]') print('len(tokenizer) = ', len(tokenizer)) tokenizer.add_special_tokens({'pad_token': '[PAD]'}) model.resize_token_embeddings(len(tokenizer)) print('len(tokenizer) = ', len(tokenizer)) print(tokenizer.eos_token, tokenizer.eos_token_id) print(tokenizer.bos_token, tokenizer.bos_token_id) if model_args.tuning_mode == 'prefixtune': # prefixtune for param in model.base_model.parameters(): param.requires_grad = False gpt2 = model print('loading the prefix model from ', model_args.prefixModel_name_or_path) optim_prefix_bool: bool = model_args.optim_prefix.lower() == "yes" if model_args.prefixModel_name_or_path is not None: config2 = AutoConfig.from_pretrained(model_args.prefixModel_name_or_path, cache_dir=model_args.cache_dir) if model_args.prefix_mode == 'embedding': model = PrefixEmbTuning.from_pretrained( model_args.prefixModel_name_or_path, from_tf=bool(".ckpt" in model_args.prefixModel_name_or_path), config=config2, cache_dir=model_args.cache_dir, model_gpt2=gpt2, optim_prefix=optim_prefix_bool, preseqlen=model_args.preseqlen, use_infix=(data_args.format_mode == 'infix') ) elif model_args.prefix_mode == 'activation': model = PrefixTuning.from_pretrained( model_args.prefixModel_name_or_path, from_tf=bool(".ckpt" in model_args.prefixModel_name_or_path), config=config2, cache_dir=model_args.cache_dir, model_gpt2=gpt2, optim_prefix=optim_prefix_bool, preseqlen=model_args.preseqlen, use_infix=(data_args.format_mode == 'infix') ) else: assert False, "invalid prefix mode" else: # should clone the config and construct it. config_prefix = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) config_prefix._my_arg_tune_mode = model_args.tuning_mode config_prefix._my_arg_task_mode = data_args.task_mode config_prefix._my_arg_control = True config_prefix.train_weights = data_args.train_embs config_prefix.optim_prefix = optim_prefix_bool config_prefix.preseqlen = model_args.preseqlen config_prefix.use_infix = (data_args.format_mode == 'infix') config_prefix.format_mode = data_args.format_mode config_prefix.prefix_dropout = model_args.prefix_dropout config_prefix.vocab_size = len(tokenizer) config_prefix.lowdata = ('lowdata' in training_args.output_dir) if config_prefix.lowdata and data_args.use_lowdata_token == 'yes': config_prefix.lowdata_token = tokenizer([data_args.lowdata_token], add_prefix_space=True)['input_ids'] # return_tensors='np', print(data_args.lowdata_token) print(config_prefix.lowdata_token) # some extra stuff. config_prefix.init_random = model_args.init_random config_prefix.mid_dim = model_args.mid_dim print('training the prefix model from scratch. ') if model_args.prefix_mode == 'embedding': config_prefix.parametrize_emb = model_args.parametrize_emb model = PrefixEmbTuning(config_prefix, model_gpt2=gpt2) elif model_args.prefix_mode == 'activation': # TODO: Model is created here! print('model created here!') model = PrefixTuning(config_prefix, model_gpt2=gpt2) else: assert False, "invalid prefix mode" print('Not in dataless setting, loading the control code. 
') if 'sentiment' in training_args.output_dir: print('sentiment does need discri_labels') discri_labels = None elif 'classify-sentiment' in training_args.output_dir: print('classify-sentiment does need discri_labels') discri_labels = None elif 'classify-topic' in training_args.output_dir: print('classify-topic does need discri_labels') discri_labels = None elif 'sent' in training_args.output_dir: discri_labels = ['negative', 'positive'] elif 'topic' in training_args.output_dir: discri_labels = ['world', 'sports', 'business', 'science'] elif 'keyword' in training_args.output_dir: print('keyword is unbounded.') discri_labels = None elif 'embMatch' in training_args.output_dir: print('embMatch is unbounded.') discri_labels = None elif 'data2text' in training_args.output_dir: print('data2text does need discri_labels') discri_labels = None elif 'triples' in training_args.output_dir: print('triples does need discri_labels') discri_labels = None elif 'webnlg' in training_args.output_dir: print('triples does need discri_labels') discri_labels = None elif 'writingPrompts' in training_args.output_dir: print('writingPrompts does need discri_labels') discri_labels = None elif 'cnndm' in training_args.output_dir: print('cnndm does need discri_labels') discri_labels = None elif 'xsum' in training_args.output_dir: print('xsum does need discri_labels') discri_labels = None elif 'lemma2text' in training_args.output_dir: print('lemma2text does need discri_labels') discri_labels = None else: assert False, 'should have topic/sent in the file name' train_dataset = ( get_dataset(data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir, training_args=training_args, finetune_mode=(model_args.tuning_mode == 'finetune')) if training_args.do_train else None ) eval_dataset = ( get_dataset(data_args, tokenizer=tokenizer, evaluate=True, cache_dir=model_args.cache_dir, training_args=training_args, finetune_mode=(model_args.tuning_mode == 'finetune')) if training_args.do_eval else None ) if config.model_type == "xlnet": data_collator = DataCollatorForPermutationLanguageModeling( tokenizer=tokenizer, plm_probability=data_args.plm_probability, max_span_length=data_args.max_span_length, ) else: if data_args.task_mode == 'embMatch': data_collator = DataCollatorForEmbMatchLanguageModeling( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability ) elif data_args.task_mode == 'topic' or data_args.task_mode == 'sentiment': data_collator = DataCollatorForKeywordLanguageModeling( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability ) elif data_args.task_mode == 'classify-topic' or data_args.task_mode == 'classify-sentiment': data_collator = DataCollatorForClassificationSentimentLanguageModeling( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability ) elif data_args.task_mode == 'length': data_collator = DataCollatorForKeywordLanguageModeling( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability ) elif data_args.task_mode == 'keyword': data_collator = DataCollatorForKeywordLanguageModeling( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability ) elif data_args.task_mode == 'data2text' or data_args.task_mode == 'triples' or data_args.task_mode == \ 'webnlg': print('FORMAT MODE IS ', data_args.format_mode) data_collator = DataCollatorForData2TextLanguageModeling( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability, format_mode=data_args.format_mode ) elif 
data_args.task_mode == 'writingPrompts': print('FORMAT MODE IS ', data_args.format_mode) data_collator = DataCollatorForWritingPromptsLanguageModeling( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability, format_mode=data_args.format_mode ) elif data_args.task_mode == 'xsum' or data_args.task_mode == 'cnndm': print('FORMAT MODE IS ', data_args.format_mode) data_collator = DataCollatorForSumLanguageModeling( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability, format_mode=data_args.format_mode ) elif data_args.task_mode == 'lemma2text': data_collator = DataCollatorForData2TextLanguageModeling( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability ) elif data_args.task_mode == 'text2data': data_collator = DataCollatorForText2DataLanguageModeling( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability ) elif data_args.task_mode == 'gen_data': data_collator = DataCollatorForWeightedLanguageModeling( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability ) else: data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability ) if (model_args.tuning_mode == 'prefixtune'): if 'topic' in training_args.output_dir: discri_labels = ['world', 'sports', 'business', 'science'] elif 'sent' in training_args.output_dir: discri_labels = ['negative', 'positive'] trainer = Trainer_Prefix( model=model, tokenizer=tokenizer, discri_labels=discri_labels, model_gpt2=gpt2, args=training_args, prediction_loss_only=True, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=data_collator, task_mode=data_args.task_mode, use_dropout=(model_args.use_dropout == 'yes') ) else: raise ValueError(f"Unsupported tuning_mode: {model_args.tuning_mode}") # Training if training_args.do_train: model_path = ( model_args.model_name_or_path if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path) else None ) # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) trainer.train(model_path=model_path) trainer.save_model() # Evaluation results = {} if training_args.do_eval and not (data_args.dataless == 'yes'): logger.info("*** Evaluate ***") eval_output = trainer.evaluate() perplexity = math.exp(eval_output["eval_loss"]) result = {"perplexity": perplexity} output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) results.update(result) if data_args.task_mode == 'data2text': del model del trainer if model_args.tuning_mode == 'prefixtune' or model_args.tuning_mode == 'bothtune': del gpt2 torch.cuda.empty_cache() elem = os.path.abspath(training_args.output_dir) checkpoint_path = elem print('running evaluation on ', checkpoint_path) os.system('python ../text-generation/gen.py data2text yes yes {} no'.format(checkpoint_path)) if 'earlystop' in training_args.output_dir: elem = os.path.abspath(training_args.output_dir) checkpoint_path = glob.glob(os.path.join(elem, '*checkpoint*')) assert len(checkpoint_path) == 1 checkpoint_path = checkpoint_path[0] 
print('running early stopping evaluation on ', checkpoint_path) os.system('python ../text-generation/gen.py data2text yes yes {} no'.format(checkpoint_path)) elif data_args.task_mode == 'webnlg': del model del trainer if model_args.tuning_mode == 'prefixtune': del gpt2 torch.cuda.empty_cache() elem = os.path.abspath(training_args.output_dir) checkpoint_path = elem print('running evaluation on ', checkpoint_path) os.system('python ../text-generation/gen.py webnlg yes yes {} no'.format(checkpoint_path)) # also run for early stopping: if 'earlystop' in training_args.output_dir: elem = os.path.abspath(training_args.output_dir) checkpoint_path = glob.glob(os.path.join(elem, '*checkpoint*')) assert len(checkpoint_path) == 1 checkpoint_path = checkpoint_path[0] print('running early stopping evaluation on ', checkpoint_path) os.system('python ../text-generation/gen.py webnlg yes yes {} no'.format(checkpoint_path)) elif data_args.task_mode == 'triples': del model del trainer if model_args.tuning_mode == 'prefixtune': del gpt2 torch.cuda.empty_cache() elem = os.path.abspath(training_args.output_dir) checkpoint_path = elem print('running evaluation on ', checkpoint_path) os.system('python ../text-generation/gen.py triples yes yes {} no'.format(checkpoint_path)) if 'earlystop' in training_args.output_dir: elem = os.path.abspath(training_args.output_dir) checkpoint_path = glob.glob(os.path.join(elem, '*checkpoint*')) assert len(checkpoint_path) == 1 checkpoint_path = checkpoint_path[0] print('running early stopping evaluation on ', checkpoint_path) os.system('python ../text-generation/gen.py triples yes yes {} no'.format(checkpoint_path)) return results
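# --- Illustrative sketch (not part of the script above) ---
# The core pattern behind the "prefixtune" branch above: the whole GPT-2 model is
# frozen and only a small prefix module receives gradients. PrefixTuning and
# PrefixEmbTuning are this project's own classes; the toy prefix parameter below
# only illustrates the idea and is not their implementation.
import torch
from transformers import GPT2LMHeadModel

gpt2 = GPT2LMHeadModel.from_pretrained("gpt2")
for param in gpt2.base_model.parameters():
    param.requires_grad = False  # freeze every GPT-2 weight (lm_head is tied to the embeddings)

# A toy "prefix": preseqlen learnable vectors that a real prefix model would expand
# into per-layer past key/values prepended to every input.
preseqlen = 5
prefix = torch.nn.Parameter(torch.randn(preseqlen, gpt2.config.n_embd))

trainable = sum(p.numel() for p in gpt2.parameters() if p.requires_grad) + prefix.numel()
total = sum(p.numel() for p in gpt2.parameters()) + prefix.numel()
print("trainable parameters: %d of %d" % (trainable, total))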