def __init__(self, model_spec, model_dir, device):
  """Constructor for HfModel class.

  Args:
    model_spec: A str to pass into the `pretrained_model_name_or_path` argument
      of `transformers.T5ForConditionalGeneration.from_pretrained` (e.g.
      `"t5-base"` or a path to a previously trained model) or an instance of
      the `transformers.configuration_t5.T5Config` class to use to directly
      construct the `transformers.T5ForConditionalGeneration` object.
    model_dir: str, directory to save and load model checkpoints.
    device: `torch.device` on which the model should be run.
  """
  # We have to import transformers here because it has a side effect of
  # creating a TensorFlow graph, which prevents eager execution from being
  # enabled in files that import hf_model.py
  import transformers  # pylint: disable=import-outside-toplevel,g-import-not-at-top

  if isinstance(model_spec, str):
    self._model = transformers.T5ForConditionalGeneration.from_pretrained(
        model_spec
    )
  elif isinstance(model_spec, transformers.T5Config):
    self._model = transformers.T5ForConditionalGeneration(model_spec)
  else:
    raise ValueError("model_spec should be a string or T5Config.")

  tf.io.gfile.makedirs(model_dir)
  self._writer = torch.utils.tensorboard.writer.SummaryWriter(model_dir)
  self._model_dir = model_dir
  self._device = device

  if self._device.type == "cuda":
    self._model.cuda()
  self._step = 0
  self.load_latest_checkpoint()
  self.to_tensor = functools.partial(torch.as_tensor, device=self._device)
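# A minimal usage sketch, assuming the enclosing class is named `HfModel` as the
# docstring above states and that `torch` is imported at module level; the
# "t5-base" spec and the checkpoint directory are illustrative only.
#
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#   model = HfModel(
#       model_spec="t5-base",             # or a transformers.T5Config instance
#       model_dir="/tmp/t5_checkpoints",  # checkpoints + TensorBoard events go here
#       device=device,
#   )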
def init_model(args):
    # Load dataset, tokenizer, model from pretrained model/vocabulary
    ## Google's SentencePiece tokenizer
    tokenizer = transformers.T5Tokenizer.from_pretrained(args.tokenizer_path)
    special_tokens = ['<mask{}>'.format(d) for d in range(0, 100)]
    special_tokens += ['<unused{}>'.format(d) for d in range(0, 100)]
    special_tokens_dict = {
        'bos_token': '<s>',
        'sep_token': '<sep>',
        'cls_token': '<cls>',
        'mask_token': '<mask>',
        'additional_special_tokens': special_tokens
    }
    tokenizer.add_special_tokens(special_tokens_dict)

    if args.weights is None:
        # Build a randomly initialized model whose vocabulary matches the tokenizer.
        model = transformers.T5ForConditionalGeneration(
            transformers.T5Config(vocab_size=tokenizer.vocab_size))
    else:
        logging.info('Load {}.'.format(args.weights))
        model = transformers.T5ForConditionalGeneration.from_pretrained(
            args.weights)
        if model.config.vocab_size != tokenizer.vocab_size:
            logging.info('Resize embedding {} -> {}.'.format(
                model.config.vocab_size, tokenizer.vocab_size))
            model.resize_token_embeddings(tokenizer.vocab_size)
    model.eval()

    loss_func = MaskedCrossEntropyLoss()

    if torch.cuda.device_count() > 1:
        logging.info('Training in multi GPU mode using {} GPUs.'.format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    if torch.cuda.is_available():
        model.to('cuda')
        loss_func.to('cuda')

    return tokenizer, model, loss_func
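# A minimal usage sketch with hypothetical argument parsing; `tokenizer_path`
# and `weights` are the only attributes `init_model` reads, and "t5-base" is an
# illustrative tokenizer path.
#
#   import argparse
#
#   parser = argparse.ArgumentParser()
#   parser.add_argument('--tokenizer_path', default='t5-base')
#   parser.add_argument('--weights', default=None)  # path to pretrained weights, or None
#   args = parser.parse_args()
#   tokenizer, model, loss_func = init_model(args)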
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    accelerator = Accelerator()
    parser = transformers.HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    if accelerator.is_local_main_process:
        # Setup logging
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
            level=logging.INFO,
            datefmt="[%X]",
        )
        logger = logging.getLogger(__name__)
        # Set the verbosity to info of the Transformers logger (on main process only):
        logger.info(f"Training/evaluation parameters {training_args}")
        if not os.path.exists(training_args.output_dir):
            os.makedirs(training_args.output_dir)
            logger.info(f"Created output_dir at {training_args.output_dir}")

    # Set seed before initializing model.
    transformers.set_seed(training_args.seed)

    if data_args.dataset_pickle_path is not None:
        if accelerator.is_local_main_process:
            logger.info("Loading processed data from pickle file.")
        with open(data_args.dataset_pickle_path, "rb") as f:
            tokenized_datasets = pickle.load(f)
        if accelerator.is_local_main_process:
            logger.info("Done loading pickle data.")
    else:
        # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
        # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
        # (the dataset will be downloaded automatically from the datasets Hub).
        #
        # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
        # 'text' is found. You can easily tweak this behavior (see below).
        if data_args.dataset_name is not None:
            # Downloading and loading a dataset from the hub.
            datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
            if "validation" not in datasets.keys():
                datasets["validation"] = load_dataset(
                    data_args.dataset_name,
                    data_args.dataset_config_name,
                    split=f"train[:{data_args.validation_split_percentage}%]",
                    cache_dir=model_args.cache_dir,
                )
                datasets["train"] = load_dataset(
                    data_args.dataset_name,
                    data_args.dataset_config_name,
                    split=f"train[{data_args.validation_split_percentage}%:]",
                    cache_dir=model_args.cache_dir,
                )
        else:
            data_files = {}
            if data_args.train_file is not None:
                data_files["train"] = data_args.train_file
            if data_args.validation_file is not None:
                data_files["validation"] = data_args.validation_file
            extension = data_args.train_file.split(".")[-1]
            if extension == "txt":
                extension = "text"
            datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
            if "validation" not in datasets.keys():
                datasets["validation"] = load_dataset(
                    extension,
                    data_files=data_files,
                    split=f"train[:{data_args.validation_split_percentage}%]",
                    cache_dir=model_args.cache_dir,
                )
                datasets["train"] = load_dataset(
                    extension,
                    data_files=data_files,
                    split=f"train[{data_args.validation_split_percentage}%:]",
                    cache_dir=model_args.cache_dir,
                )
        # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
        # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    if model_args.tokenizer_name:
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    elif model_args.model_name_or_path:
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.config_name:
        config = transformers.T5Config.from_pretrained(
            model_args.config_name, cache_dir=model_args.cache_dir, vocab_size=len(tokenizer)
        )
        if model_args.model_type != "t5":
            raise NotImplementedError
        config.decoder_start_token_id = config.pad_token_id
    elif model_args.model_name_or_path:
        config = transformers.T5Config.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = transformers.CONFIG_MAPPING[model_args.model_type]()
        if accelerator.is_local_main_process:
            logger.warning("You are instantiating a new config instance from scratch.")

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token.
    # To ensure that the input length is `max_seq_length`, we need to increase the maximum length
    # according to `mlm_probability` and `mean_noise_span_length`. We can also define the label length accordingly.
    expanded_inputs_length, targets_length = compute_input_and_target_lengths(
        inputs_length=max_seq_length,
        noise_density=data_args.mlm_probability,
        mean_noise_span_length=data_args.mean_noise_span_length,
    )

    if data_args.dataset_pickle_path is None:
        if training_args.do_train:
            column_names = datasets["train"].column_names
        else:
            column_names = datasets["validation"].column_names
        text_column_name = "text" if "text" in column_names else column_names[0]

        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        # Since we make sure that all sequences are of the same length, no attention_mask is needed.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name], return_attention_mask=False, truncation=True)

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # expanded_inputs_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
            if total_length >= expanded_inputs_length:
                total_length = (total_length // expanded_inputs_length) * expanded_inputs_length
            # Split by chunks of max_len.
            result = {
                k: [t[i: i + expanded_inputs_length] for i in range(0, total_length, expanded_inputs_length)]
                for k, t in concatenated_examples.items()
            }
            return result
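        # Worked example of what group_texts does (assuming expanded_inputs_length == 8):
        # a batch whose concatenated token ids have length 19 is truncated to
        # 19 // 8 * 8 == 16 ids and split into two chunks of 8; the trailing
        # 3 ids are dropped.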
        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if accelerator.is_local_main_process:
        wandb.init(project="T5_Pretraining", entity="frostbyte")
        wandb.config.update(training_args)
        wandb.config.update(model_args)
        wandb.config.update(data_args)
        wandb.config.update(config.to_dict())

    # Initialize our training
    if model_args.model_name_or_path:
        model = transformers.T5ForConditionalGeneration.from_pretrained(
            model_args.model_name_or_path, config=config, seed=training_args.seed)
    else:
        config.vocab_size = len(tokenizer)
        model = transformers.T5ForConditionalGeneration(config)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": training_args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    if training_args.adafactor:
        optimizer = Adafactor(optimizer_grouped_parameters, lr=training_args.learning_rate,
                              scale_parameter=False, relative_step=False)
    else:
        optimizer = transformers.AdamW(
            optimizer_grouped_parameters,
            lr=training_args.learning_rate,
            betas=(training_args.adam_beta1, training_args.adam_beta2),
            eps=training_args.adam_epsilon
        )
    optimizer.zero_grad()

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForT5MLM(
        tokenizer=tokenizer,
        noise_density=data_args.mlm_probability,
        mean_noise_span_length=data_args.mean_noise_span_length,
        input_length=max_seq_length,
        target_length=targets_length,
        pad_token_id=model.config.pad_token_id,
        decoder_start_token_id=model.config.decoder_start_token_id,
    )

    # Store some constants
    num_epochs = int(training_args.num_train_epochs)
    train_batch_size = int(training_args.per_device_train_batch_size)
    eval_batch_size = int(training_args.per_device_eval_batch_size)

    train_loader = torch.utils.data.DataLoader(tokenized_datasets["train"], shuffle=True,
                                               collate_fn=data_collator, batch_size=train_batch_size)
    eval_loader = torch.utils.data.DataLoader(tokenized_datasets["validation"], shuffle=False,
                                              collate_fn=data_collator, batch_size=eval_batch_size)

    # num_train_steps = len(tokenized_datasets["train"]) // train_batch_size * num_epochs
    # scheduler = transformers.get_linear_schedule_with_warmup(optimizer, training_args.warmup_steps, num_train_steps)
    scheduler = NoamLR(optimizer, warmup_steps=training_args.warmup_steps)

    if model_args.model_resume_checkpoint is not None:
        if accelerator.is_local_main_process:
            logger.info("Resuming from checkpoint")
        checkpoint = torch.load(model_args.model_resume_checkpoint)
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        scheduler = checkpoint["scheduler"]
        resume_step = checkpoint["step"]
    else:
        resume_step = -1

    model, optimizer, train_loader, eval_loader = accelerator.prepare(model, optimizer, train_loader, eval_loader)

    # for epoch in range(num_epochs):
    assert num_epochs == 1
    epoch = 0

    # only the "total" since the last logging step
    total_train_loss = torch.tensor([0.0], device=accelerator.device, requires_grad=False)
    total_train_specialization_metric = torch.tensor([0.0], device=accelerator.device, requires_grad=False)
    total_num_examples = torch.tensor([0.0], device=accelerator.device, requires_grad=False)
    for step, batch in tqdm(enumerate(train_loader), desc="Training", total=len(train_loader),
                            disable=not accelerator.is_local_main_process):
        cur_step = epoch * len(train_loader) + step
        if cur_step <= resume_step:
            continue

        if cur_step % training_args.eval_steps == 0:  # and cur_step > 0:
            if (cur_step) % training_args.gradient_accumulation_steps != 0:
                if accelerator.is_local_main_process:
                    logger.info("Skipping evaluate because gradients are accumulated")
                continue
            eval_loss = torch.tensor([0.0], device=accelerator.device, requires_grad=False)
            eval_specialization_metric = torch.tensor([0.0], device=accelerator.device, requires_grad=False)
            eval_acc = torch.tensor([0.0], device=accelerator.device, requires_grad=False)
            model.eval()
            batch.to("cpu")
            for eval_batch in tqdm(eval_loader, desc="Evaluating", leave=False,
                                   disable=not accelerator.is_local_main_process):
                optimizer.zero_grad()
                loss, decoder_last_state, decoder_cache, decoder_states, decoder_attns, decoder_self_norms, \
                    decoder_cross_norms, encoder_last_state, encoder_states, encoder_attns, encoder_norms = \
                    model(**eval_batch, output_hidden_states=True, output_attentions=True, output_norms=True)
                preds = torch.argmax(decoder_last_state, dim=-1).detach().cpu()
                acc = torch.eq(preds, eval_batch["labels"].cpu()).float().sum().to(accelerator.device)
                del preds
                batch_specialization_metric, batch_size = compute_specialization_metric(
                    norms_to_tensor(encoder_norms), accelerator.device)
                del encoder_norms
                eval_loss += loss.detach()
                eval_acc += acc / targets_length
                eval_specialization_metric += batch_specialization_metric
                del batch_specialization_metric, batch_size, loss, acc

            num_eval_examples = len(tokenized_datasets["validation"])
            avg_eval_loss = accelerator.gather(eval_loss).mean().item() / len(eval_loader)
            avg_eval_specialization_metric = accelerator.gather(eval_specialization_metric).sum().item() / num_eval_examples
            avg_eval_acc = accelerator.gather(eval_acc).sum().item() / num_eval_examples
            if accelerator.is_local_main_process:
                wandb.log({
                    "eval_loss": avg_eval_loss,
                    "eval_specialization_metric": avg_eval_specialization_metric,
                    "eval_acc": avg_eval_acc,
                }, step=cur_step * 2)  # TODO: don't hardcode, multiply by num processes
            del eval_loss, eval_acc, eval_specialization_metric
            batch.to(accelerator.device)

        optimizer.zero_grad()
        model.train()
        loss, decoder_last_state, decoder_cache, decoder_states, decoder_attns, decoder_self_norms, \
            decoder_cross_norms, encoder_last_state, encoder_states, encoder_attns, encoder_norms = \
            model(**batch, output_hidden_states=True, output_attentions=True, output_norms=True)
        accelerator.backward(loss)
        if (cur_step + 1) % training_args.gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        batch_specialization_metric, batch_size = compute_specialization_metric(
            norms_to_tensor(encoder_norms), device=accelerator.device)
        total_train_loss += loss.detach()
        total_train_specialization_metric += batch_specialization_metric
        total_num_examples += batch_size
        del loss, batch_specialization_metric, batch_size

        if cur_step % training_args.logging_steps == 0 and cur_step > 0:
            avg_train_loss = accelerator.gather(total_train_loss).mean().item() / training_args.logging_steps
            avg_train_specialization_metric = accelerator.gather(total_train_specialization_metric).mean().item() \
                / accelerator.gather(total_num_examples).mean().item()
            if accelerator.is_local_main_process:
                wandb.log({
                    "train_loss": avg_train_loss,
                    "train_specialization_metric": avg_train_specialization_metric,
                    "learning_rate": scheduler.get_last_lr()[0],
                }, step=cur_step * 2)  # TODO: don't hardcode, multiply by num processes
            total_train_loss[0] = 0.0
            total_train_specialization_metric[0] = 0.0
            total_num_examples[0] = 0.0

        if cur_step % training_args.save_steps == 0 and cur_step > 0 and accelerator.is_local_main_process:
            checkpoint = {
                "step": cur_step,
                "model": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler
            }
            accelerator.save(checkpoint, f"{training_args.output_dir}/checkpoint_{cur_step // training_args.save_steps}.pt")
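
# Standard entry point so the script can be launched directly, e.g. with
# `accelerate launch run_t5_mlm.py ...` (the script name is illustrative).
if __name__ == "__main__":
    main()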