def test_run_wav2vec2_pretraining(self):
    stream_handler = logging.StreamHandler(sys.stdout)
    logger.addHandler(stream_handler)

    tmp_dir = self.get_auto_remove_tmp_dir()
    testargs = f"""
        run_wav2vec2_pretraining_no_trainer.py
        --output_dir {tmp_dir}
        --model_name_or_path hf-internal-testing/tiny-random-wav2vec2
        --dataset_name hf-internal-testing/librispeech_asr_dummy
        --dataset_config_names clean
        --dataset_split_names validation
        --learning_rate 1e-4
        --per_device_train_batch_size 2
        --per_device_eval_batch_size 2
        --preprocessing_num_workers 16
        --max_train_steps 5
        --validation_split_percentage 5
        --seed 42
    """.split()

    if is_cuda_and_apex_available():
        testargs.append("--fp16")

    with patch.object(sys, "argv", testargs):
        run_wav2vec2_pretraining_no_trainer.main()
        model = Wav2Vec2ForPreTraining.from_pretrained(tmp_dir)
        self.assertIsNotNone(model)
def test_inference_integration(self):
    model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base")
    model.to(torch_device)
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        "facebook/wav2vec2-base", return_attention_mask=True
    )
    input_speech = self._load_datasamples(2)

    inputs_dict = feature_extractor(input_speech, return_tensors="pt", padding=True)

    features_shape = (
        inputs_dict["input_values"].shape[0],
        model._get_feat_extract_output_lengths(torch.tensor(inputs_dict["input_values"].shape[1])),
    )

    torch.manual_seed(0)
    mask_time_indices = _compute_mask_indices(
        features_shape,
        model.config.mask_time_prob,
        model.config.mask_time_length,
        device=inputs_dict["input_values"].device,
        min_masks=2,
    ).to(torch_device)

    with torch.no_grad():
        outputs = model(
            inputs_dict.input_values.to(torch_device),
            attention_mask=inputs_dict.attention_mask.to(torch_device),
            mask_time_indices=mask_time_indices,
        )

    # compute cosine similarity
    cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)

    # retrieve cosine sim of masked features
    cosine_sim_masked = cosine_sim[mask_time_indices]

    # fmt: off
    expected_cosine_sim_masked = torch.tensor(
        [
            0.7458, 0.7188, 0.6418, 0.3729, 0.3741, 0.3694, 0.3110, 0.2257, 0.4403, 0.5415,
            0.3950, 0.3701, 0.8831, 0.8613, 0.5229, 0.6696, 0.7206, 0.7877, 0.6758, 0.8746,
            0.6596, 0.6282, 0.6178, 0.5839, 0.5926, 0.6651, 0.4635, 0.6332, 0.6572, 0.8776,
            0.4999, 0.7001, 0.7257, 0.5098, 0.6229, 0.4566, 0.5261, 0.6363, 0.5371, 0.6997
        ],
        device=torch_device,
    )
    # fmt: on

    self.assertTrue(torch.allclose(cosine_sim_masked, expected_cosine_sim_masked, atol=1e-3))
def test_sample_negatives(self):
    batch_size = 2
    sequence_length = 10
    hidden_size = 4
    num_negatives = 3

    # each vector consists of a single repeated value
    features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view(
        sequence_length, hidden_size
    )
    features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous()

    negatives = Wav2Vec2ForPreTraining._sample_negatives(features, num_negatives)

    self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size))

    # make sure no negatively sampled vector is actually a positive one
    for negative in negatives:
        self.assertTrue(((negative - features) == 0).sum() == 0.0)

    # make sure that full vectors are sampled, not individual values of vectors
    # => this means that `unique()` yields a single value for the `hidden_size` dim
    self.assertTrue(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1))
def test_model_for_pretraining(self):
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
    model = Wav2Vec2ForPreTraining(config).to(torch_device)

    features_shape = (
        inputs_dict["input_values"].shape[0],
        model._get_feat_extract_output_lengths(torch.tensor(inputs_dict["input_values"].shape[1])),
    )

    mask_time_indices = _compute_mask_indices(
        features_shape,
        model.config.mask_time_prob,
        model.config.mask_time_length,
        device=inputs_dict["input_values"].device,
        min_masks=2,
    ).to(torch_device)

    loss = model(
        inputs_dict["input_values"],
        attention_mask=inputs_dict["attention_mask"],
        mask_time_indices=mask_time_indices,
    ).loss

    mask_time_indices[:, : mask_time_indices.shape[-1] // 2] = True
    loss_more_masked = model(
        inputs_dict["input_values"],
        attention_mask=inputs_dict["attention_mask"],
        mask_time_indices=mask_time_indices,
    ).loss

    # loss_more_masked has to be bigger than or equal to loss since more masked inputs have to be predicted
    self.assertTrue(loss.detach().item() <= loss_more_masked.detach().item())
def test_sample_negatives_with_attn_mask(self):
    batch_size = 2
    sequence_length = 10
    hidden_size = 4
    num_negatives = 3

    # second half of last input tensor is padded
    attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device)
    attention_mask[-1, sequence_length // 2 :] = 0

    # each vector consists of a single repeated value
    features = (torch.arange(sequence_length * hidden_size, device=torch_device) // hidden_size).view(
        sequence_length, hidden_size
    )
    features = features[None, :].expand(batch_size, sequence_length, hidden_size).contiguous()

    # replace masked feature vectors with -100 to test that those are not sampled
    features = torch.where(attention_mask[:, :, None].expand(features.shape).bool(), features, -100)

    negatives = Wav2Vec2ForPreTraining._sample_negatives(features, num_negatives, attention_mask=attention_mask)

    self.assertTrue((negatives >= 0).all().item())

    self.assertTrue(negatives.shape == (num_negatives, batch_size, sequence_length, hidden_size))

    # make sure no negatively sampled vector is actually a positive one
    for negative in negatives:
        self.assertTrue(((negative - features) == 0).sum() == 0.0)

    # make sure that full vectors are sampled, not individual values of vectors
    # => this means that `unique()` yields a single value for the `hidden_size` dim
    self.assertTrue(negatives.unique(dim=-1).shape, (num_negatives, batch_size, sequence_length, 1))
def test_loss_pretraining(self):
    model = Wav2Vec2ForPreTraining.from_pretrained(
        "facebook/wav2vec2-base",
        attention_dropout=0.0,
        feat_proj_dropout=0.0,
        hidden_dropout=0.0,
        layerdrop=0.0,
    )
    model.to(torch_device).train()

    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        "facebook/wav2vec2-base", return_attention_mask=True
    )
    input_speech = self._load_datasamples(2)

    inputs_dict = feature_extractor(input_speech, return_tensors="pt", padding=True)

    features_shape = (
        inputs_dict["input_values"].shape[0],
        model._get_feat_extract_output_lengths(inputs_dict["input_values"].shape[1]),
    )

    torch.manual_seed(0)
    mask_time_indices = _compute_mask_indices(
        features_shape,
        model.config.mask_time_prob,
        model.config.mask_time_length,
        device=inputs_dict["input_values"].device,
        min_masks=2,
    ).to(torch_device)

    with torch.no_grad():
        outputs = model(
            inputs_dict.input_values.to(torch_device),
            attention_mask=inputs_dict.attention_mask.to(torch_device),
            mask_time_indices=mask_time_indices,
        )

    # check diversity loss
    num_codevectors = model.config.num_codevectors_per_group * model.config.num_codevector_groups
    diversity_loss = (num_codevectors - outputs.codevector_perplexity) / num_codevectors
    self.assertTrue(abs(diversity_loss.item() - 0.8859) < 1e-3)

    # check overall loss (contrastive loss + diversity loss)
    expected_loss = 62.5170
    self.assertTrue(abs(outputs.loss.item() - expected_loss) < 1e-3)
def __init__(
    self,
    source,
    save_path,
    mask_prob=0.65,
    mask_length=10,
    normalize_wav=True,
):
    super().__init__()
    self.mask_prob = mask_prob
    self.mask_length = mask_length
    self.normalize_wav = normalize_wav

    # Download the config of the model from HuggingFace.
    self.config = Wav2Vec2Config.from_pretrained(source, cache_dir=save_path)
    # We want the hidden states as well!
    self.config.output_hidden_states = True

    self.model = Wav2Vec2ForPreTraining(self.config)
    self.model.gradient_checkpointing_disable()  # Required by DDP
    self.model.train()
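# A hypothetical companion `forward` for the wrapper above (a minimal sketch, not the
# library's definitive implementation). It assumes the same `_compute_mask_indices`
# variant used in the tests in this file (accepting a `device` argument and returning
# a boolean tensor); the `wav` argument name and the layer-norm normalization are
# illustrative assumptions.
import torch.nn.functional as F
from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices


def forward(self, wav):
    """Normalize the waveform, sample a time-step mask, and run the pretraining model."""
    batch_size, raw_sequence_length = wav.shape

    if self.normalize_wav:
        # zero-mean / unit-variance normalization over the time dimension
        wav = F.layer_norm(wav, normalized_shape=wav.shape[1:])

    # length of the feature-encoder output for this raw waveform length
    sequence_length = int(self.model._get_feat_extract_output_lengths(raw_sequence_length))

    # sample which feature frames to mask (assumed signature, see note above)
    mask_time_indices = _compute_mask_indices(
        (batch_size, sequence_length),
        mask_prob=self.mask_prob,
        mask_length=self.mask_length,
        device=wav.device,
        min_masks=2,
    )

    outputs = self.model(wav, mask_time_indices=mask_time_indices)
    return outputs, mask_time_indices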
def convert_wav2vec2_checkpoint(
    checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    if config_path is not None:
        config = Wav2Vec2Config.from_pretrained(config_path)
    else:
        config = Wav2Vec2Config()

    if is_finetuned:
        if dict_path:
            target_dict = Dictionary.load(dict_path)

            # important change: swap bos & pad token ids since the CTC symbol is <pad>
            # and not <s> as in fairseq
            config.bos_token_id = target_dict.pad_index
            config.pad_token_id = target_dict.bos_index
            config.eos_token_id = target_dict.eos_index
            config.vocab_size = len(target_dict.symbols)
            vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
            if not os.path.isdir(pytorch_dump_folder_path):
                logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
                return
            os.makedirs(pytorch_dump_folder_path, exist_ok=True)
            with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
                json.dump(target_dict.indices, vocab_handle)
            tokenizer = Wav2Vec2CTCTokenizer(
                vocab_path,
                unk_token=target_dict.unk_word,
                pad_token=target_dict.pad_word,
                bos_token=target_dict.bos_word,
                eos_token=target_dict.eos_word,
                word_delimiter_token="|",
                do_lower_case=False,
            )
            return_attention_mask = True if config.feat_extract_norm == "layer" else False
            feature_extractor = Wav2Vec2FeatureExtractor(
                feature_size=1,
                sampling_rate=16000,
                padding_value=0,
                do_normalize=True,
                return_attention_mask=return_attention_mask,
            )
            processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
            processor.save_pretrained(pytorch_dump_folder_path)

        hf_wav2vec = Wav2Vec2ForCTC(config)
    else:
        hf_wav2vec = Wav2Vec2ForPreTraining(config)

    if is_finetuned:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
        )
    else:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])

    model = model[0].eval()

    recursively_load_weights(model, hf_wav2vec, not is_finetuned)

    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
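# A minimal sketch of a command-line entry point for the converter above, assuming
# argparse flags that mirror the function's parameters; the exact flag names and help
# strings are illustrative, not necessarily the original script's.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint_path", type=str, required=True, help="Path to the fairseq checkpoint.")
    parser.add_argument(
        "--pytorch_dump_folder_path", type=str, required=True, help="Directory to write the converted model to."
    )
    parser.add_argument("--config_path", type=str, default=None, help="Optional path to a hf config.json to use.")
    parser.add_argument("--dict_path", type=str, default=None, help="Path to the fairseq dict of a fine-tuned model.")
    parser.add_argument(
        "--not_finetuned", action="store_true", help="Set if the checkpoint is a pretraining (not fine-tuned) model."
    )
    args = parser.parse_args()

    convert_wav2vec2_checkpoint(
        args.checkpoint_path,
        args.pytorch_dump_folder_path,
        args.config_path,
        args.dict_path,
        not args.not_finetuned,
    )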
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    configure_logger(model_args, training_args)

    # Downloading and loading a dataset from the hub.
    datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)

    if "validation" not in datasets.keys():
        # make sure only "validation" and "train" keys remain
        datasets = DatasetDict()
        datasets["validation"] = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            split=f"{data_args.train_split_name}[:{data_args.validation_split_percentage}%]",
            cache_dir=model_args.cache_dir,
        )
        datasets["train"] = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            split=f"{data_args.train_split_name}[{data_args.validation_split_percentage}%:]",
            cache_dir=model_args.cache_dir,
        )
    else:
        # make sure only "validation" and "train" keys remain
        datasets = DatasetDict()
        datasets["validation"] = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            split="validation",
            cache_dir=model_args.cache_dir,
        )
        datasets["train"] = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            split=f"{data_args.train_split_name}",
            cache_dir=model_args.cache_dir,
        )

    # only normalized-inputs-training is supported
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        model_args.model_name_or_path, cache_dir=model_args.cache_dir, do_normalize=True
    )

    def prepare_dataset(batch):
        # check that all files have the correct sampling rate
        batch["speech"], _ = librosa.load(batch[data_args.speech_file_column], sr=feature_extractor.sampling_rate)
        return batch

    # load audio files into numpy arrays
    vectorized_datasets = datasets.map(
        prepare_dataset, num_proc=data_args.preprocessing_num_workers, remove_columns=datasets["train"].column_names
    )

    # filter audio files that are too long
    vectorized_datasets = vectorized_datasets.filter(
        lambda data: len(data["speech"]) < int(data_args.max_duration_in_seconds * feature_extractor.sampling_rate)
    )

    def normalize(batch):
        return feature_extractor(batch["speech"], sampling_rate=feature_extractor.sampling_rate)

    # normalize and transform to `BatchFeatures`
    vectorized_datasets = vectorized_datasets.map(
        normalize,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
        remove_columns=vectorized_datasets["train"].column_names,
    )

    # pretraining is only supported for the "newer" stable layer norm architecture;
    # apply_spec_augment has to be True, mask_feature_prob has to be 0.0
    config = Wav2Vec2Config.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        gradient_checkpointing=training_args.gradient_checkpointing,
    )

    if not config.do_stable_layer_norm or config.feat_extract_norm != "layer":
        raise ValueError(
            "PreTraining is only supported for ``config.do_stable_layer_norm=True`` and"
            " ``config.feat_extract_norm='layer'``"
        )

    model = Wav2Vec2ForPreTraining(config)

    data_collator = DataCollatorForWav2Vec2Pretraining(model=model, feature_extractor=feature_extractor)

    trainer = Wav2Vec2PreTrainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        train_dataset=vectorized_datasets["train"],
        eval_dataset=vectorized_datasets["validation"],
        tokenizer=feature_extractor,
        max_gumbel_temp=model_args.max_gumbel_temperature,
        min_gumbel_temp=model_args.min_gumbel_temperature,
        gumbel_temp_decay=model_args.gumbel_temperature_decay,
    )
    trainer.train()
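# A minimal entry-point sketch so the Trainer-based script above can be launched directly
# (e.g. `python run_pretraining.py --model_name_or_path ... --dataset_name ...`, or via
# `torchrun` for multi-GPU training); the script name in the example call is illustrative.
if __name__ == "__main__":
    main()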
def test_inference_pretrained(self):
    model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base")
    model.to(torch_device)
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        "facebook/wav2vec2-base", return_attention_mask=True
    )
    input_speech = self._load_datasamples(2)

    inputs_dict = feature_extractor(input_speech, return_tensors="pt", padding=True)

    features_shape = (
        inputs_dict["input_values"].shape[0],
        model._get_feat_extract_output_lengths(torch.tensor(inputs_dict["input_values"].shape[1])),
    )

    torch.manual_seed(0)
    mask_time_indices = _compute_mask_indices(
        features_shape,
        model.config.mask_time_prob,
        model.config.mask_time_length,
        device=inputs_dict["input_values"].device,
        min_masks=2,
    ).to(torch_device)

    with torch.no_grad():
        outputs = model(
            inputs_dict.input_values.to(torch_device),
            attention_mask=inputs_dict.attention_mask.to(torch_device),
            mask_time_indices=mask_time_indices,
        )

    # compute cosine similarity
    cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)

    # retrieve cosine sim of masked features
    cosine_sim_masked = cosine_sim[mask_time_indices]

    # ... now compare to randomly initialized model

    config = Wav2Vec2Config.from_pretrained("facebook/wav2vec2-base")
    model_rand = Wav2Vec2ForPreTraining(config).to(torch_device).eval()

    with torch.no_grad():
        outputs_rand = model_rand(
            inputs_dict.input_values.to(torch_device),
            attention_mask=inputs_dict.attention_mask.to(torch_device),
            mask_time_indices=mask_time_indices,
        )

    # compute cosine similarity
    cosine_sim_rand = torch.cosine_similarity(
        outputs_rand.projected_states, outputs_rand.projected_quantized_states, dim=-1
    )

    # retrieve cosine sim of masked features
    cosine_sim_masked_rand = cosine_sim_rand[mask_time_indices]

    # a pretrained wav2vec2 model has learned to predict the quantized latent states
    # => the cosine similarity between quantized states and predicted states > 0.5
    # a random wav2vec2 model has not learned to predict the quantized latent states
    # => the cosine similarity between quantized states and predicted states is very likely < 0.1
    self.assertTrue(cosine_sim_masked.mean().item() - 5 * cosine_sim_masked_rand.mean().item() > 0)
def main():
    # See all possible arguments in src/transformers/args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    logger.info(accelerator.state, main_process_only=False)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()

        # set up weights and biases if available
        if is_wandb_available():
            import wandb

            wandb.init(project=args.output_dir.split("/")[-1])
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.push_to_hub and not args.preprocessing_only:
            if args.hub_model_id is None:
                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
            else:
                repo_name = args.hub_model_id
            repo = Repository(args.output_dir, clone_from=repo_name)
        elif args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
    accelerator.wait_for_everyone()

    # 1. Download and create train, validation dataset
    # We load all dataset configuration and dataset split pairs passed in
    # ``args.dataset_config_names`` and ``args.dataset_split_names``
    datasets_splits = []
    for dataset_config_name, train_split_name in zip(args.dataset_config_names, args.dataset_split_names):
        # load dataset
        dataset_split = load_dataset(
            args.dataset_name,
            dataset_config_name,
            split=train_split_name,
            cache_dir=args.cache_dir,
        )
        datasets_splits.append(dataset_split)

    # Next, we concatenate all configurations and splits into a single training dataset
    raw_datasets = DatasetDict()
    if len(datasets_splits) > 1:
        raw_datasets["train"] = concatenate_datasets(datasets_splits).shuffle(seed=args.seed)
    else:
        raw_datasets["train"] = datasets_splits[0]

    # Take ``args.validation_split_percentage`` from the training dataset for the validation split
    num_validation_samples = raw_datasets["train"].num_rows * args.validation_split_percentage // 100

    if num_validation_samples == 0:
        raise ValueError(
            "`args.validation_split_percentage` is less than a single sample "
            f"for {len(raw_datasets['train'])} training samples. Increase "
            "`args.validation_split_percentage`. "
        )

    raw_datasets["validation"] = raw_datasets["train"].select(range(num_validation_samples))
    raw_datasets["train"] = raw_datasets["train"].select(range(num_validation_samples, raw_datasets["train"].num_rows))

    # 2. Now we preprocess the datasets including loading the audio, resampling and normalization
    # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
    # so we just need to set the correct target sampling rate and normalize the input
    # via the `feature_extractor`
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(args.model_name_or_path)

    # make sure that dataset decodes audio with the correct sampling rate
    raw_datasets = raw_datasets.cast_column(
        args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
    )

    # only normalized-inputs-training is supported
    if not feature_extractor.do_normalize:
        raise ValueError(
            "Training is only supported for normalized inputs. Make sure ``feature_extractor.do_normalize == True``"
        )

    # set max & min audio length in number of samples
    max_length = int(args.max_duration_in_seconds * feature_extractor.sampling_rate)
    min_length = int(args.min_duration_in_seconds * feature_extractor.sampling_rate)

    def prepare_dataset(batch):
        sample = batch[args.audio_column_name]

        inputs = feature_extractor(
            sample["array"], sampling_rate=sample["sampling_rate"], max_length=max_length, truncation=True
        )
        batch["input_values"] = inputs.input_values[0]
        batch["input_length"] = len(inputs.input_values[0])

        return batch

    # load cached, mapped dataset via file paths if provided
    cache_file_names = None
    if args.train_cache_file_name is not None:
        cache_file_names = {"train": args.train_cache_file_name, "validation": args.validation_cache_file_name}

    # load audio files into numpy arrays
    with accelerator.main_process_first():
        vectorized_datasets = raw_datasets.map(
            prepare_dataset,
            num_proc=args.preprocessing_num_workers,
            remove_columns=raw_datasets["train"].column_names,
            cache_file_names=cache_file_names,
        )

        if min_length > 0.0:
            vectorized_datasets = vectorized_datasets.filter(
                lambda x: x > min_length,
                num_proc=args.preprocessing_num_workers,
                input_columns=["input_length"],
            )

        vectorized_datasets = vectorized_datasets.remove_columns("input_length")

    # for large datasets it is advised to run the preprocessing on a
    # single machine first with ``args.preprocessing_only`` since there will most likely
    # be a timeout when running the script in distributed mode.
    # In a second step ``args.preprocessing_only`` can then be set to `False` to load the
    # cached dataset
    if args.preprocessing_only:
        return

    # 3. Load model
    config = Wav2Vec2Config.from_pretrained(args.model_name_or_path)

    # pretraining is only supported for the "newer" stable layer norm architecture;
    # apply_spec_augment has to be True, mask_feature_prob has to be 0.0
    if not config.do_stable_layer_norm or config.feat_extract_norm != "layer":
        raise ValueError(
            "PreTraining is only supported for ``config.do_stable_layer_norm=True`` and"
            " ``config.feat_extract_norm='layer'``"
        )

    # initialize random model
    model = Wav2Vec2ForPreTraining(config)

    # Activate gradient checkpointing if needed
    if args.gradient_checkpointing:
        model.gradient_checkpointing_enable()

    # 4. Define data collator, optimizer and scheduler
    data_collator = DataCollatorForWav2Vec2Pretraining(
        model=model, feature_extractor=feature_extractor, pad_to_multiple_of=args.pad_to_multiple_of
    )
    train_dataloader = DataLoader(
        vectorized_datasets["train"],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=args.per_device_train_batch_size,
    )
    eval_dataloader = DataLoader(
        vectorized_datasets["validation"], collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
    )

    # Optimizer
    optimizer = AdamW(
        list(model.parameters()),
        lr=args.learning_rate,
        betas=[args.adam_beta1, args.adam_beta2],
        eps=args.adam_epsilon,
    )

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # 5. Train
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(vectorized_datasets['train'])}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")

    completed_steps = 0
    starting_epoch = 0

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)

    for epoch in range(starting_epoch, args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            # compute num of losses
            num_losses = batch["mask_time_indices"].sum()
            sub_attention_mask = batch.pop("sub_attention_mask", None)
            sub_attention_mask = (
                sub_attention_mask if sub_attention_mask is not None else torch.ones_like(batch["mask_time_indices"])
            )
            percent_masked = num_losses / sub_attention_mask.sum()

            # forward
            outputs = model(**batch)

            # divide loss by gradient accumulation steps since gradients
            # are accumulated for multiple backward passes in PyTorch
            loss = outputs.loss / args.gradient_accumulation_steps
            accelerator.backward(loss)

            # make sure that `num_losses` is summed for distributed training
            # and average gradients over losses of all devices
            if accelerator.state.num_processes > 1:
                num_losses = accelerator.gather(num_losses).sum()
                gradient_multiplier = accelerator.state.num_processes / num_losses
                multiply_grads(model.module.parameters(), gradient_multiplier)
            else:
                multiply_grads(model.parameters(), 1 / num_losses)

            # update step
            if (step + 1) % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                # compute grad norm for monitoring
                scale = (
                    accelerator.scaler._scale.item()
                    if hasattr(accelerator, "scaler") and accelerator.scaler is not None
                    else 1
                )
                if accelerator.state.num_processes > 1:
                    grad_norm = get_grad_norm(model.module.parameters(), scale)
                else:
                    grad_norm = get_grad_norm(model.parameters(), scale)

                # update parameters
                optimizer.step()
                optimizer.zero_grad()

                if not accelerator.optimizer_step_was_skipped:
                    lr_scheduler.step()
                elif accelerator.is_local_main_process:
                    progress_bar.write(
                        f"Gradients have overflown - skipping update step... Updating gradient scale to {scale}..."
                    )

                # update gumbel temperature
                gumbel_temperature = max(
                    args.max_gumbel_temperature * args.gumbel_temperature_decay**completed_steps,
                    args.min_gumbel_temperature,
                )
                if hasattr(model, "module"):
                    model.module.set_gumbel_temperature(gumbel_temperature)
                else:
                    model.set_gumbel_temperature(gumbel_temperature)

                progress_bar.update(1)
                completed_steps += 1

            # 6. Log all results
            if (step + 1) % (args.gradient_accumulation_steps * args.logging_steps) == 0:
                loss.detach()
                outputs.contrastive_loss.detach()
                outputs.diversity_loss.detach()

                if accelerator.state.num_processes > 1:
                    loss = accelerator.gather(loss).sum()
                    outputs.contrastive_loss = accelerator.gather(outputs.contrastive_loss).sum()
                    outputs.diversity_loss = accelerator.gather(outputs.diversity_loss).sum()
                    percent_masked = accelerator.gather(percent_masked).sum()

                train_logs = {
                    "loss": (loss * args.gradient_accumulation_steps) / num_losses,
                    "constrast_loss": outputs.contrastive_loss / num_losses,
                    "div_loss": outputs.diversity_loss / num_losses,
                    "%_mask_idx": percent_masked / accelerator.num_processes,
                    "ppl": outputs.codevector_perplexity,
                    "lr": torch.tensor(optimizer.param_groups[0]["lr"]),
                    "temp": torch.tensor(gumbel_temperature),
                    "grad_norm": torch.tensor(grad_norm),
                }
                log_str = ""
                for k, v in train_logs.items():
                    log_str += "| {}: {:.3e}".format(k, v.item())

                if accelerator.is_local_main_process:
                    progress_bar.write(log_str)
                    if is_wandb_available():
                        wandb.log(train_logs)

            # save model every `args.saving_steps` steps
            if (step + 1) % (args.gradient_accumulation_steps * args.saving_steps) == 0:
                if (args.push_to_hub and epoch < args.num_train_epochs - 1) or args.output_dir is not None:
                    accelerator.wait_for_everyone()
                    unwrapped_model = accelerator.unwrap_model(model)
                    unwrapped_model.save_pretrained(
                        args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
                    )

                if (args.push_to_hub and epoch < args.num_train_epochs - 1) and accelerator.is_main_process:
                    repo.push_to_hub(
                        commit_message=f"Training in progress step {completed_steps}",
                        blocking=False,
                        auto_lfs_prune=True,
                    )

            # if completed steps > `args.max_train_steps` stop
            if completed_steps >= args.max_train_steps:
                break

        # 7. Validate!
        model.eval()

        # init logs
        val_logs = {
            "val_loss": 0,
            "val_contrastive_loss": 0,
            "val_diversity_loss": 0,
            "val_num_losses": 0,
        }
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                batch.pop("sub_attention_mask", None)
                outputs = model(**batch)

            val_logs["val_loss"] += outputs.loss
            val_logs["val_contrastive_loss"] += outputs.contrastive_loss
            val_logs["val_diversity_loss"] += outputs.diversity_loss
            val_logs["val_num_losses"] += batch["mask_time_indices"].sum()

        # sum over devices in multi-processing
        if accelerator.num_processes > 1:
            val_logs = {k: accelerator.gather(v).sum() for k, v in val_logs.items()}

        val_logs = {k: v / val_logs["val_num_losses"] for k, v in val_logs.items()}

        log_str = ""
        for k, v in val_logs.items():
            log_str += "| {}: {:.3e}".format(k, v.item())

        if accelerator.is_local_main_process:
            progress_bar.write(log_str)
            if is_wandb_available():
                wandb.log(val_logs)

        if args.output_dir is not None:
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(
                args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
            )
            if accelerator.is_main_process:
                if args.push_to_hub:
                    repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
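# The training loop above calls `multiply_grads` and `get_grad_norm`, which are not shown
# in this excerpt. Below is a plausible sketch of these helpers under the assumption that
# they scale existing gradients in place by a constant and compute the global L2 gradient
# norm while dividing out a mixed-precision loss scale; treat this as an assumption, not
# the script's exact code.
def multiply_grads(params, c):
    """Multiply all existing gradients in place by the constant (or tensor) `c`."""
    for p in params:
        if p.grad is not None:
            if torch.is_tensor(c):
                c = c.to(p.grad.device)
            p.grad.data.mul_(c)


def get_grad_norm(params, scale=1):
    """Compute the global L2 norm of the gradients, dividing out a gradient `scale`."""
    total_norm = 0.0
    for p in params:
        if p.grad is not None:
            param_norm = (p.grad.detach().data / scale).norm(2)
            total_norm += param_norm.item() ** 2
    return total_norm**0.5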