def test_special_characters_in_vocab(self):
    sent = "ʈʰ æ æ̃ ˧ kʰ"

    vocab_dict = {k: v for v, k in enumerate(set(sent.split()))}
    vocab_file = os.path.join(self.tmpdirname, "vocab_special.json")
    with open(vocab_file, "w") as f:
        json.dump(vocab_dict, f)

    tokenizer = Wav2Vec2CTCTokenizer(vocab_file)

    expected_sent = tokenizer.decode(tokenizer(sent).input_ids, spaces_between_special_tokens=True)
    self.assertEqual(sent, expected_sent)

    tokenizer.save_pretrained(os.path.join(self.tmpdirname, "special_tokenizer"))
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(os.path.join(self.tmpdirname, "special_tokenizer"))

    expected_sent = tokenizer.decode(tokenizer(sent).input_ids, spaces_between_special_tokens=True)
    self.assertEqual(sent, expected_sent)
def data_preparation():
    data = import_data()

    global processor
    if glob.glob(f"results_hg/{MODEL}/{LABEL}/processor/*"):
        print(">> From pretrained processor")
        processor = Wav2Vec2Processor.from_pretrained(f"results_hg/{MODEL}/{LABEL}/processor")
    else:
        print(">> Creating processor")
        gen_vocab(data)
        tokenizer = Wav2Vec2CTCTokenizer(f"results_hg/{MODEL}/{LABEL}/vocab.json",
                                         unk_token="[UNK]",
                                         pad_token="[PAD]",
                                         word_delimiter_token="|")
        feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                                     sampling_rate=16000,
                                                     padding_value=0.0,
                                                     do_normalize=True,
                                                     return_attention_mask=True)
        processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
        processor.save_pretrained(f"results_hg/{MODEL}/{LABEL}/processor/")

    dataset = data.map(speech_file_to_array_fn,
                       remove_columns=data.column_names["train"],
                       num_proc=4)
    dataset_prepared = dataset.map(prepare_dataset,
                                   remove_columns=dataset.column_names["train"],
                                   batch_size=8,
                                   num_proc=4,
                                   batched=True)
    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
    return processor, dataset_prepared, data_collator
def __init__(self):
    super(ASR_CTC, self).__init__()
    # self.wav2Vec2Tokenizer = Wav2Vec2Tokenizer.from_pretrained('facebook/wav2vec2-base')
    # self.wav2Vec2ForCTC = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base')
    # self.nb_labels = len(self.wav2Vec2Tokenizer.get_vocab())
    self.tokenizer = Wav2Vec2CTCTokenizer("./vocab.json",
                                          unk_token="<unk>",
                                          pad_token="<pad>",
                                          word_delimiter_token="|")
    self.feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                                      sampling_rate=16000,
                                                      padding_value=0.0,
                                                      do_normalize=True,
                                                      return_attention_mask=True)
    self.processor = Wav2Vec2Processor(feature_extractor=self.feature_extractor,
                                       tokenizer=self.tokenizer)
    self.wav2Vec2ForCTC = Wav2Vec2ForCTC.from_pretrained(
        "facebook/wav2vec2-large-xlsr-53",
        attention_dropout=0.1,
        hidden_dropout=0.1,
        feat_proj_dropout=0.0,
        mask_time_prob=0.05,
        layerdrop=0.1,
        gradient_checkpointing=True,
        ctc_loss_reduction="mean",
        pad_token_id=self.processor.tokenizer.pad_token_id,
        vocab_size=len(self.processor.tokenizer),
    )
def load_tokenizer():
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(recognizer_dir,
                                                     unk_token='[UNK]',
                                                     pad_token='[PAD]',
                                                     word_delimiter_token='|',
                                                     cache_dir=cache_dir)
    return tokenizer
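# Minimal usage sketch (not in the original): round-trip a transcript through
# the loaded tokenizer. Assumes the vocab at recognizer_dir covers lowercase
# characters; group_tokens=False matters here, since the default CTC-style
# decode would collapse repeated letters ("hello" -> "helo").
tokenizer = load_tokenizer()
ids = tokenizer("hello world").input_ids
print(tokenizer.decode(ids, group_tokens=False))  # -> "hello world"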
def _init_processor(self, config: EasyDict):
    config.processor.tokenizer.vocab_file = config.common.vocab_file
    tokenizer = Wav2Vec2CTCTokenizer(**config.processor.tokenizer)
    feature_extractor = Wav2Vec2FeatureExtractor(**config.processor.feature_extractor)
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
    processor.save_pretrained(config.common.model_path)
    self._processor = processor
def save_processor():
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
    processor.save_pretrained(hf_path)
    create_vocab("../add_wav2vec/data/temp/dict.ltr.txt")
    tok = Wav2Vec2CTCTokenizer(hf_path + "/vocab.json")
    tok.save_pretrained(hf_path)
    processor = Wav2Vec2Processor.from_pretrained(hf_path)
    processor.save_pretrained(hf_path)
def create_processor(self, model_args: ModelArguments) -> Wav2Vec2Processor:
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        model_args.model_name_or_path, cache_dir=model_args.cache_dir
    )
    if self.vocab_file:
        tokenizer = Wav2Vec2CTCTokenizer(
            self.vocab_file,
            cache_dir=model_args.cache_dir,
            do_lower_case=self.do_lower_case,
            word_delimiter_token=self.word_delimiter_token,
        )
    else:
        tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            do_lower_case=self.do_lower_case,
            word_delimiter_token=self.word_delimiter_token,
        )
    return Wav2Vec2Processor(feature_extractor, tokenizer)
def processor_init():
    tokenizer = Wav2Vec2CTCTokenizer("./vocab.json",
                                     unk_token="[UNK]",
                                     pad_token="[PAD]",
                                     word_delimiter_token="|")
    feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                                 sampling_rate=16000,
                                                 padding_value=0.0,
                                                 do_normalize=True,
                                                 return_attention_mask=True)
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
    processor.save_pretrained('asr_output/new_processor/')
    return processor
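# A quick sketch (not in the original) of consuming the processor returned by
# processor_init; the silent one-second array is a stand-in for real audio.
import numpy as np

processor = processor_init()
waveform = np.zeros(16000, dtype=np.float32)  # placeholder: 1 s of silence at 16 kHz
inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
print(inputs.input_values.shape)  # torch.Size([1, 16000])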
def __init__(self, m_path):
    self.check_download_models(m_path)
    STT_MODEL_PATH, STT_VOCAB_FILE = self.get_model_files_dirs(m_path)
    self.SAMPLING_RATE = 16000
    print("Initializing STT Model")
    tokenizer = Wav2Vec2CTCTokenizer(STT_VOCAB_FILE,
                                     unk_token="[UNK]",
                                     pad_token="[PAD]",
                                     word_delimiter_token="|")
    feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                                 sampling_rate=self.SAMPLING_RATE,
                                                 padding_value=0.0,
                                                 do_normalize=True,
                                                 return_attention_mask=False)
    self.processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
    self.model = torch.jit.load(STT_MODEL_PATH)
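# A hedged companion sketch for the class above (not in the original). The
# TorchScript module's exact return signature is an assumption, so treat the
# logits handling as illustrative rather than definitive.
def transcribe(self, waveform):
    inputs = self.processor(waveform, sampling_rate=self.SAMPLING_RATE, return_tensors="pt")
    with torch.no_grad():
        logits = self.model(inputs.input_values)  # assumed: (batch, time, vocab) logits
    pred_ids = torch.argmax(logits, dim=-1)
    return self.processor.batch_decode(pred_ids)[0]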
def __init__(self, transcription_file, root_dir):
    """
    :param transcription_file: Path to the text transcription.
    :param root_dir: Directory containing audio files.
    """
    self.transcriptions = pd.read_csv(transcription_file, sep="\t")
    self.root_dir = root_dir
    self.tokenizer = Wav2Vec2CTCTokenizer("./vocab.json",
                                          unk_token="<unk>",
                                          pad_token="<pad>",
                                          word_delimiter_token="|")
    self.feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                                      sampling_rate=16000,
                                                      padding_value=0.0,
                                                      do_normalize=True,
                                                      return_attention_mask=True)
    self.processor = Wav2Vec2Processor(feature_extractor=self.feature_extractor,
                                       tokenizer=self.tokenizer)
def __init__(self, model_dir=None, hub_name=None, device="cuda"):
    # Model provided as a local directory.
    if model_dir is not None:
        self.model = Wav2Vec2ForCTC.from_pretrained(model_dir)
        # Currently only this configuration is supported.
        tokenizer = Wav2Vec2CTCTokenizer(os.path.join(model_dir, "vocab.json"),
                                         unk_token="[UNK]",
                                         pad_token="[PAD]",
                                         word_delimiter_token="|")
        feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                                     sampling_rate=16000,
                                                     padding_value=0.0,
                                                     do_normalize=True,
                                                     return_attention_mask=True)
        self.processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                                           tokenizer=tokenizer)
    # Model from the Hugging Face hub.
    elif hub_name is not None:
        raise NotImplementedError("Loading from the Hugging Face hub is not supported yet.")
    else:
        raise ValueError("No model path given.")
    self.model.to(device)
    self.device = device
def data_preparation_v2():
    common_voice_train = load_dataset("common_voice", "tr", split="train+validation")
    common_voice_test = load_dataset("common_voice", "tr", split="test")

    columns_to_drop = ["accent", "age", "client_id", "down_votes",
                       "gender", "locale", "segment", "up_votes"]
    common_voice_train = common_voice_train.remove_columns(columns_to_drop)
    common_voice_test = common_voice_test.remove_columns(columns_to_drop)

    common_voice_train = common_voice_train.map(remove_special_characters)
    common_voice_test = common_voice_test.map(remove_special_characters)

    vocab_train = common_voice_train.map(extract_all_chars,
                                         batched=True,
                                         batch_size=-1,
                                         keep_in_memory=True,
                                         remove_columns=common_voice_train.column_names)
    vocab_test = common_voice_test.map(extract_all_chars,
                                       batched=True,
                                       batch_size=-1,
                                       keep_in_memory=True,
                                       remove_columns=common_voice_test.column_names)

    vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
    vocab_dict = {v: k for k, v in enumerate(vocab_list)}
    vocab_dict["|"] = vocab_dict[" "]
    del vocab_dict[" "]
    vocab_dict["[UNK]"] = len(vocab_dict)
    vocab_dict["[PAD]"] = len(vocab_dict)

    with open(f'results_hg/{MODEL}/{LABEL}/vocab.json', 'w') as vocab_file:
        json.dump(vocab_dict, vocab_file)

    global processor
    print(">> Creating processor")
    tokenizer = Wav2Vec2CTCTokenizer(f"results_hg/{MODEL}/{LABEL}/vocab.json",
                                     unk_token="[UNK]",
                                     pad_token="[PAD]",
                                     word_delimiter_token="|")
    feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                                 sampling_rate=16000,
                                                 padding_value=0.0,
                                                 do_normalize=True,
                                                 return_attention_mask=True)
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
    processor.save_pretrained(f'results_hg/{MODEL}/{LABEL}/processor/')

    common_voice_train = common_voice_train.map(speech_file_to_array_fn_v2,
                                                remove_columns=common_voice_train.column_names)
    common_voice_test = common_voice_test.map(speech_file_to_array_fn_v2,
                                              remove_columns=common_voice_test.column_names)
    common_voice_train = common_voice_train.map(resample, num_proc=4)
    common_voice_test = common_voice_test.map(resample, num_proc=4)

    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

    common_voice_train = common_voice_train.map(prepare_dataset,
                                                remove_columns=common_voice_train.column_names,
                                                batch_size=8,
                                                num_proc=4,
                                                batched=True)
    common_voice_test = common_voice_test.map(prepare_dataset,
                                              remove_columns=common_voice_test.column_names,
                                              batch_size=8,
                                              num_proc=4,
                                              batched=True)
    return processor, common_voice_train, common_voice_test, data_collator
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Override the default run name and log all args.
    wandb.init(project="wav2vec4humans", config=parser.parse_args())

    # Detect the last checkpoint.
    last_checkpoint = None
    if (os.path.isdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}")
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    chars_to_ignore_regex = f'[{"".join(data_args.chars_to_ignore)}]'

    def remove_special_characters(batch, train=True):
        batch["text"] = re.sub(chars_to_ignore_regex, "",
                               unidecode(batch["sentence"])).lower().strip()
        if train:
            batch["text"] += " "
        return batch

    def extract_all_chars(batch):
        all_text = " ".join(batch["text"])
        vocab = list(set(all_text))
        return {"vocab": [vocab], "all_text": [all_text]}

    resampler = dict()

    def get_resampler(sampling_rate):
        if sampling_rate in resampler.keys():
            return resampler[sampling_rate]
        else:
            logger.info(f"Creating new resampler for {sampling_rate}")
            resampler[sampling_rate] = torchaudio.transforms.Resample(sampling_rate, 16_000)
            return resampler[sampling_rate]

    # Preprocessing the datasets.
    # We need to read the audio files as arrays and tokenize the targets.
    def speech_file_to_array_fn(batch):
        speech_array, sampling_rate = torchaudio.load(batch["path"])
        batch["speech"] = get_resampler(sampling_rate)(speech_array).squeeze().numpy()
        batch["sampling_rate"] = 16_000
        batch["target_text"] = batch["text"]
        batch["duration"] = len(speech_array.squeeze()) / sampling_rate
        return batch

    def filter_by_duration(batch):
        # Keeps about 98% of samples.
        return (batch["duration"] <= 10 and batch["duration"] >= 1
                and len(batch["target_text"]) > 5)

    def prepare_dataset(batch):
        # Check that all files have the correct sampling rate.
        assert (
            len(set(batch["sampling_rate"])) == 1
        ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
        batch["input_values"] = processor(
            batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
        # Set up the processor for targets.
        with processor.as_target_processor():
            batch["labels"] = processor(batch["target_text"]).input_ids
        return batch

    def get_length(item):
        # Speeds up grouping by length in a pre-loaded dataset.
        item["length"] = len(item["input_values"])
        return item

    # Pre-processed datasets
    dataset_path = Path(os.getenv("HF_HOME", ".")) / "datasets"
    dataset_train_path = f"{dataset_path}/{data_args.dataset_config_name}/train/{data_args.train_split_name}"
    dataset_eval_path = f"{dataset_path}/{data_args.dataset_config_name}/eval"
    dataset_test_path = f"{dataset_path}/{data_args.dataset_config_name}/test"
    vocab_path = f"{dataset_path}/{data_args.dataset_config_name}/vocab/vocab_test_{data_args.train_split_name}.json"

    train_dataset = None
    eval_dataset = None if training_args.do_eval else False
    log_timestamp()

    if Path(dataset_train_path).exists() and Path(vocab_path).exists():
        train_dataset = datasets.load_from_disk(dataset_train_path)
        log_timestamp("load pre-processed data")
    else:
        train_dataset = datasets.load_dataset(
            "common_voice",
            data_args.dataset_config_name,
            split=data_args.train_split_name,
        )
        log_timestamp("load data")
        train_dataset = train_dataset.map(remove_special_characters,
                                          remove_columns=["sentence"])
        log_timestamp("remove special characters")

    if training_args.do_eval:
        if Path(dataset_eval_path).exists():
            eval_dataset = datasets.load_from_disk(dataset_eval_path)
        else:
            eval_dataset = datasets.load_dataset("common_voice",
                                                 data_args.dataset_config_name,
                                                 split="test")
            eval_dataset = eval_dataset.map(remove_special_characters,
                                            remove_columns=["sentence"])
    log_timestamp()

    if Path(dataset_test_path).exists() and Path(vocab_path).exists():
        test_dataset = datasets.load_from_disk(dataset_test_path)
    else:
        test_dataset = datasets.load_dataset("common_voice",
                                             data_args.dataset_config_name,
                                             split="test")
        test_dataset = test_dataset.map(
            lambda x: remove_special_characters(x, train=False),
            remove_columns=["sentence"],
        )
    log_timestamp()

    if not Path(vocab_path).exists():
        # Create the vocabulary.
        vocab_train = train_dataset.map(
            extract_all_chars,
            batched=True,
            batch_size=-1,
            keep_in_memory=True,
            remove_columns=train_dataset.column_names,
        )
        vocab_test = test_dataset.map(
            extract_all_chars,
            batched=True,
            batch_size=-1,
            keep_in_memory=True,
            remove_columns=test_dataset.column_names,
        )
        vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
        vocab_dict = {v: k for k, v in enumerate(vocab_list)}
        vocab_dict["|"] = vocab_dict[" "]
        del vocab_dict[" "]
        vocab_dict["[UNK]"] = len(vocab_dict)
        vocab_dict["[PAD]"] = len(vocab_dict)
        Path(vocab_path).parent.mkdir(parents=True, exist_ok=True)
        with open(vocab_path, "w") as vocab_file:
            json.dump(vocab_dict, vocab_file)
        log_timestamp("create vocab")

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    tokenizer = Wav2Vec2CTCTokenizer(
        vocab_path,
        unk_token="[UNK]",
        pad_token="[PAD]",
        word_delimiter_token="|",
    )
    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=16_000,
        padding_value=0.0,
        do_normalize=True,
        return_attention_mask=True,
    )
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                                  tokenizer=tokenizer)
    model = Wav2Vec2ForCTC.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        activation_dropout=model_args.activation_dropout,
        attention_dropout=model_args.attention_dropout,
        hidden_dropout=model_args.hidden_dropout,
        feat_proj_dropout=model_args.feat_proj_dropout,
        mask_time_prob=model_args.mask_time_prob,
        gradient_checkpointing=model_args.gradient_checkpointing,
        layerdrop=model_args.layerdrop,
        ctc_loss_reduction="mean",
        pad_token_id=processor.tokenizer.pad_token_id,
        vocab_size=len(processor.tokenizer),
    )
    log_timestamp("load model")

    if not Path(dataset_train_path).exists():
        train_dataset = train_dataset.map(
            speech_file_to_array_fn,
            remove_columns=train_dataset.column_names,
            num_proc=data_args.preprocessing_num_workers,
        )
        log_timestamp("load audio")
        train_dataset = train_dataset.filter(
            filter_by_duration,
            remove_columns=["duration"],
            num_proc=data_args.preprocessing_num_workers,
        )
        log_timestamp("filter data")
        train_dataset = train_dataset.map(
            prepare_dataset,
            remove_columns=train_dataset.column_names,
            batch_size=training_args.per_device_train_batch_size,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
        )
        log_timestamp("process data")
        train_dataset = train_dataset.map(
            get_length,
            num_proc=data_args.preprocessing_num_workers,
        )
        log_timestamp("add input length")
        train_dataset.save_to_disk(dataset_train_path)
        log_timestamp("save to disk")

    if not Path(dataset_eval_path).exists() and training_args.do_eval:
        eval_dataset = eval_dataset.map(
            speech_file_to_array_fn,
            remove_columns=eval_dataset.column_names,
            num_proc=data_args.preprocessing_num_workers,
        )
        eval_dataset = eval_dataset.filter(
            filter_by_duration,
            remove_columns=["duration"],
            num_proc=data_args.preprocessing_num_workers,
        )
        eval_dataset = eval_dataset.map(
            prepare_dataset,
            remove_columns=eval_dataset.column_names,
            batch_size=training_args.per_device_eval_batch_size,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
        )
        eval_dataset = eval_dataset.map(
            get_length,
            num_proc=data_args.preprocessing_num_workers,
        )
        eval_dataset.save_to_disk(dataset_eval_path)
    log_timestamp()

    if not Path(dataset_test_path).exists():
        test_dataset = test_dataset.map(
            speech_file_to_array_fn,
            num_proc=data_args.preprocessing_num_workers,
        )
        test_dataset = test_dataset.filter(filter_by_duration,
                                           remove_columns=["duration"])
        test_dataset.save_to_disk(dataset_test_path)
    log_timestamp()

    # Metrics
    cer_metric = datasets.load_metric("cer")
    # We use a custom WER that considers punctuation.
    wer_metric = datasets.load_metric("metrics/wer_punctuation.py")

    def compute_metrics(pred):
        pred_logits = pred.predictions
        pred_ids = np.argmax(pred_logits, axis=-1)
        pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
        pred_str = processor.batch_decode(pred_ids)
        # We do not want to group tokens when computing the metrics.
        label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
        cer = cer_metric.compute(predictions=pred_str, references=label_str)
        wer = wer_metric.compute(predictions=pred_str, references=label_str)
        return {"cer": cer, "wer": wer}

    log_timestamp()

    if model_args.freeze_feature_extractor:
        model.freeze_feature_extractor()
        log_timestamp("freeze feature extractor")

    # Data collator
    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
    log_timestamp("create data collator")

    # Initialize our Trainer
    trainer = CTCTrainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=processor.feature_extractor,
    )
    loss_nan_stopping_callback = LossNaNStoppingCallback()
    early_stopping_callback = EarlyStoppingCallback()
    timing_callback = TimingCallback()
    trainer.add_callback(loss_nan_stopping_callback)
    trainer.add_callback(early_stopping_callback)
    trainer.add_callback(timing_callback)

    # Training
    log_timestamp("setup trainer")
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        log_timestamp()
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        log_timestamp("train model")
        trainer.save_model()

        # Save the feature_extractor and the tokenizer.
        if is_main_process(training_args.local_rank):
            processor.save_pretrained(training_args.output_dir)

        metrics = train_result.metrics
        metrics["train_samples"] = len(train_dataset)
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Final test metrics
    logger.info("*** Test ***")
    log_timestamp()
    if loss_nan_stopping_callback.stopped:
        test_cer, test_wer = 1.0, 2.0
        logger.info(
            "Loss NaN detected, typically resulting in bad WER & CER so we won't calculate them."
        )
    else:

        def evaluate(batch):
            inputs = processor(batch["speech"],
                               sampling_rate=16_000,
                               return_tensors="pt",
                               padding=True)
            with torch.no_grad():
                logits = model(
                    inputs.input_values.to("cuda"),
                    attention_mask=inputs.attention_mask.to("cuda"),
                ).logits
            pred_ids = torch.argmax(logits, dim=-1)
            batch["pred_strings"] = processor.batch_decode(pred_ids)
            return batch

        model.to("cuda")
        # No need to cache the mapped test_dataset.
        datasets.set_caching_enabled(False)
        result = test_dataset.map(
            evaluate,
            batched=True,
            batch_size=training_args.per_device_eval_batch_size)
        log_timestamp("get test predictions")
        test_cer = cer_metric.compute(predictions=result["pred_strings"],
                                      references=result["text"])
        test_wer = wer_metric.compute(predictions=result["pred_strings"],
                                      references=result["text"])
        log_timestamp("compute test metrics")

    metrics = {"cer": test_cer, "wer": test_wer}
    wandb.log({f"test/{k}": v for k, v in metrics.items()})
    trainer.save_metrics("test", metrics)
    logger.info(metrics)

    # Save model files.
    log_timestamp()
    if not loss_nan_stopping_callback.stopped:
        artifact = wandb.Artifact(name=f"model-{wandb.run.id}",
                                  type="model",
                                  metadata={"cer": test_cer})
        for f in Path(training_args.output_dir).iterdir():
            if f.is_file():
                artifact.add_file(str(f))
        wandb.run.log_artifact(artifact)
        log_timestamp("log artifacts")
# In[15]:

import json

if from_start:
    with open('vocab.json', 'w') as vocab_file:
        json.dump(vocab_dict, vocab_file)

# In[16]:

from transformers import Wav2Vec2CTCTokenizer

if from_start:
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./",
                                                     unk_token="[UNK]",
                                                     pad_token="[PAD]",
                                                     word_delimiter_token="|")

# In[17]:

repo_name = "wav2vec2-large-xls-r-300m-irish-colab"

# In[16]:

from transformers import Wav2Vec2CTCTokenizer

if not from_start:
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(repo_name,
                                                     unk_token="[UNK]",
                                                     pad_token="[PAD]",
                                                     word_delimiter_token="|")
def convert_sew_checkpoint(checkpoint_path,
                           pytorch_dump_folder_path,
                           config_path=None,
                           dict_path=None,
                           is_finetuned=True):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    if is_finetuned:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path],
            arg_overrides={"data": "/".join(dict_path.split("/")[:-1])})
    else:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path])

    if config_path is not None:
        config = SEWDConfig.from_pretrained(config_path)
    else:
        config = convert_config(model[0])
    model = model[0].eval()

    return_attention_mask = True if config.feat_extract_norm == "layer" else False
    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=16000,
        padding_value=0,
        do_normalize=True,
        return_attention_mask=return_attention_mask,
    )

    if is_finetuned:
        if dict_path:
            target_dict = Dictionary.load(dict_path)

            # Important: change the bos & pad token ids, since the CTC symbol
            # is <pad> and not <s> as in fairseq.
            config.bos_token_id = target_dict.pad_index
            config.pad_token_id = target_dict.bos_index
            config.eos_token_id = target_dict.eos_index
            config.vocab_size = len(target_dict.symbols)
            vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
            if not os.path.isdir(pytorch_dump_folder_path):
                logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(
                    pytorch_dump_folder_path))
                return
            os.makedirs(pytorch_dump_folder_path, exist_ok=True)
            with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
                json.dump(target_dict.indices, vocab_handle)
            tokenizer = Wav2Vec2CTCTokenizer(
                vocab_path,
                unk_token=target_dict.unk_word,
                pad_token=target_dict.pad_word,
                bos_token=target_dict.bos_word,
                eos_token=target_dict.eos_word,
                word_delimiter_token="|",
                do_lower_case=False,
            )
            processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                                          tokenizer=tokenizer)
            processor.save_pretrained(pytorch_dump_folder_path)

        hf_model = SEWDForCTC(config)
    else:
        hf_model = SEWDModel(config)
        feature_extractor.save_pretrained(pytorch_dump_folder_path)

    recursively_load_weights(model, hf_model, is_finetuned)

    hf_model.save_pretrained(pytorch_dump_folder_path)
#%%
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

#%%
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
print(len(vocab_dict))

#%%
with open('./data/vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

#%%
tokenizer = Wav2Vec2CTCTokenizer("./data/vocab.json",
                                 unk_token="[UNK]",
                                 pad_token="[PAD]",
                                 word_delimiter_token="|")

#%%
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                              tokenizer=tokenizer)

#%% [markdown]
### Preprocess Data
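#%%
# Quick sanity check (not in the original notebook): the special ids the CTC
# model will rely on, taken from the tokenizer built above.
print(tokenizer.pad_token_id, tokenizer.unk_token_id)
print(tokenizer.word_delimiter_token)  # "|"
print(len(tokenizer))                  # vocabulary size passed to the model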
def call_huggingface(self, df):
    assert self.model_url != '', "Error! A model URL is needed for HuggingFace scoring, but --asr_download_model is empty"
    if self.tokenizer_url == '':
        print(f"Setting empty --tokenizer_url field identically to --asr_download_model: {self.model_url}")
        self.tokenizer_url = self.model_url

    if self.scoring_sorting == 'ascending':
        df = df.sort_values(by=['n_frames']).reset_index(drop=True)
    elif self.scoring_sorting == 'descending':
        df = df.sort_values(by=['n_frames'], ascending=False).reset_index(drop=True)
    elif self.scoring_sorting == '':
        pass
    else:
        raise NotImplementedError

    print(f"Preparing dataloader for manifest {self.manifest}...")
    dataset = AudioDataset(df)
    dataloader = DataLoader(dataset,
                            batch_size=self.batch_size,
                            collate_fn=dataset.collater,
                            num_workers=self.num_workers,
                            pin_memory=True)

    if self.hf_username == 'facebook':
        print(f"Downloading tokenizer: {self.tokenizer_url}")
        tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(self.tokenizer_url)
        print(f"Downloading model: {self.model_url}")
        model = Wav2Vec2ForCTC.from_pretrained(self.model_url)
    elif self.hf_username == 'speechbrain':
        if torch.cuda.is_available():
            run_opts = {"device": "cuda"}
        else:
            run_opts = {"device": "cpu"}
        print(f"Downloading model: {self.model_url}")
        model = EncoderDecoderASR.from_hparams(source=self.model_url,
                                               run_opts=run_opts,
                                               savedir=os.path.join('pretrained_models',
                                                                    self.hf_modelname))
    else:
        raise NotImplementedError

    model.eval()
    print("Scoring dataset...")
    df['wer'] = np.nan
    for batch in tqdm(dataloader):
        indexes, waveforms, transcripts, wav_lens = batch
        if self.hf_username == 'facebook':
            output_logits = model(waveforms.squeeze()).logits
            predicted_ids = torch.argmax(output_logits, dim=-1)
            pred_transcripts = tokenizer.batch_decode(predicted_ids)
        elif self.hf_username == 'speechbrain':
            waveforms = waveforms.squeeze()
            #waveforms = model.audio_normalizer(waveforms, self.sampling_rate)
            pred_transcripts = model.transcribe_batch(waveforms, wav_lens)[0]
        for index, ref in enumerate(transcripts):
            sample_id = indexes[index]
            pred = pred_transcripts[index]
            measures = jiwer.compute_measures(ref, pred)
            wer = measures['wer'] * 100.0
            assert (
                ref == df.loc[int(sample_id), 'tgt_text']
            ), "The reference text indicated by the sample ID in the transcripts file does not match with the one stored in the dataset!"
            df.at[int(sample_id), 'wer'] = wer
    return df
save_path = "/media/nas/samir-data/wav2vec2_models"
#os.mkdir(save_path)

with open(save_path + '/hyperparameters.json', 'w') as f:
    json.dump(hyperparameters, f)

wer_metric = load_metric("wer")

# Generators
training_set = AudioDataset(train_annotation_file, train_data_folder, MAX_LEN)
training_generator = torch.utils.data.DataLoader(training_set, **hyperparameters)

validation_set = AudioDataset(validation_annotation_file, validation_data_folder, MAX_LEN)
validation_generator = torch.utils.data.DataLoader(validation_set, **hyperparameters)

print('LOAD TOKENIZER...')
tokenizer = Wav2Vec2CTCTokenizer("vocab.json",
                                 unk_token="<unk>",
                                 pad_token="<pad>",
                                 word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                             sampling_rate=16000,
                                             padding_value=0.0,
                                             do_normalize=True,
                                             return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                              tokenizer=tokenizer)

nb_labels = 37

print('INITIALIZING MODEL...')
wav2vec2_model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    # Assumed completion of a truncated call: pad token id and vocab size,
    # following the same from_pretrained pattern used elsewhere in this code.
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=nb_labels,
)
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Detect the last checkpoint.
    last_checkpoint = None
    if (os.path.isdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}")
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets:
    train_dataset = datasets.load_dataset(
        "tommy19970714/jsut_asr_hiragana",
        data_args.dataset_config_name,
        split="validation[:80%]",
        cache_dir=model_args.cache_dir,
        use_auth_token=True,
    )
    eval_dataset = datasets.load_dataset(
        "tommy19970714/jsut_asr_hiragana",
        data_args.dataset_config_name,
        split="validation[80%:]",
        cache_dir=model_args.cache_dir,
        use_auth_token=True,
    )

    # Align the column names of the Japanese dataset.
    train_dataset = train_dataset.remove_columns(["id"])
    train_dataset = train_dataset.rename_column("file", "path")
    train_dataset = train_dataset.rename_column("text", "sentence")
    eval_dataset = eval_dataset.remove_columns(["id"])
    eval_dataset = eval_dataset.rename_column("file", "path")
    eval_dataset = eval_dataset.rename_column("text", "sentence")

    # Create and save the tokenizer.
    chars_to_ignore_regex = f'[{"".join(data_args.chars_to_ignore)}]'

    def remove_special_characters(batch):
        batch["text"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).lower() + " "
        return batch

    train_dataset = train_dataset.map(remove_special_characters,
                                      remove_columns=["sentence"])
    eval_dataset = eval_dataset.map(remove_special_characters,
                                    remove_columns=["sentence"])

    def extract_all_chars(batch):
        all_text = " ".join(batch["text"])
        vocab = list(set(all_text))
        return {"vocab": [vocab], "all_text": [all_text]}

    vocab_train = train_dataset.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        keep_in_memory=True,
        remove_columns=train_dataset.column_names,
    )
    vocab_test = eval_dataset.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        keep_in_memory=True,
        remove_columns=eval_dataset.column_names,
    )
    vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
    vocab_dict = {v: k for k, v in enumerate(vocab_list)}
    vocab_dict["|"] = vocab_dict[" "]
    del vocab_dict[" "]
    vocab_dict["[UNK]"] = len(vocab_dict)
    vocab_dict["[PAD]"] = len(vocab_dict)
    with open("vocab.json", "w") as vocab_file:
        json.dump(vocab_dict, vocab_file)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    tokenizer = Wav2Vec2CTCTokenizer(
        "vocab.json",
        unk_token="[UNK]",
        pad_token="[PAD]",
        word_delimiter_token="|",
    )
    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=16_000,
        padding_value=0.0,
        do_normalize=True,
        return_attention_mask=True,
    )
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                                  tokenizer=tokenizer)
    model = Wav2Vec2ForCTC.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        activation_dropout=model_args.activation_dropout,
        attention_dropout=model_args.attention_dropout,
        hidden_dropout=model_args.hidden_dropout,
        feat_proj_dropout=model_args.feat_proj_dropout,
        mask_time_prob=model_args.mask_time_prob,
        gradient_checkpointing=model_args.gradient_checkpointing,
        layerdrop=model_args.layerdrop,
        ctc_loss_reduction="mean",
        pad_token_id=processor.tokenizer.pad_token_id,
        vocab_size=len(processor.tokenizer),
        use_auth_token=True,
    )

    if data_args.max_train_samples is not None:
        train_dataset = train_dataset.select(range(data_args.max_train_samples))
    if data_args.max_val_samples is not None:
        eval_dataset = eval_dataset.select(range(data_args.max_val_samples))

    # The Japanese dataset is already at 16 kHz, so this resampler is a no-op.
    resampler = torchaudio.transforms.Resample(16_000, 16_000)

    # Preprocessing the datasets.
    # We need to read the audio files as arrays and tokenize the targets.
    def speech_file_to_array_fn(batch):
        speech_array, sampling_rate = torchaudio.load(batch["path"])
        batch["speech"] = resampler(speech_array).squeeze().numpy()
        batch["sampling_rate"] = 16_000
        batch["target_text"] = batch["text"]
        return batch

    train_dataset = train_dataset.map(
        speech_file_to_array_fn,
        remove_columns=train_dataset.column_names,
        num_proc=data_args.preprocessing_num_workers,
    )
    eval_dataset = eval_dataset.map(
        speech_file_to_array_fn,
        remove_columns=eval_dataset.column_names,
        num_proc=data_args.preprocessing_num_workers,
    )

    def prepare_dataset(batch):
        # Check that all files have the correct sampling rate.
        assert (
            len(set(batch["sampling_rate"])) == 1
        ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
        batch["input_values"] = processor(
            batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
        # Set up the processor for targets.
        with processor.as_target_processor():
            batch["labels"] = processor(batch["target_text"]).input_ids
        return batch

    train_dataset = train_dataset.map(
        prepare_dataset,
        remove_columns=train_dataset.column_names,
        batch_size=training_args.per_device_train_batch_size,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
    )
    eval_dataset = eval_dataset.map(
        prepare_dataset,
        remove_columns=eval_dataset.column_names,
        batch_size=training_args.per_device_train_batch_size,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
    )

    # Metric
    wer_metric = datasets.load_metric("wer")

    def compute_metrics(pred):
        pred_logits = pred.predictions
        pred_ids = np.argmax(pred_logits, axis=-1)
        pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
        pred_str = processor.batch_decode(pred_ids)
        # We do not want to group tokens when computing the metrics.
        label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
        wer = wer_metric.compute(predictions=pred_str, references=label_str)
        return {"wer": wer}

    if model_args.freeze_feature_extractor:
        model.freeze_feature_extractor()

    # Data collator
    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

    # Initialize our Trainer
    trainer = CTCTrainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=processor.feature_extractor,
    )

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()

        # Save the feature_extractor and the tokenizer.
        if is_main_process(training_args.local_rank):
            processor.save_pretrained(training_args.output_dir)

        metrics = train_result.metrics
        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None
                             else len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()
        max_val_samples = (data_args.max_val_samples
                           if data_args.max_val_samples is not None
                           else len(eval_dataset))
        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)
    return results
def create_processor(path: str = "./vocab.json"):
    tokenizer = Wav2Vec2CTCTokenizer(path,
                                     unk_token="[UNK]",
                                     pad_token="[PAD]",
                                     word_delimiter_token="|")
    feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                                 sampling_rate=16000,
                                                 padding_value=0.0,
                                                 do_normalize=True,
                                                 return_attention_mask=True)
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
    return processor
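# A possible usage of create_processor on a padded batch (not in the
# original); the zero arrays stand in for real 16 kHz recordings.
import numpy as np

processor = create_processor("./vocab.json")
speech = [np.zeros(16000, dtype=np.float32), np.zeros(8000, dtype=np.float32)]
batch = processor(speech, sampling_rate=16000, padding=True, return_tensors="pt")
print(batch.input_values.shape)    # (2, 16000) after padding to the longest
print(batch.attention_mask.shape)  # present because return_attention_mask=True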
timit = timit.map(remove_special_characters)

vocabs = timit.map(extract_all_chars,
                   batched=True,
                   batch_size=-1,
                   keep_in_memory=True,
                   remove_columns=timit.column_names['train'])
vocab_list = list(set(vocabs['train']['vocab'][0]) | set(vocabs['test']['vocab'][0]))
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]
vocab_dict['[UNK]'] = len(vocab_dict)
vocab_dict['[PAD]'] = len(vocab_dict)

with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

tokenizer = Wav2Vec2CTCTokenizer('./vocab.json',
                                 unk_token="[UNK]",
                                 pad_token="[PAD]",
                                 word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                             sampling_rate=16000,
                                             padding_value=0.0,
                                             do_normalize=True,
                                             return_attention_mask=False)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                              tokenizer=tokenizer)

timit = timit.map(speech_file_to_array_fn,
                  remove_columns=timit.column_names['train'],
                  num_proc=4)
timit_prepared = timit.map(prepare_dataset,
                           remove_columns=timit.column_names['train'],
                           batch_size=8,
                           num_proc=4,
                           batched=True)

with open('timit_prepared.pkl', 'wb') as f:
    pickle.dump(timit_prepared, f)
with open('processor.pkl', 'wb') as f:
    pickle.dump(processor, f)
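# Sketch (an assumption, not in the original): reloading the pickled
# artifacts in a later session. Note that processor.save_pretrained() /
# from_pretrained() is the more portable route than pickling.
import pickle

with open('processor.pkl', 'rb') as f:
    processor = pickle.load(f)
with open('timit_prepared.pkl', 'rb') as f:
    timit_prepared = pickle.load(f)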
def __setstate__(self, state):
    self.__dict__.update(state)
    self._tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(self.backbone)
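# __setstate__ above rebuilds the tokenizer from the stored backbone name,
# which implies a __getstate__ that drops the live tokenizer before pickling.
# A plausible counterpart, assumed rather than taken from the original:
def __getstate__(self):
    state = self.__dict__.copy()
    state.pop("_tokenizer", None)  # rebuilt on load by __setstate__
    return state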
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}


# dir_path = "/media/nas/samir-data/wav2vec2_models/inputs"
dir_path = "/media/nas/samir-data/asr_transformers"
test_data_folder = "/media/nas/CORPUS_FINAL/Corpus_audio/Corpus_FR/COMMONVOICE/common-voice-fr/clips"
test_annotation_file = "/media/nas/CORPUS_FINAL/Corpus_audio/Corpus_FR/COMMONVOICE/common-voice-fr/test1.tsv"

test_set = AudioDataset(test_annotation_file, test_data_folder, MAX_LEN)
test_generator = torch.utils.data.DataLoader(test_set, batch_size=1)

processor = Wav2Vec2Processor.from_pretrained(
    "/media/nas/samir-data/wav2vec2_models/checkpoint-94000")
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    "/media/nas/samir-data/wav2vec2_models/checkpoint-94000")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53-french")
model.eval()

wer_metric = load_metric("wer")

# Initialize the predictions.
predictions = []
references = []
for audio_file in test_generator:
    # for block in sf.blocks(audio_file["input_values"], blocksize=50000):
    input_dict = processor(np.squeeze(audio_file["input_values"], 0),
                           return_attention_mask=False,
                           return_tensors="pt")
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
print(len(vocab_dict))
print(vocab_dict)

with open('vocab.%s.json' % language, 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

print("\nConstructing tokenizer")
tokenizer = Wav2Vec2CTCTokenizer("./vocab.%s.json" % language,
                                 unk_token="[UNK]",
                                 pad_token="[PAD]",
                                 word_delimiter_token="|")

print("\nFeature Extractor")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                             sampling_rate=16000,
                                             padding_value=0.0,
                                             do_normalize=True,
                                             return_attention_mask=True)

print("\nConstructing Processor")
processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                              tokenizer=tokenizer)
processor.save_pretrained(output_dir)

print("\nCreating array from speech files")
def get_tokenizer(self, **kwargs):
    kwargs.update(self.special_tokens_map)
    return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def tokenizer(self):
    if self.backbone is not None and self.backbone != self._backbone:
        self._tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(self.backbone)
        self._backbone = self.backbone
    return self._tokenizer
def __init__(self, backbone: str):
    super().__init__()
    self.backbone = backbone
    self._tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(self.backbone)
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict["_"] = vocab_dict[" "]
del vocab_dict[" "]
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

import json
with open('vocab_english.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

from transformers import Wav2Vec2CTCTokenizer
tokenizer = Wav2Vec2CTCTokenizer("./vocab_english.json",
                                 unk_token="[UNK]",
                                 pad_token="[PAD]",
                                 word_delimiter_token="_")

from transformers import Wav2Vec2FeatureExtractor
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                             sampling_rate=16000,
                                             padding_value=0.0,
                                             do_normalize=True,
                                             return_attention_mask=False)

from transformers import Wav2Vec2Processor
processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                              tokenizer=tokenizer)

print(timit["train"][0])
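# A small round-trip check (not in the original): the "_" word delimiter maps
# back to spaces on decoding, and group_tokens=False avoids the CTC-style
# collapsing of repeated letters. Assumes the characters are in the vocab.
ids = tokenizer("the cat").input_ids
print(tokenizer.decode(ids, group_tokens=False))  # -> "the cat"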