def _init_model(self, config: EasyDict):
    if not config.common.checkpoint_model:
        print(f"Loading pretrained model {config.common.pretrained_model}")
        config.model.pretrained_model_name_or_path = config.common.pretrained_model
        config.model.pad_token_id = self._processor.tokenizer.pad_token_id
        config.model.vocab_size = len(self._processor.tokenizer)
        self._model = Wav2Vec2ForCTC.from_pretrained(**config.model)
    else:
        print(f"Loading from checkpoint {config.common.checkpoint_model}")
        self._model = Wav2Vec2ForCTC.from_pretrained(
            config.common.checkpoint_model).to("cuda")
def load_asr_model(device):
    """Load the pre-trained ASR model and its tokenizer."""
    print(f"[INFO]: Load the pre-trained ASR by {ASR_PRETRAINED_MODEL}.")
    model = Wav2Vec2ForCTC.from_pretrained(ASR_PRETRAINED_MODEL).to(device)
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(ASR_PRETRAINED_MODEL)
    models = {"model": model, "tokenizer": tokenizer}
    return models
def main(input_pipe, output_pipe):
    source = ZeroMQSource(input_pipe)
    sink = ZeroMQSink(output_pipe)
    debug('[+] loading processor')
    processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-base-960h')
    debug('[+] loading model')
    model = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base-960h')
    sink.signal_ready()
    try:
        while True:
            debug('>>> waiting for connection')
            # torchaudio seems to expect complete files, so send small parts
            buf = io.BytesIO(source.recv())
            start = time.time()
            debug('[+] converting audio')
            wav = io.BytesIO(webm_to_wav(buf.read()))
            waveform, sample_rate = torchaudio.load(wav)
            waveform = waveform[0]  # Wav2Vec2Processor expects mono 16kHz audio
            debug('[+] input_values')
            input_values = processor(waveform,
                                     sampling_rate=sample_rate,
                                     return_tensors='pt').input_values
            debug('[+] logits')
            logits = model(input_values).logits
            debug('[+] predicted_ids')
            predicted_ids = torch.argmax(logits, dim=-1)
            debug('[+] transcription')
            transcription = processor.batch_decode(predicted_ids)[0]
            duration = time.time() - start
            sink.send(start, duration, transcription)
    finally:
        source.close()
        sink.close()
def main():
    config = configparser.ConfigParser()
    config.read('config.ini')

    # Initialize tokenizer and model from HuggingFace
    tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")

    if config.getboolean('config', 'from_microphone'):
        # Record from microphone and transcribe
        audio = record_from_mic(config)
        transcriptions = wav2vec2_inference(audio, tokenizer, model)
        print(f"Transcribed audio: {transcriptions}")
        if config.getboolean('config', 'save_transcriptions'):
            with open('mic_transcription.txt', 'w') as file:
                file.write(transcriptions)
            print("Transcribed audio stored in mic_transcription.txt")
    else:
        # Transcribe the files listed in the configuration file
        audio_files = json.loads(config.get('config', 'audio_files'))
        for audio_file in audio_files:
            audio, _ = sf.read(audio_file, dtype='float32')
            transcriptions = wav2vec2_inference(audio, tokenizer, model)
            print(f"Transcribed audio: {transcriptions}")
            if config.getboolean('config', 'save_transcriptions'):
                with open(f'{Path(audio_file).stem}.txt', 'w') as file:
                    file.write(transcriptions)
                print(f"Transcribed audio stored in {Path(audio_file).stem}.txt")
def __init__(self):
    super(ASR_CTC, self).__init__()
    self.tokenizer = Wav2Vec2CTCTokenizer("./vocab.json",
                                          unk_token="<unk>",
                                          pad_token="<pad>",
                                          word_delimiter_token="|")
    self.feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                                      sampling_rate=16000,
                                                      padding_value=0.0,
                                                      do_normalize=True,
                                                      return_attention_mask=True)
    self.processor = Wav2Vec2Processor(feature_extractor=self.feature_extractor,
                                       tokenizer=self.tokenizer)
    self.wav2Vec2ForCTC = Wav2Vec2ForCTC.from_pretrained(
        "facebook/wav2vec2-large-xlsr-53",
        attention_dropout=0.1,
        hidden_dropout=0.1,
        feat_proj_dropout=0.0,
        mask_time_prob=0.05,
        layerdrop=0.1,
        gradient_checkpointing=True,
        ctc_loss_reduction="mean",
        pad_token_id=self.processor.tokenizer.pad_token_id,
        vocab_size=len(self.processor.tokenizer))
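# Sketch (not from the source): the processor composed above from a custom
# vocab.json can be persisted and restored with the standard
# save_pretrained/from_pretrained API, so the custom vocabulary travels with
# fine-tuned checkpoints. The "./asr_processor" path is illustrative.
from transformers import Wav2Vec2Processor

asr = ASR_CTC()
asr.processor.save_pretrained("./asr_processor")  # writes tokenizer and feature-extractor config files
reloaded_processor = Wav2Vec2Processor.from_pretrained("./asr_processor")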
def load_model():
    model_name = "facebook/wav2vec2-large-xlsr-53-french"
    model = Wav2Vec2ForCTC.from_pretrained(model_name)
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    # NOTE: orig_freq == new_freq makes this resampler a no-op; set orig_freq
    # to the source sample rate if the input audio is not already 16 kHz
    resampler = torchaudio.transforms.Resample(orig_freq=16_000, new_freq=16_000)
    return model, processor, resampler
def test_simple_wav2vec2(self):
    import numpy as np
    from datasets import load_dataset

    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
    feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
    asr = AutomaticSpeechRecognitionPipeline(model=model,
                                             tokenizer=tokenizer,
                                             feature_extractor=feature_extractor)

    waveform = np.zeros((34000,))
    output = asr(waveform)
    self.assertEqual(output, {"text": ""})

    ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
    filename = ds[0]["file"]
    output = asr(filename)
    self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})

    filename = ds[0]["file"]
    with open(filename, "rb") as f:
        data = f.read()
    output = asr(data)
    self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
def __init__(self,
             subtitle_lookup_path,
             wav_dir,
             wav2vec_checkpoint,
             pretrained_chpt="facebook/wav2vec2-base-960h",
             num_proc=4,
             preprocess_batch_size=8):
    """
    Args:
        subtitle_lookup_path: path to the subtitle lookup table
            (NOTE: this file must be compatible with pd.read_csv())
        wav_dir: directory of wav files, e.g. './wav_data/'
        wav2vec_checkpoint: path to a previously preprocessed dataset on disk
        pretrained_chpt: pretrained checkpoint to load
        num_proc: number of processes allowed when doing dataset preprocessing
        preprocess_batch_size: ONLY used inside this dataset to preprocess
            wav files faster
    """
    self.subtitle_lookup_path = subtitle_lookup_path
    self.wav_dir = wav_dir
    self.num_proc = num_proc
    self.preprocess_batch_size = preprocess_batch_size
    self.processor = Wav2Vec2Processor.from_pretrained(pretrained_chpt)
    self.feature_extractor = Wav2Vec2ForCTC.from_pretrained(
        pretrained_chpt).wav2vec2.feature_extractor
    self.knnw_prepared = self.setup_dataset()
def check_training(self, config, input_values, *args):
    config.ctc_zero_infinity = True
    model = Wav2Vec2ForCTC(config=config)
    model.to(torch_device)
    model.train()

    # freeze feature encoder
    model.freeze_feature_extractor()

    input_values = input_values[:3]
    input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
    max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
    labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2),
                        model.config.vocab_size)

    # pad input
    for i in range(len(input_lengths)):
        input_values[i, input_lengths[i]:] = 0.0
        if max_length_labels[i] < labels.shape[-1]:
            # it's important that we make sure that target lengths are at least
            # one shorter than logit lengths to prevent -inf
            labels[i, max_length_labels[i] - 1:] = -100

    loss = model(input_values, labels=labels).loss
    self.parent.assertFalse(torch.isinf(loss).item())
    loss.backward()
def test_inference_ctc_normal_batched(self):
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    model.to(torch_device)
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h",
                                                  do_lower_case=True)

    input_speech = self._load_datasamples(2)
    inputs = processor(input_speech, return_tensors="pt", padding=True, truncation=True)
    input_values = inputs.input_values.to(torch_device)

    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    predicted_trans = processor.batch_decode(predicted_ids)

    EXPECTED_TRANSCRIPTIONS = [
        "a man said to the universe sir i exist",
        "sweat covered brion's body trickling into the tight lowing cloth that was the only garment he wore",
    ]
    self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
def get_predictions(test_dir_root: str, bs: int, extra_step: float, loading_step: float) -> None:
    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

    # load model and tokenizer
    model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME).eval().to(device)
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(MODEL_NAME)

    test_dir_root = Path(test_dir_root)

    # iterate over the files in the correct order
    with open(test_dir_root / "FILE_ORDER", "r") as f:
        wav_file_order = f.read().splitlines()

    token_predictions = {}
    for wf in wav_file_order:
        wf = f"{wf}.wav"
        print(f"Generating token predictions for {wf}")
        path_to_wav = test_dir_root / "wavs" / wf
        token_predictions[wf] = get_preds_for_wav(model, tokenizer, device, bs,
                                                  path_to_wav, extra_step, loading_step)

    test_dir_root.mkdir(parents=True, exist_ok=True)
    path_to_preds = test_dir_root / "token_predictions.json"
    with open(path_to_preds, "w") as f:
        json.dump(token_predictions, f)
    print(f"Wav2Vec predictions saved at {path_to_preds}")
def test_simple_wav2vec2(self):
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
    feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
    asr = AutomaticSpeechRecognitionPipeline(model=model,
                                             tokenizer=tokenizer,
                                             feature_extractor=feature_extractor)

    waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
    output = asr(waveform)
    self.assertEqual(output, {"text": ""})

    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean",
                      split="validation").sort("id")
    filename = ds[40]["file"]
    output = asr(filename)
    self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})

    filename = ds[40]["file"]
    with open(filename, "rb") as f:
        data = f.read()
    output = asr(data)
    self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
def __init__(self, model_name="facebook/wav2vec2-base-960h", device=None, half=False): """ ``` basic wrapper speech transcription Args: model_name(str): Helsinki-NLP model device(str): device to use (e.g., 'cuda', 'cpu') half(bool): If True, use half precision. ``` """ if not TORCH: raise ImportError('Transcriber requires PyTorch to be installed.') self.torch_device = device if self.torch_device is None: self.torch_device = 'cuda' if torch.cuda.is_available() else 'cpu' #if not SOUNDFILE: #raise ImportError("SoundFile library not installed or libsndfile not found: pip install soundfile") if not LIBROSA: raise ImportError( "librosa library must be installed: pip install librosa. Conda users may also have to install ffmpeg: conda install -c conda-forge ffmpeg" ) # load model and processor self.model = Wav2Vec2ForCTC.from_pretrained(model_name).to( self.torch_device) self.processor = Wav2Vec2Processor.from_pretrained(model_name) if half: self.model = self.model.half()
def _get_model(model_id):
    from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

    tokenizer = Wav2Vec2Processor.from_pretrained(model_id).tokenizer
    labels = [k for k, v in sorted(tokenizer.get_vocab().items(), key=lambda kv: kv[1])]
    original = Wav2Vec2ForCTC.from_pretrained(model_id)
    model = import_huggingface_model(original)
    return model.eval(), labels
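# Usage sketch for _get_model (assumptions: import_huggingface_model comes from
# torchaudio.models.wav2vec2.utils, and the input is a 16 kHz mono waveform of
# shape (batch, time)). The torchaudio wrapper's forward returns
# (emissions, lengths), where emissions are CTC logits over `labels`.
import torch

model, labels = _get_model("facebook/wav2vec2-base-960h")
waveform = torch.zeros(1, 16_000)  # one second of silence as a stand-in input
with torch.no_grad():
    emissions, _ = model(waveform)
best_ids = emissions.argmax(dim=-1)  # greedy CTC path over the label set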
def convert_to_onnx(model_id_or_path, onnx_model_name):
    print(f"Converting {model_id_or_path} to onnx")
    model = Wav2Vec2ForCTC.from_pretrained(model_id_or_path)
    audio_len = 250000
    x = torch.randn(1, audio_len, requires_grad=True)
    torch.onnx.export(
        model,                     # model being run
        x,                         # model input (or a tuple for multiple inputs)
        onnx_model_name,           # where to save the model (can be a file or file-like object)
        export_params=True,        # store the trained parameter weights inside the model file
        opset_version=11,          # the ONNX version to export the model to
        do_constant_folding=True,  # whether to execute constant folding for optimization
        input_names=['input'],     # the model's input names
        output_names=['output'],   # the model's output names
        dynamic_axes={'input': {1: 'audio_len'},    # variable length axes
                      'output': {1: 'audio_len'}})
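# Once exported, the graph can be run without PyTorch. A minimal sketch assuming
# onnxruntime is installed; "wav2vec2.onnx" is an illustrative file name, and the
# "input"/"output" names match those passed to torch.onnx.export above.
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("wav2vec2.onnx")
audio = np.random.randn(1, 16_000).astype(np.float32)  # dynamic axis 1 accepts any length
logits = session.run(["output"], {"input": audio})[0]
predicted_ids = logits.argmax(axis=-1)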
def test_chunking(self):
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
    feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
    speech_recognizer = pipeline(
        task="automatic-speech-recognition",
        model=model,
        tokenizer=tokenizer,
        feature_extractor=feature_extractor,
        framework="pt",
        chunk_length_s=10.0,
    )

    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean",
                      split="validation").sort("id")
    audio = ds[40]["audio"]["array"]

    n_repeats = 10
    audio = np.tile(audio, n_repeats)
    output = speech_recognizer([audio], batch_size=2)
    expected_text = "A MAN SAID TO THE UNIVERSE SIR I EXIST " * n_repeats
    expected = [{"text": expected_text.strip()}]
    self.assertEqual(output, expected)
def __init__(self, path="facebook/wav2vec2-base-960h", quantize=False, gpu=True, batch=64): """ Constructs a new transcription pipeline. Args: path: optional path to model, accepts Hugging Face model hub id or local path, uses default model for task if not provided quantize: if model should be quantized, defaults to False gpu: True/False if GPU should be enabled, also supports a GPU device id batch: batch size used to incrementally process content langdetect: path to language detection model, uses a default path if not provided """ # Call parent constructor super().__init__(path, quantize, gpu, batch) if not SOUNDFILE: raise ImportError( "SoundFile library not installed or libsndfile not found") # load model and processor self.model = Wav2Vec2ForCTC.from_pretrained(self.path) self.processor = Wav2Vec2Processor.from_pretrained(self.path) # Move model to device self.model.to(self.device)
def get_model(tokenizer, n_langs=2):
    """Constructs the model with ASR and language identification from the base
    Wav2Vec2 model by modifying the last lm_head layer.

    Args:
        tokenizer: The tokenizer whose length is all the alphabets that the model can predict.
        n_langs: The number of different languages the model needs to distinguish between.

    Returns:
        The constructed model, having len(tokenizer)+n_langs+1 outputs in the last layer.
    """
    model = Wav2Vec2ForCTC.from_pretrained(config.model)
    pt_wts = model.lm_head.weight
    pt_bias = model.lm_head.bias

    # new head: tokenizer vocabulary plus n_langs + 1 extra outputs when n_langs > 1
    new_lm_head = nn.Linear(pt_wts.shape[1],
                            len(tokenizer) + (0 if n_langs <= 1 else n_langs + 1))

    # initialise the new head from the pretrained weights; extra rows start at the mean
    init_wts = new_lm_head.weight.clone().detach()
    init_bs = new_lm_head.bias.clone().detach()
    init_wts[:pt_wts.shape[0], :] = pt_wts.clone().detach()
    init_wts[pt_wts.shape[0]:, :] = torch.mean(pt_wts.clone().detach(), dim=0)
    init_bs[:pt_bias.shape[0]] = pt_bias.clone().detach()
    init_bs[pt_wts.shape[0]:] = torch.mean(pt_bias.clone().detach(), dim=0)
    with torch.no_grad():
        new_lm_head.weight = nn.Parameter(init_wts)
        new_lm_head.bias = nn.Parameter(init_bs)
    model.lm_head = new_lm_head
    return model.to(config.device)
def __init__(self, model_name='te'):
    model_name = model_name.lower()
    for x, y in LANGUAGE_ALISASES.items():
        model_name = model_name.replace(x, y)

    # aliases were already resolved above, so only MODEL_URLS membership matters here
    if model_name not in MODEL_URLS:
        print(f"model_name should be one of {list(MODEL_URLS.keys())}")
        return

    home = os.path.expanduser("~")
    lang_path = os.path.join(home, ".IndicASR_" + model_name)
    if not os.path.exists(lang_path):
        os.mkdir(lang_path)

    for file_name, url in MODEL_URLS[model_name].items():
        file_path = os.path.join(lang_path, file_name)
        if os.path.exists(file_path):
            continue
        print(f"Downloading {file_name}")
        pydload.dload(url=url, save_to_path=file_path, max_time=None)

    self.processor = Wav2Vec2Processor.from_pretrained(lang_path)
    self.model = Wav2Vec2ForCTC.from_pretrained(lang_path)

    if torch.cuda.is_available():
        print("Using GPU")
        self.model = self.model.cuda()
def convert_wav2vec2_checkpoint(checkpoint_path,
                                pytorch_dump_folder_path,
                                config_path=None,
                                dict_path=None,
                                is_finetuned=True):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    if config_path is not None:
        config = Wav2Vec2Config.from_pretrained(config_path)
    else:
        config = Wav2Vec2Config()

    if is_finetuned:
        hf_wav2vec = Wav2Vec2ForCTC(config)
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path], arg_overrides={"data": dict_path})
    else:
        hf_wav2vec = Wav2Vec2Model(config)
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])

    model = model[0].eval()
    recursively_load_weights(model, hf_wav2vec, is_finetuned)
    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
def test_inference_ctc_robust_batched(self):
    model = Wav2Vec2ForCTC.from_pretrained(
        "facebook/wav2vec2-large-960h-lv60-self").to(torch_device)
    processor = Wav2Vec2Processor.from_pretrained(
        "facebook/wav2vec2-large-960h-lv60-self", do_lower_case=True)

    input_speech = self._load_datasamples(4)
    inputs = processor(input_speech, return_tensors="pt", padding=True, truncation=True)
    input_values = inputs.input_values.to(torch_device)
    attention_mask = inputs.attention_mask.to(torch_device)

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    predicted_trans = processor.batch_decode(predicted_ids)

    EXPECTED_TRANSCRIPTIONS = [
        "a man said to the universe sir i exist",
        "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
        "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about",
        "his instant panic was followed by a small sharp blow high on his chest",
    ]
    self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
def check_ctc_loss(self, config, input_values, *args):
    model = Wav2Vec2ForCTC(config=config)
    model.to(torch_device)

    # make sure that dropout is disabled
    model.eval()

    input_values = input_values[:3]
    attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long)

    input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
    max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
    labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1),
                        model.config.vocab_size)

    # pad input
    for i in range(len(input_lengths)):
        input_values[i, input_lengths[i]:] = 0.0
        attention_mask[i, input_lengths[i]:] = 0

    model.config.ctc_loss_reduction = "sum"
    sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()

    model.config.ctc_loss_reduction = "mean"
    mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()

    self.parent.assertTrue(isinstance(sum_loss, float))
    self.parent.assertTrue(isinstance(mean_loss, float))
def load(self):
    if not hasattr(self, 'processor'):
        self.processor = Wav2Vec2Processor.from_pretrained(self.recognizer_dir)
    if self.load_model:
        self.model = Wav2Vec2ForCTC.from_pretrained(self.recognizer_dir)
        if self.use_cuda:
            self.model = self.model.to("cuda")
def __init__(self):
    self.REQUIRED_SAMPLE_RATE = 16000

    # Use Facebook's pretrained Wav2Vec2 model
    # https://huggingface.co/facebook/wav2vec2-base-960h
    PRETRAINED_MODEL = 'facebook/wav2vec2-base-960h'
    self.processor = Wav2Vec2Processor.from_pretrained(PRETRAINED_MODEL)
    self.model = Wav2Vec2ForCTC.from_pretrained(PRETRAINED_MODEL)
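# A minimal companion sketch (assumption, not part of the original class): the
# canonical Wav2Vec2 CTC inference flow for the processor/model pair above is
# processor -> model -> argmax -> batch_decode. The `transcribe` helper and its
# 16 kHz mono float waveform input are illustrative.
import torch

def transcribe(self, waveform):
    inputs = self.processor(waveform,
                            sampling_rate=self.REQUIRED_SAMPLE_RATE,
                            return_tensors="pt")
    with torch.no_grad():
        logits = self.model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return self.processor.batch_decode(predicted_ids)[0]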
def init(self):
    self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
    assert self.processor.feature_extractor.do_normalize is True
    self.model = Wav2Vec2ForCTC.from_pretrained(self.model_name)
    target_dictionary = list(self.processor.tokenizer.get_vocab().keys())
    print(f"target_dictionary: {target_dictionary}")
    self.decoder = GreedyDecoder(target_dictionary).init()
    return self
def evaluate_asr():
    def read_txt(txt_path):
        # each line is "path|sentence": read whole lines, then split on the pipe
        data = pd.read_csv(txt_path, delimiter='\n', header=None, names=['path', 'sentence'])
        has_pipe = data['path'].str.contains('|', regex=False)
        data[['path', 'sentence']] = data.loc[has_pipe, 'path'].str.split('|', expand=True)
        data = Dataset.from_pandas(data)
        return data

    test = read_txt('./data/speech-sme-asr/test_asr.txt')
    processor = Wav2Vec2Processor.from_pretrained("asr_output/pretrained_processor")
    model = Wav2Vec2ForCTC.from_pretrained("asr_output/checkpoint-27363").to("cpu")

    def speech_file_to_array_fn(batch):
        speech_array, sampling_rate = torchaudio.load('./data/' + batch["path"])
        batch["speech"] = speech_array[0].numpy()
        return batch

    test_dataset = test.map(speech_file_to_array_fn)
    input_dict = processor(test_dataset['speech'][:11], sampling_rate=16000,
                           return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(input_dict.input_values.to("cpu")).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    print("Prediction:", processor.batch_decode(predicted_ids))
    print("Reference:", test_dataset["sentence"][:11])

    wer = load_metric("wer")

    def evaluate_batch(batch):
        inputs = processor(batch["speech"], sampling_rate=16_000,
                           return_tensors="pt", padding=True)
        with torch.no_grad():
            logits = model(inputs.input_values.to("cpu"),
                           attention_mask=inputs.attention_mask.to("cpu")).logits
        pred_ids = torch.argmax(logits, dim=-1)
        batch["pred_strings"] = processor.batch_decode(pred_ids)
        return batch

    # batch_size=8 -> requires ~14.5GB GPU memory
    result = test_dataset.map(evaluate_batch, batched=True, batch_size=8)
    msg = "WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"],
                                                 references=result["sentence"]))
    print(msg)
    return msg
def process(PATH):
    audio, sampling_rate = librosa.load(PATH, sr=16000)
    tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    input_values = tokenizer(audio, return_tensors='pt').input_values
    logits = model(input_values).logits
    prediction = torch.argmax(logits, dim=-1)
    transcription = tokenizer.batch_decode(prediction)[0]
    print(transcription)
    return
def test_mask_time_prob_ctc(self):
    model = Wav2Vec2ForCTC.from_pretrained(
        "hf-internal-testing/tiny-random-wav2vec2",
        mask_time_prob=0.2,
        mask_time_length=2)
    model.to(torch_device).train()
    processor = Wav2Vec2Processor.from_pretrained(
        "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True)

    batch_duration_in_seconds = [1, 3, 2, 6]
    input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds]
class Transcription():
    """
    Simple class to upload the data in the sound file and transcribe it.
    """
    tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

    # initialize file names
    origin_file = 'audio.wav'
    destination_file = 'rec4.wav'
    file_name = 'rec4.wav'
    file_path = os.path.join('.', file_name)

    def change_filename(self):
        "Convert the audio file from .oga to .wav"
        if os.path.exists(self.destination_file):
            os.remove(self.destination_file)
        process = subprocess.run(['ffmpeg', '-hide_banner', '-i',
                                  self.origin_file, self.destination_file])
        if process.returncode != 0:
            raise Exception("Something went wrong")

    def map_to_array(self):
        "Read the file and convert it to a format that the model can accept"
        self.speech, self.sampling_rate = torchaudio.load(self.origin_file)
        self.resample_rate = 16000
        self.speech = librosa.resample(np.asarray(self.speech).reshape(-1,),
                                       self.sampling_rate, self.resample_rate)
        self.speech = librosa.to_mono(self.speech)
        return self.speech, self.resample_rate

    def indicate_transcription(self):
        "Transcribe"
        self.speech, self.sampling_rate = self.map_to_array()
        input_values = self.tokenizer(self.speech, return_tensors="pt",
                                      padding="longest").input_values
        logits = self.model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = self.tokenizer.batch_decode(predicted_ids)
        transcription = ''.join(transcription)
        return transcription.lower()

    def __str__(self):
        return self.indicate_transcription()
def decode_audio(audio_values, recognizer_dir='', model=None, processor=None):
    if recognizer_dir:
        model = Wav2Vec2ForCTC.from_pretrained(recognizer_dir).to('cuda')
        processor = Wav2Vec2Processor.from_pretrained(recognizer_dir)
    if not model:
        m = 'please provide a model directory or a model and processor'
        raise ValueError(m)
    input_dict = processor(audio_values, return_tensors='pt', padding=True,
                           sampling_rate=16_000)
    logits = model(input_dict.input_values.to('cuda')).logits
    pred_ids = torch_argmax(logits, dim=-1)[0]
    return labels_to_letters(pred_ids, processor=processor)