def __init__(self, model_id: str):
    """Load the SpeechBrain ASR model identified by ``model_id``.

    The model type is resolved with ``get_type(model_id)`` and selects
    which pretrained interface is instantiated.

    Args:
        model_id: Model source passed to ``from_hparams(source=...)``.

    Raises:
        ValueError: If the resolved model type is neither
            ``ModelType.ENCODERASR`` nor ``ModelType.ENCODERDECODERASR``.
    """
    model_type = get_type(model_id)

    if model_type is ModelType.ENCODERDECODERASR:
        self.model = EncoderDecoderASR.from_hparams(source=model_id)
        # A beam size of 1 is used to reduce decoding latency.
        self.model.mods.decoder.beam_size = 1
    elif model_type is ModelType.ENCODERASR:
        self.model = EncoderASR.from_hparams(source=model_id)
    else:
        raise ValueError(
            f"{model_type.value} is invalid for automatic-speech-recognition"
        )

    # The pipeline relies on `self.sampling_rate` being defined so that
    # the input audio is read correctly.
    self.sampling_rate = self.model.hparams.sample_rate
def getTranscription(self):
    """Return the lower-cased transcription of ``self.path_to_wav``.

    A pretrained ``EncoderDecoderASR`` model is instantiated from
    ``speechbrain/asr-crdnn-rnnlm-librispeech`` (with
    ``savedir="./pretrained_ASR"``) on every call, and the file is
    transcribed with ``transcribe_file``.
    """
    recogniser = EncoderDecoderASR.from_hparams(
        source="speechbrain/asr-crdnn-rnnlm-librispeech",
        savedir="./pretrained_ASR",
    )
    return recogniser.transcribe_file(self.path_to_wav).lower()


# Disabled microphone experiment, kept for reference:
# with sr.Microphone() as source:
#     stt = SpeechToText()
#     r = sr.Recognizer()
#     audio = r.listen(source, timeout=5)
#     print("audio")
#     print(audio)
#     name = r.recognize_google(audio)
#     print("google works, why can't you?")
#     print(name)
#     stt.saveAudio(audio)
#     name = stt.getTranscription()
#     print(name)
def call_huggingface(self, df):
    """Score every sample of ``df`` with a pretrained ASR model and store its WER.

    The model is chosen by ``self.hf_username``: ``'facebook'`` loads a
    ``Wav2Vec2ForCTC`` model with a ``Wav2Vec2CTCTokenizer``, while
    ``'speechbrain'`` loads an ``EncoderDecoderASR`` model. Each batch
    yielded by the dataloader is transcribed, compared against its
    reference transcript with ``jiwer``, and the resulting WER (as a
    percentage) is written to the ``'wer'`` column of ``df``.

    Args:
        df: DataFrame holding the manifest. It must provide a
            ``'tgt_text'`` column, and an ``'n_frames'`` column when
            ``self.scoring_sorting`` requests sorting.

    Returns:
        The DataFrame (re-sorted with a fresh index when sorting was
        requested) with a ``'wer'`` column filled in for scored samples.

    Raises:
        AssertionError: If ``self.model_url`` is empty, or if a batch
            reference does not match the ``'tgt_text'`` stored in ``df``.
        NotImplementedError: If ``self.scoring_sorting`` or
            ``self.hf_username`` holds an unsupported value.
    """
    # NOTE: kept as `assert` so the exception type seen by callers does
    # not change; be aware that asserts are stripped under `python -O`.
    assert self.model_url != '', "Error! A model URL is needed for HuggingFace scoring, but --asr_download_model is empty"

    if self.tokenizer_url == '':
        print(
            f"Setting empty --tokenizer_url field identically to --asr_download_model: {self.model_url}"
        )
        self.tokenizer_url = self.model_url

    # Optional sorting by length; the index is reset so that the sample
    # IDs produced by the dataset line up with the row labels used below.
    if self.scoring_sorting == 'ascending':
        df = df.sort_values(by=['n_frames']).reset_index(drop=True)
    elif self.scoring_sorting == 'descending':
        df = df.sort_values(by=['n_frames'],
                            ascending=False).reset_index(drop=True)
    elif self.scoring_sorting != '':
        raise NotImplementedError(
            f"Unsupported scoring_sorting value: {self.scoring_sorting!r}")

    print(f"Preparing dataloader for manifest {self.manifest}...")
    dataset = AudioDataset(df)
    dataloader = DataLoader(dataset,
                            batch_size=self.batch_size,
                            collate_fn=dataset.collater,
                            num_workers=self.num_workers,
                            pin_memory=True)

    if self.hf_username == 'facebook':
        print(f"Downloading tokenizer: {self.tokenizer_url}")
        tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(self.tokenizer_url)
        print(f"Downloading model: {self.model_url}")
        model = Wav2Vec2ForCTC.from_pretrained(self.model_url)
    elif self.hf_username == 'speechbrain':
        run_opts = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
        print(f"Downloading model: {self.model_url}")
        model = EncoderDecoderASR.from_hparams(
            source=self.model_url,
            run_opts=run_opts,
            savedir=os.path.join('pretrained_models', self.hf_modelname))
    else:
        raise NotImplementedError(
            f"Unsupported hf_username value: {self.hf_username!r}")

    model.eval()

    print("Scoring dataset...")
    df['wer'] = np.nan
    for batch in tqdm(dataloader):
        indexes, waveforms, transcripts, wav_lens = batch

        # Inference only: disabling gradient tracking avoids building an
        # autograd graph for every batch, which saves memory and time.
        with torch.no_grad():
            # NOTE(review): `squeeze()` drops *every* size-1 dimension, so
            # a batch containing a single sample presumably loses its
            # batch axis as well — confirm against `dataset.collater`.
            if self.hf_username == 'facebook':
                output_logits = model(waveforms.squeeze()).logits
                predicted_ids = torch.argmax(output_logits, dim=-1)
                pred_transcripts = tokenizer.batch_decode(predicted_ids)
            elif self.hf_username == 'speechbrain':
                waveforms = waveforms.squeeze()
                # waveforms = model.audio_normalizer(waveforms, self.sampling_rate)
                pred_transcripts = model.transcribe_batch(
                    waveforms, wav_lens)[0]

        for position, ref in enumerate(transcripts):
            sample_id = int(indexes[position])
            pred = pred_transcripts[position]
            wer = jiwer.compute_measures(ref, pred)['wer'] * 100.0
            # Sanity check that the batch sample really maps to this row.
            assert (
                ref == df.loc[sample_id, 'tgt_text']
            ), "The reference text indicated by the sample ID in the transcripts file does not match with the one stored in the dataset!"
            df.at[sample_id, 'wer'] = wer

    return df
# !pip install librosa import time from time import perf_counter import numpy as np import matplotlib.pyplot as plt import librosa from pydub import AudioSegment import os from google.colab import files import moviepy.editor from transformers import pipeline from speechbrain.pretrained import EncoderDecoderASR asr_model2 = EncoderDecoderASR.from_hparams(source="speechbrain/asr-crdnn-rnnlm-librispeech", savedir="pretrained_models/asr-crdnn-rnnlm-librispeech") def transcribe_audio(fileList = []): listOfText = [] if fileList == [] : uploaded = files.upload() listOfAudios = list(uploaded.keys()) else: listOfAudios = fileList a = perf_counter() for file in listOfAudios: duration = librosa.get_duration(filename=file) t1 = 0 t2 = duration * 1000 if duration < 30 else 30000 textTemp = ""
def load_asr_model():
    """Instantiate and return the pretrained SpeechBrain ASR model.

    The model is created with ``EncoderDecoderASR.from_hparams`` from the
    ``speechbrain/asr-transformer-transformerlm-librispeech`` source, with
    ``savedir="pretrained_model/"``.
    """
    return EncoderDecoderASR.from_hparams(
        source="speechbrain/asr-transformer-transformerlm-librispeech",
        savedir="pretrained_model/",
    )
def asr_model():
    """Return the pretrained ASR model used by the CTC segmentation test.

    The model is created with ``EncoderDecoderASR.from_hparams`` from the
    ``speechbrain/asr-transformer-transformerlm-librispeech`` source.
    """
    return EncoderDecoderASR.from_hparams(
        source="speechbrain/asr-transformer-transformerlm-librispeech"
    )