Python EncoderDecoderASR 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: speechbrain.pretrained

클래스/타입: EncoderDecoderASR

hotexamples.com에서의 예제들: 6

Python EncoderDecoderASR - 6개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 speechbrain.pretrained.EncoderDecoderASR에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

from_hparams(6)

자주 사용되는 메소드들

from_hparams (6)

예제 #1

파일 보기

    def __init__(self, model_id: str):
        model_type = get_type(model_id)
        if model_type is ModelType.ENCODERASR:
            self.model = EncoderASR.from_hparams(source=model_id)
        elif model_type is ModelType.ENCODERDECODERASR:
            self.model = EncoderDecoderASR.from_hparams(source=model_id)

            # Reduce latency
            self.model.mods.decoder.beam_size = 1
        else:
            raise ValueError(
                f"{model_type.value} is invalid for automatic-speech-recognition"
            )

        # Please define a `self.sampling_rate` for this pipeline
        # to automatically read the input correctly
        self.sampling_rate = self.model.hparams.sample_rate

예제 #2

파일 보기

파일: speech_to_text.py 프로젝트: ariabee/applecore

    def getTranscription(self):
        asr_model = EncoderDecoderASR.from_hparams(
            source="speechbrain/asr-crdnn-rnnlm-librispeech",
            savedir="./pretrained_ASR")
        transcription = asr_model.transcribe_file(self.path_to_wav)
        return transcription.lower()


#with sr.Microphone() as source:
#    stt = SpeechToText()
#    r = sr.Recognizer()
#    audio = r.listen(source, timeout=5)
#    print("audio")
#    print(audio)
#    name = r.recognize_google(audio)
#    print("google works, why can't you?")
#    print(name)
#    stt.saveAudio(audio)
#    name = stt.getTranscription()
#    print(name)

예제 #3

파일 보기

파일: scoring.py 프로젝트: gcambara/speacher

    def call_huggingface(self, df):
        assert self.model_url != '', "Error! A model URL is needed for HuggingFace scoring, but --asr_download_model is empty"
        if self.tokenizer_url == '':
            print(
                f"Setting empty --tokenizer_url field identically to --asr_download_model: {self.model_url}"
            )
            self.tokenizer_url = self.model_url

        if self.scoring_sorting == 'ascending':
            df = df.sort_values(by=['n_frames']).reset_index(drop=True)
        elif self.scoring_sorting == 'descending':
            df = df.sort_values(by=['n_frames'],
                                ascending=False).reset_index(drop=True)
        elif self.scoring_sorting == '':
            pass
        else:
            raise NotImplementedError

        print(f"Preparing dataloader for manifest {self.manifest}...")
        dataset = AudioDataset(df)
        dataloader = DataLoader(dataset,
                                batch_size=self.batch_size,
                                collate_fn=dataset.collater,
                                num_workers=self.num_workers,
                                pin_memory=True)

        if self.hf_username == 'facebook':
            print(f"Downloading tokenizer: {self.tokenizer_url}")
            tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
                self.tokenizer_url)

            print(f"Downloading model: {self.model_url}")
            model = Wav2Vec2ForCTC.from_pretrained(self.model_url)
        elif self.hf_username == 'speechbrain':
            if torch.cuda.is_available():
                run_opts = {"device": "cuda"}
            else:
                run_opts = {"device": "cpu"}
            print(f"Downloading model: {self.model_url}")
            model = EncoderDecoderASR.from_hparams(source=self.model_url,
                                                   run_opts=run_opts,
                                                   savedir=os.path.join(
                                                       'pretrained_models',
                                                       self.hf_modelname))
        else:
            raise NotImplementedError

        model.eval()

        print("Scoring dataset...")
        df['wer'] = np.nan

        for batch in tqdm(dataloader):
            indexes, waveforms, transcripts, wav_lens = batch

            if self.hf_username == 'facebook':
                output_logits = model(waveforms.squeeze()).logits
                predicted_ids = torch.argmax(output_logits, dim=-1)
                pred_transcripts = tokenizer.batch_decode(predicted_ids)
            elif self.hf_username == 'speechbrain':
                waveforms = waveforms.squeeze()
                #waveforms = model.audio_normalizer(waveforms, self.sampling_rate)
                pred_transcripts = model.transcribe_batch(waveforms,
                                                          wav_lens)[0]

            for index, ref in enumerate(transcripts):
                sample_id = indexes[index]
                ref = transcripts[index]
                pred = pred_transcripts[index]
                measures = jiwer.compute_measures(ref, pred)
                wer = measures['wer'] * 100.0
                assert (
                    ref == df.loc[int(sample_id), 'tgt_text']
                ), "The reference text indicated by the sample ID in the transcripts file does not match with the one stored in the dataset!"
                df.at[int(sample_id), 'wer'] = wer

        return df

예제 #4

파일 보기

파일: asr.py 프로젝트: eloyhernandezlua/MentalHealthProjectAI

# !pip install librosa

import time
from time import perf_counter
import numpy as np
import matplotlib.pyplot as plt
import librosa
from pydub import AudioSegment
import os
from google.colab import files
import moviepy.editor
from transformers import pipeline

from speechbrain.pretrained import EncoderDecoderASR

asr_model2 = EncoderDecoderASR.from_hparams(source="speechbrain/asr-crdnn-rnnlm-librispeech", savedir="pretrained_models/asr-crdnn-rnnlm-librispeech")

def transcribe_audio(fileList = []):
  listOfText = []
  if fileList == [] :
    uploaded = files.upload()
    listOfAudios = list(uploaded.keys())
  else:
    listOfAudios = fileList
  
  a = perf_counter()
  for file in listOfAudios:
    duration = librosa.get_duration(filename=file)
    t1 = 0
    t2 = duration * 1000 if duration < 30 else 30000
    textTemp = ""

예제 #5

파일 보기

def load_asr_model():
    asr_model = EncoderDecoderASR.from_hparams(
        source="speechbrain/asr-transformer-transformerlm-librispeech",
        savedir="pretrained_model/")
    return asr_model

예제 #6

파일 보기

def asr_model():
    """Load model for the CTC segmentation test."""

    asr_model = EncoderDecoderASR.from_hparams(
        source="speechbrain/asr-transformer-transformerlm-librispeech")
    return asr_model