Example #1
def data_preparation():
    data = import_data()
    global processor

    if glob.glob(f"results_hg/{MODEL}/{LABEL}/processor/*"):
        print(">> From pretrained processor ")
        processor = Wav2Vec2Processor.from_pretrained(f"results_hg/{MODEL}/{LABEL}/processor")
    else:
        print(">> Creating processor ")

        gen_vocab(data)
        tokenizer = Wav2Vec2CTCTokenizer(f"results_hg/{MODEL}/{LABEL}/vocab.json", unk_token="[UNK]",
            pad_token="[PAD]", word_delimiter_token="|")
        feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
            sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
        processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
        processor.save_pretrained(f'results_hg/{MODEL}/{LABEL}/processor/')

    dataset = data.map(speech_file_to_array_fn,
        remove_columns=data.column_names["train"], num_proc=4)
    dataset_prepared = dataset.map(prepare_dataset,
        remove_columns=dataset.column_names["train"], batch_size=8, num_proc=4, batched=True)

    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

    return processor, dataset_prepared, data_collator
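The function above hands back everything a CTC fine-tuning run needs. A minimal sketch of wiring those pieces into a transformers Trainer (the base checkpoint and the hyperparameters here are illustrative assumptions, not part of the original example):

# Hypothetical wiring for data_preparation(); checkpoint name and
# hyperparameters are assumptions for illustration only.
from transformers import Trainer, TrainingArguments, Wav2Vec2ForCTC

processor, dataset_prepared, data_collator = data_preparation()

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",  # assumed base checkpoint
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

training_args = TrainingArguments(
    output_dir=f"results_hg/{MODEL}/{LABEL}",  # reuses the example's globals
    per_device_train_batch_size=8,
    num_train_epochs=3,
)

Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_prepared["train"],
    tokenizer=processor.feature_extractor,  # saved alongside checkpoints
).train()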
Example #2
def save_processor():
    processor = Wav2Vec2Processor.from_pretrained(
        "facebook/wav2vec2-large-960h-lv60-self")
    processor.save_pretrained(hf_path)

    create_vocab("../add_wav2vec/data/temp/dict.ltr.txt")
    tok = Wav2Vec2CTCTokenizer(hf_path + "/vocab.json")
    tok.save_pretrained(hf_path)
    processor = Wav2Vec2Processor.from_pretrained(hf_path)
    processor.save_pretrained(hf_path)
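create_vocab is not shown in this example. Assuming the usual fairseq dict.ltr.txt layout (one "TOKEN count" pair per line, with ids 0-3 reserved for fairseq's default special symbols), a plausible sketch that writes the vocab.json the tokenizer loads above:

import json

def create_vocab(dict_path):
    # Assumed fairseq dictionary format: one "TOKEN count" pair per line;
    # fairseq reserves ids 0-3 for <s>, <pad>, </s>, <unk>.
    vocab = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
    with open(dict_path) as f:
        for line in f:
            vocab[line.split()[0]] = len(vocab)
    with open(hf_path + "/vocab.json", "w") as f:  # hf_path from the example
        json.dump(vocab, f)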
Example #3
    def test_push_to_hub(self):
        processor = Wav2Vec2Processor.from_pretrained(SAMPLE_PROCESSOR_CONFIG_DIR)
        with tempfile.TemporaryDirectory() as tmp_dir:
            processor.save_pretrained(
                os.path.join(tmp_dir, "test-processor"), push_to_hub=True, use_auth_token=self._token
            )

            new_processor = Wav2Vec2Processor.from_pretrained(f"{USER}/test-processor")
            for k, v in processor.feature_extractor.__dict__.items():
                self.assertEqual(v, getattr(new_processor.feature_extractor, k))
            self.assertDictEqual(new_processor.tokenizer.get_vocab(), processor.tokenizer.get_vocab())
Example #4
def load_model():
    model_name = "facebook/wav2vec2-large-xlsr-53-french"
    model = Wav2Vec2ForCTC.from_pretrained(model_name)
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    # NOTE: orig_freq == new_freq makes this resampler a no-op; set orig_freq
    # to the source rate (e.g. 48_000 for Common Voice) to actually resample.
    resampler = torchaudio.transforms.Resample(orig_freq=16_000,
                                               new_freq=16_000)
    return model, processor, resampler
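A sketch of transcribing a single file with the returned objects (the file name and its source rate are assumptions; Common Voice clips, which this XLSR model targets, are 48 kHz):

import torch
import torchaudio

model, processor, resampler = load_model()

# Assumed input file; resample from its native rate to the 16 kHz the model expects.
waveform, sr = torchaudio.load("clip.wav")
speech = torchaudio.transforms.Resample(sr, 16_000)(waveform)[0]

inputs = processor(speech, sampling_rate=16_000, return_tensors="pt")
with torch.no_grad():
    logits = model(inputs.input_values).logits
print(processor.batch_decode(torch.argmax(logits, dim=-1))[0])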
Example #5
File: expert.py Project: simpleoier/s3prl
    def __init__(self,
                 ckpt: str = None,
                 model_config: str = None,
                 feature_selection: str = None,
                 **kwargs):
        """
        Args:
            ckpt:
                The checkpoint path for loading your pretrained weights.

            model_config:
                The config path for constructing your model.
                Might not be needed if you also save it in your checkpoint file.

            feature_selection:
                A string for selecting different behaviors of the same
                pretrained model, such as extracting different layers as
                the representations.
        """
        super().__init__()

        self.processor = Wav2Vec2Processor.from_pretrained(ckpt)
        self.model = Wav2Vec2Model.from_pretrained(ckpt)

        pseudo_input = [torch.randn(SAMPLE_RATE)]
        pseudo_output = self.forward(pseudo_input)
        self._output_dim = pseudo_output[0].size(-1)
Example #6
def main(input_pipe, output_pipe):
    source = ZeroMQSource(input_pipe)
    sink = ZeroMQSink(output_pipe)

    debug('[+] loading processor')
    processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-base-960h')
    debug('[+] loading model')
    model = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base-960h')

    sink.signal_ready()
    try:
        while True:
            debug('>>> waiting for connection')
            # torchaudio seems to expect complete files so send small parts
            buf = io.BytesIO(source.recv())
            start = time.time()
            debug('[+] converting audio')
            wav = io.BytesIO(webm_to_wav(buf.read()))
            waveform, sample_rate = torchaudio.load(wav)
            waveform = waveform[0]  # Wav2Vec2Processor expects mono 16kHz audio
            debug('[+] input_values')
            input_values = processor(waveform, sampling_rate=sample_rate, return_tensors='pt').input_values
            debug('[+] logits')
            logits = model(input_values).logits
            debug('[+] predicted_ids')
            predicted_ids = torch.argmax(logits, dim=-1)
            debug('[+] transcription')
            transcription = processor.batch_decode(predicted_ids)[0]
            duration = time.time() - start
            sink.send(start, duration, transcription)
    finally:
        source.close()
        sink.close()
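webm_to_wav is referenced above but not defined. A plausible implementation, assuming the ffmpeg binary is installed and on PATH, pipes the bytes through ffmpeg:

import subprocess

def webm_to_wav(webm_bytes):
    # Hypothetical helper: decode WebM audio to 16 kHz mono WAV via ffmpeg.
    proc = subprocess.run(
        ["ffmpeg", "-i", "pipe:0", "-f", "wav", "-ar", "16000", "-ac", "1", "pipe:1"],
        input=webm_bytes,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        check=True,
    )
    return proc.stdout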
Example #7
    def __init__(self, csv_file, hp):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            hp: Hyperparameter object (provides spm_model and lengths_file).
        """
        self.landmarks_frame = pd.read_csv(csv_file, sep='\|', header=None)
        self.hp = hp
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(self.hp.spm_model)
        ## TODO: variable
        self.processor = Wav2Vec2Processor.from_pretrained(
            "facebook/wav2vec2-large-lv60")

        ## TODO
        if self.hp.lengths_file is None or not os.path.exists(
                self.hp.lengths_file):
            print('lengths_file does not exist. Creating...')
            lengths_list = []
            pbar = tqdm(range(len(self.landmarks_frame)))
            for idx in pbar:
                wav_name = self.landmarks_frame.loc[idx, 0]
                audio_input, sampling_rate = sf.read(wav_name)
                wav_input = self.processor(audio_input,
                                           sampling_rate=sampling_rate,
                                           return_tensors="pt").input_values
                ## TODO: check calculation for lengths (int(wav_input.shape[1]//320))
                # [1, lengths of wav] -> [lengths of wav]
                wav2vec2_length = math.floor(
                    (wav_input.shape[1] - 400) / 320.) + 1

                lengths_list.append(wav2vec2_length)

            self.lengths_np = np.array(lengths_list)
            np.save(self.hp.lengths_file, self.lengths_np)
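The TODO above asks to verify the length calculation. For the standard wav2vec2 feature encoder (seven 1-D convolutions, kernel sizes 10,3,3,3,3,2,2 and strides 5,2,2,2,2,2,2, no padding) the total stride is 320 samples and the receptive field is 400, so floor((L - 400) / 320) + 1 is exact. A quick check, folding the per-layer formula floor((L - kernel) / stride) + 1 over the layers:

import math

def conv_output_length(num_samples,
                       kernels=(10, 3, 3, 3, 3, 2, 2),
                       strides=(5, 2, 2, 2, 2, 2, 2)):
    # Apply the no-padding conv length formula layer by layer.
    length = num_samples
    for k, s in zip(kernels, strides):
        length = math.floor((length - k) / s) + 1
    return length

for n in range(400, 50_000):
    assert conv_output_length(n) == math.floor((n - 400) / 320.) + 1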
Example #8
    def __init__(self):
        super(ASR_CTC, self).__init__()
        #self.wav2Vec2Tokenizer = Wav2Vec2Tokenizer.from_pretrained('facebook/wav2vec2-base')
        #self.wav2Vec2ForCTC = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base')
        #self.nb_labels = len(self.wav2Vec2Tokenizer.get_vocab())
        self.tokenizer = Wav2Vec2CTCTokenizer("./vocab.json",
                                              unk_token="<unk>",
                                              pad_token="<pad>",
                                              word_delimiter_token="|")
        self.feature_extractor = Wav2Vec2FeatureExtractor(
            feature_size=1,
            sampling_rate=16000,
            padding_value=0.0,
            do_normalize=True,
            return_attention_mask=True)
        self.processor = Wav2Vec2Processor(
            feature_extractor=self.feature_extractor, tokenizer=self.tokenizer)
        self.wav2Vec2ForCTC = Wav2Vec2ForCTC.from_pretrained(
            "facebook/wav2vec2-large-xlsr-53",
            attention_dropout=0.1,
            hidden_dropout=0.1,
            feat_proj_dropout=0.0,
            mask_time_prob=0.05,
            layerdrop=0.1,
            gradient_checkpointing=True,
            ctc_loss_reduction="mean",
            pad_token_id=self.processor.tokenizer.pad_token_id,
            vocab_size=len(self.processor.tokenizer))
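The ./vocab.json loaded above is just a token-to-id mapping. A minimal sketch of producing one (the character set here is an assumed toy alphabet; a real vocabulary is extracted from the training transcripts):

import json

chars = list("abcdefghijklmnopqrstuvwxyz'")  # assumed toy alphabet
vocab = {c: i for i, c in enumerate(chars)}
vocab["|"] = len(vocab)      # word delimiter
vocab["<unk>"] = len(vocab)
vocab["<pad>"] = len(vocab)  # also serves as the CTC blank in HF wav2vec2

with open("./vocab.json", "w") as f:
    json.dump(vocab, f)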
Example #9
def _get_model(model_id):
    from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
    tokenizer = Wav2Vec2Processor.from_pretrained(model_id).tokenizer
    labels = [k for k, v in sorted(tokenizer.get_vocab().items(), key=lambda kv: kv[1])]
    original = Wav2Vec2ForCTC.from_pretrained(model_id)
    model = import_huggingface_model(original)
    return model.eval(), labels
Example #10
    def test_inference_ctc_batched(self):
        model = HubertForCTC.from_pretrained(
            "facebook/hubert-large-ls960-ft",
            torch_dtype=torch.float16).to(torch_device)
        processor = Wav2Vec2Processor.from_pretrained(
            "facebook/hubert-large-ls960-ft", do_lower_case=True)

        input_speech = self._load_datasamples(2)

        inputs = processor(input_speech, return_tensors="pt", padding=True)

        input_values = inputs.input_values.half().to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)

        with torch.no_grad():
            logits = model(input_values, attention_mask=attention_mask).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        predicted_trans = processor.batch_decode(predicted_ids)

        EXPECTED_TRANSCRIPTIONS = [
            "a man said to the universe sir i exist",
            "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
        ]
        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
Example #11
    def __init__(self,
                 subtitle_lookup_path,
                 wav_dir,
                 wav2vec_checkpoint,
                 pretrained_chpt="facebook/wav2vec2-base-960h",
                 num_proc=4,
                 preprocess_batch_size=8):
        """
        Args:
            subtitle_lookup_path: path to the subtitle lookup table
                (NOTE: this file must be readable by pd.read_csv())
            wav_dir: directory of wav files e.g. './wav_data/'
            pretrained_chpt: pretrained checkpoint to load
            num_proc: number of processes allowed when doing dataset preprocessing
            preprocess_batch_size: ONLY used inside this dataset to preprocess wav files faster
        """
        self.subtitle_lookup_path = subtitle_lookup_path
        self.wav_dir = wav_dir
        self.num_proc = num_proc
        self.preprocess_batch_size = preprocess_batch_size
        self.processor = Wav2Vec2Processor.from_pretrained(pretrained_chpt)
        self.feature_extractor = Wav2Vec2ForCTC.from_pretrained(
            pretrained_chpt).wav2vec2.feature_extractor

        # self.knnw_prepared = dataset_dict.load_from_disk(wav2vec_checkpoint)
        # self.knnw_prepared.save_to_disk(wavdir)
        self.knnw_prepared = self.setup_dataset()
Example #12
def prepare(reg=True, from_scratch=False):
    # load data
    test = read_txt('./data/speech-sme-asr/test_asr.txt')
    train = read_txt('./data/speech-sme-asr/train_asr.txt')

    # remove special characters
    train = train.map(remove_special_characters)
    test = test.map(remove_special_characters)

    # build vocab dict

    if from_scratch:
        vocab_dict = build_vocab_dict(train, test)
        write_vocab_dict_to_disk(vocab_dict)

        processor = processor_init()
    if reg:
        # reg takes precedence over from_scratch: load the saved processor
        # processor = processor_init()
        processor = Wav2Vec2Processor.from_pretrained(
            './asr_output/pretrained_processor')

    def prepare_dataset(batch):

        # check that all files have the correct sampling rate
        assert (
            len(set(batch["sampling_rate"])) == 1
        ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

        batch["input_values"] = processor(
            batch["speech"],
            sampling_rate=batch["sampling_rate"][0]).input_values

        with processor.as_target_processor():
            batch["labels"] = processor(batch["target_text"]).input_ids
        return batch

    # speech file to array
    train = train.map(speech_file_to_array_fn,
                      remove_columns=train.column_names)
    test = test.map(speech_file_to_array_fn, remove_columns=test.column_names)

    print("Preparing train dataset")
    train = train.map(prepare_dataset,
                      remove_columns=train.column_names,
                      batch_size=1,
                      num_proc=1,
                      batched=True)
    print("Preparing test dataset")
    test = test.map(prepare_dataset,
                    remove_columns=test.column_names,
                    batch_size=1,
                    num_proc=1,
                    batched=True)
    print("Done")

    pickle.dump(train, open('./data/speech-sme-asr/train_asr.pkl', 'wb'))

    pickle.dump(test, open('./data/speech-sme-asr/test_asr.pkl', 'wb'))

    return train, test
Example #13
    def test_inference_ctc_normal_batched(self):
        model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
        model.to(torch_device)
        processor = Wav2Vec2Processor.from_pretrained(
            "facebook/wav2vec2-base-960h", do_lower_case=True)

        input_speech = self._load_datasamples(2)

        inputs = processor(input_speech,
                           return_tensors="pt",
                           padding=True,
                           truncation=True)

        input_values = inputs.input_values.to(torch_device)

        with torch.no_grad():
            logits = model(input_values).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        predicted_trans = processor.batch_decode(predicted_ids)

        EXPECTED_TRANSCRIPTIONS = [
            "a man said to the universe sir i exist",
            "sweat covered brion's body trickling into the tight lowing cloth that was the only garment he wore",
        ]
        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
Example #14
    def __init__(self,
                 path="facebook/wav2vec2-base-960h",
                 quantize=False,
                 gpu=True,
                 batch=64):
        """
        Constructs a new transcription pipeline.

        Args:
            path: optional path to model, accepts Hugging Face model hub id or local path,
                  uses default model for task if not provided
            quantize: if model should be quantized, defaults to False
            gpu: True/False if GPU should be enabled, also supports a GPU device id
            batch: batch size used to incrementally process content
        """

        # Call parent constructor
        super().__init__(path, quantize, gpu, batch)

        if not SOUNDFILE:
            raise ImportError(
                "SoundFile library not installed or libsndfile not found")

        # load model and processor
        self.model = Wav2Vec2ForCTC.from_pretrained(self.path)
        self.processor = Wav2Vec2Processor.from_pretrained(self.path)

        # Move model to device
        self.model.to(self.device)
Example #15
File: core.py Project: vishalraj-95/ktrain
    def __init__(self,
                 model_name="facebook/wav2vec2-base-960h",
                 device=None,
                 half=False):
        """
        ```
        basic wrapper speech transcription

        Args:
          model_name(str): name of a Wav2Vec2 model on the Hugging Face hub
          device(str): device to use (e.g., 'cuda', 'cpu')
          half(bool): If True, use half precision.
        ```
        """
        if not TORCH:
            raise ImportError('Transcriber requires PyTorch to be installed.')
        self.torch_device = device
        if self.torch_device is None:
            self.torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        #if not SOUNDFILE:
        #raise ImportError("SoundFile library not installed or libsndfile not found: pip install soundfile")
        if not LIBROSA:
            raise ImportError(
                "librosa library must be installed: pip install librosa. Conda users may also have to install ffmpeg: conda install -c conda-forge ffmpeg"
            )

        # load model and processor
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name).to(
            self.torch_device)
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        if half: self.model = self.model.half()
Example #16
    def __init__(self,
                 source,
                 save_path,
                 output_norm=True,
                 freeze=True,
                 pretrain=True):
        super().__init__()

        # Download the model from HuggingFace and load it.
        # The Processor is only used to retrieve the normalisation setting.
        self.proc = Wav2Vec2Processor.from_pretrained(source,
                                                      cache_dir=save_path)
        self.model = Wav2Vec2Model.from_pretrained(source, cache_dir=save_path)

        # Randomly initialize layers if pretrain is False
        if not pretrain:
            self.reset_layer(self.model)

        # Check whether inputs need to be normalized w.r.t. the pretrained wav2vec2
        self.normalize_wav = self.proc.feature_extractor.do_normalize

        self.freeze = freeze
        self.output_norm = output_norm
        if self.freeze:
            self.model.eval()
        else:
            self.model.train()
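The normalisation retrieved from the processor is plain per-utterance zero-mean/unit-variance scaling. A sketch of the equivalent manual step (Wav2Vec2FeatureExtractor adds a small epsilon inside the square root; the exact value is an implementation detail):

import torch

def zero_mean_unit_var(wav: torch.Tensor) -> torch.Tensor:
    # Per-utterance normalisation equivalent to do_normalize=True.
    return (wav - wav.mean()) / torch.sqrt(wav.var() + 1e-7)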
Example #17
    def test_inference_ctc_batched(self):
        model = Data2VecAudioForCTC.from_pretrained(
            "facebook/data2vec-audio-base-960h").to(torch_device)
        processor = Wav2Vec2Processor.from_pretrained(
            "hf-internal-testing/tiny-random-wav2vec2", do_lower_case=True)

        input_speech = self._load_datasamples(4)

        inputs = processor(input_speech, return_tensors="pt", padding=True)

        input_values = inputs.input_values.to(torch_device)

        with torch.no_grad():
            logits = model(input_values).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        predicted_trans = processor.batch_decode(predicted_ids)

        EXPECTED_TRANSCRIPTIONS = [
            "a man said to the universe sir i exist",
            "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
            "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with thousands of spectators were trivialities not worth thinking about",
            "his instant of panic was followed by a small sharp blow high on his chest",
        ]
        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
Example #18
    def __init__(self, model_name='te'):
        model_name = model_name.lower()
        for x, y in LANGUAGE_ALISASES.items():
            model_name = model_name.replace(x, y)

        # After alias substitution, anything not in MODEL_URLS is unsupported;
        # fail loudly instead of silently returning from __init__
        if model_name not in MODEL_URLS:
            raise ValueError(
                f"model_name should be one of {list(MODEL_URLS.keys())}")

        home = os.path.expanduser("~")
        lang_path = os.path.join(home, ".IndicASR_" + model_name)
        if not os.path.exists(lang_path):
            os.mkdir(lang_path)

        for file_name, url in MODEL_URLS[model_name].items():
            file_path = os.path.join(lang_path, file_name)
            if os.path.exists(file_path):
                continue
            print(f"Downloading {file_name}")
            pydload.dload(url=url, save_to_path=file_path, max_time=None)

        self.processor = Wav2Vec2Processor.from_pretrained(lang_path)
        self.model = Wav2Vec2ForCTC.from_pretrained(lang_path)

        if torch.cuda.is_available():
            print(f"Using GPU")
            self.model = self.model.cuda()
Example #19
    def test_inference_ctc_robust_batched(self):
        model = TFHubertForCTC.from_pretrained(
            "facebook/hubert-large-ls960-ft")
        processor = Wav2Vec2Processor.from_pretrained(
            "facebook/hubert-large-ls960-ft", do_lower_case=True)

        input_speech = self._load_datasamples(4)

        inputs = processor(input_speech,
                           return_tensors="tf",
                           padding=True,
                           sampling_rate=16000)

        input_values = inputs.input_values
        attention_mask = inputs.attention_mask

        logits = model(input_values, attention_mask=attention_mask).logits

        predicted_ids = tf.argmax(logits, axis=-1)
        predicted_trans = processor.batch_decode(predicted_ids)

        EXPECTED_TRANSCRIPTIONS = [
            "a man said to the universe sir i exist",
            "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
            "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around"
            " him with the thousands of spectators were trivialities not worth thinking about",
            "his instant of panic was followed by a small sharp blow high on his chest",
        ]
        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
Example #20
    def test_inference_ctc_batched(self):
        # TODO: enable this test once the finetuned models are available
        model = SEWDForCTC.from_pretrained("asapp/sew-d-tiny-100k-ft-100h").to(
            torch_device)
        processor = Wav2Vec2Processor.from_pretrained(
            "asapp/sew-d-tiny-100k-ft-100h", do_lower_case=True)

        input_speech = self._load_datasamples(2)

        inputs = processor(input_speech, return_tensors="pt", padding=True)

        input_values = inputs.input_values.to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)

        with torch.no_grad():
            logits = model(input_values, attention_mask=attention_mask).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        predicted_trans = processor.batch_decode(predicted_ids)

        EXPECTED_TRANSCRIPTIONS = [
            "a man said to the universe sir i exist",
            "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
        ]
        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
Example #21
 def __init__(self, device="cuda"):
     self.encoder = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
     self.encoder.eval()
     self.encoder = self.encoder.to(device)
     self.preprocessor = Wav2Vec2Processor.from_pretrained(
         "facebook/wav2vec2-base")
     self.preprocessor._sample_rate = 16000
     self.device = device
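A sketch of pulling frame-level features the way this wrapper is set up to do (base-960h is substituted here because, unlike the un-finetuned base checkpoint, it ships the tokenizer files Wav2Vec2Processor needs; the random waveform stands in for real audio):

import torch
from transformers import Wav2Vec2Model, Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
encoder = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").eval()

wav = torch.randn(16_000)  # stand-in for one second of 16 kHz audio
inputs = processor(wav, sampling_rate=16_000, return_tensors="pt")
with torch.no_grad():
    hidden = encoder(inputs.input_values).last_hidden_state
print(hidden.shape)  # torch.Size([1, 49, 768]) for base-sized models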
Example #22
    def load(self):
        if not hasattr(self, 'processor'):
            self.processor = Wav2Vec2Processor.from_pretrained(
                self.recognizer_dir)
        if self.load_model:
            self.model = Wav2Vec2ForCTC.from_pretrained(self.recognizer_dir)
            if self.use_cuda:
                self.model = self.model.to("cuda")
Example #23
def load_processor(vocab_dir=vocab_dir, cache_dir=cache_dir, force=False):
    global processor
    if processor:
        return processor
    tokenizer = load_tokenizer(vocab_dir, cache_dir)
    feature_extractor = load_feature_extractor()
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                                  tokenizer=tokenizer)
    return processor
Example #24
    def __init__(self):
        self.REQUIRED_SAMPLE_RATE = 16000

        # Use Facebook's pretrained Wav2Vec2 model
        # https://huggingface.co/facebook/wav2vec2-base-960h
        PRETRAINED_MODEL = 'facebook/wav2vec2-base-960h'
        self.processor = Wav2Vec2Processor.from_pretrained(PRETRAINED_MODEL)
        self.model = Wav2Vec2ForCTC.from_pretrained(PRETRAINED_MODEL)
Example #25
    def init(self):
        self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
        assert self.processor.feature_extractor.do_normalize is True
        self.model = Wav2Vec2ForCTC.from_pretrained(self.model_name)
        target_dictionary = list(self.processor.tokenizer.get_vocab().keys())
        print(f"target_dictionary: {target_dictionary}")
        self.decoder = GreedyDecoder(target_dictionary).init()
        return self
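GreedyDecoder is external to this example, but greedy CTC decoding itself is short: take the argmax id per frame, collapse consecutive repeats, and drop the blank. A minimal sketch over the target_dictionary built above (assuming the HF wav2vec2 convention that the pad token doubles as the CTC blank):

import torch

def greedy_ctc_decode(logits: torch.Tensor, labels, blank="<pad>") -> str:
    # logits: (frames, vocab). Collapse repeats, then remove the blank.
    ids = torch.argmax(logits, dim=-1).tolist()
    out, prev = [], None
    for i in ids:
        if i != prev and labels[i] != blank:
            out.append(labels[i])
        prev = i
    return "".join(out).replace("|", " ").strip()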
Example #26
def evaluate_asr():
    def read_txt(txt_path):
        data = pd.read_csv(txt_path, delimiter='\n', header=None, names=['path', 'sentence'])

        # '|' is treated as a regex alternation by default and would match
        # every row, so disable regex for the literal pipe check
        has_pipe = data['path'].str.contains('|', regex=False)
        data[['path', 'sentence']] = data.loc[has_pipe, 'path'].str.split('|', expand=True)

        data = Dataset.from_pandas(data)
        return data

    test = read_txt('./data/speech-sme-asr/test_asr.txt')
    processor = Wav2Vec2Processor.from_pretrained("asr_output/pretrained_processor")
    # print(processor.__dict__)
    # print(processor.tokenizer)

    # exit()
    model = Wav2Vec2ForCTC.from_pretrained("asr_output/checkpoint-27363").to("cpu")
    # print(model)
    # exit()
    # resampler = torchaudio.transforms.Resample(new_freq=16_000)

    def speech_file_to_array_fn(batch):
        speech_array, sampling_rate = torchaudio.load('./data/'+ batch["path"])
        batch["speech"] = speech_array[0].numpy()
        return batch

    test_dataset = test.map(speech_file_to_array_fn)
    input_dict = processor(test_dataset['speech'][:11], sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(input_dict.input_values.to("cpu")).logits


    predicted_ids = torch.argmax(logits, dim=-1)

    print("Prediction:", processor.batch_decode(predicted_ids))
    print("Reference:", test_dataset["sentence"][:11])

    wer = load_metric("wer")

    resampler = torchaudio.transforms.Resample(48_000, 16_000)

    def evaluate_batch(batch):
        inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)

        with torch.no_grad():
            logits = model(inputs.input_values.to("cpu"), attention_mask=inputs.attention_mask.to("cpu")).logits

        pred_ids = torch.argmax(logits, dim=-1)
        batch["pred_strings"] = processor.batch_decode(pred_ids)
        return batch

    result = test_dataset.map(evaluate_batch, batched=True, batch_size=8) # batch_size=8 -> requires ~14.5GB GPU memory

    msg = "WER: {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"]))
    print(msg)
    return msg
Example #27
File: model_ctc.py Project: vbrydik/pyw2v2
    def _init_processor(self, config: EasyDict):
        config.processor.tokenizer.vocab_file = config.common.vocab_file
        tokenizer = Wav2Vec2CTCTokenizer(**config.processor.tokenizer)
        feature_extractor = Wav2Vec2FeatureExtractor(
            **config.processor.feature_extractor)

        processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                                      tokenizer=tokenizer)
        processor.save_pretrained(config.common.model_path)
        self._processor = processor
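The EasyDict config consumed above mirrors the constructor arguments of the two components. A hypothetical minimal config showing the expected shape (the key values are illustrative assumptions):

from easydict import EasyDict

config = EasyDict({
    "common": {"vocab_file": "vocab.json", "model_path": "./model"},
    "processor": {
        "tokenizer": {
            # vocab_file is filled in by _init_processor from config.common
            "unk_token": "[UNK]",
            "pad_token": "[PAD]",
            "word_delimiter_token": "|",
        },
        "feature_extractor": {
            "feature_size": 1,
            "sampling_rate": 16000,
            "padding_value": 0.0,
            "do_normalize": True,
            "return_attention_mask": True,
        },
    },
})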
Example #28
    def test_mask_time_prob_ctc(self):
        model = UniSpeechSatForCTC.from_pretrained(
            "hf-internal-testing/tiny-random-unispeech-sat", mask_time_prob=0.2, mask_time_length=2
        )
        model.to(torch_device).train()
        processor = Wav2Vec2Processor.from_pretrained(
            "hf-internal-testing/tiny-random-unispeech-sat", return_attention_mask=True
        )

        batch_duration_in_seconds = [1, 3, 2, 6]
        input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds]
Example #29
    def __init__(self, hidden_size=512, num_classes=8, device='cpu', sr=16000):
        super(Wav2VecClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.sr = sr
        self.device = device
        self.processor = Wav2Vec2Processor.from_pretrained(
            "facebook/wav2vec2-base-960h")
        self.model = Wav2Vec2Model.from_pretrained(
            "facebook/wav2vec2-base-960h")
        self.lstm = nn.LSTM(768, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
Example #30
    def test_mask_time_prob_ctc(self):
        model = Data2VecAudioForCTC.from_pretrained(
            "facebook/data2vec-audio-base-960h", mask_time_prob=0.2, mask_time_length=2
        )
        model.to(torch_device).train()
        processor = Wav2Vec2Processor.from_pretrained(
            "hf-internal-testing/tiny-random-wav2vec2", return_attention_mask=True
        )

        batch_duration_in_seconds = [1, 3, 2, 6]
        input_features = [np.random.random(16_000 * s) for s in batch_duration_in_seconds]