def test_special_characters_in_vocab(self):
        sent = "ʈʰ æ æ̃ ˧ kʰ"

        vocab_dict = {
            k: v
            for v, k in enumerate({phoneme
                                   for phoneme in sent.split()})
        }
        vocab_file = os.path.join(self.tmpdirname, "vocab_special.json")

        with open(vocab_file, "w") as f:
            json.dump(vocab_dict, f)

        tokenizer = Wav2Vec2CTCTokenizer(vocab_file)

        expected_sent = tokenizer.decode(tokenizer(sent).input_ids,
                                         spaces_between_special_tokens=True)
        self.assertEqual(sent, expected_sent)

        tokenizer.save_pretrained(
            os.path.join(self.tmpdirname, "special_tokenizer"))
        tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
            os.path.join(self.tmpdirname, "special_tokenizer"))

        expected_sent = tokenizer.decode(tokenizer(sent).input_ids,
                                         spaces_between_special_tokens=True)
        self.assertEqual(sent, expected_sent)
Example #2
def data_preparation():
    data = import_data()
    global processor

    if glob.glob(f"results_hg/{MODEL}/{LABEL}/processor/*"):
        print(">> From pretrained processor ")
        processor = Wav2Vec2Processor.from_pretrained(f"results_hg/{MODEL}/{LABEL}/processor")
    else:
        print(">> Creating processor ")

        gen_vocab(data)
        tokenizer = Wav2Vec2CTCTokenizer(f"results_hg/{MODEL}/{LABEL}/vocab.json", unk_token="[UNK]", \
            pad_token="[PAD]", word_delimiter_token="|")
        feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, \
            sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
        processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
        processor.save_pretrained(f'results_hg/{MODEL}/{LABEL}/processor/')

    dataset = data.map(speech_file_to_array_fn, \
         remove_columns=data.column_names["train"], num_proc=4)
    dataset_prepared = dataset.map(prepare_dataset, \
        remove_columns=dataset.column_names["train"], batch_size=8, num_proc=4, batched=True)

    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

    return processor, dataset_prepared, data_collator
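A brief usage sketch of the function above; `MODEL` and `LABEL` are assumed to be module-level globals defined elsewhere in the script, and the variable names below are illustrative only.

processor, dataset_prepared, data_collator = data_preparation()
print(processor.tokenizer.get_vocab())         # inspect the CTC vocabulary
train_split = dataset_prepared["train"]        # prepared split with input_values / labels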
Example #3
 def __init__(self):
     super(ASR_CTC, self).__init__()
     #self.wav2Vec2Tokenizer = Wav2Vec2Tokenizer.from_pretrained('facebook/wav2vec2-base')
     #self.wav2Vec2ForCTC = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base')
     #self.nb_labels = len(self.wav2Vec2Tokenizer.get_vocab())
     #self.wav2Vec2ForCTC = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base')
     self.tokenizer = Wav2Vec2CTCTokenizer("./vocab.json",
                                           unk_token="<unk>",
                                           pad_token="<pad>",
                                           word_delimiter_token="|")
     self.feature_extractor = Wav2Vec2FeatureExtractor(
         feature_size=1,
         sampling_rate=16000,
         padding_value=0.0,
         do_normalize=True,
         return_attention_mask=True)
     self.processor = Wav2Vec2Processor(
         feature_extractor=self.feature_extractor, tokenizer=self.tokenizer)
     self.wav2Vec2ForCTC = Wav2Vec2ForCTC.from_pretrained(
         "facebook/wav2vec2-large-xlsr-53",
         attention_dropout=0.1,
         hidden_dropout=0.1,
         feat_proj_dropout=0.0,
         mask_time_prob=0.05,
         layerdrop=0.1,
         gradient_checkpointing=True,
         ctc_loss_reduction="mean",
         pad_token_id=self.processor.tokenizer.pad_token_id,
         vocab_size=len(self.processor.tokenizer))
Example #4
def load_tokenizer():
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(recognizer_dir,
                                                     unk_token='[UNK]',
                                                     pad_token='[PAD]',
                                                     word_delimiter_token='|',
                                                     cache_dir=cache_dir)
    return tokenizer
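A short usage sketch; `recognizer_dir` and `cache_dir` are assumed to be paths defined elsewhere in the surrounding script.

tokenizer = load_tokenizer()
print(len(tokenizer), tokenizer.pad_token, tokenizer.unk_token)   # vocabulary size and special tokens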
Example #5
File: model_ctc.py  Project: vbrydik/pyw2v2
    def _init_processor(self, config: EasyDict):
        config.processor.tokenizer.vocab_file = config.common.vocab_file
        tokenizer = Wav2Vec2CTCTokenizer(**config.processor.tokenizer)
        feature_extractor = Wav2Vec2FeatureExtractor(
            **config.processor.feature_extractor)

        processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                                      tokenizer=tokenizer)
        processor.save_pretrained(config.common.model_path)
        self._processor = processor
Example #6
def save_processor():
    processor = Wav2Vec2Processor.from_pretrained(
        "facebook/wav2vec2-large-960h-lv60-self")
    processor.save_pretrained(hf_path)

    create_vocab("../add_wav2vec/data/temp/dict.ltr.txt")
    tok = Wav2Vec2CTCTokenizer(hf_path + "/vocab.json")
    tok.save_pretrained(hf_path)
    processor = Wav2Vec2Processor.from_pretrained(hf_path)
    processor.save_pretrained(hf_path)
Example #7
 def create_processor(self, model_args: ModelArguments) -> Wav2Vec2Processor:
     feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
         model_args.model_name_or_path, cache_dir=model_args.cache_dir
     )
     if self.vocab_file:
         tokenizer = Wav2Vec2CTCTokenizer(
             self.vocab_file,
             cache_dir=model_args.cache_dir,
             do_lower_case=self.do_lower_case,
             word_delimiter_token=self.word_delimiter_token,
         )
     else:
         tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
             model_args.model_name_or_path,
             cache_dir=model_args.cache_dir,
             do_lower_case=self.do_lower_case,
             word_delimiter_token=self.word_delimiter_token,
         )
     return Wav2Vec2Processor(feature_extractor, tokenizer)
Example #8
def processor_init():
    tokenizer = Wav2Vec2CTCTokenizer("./vocab.json",
                                     unk_token="[UNK]",
                                     pad_token="[PAD]",
                                     word_delimiter_token="|")
    feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                                 sampling_rate=16000,
                                                 padding_value=0.0,
                                                 do_normalize=True,
                                                 return_attention_mask=True)
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                                  tokenizer=tokenizer)
    processor.save_pretrained('asr_output/new_processor/')
    return processor
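A minimal usage sketch for the processor returned above, assuming `audio` is a hypothetical 1-D NumPy array sampled at 16 kHz.

processor = processor_init()
inputs = processor(audio, sampling_rate=16000, return_tensors="pt")   # -> inputs.input_values
with processor.as_target_processor():                                 # switch to the tokenizer for labels
    labels = processor("some transcript").input_ids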
Example #9
File: utils.py  Project: DhivehiAI/DV-Subs
    def __init__(self, m_path):
        self.check_download_models(m_path)
        STT_MODEL_PATH, STT_VOCAB_FILE = self.get_model_files_dirs(m_path)
        self.SAMPLING_RATE = 16000

        print("Initializing STT Model")
        tokenizer = Wav2Vec2CTCTokenizer(STT_VOCAB_FILE,
                                         unk_token="[UNK]",
                                         pad_token="[PAD]",
                                         word_delimiter_token="|")
        feature_extractor = Wav2Vec2FeatureExtractor(
            feature_size=1,
            sampling_rate=self.SAMPLING_RATE,
            padding_value=0.0,
            do_normalize=True,
            return_attention_mask=False)
        self.processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                                           tokenizer=tokenizer)
        self.model = torch.jit.load(STT_MODEL_PATH)
Example #10
 def __init__(self, transcription_file, root_dir):
     """
     :param transcription_file: Path to the text transcription.
     :param root_dir: Directory containing audio files.
     """
     self.transcriptions = pd.read_csv(transcription_file, sep="\t")
     self.root_dir = root_dir
     self.tokenizer = Wav2Vec2CTCTokenizer("./vocab.json",
                                           unk_token="<unk>",
                                           pad_token="<pad>",
                                           word_delimiter_token="|")
     self.feature_extractor = Wav2Vec2FeatureExtractor(
         feature_size=1,
         sampling_rate=16000,
         padding_value=0.0,
         do_normalize=True,
         return_attention_mask=True)
     self.processor = Wav2Vec2Processor(
         feature_extractor=self.feature_extractor, tokenizer=self.tokenizer)
Example #11
    def __init__(self, model_dir=None, hub_name=None, device="cuda"):

        # if the model is provided locally
        if model_dir is not None:
            self.model = Wav2Vec2ForCTC.from_pretrained(model_dir)
            # currently only this configuration is supported
            tokenizer = Wav2Vec2CTCTokenizer(os.path.join(model_dir, "vocab.json"), unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
            feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
            self.processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

        # model from the Hugging Face Hub (not implemented yet)
        elif hub_name is not None:
            pass

        else:
            raise ValueError("No model path given")

        self.model.to(device)
        self.device = device
Example #12
def data_preparation_v2():
    common_voice_train = load_dataset("common_voice", "tr", split="train+validation")
    common_voice_test = load_dataset("common_voice", "tr", split="test")
    common_voice_train = common_voice_train.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])
    common_voice_test = common_voice_test.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])
    common_voice_train = common_voice_train.map(remove_special_characters)
    common_voice_test = common_voice_test.map(remove_special_characters)

    vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)
    vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)
    vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
    vocab_dict = {v: k for k, v in enumerate(vocab_list)}
    vocab_dict["|"] = vocab_dict[" "]
    del vocab_dict[" "]
    vocab_dict["[UNK]"] = len(vocab_dict)
    vocab_dict["[PAD]"] = len(vocab_dict)

    with open(f'results_hg/{MODEL}/{LABEL}/vocab.json', 'w') as vocab_file:
        json.dump(vocab_dict, vocab_file)

    global processor

    print(">> Creating processor ")

    tokenizer = Wav2Vec2CTCTokenizer(f"results_hg/{MODEL}/{LABEL}/vocab.json", unk_token="[UNK]", \
        pad_token="[PAD]", word_delimiter_token="|")
    feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, \
        sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
    processor.save_pretrained(f'results_hg/{MODEL}/{LABEL}/processor/')

    common_voice_train = common_voice_train.map(speech_file_to_array_fn_v2, remove_columns=common_voice_train.column_names)
    common_voice_test = common_voice_test.map(speech_file_to_array_fn_v2, remove_columns=common_voice_test.column_names)
    
    common_voice_train = common_voice_train.map(resample, num_proc=4)
    common_voice_test = common_voice_test.map(resample, num_proc=4)
    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

    common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names, batch_size=8, num_proc=4, batched=True)
    common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names, batch_size=8, num_proc=4, batched=True)

    return processor, common_voice_train, common_voice_test, data_collator
Example #13
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # override default run name and log all args
    wandb.init(project="wav2vec4humans", config=parser.parse_args())

    # Detecting last checkpoint.
    last_checkpoint = None
    if (os.path.isdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank
                                                    ) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    chars_to_ignore_regex = f'[{"".join(data_args.chars_to_ignore)}]'

    def remove_special_characters(batch, train=True):
        batch["text"] = (re.sub(chars_to_ignore_regex, "",
                                unidecode(batch["sentence"])).lower().strip())
        if train:
            batch["text"] += " "
        return batch

    def extract_all_chars(batch):
        all_text = " ".join(batch["text"])
        vocab = list(set(all_text))
        return {"vocab": [vocab], "all_text": [all_text]}

    resampler = dict()

    def get_resampler(sampling_rate):
        if sampling_rate in resampler.keys():
            return resampler[sampling_rate]
        else:
            logger.info(f"Creating new resampler for {sampling_rate}")
            resampler[sampling_rate] = torchaudio.transforms.Resample(
                sampling_rate, 16_000)
            return resampler[sampling_rate]

    # Preprocessing the datasets.
    # We need to read the audio files as arrays and tokenize the targets.
    def speech_file_to_array_fn(batch):
        speech_array, sampling_rate = torchaudio.load(batch["path"])
        batch["speech"] = get_resampler(sampling_rate)(
            speech_array).squeeze().numpy()
        batch["sampling_rate"] = 16_000
        batch["target_text"] = batch["text"]
        batch["duration"] = len(speech_array.squeeze()) / sampling_rate
        return batch

    def filter_by_duration(batch):
        return (batch["duration"] <= 10 and batch["duration"] >= 1
                and len(batch["target_text"]) > 5)  # about 98% of samples

    def prepare_dataset(batch):
        # check that all files have the correct sampling rate
        assert (
            len(set(batch["sampling_rate"])) == 1
        ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
        batch["input_values"] = processor(
            batch["speech"],
            sampling_rate=batch["sampling_rate"][0]).input_values
        # Setup the processor for targets
        with processor.as_target_processor():
            batch["labels"] = processor(batch["target_text"]).input_ids
        return batch

    def get_length(item):
        # speeds up grouping by length in pre-loaded dataset
        item["length"] = len(item["input_values"])
        return item

    # Pre-processed datasets
    dataset_path = Path(os.getenv("HF_HOME", ".")) / "datasets"
    dataset_train_path = f"{dataset_path}/{data_args.dataset_config_name}/train/{data_args.train_split_name}"
    dataset_eval_path = f"{dataset_path}/{data_args.dataset_config_name}/eval"
    dataset_test_path = f"{dataset_path}/{data_args.dataset_config_name}/test"
    vocab_path = f"{dataset_path}/{data_args.dataset_config_name}/vocab/vocab_test_{data_args.train_split_name}.json"

    train_dataset = None
    eval_dataset = None if training_args.do_eval else False

    log_timestamp()
    if Path(dataset_train_path).exists() and Path(vocab_path).exists():
        train_dataset = datasets.load_from_disk(dataset_train_path)
        log_timestamp("load pre-processed data")
    else:
        train_dataset = datasets.load_dataset(
            "common_voice",
            data_args.dataset_config_name,
            split=data_args.train_split_name,
        )
        log_timestamp("load data")
        train_dataset = train_dataset.map(remove_special_characters,
                                          remove_columns=["sentence"])
        log_timestamp("remove special characters")

    if training_args.do_eval:
        if Path(dataset_eval_path).exists():
            eval_dataset = datasets.load_from_disk(dataset_eval_path)
        else:
            eval_dataset = datasets.load_dataset("common_voice",
                                                 data_args.dataset_config_name,
                                                 split="test")
            eval_dataset = eval_dataset.map(remove_special_characters,
                                            remove_columns=["sentence"])
    log_timestamp()

    if Path(dataset_test_path).exists() and Path(vocab_path).exists():
        test_dataset = datasets.load_from_disk(dataset_test_path)
    else:
        test_dataset = datasets.load_dataset("common_voice",
                                             data_args.dataset_config_name,
                                             split="test")
        test_dataset = test_dataset.map(
            lambda x: remove_special_characters(x, train=False),
            remove_columns=["sentence"],
        )
    log_timestamp()

    if not Path(vocab_path).exists():
        # create vocab
        vocab_train = train_dataset.map(
            extract_all_chars,
            batched=True,
            batch_size=-1,
            keep_in_memory=True,
            remove_columns=train_dataset.column_names,
        )
        vocab_test = test_dataset.map(
            extract_all_chars,
            batched=True,
            batch_size=-1,
            keep_in_memory=True,
            remove_columns=test_dataset.column_names,
        )
        vocab_list = list(
            set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
        vocab_dict = {v: k for k, v in enumerate(vocab_list)}
        vocab_dict["|"] = vocab_dict[" "]
        del vocab_dict[" "]
        vocab_dict["[UNK]"] = len(vocab_dict)
        vocab_dict["[PAD]"] = len(vocab_dict)
        Path(vocab_path).parent.mkdir(parents=True, exist_ok=True)
        with open(vocab_path, "w") as vocab_file:
            json.dump(vocab_dict, vocab_file)
        log_timestamp("create vocab")

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    tokenizer = Wav2Vec2CTCTokenizer(
        vocab_path,
        unk_token="[UNK]",
        pad_token="[PAD]",
        word_delimiter_token="|",
    )
    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=16_000,
        padding_value=0.0,
        do_normalize=True,
        return_attention_mask=True,
    )
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                                  tokenizer=tokenizer)
    model = Wav2Vec2ForCTC.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        activation_dropout=model_args.activation_dropout,
        attention_dropout=model_args.attention_dropout,
        hidden_dropout=model_args.hidden_dropout,
        feat_proj_dropout=model_args.feat_proj_dropout,
        mask_time_prob=model_args.mask_time_prob,
        gradient_checkpointing=model_args.gradient_checkpointing,
        layerdrop=model_args.layerdrop,
        ctc_loss_reduction="mean",
        pad_token_id=processor.tokenizer.pad_token_id,
        vocab_size=len(processor.tokenizer),
    )
    log_timestamp("load model")

    if not Path(dataset_train_path).exists():
        train_dataset = train_dataset.map(
            speech_file_to_array_fn,
            remove_columns=train_dataset.column_names,
            num_proc=data_args.preprocessing_num_workers,
        )
        log_timestamp("load audio")
        train_dataset = train_dataset.filter(
            filter_by_duration,
            remove_columns=["duration"],
            num_proc=data_args.preprocessing_num_workers,
        )
        log_timestamp("filter data")
        train_dataset = train_dataset.map(
            prepare_dataset,
            remove_columns=train_dataset.column_names,
            batch_size=training_args.per_device_train_batch_size,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
        )
        log_timestamp("process data")
        train_dataset = train_dataset.map(
            get_length,
            num_proc=data_args.preprocessing_num_workers,
        )
        log_timestamp("add input length")
        train_dataset.save_to_disk(dataset_train_path)
        log_timestamp("save to disk")

    if not Path(dataset_eval_path).exists() and training_args.do_eval:
        eval_dataset = eval_dataset.map(
            speech_file_to_array_fn,
            remove_columns=eval_dataset.column_names,
            num_proc=data_args.preprocessing_num_workers,
        )
        eval_dataset = eval_dataset.filter(
            filter_by_duration,
            remove_columns=["duration"],
            num_proc=data_args.preprocessing_num_workers,
        )
        eval_dataset = eval_dataset.map(
            prepare_dataset,
            remove_columns=eval_dataset.column_names,
            batch_size=training_args.per_device_eval_batch_size,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
        )
        eval_dataset = eval_dataset.map(
            get_length,
            num_proc=data_args.preprocessing_num_workers,
        )
        eval_dataset.save_to_disk(dataset_eval_path)
    log_timestamp()

    if not Path(dataset_test_path).exists():
        test_dataset = test_dataset.map(
            speech_file_to_array_fn,
            num_proc=data_args.preprocessing_num_workers,
        )
        test_dataset = test_dataset.filter(filter_by_duration,
                                           remove_columns=["duration"])
        test_dataset.save_to_disk(dataset_test_path)
    log_timestamp()

    # Metric
    cer_metric = datasets.load_metric("cer")
    # we use a custom WER that considers punctuation
    wer_metric = datasets.load_metric("metrics/wer_punctuation.py")

    def compute_metrics(pred):
        pred_logits = pred.predictions
        pred_ids = np.argmax(pred_logits, axis=-1)

        pred.label_ids[pred.label_ids ==
                       -100] = processor.tokenizer.pad_token_id

        pred_str = processor.batch_decode(pred_ids)
        # we do not want to group tokens when computing the metrics
        label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

        cer = cer_metric.compute(predictions=pred_str, references=label_str)
        wer = wer_metric.compute(predictions=pred_str, references=label_str)

        return {"cer": cer, "wer": wer}

    log_timestamp()

    if model_args.freeze_feature_extractor:
        model.freeze_feature_extractor()
        log_timestamp("freeze feature extractor")

    # Data collator
    data_collator = DataCollatorCTCWithPadding(processor=processor,
                                               padding=True)
    log_timestamp("create data collator")

    # Initialize our Trainer
    trainer = CTCTrainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=processor.feature_extractor,
    )
    loss_nan_stopping_callback = LossNaNStoppingCallback()
    early_stopping_callback = EarlyStoppingCallback()
    timing_callback = TimingCallback()
    trainer.add_callback(loss_nan_stopping_callback)
    trainer.add_callback(early_stopping_callback)
    trainer.add_callback(timing_callback)

    # Training
    log_timestamp("setup trainer")
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        log_timestamp()
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        log_timestamp("train model")
        trainer.save_model()

        # save the feature_extractor and the tokenizer
        if is_main_process(training_args.local_rank):
            processor.save_pretrained(training_args.output_dir)

        metrics = train_result.metrics
        metrics["train_samples"] = len(train_dataset)

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Final test metrics
    logger.info("*** Test ***")
    log_timestamp()

    if loss_nan_stopping_callback.stopped:
        test_cer, test_wer = 1.0, 2.0
        logger.info(
            "Loss NaN detected, typically resulting in bad WER & CER so we won't calculate them."
        )
    else:

        def evaluate(batch):
            inputs = processor(batch["speech"],
                               sampling_rate=16_000,
                               return_tensors="pt",
                               padding=True)
            with torch.no_grad():
                logits = model(
                    inputs.input_values.to("cuda"),
                    attention_mask=inputs.attention_mask.to("cuda"),
                ).logits
            pred_ids = torch.argmax(logits, dim=-1)
            batch["pred_strings"] = processor.batch_decode(pred_ids)
            return batch

        model.to("cuda")
        # no need to cache mapped test_dataset
        datasets.set_caching_enabled(False)
        result = test_dataset.map(
            evaluate,
            batched=True,
            batch_size=training_args.per_device_eval_batch_size)
        log_timestamp("get test predictions")
        test_cer = cer_metric.compute(predictions=result["pred_strings"],
                                      references=result["text"])
        test_wer = wer_metric.compute(predictions=result["pred_strings"],
                                      references=result["text"])
        log_timestamp("compute test metrics")

    metrics = {"cer": test_cer, "wer": test_wer}
    wandb.log({f"test/{k}": v for k, v in metrics.items()})
    trainer.save_metrics("test", metrics)
    logger.info(metrics)

    # save model files
    log_timestamp()
    if not loss_nan_stopping_callback.stopped:
        artifact = wandb.Artifact(name=f"model-{wandb.run.id}",
                                  type="model",
                                  metadata={"cer": test_cer})
        for f in Path(training_args.output_dir).iterdir():
            if f.is_file():
                artifact.add_file(str(f))
        wandb.run.log_artifact(artifact)
        log_timestamp("log artifacts")
Example #14
# In[15]:


import json
if from_start:
    with open('vocab.json', 'w') as vocab_file:
        json.dump(vocab_dict, vocab_file)


# In[16]:


from transformers import Wav2Vec2CTCTokenizer

if from_start:
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")


# In[17]:


repo_name = "wav2vec2-large-xls-r-300m-irish-colab"


# In[16]:


from transformers import Wav2Vec2CTCTokenizer

if not from_start:
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(repo_name, unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
Example #15
def convert_sew_checkpoint(checkpoint_path,
                           pytorch_dump_folder_path,
                           config_path=None,
                           dict_path=None,
                           is_finetuned=True):
    """
    Copy/paste/tweak model's weights to transformers design.
    """

    if is_finetuned:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path],
            arg_overrides={"data": "/".join(dict_path.split("/")[:-1])})
    else:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path])

    if config_path is not None:
        config = SEWDConfig.from_pretrained(config_path)
    else:
        config = convert_config(model[0])
    model = model[0].eval()

    return_attention_mask = config.feat_extract_norm == "layer"
    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=16000,
        padding_value=0,
        do_normalize=True,
        return_attention_mask=return_attention_mask,
    )

    if is_finetuned:
        if dict_path:
            target_dict = Dictionary.load(dict_path)

            # important change bos & pad token id since CTC symbol is <pad> and
            # not <s> as in fairseq
            config.bos_token_id = target_dict.pad_index
            config.pad_token_id = target_dict.bos_index
            config.eos_token_id = target_dict.eos_index
            config.vocab_size = len(target_dict.symbols)
            vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
            if not os.path.isdir(pytorch_dump_folder_path):
                logger.error(
                    "--pytorch_dump_folder_path ({}) should be a directory".
                    format(pytorch_dump_folder_path))
                return
            os.makedirs(pytorch_dump_folder_path, exist_ok=True)
            with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
                json.dump(target_dict.indices, vocab_handle)
            tokenizer = Wav2Vec2CTCTokenizer(
                vocab_path,
                unk_token=target_dict.unk_word,
                pad_token=target_dict.pad_word,
                bos_token=target_dict.bos_word,
                eos_token=target_dict.eos_word,
                word_delimiter_token="|",
                do_lower_case=False,
            )
            processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                                          tokenizer=tokenizer)
            processor.save_pretrained(pytorch_dump_folder_path)

        hf_model = SEWDForCTC(config)
    else:
        hf_model = SEWDModel(config)
        feature_extractor.save_pretrained(pytorch_dump_folder_path)

    recursively_load_weights(model, hf_model, is_finetuned)

    hf_model.save_pretrained(pytorch_dump_folder_path)
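A hypothetical invocation with placeholder paths (the full conversion script presumably exposes these as command-line arguments):

convert_sew_checkpoint(
    checkpoint_path="sew_d_checkpoint.pt",        # placeholder fairseq checkpoint
    pytorch_dump_folder_path="sew-d-converted",   # placeholder output directory
    dict_path="dict.ltr.txt",                     # placeholder fairseq CTC dictionary
    is_finetuned=True,
)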
Example #16
#%%
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

#%%
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
print(len(vocab_dict))

#%%
with open('./data/vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

#%%
tokenizer = Wav2Vec2CTCTokenizer("./data/vocab.json",
                                 unk_token="[UNK]",
                                 pad_token="[PAD]",
                                 word_delimiter_token="|")

#%%
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                              tokenizer=tokenizer)

#%% [markdown]
### Preprocess Data
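A minimal sketch of the preprocessing step this heading introduces, assuming hypothetical dataset columns `speech` (a 16 kHz waveform array) and `text` (the transcript); it is not part of the original notebook.

#%%
def prepare_batch(batch):
    # raw waveform -> normalized input values
    batch["input_values"] = processor(batch["speech"], sampling_rate=16000).input_values[0]
    # transcript -> CTC label ids
    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids
    return batch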
Example #17
    def call_huggingface(self, df):
        assert self.model_url != '', "Error! A model URL is needed for HuggingFace scoring, but --asr_download_model is empty"
        if self.tokenizer_url == '':
            print(
                f"Setting empty --tokenizer_url field identically to --asr_download_model: {self.model_url}"
            )
            self.tokenizer_url = self.model_url

        if self.scoring_sorting == 'ascending':
            df = df.sort_values(by=['n_frames']).reset_index(drop=True)
        elif self.scoring_sorting == 'descending':
            df = df.sort_values(by=['n_frames'],
                                ascending=False).reset_index(drop=True)
        elif self.scoring_sorting == '':
            pass
        else:
            raise NotImplementedError

        print(f"Preparing dataloader for manifest {self.manifest}...")
        dataset = AudioDataset(df)
        dataloader = DataLoader(dataset,
                                batch_size=self.batch_size,
                                collate_fn=dataset.collater,
                                num_workers=self.num_workers,
                                pin_memory=True)

        if self.hf_username == 'facebook':
            print(f"Downloading tokenizer: {self.tokenizer_url}")
            tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
                self.tokenizer_url)

            print(f"Downloading model: {self.model_url}")
            model = Wav2Vec2ForCTC.from_pretrained(self.model_url)
        elif self.hf_username == 'speechbrain':
            if torch.cuda.is_available():
                run_opts = {"device": "cuda"}
            else:
                run_opts = {"device": "cpu"}
            print(f"Downloading model: {self.model_url}")
            model = EncoderDecoderASR.from_hparams(source=self.model_url,
                                                   run_opts=run_opts,
                                                   savedir=os.path.join(
                                                       'pretrained_models',
                                                       self.hf_modelname))
        else:
            raise NotImplementedError

        model.eval()

        print("Scoring dataset...")
        df['wer'] = np.nan

        for batch in tqdm(dataloader):
            indexes, waveforms, transcripts, wav_lens = batch

            if self.hf_username == 'facebook':
                output_logits = model(waveforms.squeeze()).logits
                predicted_ids = torch.argmax(output_logits, dim=-1)
                pred_transcripts = tokenizer.batch_decode(predicted_ids)
            elif self.hf_username == 'speechbrain':
                waveforms = waveforms.squeeze()
                #waveforms = model.audio_normalizer(waveforms, self.sampling_rate)
                pred_transcripts = model.transcribe_batch(waveforms,
                                                          wav_lens)[0]

            for index, ref in enumerate(transcripts):
                sample_id = indexes[index]
                pred = pred_transcripts[index]
                measures = jiwer.compute_measures(ref, pred)
                wer = measures['wer'] * 100.0
                assert (
                    ref == df.loc[int(sample_id), 'tgt_text']
                ), "The reference text indicated by the sample ID in the transcripts file does not match with the one stored in the dataset!"
                df.at[int(sample_id), 'wer'] = wer

        return df
Example #18
    save_path = "/media/nas/samir-data/wav2vec2_models"
    #os.mkdir(save_path)
    with open(save_path+'/hyperparameters.json', 'w') as f:
        json.dump(hyperparameters, f)

    wer_metric = load_metric("wer")

    #Generators
    training_set = AudioDataset(train_annotation_file, train_data_folder, MAX_LEN)
    training_generator = torch.utils.data.DataLoader(training_set, **hyperparameters)

    validation_set = AudioDataset(validation_annotation_file, validation_data_folder, MAX_LEN)
    validation_generator = torch.utils.data.DataLoader(validation_set, **hyperparameters)

    print('LOAD TOKENIZER...')
    tokenizer = Wav2Vec2CTCTokenizer("vocab.json", unk_token="<unk>", pad_token="<pad>", word_delimiter_token="|")
    feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0,
                                                      do_normalize=True, return_attention_mask=True)
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
    nb_labels = 37

    print('INITIALIZING MODEL...')
    wav2vec2_model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
Example #19
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Detecting last checkpoint.
    last_checkpoint = None
    if (os.path.isdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank
                                                    ) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets:
    train_dataset = datasets.load_dataset(
        "tommy19970714/jsut_asr_hiragana",
        data_args.dataset_config_name,
        split="validation[:80%]",
        cache_dir=model_args.cache_dir,
        use_auth_token=True,
    )
    eval_dataset = datasets.load_dataset(
        "tommy19970714/jsut_asr_hiragana",
        data_args.dataset_config_name,
        split="validation[80%:]",
        cache_dir=model_args.cache_dir,
        use_auth_token=True,
    )
    # Align the column names of the Japanese dataset with what the script expects
    train_dataset = train_dataset.remove_columns(["id"])
    train_dataset = train_dataset.rename_column("file", "path")
    train_dataset = train_dataset.rename_column("text", "sentence")
    eval_dataset = eval_dataset.remove_columns(["id"])
    eval_dataset = eval_dataset.rename_column("file", "path")
    eval_dataset = eval_dataset.rename_column("text", "sentence")

    # Create and save tokenizer
    chars_to_ignore_regex = f'[{"".join(data_args.chars_to_ignore)}]'

    def remove_special_characters(batch):
        batch["text"] = (
            re.sub(chars_to_ignore_regex, "", batch["sentence"]).lower() + " ")
        return batch

    train_dataset = train_dataset.map(remove_special_characters,
                                      remove_columns=["sentence"])
    eval_dataset = eval_dataset.map(remove_special_characters,
                                    remove_columns=["sentence"])

    def extract_all_chars(batch):
        all_text = " ".join(batch["text"])
        vocab = list(set(all_text))
        return {"vocab": [vocab], "all_text": [all_text]}

    vocab_train = train_dataset.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        keep_in_memory=True,
        remove_columns=train_dataset.column_names,
    )
    vocab_test = eval_dataset.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        keep_in_memory=True,
        remove_columns=eval_dataset.column_names,
    )

    vocab_list = list(
        set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
    vocab_dict = {v: k for k, v in enumerate(vocab_list)}
    vocab_dict["|"] = vocab_dict[" "]
    del vocab_dict[" "]
    vocab_dict["[UNK]"] = len(vocab_dict)
    vocab_dict["[PAD]"] = len(vocab_dict)

    with open("vocab.json", "w") as vocab_file:
        json.dump(vocab_dict, vocab_file)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    tokenizer = Wav2Vec2CTCTokenizer(
        "vocab.json",
        unk_token="[UNK]",
        pad_token="[PAD]",
        word_delimiter_token="|",
    )
    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=16_000,
        padding_value=0.0,
        do_normalize=True,
        return_attention_mask=True,
    )
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                                  tokenizer=tokenizer)
    model = Wav2Vec2ForCTC.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        activation_dropout=model_args.activation_dropout,
        attention_dropout=model_args.attention_dropout,
        hidden_dropout=model_args.hidden_dropout,
        feat_proj_dropout=model_args.feat_proj_dropout,
        mask_time_prob=model_args.mask_time_prob,
        gradient_checkpointing=model_args.gradient_checkpointing,
        layerdrop=model_args.layerdrop,
        ctc_loss_reduction="mean",
        pad_token_id=processor.tokenizer.pad_token_id,
        vocab_size=len(processor.tokenizer),
        use_auth_token=True,
    )

    if data_args.max_train_samples is not None:
        train_dataset = train_dataset.select(range(
            data_args.max_train_samples))

    if data_args.max_val_samples is not None:
        eval_dataset = eval_dataset.select(range(data_args.max_val_samples))

    # The Japanese dataset is already sampled at 16 kHz, so this resampler is effectively a no-op
    resampler = torchaudio.transforms.Resample(16_000, 16_000)

    # Preprocessing the datasets.
    # We need to read the audio files as arrays and tokenize the targets.
    def speech_file_to_array_fn(batch):
        speech_array, sampling_rate = torchaudio.load(batch["path"])
        batch["speech"] = resampler(speech_array).squeeze().numpy()
        batch["sampling_rate"] = 16_000
        batch["target_text"] = batch["text"]
        return batch

    train_dataset = train_dataset.map(
        speech_file_to_array_fn,
        remove_columns=train_dataset.column_names,
        num_proc=data_args.preprocessing_num_workers,
    )
    eval_dataset = eval_dataset.map(
        speech_file_to_array_fn,
        remove_columns=eval_dataset.column_names,
        num_proc=data_args.preprocessing_num_workers,
    )

    def prepare_dataset(batch):
        # check that all files have the correct sampling rate
        assert (
            len(set(batch["sampling_rate"])) == 1
        ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
        batch["input_values"] = processor(
            batch["speech"],
            sampling_rate=batch["sampling_rate"][0]).input_values
        # Setup the processor for targets
        with processor.as_target_processor():
            batch["labels"] = processor(batch["target_text"]).input_ids
        return batch

    train_dataset = train_dataset.map(
        prepare_dataset,
        remove_columns=train_dataset.column_names,
        batch_size=training_args.per_device_train_batch_size,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
    )
    eval_dataset = eval_dataset.map(
        prepare_dataset,
        remove_columns=eval_dataset.column_names,
        batch_size=training_args.per_device_train_batch_size,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
    )

    # Metric
    wer_metric = datasets.load_metric("wer")

    def compute_metrics(pred):
        pred_logits = pred.predictions
        pred_ids = np.argmax(pred_logits, axis=-1)

        pred.label_ids[pred.label_ids ==
                       -100] = processor.tokenizer.pad_token_id

        pred_str = processor.batch_decode(pred_ids)
        # we do not want to group tokens when computing the metrics
        label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

        wer = wer_metric.compute(predictions=pred_str, references=label_str)

        return {"wer": wer}

    if model_args.freeze_feature_extractor:
        model.freeze_feature_extractor()

    # Data collator
    data_collator = DataCollatorCTCWithPadding(processor=processor,
                                               padding=True)

    # Initialize our Trainer
    trainer = CTCTrainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=processor.feature_extractor,
    )

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()

        # save the feature_extractor and the tokenizer
        if is_main_process(training_args.local_rank):
            processor.save_pretrained(training_args.output_dir)

        metrics = train_result.metrics
        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()
        max_val_samples = (data_args.max_val_samples
                           if data_args.max_val_samples is not None else
                           len(eval_dataset))
        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    return results
Example #20
def create_processor(path: str = "./vocab.json"):
    tokenizer = Wav2Vec2CTCTokenizer(path, unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
    feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
    return processor
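A short usage sketch, assuming a vocab.json built as in the surrounding examples sits next to the script:

processor = create_processor("./vocab.json")
processor.save_pretrained("./processor")                     # persist for later reloading
reloaded = Wav2Vec2Processor.from_pretrained("./processor")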
Example #21
	timit = timit.map(remove_special_characters)

	vocabs = timit.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=timit.column_names['train'])
	vocab_list = list(set(vocabs['train']['vocab'][0]) | set(vocabs['test']['vocab'][0]))
	vocab_dict = {v: k for k, v in enumerate(vocab_list)}

	vocab_dict["|"] = vocab_dict[" "]
	del vocab_dict[" "]

	vocab_dict['[UNK]'] = len(vocab_dict)
	vocab_dict['[PAD]'] = len(vocab_dict)

	with open('vocab.json', 'w') as vocab_file:
	    json.dump(vocab_dict, vocab_file)

	tokenizer = Wav2Vec2CTCTokenizer('./vocab.json', unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

	feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)

	processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

	timit = timit.map(speech_file_to_array_fn, remove_columns=timit.column_names['train'], num_proc=4)

	timit_prepared = timit.map(prepare_dataset, remove_columns=timit.column_names['train'], batch_size=8, num_proc=4, batched=True)

	with open('timit_prepared.pkl', 'wb') as f:
		pickle.dump(timit_prepared, f)

	with open('processor.pkl', 'wb') as f:
		pickle.dump(processor, f)
Example #22
 def __setstate__(self, state):
     self.__dict__.update(state)
     self._tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(self.backbone)
Example #23
    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


# dir_path="/media/nas/samir-data/wav2vec2_models/inputs"
dir_path = "/media/nas/samir-data/asr_transformers"

test_data_folder = "/media/nas/CORPUS_FINAL/Corpus_audio/Corpus_FR/COMMONVOICE/common-voice-fr/clips"
test_annotation_file = "/media/nas/CORPUS_FINAL/Corpus_audio/Corpus_FR/COMMONVOICE/common-voice-fr/test1.tsv"
test_set = AudioDataset(test_annotation_file, test_data_folder, MAX_LEN)
test_generator = torch.utils.data.DataLoader(test_set, batch_size=1)

processor = Wav2Vec2Processor.from_pretrained(
    "/media/nas/samir-data/wav2vec2_models/checkpoint-94000")
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    "/media/nas/samir-data/wav2vec2_models/checkpoint-94000")

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53-french")
model.eval()

wer_metric = load_metric("wer")

# initialize the prediction
predictions = []
references = []
for audio_file in test_generator:
    # for block in sf.blocks(audio_file["input_values"], blocksize=50000):
    input_dict = processor(np.squeeze(audio_file["input_values"], 0),
                           return_attention_mask=False,
                           return_tensors="pt",
Example #24
    vocab_dict = {v: k for k, v in enumerate(vocab_list)}

    vocab_dict["|"] = vocab_dict[" "]
    del vocab_dict[" "]

    vocab_dict["[UNK]"] = len(vocab_dict)
    vocab_dict["[PAD]"] = len(vocab_dict)
    print(len(vocab_dict))
    print(vocab_dict)

    with open('vocab.%s.json' % language, 'w') as vocab_file:
        json.dump(vocab_dict, vocab_file)

    print("\nConstructing tokenizer")
    tokenizer = Wav2Vec2CTCTokenizer("./vocab.%s.json" % language,
                                     unk_token="[UNK]",
                                     pad_token="[PAD]",
                                     word_delimiter_token="|")

    print("\nFeature Extractor")
    feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                                 sampling_rate=16000,
                                                 padding_value=0.0,
                                                 do_normalize=True,
                                                 return_attention_mask=True)

    print("\nConstructing Processor")
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                                  tokenizer=tokenizer)
    processor.save_pretrained(output_dir)

    print("\nCreating array from speech files")
Example #25
 def get_tokenizer(self, **kwargs):
     kwargs.update(self.special_tokens_map)
     return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs)
Example #26
 def tokenizer(self):
     if self.backbone is not None and self.backbone != self._backbone:
         self._tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
             self.backbone)
         self._backbone = self.backbone
     return self._tokenizer
Example #27
    def __init__(self, backbone: str):
        super().__init__()

        self.backbone = backbone
        self._tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(self.backbone)
Example #28
vocab_dict = {v: k for k, v in enumerate(vocab_list)}

vocab_dict["_"] = vocab_dict[" "]
del vocab_dict[" "]

vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

import json
with open('vocab_english.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

from transformers import Wav2Vec2CTCTokenizer
tokenizer = Wav2Vec2CTCTokenizer("./vocab_english.json",
                                 unk_token="[UNK]",
                                 pad_token="[PAD]",
                                 word_delimiter_token="_")

from transformers import Wav2Vec2FeatureExtractor
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                             sampling_rate=16000,
                                             padding_value=0.0,
                                             do_normalize=True,
                                             return_attention_mask=False)

from transformers import Wav2Vec2Processor
processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                              tokenizer=tokenizer)

print(timit["train"][0])