    def check_ctc_training(self, config, input_values, *args):
        config.ctc_zero_infinity = True
        model = HubertForCTC(config=config)
        model.to(torch_device)
        model.train()

        # freeze feature encoder
        model.freeze_feature_encoder()

        input_values = input_values[:3]

        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
        max_length_labels = model._get_feat_extract_output_lengths(
            torch.tensor(input_lengths))
        labels = ids_tensor(
            (input_values.shape[0], max(max_length_labels) - 2),
            model.config.vocab_size)

        # pad input
        for i in range(len(input_lengths)):
            input_values[i, input_lengths[i]:] = 0.0

            if max_length_labels[i] < labels.shape[-1]:
                # it's important that we make sure that target lengths are at least
                # one shorter than logit lengths to prevent -inf
                labels[i, max_length_labels[i] - 1:] = -100

        loss = model(input_values, labels=labels).loss
        self.parent.assertFalse(torch.isinf(loss).item())

        loss.backward()

    def test_inference_ctc_batched(self):
        model = HubertForCTC.from_pretrained(
            "facebook/hubert-large-ls960-ft",
            torch_dtype=torch.float16).to(torch_device)
        processor = Wav2Vec2Processor.from_pretrained(
            "facebook/hubert-large-ls960-ft", do_lower_case=True)

        input_speech = self._load_datasamples(2)

        inputs = processor(input_speech, return_tensors="pt", padding=True)

        input_values = inputs.input_values.half().to(torch_device)
        attention_mask = inputs.attention_mask.to(torch_device)

        with torch.no_grad():
            logits = model(input_values, attention_mask=attention_mask).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        predicted_trans = processor.batch_decode(predicted_ids)

        EXPECTED_TRANSCRIPTIONS = [
            "a man said to the universe sir i exist",
            "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
        ]
        self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
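
test_inference_ctc_batched above relies on a _load_datasamples helper that is not shown in this listing; below is a minimal sketch of such a helper, assuming the LibriSpeech dummy dataset on the Hub (the dataset id and exact slicing are assumptions, not the original helper).

    def _load_datasamples(self, num_samples):
        # Sketch of the helper assumed by the test above: load num_samples 16 kHz
        # waveforms from a small LibriSpeech subset. The dataset id below is an
        # assumption; the original helper may fetch its audio differently.
        from datasets import load_dataset

        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        # slicing a sorted Dataset decodes the audio column into dicts with an "array" key
        speech_samples = ds.sort("id")[:num_samples]["audio"]
        return [x["array"] for x in speech_samples]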
Example No. 3
    def check_ctc_loss(self, config, input_values, *args):
        model = HubertForCTC(config=config)
        model.to(torch_device)

        # make sure that dropout is disabled
        model.eval()

        input_values = input_values[:3]
        attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long)

        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
        max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
        labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size)

        # pad input
        for i in range(len(input_lengths)):
            input_values[i, input_lengths[i] :] = 0.0
            attention_mask[i, input_lengths[i] :] = 0

        model.config.ctc_loss_reduction = "sum"
        sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()

        model.config.ctc_loss_reduction = "mean"
        mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()

        self.parent.assertTrue(isinstance(sum_loss, float))
        self.parent.assertTrue(isinstance(mean_loss, float))
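
Because per-sample CTC losses are non-negative and the mean reduction normalizes each loss by its target length before averaging over the batch, the mean-reduced loss can never exceed the sum-reduced one; an extra assertion along those lines could be appended to check_ctc_loss (not part of the original check).

        # Optional extra check (not in the original test): non-negative CTC
        # losses imply mean reduction <= sum reduction.
        self.parent.assertLessEqual(mean_loss, sum_loss)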
Example No. 4
    def check_labels_out_of_vocab(self, config, input_values, *args):
        model = HubertForCTC(config)
        model.to(torch_device)
        model.train()

        input_values = input_values[:3]

        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
        max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
        labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100)

        with pytest.raises(ValueError):
            model(input_values, labels=labels)
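
The tests above build random label tensors with an ids_tensor helper; below is a minimal sketch of what such a helper could look like (its placement on torch_device and exact signature are assumptions about the original test utilities).

def ids_tensor(shape, vocab_size):
    # Sketch: random token ids of the given shape, drawn uniformly from
    # [0, vocab_size). torch and torch_device are assumed to already be in
    # scope, as in the tests above.
    return torch.randint(low=0, high=vocab_size, size=tuple(shape), dtype=torch.long, device=torch_device)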
Example No. 5
# only load the relevant featuresets for featurization to save memory
if 'allosaurus_features' in feature_sets:
	import allosaurus_features
if 'audioset_features' in feature_sets:
	import audioset_features
if 'audiotext_features' in feature_sets:
	import audiotext_features
if 'hubert_features' in feature_sets:
	import hubert_features
	import torch
	from transformers import HubertModel, HubertConfig
	from transformers import Wav2Vec2Processor, HubertForCTC
	import soundfile as sf
	hubert_processor_ = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
	hubert_model_ = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
else:
	hubert_model_ = ''
	hubert_processor_ = ''
if 'librosa_features' in feature_sets:
	import librosa_features
if 'loudness_features' in feature_sets:
	import loudness_features
if 'meta_features' in feature_sets:
	import meta_features
	os.system('pip3 install scikit-learn==0.19.1')
if 'mixed_features' in feature_sets:
	import mixed_features
if 'multispeaker_features' in feature_sets:
	import multispeaker_features
if 'myprosody_features' in feature_sets:
	import myprosody_features
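
The hubert_processor_ and hubert_model_ loaded above are presumably handed to hubert_features for featurization; below is a minimal sketch of how they could be applied to a single 16 kHz mono wav file (the helper name and signature are illustrative, not the module's actual API).

def hubert_transcribe(wav_path, processor, model):
	# Illustrative helper (not part of hubert_features): run the CTC model on
	# one 16 kHz mono wav file and return the decoded transcription.
	speech, sample_rate = sf.read(wav_path)
	input_values = processor(speech, sampling_rate=sample_rate, return_tensors="pt").input_values
	with torch.no_grad():
		logits = model(input_values).logits
	predicted_ids = torch.argmax(logits, dim=-1)
	return processor.decode(predicted_ids[0])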
Example No. 6
def convert_hubert_checkpoint(checkpoint_path,
                              pytorch_dump_folder_path,
                              config_path=None,
                              dict_path=None,
                              is_finetuned=True):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    if config_path is not None:
        config = HubertConfig.from_pretrained(config_path)
    else:
        config = HubertConfig()

    if is_finetuned:
        if dict_path:
            target_dict = Dictionary.load(dict_path)

            # important: swap the bos & pad token ids, since the CTC blank
            # symbol is <pad> and not <s> as in fairseq
            config.bos_token_id = target_dict.pad_index
            config.pad_token_id = target_dict.bos_index
            config.eos_token_id = target_dict.eos_index
            config.vocab_size = len(target_dict.symbols)
            vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
            if not os.path.isdir(pytorch_dump_folder_path):
                logger.error(
                    "--pytorch_dump_folder_path ({}) should be a directory".
                    format(pytorch_dump_folder_path))
                return
            os.makedirs(pytorch_dump_folder_path, exist_ok=True)
            with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
                json.dump(target_dict.indices, vocab_handle)
            tokenizer = Wav2Vec2CTCTokenizer(
                vocab_path,
                unk_token=target_dict.unk_word,
                pad_token=target_dict.pad_word,
                bos_token=target_dict.bos_word,
                eos_token=target_dict.eos_word,
                word_delimiter_token="|",
                do_lower_case=False,
            )
            return_attention_mask = True if config.feat_extract_norm == "layer" else False
            feature_extractor = Wav2Vec2FeatureExtractor(
                feature_size=1,
                sampling_rate=16000,
                padding_value=0,
                do_normalize=True,
                return_attention_mask=return_attention_mask,
            )
            processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                                          tokenizer=tokenizer)
            processor.save_pretrained(pytorch_dump_folder_path)

        hf_wav2vec = HubertForCTC(config)
    else:
        hf_wav2vec = HubertModel(config)

    if is_finetuned:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path],
            arg_overrides={"data": "/".join(dict_path.split("/")[:-1])})
    else:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path])

    model = model[0].eval()

    recursively_load_weights(model, hf_wav2vec, is_finetuned)

    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
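
The conversion function above is normally driven from the command line; below is a minimal argparse entry-point sketch whose flag names mirror the function parameters (the original script's exact CLI may differ).

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint_path", required=True, help="path to the fairseq checkpoint")
    parser.add_argument("--pytorch_dump_folder_path", required=True, help="output directory for the converted model")
    parser.add_argument("--config_path", default=None, help="optional HubertConfig to start from")
    parser.add_argument("--dict_path", default=None, help="fairseq dictionary, needed for fine-tuned checkpoints")
    parser.add_argument("--not_finetuned", action="store_true", help="convert a pretrained (non-CTC) checkpoint")
    args = parser.parse_args()

    convert_hubert_checkpoint(
        args.checkpoint_path,
        args.pytorch_dump_folder_path,
        config_path=args.config_path,
        dict_path=args.dict_path,
        is_finetuned=not args.not_finetuned,
    )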
Example No. 7
sampling_rate = 16000
channels = 1
batch_size = 1
my_dataset = LPAudioSet(os.path.join(
    os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'audio'),
                        sr=sampling_rate,
                        channels=channels)
train_loader = torch.utils.data.DataLoader(my_dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           num_workers=1,
                                           drop_last=True,
                                           collate_fn=LPAudioSet.collate_fn)
for idx, audio in enumerate(train_loader):
    print(idx, audio.shape)
#sys.exit(0)

processor = Wav2Vec2Processor.from_pretrained(
    "facebook/hubert-xlarge-ls960-ft",
    cache_dir=os.getenv("cache_dir", "../../models"))
model = HubertForCTC.from_pretrained("facebook/hubert-xlarge-ls960-ft",
                                     cache_dir=os.getenv(
                                         "cache_dir", "../../models"))
for idx, audio in enumerate(train_loader):
    input_values = processor(audio,
                             sampling_rate=sampling_rate,
                             return_tensors="pt").input_values  # Batch size 1
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    print(transcription)
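
With batch_size set to 1 the loop above only ever decodes predicted_ids[0]; if the DataLoader were configured with a larger batch size, the whole batch could be decoded in one call. A sketch reusing the objects defined above, with inference wrapped in torch.no_grad() to avoid building a gradient graph:

for idx, audio in enumerate(train_loader):
    input_values = processor(audio,
                             sampling_rate=sampling_rate,
                             return_tensors="pt").input_values
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    # decode every sequence in the batch instead of only the first one
    for transcription in processor.batch_decode(predicted_ids):
        print(transcription)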