Example #1
    def test_pytorch_audio_dataset(self):
        featurizer = WaveformFeaturizer.from_config(self.featurizer_config)
        ds = AudioDataset(manifest_filepath=self.manifest_filepath, labels=self.labels, featurizer=featurizer)

        # Log a single sample (index 5) to verify that items load correctly.
        if len(ds) > 5:
            logging.info(ds[5])
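For context, a minimal sketch of the fixture state this test assumes. The config keys mirror the self.featurizer_config lookups in the later examples; the manifest path and label set are hypothetical placeholders, not taken from the source.

    # Hedged fixture sketch; the path and labels are illustrative assumptions.
    featurizer_config = {'sample_rate': 16000, 'int_values': False}
    manifest_filepath = '/path/to/manifest.json'  # hypothetical manifest path
    labels = [' ', 'a', 'b', 'c']  # hypothetical label set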
Example #2

    def __init__(
        self,
        manifest_filepath,
        durs_dir,
        labels,
        batch_size,
        sample_rate=16000,
        int_values=False,
        bos_id=None,
        eos_id=None,
        pad_id=None,
        min_duration=0.1,
        max_duration=None,
        normalize_transcripts=True,
        trim_silence=False,
        load_audio=True,
        drop_last=False,
        shuffle=True,
        num_workers=0,
    ):
        super().__init__()

        # Set up dataset.
        self._featurizer = WaveformFeaturizer(sample_rate=sample_rate,
                                              int_values=int_values,
                                              augmentor=None)
        dataset_params = {
            'manifest_filepath': manifest_filepath,
            'labels': labels,
            'featurizer': self._featurizer,
            'max_duration': max_duration,
            'min_duration': min_duration,
            'normalize': normalize_transcripts,
            'trim': trim_silence,
            'bos_id': bos_id,
            'eos_id': eos_id,
            'load_audio': load_audio,
        }
        audio_dataset = AudioDataset(**dataset_params)
        # Pair the audio samples with precomputed durations loaded from durs_dir.
        self._dataset = fastspeech.FastSpeechDataset(audio_dataset, durs_dir)
        self._pad_id = pad_id
        self.sample_rate = sample_rate

        sampler = None
        if self._placement == nemo.core.DeviceType.AllGpu:
            sampler = torch.utils.data.distributed.DistributedSampler(
                self._dataset)

        self._dataloader = torch.utils.data.DataLoader(
            dataset=self._dataset,
            batch_size=batch_size,
            collate_fn=self._collate,
            drop_last=drop_last,
            shuffle=shuffle if sampler is None else False,
            sampler=sampler,
            num_workers=num_workers,
        )
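A hedged usage sketch for this constructor, assuming it belongs to a FastSpeechDataLayer-style class (the class name and file paths below are assumptions, not taken from the source):

    # Hypothetical instantiation; the owning class name and paths are assumptions.
    data_layer = FastSpeechDataLayer(
        manifest_filepath='/path/to/train_manifest.json',  # hypothetical path
        durs_dir='/path/to/durations',  # hypothetical durations directory
        labels=[' ', 'a', 'b', 'c'],  # hypothetical label set
        batch_size=16,
    )

Note the shuffle/sampler interplay in the DataLoader call: on multi-GPU placement a DistributedSampler is installed and shuffle is forced to False, since PyTorch's DataLoader may not combine shuffle=True with an explicit sampler.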
Example #3
        def construct_perturbed_dataset(perturbation):
            if perturbation is not None:
                # Execute perturbations with 100% probability
                prob_perturb = [(1.0, perturbation)]
                audio_augmentor = perturb.AudioAugmentor(prob_perturb)
            else:
                audio_augmentor = None

            featurizer = WaveformFeaturizer(
                sample_rate=self.featurizer_config['sample_rate'],
                int_values=self.featurizer_config['int_values'],
                augmentor=audio_augmentor,
            )

            ds = AudioLabelDataset(manifest_filepath=self.manifest_filepath, labels=self.labels, featurizer=featurizer)
            return ds
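The helper applies a single perturbation with probability 1.0, or builds a clean dataset when given None. A hedged sketch of how the enclosing test might call it (illustrative calls; the perturbation parameters are reused from the test below):

            # Illustrative calls, not from the source.
            noisy_ds = construct_perturbed_dataset(perturb.WhiteNoisePerturbation(min_level=-90, max_level=-46))
            clean_ds = construct_perturbed_dataset(None)  # baseline without augmentation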
Example #4

    def test_pytorch_audio_dataset_with_perturbation(self):
        perturbations = [
            perturb.WhiteNoisePerturbation(min_level=-90, max_level=-46),
            perturb.ShiftPerturbation(min_shift_ms=-5.0, max_shift_ms=5.0),
        ]

        # Execute perturbations with 100% probability
        prob_perturb = [(1.0, p) for p in perturbations]

        audio_augmentor = perturb.AudioAugmentor(prob_perturb)

        featurizer = WaveformFeaturizer(
            sample_rate=self.featurizer_config['sample_rate'],
            int_values=self.featurizer_config['int_values'],
            augmentor=audio_augmentor,
        )
        ds = AudioLabelDataset(manifest_filepath=self.manifest_filepath, labels=self.labels, featurizer=featurizer)

        # Iterate the full dataset so every sample passes through the augmentor.
        for i in range(len(ds)):
            logging.info(ds[i])
Example #5
    def __init__(
        self,
        data: str,
        durs: str,
        labels: List[str],
        durs_type: str = 'full-pad',
        speakers: Optional[str] = None,
        speaker_table: Optional[str] = None,
        speaker_embs: Optional[str] = None,
        batch_size: int = 32,
        sample_rate: int = 16000,
        int_values: bool = False,
        bos_id: Optional[int] = None,
        eos_id: Optional[int] = None,
        pad_id: Optional[int] = None,
        blank_id: Optional[int] = None,
        min_duration: Optional[float] = 0.1,
        max_duration: Optional[float] = None,
        normalize_transcripts: bool = True,
        trim_silence: bool = False,
        load_audio: bool = True,
        drop_last: bool = False,
        shuffle: bool = True,
        num_workers: int = 0,
        sampler_type: str = 'default',
        bd_aug: bool = False,
    ):
        """Creates TalkNet data iterator.

        Args:
            data: Path to dataset manifest file.
            durs: Path to pickled durations file.
            labels: List of label strings to use.
            durs_type: String id of durations type to use.
            speakers: Path to speakers list file.
            speaker_table: Path to speakers ids mapping file.
            speaker_embs: Path to speakers embeddings file.
            batch_size: Number of samples in a batch.
            sample_rate: Target sampling rate for the data. Audio files will be resampled to sample_rate if they
                are not already.
            int_values: Bool indicating whether the audio file is saved as int data or float data.
            bos_id: Beginning of string symbol id used for seq2seq models.
            eos_id: End of string symbol id used for seq2seq models.
            pad_id: Token used to pad when collating samples in batches.
            blank_id: Int id of blank symbol.
            min_duration: All training files which have a duration less than min_duration are dropped.
            max_duration: All training files which have a duration more than max_duration are dropped.
            normalize_transcripts: Whether to use automatic text cleaning.
            trim_silence: Whether to trim silence from the beginning and end of the audio signal using
                librosa.effects.trim().
            load_audio: Controls whether the dataloader loads the audio signal and transcript or just the transcript.
            drop_last: See PyTorch DataLoader.
            shuffle: See PyTorch DataLoader.
            num_workers: See PyTorch DataLoader.
            sampler_type: String id of sampler type to use.
            bd_aug: Whether to use augmentation for blanks/durations.
        """

        super().__init__()

        # Set up dataset.
        self._featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=None)
        dataset_params = {
            'manifest_filepath': data,
            'labels': labels,
            'featurizer': self._featurizer,
            'max_duration': max_duration,
            'min_duration': min_duration,
            'normalize': normalize_transcripts,
            'trim': trim_silence,
            'bos_id': bos_id,
            'eos_id': eos_id,
            'load_audio': load_audio,
            'add_misc': True,
        }
        audio_dataset = AudioDataset(**dataset_params)
        # Combine audio with durations and optional speaker info for TalkNet training.
        self._dataset = TalkNetDataset(audio_dataset, durs, durs_type, speakers, speaker_table, speaker_embs)
        self._durs_type = durs_type
        self._pad_id = pad_id
        self._blank_id = blank_id
        self._space_id = labels.index(' ')
        self._sample_rate = sample_rate
        self._load_audio = load_audio
        self._bd_aug = bd_aug

        sampler = None
        if self._placement == nemo.core.DeviceType.AllGpu:
            if sampler_type == 'all':
                sampler = AllSampler(self._dataset)
            elif sampler_type == 'default':
                sampler = torch.utils.data.distributed.DistributedSampler(self._dataset)  # noqa
            elif sampler_type == 'super-smart':
                sampler = LengthsAwareSampler(
                    dataset=self._dataset,
                    lengths=[e.duration for e in audio_dataset.collection],
                    batch_size=batch_size,
                )
            else:
                raise ValueError(f"Invalid sampler type: {sampler_type}")

        self._dataloader = torch.utils.data.DataLoader(  # noqa
            dataset=self._dataset,
            batch_size=batch_size,
            collate_fn=self._collate,
            drop_last=drop_last,
            shuffle=shuffle if sampler is None else False,
            sampler=sampler,
            num_workers=num_workers,
        )
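A hedged usage sketch for this constructor, assuming it belongs to a TalkNetDataLayer-style class (the class name and file paths below are assumptions, not taken from the source):

    # Hypothetical instantiation; the owning class name and paths are assumptions.
    data_layer = TalkNetDataLayer(
        data='/path/to/train_manifest.json',  # hypothetical manifest path
        durs='/path/to/durs.pkl',  # hypothetical pickled durations file
        labels=[' ', 'a', 'b', 'c'],  # must include ' ' (see labels.index(' ') above)
        durs_type='full-pad',
        batch_size=32,
        sampler_type='default',
    )

Because self._space_id is set with labels.index(' '), a label set without a space character makes this constructor raise ValueError.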