def test_pytorch_audio_dataset(self):
    featurizer = WaveformFeaturizer.from_config(self.featurizer_config)
    ds = AudioDataset(manifest_filepath=self.manifest_filepath, labels=self.labels, featurizer=featurizer)

    for i in range(len(ds)):
        if i == 5:
            logging.info(ds[i])
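# A minimal sketch of what `self.featurizer_config` could contain for these
# tests. Only the 'sample_rate' and 'int_values' keys are read below; the
# concrete values here are assumptions for illustration.
# featurizer_config = {
#     'sample_rate': 16000,   # target sampling rate for loaded audio
#     'int_values': False,    # whether samples are stored as ints or floats
# }
# featurizer = WaveformFeaturizer.from_config(featurizer_config)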
def __init__(
    self,
    manifest_filepath,
    durs_dir,
    labels,
    batch_size,
    sample_rate=16000,
    int_values=False,
    bos_id=None,
    eos_id=None,
    pad_id=None,
    min_duration=0.1,
    max_duration=None,
    normalize_transcripts=True,
    trim_silence=False,
    load_audio=True,
    drop_last=False,
    shuffle=True,
    num_workers=0,
):
    super().__init__()

    # Set up dataset.
    self._featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=None)
    dataset_params = {
        'manifest_filepath': manifest_filepath,
        'labels': labels,
        'featurizer': self._featurizer,
        'max_duration': max_duration,
        'min_duration': min_duration,
        'normalize': normalize_transcripts,
        'trim': trim_silence,
        'bos_id': bos_id,
        'eos_id': eos_id,
        'load_audio': load_audio,
    }
    audio_dataset = AudioDataset(**dataset_params)
    self._dataset = fastspeech.FastSpeechDataset(audio_dataset, durs_dir)
    self._pad_id = pad_id
    self.sample_rate = sample_rate

    sampler = None
    if self._placement == nemo.core.DeviceType.AllGpu:
        sampler = torch.utils.data.distributed.DistributedSampler(self._dataset)

    self._dataloader = torch.utils.data.DataLoader(
        dataset=self._dataset,
        batch_size=batch_size,
        collate_fn=self._collate,
        drop_last=drop_last,
        shuffle=shuffle if sampler is None else False,
        sampler=sampler,
        num_workers=num_workers,
    )
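# A minimal usage sketch for the data layer defined by the __init__ above. The
# class name `FastSpeechDataLayer`, the paths, and the label list are
# assumptions for illustration; only the keyword arguments mirror the
# constructor signature.
# data_layer = FastSpeechDataLayer(
#     manifest_filepath='train_manifest.json',
#     durs_dir='durations/',
#     labels=[' ', 'a', 'b', 'c'],  # full label set omitted for brevity
#     batch_size=32,
#     sample_rate=16000,
#     shuffle=True,
# )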
def construct_perturbed_dataset(self, perturbation):
    if perturbation is not None:
        # Execute perturbations with 100% probability
        prob_perturb = [(1.0, perturbation)]
        audio_augmentor = perturb.AudioAugmentor(prob_perturb)
    else:
        audio_augmentor = None

    featurizer = WaveformFeaturizer(
        sample_rate=self.featurizer_config['sample_rate'],
        int_values=self.featurizer_config['int_values'],
        augmentor=audio_augmentor,
    )
    ds = AudioLabelDataset(manifest_filepath=self.manifest_filepath, labels=self.labels, featurizer=featurizer)
    return ds
def test_pytorch_audio_dataset_with_perturbation(self):
    perturbations = [
        perturb.WhiteNoisePerturbation(min_level=-90, max_level=-46),
        perturb.ShiftPerturbation(min_shift_ms=-5.0, max_shift_ms=5.0),
    ]
    # Execute perturbations with 100% probability
    prob_perturb = [(1.0, p) for p in perturbations]
    audio_augmentor = perturb.AudioAugmentor(prob_perturb)

    featurizer = WaveformFeaturizer(
        sample_rate=self.featurizer_config['sample_rate'],
        int_values=self.featurizer_config['int_values'],
        augmentor=audio_augmentor,
    )
    ds = AudioLabelDataset(manifest_filepath=self.manifest_filepath, labels=self.labels, featurizer=featurizer)

    for i in range(len(ds)):
        logging.info(ds[i])
def __init__(
    self,
    data: str,
    durs: str,
    labels: List[str],
    durs_type: str = 'full-pad',
    speakers: str = None,
    speaker_table: str = None,
    speaker_embs: str = None,
    batch_size: int = 32,
    sample_rate: int = 16000,
    int_values: bool = False,
    bos_id: Optional[int] = None,
    eos_id: Optional[int] = None,
    pad_id: Optional[int] = None,
    blank_id: Optional[int] = None,
    min_duration: Optional[float] = 0.1,
    max_duration: Optional[float] = None,
    normalize_transcripts: bool = True,
    trim_silence: bool = False,
    load_audio: bool = True,
    drop_last: bool = False,
    shuffle: bool = True,
    num_workers: int = 0,
    sampler_type: str = 'default',
    bd_aug: bool = False,
):
    """Creates TalkNet data iterator.

    Args:
        data: Path to dataset manifest file.
        durs: Path to pickled durations file.
        labels: List of label strings to use.
        durs_type: String id of durations type to use.
        speakers: Speakers list file.
        speaker_table: Speakers ids mapping.
        speaker_embs: Speakers embeddings file.
        batch_size: Number of samples in a batch.
        sample_rate: Target sampling rate for data. Audio files will be resampled to sample_rate if they are not
            already.
        int_values: Bool indicating whether the audio file is saved as int data or float data.
        bos_id: Beginning-of-string symbol id used for seq2seq models.
        eos_id: End-of-string symbol id used for seq2seq models.
        pad_id: Token used to pad when collating samples in batches.
        blank_id: Int id of blank symbol.
        min_duration: All training files with a duration less than min_duration are dropped.
        max_duration: All training files with a duration greater than max_duration are dropped.
        normalize_transcripts: Whether to use automatic text cleaning.
        trim_silence: Whether to trim silence from the beginning and end of the audio signal using
            librosa.effects.trim().
        load_audio: Controls whether the dataloader loads the audio signal and transcript or just the transcript.
        drop_last: See PyTorch DataLoader.
        shuffle: See PyTorch DataLoader.
        num_workers: See PyTorch DataLoader.
        sampler_type: String id of sampler type to use.
        bd_aug: True if augmentation for blanks/durs should be used.
    """
    super().__init__()

    # Set up dataset.
    self._featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=None)
    dataset_params = {
        'manifest_filepath': data,
        'labels': labels,
        'featurizer': self._featurizer,
        'max_duration': max_duration,
        'min_duration': min_duration,
        'normalize': normalize_transcripts,
        'trim': trim_silence,
        'bos_id': bos_id,
        'eos_id': eos_id,
        'load_audio': load_audio,
        'add_misc': True,
    }
    audio_dataset = AudioDataset(**dataset_params)
    self._dataset = TalkNetDataset(audio_dataset, durs, durs_type, speakers, speaker_table, speaker_embs)
    self._durs_type = durs_type
    self._pad_id = pad_id
    self._blank_id = blank_id
    self._space_id = labels.index(' ')
    self._sample_rate = sample_rate
    self._load_audio = load_audio
    self._bd_aug = bd_aug

    # Choose a sampler for multi-GPU training.
    sampler = None
    if self._placement == nemo.core.DeviceType.AllGpu:
        if sampler_type == 'all':
            sampler = AllSampler(self._dataset)
        elif sampler_type == 'default':
            sampler = torch.utils.data.distributed.DistributedSampler(self._dataset)  # noqa
        elif sampler_type == 'super-smart':
            sampler = LengthsAwareSampler(
                dataset=self._dataset,
                lengths=[e.duration for e in audio_dataset.collection],
                batch_size=batch_size,
            )
        else:
            raise ValueError("Invalid sampler type.")

    self._dataloader = torch.utils.data.DataLoader(  # noqa
        dataset=self._dataset,
        batch_size=batch_size,
        collate_fn=self._collate,
        drop_last=drop_last,
        shuffle=shuffle if sampler is None else False,
        sampler=sampler,
        num_workers=num_workers,
    )
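# A minimal usage sketch for the TalkNet data layer defined by the __init__
# above. The class name `TalkNetDataLayer`, the paths, and the label list are
# assumptions for illustration; only the keyword arguments mirror the
# constructor signature.
# data_layer = TalkNetDataLayer(
#     data='train_manifest.json',
#     durs='durs.pkl',
#     labels=[' ', 'a', 'b', 'c'],  # full label set omitted for brevity
#     durs_type='full-pad',
#     batch_size=32,
#     sample_rate=16000,
#     pad_id=0,
#     blank_id=1,
#     sampler_type='default',
# )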