def __getitem__(self, index):
    wavpath = self.wavpaths[index]
    wav, sr = torchaudio.load(wavpath)
    wav = wav.squeeze().numpy()

    if self.split == 'train':
        transforms = WavformAugmentation(sr)
        wav = transforms(wav)

    # pad to 150k frames
    if len(wav) > self.max_length:
        # randomly pick which side to chop off (fix if validation)
        flip = (bool(random.getrandbits(1))
                if self.split == 'train' else True)
        padded = (wav[:self.max_length] if flip
                  else wav[-self.max_length:])
    else:
        padded = np.zeros(self.max_length)
        padded[:len(wav)] = wav  # pad w/ silence

    label = self.labels[index]
    if self.caller_intent == 'dialog_acts':
        label = torch.LongTensor(label)
    elif self.caller_intent == 'sentiment':
        label = [label['positive'], label['neutral'], label['negative']]
        label = torch.FloatTensor(label)

    padded = torch.from_numpy(padded).float()
    return index, padded, label
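# --- Illustrative sketch (not part of the original source) -----------------
# The crop/pad step above recurs in all three loaders. A minimal standalone
# version of that logic, assuming only numpy and the stdlib; the helper name
# `crop_or_pad` is hypothetical.
import random
import numpy as np

def crop_or_pad(wav, max_length, train):
    if len(wav) > max_length:
        # train: keep head or tail at random; eval: deterministically keep head
        keep_head = bool(random.getrandbits(1)) if train else True
        return wav[:max_length] if keep_head else wav[-max_length:]
    padded = np.zeros(max_length, dtype=wav.dtype)
    padded[:len(wav)] = wav  # right-pad with silence
    return padded

# e.g. crop_or_pad(np.zeros(90000, dtype='float32'), 150000, train=True).shape
# -> (150000,)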
def __getitem__(self, index):
    wav_name = self.wav_paths[index]
    wav_path = os.path.join(self.root, wav_name)

    label = self.labels[index]
    if self.label_type == 'action':
        label = FLUENTSPEECH_ACTIONS.index(label)
    elif self.label_type == 'object':
        label = FLUENTSPEECH_OBJECTS.index(label)
    elif self.label_type == 'location':
        label = FLUENTSPEECH_LOCATIONS.index(label)

    wavform, sample_rate = torchaudio.load(wav_path)
    wavform = wavform[0].numpy()

    if self.wavform_transforms:
        transforms = WavformAugmentation(sample_rate)
        wavform = transforms(wavform)

    # pad to 150k frames
    if len(wavform) > self.max_length:
        # randomly pick which side to chop off (fix if validation)
        flip = (bool(random.getrandbits(1)) if self.train else True)
        padded = (wavform[:self.max_length] if flip
                  else wavform[-self.max_length:])
    else:
        padded = np.zeros(self.max_length)
        padded[:len(wavform)] = wavform  # pad w/ silence

    # hop lengths chosen so the mel spectrogram is input_size x input_size
    hop_length_dict = {224: 672, 112: 1344, 64: 2360, 32: 4800}
    spectrum = librosa.feature.melspectrogram(
        y=padded,  # keyword args: positional y/sr are rejected by librosa >= 0.10
        sr=sample_rate,
        hop_length=hop_length_dict[self.input_size],
        n_mels=self.input_size,
    )

    if self.spectral_transforms:  # apply time and frequency masks
        transforms = SpectrumAugmentation()
        spectrum = transforms(spectrum)

    # log mel-spectrogram (melspectrogram already returns power, so squaring
    # again doubles the dB scale)
    spectrum = librosa.power_to_db(spectrum**2)
    spectrum = torch.from_numpy(spectrum).float()
    spectrum = spectrum.unsqueeze(0)

    if self.spectral_transforms:  # apply noise on spectral
        noise_stdev = 0.25 * self.normalize_stdev[0]
        noise = torch.randn_like(spectrum) * noise_stdev
        spectrum = spectrum + noise

    normalize = Normalize(self.normalize_mean, self.normalize_stdev)
    spectrum = normalize(spectrum)

    return index, spectrum, int(label)
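# --- Illustrative check (not part of the original source) ------------------
# Why those hop lengths: with librosa's default center=True, an STFT over n
# samples yields 1 + floor(n / hop_length) frames. For the fixed 150k-sample
# inputs above, each entry of hop_length_dict produces a square
# (input_size x input_size) mel spectrogram. Assumes max_length = 150000.
import math

for input_size, hop_length in {224: 672, 112: 1344, 64: 2360, 32: 4800}.items():
    n_frames = 1 + math.floor(150000 / hop_length)
    assert n_frames == input_size  # 224, 112, 64, 32 respectively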
def __getitem__(self, index):
    wavpath = self.wavpaths[index]
    wav, sr = torchaudio.load(wavpath)
    wav = wav.squeeze().numpy()

    if self.wavform_transforms and self.split == 'train':
        transforms = WavformAugmentation(sr)
        wav = transforms(wav)

    # pad to 150k frames
    if len(wav) > self.max_length:
        # randomly pick which side to chop off (fix if validation)
        flip = (bool(random.getrandbits(1))
                if self.split == 'train' else True)
        padded = (wav[:self.max_length] if flip
                  else wav[-self.max_length:])
    else:
        padded = np.zeros(self.max_length)
        padded[:len(wav)] = wav  # pad w/ silence

    spectrum = librosa.feature.melspectrogram(
        y=padded,  # keyword args: positional y/sr are rejected by librosa >= 0.10
        sr=sr,
        hop_length=HARPER_VALLEY_HOP_LENGTH_DICT[self.input_size],
        n_mels=self.input_size,
    )

    if self.spectral_transforms:  # apply time and frequency masks
        transforms = SpectrumAugmentation()
        spectrum = transforms(spectrum)

    # log mel-spectrogram
    spectrum = librosa.power_to_db(spectrum**2)
    spectrum = torch.from_numpy(spectrum).float()
    spectrum = spectrum.unsqueeze(0)

    if self.spectral_transforms:  # apply noise on spectral
        noise_stdev = 0.25 * self.normalize_stdev[0]
        noise = torch.randn_like(spectrum) * noise_stdev
        spectrum = spectrum + noise

    label = self.labels[index]
    if self.caller_intent == 'dialog_acts':
        label = torch.LongTensor(label)
    elif self.caller_intent == 'sentiment':
        label = [label['positive'], label['neutral'], label['negative']]
        label = torch.FloatTensor(label)

    normalize = Normalize(self.normalize_mean, self.normalize_stdev)
    spectrum = normalize(spectrum)

    return index, spectrum, label
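# --- Illustrative note (not part of the original source) -------------------
# The Gaussian noise above is injected *before* Normalize with stdev
# 0.25 * normalize_stdev, so after (x - mean) / stdev the noise component has
# stdev ~0.25 in normalized units, independent of the dataset statistics.
# The mean/stdev values below are hypothetical.
import torch

mean, stdev = 3.1, 7.2
x = torch.randn(1, 64, 64) * stdev + mean      # fake pre-normalization input
noise = torch.randn_like(x) * (0.25 * stdev)
post = ((x + noise) - mean) / stdev
# combined stdev ~ sqrt(1 + 0.25**2) ~= 1.03; the noise alone contributes 0.25
print(round(post.std().item(), 2))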