    def __getitem__(self, index):
        wavpath = self.wavpaths[index]
        wav, sr = torchaudio.load(wavpath)
        wav = wav.squeeze().numpy()

        if self.split == 'train':
            transforms = WavformAugmentation(sr)
            wav = transforms(wav)

        # truncate or zero-pad to self.max_length samples (150k frames)
        if len(wav) > self.max_length:
            # randomly pick which side to chop off (fixed to keep the start for validation)
            flip = (bool(random.getrandbits(1))
                    if self.split == 'train' else True)
            padded = (wav[:self.max_length]
                      if flip else wav[-self.max_length:])
        else:
            padded = np.zeros(self.max_length)
            padded[:len(wav)] = wav  # pad w/ silence

        label = self.labels[index]
        if self.caller_intent == 'dialog_acts':
            label = torch.LongTensor(label)
        elif self.caller_intent == 'sentiment':
            label = [label['positive'], label['neutral'], label['negative']]
            label = torch.FloatTensor(label)

        padded = torch.from_numpy(padded).float()
        return index, padded, label
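# The snippets here call a WavformAugmentation class whose implementation is not
# shown. Below is only a minimal, hypothetical sketch of the assumed interface
# (a callable built from the sample rate, applied to a raw numpy waveform);
# the additive-noise behaviour and noise_scale parameter are assumptions.
import numpy as np

class WavformAugmentation:
    """Hypothetical stand-in: applies light additive noise to a raw waveform."""

    def __init__(self, sample_rate, noise_scale=0.005):
        self.sample_rate = sample_rate
        self.noise_scale = noise_scale

    def __call__(self, wav):
        noise = np.random.randn(len(wav)) * self.noise_scale
        return (wav + noise).astype(wav.dtype)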
    def __getitem__(self, index):
        wav_name = self.wav_paths[index]
        wav_path = os.path.join(self.root, wav_name)

        label = self.labels[index]

        if self.label_type == 'action':
            label = FLUENTSPEECH_ACTIONS.index(label)
        elif self.label_type == 'object':
            label = FLUENTSPEECH_OBJECTS.index(label)
        elif self.label_type == 'location':
            label = FLUENTSPEECH_LOCATIONS.index(label)

        wavform, sample_rate = torchaudio.load(wav_path)
        wavform = wavform[0].numpy()

        if self.wavform_transforms:
            transforms = WavformAugmentation(sample_rate)
            wavform = transforms(wavform)

        # truncate or zero-pad to self.max_length samples (150k frames)
        if len(wavform) > self.max_length:
            # randomly pick which side to chop off (fixed to keep the start for validation)
            flip = (bool(random.getrandbits(1)) if self.train else True)
            padded = (wavform[:self.max_length]
                      if flip else wavform[-self.max_length:])
        else:
            padded = np.zeros(self.max_length)
            padded[:len(wavform)] = wavform  # pad w/ silence

        # hop lengths chosen so a 150k-sample clip yields roughly input_size
        # time frames (e.g. 150000 / 672 ~ 224)
        hop_length_dict = {224: 672, 112: 1344, 64: 2360, 32: 4800}
        spectrum = librosa.feature.melspectrogram(
            y=padded,
            sr=sample_rate,
            hop_length=hop_length_dict[self.input_size],
            n_mels=self.input_size,
        )

        if self.spectral_transforms:  # apply time and frequency masks
            transforms = SpectrumAugmentation()
            spectrum = transforms(spectrum)

        # log mel-spectrogram
        spectrum = librosa.power_to_db(spectrum**2)
        spectrum = torch.from_numpy(spectrum).float()
        spectrum = spectrum.unsqueeze(0)

        if self.spectral_transforms:  # apply noise on spectral
            noise_stdev = 0.25 * self.normalize_stdev[0]
            noise = torch.randn_like(spectrum) * noise_stdev
            spectrum = spectrum + noise

        normalize = Normalize(self.normalize_mean, self.normalize_stdev)
        spectrum = normalize(spectrum)

        return index, spectrum, int(label)
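# SpectrumAugmentation is likewise assumed but not defined in these snippets; per
# the inline comments it applies time and frequency masks (SpecAugment-style).
# A minimal, hypothetical sketch of that assumed behaviour on a
# (n_mels, n_frames) numpy mel spectrogram; the mask widths are assumptions.
import numpy as np

class SpectrumAugmentation:
    """Hypothetical stand-in: zeroes one random mel band and one random time span."""

    def __init__(self, freq_mask=8, time_mask=16):
        self.freq_mask = freq_mask
        self.time_mask = time_mask

    def __call__(self, spectrum):
        spectrum = spectrum.copy()
        n_mels, n_frames = spectrum.shape
        f0 = np.random.randint(0, max(1, n_mels - self.freq_mask))
        t0 = np.random.randint(0, max(1, n_frames - self.time_mask))
        spectrum[f0:f0 + self.freq_mask, :] = 0.0   # frequency mask
        spectrum[:, t0:t0 + self.time_mask] = 0.0   # time mask
        return spectrum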
    def __getitem__(self, index):
        wavpath = self.wavpaths[index]
        wav, sr = torchaudio.load(wavpath)
        wav = wav.squeeze().numpy()

        if self.wavform_transforms and self.split == 'train':
            transforms = WavformAugmentation(sr)
            wav = transforms(wav)

        # truncate or zero-pad to self.max_length samples (150k frames)
        if len(wav) > self.max_length:
            # randomly pick which side to chop off (fixed to keep the start for validation)
            flip = (bool(random.getrandbits(1))
                    if self.split == 'train' else True)
            padded = (wav[:self.max_length]
                      if flip else wav[-self.max_length:])
        else:
            padded = np.zeros(self.max_length)
            padded[:len(wav)] = wav  # pad w/ silence

        spectrum = librosa.feature.melspectrogram(
            y=padded,
            sr=sr,
            hop_length=HARPER_VALLEY_HOP_LENGTH_DICT[self.input_size],
            n_mels=self.input_size,
        )

        if self.spectral_transforms:  # apply time and frequency masks
            transforms = SpectrumAugmentation()
            spectrum = transforms(spectrum)

        # log mel-spectrogram
        spectrum = librosa.power_to_db(spectrum**2)
        spectrum = torch.from_numpy(spectrum).float()
        spectrum = spectrum.unsqueeze(0)

        if self.spectral_transforms:  # apply noise on spectral
            noise_stdev = 0.25 * self.normalize_stdev[0]
            noise = torch.randn_like(spectrum) * noise_stdev
            spectrum = spectrum + noise

        label = self.labels[index]
        if self.caller_intent == 'dialog_acts':
            label = torch.LongTensor(label)
        elif self.caller_intent == 'sentiment':
            label = [label['positive'], label['neutral'], label['negative']]
            label = torch.FloatTensor(label)

        normalize = Normalize(self.normalize_mean, self.normalize_stdev)
        spectrum = normalize(spectrum)

        return index, spectrum, label
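# A minimal, hypothetical usage sketch of the (index, input, label) contract the
# __getitem__ methods above return: batching through a standard DataLoader.
# The tiny DummyClipDataset is an illustration only, not part of the original code.
import torch
from torch.utils.data import DataLoader, Dataset

class DummyClipDataset(Dataset):
    """Stands in for the real datasets; mimics their (index, spectrum, label) output."""

    def __init__(self, n_items=8, n_mels=64, n_frames=64):
        self.n_items, self.n_mels, self.n_frames = n_items, n_mels, n_frames

    def __len__(self):
        return self.n_items

    def __getitem__(self, index):
        spectrum = torch.randn(1, self.n_mels, self.n_frames)  # fake log-mel clip
        label = index % 3                                       # fake class id
        return index, spectrum, label

loader = DataLoader(DummyClipDataset(), batch_size=4, shuffle=True)
for indices, spectra, labels in loader:
    # indices: (B,), spectra: (B, 1, n_mels, n_frames), labels: (B,)
    pass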