class LibriSpeechTransfer(Dataset):
    """
    Divide the dev-clean split of LibriSpeech into train and 
    test splits by speaker so we can train a logreg fairly.
    """
    def __init__(
        self,
        root,
        train=True,
        spectral_transforms=False,
        wavform_transforms=False,
        max_length=150526,
        input_size=112,
        normalize_mean=LIBRISPEECH_MEAN,
        normalize_stdev=LIBRISPEECH_STDEV,
    ):
        super().__init__()
        assert not (spectral_transforms and wavform_transforms)
        self.dataset = LIBRISPEECH(root,
                                   url='dev-clean',
                                   download=True,
                                   folder_in_archive='LibriSpeech')

        all_speaker_ids = self.get_speaker_ids(self.dataset)
        unique_speaker_ids = sorted(list(set(all_speaker_ids)))
        num_unique_speakers = len(unique_speaker_ids)
        self.speaker_id_map = dict(
            zip(unique_speaker_ids, range(num_unique_speakers)))
        self.all_speaker_ids = np.array(
            [self.speaker_id_map[sid] for sid in all_speaker_ids])
        self.num_unique_speakers = num_unique_speakers
        self.num_labels = num_unique_speakers

        self.indices = self.train_test_split(self.dataset,
                                             all_speaker_ids,
                                             train=train)
        self.spectral_transforms = spectral_transforms
        self.wavform_transforms = wavform_transforms
        self.max_length = max_length
        self.train = train
        self.input_size = input_size
        self.normalize_mean = normalize_mean
        self.normalize_stdev = normalize_stdev

    def get_speaker_ids(self, dataset):
        speaker_ids = []
        for i in range(len(dataset)):
            fileid = dataset._walker[i]
            speaker_id = self.load_librispeech_speaker_id(
                fileid,
                dataset._path,
                dataset._ext_audio,
                dataset._ext_txt,
            )
            speaker_ids.append(speaker_id)
        return np.array(speaker_ids)

    def train_test_split(self, dataset, speaker_ids, train=True):
        rs = np.random.RandomState(42)  # fix seed so reproducible splitting

        unique_speaker_ids = sorted(set(speaker_ids))
        unique_speaker_ids = np.array(unique_speaker_ids)

        # split each speaker's utterances 80/20 between train and test
        train_indices, test_indices = [], []
        for speaker_id in unique_speaker_ids:
            speaker_indices = np.where(speaker_ids == speaker_id)[0]
            size = len(speaker_indices)
            rs.shuffle(speaker_indices)
            train_size = int(0.8 * size)
            train_indices.extend(speaker_indices[:train_size].tolist())
            test_indices.extend(speaker_indices[train_size:].tolist())

        return train_indices if train else test_indices

    def load_librispeech_speaker_id(self, fileid, path, ext_audio, ext_txt):
        speaker_id, _, _ = fileid.split("-")
        return int(speaker_id)

    def __getitem__(self, index):
        # NOTE: overwrite index with our custom indices mapping examples
        #       to the training and test splits
        index = self.indices[index]

        try:
            wavform, sample_rate, _, speaker_id, _, _ = self.dataset.__getitem__(
                index)
        except Exception:  # fall back to the next utterance if loading fails
            index2 = (index + 1) % len(self.dataset)
            wavform, sample_rate, _, speaker_id, _, _ = self.dataset.__getitem__(
                index2)

        speaker_id = self.speaker_id_map[speaker_id]
        wavform = np.asarray(wavform[0])

        if self.wavform_transforms:
            transforms = WavformAugmentation(sample_rate)
            wavform = transforms(wavform)

        # crop or pad to max_length samples (~150k)
        if len(wavform) > self.max_length:
            # randomly pick which end to keep (always the start for validation)
            flip = (bool(random.getrandbits(1)) if self.train else True)
            padded = (wavform[:self.max_length]
                      if flip else wavform[-self.max_length:])
        else:
            padded = np.zeros(self.max_length)
            padded[:len(wavform)] = wavform  # pad w/ silence

        # hop lengths chosen so the spectrogram is roughly input_size frames wide
        hop_length_dict = {224: 672, 112: 1344, 64: 2360, 32: 4800}
        spectrum = librosa.feature.melspectrogram(
            y=padded,
            sr=sample_rate,
            hop_length=hop_length_dict[self.input_size],
            n_mels=self.input_size,
        )

        if self.spectral_transforms:  # apply time and frequency masks
            transforms = SpectrumAugmentation()
            spectrum = transforms(spectrum)

        # log mel-spectrogram
        spectrum = librosa.power_to_db(spectrum**2)
        spectrum = torch.from_numpy(spectrum).float()
        spectrum = spectrum.unsqueeze(0)

        if self.spectral_transforms:  # add Gaussian noise to the spectrogram
            noise_stdev = 0.25 * self.normalize_stdev[0]
            noise = torch.randn_like(spectrum) * noise_stdev
            spectrum = spectrum + noise

        normalize = Normalize(self.normalize_mean, self.normalize_stdev)
        spectrum = normalize(spectrum)

        return index, spectrum, speaker_id

    def __len__(self):
        return len(self.indices)
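

# --- Usage sketch (added for illustration; not part of the original source) ---
# A minimal example of wrapping LibriSpeechTransfer in DataLoaders for the
# speaker-ID probe described in its docstring. The root path and batch size
# below are placeholders, not values taken from the original code.
def _example_transfer_loaders(root='/data/librispeech', batch_size=64):
    from torch.utils.data import DataLoader
    train_set = LibriSpeechTransfer(root, train=True)
    test_set = LibriSpeechTransfer(root, train=False)
    # each batch yields (index, spectrum, speaker_id); spectrum has shape
    # (batch, 1, input_size, ~input_size) given the hop-length table above
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)
    return train_loader, test_loader
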
class LibriSpeech(Dataset):
    def __init__(
        self,
        root,
        train=True,
        spectral_transforms=False,
        wavform_transforms=True,
        train_urls=(
            'train-clean-100',
            'train-clean-360',
            'train-other-500',
        ),
        test_url='dev-clean',
        max_length=150526,
        input_size=112,
        normalize_mean=LIBRISPEECH_MEAN,
        normalize_stdev=LIBRISPEECH_STDEV,
    ):
        super().__init__()
        # augmentation is applied either at the waveform level or at the
        # spectrogram level, never both
        assert not (spectral_transforms and wavform_transforms)

        if train:
            datasets = []
            for train_url in train_urls:
                dataset = LIBRISPEECH(root,
                                      url=train_url,
                                      download=True,
                                      folder_in_archive='LibriSpeech')
                datasets.append(dataset)
            self.dataset = ConcatDatasets(datasets)
        else:
            self.dataset = LIBRISPEECH(root,
                                       url=test_url,
                                       download=True,
                                       folder_in_archive='LibriSpeech')

        self.wavform_transforms = wavform_transforms
        self.spectral_transforms = spectral_transforms
        self.max_length = max_length
        self.train = train
        self.input_size = input_size
        self.normalize_mean = normalize_mean
        self.normalize_stdev = normalize_stdev
        all_speaker_ids = self.get_speaker_ids()
        unique_speaker_ids = sorted(list(set(all_speaker_ids)))
        num_unique_speakers = len(unique_speaker_ids)
        self.speaker_id_map = dict(
            zip(unique_speaker_ids, range(num_unique_speakers)))
        self.all_speaker_ids = np.array(
            [self.speaker_id_map[sid] for sid in all_speaker_ids])
        self.num_unique_speakers = num_unique_speakers

    def get_speaker_ids(self):
        if self.train:
            speaker_ids = []
            for dataset in self.dataset.datasets:
                speaker_ids_i = self._get_speaker_ids(dataset)
                speaker_ids.append(speaker_ids_i)
            return np.concatenate(speaker_ids)
        else:
            return self._get_speaker_ids(self.dataset)

    def _get_speaker_ids(self, dataset):
        speaker_ids = []
        for i in range(len(dataset)):
            fileid = dataset._walker[i]
            speaker_id = self.load_librispeech_speaker_id(
                fileid,
                dataset._path,
                dataset._ext_audio,
                dataset._ext_txt,
            )
            speaker_ids.append(speaker_id)
        return np.array(speaker_ids)

    def load_librispeech_speaker_id(self, fileid, path, ext_audio, ext_txt):
        speaker_id, _, _ = fileid.split("-")
        return int(speaker_id)

    def __getitem__(self, index):
        # bump past indices listed as bad/corrupt in BAD_LIBRISPEECH_INDICES
        if index in BAD_LIBRISPEECH_INDICES:
            index = index + 1

        wavform, sample_rate, _, speaker_id, _, _ = self.dataset.__getitem__(
            index)

        speaker_id = self.speaker_id_map[speaker_id]
        wavform = np.asarray(wavform[0])

        if self.wavform_transforms:
            transforms = WavformAugmentation(sample_rate)
            wavform = transforms(wavform)

        # crop or pad to max_length samples (~150k)
        if len(wavform) > self.max_length:
            # randomly pick which end to keep (always the start for validation)
            flip = (bool(random.getrandbits(1)) if self.train else True)
            padded = (wavform[:self.max_length]
                      if flip else wavform[-self.max_length:])
        else:
            padded = np.zeros(self.max_length)
            padded[:len(wavform)] = wavform  # pad w/ silence

        spectrum = librosa.feature.melspectrogram(
            y=padded,
            sr=sample_rate,
            hop_length=LIBRISPEECH_HOP_LENGTH_DICT[self.input_size],
            n_mels=self.input_size,
        )

        if self.spectral_transforms:  # apply time and frequency masks
            transforms = SpectrumAugmentation()
            spectrum = transforms(spectrum)

        # log mel-spectrogram
        spectrum = librosa.power_to_db(spectrum**2)
        spectrum = torch.from_numpy(spectrum).float()
        spectrum = spectrum.unsqueeze(0)

        if self.spectral_transforms:  # add Gaussian noise to the spectrogram
            noise_stdev = 0.25 * self.normalize_stdev[0]
            noise = torch.randn_like(spectrum) * noise_stdev
            spectrum = spectrum + noise

        normalize = Normalize(self.normalize_mean, self.normalize_stdev)
        spectrum = normalize(spectrum)

        return index, spectrum, speaker_id

    def __len__(self):
        return len(self.dataset)
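

# --- Normalization-stats sketch (added for illustration; not in the original) ---
# LIBRISPEECH_MEAN and LIBRISPEECH_STDEV are assumed to be precomputed statistics
# of the unnormalized log-mel spectrograms. A rough way such constants could be
# estimated from a raw torchaudio LIBRISPEECH dataset, mirroring the feature
# pipeline used above (Normalize presumably expects them as one-element lists
# for the single audio channel):
def _example_estimate_normalization_stats(raw_dataset, input_size=112,
                                          num_examples=500):
    hop_length_dict = {224: 672, 112: 1344, 64: 2360, 32: 4800}
    values = []
    for i in range(min(num_examples, len(raw_dataset))):
        wavform, sample_rate, _, _, _, _ = raw_dataset[i]
        wavform = np.asarray(wavform[0])
        spectrum = librosa.feature.melspectrogram(
            y=wavform,
            sr=sample_rate,
            hop_length=hop_length_dict[input_size],
            n_mels=input_size,
        )
        spectrum = librosa.power_to_db(spectrum**2)
        values.append(spectrum.reshape(-1))
    values = np.concatenate(values)
    return float(values.mean()), float(values.std())
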
Example #3

class LibriSpeech(Dataset):
    def __init__(
        self,
        root=DATA_ROOTS['librispeech'],
        train=True,
        small=False,
        spectral_transforms=False,
        wavform_transforms=True,
        test_url='dev-clean',
        max_length=150526,
        input_size=224,
        normalize_mean=LIBRISPEECH_MEAN,
        normalize_stdev=LIBRISPEECH_STDEV,
    ):
        super().__init__()
        # augmentation is applied either at the waveform level or at the
        # spectrogram level, never both
        assert not (spectral_transforms and wavform_transforms)
        if train:
            if small:
                self.dataset = LIBRISPEECH(root,
                                           url='train-clean-100',
                                           download=True,
                                           folder_in_archive='LibriSpeech')
            else:
                self.dataset1 = LIBRISPEECH(root,
                                            url='train-clean-100',
                                            download=True,
                                            folder_in_archive='LibriSpeech')
                self.dataset2 = LIBRISPEECH(root,
                                            url='train-clean-360',
                                            download=True,
                                            folder_in_archive='LibriSpeech')
                self.dataset3 = LIBRISPEECH(root,
                                            url='train-other-500',
                                            download=True,
                                            folder_in_archive='LibriSpeech')
        else:
            self.dataset = LIBRISPEECH(root,
                                       url=test_url,
                                       download=True,
                                       folder_in_archive='LibriSpeech')

        self.spectral_transforms = spectral_transforms
        self.wavform_transforms = wavform_transforms
        self.max_length = max_length
        self.train = train
        self.small = small
        all_speaker_ids = self.get_speaker_ids()
        unique_speaker_ids = sorted(list(set(all_speaker_ids)))
        num_unique_speakers = len(unique_speaker_ids)
        self.speaker_id_map = dict(
            zip(unique_speaker_ids, range(num_unique_speakers)))
        self.all_speaker_ids = np.array(
            [self.speaker_id_map[sid] for sid in all_speaker_ids])
        self.num_unique_speakers = num_unique_speakers
        self.num_labels = num_unique_speakers
        self.input_size = input_size
        self.FILTER_SIZE = input_size
        self.normalize_mean = normalize_mean
        self.normalize_stdev = normalize_stdev

    def get_speaker_ids(self):
        if self.train and not self.small:
            speaker_ids_1 = self._get_speaker_ids(self.dataset1)
            speaker_ids_2 = self._get_speaker_ids(self.dataset2)
            speaker_ids_3 = self._get_speaker_ids(self.dataset3)
            return np.concatenate(
                [speaker_ids_1, speaker_ids_2, speaker_ids_3])
        else:
            return self._get_speaker_ids(self.dataset)

    def _get_speaker_ids(self, dataset):
        speaker_ids = []
        for i in range(len(dataset)):
            fileid = dataset._walker[i]
            speaker_id = self.load_librispeech_speaker_id(
                fileid,
                dataset._path,
                dataset._ext_audio,
                dataset._ext_txt,
            )
            speaker_ids.append(speaker_id)
        return np.array(speaker_ids)

    def load_librispeech_speaker_id(self, fileid, path, ext_audio, ext_txt):
        speaker_id, _, _ = fileid.split("-")
        return int(speaker_id)

    def __getitem__(self, index):

        if self.train and not self.small:
            if index >= (len(self.dataset1) + len(self.dataset2)):
                try:
                    wavform, sample_rate, _, speaker_id, _, _ = \
                        self.dataset3.__getitem__(index - len(self.dataset1) - len(self.dataset2))
                except Exception:  # fall back to the next utterance if loading fails
                    index2 = (index - len(self.dataset1) - len(self.dataset2) +
                              1) % len(self.dataset3)
                    wavform, sample_rate, _, speaker_id, _, _ = \
                        self.dataset3.__getitem__(index2)
            elif index >= len(self.dataset1):
                try:
                    wavform, sample_rate, _, speaker_id, _, _ = \
                        self.dataset2.__getitem__(index - len(self.dataset1))
                except Exception:
                    index2 = (index - len(self.dataset1) + 1) % len(
                        self.dataset2)
                    wavform, sample_rate, _, speaker_id, _, _ = self.dataset2.__getitem__(
                        index2)
            else:
                try:
                    wavform, sample_rate, _, speaker_id, _, _ = self.dataset1.__getitem__(
                        index)
                except Exception:
                    index2 = (index + 1) % len(self.dataset1)
                    wavform, sample_rate, _, speaker_id, _, _ = self.dataset1.__getitem__(
                        index2)
        else:
            try:
                wavform, sample_rate, _, speaker_id, _, _ = self.dataset.__getitem__(
                    index)
            except Exception:
                index2 = (index + 1) % len(self.dataset)
                wavform, sample_rate, _, speaker_id, _, _ = self.dataset.__getitem__(
                    index2)

        speaker_id = self.speaker_id_map[speaker_id]
        wavform = np.asarray(wavform[0])

        if self.wavform_transforms:
            transforms = WavformAugmentation(sample_rate)
            wavform = transforms(wavform)

        # crop or pad to max_length samples (~150k)
        if len(wavform) > self.max_length:
            # randomly pick which end to keep (always the start for validation)
            flip = (bool(random.getrandbits(1)) if self.train else True)
            padded = (wavform[:self.max_length]
                      if flip else wavform[-self.max_length:])
        else:
            padded = np.zeros(self.max_length)
            padded[:len(wavform)] = wavform  # pad w/ silence

        # hop lengths chosen so the spectrogram is roughly input_size frames wide
        hop_length_dict = {224: 672, 112: 1344, 64: 2360, 32: 4800}
        spectrum = librosa.feature.melspectrogram(
            y=padded,
            sr=sample_rate,
            hop_length=hop_length_dict[self.input_size],
            n_mels=self.input_size,
        )
        if self.spectral_transforms:  # apply time and frequency masks
            transforms = SpectrumAugmentation()
            spectrum = transforms(spectrum)

        # log mel-spectrogram
        spectrum = librosa.power_to_db(spectrum**2)
        spectrum = torch.from_numpy(spectrum).float()
        spectrum = spectrum.unsqueeze(0)

        if self.spectral_transforms:  # add Gaussian noise to the spectrogram
            noise_stdev = 0.25 * self.normalize_stdev[0]
            noise = torch.randn_like(spectrum) * noise_stdev
            spectrum = spectrum + noise

        normalize = Normalize(self.normalize_mean, self.normalize_stdev)
        spectrum = normalize(spectrum)

        return index, spectrum, speaker_id

    def __len__(self):
        if self.train and not self.small:
            return len(self.dataset1) + len(self.dataset2) + len(self.dataset3)
        else:
            return len(self.dataset)
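

# --- Smoke-test sketch (added for illustration; not part of the original source) ---
# Builds the small single-URL training variant and prints the shape of one
# example. Assumes DATA_ROOTS['librispeech'] (the default root above) points at
# a valid LibriSpeech download location.
if __name__ == '__main__':
    dataset = LibriSpeech(train=True, small=True, input_size=224)
    idx, spectrum, speaker_id = dataset[0]
    # expected: a spectrum of shape (1, 224, ~224) and an integer speaker label
    print(idx, spectrum.shape, speaker_id)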