Example #1
 def __init__(self,
              training_files,
              segment_length,
              filter_length,
              hop_length,
              win_length,
              sampling_rate,
              data_folder,
              audio_format,
              return_stft=False):
     self.audio_files = files_to_list(training_files)
     random.seed(1234)
     random.shuffle(self.audio_files)
     self.return_stft = return_stft
     if self.return_stft:
         self.stft = STFT(filter_length=filter_length,
                          hop_length=hop_length,
                          win_length=win_length)
     else:
         self.stft = TacotronSTFT(filter_length=filter_length,
                                  hop_length=hop_length,
                                  win_length=win_length,
                                  sampling_rate=sampling_rate,
                                  mel_fmin=0.0,
                                  mel_fmax=8000.0)
     self.segment_length = segment_length
     self.sampling_rate = sampling_rate
     self.data_folder = data_folder
     self.audio_format = audio_format
Example #2
def data_generator(data_base, chunk_length_in_sec, label_order_list, batch_size):
    stft = TacotronSTFT(**SFT_CONFIG)
    keys = data_base.get_db_keys()
    batch_x = []
    batch_y = []
    while True:
        for k in keys:
            label = np.array([label_order_list.index(k)])
            # Iterate every recording for this key. (The original indexed
            # keys[0]/times[0] here, which re-read the same file for every key
            # and mislabeled the audio; cf. the corrected generator in Example #18.)
            for t in data_base.get_db_wv_times(k):
                sampling_rate, speech = data_base.get_wav(k, *t)
                chunks = int(len(speech) / sampling_rate / chunk_length_in_sec)
                # Slice bounds must be integers.
                audio_length = int(sampling_rate * chunk_length_in_sec)
                for chunk in range(chunks):
                    audio = speech[chunk * audio_length:(chunk + 1) * audio_length]
                    audio_norm = audio / MAX_WAV_VALUE
                    audio_norm = torch.from_numpy(audio_norm).float()
                    audio_norm = audio_norm.unsqueeze(0)
                    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
                    melspec = stft.mel_spectrogram(audio_norm)
                    mel_np = melspec.detach().numpy()
                    # Per-channel mean normalization over time.
                    for i in range(mel_np.shape[1]):
                        channel_mean = np.mean(mel_np[0, i, :])
                        mel_np[0, i, :] = mel_np[0, i, :] - channel_mean

                    batch_x.append(mel_np)
                    batch_y.append(label)
                    if len(batch_x) >= batch_size:
                        x = torch.from_numpy(np.array(batch_x))
                        # Variable() is a no-op in modern PyTorch; a plain tensor suffices.
                        y = torch.from_numpy(np.concatenate(batch_y)).long()
                        batch_x = []
                        batch_y = []
                        yield x, y
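The per-channel mean subtraction above can be written as a single vectorized step; a minimal equivalent sketch, with `mel_np` shaped `(1, n_mels, frames)` as produced by `mel_spectrogram`:

import numpy as np

def mean_normalize_channels(mel_np):
    # Equivalent to the per-channel loop above: subtract each mel
    # channel's mean over time, broadcasting across frames.
    return mel_np - mel_np.mean(axis=2, keepdims=True)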
Example #3
 def __init__(self,
              training_files,
              validation_files,
              segment_length,
              filter_length,
              hop_length,
              win_length,
              sampling_rate,
              mel_fmin,
              mel_fmax,
              train=True):
     if train:
         self.audio_files = files_to_list(training_files)
     else:
         self.audio_files = files_to_list(validation_files)
     random.seed(1234)
     random.shuffle(self.audio_files)
     self.stft = TacotronSTFT(filter_length=filter_length,
                              hop_length=hop_length,
                              win_length=win_length,
                              sampling_rate=sampling_rate,
                              mel_fmin=mel_fmin,
                              mel_fmax=mel_fmax)
     self.segment_length = segment_length
     self.sampling_rate = sampling_rate
Example #4
    def __init__(self,
                 training_files,
                 segment_length,
                 filter_length,
                 hop_length,
                 win_length,
                 sampling_rate,
                 mel_fmin,
                 mel_fmax,
                 debug=False):
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.debug = debug

        valid_files = []
        paths = files_to_list(training_files)
        for path in paths:
            dur = duration(path)
            if dur >= self.segment_length:
                valid_files.append(path)
        self.audio_files = valid_files
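The `duration()` helper used above is not reproduced on this page. A plausible stand-in that returns the clip length in samples (matching the comparison against `segment_length`), assuming PCM WAV input:

import wave

def duration(path):
    # Hypothetical implementation: number of audio frames (samples per
    # channel) in a PCM WAV file, comparable against segment_length.
    with wave.open(path, "rb") as w:
        return w.getnframes()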
Example #5
 def __init__(self, filter_length, hop_length, win_length,
              sampling_rate, mel_fmin, mel_fmax):
     self.stft = TacotronSTFT(filter_length=filter_length,
                              hop_length=hop_length,
                              win_length=win_length,
                              sampling_rate=sampling_rate,
                              mel_fmin=mel_fmin, mel_fmax=mel_fmax)
Example #6
 def __init__(self,
              training_files,
              segment_length,
              filter_length,
              hop_length,
              win_length,
              sampling_rate,
              mel_fmin,
              mel_fmax,
              load_mel_from_disk=False):
     self.load_mel_from_disk = load_mel_from_disk
     self.hop_length = hop_length
     self.audio_files = audiopaths_and_melpaths(
         training_files) if self.load_mel_from_disk else files_to_list(
             training_files)
     random.seed(1234)
     random.shuffle(self.audio_files)
     self.stft = TacotronSTFT(filter_length=filter_length,
                              hop_length=hop_length,
                              win_length=win_length,
                              sampling_rate=sampling_rate,
                              mel_fmin=mel_fmin,
                              mel_fmax=mel_fmax)
     self.segment_length = segment_length
     self.sampling_rate = sampling_rate
Example #7
 def __init__(self, training_files, segment_length, filter_length,
              hop_length, win_length, sampling_rate):
     self.audio_files = files_to_list(training_files)
     random.seed(1234)
     random.shuffle(self.audio_files)
     self.stft = TacotronSTFT(filter_length=filter_length,
                              hop_length=hop_length,
                              win_length=win_length,
                              sampling_rate=sampling_rate)
     self.segment_length = segment_length
     self.sampling_rate = sampling_rate
Example #8
 def __init__(self, training_files, segment_length, filter_length,
              hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
     self.audio_files = files_to_list(training_files)
     random.seed(1234)
     self.stft = TacotronSTFT(filter_length=filter_length,
                              hop_length=hop_length,
                              win_length=win_length,
                              sampling_rate=sampling_rate,
                              mel_fmin=mel_fmin, mel_fmax=mel_fmax)
     self.segment_length = segment_length
     self.sampling_rate = sampling_rate
     self.dataset = self.pack()
Example #9
 def __init__(self, training_files, segment_length, filter_length,
              hop_length, win_length, sampling_rate, mel_fmin, mel_fmax, checkpoint_store_command):
     self.audio_files = files_to_list(training_files)
     random.seed(1234)
     random.shuffle(self.audio_files)
     self.stft = TacotronSTFT(filter_length=filter_length,
                              hop_length=hop_length,
                              win_length=win_length,
                              sampling_rate=sampling_rate,
                              mel_fmin=mel_fmin, mel_fmax=mel_fmax)
     self.segment_length = segment_length
     self.sampling_rate = sampling_rate
Example #10
class Tacotron(AudioProcessing):
    """Preprocesses audio as in the Tacotron2 code."""
    def __init__(
        self,
        sampling_rate,
        n_mel_channels,
        filter_length=1024,
        hop_length=256,
        win_length=1024,
        mel_fmin=0.0,
        mel_fmax=8000.0,
    ):
        super(Tacotron, self).__init__(sampling_rate, n_mel_channels)
        self.taco_stft = TacotronSTFT(
            filter_length=filter_length,
            hop_length=hop_length,
            win_length=win_length,
            sampling_rate=sampling_rate,
            n_mel_channels=n_mel_channels,
            mel_fmin=mel_fmin,
            mel_fmax=mel_fmax,
        )

    def audio_to_mel(self, audio):
        audio = torch.tensor(audio)
        audio = audio.unsqueeze(0)
        melspec = self.taco_stft.mel_spectrogram(audio)
        melspec = torch.squeeze(melspec, 0)
        return melspec.T

    def mel_to_audio(self, mel):
        # TODO make it work in batch mode
        mel = mel.unsqueeze(0)
        mel_decompress = self.taco_stft.spectral_de_normalize(mel)
        mel_decompress = mel_decompress.transpose(1, 2).data.cpu()

        spec_from_mel_scaling = 1000

        spec_from_mel = torch.mm(mel_decompress[0], self.taco_stft.mel_basis)
        spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
        spec_from_mel = spec_from_mel * spec_from_mel_scaling

        GRIFFIN_ITERS = 60
        audio = griffin_lim(
            spec_from_mel[:, :, :-1],
            self.taco_stft.stft_fn,
            GRIFFIN_ITERS,
        )
        audio = audio.squeeze()
        audio = audio.cpu().numpy()

        return audio
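A hypothetical round trip through the wrapper above, assuming the `TacotronSTFT` and `griffin_lim` implementations from NVIDIA's Tacotron2 code are importable and the input is mono float audio in [-1, 1]:

import numpy as np

proc = Tacotron(sampling_rate=22050, n_mel_channels=80)
audio = np.random.uniform(-1, 1, 22050).astype(np.float32)  # 1 s of noise
mel = proc.audio_to_mel(audio)    # (frames, n_mel_channels) torch tensor
recon = proc.mel_to_audio(mel.T)  # Griffin-Lim estimate as a numpy array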
Example #11
 def __init__(self, data_path, valid, segment_length, filter_length,
              hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
     self.audio_files = files_to_list(data_path)
     self.valid = valid
     random.seed(1234)
     random.shuffle(self.audio_files)
     self.stft = TacotronSTFT(filter_length=filter_length,
                              hop_length=hop_length,
                              win_length=win_length,
                              sampling_rate=sampling_rate,
                              mel_fmin=mel_fmin, mel_fmax=mel_fmax)
     self.segment_length = segment_length
     self.sampling_rate = sampling_rate
Example #12
 def __init__(self, training_files, segment_length, filter_length,
              hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
     self.audio_files = files_to_list(training_files)
     self.audio_files_segment_pos = [-1 for _ in self.audio_files]
     random.seed(1234)
     random.shuffle(self.audio_files)
     self.stft = TacotronSTFT(filter_length=filter_length,
                              hop_length=hop_length,
                              win_length=win_length,
                              sampling_rate=sampling_rate,
                              mel_fmin=mel_fmin,
                              mel_fmax=mel_fmax)
     self.segment_length = segment_length
     self.sampling_rate = sampling_rate
Example #13
    def __init__(self, training_files, segment_length, mu_quantization,
                 filter_length, hop_length, win_length, sampling_rate):
        audio_files = utils.files_to_list(training_files)
        self.audio_files = audio_files
        random.seed(1234)
        random.shuffle(self.audio_files)

        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate)

        self.segment_length = segment_length
        self.mu_quantization = mu_quantization
        self.sampling_rate = sampling_rate
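`mu_quantization` feeds a mu-law companding step at `__getitem__` time (see Example #15). A sketch of the usual WaveNet-style formulation, shown here as an assumption since the project's `utils.mu_law_encode` is not reproduced on this page:

import math
import torch

def mu_law_encode(audio, mu_quantization=256):
    # Mu-law compand audio in [-1, 1], then quantize to integer bins.
    mu = mu_quantization - 1
    magnitude = torch.log1p(mu * torch.abs(audio)) / math.log1p(mu)
    signal = torch.sign(audio) * magnitude
    return ((signal + 1) / 2 * mu + 0.5).long()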
Example #14
    def __init__(self, training_files, validation_files, validation_windows,
                 segment_length, filter_length, hop_length, win_length,
                 sampling_rate, mel_fmin, mel_fmax, load_mel_from_disk,
                 preempthasis):
        self.audio_files = load_filepaths_and_text(training_files)

        print("Files before checking: ", len(self.audio_files))

        # Drop entries whose audio file is missing or shorter than one segment.
        valid_files = []
        for file in self.audio_files:
            if not os.path.exists(file[0]):
                print(file[0], "does not exist")
                continue

            audio_data, sample_r = load_wav_to_torch(file[0])
            if audio_data.size(0) <= segment_length:
                print(file[0], "is too short")
                continue
            valid_files.append(file)
        self.audio_files = valid_files

        print("Files after checking: ", len(self.audio_files))

        self.load_mel_from_disk = load_mel_from_disk
        self.speaker_ids = self.create_speaker_lookup_table(self.audio_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 n_mel_channels=160,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        if preempthasis:
            self.preempthasise = PreEmphasis(preempthasis)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.hop_length = hop_length
        self.win_length = win_length
Example #15
class Mel2SampOnehot(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, mu_quantization,
                 filter_length, hop_length, win_length, sampling_rate,
                 mel_fmin, mel_fmax):
        audio_files = utils.files_to_list(training_files)
        self.audio_files = audio_files
        random.seed(1234)
        random.shuffle(self.audio_files)
        mel_fmax = None if mel_fmax == -1 else mel_fmax

        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)

        self.segment_length = segment_length
        self.mu_quantization = mu_quantization
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / utils.MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = utils.load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data

        mel = self.get_mel(audio)
        audio = utils.mu_law_encode(audio / utils.MAX_WAV_VALUE,
                                    self.mu_quantization)
        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
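For context, this Dataset plugs straight into a standard DataLoader; the file list and hyperparameters below are illustrative, not taken from the original project:

from torch.utils.data import DataLoader

dataset = Mel2SampOnehot(training_files="train_files.txt",  # hypothetical list
                         segment_length=16000, mu_quantization=256,
                         filter_length=1024, hop_length=256, win_length=1024,
                         sampling_rate=22050, mel_fmin=0.0, mel_fmax=8000.0)
loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)
for mel, audio in loader:
    # mel: (batch, n_mel_channels, frames); audio: mu-law indices (batch, segment_length)
    break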
Example #16
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax, num_workers,
                 use_multi_speaker, speaker_embedding_path, use_speaker_embedding_model):
        self.audio_files = files_to_list(training_files)

        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.num_workers = num_workers
        self.use_multi_speaker = use_multi_speaker
        self.speaker_embedding_path = speaker_embedding_path
        self.use_speaker_embedding_model = use_speaker_embedding_model
        if not self.use_speaker_embedding_model:
            self.spk_id_map = pickle.load(open(self.speaker_embedding_path, "rb"))
Example #17
 def __init__(
     self,
     sampling_rate,
     n_mel_channels,
     filter_length=1024,
     hop_length=256,
     win_length=1024,
     mel_fmin=0.0,
     mel_fmax=8000.0,
 ):
     super(Tacotron, self).__init__(sampling_rate, n_mel_channels)
     self.taco_stft = TacotronSTFT(
         filter_length=filter_length,
         hop_length=hop_length,
         win_length=win_length,
         sampling_rate=sampling_rate,
         n_mel_channels=n_mel_channels,
         mel_fmin=mel_fmin,
         mel_fmax=mel_fmax,
     )
Example #18
def data_generator(data_base, chunk_length_in_sec, label_order_list):
    stft = TacotronSTFT(**SFT_CONFIG)
    keys = data_base.get_db_keys()
    batch_train_x = []
    batch_train_y = []

    while True:
        for k in keys:
            label = np.array([label_order_list.index(k)])
            for t in data_base.get_db_wv_times(k):
                sampling_rate, speech = data_base.get_wav(k, *t)
                chunks = int(len(speech) / sampling_rate / chunk_length_in_sec)
                audio_length = int(sampling_rate * chunk_length_in_sec)  # slice bounds must be ints
                for chunk in range(chunks):
                    audio = speech[chunk * audio_length:(chunk + 1) *
                                   audio_length]
                    audio_norm = audio / MAX_WAV_VALUE
                    audio_norm = torch.from_numpy(audio_norm).float()
                    audio_norm = audio_norm.unsqueeze(0)
                    audio_norm = torch.autograd.Variable(audio_norm,
                                                         requires_grad=False)
                    melspec = stft.mel_spectrogram(audio_norm)
                    mel_np = melspec.detach().numpy()
                    for i in range(mel_np.shape[1]):
                        channel_mean = np.mean(mel_np[0, i, :])
                        mel_np[0, i, :] = mel_np[0, i, :] - channel_mean

                    batch_train_x.append(mel_np)
                    batch_train_y.append(label)
            if len(batch_train_x) > 0 and len(batch_train_y) > 0:
                yield np.array(batch_train_x), np.concatenate(
                    np.array(batch_train_y))
            else:
                yield np.array([]), np.array([])

            batch_train_x = []
            batch_train_y = []

        yield None, None
Example #19
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)

        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.everything = self.pack()


        self.max_time = self.segment_length

        best = self.max_time  # was -1, which clobbered a configured max_time below
        score = 0.0
        if self.max_time == 0:  # auto-configure bin size for maximum packing efficiency
            for x in range(250000, 1000000, 10000):
                self.max_time = x
                self.do_binpacking()

                utilized = np.asarray(self.volumes).mean() / self.max_time
                if utilized > score:
                    score = utilized
                    best = x

        self.max_time = best
        self.do_binpacking()

        perm = list(range(len(self.balancer)))
        random.shuffle(perm)
        self.volumes = [self.volumes[p] for p in perm]
        self.balancer = [self.balancer[p] for p in perm]
Example #20
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        # Filter out audio clips shorter than segment_length (disabled):
        # i = 0
        # for file in files_to_list(training_files):
        #     audio_data, sample_r = load_wav_to_torch(file)

        #     if audio_data.size(0) < segment_length:
        #         i += 1
        #         print(file)
        #         self.audio_files.remove(file)
        # print("{} files shorter than segment_len".format(i))

        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
Example #21
class Get_mel():
    def __init__(self, filter_length, hop_length, win_length,
                 sampling_rate, mel_fmin, mel_fmax):
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec
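Typical use of this small wrapper, assuming `load_wav_to_torch` and `MAX_WAV_VALUE` come from the same utils module as in the other examples (the wav path is a placeholder):

extractor = Get_mel(filter_length=1024, hop_length=256, win_length=1024,
                    sampling_rate=22050, mel_fmin=0.0, mel_fmax=8000.0)
audio, sr = load_wav_to_torch("example.wav")  # placeholder path
mel = extractor.get_mel(audio)                # (n_mel_channels, frames)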
Example #22
class Mel2SampWaveglow(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, segment_length, filter_length, hop_length, win_length,
                 sampling_rate, mel_fmin, mel_fmax):
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, filepath):
        audio, sr = load_wav_to_torch(filepath)
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec
Example #23
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """

    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax, num_workers,
                 use_multi_speaker, speaker_embedding_path, use_speaker_embedding_model):
        self.audio_files = files_to_list(training_files)

        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.num_workers = num_workers
        self.use_multi_speaker = use_multi_speaker
        self.speaker_embedding_path = speaker_embedding_path
        self.use_speaker_embedding_model = use_speaker_embedding_model
        if not self.use_speaker_embedding_model:
            self.spk_id_map = pickle.load(open(self.speaker_embedding_path, "rb"))

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def get_item(self, index):
        # Read audio
        filename = self.audio_files[index]
        # filename = os.path.join(self.npy_dir, os.path.basename(filename) + ".npy")
        filename = filename + ".npy"

        audio = np.load(filename)

        audio = torch.from_numpy(audio).float()

        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(audio, (0, self.segment_length - audio.size(0)), 'constant').data
        mel = self.get_mel(audio)
        # TODO: check whether this has side effects on result quality
        audio = audio / MAX_WAV_VALUE

        if self.use_multi_speaker:
            if self.use_speaker_embedding_model:
                speaker_embedding_path = os.path.join(self.speaker_embedding_path,
                                                      os.path.basename(self.audio_files[index]) + ".npy")
                if not os.path.isfile(speaker_embedding_path):
                    print("nothing spk embed", speaker_embedding_path)
                    raise Exception("nothing spk embed", speaker_embedding_path)
                speaker_embedding = self.get_speaker_embedding(speaker_embedding_path)
            else:
                spk_file_name = os.path.splitext(os.path.basename(self.audio_files[index]))[0]
                if spk_file_name not in self.spk_id_map:
                    print("nothing spk embed id", spk_file_name)
                    raise Exception("nothing spk embed id", spk_file_name)
                speaker_embedding = self.spk_id_map[spk_file_name]

            return (mel, audio, speaker_embedding)
        else:
            return (mel, audio)

    def get_speaker_embedding(self, filename):
        speaker_embedding_np = np.load(filename)
        speaker_embedding_np = torch.autograd.Variable(torch.FloatTensor(speaker_embedding_np.astype(np.float32)),
                                                       requires_grad=False)
        # speaker_embedding_np = speaker_embedding_np.half() if self.is_fp16 else speaker_embedding_np
        return speaker_embedding_np

    def __getitem__(self, index):
        # Read audio
        while True:
            try:
                return self.get_item(index)
            except Exception:  # skip unreadable items by retrying a random index
                index = random.randint(0, len(self.audio_files) - 1)

    def __len__(self):
        return len(self.audio_files)
Example #24
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self,
                 training_files,
                 segment_length,
                 filter_length,
                 hop_length,
                 win_length,
                 sampling_rate,
                 data_folder,
                 audio_format,
                 return_stft=False):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.return_stft = return_stft
        if self.return_stft:
            self.stft = STFT(filter_length=filter_length,
                             hop_length=hop_length,
                             win_length=win_length)
        else:
            self.stft = TacotronSTFT(filter_length=filter_length,
                                     hop_length=hop_length,
                                     win_length=win_length,
                                     sampling_rate=sampling_rate,
                                     mel_fmin=0.0,
                                     mel_fmax=8000.0)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.data_folder = data_folder
        self.audio_format = audio_format

    def get_stft(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        if self.return_stft:
            magnitudes, phases = self.stft.transform(audio_norm)
            magnitudes = dynamic_range_compression(magnitudes)
            magnitudes = torch.squeeze(magnitudes, 0)
            return magnitudes
        else:
            melspec = self.stft.mel_spectrogram(audio_norm)
            melspec = torch.squeeze(melspec, 0)
            return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        filename = os.path.join(self.data_folder, filename)
        audio, sampling_rate = load_wav_to_torch(filename, self.audio_format)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data
            print('{} - NOT ENOUGH FRAMES'.format(filename))
        stft = self.get_stft(audio)
        audio = audio / MAX_WAV_VALUE

        return (stft, audio)

    def __len__(self):
        return len(self.audio_files)
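The `return_stft` flag switches the same Dataset between two front-ends; a sketch with illustrative arguments (paths and sizes are placeholders, not from the original project):

common = dict(training_files="train_files.txt", segment_length=16000,
              filter_length=1024, hop_length=256, win_length=1024,
              sampling_rate=22050, data_folder="wavs", audio_format="wav")
mel_ds = Mel2Samp(**common)                     # log-mel spectrograms
stft_ds = Mel2Samp(**common, return_stft=True)  # compressed linear magnitudes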
Example #25
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self,
                 training_files,
                 segment_length,
                 filter_length,
                 hop_length,
                 win_length,
                 sampling_rate,
                 mel_fmin,
                 mel_fmax,
                 load_mel_from_disk=False):
        self.load_mel_from_disk = load_mel_from_disk
        self.hop_length = hop_length
        self.audio_files = audiopaths_and_melpaths(
            training_files) if self.load_mel_from_disk else files_to_list(
                training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def get_mel_from_file(self, mel_path):
        melspec = torch.from_numpy(np.load(mel_path))  # np.load returns an ndarray; convert to tensor
        melspec = torch.autograd.Variable(melspec, requires_grad=False)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename[0]) if self.load_mel_from_disk \
                        else load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        if (self.load_mel_from_disk):
            # Take segment
            mel = np.load(filename[1])
            assert self.segment_length % self.hop_length == 0, 'self.segment_length must be a multiple of self.hop_length'
            max_mel_length = int(self.segment_length / self.hop_length)
            audio_ = audio.data.cpu().numpy()
            if (mel.shape[1] > len(audio_) / self.hop_length):  # trim mel if it overshoots the audio length
                diff = int(mel.shape[1] - len(audio_) / self.hop_length)
                mel = mel[:, :-diff]
            if (mel.shape[1] < len(audio_) / self.hop_length):
                print(filename, mel.shape, len(audio))
            if audio.size(0) >= self.segment_length:
                max_mel_start = int(
                    (audio.size(0) - self.segment_length) / self.hop_length
                )  # audio.size(0)%self.hop_length is the remainder
                mel_start = random.randint(0, max_mel_start)
                audio_start = mel_start * self.hop_length
                audio = audio[audio_start:audio_start + self.segment_length]
                mel = mel[:, mel_start:mel_start + max_mel_length]
            else:
                len_pad = int((self.segment_length / self.hop_length) -
                              mel.shape[1])
                pad = np.ones((80, len_pad), dtype=np.float32) * -11.512925
                mel = np.append(mel, pad, axis=1)
                audio = torch.nn.functional.pad(
                    audio, (0, self.segment_length - audio.size(0)),
                    'constant').data

            mel = torch.from_numpy(mel).float()
            audio = audio / MAX_WAV_VALUE
        else:
            # Take segment
            if audio.size(0) >= self.segment_length:
                max_audio_start = audio.size(0) - self.segment_length
                audio_start = random.randint(0, max_audio_start)
                audio = audio[audio_start:audio_start + self.segment_length]
            else:
                audio = torch.nn.functional.pad(
                    audio, (0, self.segment_length - audio.size(0)),
                    'constant').data

            mel = self.get_mel(audio)  #
            audio = audio / MAX_WAV_VALUE
        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
Example #26
class Mel2Samp2(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)

        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.everything = self.pack()


        self.max_time = self.segment_length

        best = self.max_time  # was -1, which clobbered a configured max_time below
        score = 0.0
        if self.max_time == 0:  # auto-configure bin size for maximum packing efficiency
            for x in range(250000, 1000000, 10000):
                self.max_time = x
                self.do_binpacking()

                utilized = np.asarray(self.volumes).mean() / self.max_time
                if utilized > score:
                    score = utilized
                    best = x

        self.max_time = best
        self.do_binpacking()

        perm = list(range(len(self.balancer)))
        random.shuffle(perm)
        self.volumes = [self.volumes[p] for p in perm]
        self.balancer = [self.balancer[p] for p in perm]

        
    def pack(self):
        timings = []
        for file in self.audio_files:
            audio, sampling_rate = load_wav_to_torch(file)
            if sampling_rate != self.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate))
            timings.append(audio.size(0))  # clip length in samples
        return timings

    def get_timings(self):
        # Same computation as pack(); the original built a numpy array and
        # called .append on it, which would fail.
        return self.pack()

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        print(index)

        idxs = self.balancer[index]
        time = self.volumes[index]

        # Spread leftover capacity as padding between clips; guard the
        # single-clip case against division by zero.
        pad = (self.max_time - time) // max(len(idxs) - 1, 1)
        print(pad)
        print(time)
        print(idxs)
        audios = []
        for k,idx in enumerate(idxs):
            filename = self.audio_files[idx]
            audio, sampling_rate = load_wav_to_torch(filename)
            
            if sampling_rate != self.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate))

            print("before pad %d: %s" % ( idx ,audio.shape))

            if k != len(idxs) - 1:
                audio = torch.nn.functional.pad(audio, (0, pad), 'constant').data
                print("after pad %d: %s" % ( idx ,audio.shape))
            
            audios.append(audio)

        audio = torch.cat(audios)
        print("after cat: %s" % audio.shape)

        if audio.size(0) < self.max_time:
            audio = torch.nn.functional.pad(audio, (0, self.max_time- audio.size(0)), 'constant').data
        print("after last pad: %s" % audio.shape)

        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE

        return (mel, audio)



    def __len__(self):
        return len(self.balancer)
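`do_binpacking`, `balancer`, and `volumes` are not shown in this example. Under the assumption that `balancer` holds per-bin lists of file indices and `volumes` the filled length of each bin, a first-fit-decreasing packer consistent with `__getitem__` above might look like:

def first_fit_decreasing(timings, max_time):
    # Pack clip lengths (in samples) into bins of capacity max_time.
    # Returns (balancer, volumes); an assumed implementation, not the original.
    order = sorted(range(len(timings)), key=lambda i: timings[i], reverse=True)
    balancer, volumes = [], []
    for idx in order:
        t = timings[idx]
        for b, vol in enumerate(volumes):
            if vol + t <= max_time:
                balancer[b].append(idx)
                volumes[b] += t
                break
        else:
            balancer.append([idx])
            volumes.append(t)
    return balancer, volumes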
Example #27
def train(num_gpus,
          rank,
          group_name,
          output_directory,
          epochs,
          learning_rate,
          sigma,
          loss_empthasis,
          iters_per_checkpoint,
          batch_size,
          seed,
          fp16_run,
          checkpoint_path,
          with_tensorboard,
          logdirname,
          datedlogdir,
          warm_start=False):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    global WaveGlow
    global WaveGlowLoss

    ax = True  # this is **really** bad coding practice :D
    if ax:
        from efficient_model_ax import WaveGlow
        from efficient_loss import WaveGlowLoss
    else:
        if waveglow_config[
                "yoyo"]:  # efficient_mode # TODO: Add to Config File
            from efficient_model import WaveGlow
            from efficient_loss import WaveGlowLoss
        else:
            from glow import WaveGlow, WaveGlowLoss

    criterion = WaveGlowLoss(sigma, loss_empthasis)
    model = WaveGlow(**waveglow_config).cuda()
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======
    STFT = [
        TacotronSTFT(filter_length=window,
                     hop_length=data_config['hop_length'],
                     win_length=window,
                     sampling_rate=data_config['sampling_rate'],
                     n_mel_channels=160,
                     mel_fmin=0,
                     mel_fmax=16000)
        for window in data_config['validation_windows']
    ]

    loader_STFT = TacotronSTFT(filter_length=data_config['filter_length'],
                               hop_length=data_config['hop_length'],
                               win_length=data_config['win_length'],
                               sampling_rate=data_config['sampling_rate'],
                               n_mel_channels=160,
                               mel_fmin=data_config['mel_fmin'],
                               mel_fmax=data_config['mel_fmax'])

    optimizer = "LAMB"
    optimizer_fused = True  # use Apex fused optimizer, should be identical to normal but slightly faster
    if optimizer_fused:
        from apex import optimizers as apexopt
        if optimizer == "Adam":
            optimizer = apexopt.FusedAdam(model.parameters(), lr=learning_rate)
        elif optimizer == "LAMB":
            optimizer = apexopt.FusedLAMB(model.parameters(),
                                          lr=learning_rate,
                                          max_grad_norm=1000)
    else:
        if optimizer == "Adam":
            optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        elif optimizer == "LAMB":
            from lamb import Lamb as optLAMB
            optimizer = optLAMB(model.parameters(), lr=learning_rate)
            #import torch_optimizer as optim
            #optimizer = optim.Lamb(model.parameters(), lr=learning_rate)
            #raise# PyTorch doesn't currently include LAMB optimizer.

    if fp16_run:
        global amp
        from apex import amp
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level='O1',
                                          min_loss_scale=1.0)
    else:
        amp = None

    ## LEARNING RATE SCHEDULER
    if True:
        from torch.optim.lr_scheduler import ReduceLROnPlateau
        min_lr = 1e-5
        factor = 0.1**(
            1 / 5)  # amount to scale the LR by on Validation Loss plateau
        scheduler = ReduceLROnPlateau(optimizer,
                                      'min',
                                      factor=factor,
                                      patience=20,
                                      cooldown=2,
                                      min_lr=min_lr,
                                      verbose=True)
        print("ReduceLROnPlateau used as Learning Rate Scheduler.")
    else:
        scheduler = False

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":

        #warm_start = 0 # WARM START THE MODEL AND RESET ANY INVALID LAYERS

        model, optimizer, iteration, scheduler = load_checkpoint(
            checkpoint_path,
            model,
            optimizer,
            scheduler,
            fp16_run,
            warm_start=warm_start)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config, check_files=True)
    speaker_lookup = trainset.speaker_ids
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        train_sampler = DistributedSampler(trainset, shuffle=True)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True
    # =====END:   ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=2,
                              shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        if datedlogdir:
            timestr = time.strftime("%Y_%m_%d-%H_%M_%S")
            log_directory = os.path.join(output_directory, logdirname, timestr)
        else:
            log_directory = os.path.join(output_directory, logdirname)
        logger = SummaryWriter(log_directory)

    moving_average = int(min(len(train_loader),
                             100))  # rolling average over up to one epoch, capped at 100 iters
    rolling_sum = StreamingMovingAverage(moving_average)
    start_time = time.time()
    start_time_single_batch = time.time()
    model.train()

    # best (averaged) training loss
    if os.path.exists(os.path.join(output_directory, "best_model") + ".txt"):
        best_model_loss = float(
            str(
                open(os.path.join(output_directory, "best_model") + ".txt",
                     "r",
                     encoding="utf-8").read()).split("\n")[0])
    else:
        best_model_loss = -6.20

    # best (validation) MSE on inferred spectrogram.
    if os.path.exists(
            os.path.join(output_directory, "best_val_model") + ".txt"):
        best_MSE = float(
            str(
                open(os.path.join(output_directory, "best_val_model") + ".txt",
                     "r",
                     encoding="utf-8").read()).split("\n")[0])
    else:
        best_MSE = 9e9

    epoch_offset = max(0, int(iteration / len(train_loader)))

    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print("{:,} total parameters in model".format(pytorch_total_params))
    pytorch_total_params = sum(p.numel() for p in model.parameters()
                               if p.requires_grad)
    print("{:,} trainable parameters.".format(pytorch_total_params))

    training = True
    while training:
        try:
            if rank == 0:
                epochs_iterator = tqdm(range(epoch_offset, epochs),
                                       initial=epoch_offset,
                                       total=epochs,
                                       smoothing=0.01,
                                       desc="Epoch",
                                       position=1,
                                       unit="epoch")
            else:
                epochs_iterator = range(epoch_offset, epochs)
            # ================ MAIN TRAINING LOOP! ===================
            for epoch in epochs_iterator:
                print(f"Epoch: {epoch}")
                if num_gpus > 1: train_sampler.set_epoch(epoch)

                if rank == 0:
                    iters_iterator = tqdm(enumerate(train_loader),
                                          desc=" Iter",
                                          smoothing=0,
                                          total=len(train_loader),
                                          position=0,
                                          unit="iter",
                                          leave=True)
                else:
                    iters_iterator = enumerate(train_loader)
                for i, batch in iters_iterator:
                    # run external code every iter, allows the run to be adjusted without restarts
                    if (i == 0 or iteration % param_interval == 0):
                        try:
                            with open("run_every_epoch.py") as f:
                                internal_text = str(f.read())
                                if len(internal_text) > 0:
                                    #code = compile(internal_text, "run_every_epoch.py", 'exec')
                                    ldict = {'iteration': iteration}
                                    exec(internal_text, globals(), ldict)
                                else:
                                    print(
                                        "No Custom code found, continuing without changes."
                                    )
                        except Exception as ex:
                            print(f"Custom code FAILED to run!\n{ex}")
                        globals().update(ldict)
                        locals().update(ldict)
                        if show_live_params:
                            print(internal_text)
                    if not iteration % 50:  # check actual learning rate every 50 iters (because I sometimes see learning_rate variable go out-of-sync with real LR)
                        learning_rate = optimizer.param_groups[0]['lr']
                    # Learning Rate Schedule
                    if custom_lr:
                        old_lr = learning_rate
                        if iteration < warmup_start:
                            learning_rate = warmup_start_lr
                        elif iteration < warmup_end:
                            learning_rate = (iteration - warmup_start) * (
                                (A_ + C_) - warmup_start_lr
                            ) / (
                                warmup_end - warmup_start
                            ) + warmup_start_lr  # learning rate increases from warmup_start_lr to A_ linearly over (warmup_end-warmup_start) iterations.
                        else:
                            if iteration < decay_start:
                                learning_rate = A_ + C_
                            else:
                                iteration_adjusted = iteration - decay_start
                                learning_rate = (
                                    A_ * (e**(-iteration_adjusted / B_))) + C_
                        assert learning_rate > -1e-8, "Negative Learning Rate."
                        if old_lr != learning_rate:
                            for param_group in optimizer.param_groups:
                                param_group['lr'] = learning_rate
                    else:
                        scheduler.patience = scheduler_patience
                        scheduler.cooldown = scheduler_cooldown
                        if override_scheduler_last_lr:
                            scheduler._last_lr = override_scheduler_last_lr
                            print(
                                "Scheduler last_lr overriden. scheduler._last_lr =",
                                scheduler._last_lr)
                        if override_scheduler_best:
                            scheduler.best = override_scheduler_best
                            print(
                                "Scheduler best metric overriden. scheduler.best =",
                                override_scheduler_best)

                    model.zero_grad()
                    mel, audio, speaker_ids = batch
                    mel = torch.autograd.Variable(mel.cuda(non_blocking=True))
                    audio = torch.autograd.Variable(
                        audio.cuda(non_blocking=True))
                    if waveglow_config['WN_config']['speaker_embed_dim'] > 0:
                        speaker_ids = speaker_ids.cuda(
                            non_blocking=True).long().squeeze(1)
                        outputs = model(mel, audio, speaker_ids)
                    else:
                        outputs = model(mel, audio, None)

                    loss = criterion(outputs)
                    if num_gpus > 1:
                        reduced_loss = reduce_tensor(loss.data,
                                                     num_gpus).item()
                    else:
                        reduced_loss = loss.item()

                    if iteration > 1e3 and (
                        (reduced_loss > LossExplosionThreshold) or
                        (math.isnan(reduced_loss))):
                        raise LossExplosion(
                            f"\n\n\nLOSS EXPLOSION EXCEPTION: Loss reached {reduced_loss} during iteration {iteration}.\n\n\n"
                        )

                    if fp16_run:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()

                    grad_clip = False
                    grad_clip_thresh = 10000
                    if grad_clip:
                        if fp16_run:
                            grad_norm = torch.nn.utils.clip_grad_norm_(
                                amp.master_params(optimizer), grad_clip_thresh)
                        else:
                            grad_norm = torch.nn.utils.clip_grad_norm_(
                                model.parameters(), grad_clip_thresh)
                        is_overflow = math.isinf(grad_norm) or math.isnan(
                            grad_norm)
                    else:
                        is_overflow = False
                        grad_norm = 0.00001

                    optimizer.step()
                    if not is_overflow and with_tensorboard and rank == 0:
                        if (iteration % 100000 == 0):
                            # plot distribution of parameters
                            for tag, value in model.named_parameters():
                                tag = tag.replace('.', '/')
                                logger.add_histogram(tag,
                                                     value.data.cpu().numpy(),
                                                     iteration)
                        logger.add_scalar('training_loss', reduced_loss,
                                          iteration)
                        #logger.add_scalar('training_loss_exp', 500*(exp(reduced_loss)), iteration)
                        logger.add_scalar('training_loss_samples',
                                          reduced_loss, iteration * batch_size)
                        if (iteration % 20 == 0):
                            logger.add_scalar('learning.rate', learning_rate,
                                              iteration)
                        if (iteration % 10 == 0):
                            logger.add_scalar(
                                'duration', ((time.time() - start_time) / 10),
                                iteration)
                        start_time_single_batch = time.time()

                    average_loss = rolling_sum.process(reduced_loss)
                    if rank == 0:
                        if (iteration % 10 == 0):
                            tqdm.write(
                                "{} {}:  {:.3f}  {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective)  {:.2f}s/iter {:.4f}s/item"
                                .format(
                                    time.strftime("%H:%M:%S"), iteration,
                                    reduced_loss, average_loss,
                                    round(grad_norm, 3), learning_rate,
                                    min((grad_clip_thresh / grad_norm) *
                                        learning_rate, learning_rate),
                                    (time.time() - start_time) / 10,
                                    ((time.time() - start_time) / 10) /
                                    (batch_size * num_gpus)))
                            start_time = time.time()
                        else:
                            tqdm.write(
                                "{} {}:  {:.3f}  {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective)"
                                .format(
                                    time.strftime("%H:%M:%S"), iteration,
                                    reduced_loss, average_loss,
                                    round(grad_norm, 3), learning_rate,
                                    min((grad_clip_thresh / grad_norm) *
                                        learning_rate, learning_rate)))

                    if rank == 0 and (len(rolling_sum.values) >
                                      moving_average - 2):
                        if (average_loss +
                                best_model_margin) < best_model_loss:
                            checkpoint_path = os.path.join(
                                output_directory, "best_model")
                            try:
                                save_checkpoint(model, optimizer,
                                                learning_rate, iteration, amp,
                                                scheduler, speaker_lookup,
                                                checkpoint_path)
                            except KeyboardInterrupt:  # Avoid corrupting the model.
                                save_checkpoint(model, optimizer,
                                                learning_rate, iteration, amp,
                                                scheduler, speaker_lookup,
                                                checkpoint_path)
                            with open(f"{checkpoint_path}.txt", "w",
                                      encoding="utf-8") as text_file:
                                text_file.write(
                                    str(average_loss) + "\n" + str(iteration))
                            best_model_loss = average_loss  # only overwrite best_model when average_loss beats it by best_model_margin
                    if rank == 0 and ((iteration % iters_per_checkpoint == 0)
                                      or
                                      (os.path.exists(save_file_check_path))):
                        checkpoint_path = f"{output_directory}/waveglow_{iteration}"
                        save_checkpoint(model, optimizer, learning_rate,
                                        iteration, amp, scheduler,
                                        speaker_lookup, checkpoint_path)
                        start_time_single_batch = time.time()
                        if (os.path.exists(save_file_check_path)):
                            os.remove(save_file_check_path)

                    if (iteration % validation_interval == 0):
                        if rank == 0:
                            MSE, MAE = validate(
                                model, loader_STFT, STFT, logger, iteration,
                                data_config['validation_files'],
                                speaker_lookup, sigma, output_directory,
                                data_config)
                            if scheduler:
                                MSE = torch.tensor(MSE, device='cuda')
                                if num_gpus > 1:
                                    broadcast(MSE, 0)
                                scheduler.step(MSE.item())
                                if MSE < best_MSE:
                                    checkpoint_path = os.path.join(
                                        output_directory, "best_val_model")
                                    try:
                                        save_checkpoint(
                                            model, optimizer, learning_rate,
                                            iteration, amp, scheduler,
                                            speaker_lookup, checkpoint_path)
                                    except KeyboardInterrupt:  # Avoid corrupting the model.
                                        save_checkpoint(
                                            model, optimizer, learning_rate,
                                            iteration, amp, scheduler,
                                            speaker_lookup, checkpoint_path)
                                    with open(f"{checkpoint_path}.txt", "w",
                                              encoding="utf-8") as text_file:
                                        text_file.write(
                                            str(MSE.item()) + "\n" +
                                            str(iteration))
                                    best_MSE = MSE.item()  # only overwrite best_val_model when validation MSE improves
                        else:
                            if scheduler:
                                MSE = torch.zeros(1, device='cuda')
                                broadcast(MSE, 0)
                                scheduler.step(MSE.item())
                        learning_rate = optimizer.param_groups[0][
                            'lr']  # re-read the actual LR from the optimizer (the local variable can drift out of sync with the real LR)
                    iteration += 1
            training = False  # exit the while loop

        except LossExplosion as ex:  # print the exception and continue from a checkpoint (restarting this way takes under 4 seconds)
            print(ex)  # log the loss value that triggered the restart
            if checkpoint_path == '':
                checkpoint_path = os.path.join(output_directory,
                                               "best_val_model")
            assert 'best_val_model' in checkpoint_path, "Automatic restarts require checkpoint set to best_val_model"
            model.eval()
            model, optimizer, iteration, scheduler = load_checkpoint(
                checkpoint_path, model, optimizer, scheduler, fp16_run)
            learning_rate = optimizer.param_groups[0]['lr']
            epoch_offset = max(0, int(iteration / len(train_loader)))
            model.train()
            iteration += 1
            # continue training from the restored checkpoint
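The try/except restart logic above can be isolated into a small, reusable skeleton. The sketch below is only an illustration: LossExplosion mirrors the exception raised above, but train_one_iteration is a hypothetical stand-in for the real training step, the threshold is an assumed value, and load_checkpoint is called with the same argument order as in the snippet above.

import math

class LossExplosion(Exception):
    """Raised when the loss goes NaN or crosses a threshold."""

LOSS_EXPLOSION_THRESHOLD = 1e3  # assumed value; tune per model

def train_with_restarts(model, optimizer, train_loader, checkpoint_path):
    while True:
        try:
            for batch in train_loader:
                loss = train_one_iteration(model, optimizer, batch)  # hypothetical helper
                if math.isnan(loss) or loss > LOSS_EXPLOSION_THRESHOLD:
                    raise LossExplosion(f"Loss reached {loss}")
            return  # finished without exploding
        except LossExplosion as ex:
            print(ex)
            # reload the last good weights and keep going, as in the loop above
            model, optimizer, iteration, scheduler = load_checkpoint(
                checkpoint_path, model, optimizer, None, False)
            model.train()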
Example #28
class Mel2SampSplit(torch.utils.data.Dataset):
    """
    Dataset that packs all training audio into contiguous fixed-length
    segments and returns (mel-spectrogram, audio) pairs.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.dataset = self.pack()

    def pack(self):
        timings = np.zeros(len(self.audio_files), dtype=np.int32)
        PAD = 350
        assert self.sampling_rate % PAD == 0

        for i, file in enumerate(self.audio_files):
            audio, sampling_rate = load_wav_to_torch(file)
            if sampling_rate != self.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate))
            t = audio.size(0)
            t2 = t + (-t) % PAD  # round t up to the next multiple of PAD

            timings[i] = t2

        segment_len = self.sampling_rate

        total_time = timings.sum()
        n_data = int((total_time + segment_len - 1) // segment_len)  # ceiling division
        
        dataset = torch.zeros([n_data, segment_len], dtype=torch.float32)  # all packed audio lives here
        offset = 0
        cur = 0
        for i, file in enumerate(self.audio_files):
            audio, _ = load_wav_to_torch(file)
            audio = torch.nn.functional.pad(
                audio, (0, timings[i] - audio.size(0)), 'constant').data
            assert timings[i] == audio.size(0)
            data_left = audio.size(0)
            data_offset = 0
            space = segment_len - offset
            while data_left >= space:  # fill the current segment to the end
                dataset[cur, offset:offset + space] = audio[data_offset:data_offset + space]
                data_left -= space
                data_offset += space
                offset = 0
                space = segment_len
                cur += 1

            # append what's left at the start of the next segment
            if data_left > 0:
                new_offset = offset + data_left
                dataset[cur, offset:new_offset] = audio[data_offset:]
                offset = new_offset
                
        return dataset

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        audio = self.dataset[index, :]
        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE
        return (mel, audio)

    def __len__(self):
        return self.dataset.size(0)
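Since pack() flattens every padded file into one stream and slices it into equal rows, a useful property to check is that flattening the rows back out reproduces the padded files concatenated in order. This is a hypothetical sanity check, assuming load_wav_to_torch behaves as elsewhere on this page and the PAD=350 round-up rule above:

import torch

def check_packing(ds):  # ds: a constructed Mel2SampSplit instance
    pieces = []
    for f in ds.audio_files:
        audio, _ = load_wav_to_torch(f)
        pad = (-audio.size(0)) % 350  # same round-up-to-PAD rule as pack()
        pieces.append(torch.nn.functional.pad(audio, (0, pad), 'constant'))
    flat = torch.cat(pieces)
    packed = ds.dataset.flatten()[:flat.size(0)]
    assert torch.equal(packed, flat), "pack() reordered or dropped samples"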
Example #29
class Mel2Samp(torch.utils.data.Dataset):
    """
    Computes (or loads from disk) the mel-spectrogram for each audio file and
    returns (mel, audio, speaker_id) triples for multi-speaker training.
    """
    def __init__(self, training_files, validation_files, validation_windows,
                 segment_length, filter_length, hop_length, win_length,
                 sampling_rate, mel_fmin, mel_fmax, load_mel_from_disk,
                 preemphasis):
        self.audio_files = load_filepaths_and_text(training_files)

        print("Files before checking: ", len(self.audio_files))

        i = 0
        i_offset = 0
        for i_ in range(len(self.audio_files)):
            i = i_ + i_offset
            if i == len(self.audio_files): break
            file = self.audio_files[i]
            if not os.path.exists(file[0]):
                print(file[0], "does not exist")
                self.audio_files.remove(file)
                i_offset -= 1
                continue

            audio_data, sample_r = load_wav_to_torch(file[0])
            if audio_data.size(0) <= segment_length:
                print(file[0], "is too short")
                self.audio_files.remove(file)
                i_offset -= 1
                continue

        print("Files after checking: ", len(self.audio_files))

        self.load_mel_from_disk = load_mel_from_disk
        self.speaker_ids = self.create_speaker_lookup_table(self.audio_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 n_mel_channels=160,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        if preemphasis:
            self.preemphasise = PreEmphasis(preemphasis)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
        self.hop_length = hop_length
        self.win_length = win_length

    def create_speaker_lookup_table(self, audiopaths_and_text):
        speaker_ids = np.sort(np.unique([x[2] for x in audiopaths_and_text]))
        d = {int(speaker_ids[i]): i for i in range(len(speaker_ids))}
        return d

    def get_speaker_id(self, speaker_id):
        return torch.IntTensor([self.speaker_ids[int(speaker_id)]])

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm).squeeze(0)
        return melspec

    def get_segment(self,
                    audio,
                    mel,
                    segment_length,
                    hop_length,
                    n_mel_channels=160):
        mel_segment_length = int(segment_length / hop_length)  # 8400/600 = 14
        if audio.size(0) >= segment_length:
            max_mel_start = int(
                (audio.size(0) - segment_length) /
                hop_length)  # latest mel frame at which a full segment still fits
            mel_start = random.randint(0, max_mel_start)
            audio_start = mel_start * hop_length
            audio = audio[audio_start:audio_start + segment_length]
            mel = mel[:, mel_start:mel_start + mel_segment_length]
        else:
            mel_start = 0
            n_mel_channels = 160  # TODO take from config file
            len_pad = int((segment_length / hop_length) - mel.shape[1])
            pad = np.ones(
                (n_mel_channels, len_pad),
                dtype=np.float32) * -11.512925  # ln(1e-5), the log-mel floor
            mel = np.append(mel, pad, axis=1)
            audio = torch.nn.functional.pad(
                audio, (0, segment_length - audio.size(0)), 'constant').data
        return audio, mel, mel_start, mel_start + mel_segment_length

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename[0])
        assert audio.shape[
            0], f"Audio has 0 length.\nFile: {filename[0]}\nIndex: {index}"
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        if (self.load_mel_from_disk):
            # Take segment
            mel = np.load(filename[1])
            assert self.segment_length % self.hop_length == 0, 'self.segment_length must be a multiple of self.hop_length'
            # Disabled sanity checks: mel.shape[1] should equal
            # ceil(len(audio)/self.hop_length); re-enable them to catch mels
            # that are longer or shorter than their audio files.
            loop = 0
            while True:
                audio_, mel_, start_step, stop_step = self.get_segment(
                    audio, mel, self.segment_length,
                    self.hop_length)  # get random segment of audio file
                std = torch.std(audio_)
                if std > 250:
                    break  # if sample is not silent, use sample for WaveGlow.
                loop += 1
                if loop > 20:
                    print("No Silent Sample Found, filename:", filename[0])
                    break
            #print(f"STD: {std} Loops: {loop}")
            audio, mel = audio_, mel_

            mel = torch.from_numpy(mel).float()
        else:
            # Take segment
            if audio.size(0) >= self.segment_length:
                max_audio_start = audio.size(0) - self.segment_length
                std = 9e9
                loop = 0
                while True:
                    audio_start = random.randint(0, max_audio_start)
                    audio_segment = audio[audio_start:audio_start +
                                          self.segment_length]
                    std = torch.std(audio_segment)
                    if std > 250:
                        break  # if sample is not silent, use sample for WaveGlow.
                    loop += 1
                    if loop > 20:
                        print("No Silent Sample Found, filename:", filename[0])
                        break
                audio = audio_segment
            else:
                audio = torch.nn.functional.pad(
                    audio, (0, self.segment_length - audio.size(0)),
                    'constant').data
            assert audio.shape[
                0], f"Audio has 0 length.\nFile: {filename[0]}\nIndex: {index}"
            mel = self.get_mel(audio)  # generate mel from audio segment

        audio = audio / MAX_WAV_VALUE

        if hasattr(self, 'preemphasise'):
            audio = self.preemphasise(
                audio.unsqueeze(0).unsqueeze(0)).squeeze()

        speaker_id = self.get_speaker_id(filename[2])

        #mel = (mel+5.2)*0.5 # shift values between approx -4 and 4
        return (mel, audio, speaker_id)

    def __len__(self):
        return len(self.audio_files)
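For reference, create_speaker_lookup_table maps whatever raw speaker ids appear in the filelist onto dense 0-based indices (unique ids are sorted, then enumerated). A toy illustration with made-up filelist rows of the form (audio_path, mel_path, speaker_id):

import numpy as np

rows = [("a.wav", "a.npy", "7"), ("b.wav", "b.npy", "2"), ("c.wav", "c.npy", "7")]
speaker_ids = np.sort(np.unique([x[2] for x in rows]))  # array(['2', '7'])
lookup = {int(speaker_ids[i]): i for i in range(len(speaker_ids))}
print(lookup)  # {2: 0, 7: 1} -- raw ids become dense embedding indices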
Example #30
class Mel2Samp(torch.utils.data.Dataset):
    """
    Computes the mel-spectrogram for a random audio segment and returns the
    (spectrogram, audio) pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        # filter out short audio files (disabled)
        # i = 0
        # for file in files_to_list(training_files):
        #     audio_data, sample_r = load_wav_to_torch(file)

        #     if audio_data.size(0) < segment_length:
        #         i += 1
        #         print(file)
        #         self.audio_files.remove(file)
        # print("{} files shorter than segment_len".format(i))

        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            print("Warning: short wav")
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data

        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE

        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
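A minimal usage sketch for this class; the filelist path and hyperparameters below are illustrative placeholders (22.05 kHz values commonly used with WaveGlow-style vocoders), not taken from any particular config:

from torch.utils.data import DataLoader

dataset = Mel2Samp(training_files="train_files.txt",  # hypothetical filelist
                   segment_length=16000,
                   filter_length=1024,
                   hop_length=256,
                   win_length=1024,
                   sampling_rate=22050,
                   mel_fmin=0.0,
                   mel_fmax=8000.0)
loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)
mel, audio = next(iter(loader))  # mel: (B, n_mels, T); audio: (B, segment_length)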