Example #1
    def get_random_wav_and_label(self, tar_wavfiles, ntar_wavfiles):
        """

        :return: wav: raw wave. float32. shape=(t, ),
                 label: 1 if target, 0 otherwise. int32.
                 melspec: mel-spectrogram. float32. shape=(t, n_mels)
        """
        # Sample a target file with probability tar_ratio, otherwise a non-target file.
        if np.random.sample() <= self.tar_ratio:
            wavfiles, label = tar_wavfiles, self.tar_labels
        else:
            wavfiles, label = ntar_wavfiles, self.ntar_labels
        wavfile = wavfiles[np.random.randint(0, len(wavfiles))]
        if isinstance(wavfile, bytes):
            wavfile = wavfile.decode()
        if wavfile.endswith('arr'):  # pyarrow format
            wav = read_wav_from_arr(wavfile)
        else:
            wav = read_wav(wavfile, sr=hp.signal.sr)
        wav = trim_wav(wav)

        wav = crop_random_wav(wav, self.length)
        wav = augment_volume(wav)
        wav = fix_length(wav, self.length)  # padding
        melspec = wav2melspec_db(wav,
                                 sr=hp.signal.sr,
                                 n_fft=hp.signal.n_fft,
                                 win_length=hp.signal.win_length,
                                 hop_length=hp.signal.hop_length,
                                 n_mels=hp.signal.n_mels,
                                 min_db=hp.signal.min_db,
                                 max_db=hp.signal.max_db)
        melspec = np.float32(melspec)
        label = np.float32(label)
        return wav, melspec, label
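These snippets rely on a handful of helpers that are not shown (read_wav, trim_wav, fix_length). A minimal sketch of what they plausibly wrap, assuming librosa; the source repository's actual implementations may differ:

import librosa
import numpy as np

def read_wav(path, sr, duration=None):
    # Load (and resample) mono audio as float32 samples in [-1, 1].
    wav, _ = librosa.load(path, sr=sr, mono=True, duration=duration)
    return wav

def trim_wav(wav, top_db=30):
    # Strip leading/trailing silence below `top_db` dB relative to peak.
    wav, _ = librosa.effects.trim(wav, top_db=top_db)
    return wav

def fix_length(wav, length, **kwargs):
    # Truncate, or pad (kwargs forwarded to np.pad, e.g. mode='reflect'),
    # to exactly `length` samples.
    return librosa.util.fix_length(wav, size=length, **kwargs)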
Example #2
    def _get_wav_and_melspec(wav_file, length, is_training=True):
        '''
        The values of wav lie in [-1, 1].
        '''

        wav = read_wav(wav_file, sr=hp.signal.sr)
        wav = trim_wav(wav)
        # Split the wav into fixed-length chunks; pick one at random during training,
        # but always the first chunk during generation.
        n_clips = math.ceil(len(wav) / length) if is_training else 1
        idx = random.randrange(n_clips)
        start, end = length * idx, length * (idx + 1)
        wav = wav[start:end]
        assert len(wav) <= length
        wav = fix_length(wav, length)  # padding in case of last chunk.

        melspec = wav2melspec_db(wav,
                                 sr=hp.signal.sr,
                                 n_fft=hp.signal.n_fft,
                                 win_length=hp.signal.win_length,
                                 hop_length=hp.signal.hop_length,
                                 n_mels=hp.signal.n_mels,
                                 min_db=hp.signal.min_db,
                                 max_db=hp.signal.max_db)
        wav = np.expand_dims(wav, -1)
        return wav, melspec.astype(np.float32)
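For reference, a call to the function above under assumed hyperparameters (16 kHz audio, 2-second chunks; the file name is hypothetical) would look like:

length = 2 * hp.signal.sr  # e.g. 2 s at 16 kHz -> 32000 samples
wav, melspec = _get_wav_and_melspec('utt_001.wav', length, is_training=True)
# wav: (length, 1) float array in [-1, 1]; melspec: (t, n_mels) float32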
Example #3
    def _load_random_wav(self, speaker_id):
        wavfile = self.audio_meta.get_random_audio(speaker_id)
        wav = read_wav(wavfile, hp.signal.sr)
        # wav = trim_wav(wav)
        length = int(hp.signal.duration * hp.signal.sr)
        wav = crop_random_wav(wav, length=length)
        wav = fix_length(wav, length, mode='reflect')
        return wav  # (t,)
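crop_random_wav is likewise undefined in these snippets; a plausible sketch (an assumption, not the repository's code):

def crop_random_wav(wav, length):
    # Take a random window of `length` samples; shorter inputs pass through
    # unchanged and are padded later by fix_length.
    if len(wav) <= length:
        return wav
    start = np.random.randint(0, len(wav) - length + 1)
    return wav[start:start + length]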
Example #4
    def get_random_wav(self, wavfile):
        """
        :param wavfile: path to a raw wave file.
        :return: wav: raw wave. float32. shape=(t, ),
                 melspec: mel-spectrogram. float32. shape=(t, n_mels),
                 wavfile: the raw wave file path.
        """
        wav = read_wav(wavfile, sr=hp.signal.sr)
        wav = trim_wav(wav)
        wav = fix_length(wav, self.length)  # crop/pad to self.length from the beginning.
        melspec = wav2melspec_db(wav,
                                 sr=hp.signal.sr,
                                 n_fft=hp.signal.n_fft,
                                 win_length=hp.signal.win_length,
                                 hop_length=hp.signal.hop_length,
                                 n_mels=hp.signal.n_mels,
                                 min_db=hp.signal.min_db,
                                 max_db=hp.signal.max_db)
        melspec = np.float32(melspec)
        return wav, melspec, wavfile
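wav2melspec_db appears in most of these examples; a minimal reconstruction with librosa that normalizes dB values into [0, 1] using min_db/max_db (the source's exact normalization may differ):

def wav2melspec_db(wav, sr, n_fft, win_length, hop_length, n_mels, min_db, max_db):
    # Mel power spectrogram -> dB, clipped and scaled to [0, 1], time-major output.
    mel = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=n_fft,
                                         win_length=win_length,
                                         hop_length=hop_length, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel)
    mel_db = np.clip((mel_db - min_db) / (max_db - min_db), 0., 1.)
    return mel_db.T  # (t, n_mels)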
Example #5
    def _get_wav_and_melspec(wav_file, length=None, is_training=True):
        wav = read_wav(wav_file, sr=hp.signal.sr)
        wav = trim_wav(wav)
        if length:
            n_clips = math.ceil(len(wav) / length) if is_training else 1
            idx = random.randrange(n_clips)
            start, end = length * idx, length * (idx + 1)
            wav = wav[start:end]
            assert len(wav) <= length
            wav = fix_length(wav, length)  # padding

        melspec = wav2melspec_db(wav,
                                 sr=hp.signal.sr,
                                 n_fft=hp.signal.n_fft,
                                 win_length=hp.signal.win_length,
                                 hop_length=hp.signal.hop_length,
                                 n_mels=hp.signal.n_mels,
                                 min_db=hp.signal.min_db,
                                 max_db=hp.signal.max_db)
        wav = np.expand_dims(wav, -1)
        return wav, melspec.astype(np.float32)
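Unlike Example #2, length is optional here: passing None skips the chunking step and converts the whole trimmed utterance (file name hypothetical):

wav_full, melspec_full = _get_wav_and_melspec('utt_001.wav', length=None, is_training=False)
# melspec_full spans the entire trimmed utterance; wav_full: (t, 1)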
Example #6
    ckpt = args.ckpt if args.ckpt else tf.train.latest_checkpoint(hp.logdir)

    pred_conf = PredictConfig(
        model=model,
        input_names=['x'],
        output_names=['embedding/embedding', 'prediction'],
        session_init=SaverRestore(ckpt) if ckpt else None)
    embedding_pred = OfflinePredictor(pred_conf)

    embedding, pred_speaker_id = embedding_pred(mel_spec)

    # get a random audio of the predicted speaker.
    # Note: np.array over a bare map object yields a 0-d object array in Python 3,
    # so materialize the results with a list comprehension.
    wavfile_pred_speaker = np.array(
        [audio_meta_train.get_random_audio(s) for s in pred_speaker_id])
    length = int(hp.signal.duration * hp.signal.sr)
    wav_pred_speaker = np.array(
        [fix_length(read_wav(w, hp.signal.sr, duration=hp.signal.duration), length)
         for w in wavfile_pred_speaker])

    # write audio
    tf.summary.audio('wav', wav, hp.signal.sr, max_outputs=10)
    tf.summary.audio('wav_pred', wav_pred_speaker, hp.signal.sr, max_outputs=10)

    # write prediction
    speaker_name = [audio_meta.speaker_dict[sid] for sid in speaker_id]
    pred_speaker_name = [audio_meta_train.speaker_dict[sid] for sid in pred_speaker_id]

    # Fall back to empty placeholders so the zip below never receives None.
    meta = [tuple(audio_meta.meta_dict[sid][k] for k in audio_meta.target_meta_field())
            for sid in speaker_id] if hp.embed.meta_path else [''] * len(speaker_id)
    pred_meta = [tuple(audio_meta_train.meta_dict[sid][k] for k in audio_meta_train.target_meta_field())
                 for sid in pred_speaker_id] if hp.train.meta_path else [''] * len(pred_speaker_id)
    prediction = ['{} ({}) -> {} ({})'.format(s, s_meta, p, p_meta)
                  for s, p, s_meta, p_meta in zip(speaker_name, pred_speaker_name, meta, pred_meta)]
    tf.summary.text('prediction', tf.convert_to_tensor(prediction))
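tensorpack's OfflinePredictor is called with numpy arrays matching input_names ('x' above, fed as mel-spectrograms). A rough sketch of preparing that batch with the wav2melspec_db reconstruction above (wav_batch is a hypothetical list of fixed-length wavs):

mel_spec = np.stack([wav2melspec_db(w, hp.signal.sr, hp.signal.n_fft,
                                    hp.signal.win_length, hp.signal.hop_length,
                                    hp.signal.n_mels, hp.signal.min_db,
                                    hp.signal.max_db) for w in wav_batch])
embedding, pred_speaker_id = embedding_pred(mel_spec)  # (batch, emb_dim), (batch,)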
Example #7
    pred_conf = PredictConfig(
        model=model,
        input_names=['x'],
        output_names=['embedding/embedding', 'prediction'],
        session_init=SaverRestore(ckpt) if ckpt else None,
    )

    embedding_pred = OfflinePredictor(pred_conf)

    embedding, pred_speaker_id = embedding_pred(mel_spec)

    # get a random audio of the predicted speaker.
    # As in Example #6, wrap map results in a list before np.array (Python 3).
    wavfile_pred_speaker = np.array(
        [audio_meta_train.get_random_audio(s) for s in pred_speaker_id])
    length = int(hp.signal.duration * hp.signal.sr)
    wav_pred_speaker = np.array(
        [fix_length(read_wav(w, hp.signal.sr, duration=hp.signal.duration), length)
         for w in wavfile_pred_speaker])

    # write audio
    tf.summary.audio('wav', wav, hp.signal.sr, max_outputs=10)
    tf.summary.audio('wav_pred',
                     wav_pred_speaker,
                     hp.signal.sr,
                     max_outputs=10)

    # write prediction
    speaker_name = [audio_meta.speaker_dict[sid] for sid in speaker_id]
    pred_speaker_name = [
        audio_meta_train.speaker_dict[sid] for sid in pred_speaker_id
    ]