Example #1
def predict_acoustic(device, labels, acoustic_model, acoustic_in_scaler,
        acoustic_out_scaler, binary_dict, continuous_dict,
        subphone_features="coarse_coding",
        pitch_indices=None, log_f0_conditioning=True):

    # Musical/linguistic features
    linguistic_features = fe.linguistic_features(labels,
                                                  binary_dict, continuous_dict,
                                                  add_frame_features=True,
                                                  subphone_features=subphone_features)

    if log_f0_conditioning:
        for idx in pitch_indices:
            linguistic_features[:, idx] = interp1d(
                _midi_to_hz(linguistic_features, idx, log_f0_conditioning),
                    kind="slinear")

    # Apply normalization
    linguistic_features = acoustic_in_scaler.transform(linguistic_features)

    # Predict acoustic features
    x = torch.from_numpy(linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))
    pred_acoustic = acoustic_model(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()

    # Apply denormalization
    pred_acoustic = acoustic_out_scaler.inverse_transform(pred_acoustic)

    return pred_acoustic
Example #2
    def collect_features(self, wav_path, label_path):
        fs, x = wavfile.read(wav_path)
        x = x.astype(np.float64)
        f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        mgc = pysptk.sp2mc(spectrogram,
                           order=order,
                           alpha=pysptk.util.mcepalpha(fs))
        f0 = f0[:, None]
        lf0 = f0.copy()
        nonzero_indices = np.nonzero(f0)
        lf0[nonzero_indices] = np.log(f0[nonzero_indices])
        vuv = (lf0 != 0).astype(np.float32)
        lf0 = interp1d(lf0, kind="slinear")

        mgc = apply_delta_windows(mgc, windows)
        lf0 = apply_delta_windows(lf0, windows)
        bap = apply_delta_windows(bap, windows)

        features = np.hstack((mgc, lf0, vuv, bap))

        # Cut silence frames by HTS alignment
        labels = hts.load(label_path)
        features = features[:labels.num_frames()]
        indices = labels.silence_frame_indices()
        features = np.delete(features, indices, axis=0)

        return features.astype(np.float32)
Example #3
def predict_duration(device, labels, duration_model, duration_in_scaler, duration_out_scaler,
        lag, binary_dict, continuous_dict, pitch_indices=None, log_f0_conditioning=True):
    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict,
        add_frame_features=False, subphone_features=None).astype(np.float32)

    if log_f0_conditioning:
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(duration_linguistic_features, idx, log_f0_conditioning),
                    kind="slinear")

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(duration_linguistic_features)

    # Apply model
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))
    pred_durations = duration_model(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()

    # Apply denormalization
    pred_durations = duration_out_scaler.inverse_transform(pred_durations)
    pred_durations[pred_durations <= 0] = 1
    pred_durations = np.round(pred_durations)

    return pred_durations
Example #4
 def collect_features(self, wav_path):
     
     # x: Raw audio, (Sample_length, )
     x, fs = librosa.load(wav_path, sr=self.target_sr, mono=True, dtype=np.float64)
     
     
     # f0: F0, (Frame_length, ) 
     # lf0: log(f0) --> interp1d (Frame_length, )
     # vuv: voice/unvoiced (Frame_length, )
     f0, timeaxis = pyworld.dio(x, self.target_sr, frame_period=self.hop_sz_in_ms)
     f0 = pyworld.stonemask(x, f0, timeaxis, fs)
     lf0 = f0.copy()
     lf0[np.nonzero(f0)] = np.log(f0[np.nonzero(f0)])
     lf0 = interp1d(lf0, kind="slinear")
     vuv = (lf0 != 0).astype(np.float32)
     
     
     # spec: Spectrogram, (Frame_length x Dim), Dim = 513
     # bap: coded aperiodicity, (Frame_length, )
     # mgc: mel-cepstrum, (Frame_length x Dim), Dim = 60
     spec = pyworld.cheaptrick(x, f0, timeaxis, fs)
     aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)        
     bap = pyworld.code_aperiodicity(aperiodicity, fs)
     mgc = pysptk.sp2mc(spec, order=59, alpha=pysptk.util.mcepalpha(fs))
     
     
     # Stack features: f0, lf0, vuv, bap, mgc (64 dims) plus the raw spectrogram
     features = np.hstack((f0[:, None], lf0[:, None], vuv[:, None], bap, mgc, spec))
     return features.astype(np.float32)
Example #5
def _process_feature(out_dir, index, wav_path, label_path):

    # get list of wav files
    wav_files = os.listdir(os.path.dirname(wav_path))
    # check wav_file
    assert len(wav_files) != 0 and wav_files[0][-4:] == '.wav', "no wav files found!"

    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    n_frames = len(f0)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram,
                       order=order,
                       alpha=pysptk.util.mcepalpha(fs))
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    mgc = apply_delta_windows(mgc, windows)
    lf0 = apply_delta_windows(lf0, windows)
    bap = apply_delta_windows(bap, windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # get list of lab files
    lab_files = os.listdir(os.path.dirname(label_path))
    # check lab_file
    assert len(lab_files) != 0 and lab_files[0][-4:] == '.lab', "no lab files found!"

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)
    voiced_frames = features.shape[0]

    # Write the acoustic features to disk:
    acoustic_filename = 'arctic_%05d.npy' % index
    np.save(os.path.join(out_dir, acoustic_filename),
            features.astype(np.float32),
            allow_pickle=False)

    dataset_ids.append(acoustic_filename[:-4])
    with open(os.path.join(os.path.dirname(out_dir), 'dataset_ids.pkl'),
              'wb') as pklFile:
        pickle.dump(dataset_ids, pklFile)

    # Return a tuple describing this training example:
    return (acoustic_filename, n_frames, voiced_frames)
Example #6
def gen_waveform(labels, acoustic_features, acoustic_out_scaler,
        binary_dict, continuous_dict, stream_sizes, has_dynamic_features,
        subphone_features="coarse_coding", log_f0_conditioning=True, pitch_idx=None,
        num_windows=3, post_filter=True, sample_rate=48000, frame_period=5,
        relative_f0=True):

    windows = get_windows(num_windows)

    # Apply MLPG if necessary
    if np.any(has_dynamic_features):
        acoustic_features = multi_stream_mlpg(
            acoustic_features, acoustic_out_scaler.var_, windows, stream_sizes,
            has_dynamic_features)
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features, static_stream_sizes)

    # Gen waveform by the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), sample_rate, fftlen)


    ### F0 ###
    if relative_f0:
        diff_lf0 = target_f0
        # need to extract pitch sequence from the musical score
        linguistic_features = fe.linguistic_features(labels,
                                                    binary_dict, continuous_dict,
                                                    add_frame_features=True,
                                                    subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
    else:
        f0 = target_f0

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            sample_rate, frame_period)

    return generated_waveform
Example #7
 def collect_features(self, path):
     labels = hts.load(path)
     features = fe.linguistic_features(
         labels, self.binary_dict, self.continuous_dict,
         add_frame_features=self.add_frame_features,
         subphone_features=self.subphone_features)
     if self.log_f0_conditioning:
         for idx in self.pitch_idx:
             features[:, idx] = interp1d(_midi_to_hz(features, idx, True), kind="slinear")
     return features.astype(np.float32)
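For context, the log-F0 conditioning seen above converts score pitch (MIDI note numbers) in the pitch columns to (log) Hz before the rests are filled by interpolation. A minimal sketch of that mapping, assuming equal temperament with A4 = 440 Hz (midi_to_hz_sketch is a hypothetical helper, not the library's _midi_to_hz):

import numpy as np

def midi_to_hz_sketch(midi_notes, log=False):
    # Hypothetical sketch: equal-temperament MIDI note number -> Hz (A4 = 440 Hz).
    # Zeros (rests) stay zero so that interp1d can later fill them from
    # neighbouring voiced/noted frames.
    midi_notes = np.asarray(midi_notes, dtype=np.float64)
    hz = np.zeros_like(midi_notes)
    nonzero = midi_notes > 0
    hz[nonzero] = 440.0 * 2.0 ** ((midi_notes[nonzero] - 69.0) / 12.0)
    if log:
        hz[nonzero] = np.log(hz[nonzero])
    return hz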
Example #8
File: gen.py Project: xzm2004260/nnsvs
def predict_timelag(device,
                    labels,
                    timelag_model,
                    timelag_in_scaler,
                    timelag_out_scaler,
                    binary_dict,
                    continuous_dict,
                    pitch_indices=None,
                    log_f0_conditioning=True,
                    allowed_range=[-30, 30]):
    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(_midi_to_hz(
                timelag_linguistic_features, idx, log_f0_conditioning),
                                                           kind="slinear")

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features)

    # Run model
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)
    y = timelag_model(x, [x.shape[1]]).squeeze(0).cpu()

    # De-normalization and rounding
    lag = np.round(timelag_out_scaler.inverse_transform(y.data.numpy()))

    # Clip to the allowed range
    lag = np.clip(lag, allowed_range[0], allowed_range[1])

    # frames -> 100 ns
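    # (assuming a 5 ms frame period: 1 frame = 5 ms = 50,000 units of 100 ns)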
    lag *= 50000

    return lag
Example #9
def get_acoustic_feature(lab_path, wav_path, sampling_rate, hop_size_in_ms,
                         mcep_order, windows):
    fs, audio = wavfile.read(wav_path)
    audio = audio.astype(np.float64) / 2**15
    if fs != sampling_rate:
        audio = audio.astype(np.float32)
        audio = librosa.resample(audio, fs, sampling_rate)
        audio = (audio * 2**15).astype(np.float64)
    # extract f0
    f0, timeaxis = pyworld.dio(audio,
                               sampling_rate,
                               frame_period=hop_size_in_ms)
    # modify f0
    f0 = pyworld.stonemask(audio, f0, timeaxis, sampling_rate)
    # voiced/unvoiced flag
    vuv = (f0 > 0)[:, None].astype(np.float32)
    # calculate log f0
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    # interpolate f0 in log-domain
    lf0 = interp1d(lf0, kind='slinear')[:, None]

    # calculate mel-cepstrum
    spectrogram = pyworld.cheaptrick(audio, f0, timeaxis, sampling_rate)
    mgc = pysptk.sp2mc(spectrogram,
                       order=mcep_order,
                       alpha=pysptk.util.mcepalpha(sampling_rate))
    # calculate aperiodicity parameter
    aperiodicity = pyworld.d4c(audio, f0, timeaxis, sampling_rate)
    bap = pyworld.code_aperiodicity(aperiodicity, sampling_rate)

    # calculate dynamic features
    mgc = apply_delta_windows(mgc, windows)
    lf0 = apply_delta_windows(lf0, windows)
    bap = apply_delta_windows(bap, windows)

    feature = np.hstack((mgc, lf0, vuv, bap))

    # cut silence frames by HTS alignment
    labels = hts.load(lab_path)
    feature = feature[:labels.num_frames()]
    if labels.num_frames() > len(feature):
        return
    indices = labels.silence_frame_indices()
    feature = np.delete(feature, indices, axis=0)

    return feature.astype(np.float32)
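The windows argument in this and the other feature-extraction examples follows the common static/delta/delta-delta convention; a sketch of what is typically passed (an assumption here, not shown in the snippet itself):

import numpy as np

# Conventional static, delta and delta-delta windows for apply_delta_windows;
# each entry is (left context, right context, window coefficients).
windows = [
    (0, 0, np.array([1.0])),             # static
    (1, 1, np.array([-0.5, 0.0, 0.5])),  # delta
    (1, 1, np.array([1.0, -2.0, 1.0])),  # delta-delta
]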
Example #10
def _extract_static_feats(wav, sr):
    f0, timeaxis = pyworld.dio(wav, sr, frame_period=5)
    spectrogram = pyworld.cheaptrick(wav, f0, timeaxis, sr)
    aperiodicity = pyworld.d4c(wav, f0, timeaxis, sr)

    mgc = pysptk.sp2mc(spectrogram, order=59, alpha=pysptk.util.mcepalpha(sr))
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")
    bap = pyworld.code_aperiodicity(aperiodicity, sr)

    feats = np.hstack((mgc, lf0, vuv, bap)).astype(np.float32)
    stream_sizes = [mgc.shape[1], lf0.shape[1], vuv.shape[1], bap.shape[1]]

    return feats, stream_sizes
Example #11
    def collect_features(self, wav_path, label_path):
        #print(wav_path)
        #fs, x = wavfile.read(wav_path)
        d = wavio.read(wav_path)
        fs, x = d.rate, d.data
        print(fs, wav_path)
        if len(x.shape) > 1:
            x = x[:, 0]
        x = x.astype(np.float64)
        f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        mgc = pysptk.sp2mc(spectrogram,
                           order=order,
                           alpha=pysptk.util.mcepalpha(fs))
        f0 = f0[:, None]
        lf0 = f0.copy()
        nonzero_indices = np.nonzero(f0)
        lf0[nonzero_indices] = np.log(f0[nonzero_indices])
        vuv = (lf0 != 0).astype(np.float32)  # 1 dim
        lf0 = interp1d(lf0, kind="slinear")

        mgc = apply_delta_windows(mgc, windows)  # 180 dims (60 x 3 windows)
        lf0 = apply_delta_windows(lf0, windows)  # 3 dims
        bap = apply_delta_windows(bap, windows)  # 3 dims (15 for biaobei)

        features = np.hstack((mgc, lf0, vuv, bap))  # 187 dims total (199 for biaobei)
        #print('mgc:',mgc.shape)
        #print('lf0:', lf0.shape)
        #print('vuv:', vuv.shape)
        #print('bap:', bap.shape)

        # Cut silence frames by HTS alignment
        labels = hts.load(label_path)
        features = features[:labels.num_frames()]
        indices = labels.silence_frame_indices()
        if len(indices) > 0:
            features = np.delete(features, indices, axis=0)
        #print(features.shape) #
        return features.astype(np.float32)
Example #12
def predict_acoustic(device,
                     labels,
                     acoustic_model,
                     acoustic_config,
                     acoustic_in_scaler,
                     acoustic_out_scaler,
                     binary_dict,
                     continuous_dict,
                     subphone_features="coarse_coding",
                     pitch_indices=None,
                     log_f0_conditioning=True):

    # Musical/linguistic features
    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=True,
        subphone_features=subphone_features)

    if log_f0_conditioning:
        for idx in pitch_indices:
            linguistic_features[:, idx] = interp1d(_midi_to_hz(
                linguistic_features, idx, log_f0_conditioning),
                                                   kind="slinear")

    # Apply normalization
    linguistic_features = acoustic_in_scaler.transform(linguistic_features)
    if isinstance(acoustic_in_scaler, MinMaxScaler):
        # clip to feature range
        linguistic_features = np.clip(linguistic_features,
                                      acoustic_in_scaler.feature_range[0],
                                      acoustic_in_scaler.feature_range[1])

    # Predict acoustic features
    x = torch.from_numpy(linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if acoustic_model.prediction_type() == PredictionType.PROBABILISTIC:
        log_pi, log_sigma, mu = acoustic_model.inference(x, [x.shape[1]])
        if np.any(acoustic_config.has_dynamic_features):
            # (B, T, D_out)
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)

            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy()**2 * acoustic_out_scaler.var_
            max_mu = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())

            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                max_mu, max_sigma_sq, get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_acoustic = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_acoustic = acoustic_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_acoustic = acoustic_out_scaler.inverse_transform(pred_acoustic)
        if np.any(acoustic_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                pred_acoustic, acoustic_out_scaler.var_,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features)

    return pred_acoustic
Example #13
def gen_waveform(labels,
                 acoustic_features,
                 binary_dict,
                 continuous_dict,
                 stream_sizes,
                 has_dynamic_features,
                 subphone_features="coarse_coding",
                 log_f0_conditioning=True,
                 pitch_idx=None,
                 num_windows=3,
                 post_filter=True,
                 sample_rate=48000,
                 frame_period=5,
                 relative_f0=True):
    windows = get_windows(num_windows)

    # Apply MLPG if necessary
    if np.any(has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(stream_sizes,
                                                      has_dynamic_features,
                                                      len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features,
                                             static_stream_sizes)

    # Gen waveform by the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64),
                                               sample_rate, fftlen)

    # fill aperiodicity with ones for unvoiced regions
    aperiodicity[vuv.reshape(-1) < 0.5, :] = 1.0
    # WORLD fails catastrophically for out of range aperiodicity
    aperiodicity = np.clip(aperiodicity, 0.0, 1.0)

    ### F0 ###
    if relative_f0:
        diff_lf0 = target_f0
        # need to extract pitch sequence from the musical score
        linguistic_features = fe.linguistic_features(
            labels,
            binary_dict,
            continuous_dict,
            add_frame_features=True,
            subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
    else:
        f0 = target_f0
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            sample_rate, frame_period)

    # Reduce the volume (to prevent clipping)
    # TODO: tune this scaling constant to a better value
    spectrogram *= 0.000000001
    sp = pyworld.code_spectral_envelope(spectrogram, sample_rate, 60)

    return f0, sp, bap, generated_waveform
Example #14
    def collect_features(self, wav_path, label_path):
        labels = hts.load(label_path)
        l_features = fe.linguistic_features(
            labels, self.binary_dict, self.continuous_dict,
            add_frame_features=True,
            subphone_features="coarse_coding")

        f0_score = _midi_to_hz(l_features, self.pitch_idx, False)
        notes = l_features[:, self.pitch_idx]
        notes = notes[notes > 0]
        # allow 1-tone upper/lower
        min_f0 = librosa.midi_to_hz(min(notes) - 2)
        max_f0 = librosa.midi_to_hz(max(notes) + 2)
        assert max_f0 > min_f0

        fs, x = wavfile.read(wav_path)
        x = x.astype(np.float64)

        if self.use_harvest:
            f0, timeaxis = pyworld.harvest(x, fs, frame_period=self.frame_period,
                                           f0_floor=min_f0, f0_ceil=max_f0)
        else:
            f0, timeaxis = pyworld.dio(x, fs, frame_period=self.frame_period,
                                       f0_floor=min_f0, f0_ceil=max_f0)
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs, f0_floor=self.f0_floor)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        mgc = pysptk.sp2mc(spectrogram, order=self.mgc_order,
                           alpha=pysptk.util.mcepalpha(fs))
        # F0 of speech
        f0 = f0[:, None]
        lf0 = f0.copy()
        nonzero_indices = np.nonzero(f0)
        lf0[nonzero_indices] = np.log(f0[nonzero_indices])
        if self.use_harvest:
            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
            vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
        else:
            vuv = (lf0 != 0).astype(np.float32)
        lf0 = interp1d(lf0, kind="slinear")

        # Adjust lengths
        mgc = mgc[:labels.num_frames()]
        lf0 = lf0[:labels.num_frames()]
        vuv = vuv[:labels.num_frames()]
        bap = bap[:labels.num_frames()]

        if self.relative_f0:
            # # F0 derived from the musical score
            f0_score = f0_score[:, None]
            lf0_score = f0_score.copy()
            nonzero_indices = np.nonzero(f0_score)
            lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
            lf0_score = interp1d(lf0_score, kind="slinear")
            # relative f0
            diff_lf0 = lf0 - lf0_score
            diff_lf0 = np.clip(diff_lf0, np.log(0.5), np.log(2.0))

            f0_target = diff_lf0
        else:
            f0_target = lf0

        mgc = apply_delta_windows(mgc, self.windows)
        f0_target = apply_delta_windows(f0_target, self.windows)
        bap = apply_delta_windows(bap, self.windows)

        features = np.hstack((mgc, f0_target, vuv, bap)).astype(np.float32)

        # Align waveform and features
        wave = x.astype(np.float32) / 2**15
        T = int(features.shape[0] * (fs * self.frame_period / 1000))
        if len(wave) < T:
            if T - len(wave) > 100:
                print("Warn!!", T, len(wave), T-len(wave))
                print("you have unepxcted input. Please debug though ipdb")
                import ipdb; ipdb.set_trace()
            else:
                pass
            wave = np.pad(wave, (0, T-len(wave)))
        assert wave.shape[0] >= T
        wave = wave[:T]

        return features, wave
Example #15
    def collect_features(self, wav_path, label_path):
        labels = hts.load(label_path)
        l_features = fe.linguistic_features(labels,
                                            self.binary_dict,
                                            self.continuous_dict,
                                            add_frame_features=True,
                                            subphone_features="coarse_coding")

        f0_score = midi_to_hz(l_features, self.pitch_idx, False)
        # TODO: better to set the margin carefully
        max_f0 = int(max(f0_score)) + 100
        min_f0 = int(max(self.f0_floor, min(f0_score[f0_score > 0]) - 20))
        assert max_f0 > min_f0

        fs, x = wavfile.read(wav_path)
        x = x.astype(np.float64)

        if self.use_harvest:
            f0, timeaxis = pyworld.harvest(x,
                                           fs,
                                           frame_period=self.frame_period,
                                           f0_floor=min_f0,
                                           f0_ceil=max_f0)
        else:
            f0, timeaxis = pyworld.dio(x,
                                       fs,
                                       frame_period=self.frame_period,
                                       f0_floor=min_f0,
                                       f0_ceil=max_f0)
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(x,
                                         f0,
                                         timeaxis,
                                         fs,
                                         f0_floor=self.f0_floor)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        mgc = pysptk.sp2mc(spectrogram,
                           order=self.mgc_order,
                           alpha=pysptk.util.mcepalpha(fs))
        # F0 of speech
        f0 = f0[:, None]
        lf0 = f0.copy()
        nonzero_indices = np.nonzero(f0)
        lf0[nonzero_indices] = np.log(f0[nonzero_indices])
        if self.use_harvest:
            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
            vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
        else:
            vuv = (lf0 != 0).astype(np.float32)
        lf0 = interp1d(lf0, kind="slinear")

        # # F0 derived from the musical score
        f0_score = f0_score[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(f0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        # Adjust lengths
        mgc = mgc[:labels.num_frames()]
        lf0 = lf0[:labels.num_frames()]
        vuv = vuv[:labels.num_frames()]
        bap = bap[:labels.num_frames()]

        diff_lf0 = lf0 - lf0_score
        diff_lf0 = np.clip(diff_lf0, np.log(0.5), np.log(2.0))

        mgc = apply_delta_windows(mgc, self.windows)
        diff_lf0 = apply_delta_windows(diff_lf0, self.windows)
        bap = apply_delta_windows(bap, self.windows)

        features = np.hstack((mgc, diff_lf0, vuv, bap))

        return features.astype(np.float32)
Example #16
def test_interp1d():
    f0 = np.random.rand(100, 1).astype(np.float32)
    f0[len(f0) // 2] = 0
    assert not np.all(f0 != 0)
    if0 = interp1d(f0)
    assert np.all(if0 != 0)
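The test above captures the contract the other examples rely on: interp1d replaces unvoiced (zero) frames with values interpolated from the surrounding voiced frames, yielding a continuous F0/log-F0 trajectory. A numpy-only sketch of that behaviour, assuming this contract (not the library implementation):

import numpy as np

def interp_zeros_sketch(f0):
    # Hypothetical sketch: fill zero (unvoiced) frames by linear interpolation
    # between the neighbouring voiced frames; edge frames take the nearest
    # voiced value.
    f0 = np.asarray(f0, dtype=np.float64).reshape(-1)
    out = f0.copy()
    voiced = np.nonzero(f0)[0]
    if len(voiced) == 0:
        return out
    unvoiced = np.where(f0 == 0)[0]
    out[unvoiced] = np.interp(unvoiced, voiced, f0[voiced])
    return out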
Example #17
File: gen.py Project: r9y9/nnsvs
def gen_spsvs_static_features(
    labels,
    acoustic_features,
    binary_dict,
    numeric_dict,
    stream_sizes,
    has_dynamic_features,
    subphone_features="coarse_coding",
    pitch_idx=None,
    num_windows=3,
    frame_period=5,
    relative_f0=True,
    vibrato_scale=1.0,
    vuv_threshold=0.3,
    force_fix_vuv=True,
):
    """Generate static features from predicted acoustic features

    Args:
        labels (HTSLabelFile): HTS labels
        acoustic_features (ndarray): predicted acoustic features
        binary_dict (dict): binary feature dictionary
        numeric_dict (dict): numeric feature dictionary
        stream_sizes (list): stream sizes
        has_dynamic_features (list): whether each stream has dynamic features
        subphone_features (str): subphone feature type
        pitch_idx (int): index of pitch features
        num_windows (int): number of windows
        frame_period (float): frame period
        relative_f0 (bool): whether to use relative f0
        vibrato_scale (float): vibrato scale
        vuv_threshold (float): vuv threshold
        force_fix_vuv (bool): whether to use post-processing to fix VUV.

    Returns:
        tuple: tuple of mgc, lf0, vuv and bap.
    """
    if np.any(has_dynamic_features):
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, num_windows
        )
    else:
        static_stream_sizes = stream_sizes

    # Copy here to avoid inplace operations on input acoustic features
    acoustic_features = acoustic_features.copy()

    # Split multi-stream features
    streams = split_streams(acoustic_features, static_stream_sizes)

    if len(streams) == 4:
        mgc, target_f0, vuv, bap = streams
        vib, vib_flags = None, None
    elif len(streams) == 5:
        # Assuming diff-based vibrato parameters
        mgc, target_f0, vuv, bap, vib = streams
        vib_flags = None
    elif len(streams) == 6:
        # Assuming sine-based vibrato parameters
        mgc, target_f0, vuv, bap, vib, vib_flags = streams
    else:
        raise RuntimeError("Not supported streams")

    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=True,
        subphone_features=subphone_features,
    )

    # Correct V/UV based on special phone flags
    if force_fix_vuv:
        vuv = correct_vuv_by_phone(vuv, binary_dict, linguistic_features)

    # F0
    if relative_f0:
        diff_lf0 = target_f0
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
        f0[vuv < vuv_threshold] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])
    else:
        f0 = target_f0
        f0[vuv < vuv_threshold] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    if vib is not None:
        if vib_flags is not None:
            # Generate sine-based vibrato
            vib_flags = vib_flags.flatten()
            m_a, m_f = vib[:, 0], vib[:, 1]

            # Fill zeros for non-vibrato frames
            m_a[vib_flags < 0.5] = 0
            m_f[vib_flags < 0.5] = 0

            # Gen vibrato
            sr_f0 = int(1 / (frame_period * 0.001))
            f0 = gen_sine_vibrato(f0.flatten(), sr_f0, m_a, m_f, vibrato_scale)
        else:
            # Generate diff-based vibrato
            f0 = f0.flatten() + vibrato_scale * vib.flatten()

    # NOTE: Back to log-domain for convenience
    lf0 = f0.copy()
    lf0[np.nonzero(lf0)] = np.log(f0[np.nonzero(lf0)])
    # NOTE: interpolation is necessary
    lf0 = interp1d(lf0, kind="slinear")

    lf0 = lf0[:, None] if len(lf0.shape) == 1 else lf0
    vuv = vuv[:, None] if len(vuv.shape) == 1 else vuv

    return mgc, lf0, vuv, bap
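Several examples split the concatenated acoustic features back into streams with split_streams and the (static) stream sizes. A small numpy sketch of that kind of column-wise split (split_streams_sketch and the sizes below are illustrative assumptions, not the library's values):

import numpy as np

def split_streams_sketch(features, stream_sizes):
    # Split a (T, sum(stream_sizes)) matrix column-wise into one array per stream.
    streams, start = [], 0
    for size in stream_sizes:
        streams.append(features[:, start:start + size])
        start += size
    return streams

# e.g. hypothetical static stream sizes for mgc, lf0, vuv and bap
dummy = np.zeros((10, 67), dtype=np.float32)
mgc, lf0, vuv, bap = split_streams_sketch(dummy, [60, 1, 1, 5])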
Example #18
File: data_source.py Project: r9y9/nnsvs
    def collect_features(self, wav_path, label_path):
        labels = hts.load(label_path)
        l_features = fe.linguistic_features(
            labels,
            self.binary_dict,
            self.continuous_dict,
            add_frame_features=True,
            subphone_features="coarse_coding",
        )

        f0_score = _midi_to_hz(l_features, self.pitch_idx, False)
        notes = l_features[:, self.pitch_idx]
        notes = notes[notes > 0]

        # allow 200 cent upper/lower to properly handle F0 estimation of
        # preparation, vibrato and overshoot.
        # NOTE: set the minimum f0 to 63.5 Hz (125 - 3*20.5)
        # https://acoustics.jp/qanda/answer/50.html
        # NOTE: sinsy allows 30-150 cent frequency range for vibrato (as of 2010)
        # https://staff.aist.go.jp/m.goto/PAPER/SIGMUS201007oura.pdf
        min_f0 = max(63.5, librosa.midi_to_hz(min(notes) - 2))
        max_f0 = librosa.midi_to_hz(max(notes) + 2)
        assert max_f0 > min_f0

        # Workaround segfault issues of WORLD's CheapTrick
        min_f0 = min(min_f0, 500)

        fs, x = wavfile.read(wav_path)
        x = x.astype(np.float64)
        if fs != self.sample_rate:
            raise RuntimeError(
                "Sample rate mismatch! {} != {}".format(fs, self.sample_rate)
            )

        if self.use_harvest:
            f0, timeaxis = pyworld.harvest(
                x, fs, frame_period=self.frame_period, f0_floor=min_f0, f0_ceil=max_f0
            )
        else:
            f0, timeaxis = pyworld.dio(
                x, fs, frame_period=self.frame_period, f0_floor=min_f0, f0_ceil=max_f0
            )
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)

        # Workaround for https://github.com/r9y9/nnsvs/issues/7
        f0 = np.maximum(f0, 0)

        # Correct V/UV (and F0) based on the musical score information
        # treat frames where musical notes are not assigned as unvoiced
        if self.correct_vuv:
            # Use smoothed mask so that we don't mask out overshoot or something
            # that could happen at the start/end of notes
            # 0.5 sec. window (could be tuned for better results)
            win_length = int(0.5 / (self.frame_period * 0.001))
            mask = np.convolve(f0_score, np.ones(win_length) / win_length, "same")
            if len(f0) > len(mask):
                mask = np.pad(mask, (0, len(f0) - len(mask)), "constant")
            elif len(f0) < len(mask):
                mask = mask[: len(f0)]
            f0 = f0 * np.sign(mask)

        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs, f0_floor=min_f0)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs, threshold=self.d4c_threshold)

        f0 = f0[:, None]
        lf0 = f0.copy()
        nonzero_indices = np.nonzero(f0)
        lf0[nonzero_indices] = np.log(f0[nonzero_indices])
        if self.use_harvest:
            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
            vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
        else:
            vuv = (lf0 != 0).astype(np.float32)

        # F0 -> continuous F0
        lf0 = interp1d(lf0, kind="slinear")

        # Vibrato parameter extraction
        sr_f0 = int(1 / (self.frame_period * 0.001))
        if self.vibrato_mode == "sine":
            win_length = 64
            n_fft = 256
            threshold = 0.12

            if self.use_harvest:
                # NOTE: harvest is not supported here since the currently implemented algorithm
                # relies on v/uv flags to find vibrato sections.
                # We use DIO since it provides more accurate v/uv detection in my experience.
                _f0, _timeaxis = pyworld.dio(
                    x,
                    fs,
                    frame_period=self.frame_period,
                    f0_floor=min_f0,
                    f0_ceil=max_f0,
                )
                _f0 = pyworld.stonemask(x, _f0, _timeaxis, fs)
                f0_smooth = extract_smoothed_f0(_f0, sr_f0, cutoff=8)
            else:
                f0_smooth = extract_smoothed_f0(f0, sr_f0, cutoff=8)

            f0_smooth_cent = hz_to_cent_based_c4(f0_smooth)
            vibrato_likelihood = extract_vibrato_likelihood(
                f0_smooth_cent, sr_f0, win_length=win_length, n_fft=n_fft
            )
            vib_flags, m_a, m_f = extract_vibrato_parameters(
                f0_smooth_cent, vibrato_likelihood, sr_f0, threshold=threshold
            )
            m_a = interp1d(m_a, kind="linear")
            m_f = interp1d(m_f, kind="linear")
            vib = np.stack([m_a, m_f], axis=1)
            vib_flags = vib_flags[:, np.newaxis]
        elif self.vibrato_mode == "diff":
            # NOTE: vibrato is known to have 3 ~ 8 Hz range (in general)
            # remove higher frequency than 3 to separate vibrato from the original F0
            f0_smooth = extract_smoothed_f0(f0, sr_f0, cutoff=3)
            vib = (f0 - f0_smooth)[:, np.newaxis]
            vib_flags = None
        elif self.vibrato_mode == "none":
            vib, vib_flags = None, None
        else:
            raise RuntimeError("Unknown vibrato mode: {}".format(self.vibrato_mode))

        mgc = pysptk.sp2mc(
            spectrogram, order=self.mgc_order, alpha=pysptk.util.mcepalpha(fs)
        )

        # Post-processing for aperiodicity
        # ref: https://github.com/MTG/WGANSing/blob/mtg/vocoder.py
        if self.interp_unvoiced_aperiodicity:
            is_voiced = (vuv > 0).reshape(-1)
            if not np.any(is_voiced):
                pass  # all unvoiced, do nothing
            else:
                for k in range(aperiodicity.shape[1]):
                    aperiodicity[~is_voiced, k] = np.interp(
                        np.where(~is_voiced)[0],
                        np.where(is_voiced)[0],
                        aperiodicity[is_voiced, k],
                    )
        bap = pyworld.code_aperiodicity(aperiodicity, fs)

        # Parameter trajectory smoothing
        if self.trajectory_smoothing:
            modfs = int(1 / 0.005)
            for d in range(mgc.shape[1]):
                mgc[:, d] = lowpass_filter(
                    mgc[:, d], modfs, cutoff=self.trajectory_smoothing_cutoff
                )
            for d in range(bap.shape[1]):
                bap[:, d] = lowpass_filter(
                    bap[:, d], modfs, cutoff=self.trajectory_smoothing_cutoff
                )

        # Adjust lengths
        mgc = mgc[: labels.num_frames()]
        lf0 = lf0[: labels.num_frames()]
        vuv = vuv[: labels.num_frames()]
        bap = bap[: labels.num_frames()]
        vib = vib[: labels.num_frames()] if vib is not None else None
        vib_flags = vib_flags[: labels.num_frames()] if vib_flags is not None else None

        if self.relative_f0:
            # # F0 derived from the musical score
            f0_score = f0_score[:, None]
            if len(f0_score) > len(f0):
                print(
                    "Warning! likely to have mistakes in alignment in {}".format(
                        label_path
                    )
                )
                print(f0_score.shape, f0.shape)
                f0_score = f0_score[: len(f0)]

            lf0_score = f0_score.copy()
            nonzero_indices = np.nonzero(f0_score)
            lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
            lf0_score = interp1d(lf0_score, kind="slinear")
            # relative f0
            diff_lf0 = lf0 - lf0_score
            diff_lf0 = np.clip(diff_lf0, np.log(0.5), np.log(2.0))

            f0_target = diff_lf0
        else:
            f0_target = lf0

        mgc = apply_delta_windows(mgc, self.windows)
        f0_target = apply_delta_windows(f0_target, self.windows)
        bap = apply_delta_windows(bap, self.windows)
        vib = apply_delta_windows(vib, self.windows) if vib is not None else None

        if vib is None and vib_flags is None:
            features = np.hstack((mgc, f0_target, vuv, bap)).astype(np.float32)
        elif vib is not None and vib_flags is None:
            features = np.hstack((mgc, f0_target, vuv, bap, vib)).astype(np.float32)
        elif vib is not None and vib_flags is not None:
            features = np.hstack((mgc, f0_target, vuv, bap, vib, vib_flags)).astype(
                np.float32
            )
        else:
            raise RuntimeError("Unknown combination of features")

        # Align waveform and features
        wave = x.astype(np.float32) / 2 ** 15
        T = int(features.shape[0] * (fs * self.frame_period / 1000))
        if len(wave) < T:
            if T - len(wave) > int(fs * 0.005):
                print("Warn!!", T, len(wave), T - len(wave))
                print("you have unepxcted input. Please debug though ipdb")
                import ipdb

                ipdb.set_trace()
            else:
                pass
            wave = np.pad(wave, (0, T - len(wave)))
        assert wave.shape[0] >= T
        wave = wave[:T]

        return features, wave
Example #19
File: gen.py Project: r9y9/nnsvs
def predict_timelag(
    device,
    labels,
    timelag_model,
    timelag_config,
    timelag_in_scaler,
    timelag_out_scaler,
    binary_dict,
    numeric_dict,
    pitch_indices=None,
    log_f0_conditioning=True,
    allowed_range=None,
    allowed_range_rest=None,
    force_clip_input_features=False,
):
    """Predict time-lag from HTS labels

    Args:
        device (torch.device): device
        labels (nnmnkwii.io.hts.HTSLabelFile): HTS-style labels
        timelag_model (nn.Module): time-lag model
        timelag_config (dict): time-lag model config
        timelag_in_scaler (sklearn.preprocessing.MinMaxScaler): input scaler
        timelag_out_scaler (sklearn.preprocessing.MinMaxScaler): output scaler
        binary_dict (dict): binary feature dict
        numeric_dict (dict): numeric feature dict
        pitch_indices (list): indices of pitch features
        log_f0_conditioning (bool): whether to condition on log f0
        allowed_range (list): allowed range of time-lag
        allowed_range_rest (list): allowed range of time-lag for rest
        force_clip_input_features (bool): whether to clip input features

    Returns:
        ndarray: time-lag predictions
    """
    if allowed_range is None:
        allowed_range = [-20, 20]
    if allowed_range_rest is None:
        allowed_range_rest = [-40, 40]
    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels,
        binary_dict,
        numeric_dict,
        add_frame_features=False,
        subphone_features=None,
    ).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(timelag_linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features
    )
    if force_clip_input_features and isinstance(timelag_in_scaler, MinMaxScaler):
        # clip to feature range (except for pitch-related features)
        non_pitch_indices = [
            idx
            for idx in range(timelag_linguistic_features.shape[1])
            if idx not in pitch_indices
        ]
        timelag_linguistic_features[:, non_pitch_indices] = np.clip(
            timelag_linguistic_features[:, non_pitch_indices],
            timelag_in_scaler.feature_range[0],
            timelag_in_scaler.feature_range[1],
        )

    # Run model
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)

    # Run model
    if timelag_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = timelag_model.inference(x, [x.shape[1]])
        if np.any(timelag_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (
                max_sigma.squeeze(0).cpu().data.numpy() ** 2 * timelag_out_scaler.var_
            )
            max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
            max_mu = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                max_mu,
                max_sigma_sq,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features,
            )
        else:
            # Apply denormalization
            pred_timelag = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
    else:
        # (T, D_out)
        pred_timelag = (
            timelag_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_timelag = timelag_out_scaler.inverse_transform(pred_timelag)
        if np.any(timelag_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                pred_timelag,
                timelag_out_scaler.var_,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features,
            )

    # Rounding
    pred_timelag = np.round(pred_timelag)

    # Clip to the allowed range
    for idx in range(len(pred_timelag)):
        if _is_silence(note_labels.contexts[idx]):
            pred_timelag[idx] = np.clip(
                pred_timelag[idx], allowed_range_rest[0], allowed_range_rest[1]
            )
        else:
            pred_timelag[idx] = np.clip(
                pred_timelag[idx], allowed_range[0], allowed_range[1]
            )

    # frames -> 100 ns
    pred_timelag *= 50000

    return pred_timelag
Example #20
File: gen.py Project: r9y9/nnsvs
def predict_acoustic(
    device,
    labels,
    acoustic_model,
    acoustic_config,
    acoustic_in_scaler,
    acoustic_out_scaler,
    binary_dict,
    numeric_dict,
    subphone_features="coarse_coding",
    pitch_indices=None,
    log_f0_conditioning=True,
    force_clip_input_features=False,
):
    """Predict acoustic features from HTS labels

    MLPG is applied to the predicted features if the output features have
    dynamic features.

    Args:
        device (torch.device): device to use
        labels (HTSLabelFile): HTS labels
        acoustic_model (nn.Module): acoustic model
        acoustic_config (AcousticConfig): acoustic configuration
        acoustic_in_scaler (sklearn.preprocessing.StandardScaler): input scaler
        acoustic_out_scaler (sklearn.preprocessing.StandardScaler): output scaler
        binary_dict (dict): binary feature dictionary
        numeric_dict (dict): numeric feature dictionary
        subphone_features (str): subphone feature type
        pitch_indices (list): indices of pitch features
        log_f0_conditioning (bool): whether to use log f0 conditioning
        force_clip_input_features (bool): whether to force clip input features

    Returns:
        ndarray: predicted acoustic features
    """
    # Musical/linguistic features
    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=True,
        subphone_features=subphone_features,
    )

    if log_f0_conditioning:
        for idx in pitch_indices:
            linguistic_features[:, idx] = interp1d(
                _midi_to_hz(linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Apply normalization
    linguistic_features = acoustic_in_scaler.transform(linguistic_features)
    if force_clip_input_features and isinstance(acoustic_in_scaler, MinMaxScaler):
        # clip to feature range (except for pitch-related features)
        non_pitch_indices = [
            idx
            for idx in range(linguistic_features.shape[1])
            if idx not in pitch_indices
        ]
        linguistic_features[:, non_pitch_indices] = np.clip(
            linguistic_features[:, non_pitch_indices],
            acoustic_in_scaler.feature_range[0],
            acoustic_in_scaler.feature_range[1],
        )

    # Predict acoustic features
    x = torch.from_numpy(linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if acoustic_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = acoustic_model.inference(x, [x.shape[1]])
        if np.any(acoustic_config.has_dynamic_features):
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = (
                max_sigma.squeeze(0).cpu().data.numpy() ** 2 * acoustic_out_scaler.var_
            )
            max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
            max_mu = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )

            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                max_mu,
                max_sigma_sq,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features,
            )
        else:
            # Apply denormalization
            pred_acoustic = acoustic_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy()
            )
    else:
        # (T, D_out)
        pred_acoustic = (
            acoustic_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_acoustic = acoustic_out_scaler.inverse_transform(pred_acoustic)
        if np.any(acoustic_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_acoustic = multi_stream_mlpg(
                pred_acoustic,
                acoustic_out_scaler.var_,
                get_windows(acoustic_config.num_windows),
                acoustic_config.stream_sizes,
                acoustic_config.has_dynamic_features,
            )

    return pred_acoustic
Example #21
File: gen.py Project: r9y9/nnsvs
def predict_duration(
    device,
    labels,
    duration_model,
    duration_config,
    duration_in_scaler,
    duration_out_scaler,
    binary_dict,
    numeric_dict,
    pitch_indices=None,
    log_f0_conditioning=True,
    force_clip_input_features=False,
):
    """Predict phoneme durations from HTS labels

    Args:
        device (torch.device): device to run the model on
        labels (nnmnkwii.io.hts.HTSLabelFile): labels
        duration_model (nn.Module): duration model
        duration_config (dict): duration config
        duration_in_scaler (sklearn.preprocessing.MinMaxScaler): duration input scaler
        duration_out_scaler (sklearn.preprocessing.MinMaxScaler): duration output scaler
        binary_dict (dict): binary feature dictionary
        numeric_dict (dict): numeric feature dictionary
        pitch_indices (list): indices of pitch features
        log_f0_conditioning (bool): whether to use log-f0 conditioning
        force_clip_input_features (bool): whether to clip input features

    Returns:
        np.ndarray: predicted durations
    """
    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=False,
        subphone_features=None,
    ).astype(np.float32)

    if log_f0_conditioning:
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(
                _midi_to_hz(duration_linguistic_features, idx, log_f0_conditioning),
                kind="slinear",
            )

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features
    )
    if force_clip_input_features and isinstance(duration_in_scaler, MinMaxScaler):
        # clip to feature range (except for pitch-related features)
        non_pitch_indices = [
            idx
            for idx in range(duration_linguistic_features.shape[1])
            if idx not in pitch_indices
        ]
        duration_linguistic_features[:, non_pitch_indices] = np.clip(
            duration_linguistic_features[:, non_pitch_indices],
            duration_in_scaler.feature_range[0],
            duration_in_scaler.feature_range[1],
        )

    # Apply model
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if duration_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        max_mu, max_sigma = duration_model.inference(x, [x.shape[1]])
        if np.any(duration_config.has_dynamic_features):
            raise RuntimeError(
                "Dynamic features are not supported for duration modeling"
            )
        # Apply denormalization
        max_sigma_sq = (
            max_sigma.squeeze(0).cpu().data.numpy() ** 2 * duration_out_scaler.var_
        )
        max_sigma_sq = np.maximum(max_sigma_sq, 1e-14)
        max_mu = duration_out_scaler.inverse_transform(
            max_mu.squeeze(0).cpu().data.numpy()
        )

        return max_mu, max_sigma_sq
    else:
        # (T, D_out)
        pred_durations = (
            duration_model.inference(x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        )
        # Apply denormalization
        pred_durations = duration_out_scaler.inverse_transform(pred_durations)
        if np.any(duration_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                pred_durations,
                duration_out_scaler.var_,
                get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features,
            )

    pred_durations[pred_durations <= 0] = 1
    pred_durations = np.round(pred_durations)

    return pred_durations
Example #22
    vibrato_likelihood = extract_vibrato_likelihood(f0_smooth_cent,
                                                    sr_f0,
                                                    win_length=win_length,
                                                    n_fft=n_fft)
    results, m_a, m_f = extract_vibrato_parameters(f0_smooth_cent,
                                                   vibrato_likelihood,
                                                   sr_f0,
                                                   threshold=threshold)

    fig, ax = plt.subplots(3, 1, figsize=(16, 12), sharex=True)
    ax[0].plot(timeaxis, f0, label="Original F0")
    ax[0].plot(timeaxis, f0_smooth, label="Smoothed F0")
    ax[0].plot(timeaxis, results * 15, "*", label="Vibrato sections")
    ax[0].set_ylim(12)
    ax[0].set_ylabel("Frequency [cent]")
    ax[0].legend()
    ax[0].set_title("F0")
    ax[1].plot(timeaxis, interp1d(m_a))
    ax[1].set_title("m_a(t)")
    ax[1].set_ylabel("Frequency [cent]")
    ax[2].plot(timeaxis, interp1d(m_f))
    ax[2].set_title("m_f(t)")
    ax[2].set_ylabel("Frequency [Hz]")
    plt.tight_layout()
    plt.show()

    # Let's reconstruct vibrato
    f0_no_vib = f0.copy()
    segments = nonzero_segments(f0)
    for s, e in segments:
        f0_no_vib[s:e] = lowpass_filter(f0[s:e], sr_f0, cutoff=1)
    f0_gen = gen_sine_vibrato(f0_no_vib, sr_f0, m_a, m_f)
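A small hedged sketch (not part of the original snippet): f0_smooth_cent above is presumably the smoothed F0 expressed in cents. Assuming the standard 1200 * log2(f0 / f_ref) definition, the conversion for voiced frames could look like this (function name and reference frequency are assumptions):

import numpy as np

def hz_to_cent(f0_hz, f_ref=440.0):
    """Convert F0 in Hz to cents relative to f_ref; unvoiced (zero) frames stay 0."""
    cent = np.zeros_like(f0_hz)
    voiced = f0_hz > 0
    cent[voiced] = 1200.0 * np.log2(f0_hz[voiced] / f_ref)
    return cent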
Example #23
def predict_timelag(device,
                    labels,
                    timelag_model,
                    timelag_config,
                    timelag_in_scaler,
                    timelag_out_scaler,
                    binary_dict,
                    continuous_dict,
                    pitch_indices=None,
                    log_f0_conditioning=True,
                    allowed_range=[-20, 20],
                    allowed_range_rest=[-40, 40]):
    # round start/end times just in case.
    labels.round_()

    # Extract note-level labels
    note_indices = get_note_indices(labels)
    note_labels = labels[note_indices]

    # Extract musical/linguistic context
    timelag_linguistic_features = fe.linguistic_features(
        note_labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    # Adjust input features if we use log-f0 conditioning
    if log_f0_conditioning:
        if pitch_indices is None:
            raise ValueError("Pitch feature indices must be specified!")
        for idx in pitch_indices:
            timelag_linguistic_features[:, idx] = interp1d(_midi_to_hz(
                timelag_linguistic_features, idx, log_f0_conditioning),
                                                           kind="slinear")

    # Normalization
    timelag_linguistic_features = timelag_in_scaler.transform(
        timelag_linguistic_features)
    if isinstance(timelag_in_scaler, MinMaxScaler):
        # clip to feature range
        timelag_linguistic_features = np.clip(
            timelag_linguistic_features, timelag_in_scaler.feature_range[0],
            timelag_in_scaler.feature_range[1])

    # Prepare input tensor
    x = torch.from_numpy(timelag_linguistic_features).unsqueeze(0).to(device)

    # Run model
    if timelag_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        log_pi, log_sigma, mu = timelag_model.inference(x, [x.shape[1]])
        if np.any(timelag_config.has_dynamic_features):
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy()**2 * timelag_out_scaler.var_
            max_mu = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                max_mu, max_sigma_sq, get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_timelag = timelag_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_timelag = timelag_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_timelag = timelag_out_scaler.inverse_transform(pred_timelag)
        if np.any(timelag_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_timelag = multi_stream_mlpg(
                pred_timelag, timelag_out_scaler.var_,
                get_windows(timelag_config.num_windows),
                timelag_config.stream_sizes,
                timelag_config.has_dynamic_features)

    # Rounding
    pred_timelag = np.round(pred_timelag)

    # Clip to the allowed range
    for idx in range(len(pred_timelag)):
        if _is_silence(note_labels.contexts[idx]):
            pred_timelag[idx] = np.clip(pred_timelag[idx],
                                        allowed_range_rest[0],
                                        allowed_range_rest[1])
        else:
            pred_timelag[idx] = np.clip(pred_timelag[idx], allowed_range[0],
                                        allowed_range[1])

    # Convert from frames to 100-ns units (5-ms frame shift: 50000 x 100 ns = 5 ms)
    pred_timelag *= 50000

    return pred_timelag
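A short illustrative note (not in the original example): HTS-style label times are in 100-ns units, so with a 5-ms frame shift one frame corresponds to 50000 units; converting the predicted time-lags back to seconds or frames is a simple scaling (toy values below).

import numpy as np

pred_timelag_100ns = np.array([[-150000.0], [0.0], [100000.0]])  # toy predictions
pred_timelag_sec = pred_timelag_100ns * 1e-7       # 100-ns units -> seconds
pred_timelag_frames = pred_timelag_100ns / 50000   # back to 5-ms frames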
Example #24
    sp = pyworld.cheaptrick(x, f0, timeaxis, fs,
                            fft_size=fft_len)  # Spectrogram
    ap = pyworld.d4c(x, f0, timeaxis, fs, fft_size=fft_len)  # Aperiodicity

    plt.subplot(3, 1, 1)
    plt.plot(f0)
    plt.subplot(3, 1, 2)
    plt.plot(lf0)
    plt.subplot(3, 1, 3)
    librosa.display.specshow(sp.T, sr=sr, hop_length=hop_length, y_axis='linear')
    plt.show()

    y = pyworld.synthesize(f0, sp, ap, fs, frame_period)

    play_audio(y)

    bap = pyworld.code_aperiodicity(ap, fs)
    mgc = pysptk.sp2mc(sp,
                       order=self.order,
                       alpha=pysptk.util.mcepalpha(fs))
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    mgc = apply_delta_windows(mgc, self.windows)
    lf0 = apply_delta_windows(lf0, self.windows)
    bap = apply_delta_windows(bap, self.windows)
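For reference, a hedged sketch of the inverse direction (not in the original fragment): slice the static coefficients back out of the delta-appended features, decode them to WORLD parameters, and re-synthesize. It assumes the fragment's variables and imports (mgc, bap, f0, fs, fft_len, frame_period, self.order, self.windows, pyworld, pysptk, np) are still in scope and that the static stream occupies the first block after apply_delta_windows; the static_* and *_rec names are illustrative.

    # Static streams are the leading block of each delta-appended feature matrix.
    static_mgc = np.ascontiguousarray(mgc[:, : self.order + 1])
    static_bap = np.ascontiguousarray(bap[:, : bap.shape[1] // len(self.windows)])

    # Decode back to WORLD parameters and re-synthesize a waveform.
    sp_rec = pysptk.mc2sp(static_mgc, alpha=pysptk.util.mcepalpha(fs), fftlen=fft_len)
    ap_rec = pyworld.decode_aperiodicity(static_bap, fs, fft_len)
    y_rec = pyworld.synthesize(f0.flatten(), sp_rec, ap_rec, fs, frame_period)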
Example #25
def predict_duration(device,
                     labels,
                     duration_model,
                     duration_config,
                     duration_in_scaler,
                     duration_out_scaler,
                     lag,
                     binary_dict,
                     continuous_dict,
                     pitch_indices=None,
                     log_f0_conditioning=True):
    # Extract musical/linguistic features
    duration_linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    if log_f0_conditioning:
        for idx in pitch_indices:
            duration_linguistic_features[:, idx] = interp1d(_midi_to_hz(
                duration_linguistic_features, idx, log_f0_conditioning),
                                                            kind="slinear")

    # Apply normalization
    duration_linguistic_features = duration_in_scaler.transform(
        duration_linguistic_features)
    if isinstance(duration_in_scaler, MinMaxScaler):
        # clip to feature range
        duration_linguistic_features = np.clip(
            duration_linguistic_features, duration_in_scaler.feature_range[0],
            duration_in_scaler.feature_range[1])

    # Apply model
    x = torch.from_numpy(duration_linguistic_features).float().to(device)
    x = x.view(1, -1, x.size(-1))

    if duration_model.prediction_type() == PredictionType.PROBABILISTIC:
        # (B, T, D_out)
        log_pi, log_sigma, mu = duration_model.inference(x, [x.shape[1]])
        if np.any(duration_config.has_dynamic_features):
            max_sigma, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            # (B, T, D_out) -> (T, D_out)
            max_sigma_sq = max_sigma.squeeze(
                0).cpu().data.numpy()**2 * duration_out_scaler.var_
            max_mu = duration_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())

            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                max_mu, max_sigma_sq, get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features)
        else:
            _, max_mu = mdn_get_most_probable_sigma_and_mu(
                log_pi, log_sigma, mu)
            # Apply denormalization
            pred_durations = duration_out_scaler.inverse_transform(
                max_mu.squeeze(0).cpu().data.numpy())
    else:
        # (T, D_out)
        pred_durations = duration_model.inference(
            x, [x.shape[1]]).squeeze(0).cpu().data.numpy()
        # Apply denormalization
        pred_durations = duration_out_scaler.inverse_transform(pred_durations)
        if np.any(duration_config.has_dynamic_features):
            # (T, D_out) -> (T, static_dim)
            pred_durations = multi_stream_mlpg(
                pred_durations, duration_out_scaler.var_,
                get_windows(duration_config.num_windows),
                duration_config.stream_sizes,
                duration_config.has_dynamic_features)

    pred_durations[pred_durations <= 0] = 1
    pred_durations = np.round(pred_durations)

    return pred_durations
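Finally, a tiny hedged check (not part of the original examples) of the helper every snippet above leans on: nnmnkwii.preprocessing.interp1d fills unvoiced (zero) regions of an F0/lF0 contour by interpolating between the surrounding voiced frames. Toy values below; edge behaviour at leading/trailing unvoiced frames is not shown.

import numpy as np
from nnmnkwii.preprocessing import interp1d

lf0 = np.array([4.0, 0.0, 0.0, 5.0, 0.0, 6.0], dtype=np.float64)[:, None]
filled = interp1d(lf0, kind="slinear")
print(filled.ravel())  # interior unvoiced gaps are linearly filled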