Exemplo n.º 1
0
def test_meanvar():
    """Exercise ``P.meanvar`` / ``P.meanstd`` / ``P.scale`` / ``P.inv_scale``.

    Statistics are computed once on the plain file-backed dataset and once on
    its padded counterpart (with explicit lengths); both must agree, and
    scaling followed by inverse scaling must reproduce the input.
    """
    # Reference statistics from the unpadded acoustic features.
    _, acoustic_source = example_file_data_sources_for_acoustic_model()
    dataset = FileSourceDataset(acoustic_source)
    lengths = list(map(len, dataset))
    feature_dim = dataset[0].shape[-1]

    mean, var = P.meanvar(dataset)
    std = np.sqrt(var)
    for stat in (mean, var):
        assert np.isfinite(stat).all()
    for stat in (mean, var):
        assert stat.shape[-1] == feature_dim

    # meanstd must report the square root of the variance from meanvar.
    _, std_from_meanstd = P.meanstd(dataset)
    assert np.allclose(std, std_from_meanstd)

    scaled_first = P.scale(dataset[0], mean, std)
    assert np.isfinite(scaled_first).all()

    # Padded dataset: passing the true lengths should give the same results.
    _, acoustic_source = example_file_data_sources_for_acoustic_model()
    padded = PaddedFileSourceDataset(acoustic_source, 1000)
    padded_mean, padded_var = P.meanvar(padded, lengths)
    assert np.allclose(mean, padded_mean)
    assert np.allclose(var, padded_var)

    # Scaling followed by inverse scaling is (numerically) the identity.
    sample = padded[0]
    scaled_sample = P.scale(sample, mean, std)
    restored = P.inv_scale(scaled_sample, mean, std)
    assert np.allclose(sample, restored, atol=1e-5)
Exemplo n.º 2
0
 def __getitem__(self, idx):
     """Return the normalized ``(input, target)`` pair stored at ``idx``."""
     # Inputs go through min-max scaling with feature_range (0.01, 0.99).
     inputs = P.minmax_scale(
         self.X[idx],
         min_=self.X_data_min,
         scale_=self.X_data_scale,
         feature_range=(0.01, 0.99),
     )
     # Targets are scaled with the stored mean / std statistics.
     targets = P.scale(self.Y[idx], self.Y_data_mean, self.Y_data_std)
     return inputs, targets
Exemplo n.º 3
0
def test_vc_from_path(model, path, data_mean, data_std, diffvc=True):
    """Run ``model`` on the audio file at ``path`` and resynthesize a waveform.

    The file is analysed with pyworld (f0, spectral envelope, aperiodicity),
    the envelope is converted to mel-cepstrum, smoothed, extended with delta
    features, normalized and fed to ``model``. The predicted static features
    are denormalized and turned back into audio.

    Args:
        model: Called as ``model(mc_scaled, R)``; must return a pair whose
            second element holds the static-feature prediction.
        path: Audio file read with ``wavfile.read``.
        data_mean: Mean used by ``P.scale``; its first ``static_dim`` entries
            are reused to denormalize the prediction.
        data_std: Standard deviation used the same way as ``data_mean``.
        diffvc: If true, filter the input waveform with the differential
            mel-cepstrum through an MLSA filter; otherwise synthesize with
            ``pyworld.synthesize`` from f0, spectrum and aperiodicity.

    Returns:
        Tuple ``(waveform, inputs, outputs)`` where ``inputs`` is a copy of
        the static features before conversion and ``outputs`` a copy of the
        denormalized predicted static features.

    NOTE(review): ``hop_length`` is never assigned in this function; it is
    presumably a module-level name defined elsewhere -- confirm.
    """
    model.eval()

    # Load audio; the analysis calls below receive it as float64.
    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    # pyworld analysis: f0 (dio, refined by stonemask), envelope, aperiodicity.
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    # Split off the 0th coefficient; only the remaining ones are converted.
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    # From here on ``mc`` holds static + delta features (float32).
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    # Keep the unconverted static part for the caller.
    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    # Apply model
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)
    y_hat, y_hat_static = model(mc_scaled, R)
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize (only the static slice of the statistics applies)
    mc_static_pred = P.inv_scale(mc_static_pred, data_mean[:static_dim],
                                 data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        # Differential features: prediction minus the source static features.
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    # Re-attach the source 0th coefficient in front of the (differential) prediction.
    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        # Filter the original waveform with the differential coefficients.
        waveform = engine.synthesis(x, b)
    else:
        # Vocoder path: rebuild the spectral envelope and synthesize with pyworld.
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(mc.astype(np.float64),
                                   alpha=alpha,
                                   fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs,
                                      hp.frame_period)

    return waveform, inputs, outputs
Exemplo n.º 4
0
    def __getitem__(self, idx):
        """Return padded, normalized input/target plus their unpadded lengths.

        The result is ``(padded_x, padded_t, len_x, len_t)``; the two lengths
        are taken from the stored (unnormalized, unpadded) sequences.
        """
        raw_x = self.xs[idx]
        raw_t = self.ts[idx]

        # Inputs: min-max scaling with feature_range (0.01, 0.99).
        norm_x = minmax_scale(
            raw_x,
            self.x_stat['min'],
            self.x_stat['max'],
            feature_range=(0.01, 0.99),
        )

        # Targets: the statistics store a variance, so take its square root.
        t_mean = self.t_stat['mean']
        t_std = np.sqrt(self.t_stat['var'])
        norm_t = scale(raw_t, t_mean, t_std)

        return (
            self._padding(norm_x),
            self._padding(norm_t),
            len(self.xs[idx]),
            len(self.ts[idx]),
        )
 def __getitem__(self, index):
     """Load the ``.npy`` feature pair listed at ``index`` and normalize it.

     Returns ``(norm_x, norm_y)``: min-max scaled inputs and mean/scale
     normalized targets.
     """
     npy_name = '{}.npy'.format(self.metadata[index])

     # Reshape each loaded array to two dimensions with a fixed feature width.
     inputs = np.load(os.path.join(self.X_path, npy_name))
     inputs = inputs.reshape(-1, self.x_dim)
     targets = np.load(os.path.join(self.Y_path, npy_name))
     targets = targets.reshape(-1, self.y_dim)

     # ``self.train`` indexes the statistics containers
     # (presumably it selects the data split -- confirm against the caller).
     stats_key = self.train
     norm_x = minmax_scale(
         inputs,
         self.X_min[stats_key],
         self.X_max[stats_key],
         feature_range=(0.01, 0.99),
     )
     norm_y = scale(targets, self.Y_mean[stats_key], self.Y_scale[stats_key])
     return norm_x, norm_y
Exemplo n.º 6
0
    def __getitem__(self, idx):
        """Return the normalized ``(input, target)`` pair stored at ``idx``.

        When ``hp.recompute_delta_features`` is set, the delta part of the
        normalized target is recomputed before it is returned.
        """
        inputs = P.minmax_scale(
            self.X[idx],
            min_=self.X_data_min,
            scale_=self.X_data_scale,
            feature_range=(0.01, 0.99),
        )
        targets = P.scale(self.Y[idx], self.Y_data_mean, self.Y_data_std)

        if not hp.recompute_delta_features:
            return inputs, targets

        # Normalization breaks the static-delta relationship; recomputing the
        # deltas restores it, which is required for the MSE + MGE loss to work.
        targets = recompute_delta_features(
            targets,
            self.Y_data_mean,
            self.Y_data_std,
            hp.windows,
            hp.stream_sizes,
            hp.has_dynamic_features,
        )
        return inputs, targets
Exemplo n.º 7
0
 def __getitem__(self, index):
     """Build one item: mu-law encoded audio plus conditioning features.

     The raw segment ``X_raw[sample_start[index]:sample_end[index]]`` is
     converted to float64. When ``self.train_flag is True`` it is placed at
     the end of a 5000-sample buffer whose leading samples stay zero.
     A log-F0 track is computed with pyworld and stacked with one of two
     feature sets selected by ``self.cond_sel``:

     * ``'pyspec'``: ``pyworld.cheaptrick`` output divided by
       ``self.pyspec_max``.
     * ``'mfcc'``: 25 MFCCs derived from a 128-band mel spectrogram and
       normalized by ``scale`` with ``self.mfcc_mean`` / ``self.mfcc_std``.

     The stacked features are resampled from the 200 Hz frame rate to
     16000 Hz and length-fixed to the number of audio samples.

     Returns:
         tuple: ``(index, torch.LongTensor of mu-law codes, float32 array
         of conditioning features)``.

     Raises:
         ValueError: If ``self.cond_sel`` is neither ``'pyspec'`` nor
             ``'mfcc'``.
     """
     sample_rate = 16000
     start, stop = self.sample_start[index], self.sample_end[index]

     # The identity test is kept on purpose: only the literal ``True``
     # enables the fixed-size, zero-padded training segment.
     if self.train_flag is True:
         # x: zero-padded raw audio, segment right-aligned in the buffer
         x = np.zeros(5000, dtype=np.float64)
         zpad_end = 5000 - (stop - start)
         x[zpad_end:] = self.X_raw[start:stop].astype(np.float64)
     else:
         x = self.X_raw[start:stop].astype(np.float64)

     # x_mulaw: 8bit mu-law encoded
     x_mulaw = mu_law_encode(x).astype(np.uint8)

     # Log-F0; frames where f0 == 0 keep the value 0.
     f0, timeaxis = pyworld.dio(x, sample_rate, frame_period=5)  # (T,)
     f0 = pyworld.stonemask(x, f0, timeaxis, sample_rate)
     lf0 = f0.copy()
     voiced = np.nonzero(f0)
     lf0[voiced] = np.log(f0[voiced])  # (T,)

     # cond: features used as conditional input, 'pyspec' or 'mfcc'.
     # Strings are compared with ``==``: ``is`` tests object identity and
     # only works by accident when both strings happen to be interned.
     if self.cond_sel == 'pyspec':
         cond = pyworld.cheaptrick(x, f0, timeaxis, sample_rate)  # (T, d=513)
         cond = cond / self.pyspec_max
     elif self.cond_sel == 'mfcc':
         melspec = librosa.feature.melspectrogram(
             y=x, sr=sample_rate, power=2.0, n_fft=400, hop_length=80,
             n_mels=128)
         cond = librosa.feature.mfcc(
             S=librosa.power_to_db(melspec), sr=sample_rate,
             n_mfcc=25)  # (d=25, T)
         cond = np.transpose(cond)  # (T, d)
         cond = scale(cond, self.mfcc_mean, self.mfcc_std)
     else:
         raise ValueError(
             "cond_sel must be 'pyspec' or 'mfcc', got {!r}".format(
                 self.cond_sel))

     # Stack log-F0 in front of the selected features.
     cond = np.hstack((lf0[:, None], cond))

     # cond: transpose to (d, T) and resample from the frame rate
     # (5 ms period / hop of 80 samples at 16 kHz = 200 Hz) to the audio rate.
     # Keyword arguments work on both old and new librosa releases.
     cond = librosa.core.resample(
         np.transpose(cond), orig_sr=200, target_sr=sample_rate,
         res_type='kaiser_fast', fix=True, scale=False)

     # Match the number of audio samples exactly; (d, T)
     cond = librosa.util.fix_length(cond, size=len(x_mulaw), mode='edge')

     return index, torch.LongTensor(x_mulaw), cond.astype(np.float32)
Exemplo n.º 8
0
    def __call__(self, path):
        """Load a feature array from ``path`` and prepare it per ``self.domain``.

        * ``'mel'``: normalize with ``self.mean`` / ``self.std``, zero-pad the
          first axis to a multiple of 3, fold every 3 rows into one row of
          width ``self.dim * 3``, add one leading row of ones and one trailing
          row of zeros, and return a ``torch.FloatTensor``.
        * any other value except ``'wld'``: return the raw float32 array as a
          tensor.
        * ``'wld'``: per-file normalization with the statistics appended as
          extra rows.

        NOTE(review): the ``'wld'`` branch has no ``return`` statement, so the
        call currently yields ``None`` and the computed ``feat`` is discarded
        -- confirm the intended behaviour.
        """
        feat = np.load(path).astype(np.float32)
        if self.domain == 'wld':
            # Disabled alternative: normalize with the stored statistics
            # instead of per-file ones.
            #feat= P.scale(feat, self.mean, self.std)
            # Per-file statistics over axis 0.
            mean = feat.mean(axis=0)
            std = feat.std(axis=0)
            feat = P.scale(feat, mean, std)
            # Stack three copies of each statistic (three rows each, assuming
            # ``feat`` is 2-D) and append them below the features.
            mean = np.vstack((mean, np.vstack((mean, mean))))
            std = np.vstack((std, np.vstack((std, std))))
            feat = np.vstack((np.vstack((feat, mean)), std))
        elif self.domain == 'mel':
            # NOTE(review): divides by std squared rather than std -- confirm
            # this is intentional.
            feat = (feat - self.mean) / (self.std * self.std)
            feat = torch.from_numpy(feat)

            if feat.shape[0] % 3 != 0:
                # Zero-pad the end of the first axis up to a multiple of 3.
                feat = F.pad(feat, (0, 0, 0, 3 - feat.shape[0] % 3))
            # Fold every 3 consecutive rows into a single row.
            feat = feat.contiguous().view(-1, self.dim * 3)
            feat = F.pad(feat, (0, 0, 1, 0), value=1)  # pad sos: one leading row of ones
            feat = F.pad(feat, (0, 0, 0, 1))  # pad eos: one trailing row of zeros
            return feat.type(torch.FloatTensor)
        else:
            return torch.from_numpy(feat)
Exemplo n.º 9
0
 def __getitem__(self, idx):
     """Return the ``(x, y)`` pair at ``idx``, both scaled with the same statistics."""
     # Input and target share one set of mean / std statistics.
     mean, std = self.data_mean, self.data_std
     return (
         P.scale(self.X[idx], mean, std),
         P.scale(self.Y[idx], mean, std),
     )
Exemplo n.º 10
0
def normalize(feature, mean, std):
    """Scale ``feature`` via ``P.scale`` using the given ``mean`` and ``std``."""
    scaled = P.scale(feature, mean, std)
    return scaled
Exemplo n.º 11
0
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Run ``model`` on waveform ``x`` and resynthesize the converted audio.

    Despite the name, this variant receives the samples and sampling rate
    directly instead of a file path. The waveform is analysed with pyworld,
    converted to smoothed mel-cepstrum with delta features, normalized, fed
    to ``model`` as a batch of one, and the predicted static features are
    denormalized and turned back into audio.

    Args:
        model: Must provide ``eval()`` and ``include_parameter_generation()``.
            When the latter is true it is called as
            ``model(mc_scaled, R, lengths=lengths)`` and must return
            ``(y_hat, y_hat_static)``; otherwise it is called as
            ``model(mc_scaled, lengths=lengths)`` and the static features are
            obtained with ``multi_stream_mlpg``.
        x: Waveform samples; cast to float64 before analysis.
        fs: Sampling rate passed to the pyworld / pysptk calls.
        data_mean: Mean used by ``P.scale``; its first ``static_dim`` entries
            are reused to denormalize the prediction.
        data_std: Standard deviation used the same way as ``data_mean``.
        diffvc: If true, filter the input waveform with the differential
            mel-cepstrum through an MLSA filter; otherwise synthesize with
            ``pyworld.synthesize`` from f0, spectrum and aperiodicity.

    Returns:
        Tuple ``(waveform, inputs, outputs)`` where ``inputs`` is a copy of
        the static features before conversion and ``outputs`` a copy of the
        denormalized predicted static features.
    """
    model.eval()

    # Samples per frame; the 0.001 factor treats hp.frame_period as milliseconds.
    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)
    # pyworld analysis: f0 (dio, refined by stonemask), envelope, aperiodicity.
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    # Split off the 0th coefficient; only the remaining ones are converted.
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    # From here on ``mc`` holds static + delta features (float32).
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    # Keep the unconverted static part for the caller.
    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    # Single utterance, so one length entry (number of frames).
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    # Drop the batch axis again: one row per frame.
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize (only the static slice of the statistics applies)
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        # Differential features: prediction minus the source static features.
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    # Re-attach the source 0th coefficient in front of the (differential) prediction.
    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        # Filter the original waveform with the differential coefficients.
        waveform = engine.synthesis(x, b)
    else:
        # Vocoder path: rebuild the spectral envelope and synthesize with pyworld.
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs
Exemplo n.º 12
0
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Run ``model`` on waveform ``x`` and resynthesize the converted audio.

    Despite the name, this variant receives the samples and sampling rate
    directly instead of a file path. The waveform is analysed with pyworld,
    converted to smoothed mel-cepstrum with delta features, normalized, fed
    to ``model`` as a batch of one, and the predicted static features are
    denormalized and turned back into audio.

    Args:
        model: Must provide ``eval()`` and ``include_parameter_generation()``.
            When the latter is true it is called as
            ``model(mc_scaled, R, lengths=lengths)`` and must return
            ``(y_hat, y_hat_static)``; otherwise it is called as
            ``model(mc_scaled, lengths=lengths)`` and the static features are
            obtained with ``multi_stream_mlpg``.
        x: Waveform samples; cast to float64 before analysis.
        fs: Sampling rate passed to the pyworld / pysptk calls.
        data_mean: Mean used by ``P.scale``; its first ``static_dim`` entries
            are reused to denormalize the prediction.
        data_std: Standard deviation used the same way as ``data_mean``.
        diffvc: If true, filter the input waveform with the differential
            mel-cepstrum through an MLSA filter; otherwise synthesize with
            ``pyworld.synthesize`` from f0, spectrum and aperiodicity.

    Returns:
        Tuple ``(waveform, inputs, outputs)`` where ``inputs`` is a copy of
        the static features before conversion and ``outputs`` a copy of the
        denormalized predicted static features.
    """
    model.eval()

    # Samples per frame; the 0.001 factor treats hp.frame_period as milliseconds.
    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)
    # pyworld analysis: f0 (dio, refined by stonemask), envelope, aperiodicity.
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    # Split off the 0th coefficient; only the remaining ones are converted.
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    # From here on ``mc`` holds static + delta features (float32).
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    # Keep the unconverted static part for the caller.
    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    # Single utterance, so one length entry (number of frames).
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(y_hat, R, hp.stream_sizes,
                                         hp.has_dynamic_features)

    # Drop the batch axis again: one row per frame.
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize (only the static slice of the statistics applies)
    mc_static_pred = P.inv_scale(mc_static_pred, data_mean[:static_dim],
                                 data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        # Differential features: prediction minus the source static features.
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    # Re-attach the source 0th coefficient in front of the (differential) prediction.
    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        # Filter the original waveform with the differential coefficients.
        waveform = engine.synthesis(x, b)
    else:
        # Vocoder path: rebuild the spectral envelope and synthesize with pyworld.
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(mc.astype(np.float64),
                                   alpha=alpha,
                                   fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs,
                                      hp.frame_period)

    return waveform, inputs, outputs