Example #1
def test_linguistic_and_duration_features_for_duration_model():
    qs_file_name = join(DATA_DIR, "questions-radio_dnn_416.hed")
    binary_dict, continuous_dict = hts.load_question_set(qs_file_name)

    # Phone-level linguistic features
    input_state_label = join(DATA_DIR, "label_state_align", "arctic_a0001.lab")
    labels = hts.load(input_state_label)
    assert labels.is_state_alignment_label()
    x = fe.linguistic_features(labels,
                               binary_dict,
                               continuous_dict,
                               add_frame_features=False,
                               subphone_features=None)
    y = np.fromfile(join(DATA_DIR, "binary_label_416", "arctic_a0001.lab"),
                    dtype=np.float32).reshape(-1, x.shape[-1])
    assert np.allclose(x, y)

    # Duration features
    labels = hts.load(input_state_label)
    x = fe.duration_features(labels,
                             feature_type="numerical",
                             unit_size="state",
                             feature_size="phoneme")
    y = np.fromfile(join(DATA_DIR, "duration_untrimmed", "arctic_a0001.dur"),
                    dtype=np.float32).reshape(-1, x.shape[-1])

    assert np.allclose(x, y)
Example #2
 def collect_features(self, label_score_path, label_align_path):
     label_score = hts.load(label_score_path)
     label_align = hts.load(label_align_path)
     timelag = np.asarray(label_align.start_times) - np.asarray(label_score.start_times)
     # 100ns -> num frames
     timelag = timelag.astype(np.float32) / 50000
     return timelag.reshape(-1, 1)
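A note on the divisor above: HTS/HTK label start times are in 100 ns units, so dividing by 50000 gives frame counts only under a 5 ms frame shift. Below is a minimal sketch of the same conversion with the frame period made explicit; timelag_in_frames and its frame_period argument are assumed names, not part of the original code.

import numpy as np

def timelag_in_frames(label_score, label_align, frame_period=5.0):
    # start times are in 100 ns units; 1 ms == 1e4 * 100 ns, so 5 ms == 50000 units
    timelag = np.asarray(label_align.start_times) - np.asarray(label_score.start_times)
    return (timelag / (frame_period * 1e4)).astype(np.float32).reshape(-1, 1)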
Example #3
 def collect_features(self, path):
     labels = hts.load(path)
     features = fe.duration_features(labels)
     indices = labels.silence_phone_indices()
     features = np.delete(features, indices, axis=0)
     #print('DurationFeature:',features.shape)
     return features.astype(np.float32)
Example #4
def _process_utterance(out_dir, index, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = dv3.audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        assert labels[0][-1] == "silB"
        assert labels[-1][-1] == "silE"
        # label times are in 100 ns units; 1e-7 converts them to seconds
        b = int(labels[0][1] * 1e-7 * sr)
        e = int(labels[-1][0] * 1e-7 * sr)
        wav = wav[b:e]
    else:
        wav, _ = librosa.effects.trim(wav, top_db=30)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = dv3.audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = dv3.audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'jsut-spec-%05d.npy' % index
    mel_filename = 'jsut-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #5
    def collect_features(self, wav_path, label_path):
        fs, x = wavfile.read(wav_path)
        x = x.astype(np.float64)
        f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        mgc = pysptk.sp2mc(spectrogram,
                           order=order,
                           alpha=pysptk.util.mcepalpha(fs))
        f0 = f0[:, None]
        lf0 = f0.copy()
        nonzero_indices = np.nonzero(f0)
        lf0[nonzero_indices] = np.log(f0[nonzero_indices])
        vuv = (lf0 != 0).astype(np.float32)
        lf0 = interp1d(lf0, kind="slinear")

        mgc = apply_delta_windows(mgc, windows)
        lf0 = apply_delta_windows(lf0, windows)
        bap = apply_delta_windows(bap, windows)

        features = np.hstack((mgc, lf0, vuv, bap))

        # Cut silence frames by HTS alignment
        labels = hts.load(label_path)
        features = features[:labels.num_frames()]
        indices = labels.silence_frame_indices()
        features = np.delete(features, indices, axis=0)

        return features.astype(np.float32)
Example #6
    def gen_duration(self, utt_id, label_path):
        # prepare phoneme-level linguistic feature
        labels = hts.load(label_path)

        feature = fe.linguistic_features(
            labels, self.binary_dict, self.continuous_dict,
            add_frame_features=False,
            subphone_features=None).astype(np.float32)

        # normalize
        feature = self.scaler['X']['duration'].transform(feature)

        # add speaker information
        feature = self.add_speaker_code(utt_id, feature)

        # predict phoneme durations
        feature = torch.from_numpy(feature).to(device)
        duration = self.duration_model.predict(feature)['mean'].data.cpu().numpy()

        # denormalize
        duration = self.scaler['Y']['duration'].inverse_transform(duration)
        duration = np.round(duration)

        # set minimum duration to 1
        duration[duration <= 0] = 1
        labels.set_durations(duration)

        return labels
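The labels returned by gen_duration carry the predicted durations, so they can feed frame-level feature extraction as the next step. A hedged sketch of that follow-up (synthesizer, binary_dict and continuous_dict are assumed to already be in scope):

labels = synthesizer.gen_duration("utt_001", "utt_001.lab")  # assumed instance and paths
frame_feats = fe.linguistic_features(labels, binary_dict, continuous_dict,
                                     add_frame_features=True,
                                     subphone_features="coarse_coding")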
Example #7
def _process_audio(out_dir, index, wav_path):
    sr = hparams.sample_rate
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if os.path.exists(lab_path):
        labels = hts.load(lab_path)
        assert labels[0][-1] == "silB"
        assert labels[-1][-1] == "silE"
        begin = int(labels[0][1] * 1e-7 * sr)
        end = int(labels[-1][0] * 1e-7 * sr)
        wav = wav[begin:end]
    else:
        wav, _ = librosa.effects.trim(wav, top_db=30)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    filename = 'jsut-target-%05d.tfrecords' % index
    write_preprocessed_target_data(index, spectrogram.T, mel_spectrogram.T,
                                   os.path.join(out_dir, filename))

    # Return a tuple describing this training example:
    return TargetMetaData(index, filename, n_frames)
Example #8
def get_linguistic_feature(lab_path, question_path, level='phone'):
    if level == 'phone':
        add_frame_features = False
        subphone_features = None
    elif level == 'frame':
        add_frame_features = True
        subphone_features = 'coarse_coding'
    else:
        raise ValueError(
            f'phone and frame are supported, but level={level} is given.')

    binary_dict, continuous_dict = hts.load_question_set(question_path)
    labels = hts.load(lab_path)
    feature = fe.linguistic_features(labels,
                                     binary_dict,
                                     continuous_dict,
                                     add_frame_features=add_frame_features,
                                     subphone_features=subphone_features)

    if add_frame_features:
        indices = labels.silence_frame_indices().astype(int)
    else:
        indices = labels.silence_phone_indices()
    feature = np.delete(feature, indices, axis=0)

    return feature.astype(np.float32)
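A hypothetical call of the helper above; the label and question-set file names are placeholders, not files shipped with this code:

phone_feats = get_linguistic_feature("arctic_a0001.lab", "questions-radio_dnn_416.hed", level="phone")
frame_feats = get_linguistic_feature("arctic_a0001.lab", "questions-radio_dnn_416.hed", level="frame")
print(phone_feats.shape, frame_feats.shape)  # (num_phones, dim) vs. (num_frames, dim + subphone features)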
Example #9
def gen_duration(device, label_path, binary_dict, continuous_dict, X_min,
                 X_max, Y_mean, Y_scale, duration_model):

    # Linguistic features for duration
    hts_labels = hts.load(label_path)
    duration_linguistic_features = fe.linguistic_features(
        hts_labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)
    # Apply normalization
    ty = "duration"
    duration_linguistic_features = minmax_scale(duration_linguistic_features,
                                                X_min[ty],
                                                X_max[ty],
                                                feature_range=(0.01, 0.99))

    # Apply model
    # duration_model = duration_model.cpu()
    duration_model.eval()
    x = torch.FloatTensor(duration_linguistic_features)
    duration_predicted = duration_model(x.unsqueeze(0)).data.numpy()
    print("duration_predicted shape: {}".format(duration_predicted.shape))

    # Apply denormalization
    duration_predicted = duration_predicted * Y_scale[ty] + Y_mean[ty]
    duration_predicted = np.round(duration_predicted)

    # Set minimum state duration to 1
    duration_predicted[duration_predicted <= 0] = 1
    hts_labels.set_durations(duration_predicted)

    return hts_labels
Example #10
def _process_feature(out_dir,
                     index,
                     label_path,
                     add_frame_features=False,
                     subphone_features=None,
                     question_path=None):

    labels = hts.load(label_path)
    binary_dict, continuous_dict = hts.load_question_set(question_path)
    features = fe.linguistic_features(labels,
                                      binary_dict,
                                      continuous_dict,
                                      add_frame_features=add_frame_features,
                                      subphone_features=subphone_features)
    n_frames = len(features)
    if add_frame_features:
        indices = labels.silence_frame_indices().astype(int)
    else:
        indices = labels.silence_phone_indices()
    features = np.delete(features, indices, axis=0)
    voiced_frames = features.shape[0]

    # Write the linguistic features to disk:
    linguistic_filename = 'arctic_%05d.npy' % index
    np.save(os.path.join(out_dir, linguistic_filename),
            features.astype(np.float32),
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (linguistic_filename, n_frames, voiced_frames)
Example #11
File: waveform.py  Project: qxde01/CTTS
def test_one_utt(txt, duration_model, acoustic_model, post_filter=True):
    # Predict durations
    #txt = '中华人民共和国中央人民政府今天成立了'
    label = txt2label(txt)
    #hts_labels = hts.load(path=label_path)
    hts_labels = hts.load(lines=label)
    duration_modified_hts_labels = gen_duration(hts_labels, duration_model)
    # Linguistic features
    linguistic_features = fe.linguistic_features(
        duration_modified_hts_labels,
        binary_dict,
        continuous_dict,
        add_frame_features=True,
        subphone_features="coarse_coding")
    # Trim silences
    indices = duration_modified_hts_labels.silence_frame_indices()
    linguistic_features = np.delete(linguistic_features, indices, axis=0)
    linguistic_features = X_acoustic_mms.transform(linguistic_features)
    if len(acoustic_model.inputs[0].shape) == 3:
        # RNN
        n1, n2 = linguistic_features.shape
        linguistic_features = linguistic_features.reshape(1, n1, n2)
        acoustic_predicted = acoustic_model.predict(linguistic_features)
        acoustic_predicted = acoustic_predicted.reshape(
            acoustic_predicted.shape[1], acoustic_predicted.shape[2])
    else:
        acoustic_predicted = acoustic_model.predict(linguistic_features)

    acoustic_predicted = Y_acoustic_std.inverse_transform(acoustic_predicted)
    out = gen_waveform(acoustic_predicted, post_filter)
    out = out.astype(np.int16)

    return out
Example #12
def process_lab(lab_files, out_dir, shift_in_cent):
    shift_in_note = shift_in_cent // 100

    for lab_file in tqdm(lab_files):
        labels = hts.load(lab_file)
        name = basename(lab_file)
        new_contexts = []
        for label in labels:
            context = label[-1]

            for pre, post in [("/D:", "!"), ("/E:", "]"), ("/F:", "#")]:
                match = re.search(f"{pre}([A-Z][b]?[0-9]+){post}", context)
                # if not "xx"
                if match is not None:
                    assert len(match.groups()) == 1
                    note = match.group(0)[3:-1]
                    note_index = NOTE_MAPPING[note]
                    note_shifted = MIDI_MAPPING[note_index + shift_in_note]
                    context = context.replace(match.group(0),
                                              f"{pre}{note_shifted}{post}", 1)
            new_contexts.append(context)

        labels.contexts = new_contexts
        postfix = str(shift_in_cent).replace("-", "minus") + "cent_aug"
        dst_lab_file = join(out_dir, name.replace(".lab", f"_{postfix}.lab"))
        with open(dst_lab_file, "w") as of:
            of.write(str(labels))
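A hypothetical driver for the augmentation above, assuming full-context singing labels under lab/ and shifting every note up one semitone (100 cent); the glob pattern and output directory are assumptions:

from glob import glob

lab_files = sorted(glob("lab/*.lab"))
process_lab(lab_files, out_dir="lab_aug", shift_in_cent=100)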
Example #13
def test_hts_append():
    lab_path = join(DATA_DIR, "BASIC5000_0001.lab")
    test_labels = hts.load(lab_path)
    print("\n{}".format(test_labels))

    # should get same string representation
    labels = hts.HTSLabelFile()
    assert str(labels) == ""
    for label in test_labels:
        labels.append(label)
    assert str(test_labels) == str(labels)

    @raises(ValueError)
    def test_invalid_start_time():
        l = hts.HTSLabelFile()
        l.append((100000, 0, "NG"))

    def test_succeeding_times():
        l = hts.HTSLabelFile()
        l.append((0, 1000000, "OK"))
        l.append((1000000, 2000000, "OK"))

    @raises(ValueError)
    def test_non_succeeding_times():
        l = hts.HTSLabelFile()
        l.append((0, 1000000, "OK"))
        l.append((1500000, 2000000, "NG"))

    test_invalid_start_time()
    test_succeeding_times()
    test_non_succeeding_times()
Example #14
def test_singing_voice_question():
    # Test SVS case
    """
QS "L-Phone_Yuusei_Boin"           {*^a-*,*^i-*,*^u-*,*^e-*,*^o-*}
CQS "e1" {/E:(\\NOTE)]}
    """
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "test_jp_svs.hed"),
        append_hat_for_LL=False,
        convert_svs_pattern=True)
    input_phone_label = join(DATA_DIR, "song070_f00001_063.lab")
    labels = hts.load(input_phone_label)
    feats = fe.linguistic_features(labels, binary_dict, continuous_dict)
    assert feats.shape == (74, 3)

    # CQS e1: get the current MIDI number
    C_e1 = continuous_dict[0]
    for idx, lab in enumerate(labels):
        context = lab[-1]
        if C_e1.search(context) is not None:
            from nnmnkwii.frontend import NOTE_MAPPING
            assert NOTE_MAPPING[C_e1.findall(context)[0]] == feats[idx, 1]

    # CQS e57: get pitch diff
    # In contrast to other continuous features, the pitch diff has a prefix "m" or "p"
    # to indicate the sign of the number.
    C_e57 = continuous_dict[1]
    for idx, lab in enumerate(labels):
        context = lab[-1]
        if "~p2+" in context:
            assert C_e57.search(context).group(1) == "p2"
            assert feats[idx, 2] == 2
        if "~m2+" in context:
            assert C_e57.search(context).group(1) == "m2"
            assert feats[idx, 2] == -2
Example #15
def tts_from_label(models,
                   label_path,
                   X_min,
                   X_max,
                   Y_mean,
                   Y_std,
                   post_filter=False,
                   apply_duration_model=True,
                   coef=1.4,
                   fs=16000):
    duration_model, acoustic_model = models["duration"], models["acoustic"]

    if use_cuda:
        duration_model = duration_model.cuda()
        acoustic_model = acoustic_model.cuda()

    # Predict durations
    if apply_duration_model:
        duration_modified_hts_labels = gen_duration(label_path, duration_model,
                                                    X_min, X_max, Y_mean,
                                                    Y_std)
    else:
        duration_modified_hts_labels = hts.load(label_path)

    # Linguistic features
    linguistic_features = fe.linguistic_features(
        duration_modified_hts_labels,
        binary_dict,
        continuous_dict,
        add_frame_features=hp_acoustic.add_frame_features,
        subphone_features=hp_acoustic.subphone_features)
    # Trim silences
    indices = duration_modified_hts_labels.silence_frame_indices()
    linguistic_features = np.delete(linguistic_features, indices, axis=0)

    # Apply normalization
    ty = "acoustic"
    linguistic_features = P.minmax_scale(linguistic_features,
                                         X_min[ty],
                                         X_max[ty],
                                         feature_range=(0.01, 0.99))

    # Predict acoustic features
    acoustic_model.eval()
    x = Variable(torch.from_numpy(linguistic_features)).float()
    xl = len(x)
    x = x.view(1, -1, x.size(-1))
    x = _generator_input(hp_duration, x)
    x = x.cuda() if use_cuda else x
    acoustic_predicted = acoustic_model(x, [xl]).data.cpu().numpy()
    acoustic_predicted = acoustic_predicted.reshape(
        -1, acoustic_predicted.shape[-1])

    return gen_waveform(acoustic_predicted,
                        Y_mean,
                        Y_std,
                        post_filter,
                        coef=coef,
                        fs=fs)
Example #16
def _process_feature(out_dir, index, wav_path, label_path):

    # get list of wav files
    wav_files = os.listdir(os.path.dirname(wav_path))
    # check wav_file
    assert len(
        wav_files) != 0 and wav_files[0][-4:] == '.wav', "no wav files found!"

    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    n_frames = len(f0)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram,
                       order=order,
                       alpha=pysptk.util.mcepalpha(fs))
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    mgc = apply_delta_windows(mgc, windows)
    lf0 = apply_delta_windows(lf0, windows)
    bap = apply_delta_windows(bap, windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # get list of lab files
    lab_files = os.listdir(os.path.dirname(label_path))
    # check lab_file
    assert len(
        lab_files) != 0 and lab_files[0][-4:] == '.lab', "no lab files found!"

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)
    voiced_frames = features.shape[0]

    # Write the acoustic features to disk:
    acoustic_filename = 'arctic_%05d.npy' % index
    np.save(os.path.join(out_dir, acoustic_filename),
            features.astype(np.float32),
            allow_pickle=False)

    dataset_ids.append(acoustic_filename[:-4])
    with open(os.path.join(os.path.dirname(out_dir), 'dataset_ids.pkl'),
              'wb') as pklFile:
        pickle.dump(dataset_ids, pklFile)

    # Return a tuple describing this training example:
    return (acoustic_filename, n_frames, voiced_frames)
Example #17
def test_invalid_linguistic_features():
    binary_dict, continuous_dict = hts.load_question_set(
        example_question_file())
    phone_labels = hts.load(example_label_file(phone_level=True))
    state_labels = hts.load(example_label_file(phone_level=False))

    @raises(ValueError)
    def __test(labels, subphone_features, add_frame_features):
        fe.linguistic_features(labels,
                               binary_dict,
                               continuous_dict,
                               subphone_features=subphone_features,
                               add_frame_features=add_frame_features)

    yield __test, phone_labels, "full", True
    yield __test, phone_labels, "full", False
    yield __test, state_labels, "full", False
Example #18
 def collect_features(self, path):
     # 1.Load labels --> 2.Load dict from question --> 3.Parse linguistic feat.
     labels = hts.load(path)
     features = fe.linguistic_features(
             labels, self.binary_dict, self.continuous_dict,
             add_frame_features=True, subphone_features='coarse_coding') # subphone_feature = None or 'coarse_coding', coarse_coded_features[:,416,417,418,419]
     
     return features.astype(np.float32)
Example #19
def test_labels_number_of_frames():
    # https://github.com/r9y9/nnmnkwii/issues/85
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "jp.hed"))
    labels = hts.load(join(DATA_DIR, "BASIC5000_0619.lab"))
    linguistic_features = fe.linguistic_features(
        labels, binary_dict, continuous_dict, add_frame_features=True)
    assert labels.num_frames() == linguistic_features.shape[0]
Example #20
 def collect_features(self, path):
     labels = hts.load(path)
     features = fe.linguistic_features(
         labels, self.binary_dict, self.continuous_dict,
         add_frame_features=self.add_frame_features,
         subphone_features=self.subphone_features)
     if self.log_f0_conditioning:
         for idx in self.pitch_idx:
             features[:, idx] = interp1d(_midi_to_hz(features, idx, True), kind="slinear")
     return features.astype(np.float32)
Example #21
def test_invalid_duration_features():
    phone_labels = hts.load(example_label_file(phone_level=True))

    @raises(ValueError)
    def __test(labels, unit_size, feature_size):
        fe.duration_features(labels,
                             unit_size=unit_size,
                             feature_size=feature_size)

    yield __test, phone_labels, None, "frame"
Example #22
def _process_utterance_single(out_dir, text, wav_path, hparams=hparams):
    # modified version of LJSpeech _process_utterance
    audio.set_hparams(hparams)

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    sr = hparams.sample_rate
    # Added from the multispeaker version
    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        wav = clean_by_phoneme(labels, wav, sr)
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)
    # End added from the multispeaker version

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.max_audio_length != 0 and librosa.core.get_duration(
            y=wav, sr=sr) > hparams.max_audio_length:
        return None
    if hparams.min_audio_length != 0 and librosa.core.get_duration(
            y=wav, sr=sr) < hparams.min_audio_length:
        return None

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    # Get filename from wav_path
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]
    spectrogram_filename = 'spec-{}.npy'.format(wav_name)
    mel_filename = 'mel-{}.npy'.format(wav_name)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #23
def _process_utterance(out_dir, text, wav_path, speaker_id=None):

    # check whether singlespeaker_mode
    if speaker_id is None:
        return _process_utterance_single(out_dir, text, wav_path)
    # modified version of VCTK _process_utterance
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    # Get filename from wav_path
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]

    # case if wave files across different speakers have the same naming format.
    # e.g. Recording0.wav
    spectrogram_filename = 'spec-{}-{}.npy'.format(speaker_id, wav_name)
    mel_filename = 'mel-{}-{}.npy'.format(speaker_id, wav_name)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
Example #24
def synthesis(config, device, label_path, question_path, timelag_model,
              timelag_config, timelag_in_scaler, timelag_out_scaler,
              duration_model, duration_config, duration_in_scaler,
              duration_out_scaler, acoustic_model, acoustic_config,
              acoustic_in_scaler, acoustic_out_scaler):
    # load labels and question
    labels = hts.load(label_path).round_()
    binary_dict, continuous_dict = hts.load_question_set(
        question_path, append_hat_for_LL=False)

    # pitch indices in the input features
    # TODO: configurable
    pitch_idx = len(binary_dict) + 1
    pitch_indices = np.arange(len(binary_dict), len(binary_dict) + 3)

    log_f0_conditioning = config.log_f0_conditioning

    if config.ground_truth_duration:
        # Use provided alignment
        duration_modified_labels = labels
    else:
        # Time-lag
        lag = predict_timelag(device, labels, timelag_model, timelag_config,
                              timelag_in_scaler, timelag_out_scaler,
                              binary_dict, continuous_dict, pitch_indices,
                              log_f0_conditioning,
                              config.timelag.allowed_range)

        # Duration predictions
        durations = predict_duration(device, labels, duration_model,
                                     duration_config, duration_in_scaler,
                                     duration_out_scaler, lag, binary_dict,
                                     continuous_dict, pitch_indices,
                                     log_f0_conditioning)

        # Normalize phoneme durations
        duration_modified_labels = postprocess_duration(labels, durations, lag)

    # Predict acoustic features
    acoustic_features = predict_acoustic(
        device, duration_modified_labels, acoustic_model, acoustic_config,
        acoustic_in_scaler, acoustic_out_scaler, binary_dict, continuous_dict,
        config.acoustic.subphone_features, pitch_indices, log_f0_conditioning)

    # Waveform generation
    generated_waveform = gen_waveform(
        duration_modified_labels, acoustic_features, binary_dict,
        continuous_dict, acoustic_config.stream_sizes,
        acoustic_config.has_dynamic_features,
        config.acoustic.subphone_features, log_f0_conditioning, pitch_idx,
        acoustic_config.num_windows, config.acoustic.post_filter,
        config.sample_rate, config.frame_period, config.acoustic.relative_f0)

    return generated_waveform
Example #25
def test_htk_style_question_basics():
    binary_dict, continuous_dict = hts.load_question_set(
        join(DATA_DIR, "test_question.hed"))
    # sil k o n i ch i w a sil
    input_phone_label = join(DATA_DIR, "hts-nit-atr503", "phrase01.lab")
    labels = hts.load(input_phone_label)

    # Test if we can handle wildcards correctly
    # also test basic phone contexts (LL, L, C, R, RR)
    """
QS "LL-Phone_Muon1"  {sil^,pau^}    # without wildcards (*)
QS "LL-Phone_Muon2"  {sil^*,pau^*}  # with *, should be equivalent with above
QS "L-Phone_Muon1"   {*^sil-*,*^pau-*}
QS "C-Phone_sil"     {*-sil+*}
QS "R-Phone_o"       {*+o=*}
QS "RR-Phone_o"      {*=o/A:*}
    """
    LL_muon1 = binary_dict[0][0]
    LL_muon2 = binary_dict[1][0]
    L_muon1 = binary_dict[2][0]
    C_sil = binary_dict[3][0]
    R_phone_o = binary_dict[4][0]
    RR_phone_o = binary_dict[5][0]

    # xx^xx-sil+k=o
    label = labels[0][-1]
    assert LL_muon1.search(label) is None
    assert LL_muon2.search(label) is None
    assert L_muon1.search(label) is None
    assert C_sil.search(label)
    assert R_phone_o.search(label) is None
    assert RR_phone_o.search(label)

    # xx^sil-k+o=N
    label = labels[1][-1]
    assert LL_muon1.search(label) is None
    assert LL_muon2.search(label) is None
    assert L_muon1.search(label)
    assert C_sil.search(label) is None
    assert R_phone_o.search(label)
    assert RR_phone_o.search(label) is None

    # sil^k-o+N=n
    label = labels[2][-1]
    assert LL_muon1.search(label)
    assert LL_muon2.search(label)
    assert L_muon1.search(label) is None
    assert C_sil.search(label) is None
    assert R_phone_o.search(label) is None
    assert RR_phone_o.search(label) is None

    # Slice/list indexing
    assert str(labels[:2]) == str(labels[[0, 1]])
Example #26
def test_correct_vuv_by_phone():
    wav_path = Path(__file__).parent / "data" / "nitech_jp_song070_f001_004.wav"
    lab_path = Path(__file__).parent / "data" / "nitech_jp_song070_f001_004.lab"

    binary_dict, numeric_dict = hts.load_question_set(
        Path(__file__).parent / "data" / "jp_test.hed"
    )

    labels = hts.load(lab_path)
    sr, wav = wavfile.read(wav_path)
    wav = wav.astype(np.float64)
    assert sr == 48000

    out_feats, stream_sizes = _extract_static_feats(wav, sr)
    has_dynamic_features = [False] * len(stream_sizes)
    pitch_idx = len(binary_dict) + 1

    linguistic_features = fe.linguistic_features(
        labels,
        binary_dict,
        numeric_dict,
        add_frame_features=True,
        subphone_features="coarse_coding",
    )

    params = {
        "labels": labels,
        "acoustic_features": out_feats,
        "binary_dict": binary_dict,
        "numeric_dict": numeric_dict,
        "stream_sizes": stream_sizes,
        "has_dynamic_features": has_dynamic_features,
        "pitch_idx": pitch_idx,
        "relative_f0": False,
        "frame_period": 5,
    }

    out_vuv_idx = 61
    vuv = out_feats[:, out_vuv_idx : out_vuv_idx + 1]

    vuv_corrected = correct_vuv_by_phone(vuv, binary_dict, linguistic_features)
    # correcting VUV should make a difference
    _, _, vuv_fixed, _ = gen_spsvs_static_features(**{**params, "force_fix_vuv": True})
    assert np.any(vuv_corrected != vuv)

    # 0: Rest 1: Voiced 2: Unvoiced
    rest_idx = 0
    voiced_idx = 1
    unvoiced_idx = 2
    assert np.all(vuv_corrected[linguistic_features[:, rest_idx] > 0] < 0.5)
    assert np.all(vuv_corrected[linguistic_features[:, voiced_idx] > 0] > 0.5)
    assert np.all(vuv_corrected[linguistic_features[:, unvoiced_idx] > 0] < 0.5)
Example #27
    def collect_features(self, path):
        labels = hts.load(path)
        features = fe.linguistic_features(
            labels, self.binary_dict, self.continuous_dict,
            add_frame_features=self.add_frame_features,
            subphone_features=self.subphone_features)
        if self.add_frame_features:
            indices = labels.silence_frame_indices().astype(int)
        else:
            indices = labels.silence_phone_indices()
        features = np.delete(features, indices, axis=0)

        return features.astype(np.float32)
Example #28
def test_mono():
    lab_path = join(DATA_DIR, "BASIC5000_0001.lab")
    labels = hts.load(lab_path)
    assert not labels.is_state_alignment_label()

    # Should detect begin/end sil regions
    sil_regex = re.compile("sil")

    for indices in [
            labels.silence_label_indices(sil_regex),
            labels.silence_phone_indices(sil_regex)]:
        assert len(indices) == 2
        assert indices[0] == 0
        assert indices[1] == len(labels) - 1
Example #29
    def collect_features(self, wav_path, label_path):
        fs, x = wavfile.read(wav_path)
        x = x.astype(np.float64)
        if hp_acoustic.use_harvest:
            f0, timeaxis = pyworld.harvest(
                x, fs, frame_period=hp_acoustic.frame_period,
                f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
        else:
            f0, timeaxis = pyworld.dio(
                x, fs, frame_period=hp_acoustic.frame_period,
                f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        if self.alpha is None:
            self.alpha = pysptk.util.mcepalpha(fs)
        mgc = pysptk.sp2mc(spectrogram, order=hp_acoustic.order, alpha=self.alpha)
        f0 = f0[:, None]
        lf0 = f0.copy()
        nonzero_indices = np.nonzero(f0)
        lf0[nonzero_indices] = np.log(f0[nonzero_indices])
        if hp_acoustic.use_harvest:
            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
            vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
        else:
            vuv = (lf0 != 0).astype(np.float32)
        lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

        # Parameter trajectory smoothing
        if hp_acoustic.mod_spec_smoothing:
            hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
            modfs = fs / hop_length
            mgc = P.modspec_smoothing(
                mgc, modfs, cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

        mgc = P.delta_features(mgc, hp_acoustic.windows)
        lf0 = P.delta_features(lf0, hp_acoustic.windows)
        bap = P.delta_features(bap, hp_acoustic.windows)

        features = np.hstack((mgc, lf0, vuv, bap))

        # Cut silence frames by HTS alignment
        labels = hts.load(label_path)
        features = features[:labels.num_frames()]
        indices = labels.silence_frame_indices()
        features = np.delete(features, indices, axis=0)

        return features.astype(np.float32)
Example #30
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate
    filename = os.path.basename(wav_path).replace('.wav', '')

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    # Librosa trim seems to cut off the ending part of speech
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Save trimmed wav
    save_wav_path = re.sub('wav48', 'wav_trim_22050', wav_path)
    dir = os.path.dirname(save_wav_path)
    if not os.path.exists(dir):
        os.system('mkdir {} -p'.format(dir))
    audio.save_wav(wav, save_wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = '{}-spec.npy'.format(filename)
    mel_filename = '{}-mel.npy'.format(filename)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
Example #31
def tts_from_label(models, label_path, X_min, X_max, Y_mean, Y_std,
                   post_filter=False,
                   apply_duration_model=True, coef=1.4, fs=16000,
                   mge_training=True):
    duration_model, acoustic_model = models["duration"], models["acoustic"]

    if use_cuda:
        duration_model = duration_model.cuda()
        acoustic_model = acoustic_model.cuda()

    # Predict durations
    if apply_duration_model:
        duration_modified_hts_labels = gen_duration(
            label_path, duration_model, X_min, X_max, Y_mean, Y_std)
    else:
        duration_modified_hts_labels = hts.load(label_path)

    # Linguistic features
    linguistic_features = fe.linguistic_features(
        duration_modified_hts_labels,
        binary_dict, continuous_dict,
        add_frame_features=hp_acoustic.add_frame_features,
        subphone_features=hp_acoustic.subphone_features)
    # Trim silences
    indices = duration_modified_hts_labels.silence_frame_indices()
    linguistic_features = np.delete(linguistic_features, indices, axis=0)

    # Apply normalization
    ty = "acoustic"
    linguistic_features = P.minmax_scale(
        linguistic_features, X_min[ty], X_max[ty], feature_range=(0.01, 0.99))

    # Predict acoustic features
    acoustic_model.eval()
    x = Variable(torch.from_numpy(linguistic_features)).float()
    xl = len(x)
    x = x.view(1, -1, x.size(-1))
    x = _generator_input(hp_duration, x)
    x = x.cuda() if use_cuda else x
    acoustic_predicted = acoustic_model(x, [xl]).data.cpu().numpy()
    acoustic_predicted = acoustic_predicted.reshape(-1, acoustic_predicted.shape[-1])

    return gen_waveform(acoustic_predicted, Y_mean, Y_std, post_filter,
                        coef=coef, fs=fs, mge_training=mge_training)
Example #32
def gen_duration(label_path, duration_model, X_min, X_max, Y_mean, Y_std):
    # Linguistic features for duration
    hts_labels = hts.load(label_path)
    duration_linguistic_features = fe.linguistic_features(
        hts_labels,
        binary_dict, continuous_dict,
        add_frame_features=hp_duration.add_frame_features,
        subphone_features=hp_duration.subphone_features).astype(np.float32)

    # Apply normalization
    ty = "duration"
    duration_linguistic_features = P.minmax_scale(
        duration_linguistic_features,
        X_min[ty], X_max[ty], feature_range=(0.01, 0.99))

    # Apply model
    duration_model.eval()

    x = Variable(torch.from_numpy(duration_linguistic_features)).float()
    xl = len(x)
    x = x.view(1, -1, x.size(-1))
    x = _generator_input(hp_duration, x)
    x = x.cuda() if use_cuda else x
    duration_predicted = duration_model(x, [xl]).data.cpu().numpy()
    duration_predicted = duration_predicted.reshape(-1, duration_predicted.shape[-1])

    # Apply denormalization
    duration_predicted = P.inv_scale(duration_predicted, Y_mean[ty], Y_std[ty])
    duration_predicted = np.round(duration_predicted)

    # Set minimum state duration to 1
    #  print(duration_predicted)
    duration_predicted[duration_predicted <= 0] = 1
    hts_labels.set_durations(duration_predicted)

    return hts_labels
Example #33
 def collect_features(self, path):
     labels = hts.load(path)
     features = fe.duration_features(labels)
     indices = labels.silence_phone_indices()
     features = np.delete(features, indices, axis=0)
     return features.astype(np.float32)
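Most of the collect_features methods above belong to nnmnkwii FileDataSource subclasses. A minimal, self-contained sketch of how such a source plugs into FileSourceDataset; the class name, label directory and glob pattern are assumptions:

from glob import glob
from os.path import join

import numpy as np
from nnmnkwii.datasets import FileDataSource, FileSourceDataset
from nnmnkwii.frontend import merlin as fe
from nnmnkwii.io import hts


class DurationFeatureSource(FileDataSource):
    def __init__(self, lab_dir):
        self.lab_dir = lab_dir

    def collect_files(self):
        return sorted(glob(join(self.lab_dir, "*.lab")))

    def collect_features(self, path):
        # Same recipe as the duration examples above: load labels,
        # extract per-phoneme durations, drop silence phones.
        labels = hts.load(path)
        features = fe.duration_features(labels)
        indices = labels.silence_phone_indices()
        return np.delete(features, indices, axis=0).astype(np.float32)


dataset = FileSourceDataset(DurationFeatureSource("label_phone_align"))
x = dataset[0]  # per-utterance duration feature matrix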