示例#1
0
def test_extract_input_with_dataset(
    sampling_length: int,
    f0_path: Path,
    phoneme_path: Path,
    phoneme_list_path: Path,
    silence_path: Path,
    spectrogram_path: Path,
    volume_path: Path,
    f0_process_mode: F0ProcessMode,
    time_mask_max_second: float,
    time_mask_num: int,
):
    """Smoke-test ``FeatureDataset.extract_input`` on features loaded from disk.

    Every feature file is loaded first and then forwarded, together with the
    sampling options, to ``FeatureDataset.extract_input``. The return value is
    not inspected; the test only requires that the call does not raise.
    """
    # Load everything up front, in a fixed order, before extraction runs.
    f0_data = SamplingData.load(f0_path)
    phoneme_data = SamplingData.load(phoneme_path)
    phoneme_list_data = JvsPhoneme.load_julius_list(phoneme_list_path)
    silence_data = SamplingData.load(silence_path)
    spec_data = SamplingData.load(spectrogram_path)
    volume = SamplingData.load(volume_path)

    extract_kwargs = {
        "sampling_length": sampling_length,
        "f0_data": f0_data,
        "phoneme_data": phoneme_data,
        "spec_data": spec_data,
        "silence_data": silence_data,
        "phoneme_list_data": phoneme_list_data,
        "volume_data": volume,
        "f0_process_mode": f0_process_mode,
        "time_mask_max_second": time_mask_max_second,
        "time_mask_num": time_mask_num,
    }
    FeatureDataset.extract_input(**extract_kwargs)
示例#2
0
 def generate(self):
     """Load the wave, silence, f0 and phoneme files and wrap them in an ``Input``."""
     wave = Wave.load(self.path_wave)
     silence = SamplingData.load(self.path_silence)
     f0 = SamplingData.load(self.path_f0)
     phoneme = SamplingData.load(self.path_phoneme)
     return Input(wave=wave, silence=silence, f0=f0, phoneme=phoneme)
示例#3
0
 def generate(self):
     """Load the configured feature files and wrap them in an ``Input``.

     ``phoneme_list`` and ``volume`` are optional: each is ``None`` when its
     path attribute is ``None``.
     """
     f0 = SamplingData.load(self.f0_path)
     phoneme = SamplingData.load(self.phoneme_path)
     spec = SamplingData.load(self.spec_path)
     silence = SamplingData.load(self.silence_path)

     phoneme_list = None
     if self.phoneme_list_path is not None:
         phoneme_list = self.phoneme_class.load_julius_list(
             self.phoneme_list_path)

     volume = None
     if self.volume_path is not None:
         volume = SamplingData.load(self.volume_path)

     return Input(
         f0=f0,
         phoneme=phoneme,
         spec=spec,
         silence=silence,
         phoneme_list=phoneme_list,
         volume=volume,
     )
示例#4
0
def convert_f0(
    model_config: Path,
    input_glob: str,
    input_f0_statistics: Path,
    target_f0_statistics: Path,
    output_dir: Path,
):
    """Shift the f0 column of every matched feature file and save the result.

    For each file matched by ``input_glob`` the f0 column is offset by
    ``target_stat["mean"] - input_stat["mean"]`` and the data is written to
    ``output_dir`` under the same stem with a ``.npy`` suffix.

    Args:
        model_config: YAML file from which the network config is read.
        input_glob: Glob pattern selecting the feature files to convert.
        input_f0_statistics: ``.npy`` file holding a pickled mapping with a
            ``"mean"`` entry for the source.
        target_f0_statistics: Same format, for the target.
        output_dir: Directory receiving ``arguments.yaml`` and the converted
            files; created if missing (parents are not created).

    Raises:
        ValueError: If a file's feature width matches neither supported layout.
    """
    output_dir.mkdir(exist_ok=True)
    # Must run before any other local is bound so that only the call
    # arguments are recorded.
    save_arguments(output_dir / "arguments.yaml", convert_f0, locals())

    # Close the config file deterministically instead of leaking the handle.
    with model_config.open() as config_file:
        config = Config.from_dict(yaml.safe_load(config_file))

    # NOTE: allow_pickle=True can execute arbitrary code while unpickling;
    # only point these arguments at trusted statistics files.
    input_stat = numpy.load(input_f0_statistics, allow_pickle=True).item()
    target_stat = numpy.load(target_f0_statistics, allow_pickle=True).item()
    mean_offset = target_stat["mean"] - input_stat["mean"]

    paths = list(map(Path, glob(input_glob)))

    for p in tqdm(paths, desc="convert_f0"):
        data = SamplingData.load(p)

        feature_width = data.array.shape[1]
        if feature_width == (config.network.voiced_feature_size + 1 +
                             config.network.phoneme_feature_size):
            # Layout described by the config: f0 follows the voiced features.
            f0_index = config.network.voiced_feature_size
        elif feature_width == (1 + 1 + 40):
            # Hard-coded alternative layout of width 42 with f0 at column 1.
            f0_index = 1
        else:
            raise ValueError(feature_width)

        data.array[:, f0_index] += mean_offset
        data.save(output_dir / (p.stem + ".npy"))
示例#5
0
def process_wo_context(
    local_paths: Sequence[Path],
    speaker_nums: Optional[Sequence[int]],
    generator: Generator,
    postfix="_woc",
):
    """Generate one wave per local-feature file and save it as ``<stem><postfix>.wav``.

    Each local array is cropped, or edge-padded along its first axis, to
    ``int((time_length + 5) * rate)`` rows, where ``rate`` is taken from the
    first loaded file. The arrays are then stacked and passed to
    ``generator.generate``.

    ``time_length``, ``sampling_policy`` and ``output_dir`` are not parameters;
    they are resolved from the enclosing/module scope at call time.

    Failures are best-effort: any ``Exception`` is printed with its traceback
    and the function returns normally. ``KeyboardInterrupt`` and ``SystemExit``
    are deliberately not swallowed so the run can still be aborted.
    """
    try:
        local_datas = [
            SamplingData.load(local_path) for local_path in local_paths
        ]
        size = int((time_length + 5) * local_datas[0].rate)
        local_arrays = [
            local_data.array[:size]
            if len(local_data.array) >= size else np.pad(
                local_data.array,
                ((0, size - len(local_data.array)), (0, 0)),
                mode="edge",
            ) for local_data in local_datas
        ]

        waves = generator.generate(
            time_length=time_length,
            sampling_policy=sampling_policy,
            num_generate=len(local_arrays),
            local_array=np.stack(local_arrays),
            speaker_nums=speaker_nums,
        )
        for wave, local_path in zip(waves, local_paths):
            wave.save(output_dir / (local_path.stem + postfix + ".wav"))
    except Exception:
        # A bare ``except:`` would also trap Ctrl-C and interpreter shutdown.
        import traceback

        traceback.print_exc()
示例#6
0
    def __getitem__(self, i):
        """Return one randomly chosen vowel-selected input row and its speaker number.

        The input array is truncated to the length of the vowel array and then
        indexed by it (presumably a boolean mask - confirm against the data).
        One of the selected rows is drawn at random.
        """
        data = self.datas[i]
        features = SamplingData.load(data.input_path).array
        vowel_mask = numpy.squeeze(SamplingData.load(data.vowel_path).array)
        target = data.speaker_num

        n_vowel, n_features = len(vowel_mask), len(features)
        assert n_vowel <= n_features, \
            f'{data.input_path.stem} cannot be processed.'
        # A large length gap is only reported, not treated as an error.
        if abs(n_vowel - n_features) >= 10:
            warn(f'{data.input_path.stem} is not matched.')

        selected = features[:n_vowel][vowel_mask]
        pick = numpy.random.randint(len(selected))

        example = dict(input=selected[pick], target=target)
        return default_convert(example)
示例#7
0
    def generate(self):
        """Load the wave, silence and local features and wrap them in an ``Input``.

        If loading ``self.path_local`` fails, a log mel-spectrogram is computed
        from the wave at rate 80 and used as the local feature. It is also
        saved to a temporary ``.npy`` file (created with ``delete=False``, so
        it is not removed automatically) and ``self.path_local`` is repointed
        at that file.
        """
        wave = Wave.load(self.path_wave)

        try:
            local = SamplingData.load(self.path_local)
        except Exception:
            # Only real load failures trigger the fallback; a bare ``except:``
            # would also trap KeyboardInterrupt / SystemExit and start an
            # expensive feature computation instead of stopping.
            local_rate = 80
            local_array = to_log_melspectrogram(wave=wave, rate=local_rate)
            local = SamplingData(array=local_array, rate=local_rate)

            with NamedTemporaryFile(suffix=".npy", delete=False) as f:
                self.path_local = Path(f.name)
                local.save(self.path_local)

        return Input(
            wave=wave,
            silence=SamplingData.load(self.path_silence),
            local=local,
        )
示例#8
0
def process_local_data(local_paths: Sequence[Path], time_length: float):
    """Load local-feature files and fit each array to a common number of rows.

    The target length is ``int((time_length + 1) * rate)``, with ``rate`` taken
    from the first loaded file. Arrays at least that long are cropped; shorter
    ones are edge-padded at the end of their first axis.

    Returns:
        A list with one fitted array per path, in input order.
    """
    loaded = [SamplingData.load(path) for path in local_paths]
    size = int((time_length + 1) * loaded[0].rate)

    fitted = []
    for item in loaded:
        array = item.array
        shortage = size - len(array)
        if shortage <= 0:
            fitted.append(array[:size])
        else:
            # Repeat the last row to make up the missing length.
            fitted.append(np.pad(array, ((0, shortage), (0, 0)), mode="edge"))
    return fitted
def process(args: Tuple[int, Path], sampling_lengths: Sequence[int]):
    """Compute, per sampling length, the largest windowed sum of the inverted data.

    The loaded array is squeezed and bitwise-inverted. For every
    ``sampling_length`` it is convolved with a window of ones in ``'valid'``
    mode (a sliding sum) and the maximum is stored.

    Args:
        args: ``(i_data, path)``; ``i_data`` is passed through unchanged.
        sampling_lengths: Window sizes to evaluate.

    Returns:
        ``(i_data, vector)``, where ``vector`` is an ``int32`` array with one
        entry per sampling length.
    """
    i_data, path = args
    vector = numpy.empty(len(sampling_lengths), dtype=numpy.int32)

    inverted = ~numpy.squeeze(SamplingData.load(path).array)
    for i_length, sampling_length in enumerate(sampling_lengths):
        window = numpy.ones(sampling_length, dtype=numpy.int32)
        windowed_sums = numpy.convolve(window, inverted, mode='valid')
        vector[i_length] = windowed_sums.max()

    return i_data, vector
示例#10
0
 def generate(self):
     """Load the phoneme list, accent flags, f0 and optional volume into an ``Input``.

     Each accent file is read as whitespace-separated integers and converted
     to a numpy array of booleans. ``volume`` is ``None`` when
     ``self.volume_path`` is ``None``.
     """
     def read_flags(path):
         # Whitespace-separated integer tokens -> array of booleans.
         return numpy.array([bool(int(s)) for s in path.read_text().split()])

     phoneme_list = self.phoneme_class.load_julius_list(
         self.phoneme_list_path)
     start_accent_list = read_flags(self.start_accent_list_path)
     end_accent_list = read_flags(self.end_accent_list_path)
     start_accent_phrase_list = read_flags(
         self.start_accent_phrase_list_path)
     end_accent_phrase_list = read_flags(self.end_accent_phrase_list_path)
     f0 = SamplingData.load(self.f0_path)

     volume = None
     if self.volume_path is not None:
         volume = SamplingData.load(self.volume_path)

     return Input(
         phoneme_list=phoneme_list,
         start_accent_list=start_accent_list,
         end_accent_list=end_accent_list,
         start_accent_phrase_list=start_accent_phrase_list,
         end_accent_phrase_list=end_accent_phrase_list,
         f0=f0,
         volume=volume,
     )
示例#11
0
def process(
    generator: Generator,
    local_paths: Sequence[Path],
    local_sampling_rate: Optional[int],
    time_length: float,
    speaker_nums: Optional[Sequence[int]],
    sampling_policy: SamplingPolicy,
    output_dir: Path,
    postfix="",
):
    """Generate one wave per local-feature file and save it as ``<stem><postfix>.wav``.

    Args:
        generator: Object whose ``generate`` method produces the waves.
        local_paths: Local-feature files, one per wave to generate.
        local_sampling_rate: If given, every local feature is resampled to
            this rate; otherwise the arrays are used as loaded.
        time_length: Forwarded to ``generator.generate``; also determines how
            many local rows are kept (``time_length + 5`` seconds' worth).
        speaker_nums: Forwarded to ``generator.generate``.
        sampling_policy: Forwarded to ``generator.generate``.
        output_dir: Directory the ``.wav`` files are written to.
        postfix: Text inserted between the input stem and ``.wav``.
    """
    local_datas = [SamplingData.load(local_path) for local_path in local_paths]

    if local_sampling_rate is None:
        rate = local_datas[0].rate
        local_arrays = [data.array for data in local_datas]
    else:
        rate = local_sampling_rate
        local_arrays = [data.resample(rate) for data in local_datas]

    # Use the rate the arrays actually have now. The previous code always used
    # the pre-resampling rate, so the crop/pad length was wrong whenever
    # ``local_sampling_rate`` differed from the stored rate.
    size = int((time_length + 5) * rate)
    local_arrays = [
        array[:size] if len(array) >= size else numpy.pad(
            array,
            ((0, size - len(array)), (0, 0)),
            mode="edge",
        ) for array in local_arrays
    ]

    waves = generator.generate(
        time_length=time_length,
        sampling_policy=sampling_policy,
        num_generate=len(local_arrays),
        local_array=numpy.stack(local_arrays),
        speaker_nums=speaker_nums,
    )
    for wave, local_path in zip(waves, local_paths):
        wave.save(output_dir / (local_path.stem + postfix + ".wav"))
示例#12
0
 def generate(self):
     """Load the f0, phoneme and silence files and wrap them in an ``Input``."""
     f0 = SamplingData.load(self.f0_path)
     phoneme = SamplingData.load(self.phoneme_path)
     silence = SamplingData.load(self.silence_path)
     return Input(f0=f0, phoneme=phoneme, silence=silence)
示例#13
0
 def generate(self):
     """Load the wave, silence and local-feature files and wrap them in an ``Input``."""
     wave = Wave.load(self.path_wave)
     silence = SamplingData.load(self.path_silence)
     local = SamplingData.load(self.path_local)
     return Input(wave=wave, silence=silence, local=local)
示例#14
0
def generate(
    model_dir: Path,
    model_iteration: Optional[int],
    model_config: Optional[Path],
    output_dir: Path,
    batch_size: int,
    num_test: int,
    from_train_data: bool,
    time_second: float,
    val_local_glob: str,
    val_speaker_id: Optional[int],
    noise_schedule_start: float,
    noise_schedule_stop: float,
    noise_schedule_num: int,
    use_gpu: bool,
):
    """Generate waves from dataset samples and, optionally, from extra local files.

    Two passes write ``<stem>.wav`` files into ``output_dir``:

    1. Items of the "test" dataset (or "train" when ``from_train_data`` is
       true) are generated in batches and named after the first ``num_test``
       wave paths of that dataset.
    2. If ``val_local_glob`` is not ``None``, every matching local-feature
       file is loaded, cropped or edge-padded to
       ``time_second + 2 * local_padding_second`` seconds, and generated with
       ``val_speaker_id`` (no speaker input when that is ``None``).

    Args:
        model_dir: Directory searched for the predictor model; also the
            default location of ``config.yaml``.
        model_iteration: Passed to ``_get_predictor_model_path``.
        model_config: Config YAML; defaults to ``model_dir / "config.yaml"``.
        output_dir: Output directory; created if missing (parents are not).
        batch_size: Number of items generated per call.
        num_test: Number of dataset wave paths used for naming outputs.
        from_train_data: Use the "train" split instead of "test".
        time_second: Length in seconds used to derive the sampling length.
        val_local_glob: Glob for extra local-feature files, or ``None``.
        val_speaker_id: Speaker id used for the extra files.
        noise_schedule_start: ``start`` of the noise schedule config.
        noise_schedule_stop: ``stop`` of the noise schedule config.
        noise_schedule_num: ``num`` of the noise schedule config.
        use_gpu: Passed to ``Generator``.

    Raises:
        Exception: If the created dataset is neither a ``SpeakerWavesDataset``
            nor a ``WavesDataset``.
    """
    output_dir.mkdir(exist_ok=True)
    # Must run before any other local is bound so that only the call
    # arguments are recorded.
    save_arguments(output_dir / "arguments.yaml", generate, locals())

    if model_config is None:
        model_config = model_dir / "config.yaml"
    # Close the config file deterministically instead of leaking the handle.
    with model_config.open() as config_file:
        config = Config.from_dict(yaml.safe_load(config_file))

    model_path = _get_predictor_model_path(
        model_dir=model_dir,
        iteration=model_iteration,
    )
    print("model path: ", model_path)
    generator = Generator(
        config=config,
        noise_schedule_config=NoiseScheduleModelConfig(
            start=noise_schedule_start,
            stop=noise_schedule_stop,
            num=noise_schedule_num),
        predictor=model_path,
        sampling_rate=config.dataset.sampling_rate,
        use_gpu=use_gpu,
    )

    local_padding_second = 1
    local_padding_length = config.dataset.sampling_rate * local_padding_second

    # Override the dataset config so created items span ``time_second``.
    config.dataset.sampling_length = int(config.dataset.sampling_rate *
                                         time_second)
    config.dataset.local_padding_length = local_padding_length
    dataset = create_dataset(
        config.dataset)["test" if not from_train_data else "train"]

    if isinstance(dataset, SpeakerWavesDataset):
        wave_paths = [
            item.path_wave for item in dataset.wave_dataset.inputs[:num_test]
        ]
    elif isinstance(dataset, WavesDataset):
        wave_paths = [item.path_wave for item in dataset.inputs[:num_test]]
    else:
        raise Exception(
            f"unsupported dataset type: {type(dataset).__name__}")

    # zip() stops at the shorter iterable, so at most ``num_test`` items are
    # generated even though the dataset itself is not truncated.
    for data, wave_path in tqdm(
            zip(chunked(dataset, batch_size), chunked(wave_paths, batch_size)),
            desc="generate",
    ):
        data = concat_examples(data)
        output = generator.generate(
            local=data["local"],
            local_padding_length=local_padding_length,
            speaker_id=data["speaker_id"] if "speaker_id" in data else None,
        )

        for wave, p in zip(output, wave_path):
            wave.save(output_dir / (p.stem + ".wav"))

    # validation
    if val_local_glob is not None:
        local_paths = sorted([Path(p) for p in glob(val_local_glob)])
        speaker_ids = [val_speaker_id] * len(local_paths)
        for local_path, speaker_id in zip(chunked(local_paths, batch_size),
                                          chunked(speaker_ids, batch_size)):
            datas = [SamplingData.load(p) for p in local_path]
            # Requested duration plus padding on both sides, measured at the
            # rate of the first file in the batch.
            size = int(
                (time_second + local_padding_second * 2) * datas[0].rate)
            local = numpy.stack([
                (data.array[:size].T if len(data.array) >= size else numpy.pad(
                    data.array,
                    ((0, size - len(data.array)), (0, 0)),
                    mode="edge",
                ).T) for data in datas
            ])

            output = generator.generate(
                local=local,
                local_padding_length=local_padding_length,
                speaker_id=(numpy.stack(speaker_id)
                            if speaker_id[0] is not None else None),
            )

            for wave, p in zip(output, local_path):
                wave.save(output_dir / (p.stem + ".wav"))
示例#15
0
def main():
    """Plot the predictor's coarse output against the true wave for one sample.

    Settings are read from the module-level ``arguments``. One wave/local
    pair is picked, cropped to ``time_length`` seconds (rounded down to a
    whole number of local frames), and run through the predictor. The
    softmaxed coarse output is drawn as an image with the true and predicted
    curves on top, and the figure is saved as ``output.eps``.
    """
    model_dir: Path = arguments.model_dir
    model_iteration: int = arguments.model_iteration
    model_config: Path = arguments.model_config
    time_length: float = arguments.time_length
    gpu: int = arguments.gpu

    config = create_config(model_config)
    model_path = _get_predictor_model_path(model_dir, model_iteration)
    sampling_rate = config.dataset.sampling_rate

    model = create_predictor(config.model)
    chainer.serializers.load_npz(str(model_path), model)
    if gpu is not None:
        model.to_gpu(gpu)
        cuda.get_device_from_id(gpu).use()

    # Inference only: disable train mode and backprop.
    chainer.global_config.train = False
    chainer.global_config.enable_backprop = False

    wave_paths = sorted(
        Path(p) for p in glob.glob(str(config.dataset.input_wave_glob)))
    local_paths = sorted(
        Path(p) for p in glob.glob(str(config.dataset.input_local_glob)))
    assert len(wave_paths) == len(local_paths)

    # Two generators with the same seed apply the same permutation to the two
    # equally long lists, so index 0 is still a matching pair.
    seed = config.dataset.seed
    np.random.RandomState(seed).shuffle(wave_paths)
    np.random.RandomState(seed).shuffle(local_paths)
    first_wave_path = wave_paths[0]
    first_local_path = local_paths[0]
    wave_data = Wave.load(first_wave_path, sampling_rate=sampling_rate)
    local_data = SamplingData.load(first_local_path)

    # Round the requested length down to a whole number of local frames.
    requested_length = int(sampling_rate * time_length)
    local_scale = int(sampling_rate // local_data.rate)
    local_length = requested_length // local_scale
    wave_length = local_length * local_scale

    w = wave_data.wave[:wave_length]
    local = local_data.array[:local_length]
    coarse, fine = encode_16bit(w)

    xp = model.xp
    c_array = decode_single(xp.asarray(coarse)).astype(np.float32)[np.newaxis]
    f_array = decode_single(
        xp.asarray(fine)).astype(np.float32)[:-1][np.newaxis]
    l_array = xp.asarray(local)[np.newaxis]
    c, f, _, _ = model(c_array=c_array, f_array=f_array, l_array=l_array)

    c = chainer.functions.softmax(c)

    c = chainer.cuda.to_cpu(c[0].data)
    f = chainer.cuda.to_cpu(f[0].data)

    fig = plt.figure(figsize=[32 * time_length, 10])

    plt.imshow(c, aspect='auto', interpolation='nearest')
    plt.colorbar()

    true_curve = (w + 1) * 127.5
    predicted_curve = np.argmax(c, axis=0) + np.argmax(f, axis=0) / 256
    plt.plot(true_curve, 'g', linewidth=0.1, label='true')
    plt.plot(predicted_curve, 'r', linewidth=0.1, label='predicted')
    plt.legend()

    fig.savefig('output.eps')
示例#16
0
 def generate(self):
     """Load the spectrogram and silence data and wrap them in an ``InputData``.

     Each path is wrapped in ``TempCache`` and converted to ``str`` before it
     is loaded.
     """
     spectrogram_file = str(TempCache(self.spectrogram_path))
     spectrogram = SamplingData.load(spectrogram_file)

     silence_file = str(TempCache(self.silence_path))
     silence = SamplingData.load(silence_file)

     return InputData(spectrogram=spectrogram, silence=silence)
示例#17
0
 def generate(self):
     """Load the f0, phoneme and phoneme-list files and wrap them in an ``Input``."""
     f0 = SamplingData.load(self.f0_path)
     phoneme = SamplingData.load(self.phoneme_path)
     phoneme_list = JvsPhoneme.load_julius_list(self.phoneme_list_path)
     return Input(f0=f0, phoneme=phoneme, phoneme_list=phoneme_list)
示例#18
0
def _list_data_paths(
    directory: Path,
    pattern: str,
    data_num: Optional[int],
) -> List[Path]:
    """Return the sorted recursive matches of ``pattern``, truncated to ``data_num``."""
    paths = sorted(directory.rglob(pattern))
    if data_num is not None:
        paths = paths[:data_num]
    return paths


def _read_flags(path: Path) -> List[bool]:
    """Parse a text file of whitespace-separated integers into booleans."""
    return [bool(int(s)) for s in path.read_text().split()]


def create_data(
    f0_dir: Path,
    phoneme_list_dir: Path,
    loudness_dir: Path,
    accent_start_dir: Path,
    accent_end_dir: Path,
    accent_phrase_start_dir: Path,
    accent_phrase_end_dir: Path,
    speaker_valid_filter: Optional[str],
    utterance_valid_filter: Optional[str],
    data_num: Optional[int],
):
    """Load all feature files and split them into training and validation data.

    Files are collected recursively from each directory (``*.npy`` for f0 and
    loudness, ``*.lab`` for phoneme lists, ``*.txt`` for accent flags), sorted,
    and optionally truncated to the first ``data_num`` entries. Files from the
    different directories are paired purely by their position in the sorted
    lists; only the list lengths are checked, not the file names.

    An item goes to the validation set when either filter is not ``None`` and
    occurs as a substring of the item's name (the f0 file stem); every other
    item goes to the training set.

    Returns:
        ``(train_datas, valid_datas)`` as two lists of ``InputData``.

    Raises:
        AssertionError: If no f0 file is found or the directories do not
            contain the same number of files.
    """
    f0_paths = _list_data_paths(f0_dir, "*.npy", data_num)
    assert len(f0_paths) > 0, f"no f0 file found in {f0_dir}"

    phoneme_list_paths = _list_data_paths(phoneme_list_dir, "*.lab", data_num)
    assert len(f0_paths) == len(phoneme_list_paths), \
        "phoneme list file count differs from f0 file count"

    loudness_paths = _list_data_paths(loudness_dir, "*.npy", data_num)
    assert len(f0_paths) == len(loudness_paths), \
        "loudness file count differs from f0 file count"

    accent_start_paths = _list_data_paths(accent_start_dir, "*.txt", data_num)
    assert len(f0_paths) == len(accent_start_paths), \
        "accent start file count differs from f0 file count"

    accent_end_paths = _list_data_paths(accent_end_dir, "*.txt", data_num)
    assert len(f0_paths) == len(accent_end_paths), \
        "accent end file count differs from f0 file count"

    accent_phrase_start_paths = _list_data_paths(
        accent_phrase_start_dir, "*.txt", data_num)
    assert len(f0_paths) == len(accent_phrase_start_paths), \
        "accent phrase start file count differs from f0 file count"

    accent_phrase_end_paths = _list_data_paths(
        accent_phrase_end_dir, "*.txt", data_num)
    assert len(f0_paths) == len(accent_phrase_end_paths), \
        "accent phrase end file count differs from f0 file count"

    datas = [
        InputData(
            name=f0_path.stem,
            f0=SamplingData.load(f0_path),
            phoneme_list=JvsPhoneme.load_julius_list(phoneme_list_path),
            loudness=SamplingData.load(loudness_path),
            accent_start=_read_flags(accent_start_path),
            accent_end=_read_flags(accent_end_path),
            accent_phrase_start=_read_flags(accent_phrase_start_path),
            accent_phrase_end=_read_flags(accent_phrase_end_path),
        ) for (
            f0_path,
            phoneme_list_path,
            loudness_path,
            accent_start_path,
            accent_end_path,
            accent_phrase_start_path,
            accent_phrase_end_path,
        ) in zip(
            f0_paths,
            phoneme_list_paths,
            loudness_paths,
            accent_start_paths,
            accent_end_paths,
            accent_phrase_start_paths,
            accent_phrase_end_paths,
        )
    ]

    train_datas: List[InputData] = []
    valid_datas: List[InputData] = []
    for d in datas:
        matches_speaker = (speaker_valid_filter is not None
                           and speaker_valid_filter in d.name)
        matches_utterance = (utterance_valid_filter is not None
                             and utterance_valid_filter in d.name)
        if matches_speaker or matches_utterance:
            valid_datas.append(d)
        else:
            train_datas.append(d)

    return train_datas, valid_datas