Example #1
def calc_mcd(
    path1: Optional[Path] = None,
    path2: Optional[Path] = None,
    wave1: Optional[Wave] = None,
    wave2: Optional[Wave] = None,
):
    wave1 = Wave.load(path1) if wave1 is None else wave1
    wave2 = Wave.load(path2) if wave2 is None else wave2
    assert wave1.sampling_rate == wave2.sampling_rate

    sampling_rate = wave1.sampling_rate

    min_length = min(len(wave1.wave), len(wave2.wave))
    wave1.wave = wave1.wave[:min_length]
    wave2.wave = wave2.wave[:min_length]

    mc1 = to_melcepstrum(
        x=wave1.wave,
        sampling_rate=sampling_rate,
        n_fft=2048,
        win_length=1024,
        hop_length=256,
        order=24,
    )
    mc2 = to_melcepstrum(
        x=wave2.wave,
        sampling_rate=sampling_rate,
        n_fft=2048,
        win_length=1024,
        hop_length=256,
        order=24,
    )
    return _mcd(mc1, mc2)
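A minimal usage sketch for calc_mcd, assuming two WAV files with the same sampling rate; the file names are placeholders:

from pathlib import Path

# Compare a reference recording against a synthesized one (hypothetical files).
mcd = calc_mcd(path1=Path("reference.wav"), path2=Path("synthesized.wav"))

# Already-loaded Wave objects can be passed instead, skipping disk I/O:
# mcd = calc_mcd(wave1=reference_wave, wave2=synthesized_wave)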
Example #2
    def forward(
        self,
        wave: Tensor,
        local: Tensor,
        speaker_id: Optional[Tensor] = None,
    ):
        batch_size = len(wave)

        local_padding_length = int(self.generator.sampling_rate *
                                   self.local_padding_time_second)

        output = self.generator.generate(
            local=local,
            local_padding_length=local_padding_length,
            speaker_id=speaker_id,
        )

        mcd_list = []
        for wi, wo in zip(wave.cpu().numpy(), output):
            wi = Wave(wave=wi, sampling_rate=wo.sampling_rate)
            try:
                mcd = calc_mcd(wave1=wi, wave2=wo)
            except Exception:
                mcd = numpy.nan
            mcd_list.append(mcd)

        scores = {
            "mcd": (numpy.mean(mcd_list), batch_size),
        }

        report(scores, self)
        return scores
Example #3
    def generate(
        self,
        local: Union[numpy.ndarray, torch.Tensor],
        local_padding_length: int = 0,
        speaker_id: Optional[Union[numpy.ndarray, torch.Tensor]] = None,
    ):
        if isinstance(local, numpy.ndarray):
            local = torch.from_numpy(local)
        local = local.to(self.device)

        if speaker_id is not None:
            if isinstance(speaker_id, numpy.ndarray):
                speaker_id = torch.from_numpy(speaker_id)
            speaker_id = speaker_id.to(self.device)

        with torch.no_grad():
            output = self.inference_forward(
                local=local,
                local_padding_length=local_padding_length,
                speaker_id=speaker_id,
            )

        waves = output.cpu().numpy()
        if self.mulaw:
            waves = decode_mulaw(waves)

        return [Wave(wave=wave, sampling_rate=self.sampling_rate) for wave in waves]
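The numpy-or-tensor normalization repeated above (and again in Example #8) could be factored into a small helper; a sketch, not part of the original code:

import numpy
import torch

def to_device_tensor(x, device):
    # Accept either a numpy array or a torch tensor and move it to the target device.
    if isinstance(x, numpy.ndarray):
        x = torch.from_numpy(x)
    return x.to(device)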
Example #4
    def generate(self):
        return Input(
            wave=Wave.load(self.path_wave),
            silence=SamplingData.load(self.path_silence),
            f0=SamplingData.load(self.path_f0),
            phoneme=SamplingData.load(self.path_phoneme),
        )
Example #5
    def __call__(
        self,
        wave: Tensor,
        silence: Tensor,
        local: Tensor,
        source: Tensor,
        source2: Tensor,
        signal: Tensor,
        speaker_id: Optional[Tensor] = None,
    ):
        batch_size = len(wave)

        local_padding_length = int(self.generator.sampling_rate *
                                   self.local_padding_time_length)

        output = self.generator.generate(
            local=local,
            source=source,
            speaker_id=speaker_id,
            local_padding_length=local_padding_length,
        )

        mcd_list = []
        for wi, wo in zip(wave.cpu().numpy(), output):
            wi = Wave(wave=wi, sampling_rate=wo.sampling_rate)
            mcd = calc_mcd(wave1=wi, wave2=wo)
            mcd_list.append(mcd)

        scores = {
            "mcd": (numpy.mean(mcd_list), batch_size),
        }

        report(scores, self)
        return scores
Example #6
    def __getitem__(self, i: int):
        sampling_rate = self.sampling_rate
        length = self.sampling_length
        frequency = numpy.random.uniform(self.frequency_range[0],
                                         self.frequency_range[1])
        rand = numpy.random.rand()

        wave = numpy.sin(
            (2 * numpy.pi) * (numpy.arange(length, dtype=numpy.float32) *
                              frequency / sampling_rate + rand))

        local = numpy.log(
            numpy.ones(shape=(length // self.local_scale, 1),
                       dtype=numpy.float32) * frequency)

        silence = numpy.zeros(shape=(length, ), dtype=bool)  # numpy.bool was removed in modern NumPy; plain bool is equivalent

        return default_convert(
            self.make_input(
                wave_data=Wave(wave=wave, sampling_rate=sampling_rate),
                silence_data=SamplingData(array=silence, rate=sampling_rate),
                local_data=SamplingData(array=local,
                                        rate=sampling_rate //
                                        self.local_scale),
            ))
Example #7
    def __call__(
        self,
        wave: np.ndarray,
        local: Optional[np.ndarray],
        speaker_num: Optional[np.ndarray] = None,
    ):
        batchsize = len(wave)
        wave = chainer.cuda.to_cpu(wave)

        wave_output = self.generator.generate(
            time_length=self.time_length + self.local_padding_time_length * 2,
            sampling_policy=self.sampling_policy,
            num_generate=batchsize,
            local_array=local,
            speaker_nums=speaker_num,
        )

        mcd_list = []
        for wi, wo in zip(wave, wave_output):
            wi = Wave(wave=wi, sampling_rate=wo.sampling_rate)

            if self.local_padding_time_length > 0:
                pad = int(wo.sampling_rate * self.local_padding_time_length)
                wo.wave = wo.wave[pad:-pad]

            mcd = calc_mcd(wave1=wi, wave2=wo)
            mcd_list.append(mcd)

        scores = {"mcd": (self.generator.xp.asarray(mcd_list).mean(), batchsize)}

        chainer.report(scores, self)
        return scores
Example #8
    def generate(
        self,
        local: Union[numpy.ndarray, torch.Tensor],
        source: Union[numpy.ndarray, torch.Tensor],
        speaker_id: Optional[Union[numpy.ndarray, torch.Tensor]] = None,
        local_padding_length: int = 0,
    ):
        if isinstance(local, numpy.ndarray):
            local = torch.from_numpy(local)
        local = local.to(self.device)

        if isinstance(source, numpy.ndarray):
            source = torch.from_numpy(source)
        source = source.to(self.device)

        if speaker_id is not None:
            if isinstance(speaker_id, numpy.ndarray):
                speaker_id = torch.from_numpy(speaker_id)
            speaker_id = speaker_id.to(self.device)

        # generate
        with torch.no_grad():
            output = self.predictor(
                source=source,
                local=local,
                local_padding_length=local_padding_length,
                speaker_id=speaker_id,
            )

        output = output.cpu().numpy()
        return [Wave(wave=o, sampling_rate=self.sampling_rate) for o in output]
Example #9
    def test_convert_to_dict(self):
        sampling_rate = 800
        local_sampling_rate = 200
        scale = sampling_rate // local_sampling_rate
        time_length = 10
        sampling_length = 16

        wave_data = Wave(
            wave=numpy.linspace(
                0,
                sampling_rate * time_length,
                sampling_rate * time_length,
                endpoint=False,
            ),
            sampling_rate=sampling_rate,
        )
        silence_data = SamplingData(
            array=numpy.zeros((sampling_rate * time_length, ), dtype=bool),
            rate=sampling_rate,
        )
        local_data = SamplingData(
            array=numpy.linspace(
                0,
                sampling_rate * time_length,
                local_sampling_rate * time_length,
                endpoint=False,
            ),
            rate=local_sampling_rate,
        )

        wave, silence, local = BaseWaveDataset.extract_input(
            sampling_length,
            wave_data=wave_data,
            silence_data=silence_data,
            local_data=local_data,
            local_sampling_rate=local_sampling_rate,
            local_padding_size=0,
            local_mask_max_second=0,
            local_mask_num=0,
        )

        dataset = BaseWaveDataset(
            sampling_rate=sampling_rate,
            sampling_length=sampling_length,
            bit=10,
            mulaw=False,
            wave_random_max_second=0,
            wave_random_num=0,
            local_sampling_rate=local_sampling_rate,
            local_padding_size=0,
            local_mask_max_second=0,
            local_mask_num=0,
        )
        d = dataset.convert_input(wave, silence, local)
        self.assertEqual(len(d["coarse"]), sampling_length)
        self.assertEqual(len(d["encoded_coarse"]), sampling_length)
        self.assertEqual(len(d["silence"]), sampling_length - 1)
        self.assertEqual(len(d["local"]), sampling_length // scale)
Example #10
    def test_extract_input(self):
        for sampling_rate, local_sampling_rate, sampling_length, time_length in [
            [800, 200, 16, 10],
            [24000, 24000 / 256, 1024, 100],
        ]:
            with self.subTest(
                    sampling_rate=sampling_rate,
                    local_sampling_rate=local_sampling_rate,
                    sampling_length=sampling_length,
                    time_length=time_length,
            ):
                scale = sampling_rate // local_sampling_rate

                wave_data = Wave(
                    wave=numpy.linspace(
                        0,
                        int(sampling_rate * time_length),
                        int(sampling_rate * time_length),
                        endpoint=False,
                    ),
                    sampling_rate=sampling_rate,
                )
                silence_data = SamplingData(
                    array=numpy.zeros((sampling_rate * time_length, ),
                                      dtype=bool),
                    rate=sampling_rate,
                )
                local_data = SamplingData(
                    array=numpy.linspace(
                        0,
                        int(sampling_rate * time_length),
                        int(local_sampling_rate * time_length),
                        endpoint=False,
                    ),
                    rate=local_sampling_rate,
                )

                for _ in range(10):
                    wave, silence, local = BaseWaveDataset.extract_input(
                        sampling_length,
                        wave_data=wave_data,
                        silence_data=silence_data,
                        local_data=local_data,
                        local_sampling_rate=local_sampling_rate,
                        local_padding_size=0,
                        local_mask_max_second=0,
                        local_mask_num=0,
                    )

                    self.assertEqual(len(wave), sampling_length)
                    self.assertEqual(len(silence), sampling_length)
                    self.assertEqual(len(local), sampling_length // scale)

                    wave_as_local = wave.reshape(int(sampling_length // scale),
                                                 -1).min(axis=1)
                    self.assertTrue(numpy.all(wave_as_local == local))
Example #11
def _process(path: Path, bit: int, gaussian_noise_sigma: float):
    wave = Wave.load(path).wave

    if gaussian_noise_sigma > 0:
        wave += numpy.random.randn(*wave.shape) * gaussian_noise_sigma

    encoded = encode_single(encode_mulaw(wave, mu=2 ** bit), bit=bit)
    return numpy.histogram(encoded, bins=2 ** bit, range=(0, 2 ** bit))[0].astype(
        numpy.uint64
    )
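For reference, a sketch of the standard mu-law companding and quantization that encode_mulaw and encode_single presumably implement; the actual helpers in this codebase may differ in rounding and clipping details:

import numpy

def mulaw_sketch(x: numpy.ndarray, mu: float) -> numpy.ndarray:
    # mu-law companding: y = sign(x) * log(1 + mu*|x|) / log(1 + mu),
    # which spends more of the [-1, 1] range on small amplitudes.
    return numpy.sign(x) * numpy.log1p(mu * numpy.abs(x)) / numpy.log1p(mu)

def quantize_sketch(y: numpy.ndarray, bit: int) -> numpy.ndarray:
    # Map [-1, 1] onto 2**bit integer bins, matching the histogram range above.
    return numpy.clip(((y + 1) / 2 * 2 ** bit).astype(numpy.int64), 0, 2 ** bit - 1)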
Example #12
def calc_silence_rate(
    path1: Optional[Path] = None,
    path2: Optional[Path] = None,
    wave1: Optional[Wave] = None,
    wave2: Optional[Wave] = None,
):
    wave1 = Wave.load(path1) if wave1 is None else wave1
    wave2 = Wave.load(path2) if wave2 is None else wave2
    assert wave1.sampling_rate == wave2.sampling_rate

    silence1 = ~librosa.effects._signal_to_frame_nonsilent(wave1.wave)
    silence2 = ~librosa.effects._signal_to_frame_nonsilent(wave2.wave)

    tp = numpy.logical_and(silence1, silence2).sum(dtype=float)
    tn = numpy.logical_and(~silence1, ~silence2).sum(dtype=float)
    fn = numpy.logical_and(silence1, ~silence2).sum(dtype=float)
    fp = numpy.logical_and(~silence1, silence2).sum(dtype=float)

    accuracy = (tp + tn) / (tp + tn + fn + fp)
    return accuracy
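calc_silence_rate treats frame-level silence detection as binary classification (note that _signal_to_frame_nonsilent is a private librosa helper, used internally by librosa.effects.split) and reports plain accuracy over the confusion matrix. A worked toy example:

import numpy

silence1 = numpy.array([True, True, False, False])   # reference frames
silence2 = numpy.array([True, False, False, False])  # frames under test

tp = numpy.logical_and(silence1, silence2).sum(dtype=float)    # 1.0: both silent
tn = numpy.logical_and(~silence1, ~silence2).sum(dtype=float)  # 2.0: both non-silent
fn = numpy.logical_and(silence1, ~silence2).sum(dtype=float)   # 1.0: missed silence
fp = numpy.logical_and(~silence1, silence2).sum(dtype=float)   # 0.0: spurious silence

accuracy = (tp + tn) / (tp + tn + fn + fp)  # 3 / 4 = 0.75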
Example #13
    def setUp(self):
        waves = [
            np.ones(self.num // 2) * -1,
            np.ones(self.num // 2),
        ]
        self.inputs = [
            Input(
                wave=Wave(wave=w, sampling_rate=self.sampling_rate),
                local=SamplingData(array=np.empty((len(w), 0)),
                                   rate=self.sampling_rate),
                silence=SamplingData(array=np.zeros((len(w), ), dtype=bool),
                                     rate=self.sampling_rate),
            ) for w in waves
        ]
Example #14
    def generate(self):
        wave = Wave.load(self.path_wave)

        try:
            local = SamplingData.load(self.path_local)
        except Exception:
            local_rate = 80
            local_array = to_log_melspectrogram(wave=wave, rate=local_rate)
            local = SamplingData(array=local_array, rate=local_rate)

            with NamedTemporaryFile(suffix=".npy", delete=False) as f:
                self.path_local = Path(f.name)
                local.save(self.path_local)

        return Input(
            wave=wave,
            silence=SamplingData.load(self.path_silence),
            local=local,
        )
Example #15
    def __call__(
        self,
        wave: Tensor,
        local: Optional[Tensor],
        speaker_num: Optional[Tensor] = None,
    ):
        batchsize = len(wave)

        wave_output = self.generator.generate(
            time_length=self.time_length + self.local_padding_time_length * 2,
            sampling_policy=self.sampling_policy,
            num_generate=batchsize,
            local_array=local,
            speaker_nums=speaker_num,
        )

        mcd_list = []
        sil_acc_list = []
        for wi, wo in zip(wave.cpu().numpy(), wave_output):
            wi = Wave(wave=wi, sampling_rate=wo.sampling_rate)

            if self.local_padding_time_length > 0:
                pad = int(wo.sampling_rate * self.local_padding_time_length)
                wo.wave = wo.wave[pad:-pad]

            mcd = calc_mcd(wave1=wi, wave2=wo)
            mcd_list.append(mcd)

            accuracy = calc_silence_rate(wave1=wi, wave2=wo)
            sil_acc_list.append(accuracy)

        scores = {
            "mcd": (numpy.mean(mcd_list), batchsize),
            "sil_acc": (numpy.mean(sil_acc_list), batchsize),
        }

        report(scores, self)
        return scores
Example #16
def process(
    input_paths: Tuple[Path, Path],
    output_dir: Path,
):
    input_wave, input_f0 = input_paths

    wave_data = Wave.load(input_wave)
    f0_data = F0.load(input_f0)

    y = wave_data.wave.astype(np.float64)
    sr = wave_data.sampling_rate

    f0 = np.exp(f0_data.array[:, 0].astype(np.float64))
    if f0_data.with_vuv:
    f0[~f0_data.array[:, 1].astype(bool)] = 0  # the vuv column is stored as float; cast before inverting

    t = np.arange(0, len(f0), dtype=np.float64) / f0_data.rate
    sp = pyworld.cheaptrick(y, f0, t, sr)
    ap = pyworld.d4c(y, f0, t, sr)

    y = pyworld.synthesize(f0, sp, ap, sr)

    out = output_dir / f"{input_f0.stem}.wav"
    # librosa.output.write_wav was removed in librosa 0.8; soundfile.write is the usual replacement
    soundfile.write(str(out), y.astype(np.float32), sr)
Example #17
    def test_extract_input_with_local_padding(self):
        for sampling_rate, local_sampling_rate, sampling_length, time_length, local_padding_size in [
            [800, 200, 16, 1, 100],
            [24000, 24000 / 256, 1024, 4, 1024],
        ]:
            with self.subTest(
                    sampling_rate=sampling_rate,
                    local_sampling_rate=local_sampling_rate,
                    sampling_length=sampling_length,
                    time_length=time_length,
                    local_padding_size=local_padding_size,
            ):
                scale = sampling_rate // local_sampling_rate

                wave_data = Wave(
                    wave=np.linspace(
                        0,
                        int(sampling_rate * time_length),
                        int(sampling_rate * time_length),
                        endpoint=False,
                    ),
                    sampling_rate=sampling_rate,
                )
                silence_data = SamplingData(
                    array=np.zeros((sampling_rate * time_length, ),
                                   dtype=bool),
                    rate=sampling_rate,
                )
                local_data = SamplingData(
                    array=np.linspace(
                        0,
                        int(sampling_rate * time_length),
                        int(local_sampling_rate * time_length),
                        endpoint=False,
                    ),
                    rate=local_sampling_rate,
                )
                for _ in range(10000):
                    wave, silence, local = BaseWaveDataset.extract_input(
                        sampling_length,
                        wave_data=wave_data,
                        silence_data=silence_data,
                        local_data=local_data,
                        local_padding_size=local_padding_size,
                        padding_value=np.nan,
                    )

                    self.assertEqual(len(wave), sampling_length)
                    self.assertEqual(len(silence), sampling_length)
                    self.assertEqual(
                        len(local),
                        (sampling_length + local_padding_size * 2) // scale)

                    num_pad = np.isnan(local).sum()
                    self.assertLessEqual(num_pad, local_padding_size)

                    self.assertTrue(not np.isnan(local[0])
                                    or not np.isnan(local[-1]))

                    wave_as_local = wave.reshape(int(sampling_length // scale),
                                                 -1).min(axis=1)
                    pad = int(local_padding_size // scale)
                    local_wo_pad = local[pad:-pad]
                    self.assertTrue(np.all(wave_as_local == local_wo_pad))
Example #18
    def main_forward(
        self,
        length: int,
        sampling_policy: SamplingPolicy,
        num_generate: int,
        local_array: Optional[np.ndarray] = None,
        s_one: Optional[np.ndarray] = None,
    ):
        if self.model.with_local:
            with chainer.using_config("train", False), chainer.using_config(
                    "enable_backprop", False):
                local_array = self.model.forward_encode(l_array=local_array,
                                                        s_one=s_one).data

        c = self.xp.zeros([num_generate], dtype=np.float32)
        c = encode_single(c, bit=self.single_bit)

        hidden_coarse = self.model.gru.init_hx(local_array)[0].data

        if self.use_cpp_inference and sampling_policy == SamplingPolicy.random:
            import yukarin_autoreg_cpp

            wave = np.zeros((length, num_generate), dtype=np.int32)
            yukarin_autoreg_cpp.inference(
                batch_size=num_generate,
                length=length,
                output=wave,
                x=to_numpy(c),
                l_array=to_numpy(self.xp.transpose(local_array, (1, 0, 2))),
                hidden=to_numpy(hidden_coarse),
            )
        else:
            if sampling_policy == SamplingPolicy.random:
                fast_forward_params = get_fast_forward_params(self.model)
                w_list = fast_generate(
                    length=length,
                    x=c,
                    l_array=local_array,
                    h=hidden_coarse,
                    **fast_forward_params,
                )
            else:
                w_list = []
                hc = hidden_coarse
                for i in tqdm(range(length), desc="generate"):
                    with chainer.using_config("train",
                                              False), chainer.using_config(
                                                  "enable_backprop", False):
                        c, hc = self.model.forward_one(
                            prev_x=c,
                            prev_l=local_array[:, i],
                            hidden=hc,
                        )

                    if sampling_policy == SamplingPolicy.random:
                        is_random = True
                    elif sampling_policy == SamplingPolicy.maximum:
                        is_random = False
                    else:
                        raise ValueError(sampling_policy)

                    c = self.model.sampling(c, maximum=not is_random)
                    w_list.append(c)

            wave = self.xp.stack(w_list)
            wave = cuda.to_cpu(wave)

        wave = wave.T
        wave = decode_single(wave, bit=self.single_bit)
        if self.mulaw:
            wave = decode_mulaw(wave, mu=2**self.single_bit)

        return [
            Wave(wave=w_one, sampling_rate=self.sampling_rate)
            for w_one in wave
        ]
Example #19
    def test_convert_to_dict(self):
        sampling_rate = 800
        local_sampling_rate = 200
        scale = sampling_rate // local_sampling_rate
        time_length = 10
        sampling_length = 16

        wave_data = Wave(
            wave=np.linspace(0,
                             sampling_rate * time_length,
                             sampling_rate * time_length,
                             endpoint=False),
            sampling_rate=sampling_rate,
        )
        silence_data = SamplingData(
            array=np.zeros((sampling_rate * time_length, ), dtype=bool),
            rate=sampling_rate,
        )
        local_data = SamplingData(
            array=np.linspace(0,
                              sampling_rate * time_length,
                              local_sampling_rate * time_length,
                              endpoint=False),
            rate=local_sampling_rate,
        )

        wave, silence, local = BaseWaveDataset.extract_input(
            sampling_length,
            wave_data=wave_data,
            silence_data=silence_data,
            local_data=local_data,
            local_padding_size=0,
        )

        dataset = BaseWaveDataset(
            sampling_length=sampling_length,
            to_double=True,
            bit=16,
            mulaw=False,
            local_padding_size=0,
        )
        d = dataset.convert_to_dict(wave, silence, local)
        self.assertEqual(len(d['coarse']), sampling_length)
        self.assertEqual(len(d['fine']), sampling_length - 1)
        self.assertEqual(len(d['encoded_coarse']), sampling_length)
        self.assertEqual(len(d['encoded_fine']), sampling_length)
        self.assertEqual(len(d['silence']), sampling_length - 1)
        self.assertEqual(len(d['local']), sampling_length // scale)

        dataset = BaseWaveDataset(
            sampling_length=sampling_length,
            to_double=False,
            bit=10,
            mulaw=False,
            local_padding_size=0,
        )
        d = dataset.convert_to_dict(wave, silence, local)
        self.assertEqual(len(d['coarse']), sampling_length)
        self.assertIsNone(d['fine'])
        self.assertEqual(len(d['encoded_coarse']), sampling_length)
        self.assertIsNone(d['encoded_fine'])
        self.assertEqual(len(d['silence']), sampling_length - 1)
        self.assertEqual(len(d['local']), sampling_length // scale)
Example #20
def generate_dataset(
    dataset_directory: Path,
    data_num: int,
    sampling_rate: int,
    local_rate: int,
    phoneme_size: int,
    speaker_size: int,
):
    if dataset_directory.exists():
        for p in dataset_directory.rglob("*"):
            if not p.is_dir():
                p.unlink()
    else:
        dataset_directory.mkdir()

    f0_dir = dataset_directory.joinpath("f0")
    phoneme_dir = dataset_directory.joinpath("phoneme")
    wave_dir = dataset_directory.joinpath("wave")
    silence_dir = dataset_directory.joinpath("silence")

    f0_dir.mkdir(exist_ok=True)
    phoneme_dir.mkdir(exist_ok=True)
    wave_dir.mkdir(exist_ok=True)
    silence_dir.mkdir(exist_ok=True)

    for i_data in range(data_num):
        local_length = int(numpy.random.randint(low=100, high=200))
        sampling_length = int(local_length / local_rate * sampling_rate)

        f0 = numpy.random.rand(local_length, 1).astype(numpy.float32)
        f0[f0 < 0.2] = 0
        f0 *= 7
        SamplingData(array=f0, rate=local_rate).save(f0_dir.joinpath(f"{i_data}.npy"))

        phoneme = numpy.random.randint(0, phoneme_size, size=local_length).astype(
            numpy.int32
        )
        phoneme = numpy.identity(phoneme_size)[phoneme].astype(numpy.int32)
        SamplingData(array=phoneme, rate=local_rate).save(
            phoneme_dir.joinpath(f"{i_data}.npy")
        )

        rand = numpy.random.rand()
        wave = numpy.concatenate(
            [
                numpy.sin(
                    (2 * numpy.pi)
                    * (
                        numpy.arange(sampling_length // len(f0), dtype=numpy.float32)
                        * numpy.exp(one_f0)
                        / sampling_rate
                        + rand
                    )
                )
                for one_f0 in f0.tolist()
            ]
        )
        Wave(wave=wave, sampling_rate=sampling_rate).save(
            wave_dir.joinpath(f"{i_data}.wav")
        )

        silence = numpy.zeros_like(wave).astype(bool)
        SamplingData(array=silence, rate=sampling_rate).save(
            silence_dir.joinpath(f"{i_data}.npy")
        )

    speaker_dict = defaultdict(list)
    for i_data in range(data_num):
        speaker_dict[str(i_data % speaker_size)].append(str(i_data))
    json.dump(speaker_dict, dataset_directory.joinpath("speaker_dict.json").open("w"))
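A hypothetical invocation of generate_dataset; the directory name and sizes are placeholders chosen to keep the toy dataset small:

from pathlib import Path

generate_dataset(
    dataset_directory=Path("test_dataset"),
    data_num=10,
    sampling_rate=24000,
    local_rate=200,
    phoneme_size=40,
    speaker_size=2,
)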
Example #21
async def to_feature(text: str = Form(...), wave: UploadFile = File(...)):
    with TemporaryDirectory() as d:
        tmp_dir = Path(d)
        input_audio_path = tmp_dir.joinpath("input.wav")
        input_audio_path.write_bytes(await wave.read())

        # openjtalk
        phonemes = [
            p.label
            for p in openjtalk_label_getter(
                text,
                openjtalk_command="open_jtalk",
                dict_path=Path("/var/lib/mecab/dic/open-jtalk/naist-jdic"),
                htsvoice_path=Path(
                    "/usr/share/hts-voice/nitech-jp-atr503-m001/nitech_jp_atr503_m001.htsvoice"
                ),
                output_wave_path=tmp_dir.joinpath("wave.wav"),
                output_log_path=tmp_dir.joinpath("log.txt"),
                output_type=OutputType.phoneme,
                without_span=False,
            )
        ]

        # julius
        julius_audio_path = tmp_dir.joinpath("julius.wav")
        subprocess.check_call(
            f"sox {input_audio_path} -r 16000 -b 16 {julius_audio_path}".split()
        )

        julius_phonemes = [
            p if p not in _jvs_to_julius else _jvs_to_julius[p]
            for p in phonemes
            if p != "sil"
        ]

        julius_dict_path = tmp_dir.joinpath("2nd.dict")
        julius_dict = sp_inserter.gen_julius_dict_2nd(
            " ".join(julius_phonemes), model_type=sp_inserter.ModelType.gmm
        )
        julius_dict_path.write_text(julius_dict)

        julius_dfa_path = tmp_dir.joinpath("2nd.dfa")
        julius_dfa = sp_inserter.gen_julius_aliment_dfa(julius_dict.count("\n"))
        julius_dfa_path.write_text(julius_dfa)

        julius_output = sp_inserter.julius_phone_alignment(
            str(julius_audio_path),
            str(tmp_dir.joinpath("2nd")),
            _hmm_model,
            model_type=sp_inserter.ModelType.gmm,
            options=None,
        )

        time_alignment_list = sp_inserter.frame_to_second(
            sp_inserter.get_time_alimented_list(julius_output)
        )

        i_phoneme = 0
        new_phonemes = []
        for p in phonemes:
            if p == "pau" and time_alignment_list[i_phoneme][2] != "sp":
                continue
            i_phoneme += 1
            new_phonemes.append(p)

        aligned = JvsPhoneme.convert(
            [
                JvsPhoneme(start=float(o[0]), end=float(o[1]), phoneme=p)
                for p, o in zip(new_phonemes, time_alignment_list)
            ]
        )
        for p in aligned:
            p.verify()

        # world
        f0 = F0.from_wave(
            Wave.load(input_audio_path, sampling_rate=24000, dtype=numpy.float64),
            frame_period=5.0,
            f0_floor=71.0,
            f0_ceil=800,
            with_vuv=False,
            f0_type=F0Type.world,
        )
        converted_f0 = f0.convert(
            input_mean=f0.valid_f0_log.mean(),
            input_var=f0.valid_f0_log.var(),
            target_mean=_voiro_mean,
            target_var=f0.valid_f0_log.var(),
        )
        converted_f0.array = converted_f0.array.astype(numpy.float32).reshape(-1, 1)

        # feature
        phoneme_array = LinguisticFeature(
            phonemes=aligned,
            phoneme_class=JvsPhoneme,
            rate=_feature_rate,
            feature_types=[LinguisticFeature.FeatureType.PHONEME],
        ).make_array()

        phoneme = SamplingData(array=phoneme_array, rate=_feature_rate)

        feature = SamplingData.collect(
            [converted_f0, phoneme],
            rate=_feature_rate,
            mode="min",
            error_time_length=0.015,
        )

    return StreamingResponse(BytesIO(feature.astype(numpy.float32).tobytes()))
Example #22
    def generate(
        self,
        time_length: float,
        sampling_policy: SamplingPolicy,
        num_generate: int,
        local_array: Optional[Union[numpy.ndarray, Tensor]] = None,
        speaker_nums: Optional[Union[Sequence[int], Tensor]] = None,
    ):
        assert num_generate <= self.max_batch_size
        assert local_array is None or len(local_array) == num_generate
        assert speaker_nums is None or len(speaker_nums) == num_generate

        length = int(self.sampling_rate * time_length)

        if local_array is None:
            local_array = torch.empty((num_generate, length, 0)).float()
        local_array = to_tensor(local_array).to(self.device)

        if speaker_nums is not None:
            speaker_nums = to_tensor(speaker_nums).reshape(
                (-1, )).to(self.device)
            with torch.no_grad():
                s_one = self.predictor.forward_speaker(speaker_nums)
        else:
            s_one = None

        if self.predictor.with_local:
            with torch.no_grad():
                local_array = self.predictor.forward_encode(
                    l_array=local_array, s_one=s_one)

        x = numpy.zeros(num_generate, dtype=numpy.float32)
        x = encode_single(x, bit=self.bit_size)

        hidden = numpy.zeros(
            (num_generate, self.predictor.gru.hidden_size),
            dtype=numpy.float32,
        )

        if sampling_policy == SamplingPolicy.corrected_random:
            low_probability_threshold = -18
        else:
            low_probability_threshold = -999

        if self.use_fast_inference and self.use_gpu:
            assert sampling_policy in [
                SamplingPolicy.random,
            ]

            import yukarin_autoreg_cpp

            wave = numpy.zeros((length, num_generate), dtype=numpy.int32)
            yukarin_autoreg_cpp.inference(
                batch_size=num_generate,
                length=length,
                output=wave,
                x=x.astype(numpy.int32),
                l_array=to_numpy(local_array.transpose(0, 1)),
                hidden=to_numpy(hidden),
            )

        elif self.use_fast_inference and not self.use_gpu:
            assert sampling_policy == SamplingPolicy.random

            params = get_fast_forward_params(self.predictor)
            x_list = fast_generate(
                length=length,
                x=x,
                l_array=local_array.numpy(),
                h=hidden,
                **params,
            )
            wave = numpy.stack(x_list)
        else:
            with torch.no_grad():
                x = to_tensor(x).to(self.device)
                x_max = x
                hidden = to_tensor(hidden).to(self.device)
                x_list = []
                for i in tqdm(range(length), desc="generate"):
                    d_max, _ = self.predictor.forward_one(
                        prev_x=x_max, prev_l=local_array[:, i], hidden=hidden)
                    d, hidden = self.predictor.forward_one(
                        prev_x=x, prev_l=local_array[:, i], hidden=hidden)

                    if sampling_policy == SamplingPolicy.maximum:
                        is_random = False
                    else:
                        is_random = True
                        d[F.log_softmax(d_max.double(), dim=1) <
                          low_probability_threshold] -= 200

                    x = self.predictor.sampling(d, maximum=not is_random)
                    x_max = self.predictor.sampling(d, maximum=True)
                    x_list.append(x)

                wave = torch.stack(x_list).cpu().numpy()

        wave = wave.T
        wave = decode_single(wave, bit=self.bit_size)
        if self.mulaw:
            wave = decode_mulaw(wave, mu=2**self.bit_size)

        return [
            Wave(wave=w_one, sampling_rate=self.sampling_rate)
            for w_one in wave
        ]
Example #23
def collect_to_tfevents(
    input_dir: Path,
    output_dir: Optional[Path],
    filename_suffix: str,
    audio_tag_format: str,
    diff_tag: str,
    iteration_format: str,
    remove_exist: bool,
    expected_wave_dir: Optional[Path],
):
    if output_dir is None:
        output_dir = input_dir

    if remove_exist:
        for p in output_dir.glob(f"*tfevents*{filename_suffix}"):
            p.unlink()

    flag_calc_diff = expected_wave_dir is not None

    summary_writer = SummaryWriter(logdir=str(output_dir),
                                   filename_suffix=filename_suffix)

    diffs: DefaultDict[int, List[float]] = defaultdict(list)
    for p in tqdm(sorted(input_dir.rglob("*"), key=_to_nums),
                  desc=input_dir.stem):
        if p.is_dir():
            continue

        if "tfevents" in p.name:
            continue

        rp = p.relative_to(input_dir)
        iteration = int(iteration_format.format(p=p, rp=rp))

        # audio
        if p.suffix in [".wav"]:
            wave, sr = librosa.load(str(p), sr=None)
            summary_writer.add_audio(
                tag=audio_tag_format.format(p=p, rp=rp),
                snd_tensor=wave,
                sample_rate=sr,
                global_step=iteration,
            )

        # diff
        if flag_calc_diff and p.name.endswith("_woc.wav"):
            wave_id = p.name[:-8]
            expected = expected_wave_dir.joinpath(f"{wave_id}.wav")

            wo = Wave.load(p)
            wi = Wave.load(expected, sampling_rate=wo.sampling_rate)

            diff = calc_mcd(wave1=wi, wave2=wo)
            diffs[iteration].append(diff)

    if flag_calc_diff:
        for iteration, values in sorted(diffs.items()):
            summary_writer.add_scalar(
                tag=diff_tag,
                scalar_value=numpy.mean(values),
                global_step=iteration,
            )

    summary_writer.close()
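A hypothetical call to collect_to_tfevents; every path and format string below is a placeholder, and iteration_format assumes each file's parent directory name is the iteration number:

from pathlib import Path

collect_to_tfevents(
    input_dir=Path("output/generated"),
    output_dir=None,                      # defaults to input_dir
    filename_suffix=".mcd",
    audio_tag_format="audio/{rp}",
    diff_tag="mcd",
    iteration_format="{p.parent.name}",
    remove_exist=True,
    expected_wave_dir=Path("data/wave"),
)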
Example #24
def main():
    model_dir: Path = arguments.model_dir
    model_iteration: int = arguments.model_iteration
    model_config: Path = arguments.model_config
    time_length: float = arguments.time_length
    gpu: int = arguments.gpu

    config = create_config(model_config)
    model_path = _get_predictor_model_path(model_dir, model_iteration)

    sr = config.dataset.sampling_rate

    model = create_predictor(config.model)
    chainer.serializers.load_npz(str(model_path), model)
    if gpu is not None:
        model.to_gpu(gpu)
        cuda.get_device_from_id(gpu).use()

    chainer.global_config.train = False
    chainer.global_config.enable_backprop = False

    wave_paths = sorted([Path(p) for p in glob.glob(str(config.dataset.input_wave_glob))])
    local_paths = sorted([Path(p) for p in glob.glob(str(config.dataset.input_local_glob))])
    assert len(wave_paths) == len(local_paths)

    np.random.RandomState(config.dataset.seed).shuffle(wave_paths)
    np.random.RandomState(config.dataset.seed).shuffle(local_paths)
    wave_path = wave_paths[0]
    local_path = local_paths[0]
    w_data = Wave.load(wave_path, sampling_rate=sr)
    l_data = SamplingData.load(local_path)

    length = int(sr * time_length)
    l_scale = int(sr // l_data.rate)
    l_sl = length // l_scale
    length = l_sl * l_scale

    w = w_data.wave[:length]
    l = l_data.array[:l_sl]
    coarse, fine = encode_16bit(w)

    c, f, hc, hf = model(
        c_array=decode_single(model.xp.asarray(coarse)).astype(np.float32)[np.newaxis],
        f_array=decode_single(model.xp.asarray(fine)).astype(np.float32)[:-1][np.newaxis],
        l_array=model.xp.asarray(l)[np.newaxis],
    )

    c = chainer.functions.softmax(c)

    c = chainer.cuda.to_cpu(c[0].data)
    f = chainer.cuda.to_cpu(f[0].data)

    fig = plt.figure(figsize=[32 * time_length, 10])

    plt.imshow(c, aspect='auto', interpolation='nearest')
    plt.colorbar()

    plt.plot((w + 1) * 127.5, 'g', linewidth=0.1, label='true')
    plt.plot(np.argmax(c, axis=0) + np.argmax(f, axis=0) / 256, 'r', linewidth=0.1, label='predicted')
    plt.legend()

    fig.savefig('output.eps')
Example #25
    def generate(
        self,
        time_length: Optional[float],
        sampling_policy: SamplingPolicy,
        num_generate: int,
        coarse=None,
        local_array: Optional[np.ndarray] = None,
        speaker_nums: Optional[List[int]] = None,
        hidden_coarse=None,
    ):
        assert num_generate <= self.max_batch_size
        assert coarse is None or len(coarse) == num_generate
        assert local_array is None or len(local_array) == num_generate
        assert speaker_nums is None or len(speaker_nums) == num_generate
        assert hidden_coarse is None or len(hidden_coarse) == num_generate
        assert sampling_policy == SamplingPolicy.random

        length = int(self.sampling_rate * time_length)

        if local_array is None:
            local_array = self.xp.empty((num_generate, length, 0),
                                        dtype=np.float32)
        else:
            local_array = self.xp.asarray(local_array)

        if speaker_nums is not None:
            speaker_nums = self.xp.asarray(speaker_nums).reshape((-1, ))
            with chainer.using_config("train", False), chainer.using_config(
                    "enable_backprop", False):
                s_one = self.model.forward_speaker(speaker_nums).data
        else:
            s_one = None

        if self.model.with_local:
            with chainer.using_config("train", False), chainer.using_config(
                    "enable_backprop", False):
                local_array = self.model.forward_encode(l_array=local_array,
                                                        s_one=s_one).data

        if coarse is None:
            c = self.xp.zeros([num_generate], dtype=np.float32)
            c = encode_single(c, bit=self.single_bit)
        else:
            c = coarse

        if hidden_coarse is None:
            hidden_coarse = self.model.gru.init_hx(local_array)[0].data

        wave = np.zeros((length, num_generate), dtype=np.int32)
        yukarin_autoreg_cpp.inference(
            batch_size=num_generate,
            length=length,
            output=wave,
            x=to_numpy(c),
            l_array=to_numpy(self.xp.transpose(local_array, (1, 0, 2))),
            hidden=to_numpy(hidden_coarse),
        )

        wave = wave.T
        wave = decode_single(wave, bit=self.single_bit)
        if self.mulaw:
            wave = decode_mulaw(wave, mu=2**self.single_bit)

        return [
            Wave(wave=w_one, sampling_rate=self.sampling_rate)
            for w_one in wave
        ]
Example #26
    def generate(self):
        return Input(
            wave=Wave.load(self.path_wave),
            silence=SamplingData.load(self.path_silence),
            local=SamplingData.load(self.path_local),
        )
Example #27
def generate(
    model_dir: Path,
    model_iteration: Optional[int],
    model_config: Optional[Path],
    output_dir: Path,
    to_voiced_scaler: bool,
    to_f0_scaler: bool,
    to_phoneme_onehot: bool,
    batch_size: Optional[int],
    num_test: int,
    target_glob: Optional[str],
    use_gpu: bool,
):
    if model_config is None:
        model_config = model_dir / "config.yaml"

    output_dir.mkdir(exist_ok=True)
    save_arguments(output_dir / "arguments.yaml", generate, locals())

    config = Config.from_dict(yaml.safe_load(model_config.open()))

    generator = Generator(
        config=config,
        predictor=_get_model_path(
            model_dir=model_dir,
            iteration=model_iteration,
            prefix="predictor_",
        ),
        voiced_network=(
            None
            if not to_voiced_scaler
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="voiced_network_",
            )
        ),
        f0_network=(
            None
            if not to_f0_scaler
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="f0_network_",
            )
        ),
        phoneme_network=(
            None
            if not to_phoneme_onehot
            else _get_model_path(
                model_dir=model_dir,
                iteration=model_iteration,
                prefix="phoneme_network_",
            )
        ),
        use_gpu=use_gpu,
    )

    dataset = create_dataset(config.dataset)["test"]
    scale = numpy.prod(config.network.scale_list)

    if batch_size is None:
        batch_size = config.train.batch_size

    if isinstance(dataset, SpeakerWavesDataset):
        wave_paths = [data.path_wave for data in dataset.wave_dataset.inputs[:num_test]]
    elif isinstance(dataset, WavesDataset):
        wave_paths = [data.path_wave for data in dataset.inputs[:num_test]]
    else:
        raise TypeError(f"unexpected dataset type: {type(dataset)}")

    if target_glob is not None:
        wave_paths += list(map(Path, glob(target_glob)))

    for wps in tqdm(chunked(wave_paths, batch_size), desc="generate"):
        waves = [Wave.load(p) for p in wps]
        arrays = [w.wave for w in waves]

        pad_lengths = [int(numpy.ceil(len(w) / scale) * scale) for w in arrays]
        arrays = [numpy.r_[w, numpy.zeros(max(pad_lengths) - len(w))] for w in arrays]

        tensors = [torch.from_numpy(array.astype(numpy.float32)) for array in arrays]
        output = generator.generate(
            wave=concat_examples(tensors),
            to_voiced_scaler=to_voiced_scaler,
            to_f0_scaler=to_f0_scaler,
            to_phoneme_onehot=to_phoneme_onehot,
        )

        for feature, p, w, l in zip(output, wps, waves, pad_lengths):
            feature = feature.T[: l // scale]
            data = SamplingData(array=feature, rate=w.sampling_rate // scale)
            data.save(output_dir / (p.stem + ".npy"))
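The padding arithmetic in the loop above rounds each wave up to a multiple of scale before batching; a worked example with an assumed scale of 512:

import numpy

scale = 512                                                 # assumed numpy.prod(config.network.scale_list)
wave_length = 12000
pad_length = int(numpy.ceil(wave_length / scale) * scale)   # 12288
feature_frames = pad_length // scale                        # 24 frames after trimming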