def extract_input(
    sampling_length: int,
    wave_data: Wave,
    silence_data: SamplingData,
    f0_data: SamplingData,
    phoneme_data: SamplingData,
    min_not_silence_length: int,
    with_mic_augment: bool,
    time_mask_max_second: float,
    time_mask_num: int,
):
    """Crop an aligned (wave, f0, phoneme) training example of `sampling_length` samples."""
    rate = wave_data.sampling_rate
    sl = sampling_length

    # The local features (f0, phoneme) run at a lower rate than the waveform;
    # l_scale is the integer number of waveform samples per local frame.
    l_rate = max(f0_data.rate, phoneme_data.rate)
    assert rate % l_rate == 0
    l_scale = int(rate // l_rate)

    assert sl % l_scale == 0

    # Resample f0 and phoneme onto a common time axis, then split them back out:
    # column 0 is f0, the remaining columns are the phoneme features.
    local = SamplingData.collect(
        [f0_data, phoneme_data], rate=l_rate, mode="min", error_time_length=0.015
    )
    f0_array = local[:, 0]
    phoneme_array = local[:, 1:]

    # The waveform and the local features must cover (nearly) the same duration.
    assert numpy.abs(len(local) * l_scale - len(wave_data.wave)) < l_scale * 4

    length = min(len(local) * l_scale, len(wave_data.wave) // l_scale * l_scale)

    # If the utterance is shorter than the requested crop, take all of it and
    # make up the difference with zero padding later.
    if sl > length:
        pad = sl - length
        sl = length
    else:
        pad = 0

    l_length = length // l_scale
    l_sl = sl // l_scale
    l_pad = pad // l_scale

    # Retry random offsets until the crop contains enough non-silent samples.
    for _ in range(10000):
        if l_length > l_sl:
            l_offset = numpy.random.randint(l_length - l_sl)
        else:
            l_offset = 0
        offset = l_offset * l_scale

        silence = numpy.squeeze(silence_data.resample(rate, index=offset, length=sl))
        if (~silence).sum() >= min_not_silence_length:
            break
    else:
        raise Exception("cannot pick not silence data")

    wave = wave_data.wave[offset : offset + sl]
    f0 = numpy.squeeze(f0_array[l_offset : l_offset + l_sl])
    phoneme = numpy.argmax(phoneme_array[l_offset : l_offset + l_sl], axis=1)

    # Zero-pad short utterances with a random head/tail split, recording which
    # local frames are padding in the boolean `padded` mask.
    padded = numpy.zeros_like(f0, dtype=bool)
    if l_pad > 0:
        l_pre = numpy.random.randint(l_pad + 1)
        l_post = l_pad - l_pre
        f0 = numpy.pad(f0, [l_pre, l_post])
        phoneme = numpy.pad(phoneme, [l_pre, l_post])
        padded = numpy.pad(padded, [l_pre, l_post], constant_values=True)

        pre, post = int(l_pre * l_scale), int(l_post * l_scale)
        wave = numpy.pad(wave, [pre, post])

    if with_mic_augment:
        wave = mic_augment(wave, sampling_rate=rate)

    # Time masking: zero out up to `time_mask_num` random spans of at most
    # `time_mask_max_second` seconds each.
    if time_mask_max_second > 0 and time_mask_num > 0:
        # Copy first: when no padding was applied, `wave` is a view into
        # wave_data.wave, and masking in place would corrupt the dataset array.
        wave = wave.copy()
        for _ in range(time_mask_num):
            mask_length = numpy.random.randint(
                int(wave_data.sampling_rate * time_mask_max_second)
            )
            mask_offset = numpy.random.randint(len(wave) - mask_length + 1)
            wave[mask_offset : mask_offset + mask_length] = 0

    return dict(
        wave=wave,
        f0=f0,
        phoneme=phoneme,
        padded=padded,
    )
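
# ---------------------------------------------------------------------------
# Hypothetical usage sketch for extract_input, not part of the original
# module. It assumes Wave can be built as Wave(wave=..., sampling_rate=...)
# (only SamplingData(array=..., rate=...) is confirmed by the code below);
# the sizes are chosen so the asserts inside extract_input hold: the wave
# rate is divisible by the local rate, and sampling_length is divisible by
# l_scale = rate // l_rate.
def _example_extract_input():
    rate = 24000  # waveform sampling rate
    l_rate = 200  # local-feature rate -> l_scale = 24000 // 200 = 120
    seconds = 3

    wave_data = Wave(  # assumed constructor; see note above
        wave=numpy.random.randn(rate * seconds).astype(numpy.float32),
        sampling_rate=rate,
    )
    f0_data = SamplingData(  # one column: f0 per frame
        array=numpy.random.rand(l_rate * seconds, 1).astype(numpy.float32),
        rate=l_rate,
    )
    phoneme_data = SamplingData(  # remaining columns: phoneme features
        array=numpy.random.rand(l_rate * seconds, 40).astype(numpy.float32),
        rate=l_rate,
    )
    silence_data = SamplingData(  # all-False: nothing is silent
        array=numpy.zeros(rate * seconds, dtype=bool),
        rate=rate,
    )

    data = extract_input(
        sampling_length=24000,  # one second; divisible by l_scale=120
        wave_data=wave_data,
        silence_data=silence_data,
        f0_data=f0_data,
        phoneme_data=phoneme_data,
        min_not_silence_length=1000,
        with_mic_augment=False,
        time_mask_max_second=0.0,
        time_mask_num=0,
    )
    # data["wave"] holds sampling_length samples; data["f0"] and
    # data["phoneme"] hold sampling_length // l_scale frames; data["padded"]
    # marks frames that were zero-padded rather than cropped from the input.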
async def to_feature(text: str = Form(...), wave: UploadFile = File(...)):
    """Turn an uploaded recording plus its transcript into an (f0, phoneme) feature matrix."""
    with TemporaryDirectory() as d:
        tmp_dir = Path(d)
        input_audio_path = tmp_dir.joinpath("input.wav")
        input_audio_path.write_bytes(await wave.read())

        # openjtalk: get the phoneme sequence for the transcript
        phonemes = [
            p.label
            for p in openjtalk_label_getter(
                text,
                openjtalk_command="open_jtalk",
                dict_path=Path("/var/lib/mecab/dic/open-jtalk/naist-jdic"),
                htsvoice_path=Path(
                    "/usr/share/hts-voice/nitech-jp-atr503-m001/nitech_jp_atr503_m001.htsvoice"
                ),
                output_wave_path=tmp_dir.joinpath("wave.wav"),
                output_log_path=tmp_dir.joinpath("log.txt"),
                output_type=OutputType.phoneme,
                without_span=False,
            )
        ]

        # julius: force-align the phoneme sequence against a 16 kHz copy
        julius_audio_path = tmp_dir.joinpath("julius.wav")
        subprocess.check_call(
            f"sox {input_audio_path} -r 16000 -b 16 {julius_audio_path}".split()
        )

        julius_phonemes = [
            p if p not in _jvs_to_julius else _jvs_to_julius[p]
            for p in phonemes
            if p != "sil"
        ]

        julius_dict_path = tmp_dir.joinpath("2nd.dict")
        julius_dict = sp_inserter.gen_julius_dict_2nd(
            " ".join(julius_phonemes), model_type=sp_inserter.ModelType.gmm
        )
        julius_dict_path.write_text(julius_dict)

        julius_dfa_path = tmp_dir.joinpath("2nd.dfa")
        julius_dfa = sp_inserter.gen_julius_aliment_dfa(julius_dict.count("\n"))
        julius_dfa_path.write_text(julius_dfa)

        julius_output = sp_inserter.julius_phone_alignment(
            str(julius_audio_path),
            str(tmp_dir.joinpath("2nd")),
            _hmm_model,
            model_type=sp_inserter.ModelType.gmm,
            options=None,
        )
        time_alignment_list = sp_inserter.frame_to_second(
            sp_inserter.get_time_alimented_list(julius_output)
        )

        # Drop "pau" phonemes that Julius did not align to a short pause ("sp").
        i_phoneme = 0
        new_phonemes = []
        for p in phonemes:
            if p == "pau" and time_alignment_list[i_phoneme][2] != "sp":
                continue
            i_phoneme += 1
            new_phonemes.append(p)

        aligned = JvsPhoneme.convert(
            [
                JvsPhoneme(start=float(o[0]), end=float(o[1]), phoneme=p)
                for p, o in zip(new_phonemes, time_alignment_list)
            ]
        )
        for p in aligned:
            p.verify()

        # world: extract F0 and shift its log-F0 mean to the target speaker
        # (_voiro_mean) while keeping the input variance.
        f0 = F0.from_wave(
            Wave.load(input_audio_path, sampling_rate=24000, dtype=numpy.float64),
            frame_period=5.0,
            f0_floor=71.0,
            f0_ceil=800,
            with_vuv=False,
            f0_type=F0Type.world,
        )
        converted_f0 = f0.convert(
            input_mean=f0.valid_f0_log.mean(),
            input_var=f0.valid_f0_log.var(),
            target_mean=_voiro_mean,
            target_var=f0.valid_f0_log.var(),
        )
        converted_f0.array = converted_f0.array.astype(numpy.float32).reshape(-1, 1)

        # feature: concatenate F0 and phoneme features on a common time axis
        phoneme_array = LinguisticFeature(
            phonemes=aligned,
            phoneme_class=JvsPhoneme,
            rate=_feature_rate,
            feature_types=[LinguisticFeature.FeatureType.PHONEME],
        ).make_array()
        phoneme = SamplingData(array=phoneme_array, rate=_feature_rate)

        feature = SamplingData.collect(
            [converted_f0, phoneme],
            rate=_feature_rate,
            mode="min",
            error_time_length=0.015,
        )

    return StreamingResponse(BytesIO(feature.astype(numpy.float32).tobytes()))
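
# ---------------------------------------------------------------------------
# Hypothetical client sketch for the endpoint above, not part of the original
# module. The route path and host are assumptions (the route decorator is not
# shown in this file). The endpoint expects multipart form data with a `text`
# field and a `wave` file, and streams back the collected feature matrix as
# raw little-endian float32 bytes.
def _example_request_feature():
    import requests  # assumed available in the client environment

    with open("input.wav", "rb") as f:
        response = requests.post(
            "http://localhost:8000/to_feature",  # assumed route and host
            data={"text": "こんにちは"},
            files={"wave": ("input.wav", f, "audio/wav")},
        )
    response.raise_for_status()

    # Rows are frames at _feature_rate; column 0 is the converted F0 and the
    # remaining columns are the phoneme features. The column count depends on
    # the phoneme set, so the buffer is left flat here.
    feature = numpy.frombuffer(response.content, dtype=numpy.float32)
    return feature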