def separate_effective(self, wave: Wave, feature: AcousticFeature, threshold=None):
    """
    Keep only the frames of ``feature`` that are flagged as effective.

    * ``threshold`` is None and ``self._param.threshold_db`` is set:
      the flags come from ``wave.get_effective_frame`` and ``feature`` is indexed with them.
    * ``threshold`` is None and ``self._param.threshold_db`` is None:
      every frame is flagged and ``feature`` is returned untouched.
    * ``threshold`` is given: the flags are computed here from the squared
      ``librosa.feature.rmse`` of ``wave.wave``; a frame is flagged when its
      value in dB is greater than ``-threshold``.

    :param wave: wave the flags are computed from
    :param feature: acoustic feature to filter
    :param threshold: optional level used instead of ``self._param.threshold_db``
    :return: (effective feature, effective flags)
    """
    hop, length = wave.get_hop_and_length(frame_period=self._param.frame_period)

    if threshold is None:
        if self._param.threshold_db is not None:
            effective = wave.get_effective_frame(
                threshold_db=self._param.threshold_db,
                fft_length=self._param.fft_length,
                frame_period=self._param.frame_period,
            )
            feature = feature.indexing(effective)
        else:
            effective = numpy.ones(length, dtype=bool)
        return feature, effective

    # NOTE(review): `librosa.feature.rmse` was renamed to `rms` in newer librosa
    # releases; this call needs a librosa version that still provides `rmse`.
    mse = librosa.feature.rmse(
        y=wave.wave,
        frame_length=self._param.fft_length,
        hop_length=hop,
    ) ** 2
    effective = (librosa.core.power_to_db(mse.squeeze()) > -threshold)

    # The number of frames computed here can differ from the number of feature
    # frames ("the divide move"). Make the flags exactly as long as the feature:
    # missing frames are flagged False, surplus frames are dropped. The previous
    # code left a longer mask unchanged, which cannot index the feature.
    n_frame = len(feature.f0)
    missing = n_frame - len(effective)
    if missing > 0:
        effective = numpy.r_[effective, [False] * missing]
    elif missing < 0:
        effective = effective[:n_frame]

    feature = feature.indexing(effective)
    return feature, effective
def combine_silent(self, effective: numpy.ndarray, feature: AcousticFeature):
    """
    Create a silent feature with ``len(effective)`` frames and write ``feature``
    into it through ``indexing_set(effective, feature)``.

    :param effective: effective flags, one per frame of the result
    :param feature: feature passed to ``indexing_set``
    :return: the silent feature after ``indexing_set`` was applied
    """
    param = self._param
    feature_sizes = AcousticFeature.get_sizes(
        sampling_rate=param.sampling_rate,
        order=param.order,
    )
    combined = AcousticFeature.silent(
        len(effective),
        sizes=feature_sizes,
        keys=('mc', 'ap', 'f0', 'voiced'),
    )
    combined.indexing_set(effective, feature)
    return combined
def get_example(self, i):
    """
    Build the ``i``-th training example from an aligned input/target pair.

    Both features are loaded, re-indexed with the stored alignment indexes and
    encoded; a mask is encoded from the target feature. The three arrays are
    padded with one shared seed and then cropped with another shared seed.
    Noise is added to input and target only when ``chainer.config.train`` is set.

    :return: dict with the keys ``input``, ``target`` and ``mask``
    """
    is_train = chainer.config.train
    config = self.config

    paths = self.inputs[i]
    in_path = paths.in_feature_path
    target_path = paths.out_feature_path
    align = AlignIndexes.load(paths.indexes_path)

    # input feature
    in_feature = AcousticFeature.load(in_path).indexing(align.indexes1)
    in_array = encode_feature(in_feature, targets=config.in_features)

    # target feature and its mask
    target_feature = AcousticFeature.load(target_path).indexing(align.indexes2)
    target_array = encode_feature(target_feature, targets=config.out_features)
    mask_array = encode_feature(make_mask(target_feature), targets=config.out_features)

    crop_size = config.train_crop_size

    # padding: the same seed is given to all three arrays
    pad_seed = numpy.random.randint(2 ** 31)
    in_array = random_pad(in_array, seed=pad_seed, min_size=crop_size)
    target_array = random_pad(target_array, seed=pad_seed, min_size=crop_size)
    mask_array = random_pad(mask_array, seed=pad_seed, min_size=crop_size)

    # crop: again one seed shared by all three arrays
    crop_seed = numpy.random.randint(2 ** 31)
    in_array = random_crop(in_array, seed=crop_seed, crop_size=crop_size)
    target_array = random_crop(target_array, seed=crop_seed, crop_size=crop_size)
    mask_array = random_crop(mask_array, seed=crop_seed, crop_size=crop_size)

    if is_train:
        in_array = add_noise(
            in_array,
            p_global=config.input_global_noise,
            p_local=config.input_local_noise,
        )
        target_array = add_noise(
            target_array,
            p_global=config.target_global_noise,
            p_local=config.target_local_noise,
        )

    return {
        'input': in_array,
        'target': target_array,
        'mask': mask_array,
    }
def get_example(self, i):
    """
    Build a training example from an unpaired x / y sample.

    The index ``i`` is not used: one path is drawn at random from
    ``self.x_paths`` and one from ``self.y_paths``. Each side is encoded
    together with its mask; a side and its mask share the padding seed and the
    crop seed, while x and y use separate seeds. Noise is added to ``x`` and
    ``y`` only when ``chainer.config.train`` is set.

    :return: dict with the keys ``x``, ``y``, ``mask_x`` and ``mask_y``
    """
    is_train = chainer.config.train
    config = self.config

    x_path = self.x_paths[numpy.random.randint(len(self.x_paths))]
    y_path = self.y_paths[numpy.random.randint(len(self.y_paths))]

    x_feature = AcousticFeature.load(x_path)
    x = encode_feature(x_feature, targets=config.in_features)

    y_feature = AcousticFeature.load(y_path)
    y = encode_feature(y_feature, targets=config.out_features)

    mask_x = encode_feature(make_mask(x_feature), targets=config.in_features)
    mask_y = encode_feature(make_mask(y_feature), targets=config.out_features)

    crop_size = config.train_crop_size

    # padding (x side, then y side, each with its own seed)
    x_pad_seed = numpy.random.randint(2 ** 31)
    x = random_pad(x, seed=x_pad_seed, min_size=crop_size)
    mask_x = random_pad(mask_x, seed=x_pad_seed, min_size=crop_size)

    y_pad_seed = numpy.random.randint(2 ** 31)
    y = random_pad(y, seed=y_pad_seed, min_size=crop_size)
    mask_y = random_pad(mask_y, seed=y_pad_seed, min_size=crop_size)

    # crop (x side, then y side, each with its own seed)
    x_crop_seed = numpy.random.randint(2 ** 31)
    x = random_crop(x, seed=x_crop_seed, crop_size=crop_size)
    mask_x = random_crop(mask_x, seed=x_crop_seed, crop_size=crop_size)

    y_crop_seed = numpy.random.randint(2 ** 31)
    y = random_crop(y, seed=y_crop_seed, crop_size=crop_size)
    mask_y = random_crop(mask_y, seed=y_crop_seed, crop_size=crop_size)

    if is_train:
        x = add_noise(
            x,
            p_global=config.input_global_noise,
            p_local=config.input_local_noise,
        )
        y = add_noise(
            y,
            p_global=config.target_global_noise,
            p_local=config.target_local_noise,
        )

    return {
        'x': x,
        'y': y,
        'mask_x': mask_x,
        'mask_y': mask_y,
    }
def generate_align_indexes(pair_path: Tuple[Path, Path]):
    """
    Compute and save alignment indexes for one pair of wave files.

    The first wave is analyzed with ``feat1``; its mel-cepstrum (without the
    0th coefficient) is converted with ``mcepgmm`` and the 0th coefficient is
    put back in front. The second wave is analyzed with ``feat2``. Either
    mel-cepstrum is restricted to effective frames when the matching
    ``threshold_db`` argument is set. Nothing is done when the output already
    exists and overwriting is not enabled.

    :param pair_path: (path of the first wave, path of the second wave)
    """
    source_path, target_path = pair_path
    if source_path.stem != target_path.stem:
        print('warning: the file names are different', source_path, target_path)

    out = Path(arguments.output, source_path.stem + '.npy')
    already_exists = out.exists()
    if already_exists and not arguments.enable_overwrite:
        return

    # original
    source_wave = Wave.load(path=source_path, sampling_rate=sconf1.wav_fs)
    source_wave = source_wave.pad(
        pre_second=arguments.pad_second1,
        post_second=arguments.pad_second1,
    )
    source_signal = low_cut_filter(source_wave.wave, source_wave.sampling_rate, cutoff=70)
    feat1.analyze(source_signal)
    source_mcep = feat1.mcep(dim=sconf1.mcep_dim, alpha=sconf1.mcep_alpha)

    if arguments.threshold_db1 is not None:
        source_effective = source_wave.get_effective_frame(
            threshold_db=arguments.threshold_db1,
            fft_length=sconf1.wav_fftl,
            frame_period=sconf1.wav_shiftms,
        )
        source_mcep = source_mcep[source_effective]

    # convert everything but the 0th coefficient, then re-attach the 0th coefficient
    converted_wopow = mcepgmm.convert(
        static_delta(source_mcep[:, 1:]),
        cvtype=pconf.GMM_mcep_cvtype,
    )
    mcep1 = numpy.c_[source_mcep[:, 0], converted_wopow]

    # target
    target_wave = Wave.load(path=target_path, sampling_rate=sconf2.wav_fs)
    target_wave = target_wave.pad(
        pre_second=arguments.pad_second2,
        post_second=arguments.pad_second2,
    )
    target_signal = low_cut_filter(target_wave.wave, target_wave.sampling_rate, cutoff=70)
    feat2.analyze(target_signal)
    mcep2 = feat2.mcep(dim=sconf2.mcep_dim, alpha=sconf2.mcep_alpha)

    if arguments.threshold_db2 is not None:
        target_effective = target_wave.get_effective_frame(
            threshold_db=arguments.threshold_db2,
            fft_length=sconf2.wav_fftl,
            frame_period=sconf2.wav_shiftms,
        )
        mcep2 = mcep2[target_effective]

    # align
    align_indexes = AlignIndexes.extract(
        AcousticFeature(mc=mcep1),
        AcousticFeature(mc=mcep2),
        dtype=arguments.dtype,
    )
    align_indexes.save(path=out, validate=True, ignores=arguments.ignore_feature)
def generate_align_indexes(pair_path: Tuple[Path, Path]):
    """
    Load two saved acoustic features, extract their alignment indexes and save
    them as ``<stem of the first path>.npy`` under ``arguments.output``.

    A warning is printed when the two file stems differ. Nothing is done when
    the output already exists and overwriting is not enabled.

    :param pair_path: (path of the first feature, path of the second feature)
    """
    first_path, second_path = pair_path
    if first_path.stem != second_path.stem:
        print('warning: the file names are different', first_path, second_path)

    out = Path(arguments.output, first_path.stem + '.npy')
    already_exists = out.exists()
    if already_exists and not arguments.enable_overwrite:
        return

    first = AcousticFeature.load(path=first_path)
    second = AcousticFeature.load(path=second_path)
    indexes = AlignIndexes.extract(first, second, dtype=arguments.dtype)

    # save
    indexes.save(path=out, validate=True, ignores=arguments.ignore_feature)
def convert(self, in_feature: AcousticFeature):
    """
    Convert the f0 of ``in_feature`` with a linear transform in the log domain.

    Only the non-zero entries of f0 are transformed; zero entries stay zero.
    The input feature is not modified (its f0 is copied first).

    :param in_feature: feature whose ``f0`` is converted
    :return: a new AcousticFeature built from the converted f0 only
    """
    source = self.input_statistics
    target = self.target_statistics

    # NOTE(review): the scale is the ratio of the two `var` statistics. Confirm
    # what `var` holds; a log-f0 linear transform is commonly written with the
    # ratio of standard deviations.
    scale = target.var / source.var

    converted = numpy.copy(in_feature.f0)
    nonzero = converted.nonzero()
    log_f0 = numpy.log(converted[nonzero])
    converted[nonzero] = numpy.exp(scale * (log_f0 - source.mean) + target.mean)
    return AcousticFeature(f0=converted)
def generate_feature(path: Path):
    """
    Extract the acoustic feature of one wave file and save it as
    ``<stem>.npy`` under ``arguments.output``.

    The wave is padded before extraction. When ``arguments.threshold_db`` is
    set, only the effective frames of the wave are kept. Nothing is done when
    the output already exists and overwriting is not enabled.

    :param path: path of the wave file
    """
    out = Path(arguments.output, path.stem + '.npy')
    already_exists = out.exists()
    if already_exists and not arguments.enable_overwrite:
        return

    # load wave and padding
    loaded = Wave.load(path=path, sampling_rate=arguments.sampling_rate)
    wave = loaded.pad(
        pre_second=arguments.pad_second,
        post_second=arguments.pad_second,
    )

    # make acoustic feature
    extract_options = dict(
        frame_period=arguments.frame_period,
        f0_floor=arguments.f0_floor,
        f0_ceil=arguments.f0_ceil,
        fft_length=arguments.fft_length,
        order=arguments.order,
        alpha=arguments.alpha,
        dtype=arguments.dtype,
    )
    feature = AcousticFeature.extract(wave=wave, **extract_options)

    if arguments.threshold_db is not None:
        effective = wave.get_effective_frame(
            threshold_db=arguments.threshold_db,
            fft_length=arguments.fft_length,
            frame_period=arguments.frame_period,
        )
        feature = feature.indexing(effective)

    # save
    feature.save(path=out, validate=True, ignores=arguments.ignore_feature)
def convert_loop(self, in_feature: AcousticFeature, n_len: int = 512, n_wrap: int = 128):
    """
    Convert a feature chunk by chunk and concatenate the results.

    Each chunk covers ``n_len`` frames. It is converted together with up to
    ``n_wrap`` extra frames on both sides, and the extra frames are removed
    from the converted result before it is collected.

    :param in_feature: feature to convert; its length is taken from ``f0``
    :param n_len: number of frames per chunk
    :param n_wrap: number of overlapping frames added on each side of a chunk
    :return: concatenation of the converted chunks
    """
    total = len(in_feature.f0)
    n_chunk = int(numpy.ceil(total / n_len))

    converted_chunks: List[AcousticFeature] = []
    for chunk_index in range(n_chunk):
        start = chunk_index * n_len

        # convert the chunk with the overlap, clipped to the feature bounds
        lower = max(start - n_wrap, 0)
        upper = min(start + n_len + n_wrap, total)
        widened = in_feature.indexing(numpy.arange(lower, upper))
        converted = self.convert(widened)

        # eliminate the overlap: keep only the frames of the chunk itself
        keep_from = start - lower
        keep_len = min(upper - start, n_len)
        kept = converted.indexing(numpy.arange(keep_from, keep_from + keep_len))
        converted_chunks.append(kept)

    return AcousticFeature.concatenate(converted_chunks)
def decode_spectrogram(self, feature: AcousticFeature):
    """
    Compute ``feature.sp`` from ``feature.mc`` with ``pysptk.mc2sp``.

    The FFT length and the alpha value are both derived from
    ``self.out_sampling_rate``. ``feature`` is modified in place.

    :param feature: feature providing ``mc``; its ``sp`` attribute is overwritten
    :return: the same ``feature`` object
    """
    rate = self.out_sampling_rate
    fft_size = pyworld.get_cheaptrick_fft_size(rate)
    mel_cepstrum = feature.mc.astype(numpy.float32)
    warp = pysptk.util.mcepalpha(rate)
    feature.sp = pysptk.mc2sp(mel_cepstrum, alpha=warp, fftlen=fft_size)
    return feature
def _decode_feature(self, data):
    """
    Decode ``data`` with ``decode_feature`` using the configured output
    features and the feature sizes derived from ``self._param``.

    :param data: encoded array passed through to ``decode_feature``
    :return: the result of ``decode_feature``
    """
    param = self._param
    feature_sizes = AcousticFeature.get_sizes(
        sampling_rate=param.sampling_rate,
        order=param.order,
    )
    out_targets = self.config.dataset.out_features
    return decode_feature(data, targets=out_targets, sizes=feature_sizes)
def make_mask(feature: AcousticFeature):
    """
    Build the mask feature that belongs to ``feature``.

    The f0 mask is the ``voiced`` flag of the feature; every other entry
    (``sp``, ``ap``, ``coded_ap``, ``mc``, ``voiced``) is all ones with the
    shape of the matching attribute. The result is cast to float32.

    :param feature: feature the mask is built for
    :return: mask as an AcousticFeature cast with ``astype(numpy.float32)``
    """
    # The builtin `bool` is used as dtype: the `numpy.bool` alias was deprecated
    # in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
    return AcousticFeature(
        f0=feature.voiced,
        sp=numpy.ones_like(feature.sp, dtype=bool),
        ap=numpy.ones_like(feature.ap, dtype=bool),
        coded_ap=numpy.ones_like(feature.coded_ap, dtype=bool),
        mc=numpy.ones_like(feature.mc, dtype=bool),
        voiced=numpy.ones_like(feature.voiced, dtype=bool),
    ).astype(numpy.float32)
def decode_feature(data: numpy.ndarray, targets: List[str], sizes: Dict[str, int]):
    """
    Split an encoded array into the named parts of an AcousticFeature.

    ``data`` is transposed first; its second axis is then cut into consecutive
    column ranges, one per entry of ``targets``, whose widths are taken from
    ``sizes``. The total width must equal the number of columns.

    :param data: encoded array
    :param targets: feature names, in the order they are laid out in ``data``
    :param sizes: width of each feature name
    :return: AcousticFeature built from the column ranges
    """
    frames = data.T

    ends = numpy.cumsum([sizes[name] for name in targets]).tolist()
    assert frames.shape[1] == ends[-1]
    starts = [0] + ends[:-1]

    parts = {}
    for name, begin, end in zip(targets, starts, ends):
        parts[name] = frames[:, begin:end]
    return AcousticFeature(**parts)
def extract_acoustic_feature(self, wave: Wave):
    """
    Extract the acoustic feature of ``wave`` with the settings held in ``self._param``.

    :param wave: wave to analyze
    :return: the result of ``AcousticFeature.extract``
    """
    param = self._param
    options = dict(
        frame_period=param.frame_period,
        f0_floor=param.f0_floor,
        f0_ceil=param.f0_ceil,
        fft_length=param.fft_length,
        order=param.order,
        alpha=param.alpha,
        dtype=param.dtype,
    )
    return AcousticFeature.extract(wave, **options)
def pad(self, width: int):
    """
    Create a silent feature wrapper of the given width to be used as padding.

    :param width: passed to ``AcousticFeatureWrapper.silent_wrapper`` as the length
    :return: silent wrapper after ``astype_only_float_wrapper(numpy.float32)``
    """
    rate = self.wave_sampling_rate
    feature_sizes = AcousticFeature.get_sizes(sampling_rate=rate, order=self.order)
    silent = AcousticFeatureWrapper.silent_wrapper(
        width,
        sizes=feature_sizes,
        keys=self._keys,
        frame_period=self.frame_period,
        sampling_rate=rate,
        wave_dtype=numpy.float32,
    )
    return silent.astype_only_float_wrapper(numpy.float32)
def post_convert(self, start_time: float, time_length: float):
    """
    Fetch the converted feature for a time range and decode it into a wave.

    The feature is fetched from ``self._out_feature_stream`` and decoded with
    ``self.vocoder``. NaN samples of the decoded wave are replaced by 0 (the
    decoded sample array is modified in place).

    :param start_time: start of the range, passed to ``self.fetch``
    :param time_length: length of the range, passed to ``self.fetch``
    :return: Wave holding the decoded samples
    """
    keys = ['f0', 'ap', 'sp', 'voiced']
    sizes = AcousticFeature.get_sizes(sampling_rate=self.sampling_rate, order=self.order)

    def _silence(length):
        return AcousticFeature.silent(length, sizes=sizes, keys=keys)

    def _pick(segment, first, last):
        return segment.feature.pick(first, last, keys=keys)

    def _join(buffers):
        return AcousticFeature.concatenate(buffers, keys=keys)

    out_feature = self.fetch(
        start_time=start_time,
        time_length=time_length,
        data_stream=self._out_feature_stream,
        rate=1000 / self.frame_period,
        pad_function=_silence,
        pick_function=_pick,
        concat_function=_join,
    )

    decoded = self.vocoder.decode(acoustic_feature=out_feature)

    samples = decoded.wave
    samples[numpy.isnan(samples)] = 0
    return Wave(wave=samples, sampling_rate=decoded.sampling_rate)
def decode(self, acoustic_feature: AcousticFeature):
    """
    Synthesize a wave from an acoustic feature with ``pyworld.synthesize``.

    The float parts of the feature are cast to float64 before synthesis.

    :param acoustic_feature: feature providing ``f0``, ``spectrogram`` and ``aperiodicity``
    :return: Wave at ``self.out_sampling_rate``
    """
    rate = self.out_sampling_rate
    feature64 = acoustic_feature.astype_only_float(numpy.float64)
    samples = pyworld.synthesize(
        f0=feature64.f0.ravel(),
        spectrogram=feature64.spectrogram,
        aperiodicity=feature64.aperiodicity,
        fs=rate,
        frame_period=self.acoustic_param.frame_period,
    )
    return Wave(samples, sampling_rate=self.out_sampling_rate)
def generate_feature(path: Path):
    """
    Extract the acoustic feature of one wave file and save it as
    ``<stem>.npy`` under ``arguments.output``.

    When ``arguments.threshold_db`` is set, only effective frames are kept. The
    effective flags are computed from the wave itself, or from a second load of
    the same file at ``arguments.sampling_rate_for_thresholding`` when that
    argument is set. Nothing is done when the output already exists and
    overwriting is not enabled.

    :param path: path of the wave file
    """
    out = Path(arguments.output, path.stem + '.npy')
    already_exists = out.exists()
    if already_exists and not arguments.enable_overwrite:
        return

    # load wave and padding
    wave = Wave.load(path=path, sampling_rate=arguments.sampling_rate)
    wave = wave.pad(pre_second=arguments.pad_second, post_second=arguments.pad_second)

    # make acoustic feature
    extract_options = dict(
        frame_period=arguments.frame_period,
        f0_floor=arguments.f0_floor,
        f0_ceil=arguments.f0_ceil,
        fft_length=arguments.fft_length,
        order=arguments.order,
        alpha=arguments.alpha,
        dtype=arguments.dtype,
    )
    feature = AcousticFeature.extract(wave=wave, **extract_options)

    if arguments.threshold_db is not None:
        reference_rate = arguments.sampling_rate_for_thresholding
        if reference_rate is None:
            wave_ref = wave
        else:
            wave_ref = Wave.load(path=path, sampling_rate=reference_rate)
            wave_ref = wave_ref.pad(
                pre_second=arguments.pad_second,
                post_second=arguments.pad_second,
            )

        effective = wave_ref.get_effective_frame(
            threshold_db=arguments.threshold_db,
            fft_length=arguments.fft_length,
            frame_period=arguments.frame_period,
        )

        # there is possibility mismatch of length
        # https://github.com/mmorise/World/blob/c41e580c24c8d360f322ba6e2092ad4785d2d5b9/src/harvest.cpp#L1220
        # When the reference wave has exactly one frame more, drop the last flag.
        _, len_wave = wave.get_hop_and_length(arguments.frame_period)
        _, len_wave_ref = wave_ref.get_hop_and_length(arguments.frame_period)
        if len_wave_ref - 1 == len_wave:
            effective = effective[:-1]

        feature = feature.indexing(effective)

    # save
    feature.save(path=out, ignores=arguments.ignore_feature)
def generate_aligned_wave(
        pair_path: Tuple[Path, Path, Path],
        sampling_rate: int,
        frame_period: float,
        alpha: float,
):
    """
    Decode two aligned features into waves and write them into one wav file.

    For both features ``sp`` is rebuilt from ``mc`` and ``ap`` from
    ``coded_ap``. The features are then aligned with the stored indexes,
    decoded, stacked with ``numpy.vstack`` and written to
    ``<stem of the indexes path>.wav`` under ``arguments.output``.

    An existing output file is left untouched when ``arguments.disable_overwrite``
    is set.

    :param pair_path: (path of feature 1, path of feature 2, path of the align indexes)
    :param sampling_rate: sampling rate used for decoding and for the written file
    :param frame_period: frame period used for decoding
    :param alpha: alpha passed to ``AcousticFeature.mc2sp``
    """
    path_feature1, path_feature2, path_indexes = pair_path
    if path_feature1.stem != path_feature2.stem:
        print('warning: the file names are different', path_feature1, path_feature2)
    if path_feature1.stem != path_indexes.stem:
        print('warning: the file names are different', path_feature1, path_indexes)

    out = Path(arguments.output, path_indexes.stem + '.wav')
    # Skip only when there is something to overwrite. The previous check returned
    # whenever the flag was set, so no file was ever generated with the flag on.
    if out.exists() and arguments.disable_overwrite:
        return

    feature1 = AcousticFeature.load(path=path_feature1)
    feature2 = AcousticFeature.load(path=path_feature2)

    # rebuild the attributes needed for decoding
    feature1.sp = AcousticFeature.mc2sp(feature1.mc, sampling_rate=sampling_rate, alpha=alpha)
    feature2.sp = AcousticFeature.mc2sp(feature2.mc, sampling_rate=sampling_rate, alpha=alpha)
    feature1.ap = AcousticFeature.decode_ap(feature1.coded_ap, sampling_rate=sampling_rate)
    feature2.ap = AcousticFeature.decode_ap(feature2.coded_ap, sampling_rate=sampling_rate)

    align_indexes = AlignIndexes.load(path=path_indexes)
    align_indexes.feature1 = feature1
    align_indexes.feature2 = feature2

    wave1 = align_indexes.get_aligned_feature1().decode(
        sampling_rate=sampling_rate,
        frame_period=frame_period,
    )
    wave2 = align_indexes.get_aligned_feature2().decode(
        sampling_rate=sampling_rate,
        frame_period=frame_period,
    )

    # save
    # NOTE(review): `librosa.output` was removed in librosa 0.8; this call needs
    # a librosa version that still provides `librosa.output.write_wav`.
    y = numpy.vstack([wave1.wave, wave2.wave])
    librosa.output.write_wav(str(out), y, sr=sampling_rate)
def convert(self, start_time: float, time_length: float, extra_time: float):
    """
    Fetch the input feature for a time range, convert it with the voice changer
    and cut the extra frames off both ends of the result.

    :param start_time: start of the range, passed to ``self.fetch``
    :param time_length: length of the range, passed to ``self.fetch``
    :param extra_time: extra time passed to ``self.fetch``; the matching number
        of frames is removed from both ends of the converted feature
    :return: converted feature restricted to the keys f0, ap, sp and voiced
    """
    in_keys = ['f0', 'ap', 'mc', 'voiced']
    sizes = AcousticFeature.get_sizes(sampling_rate=self.sampling_rate, order=self.order)

    def _make_silence(length):
        silent = AcousticFeatureWrapper.silent_wrapper(
            length,
            sizes=sizes,
            keys=in_keys,
            frame_period=self.frame_period,
            sampling_rate=self.sampling_rate,
            wave_dtype=self.in_dtype,
        )
        return silent.astype_only_float_wrapper(self.in_dtype)

    def _pick_segment(segment: FeatureWrapperSegment, first, last):
        return segment.feature.pick_wrapper(
            first,
            last,
            keys=in_keys,
            frame_period=self.frame_period,
        )

    def _join(buffers):
        return AcousticFeatureWrapper.concatenate_wrapper(buffers, keys=in_keys)

    in_feature = self.fetch(
        start_time=start_time,
        time_length=time_length,
        extra_time=extra_time,
        data_stream=self._in_feature_stream,
        rate=1000 / self.frame_period,
        pad_function=_make_silence,
        pick_function=_pick_segment,
        concat_function=_join,
    )

    converted = self.voice_changer.convert_from_acoustic_feature(in_feature)

    # number of frames that correspond to extra_time
    # NOTE(review): when extra_time is 0 this is 0 and the range becomes
    # (0, -0); confirm that `pick` treats that as intended.
    n_extra = round(extra_time * 1000 / self.frame_period)
    return converted.pick(n_extra, -n_extra, keys=['f0', 'ap', 'sp', 'voiced'])
def load_f0(path: Path):
    """
    Load a saved acoustic feature and return only its f0.

    :param path: path of the saved feature
    :return: the ``f0`` attribute of the loaded feature
    """
    return AcousticFeature.load(path=path).f0
def concat(self, datas: Iterable[AcousticFeatureWrapper]):
    """
    Concatenate the given features over ``self._keys``.

    :param datas: features to concatenate; the iterable is turned into a list first
    :return: the result of ``AcousticFeature.concatenate``
    """
    buffers = list(datas)
    return AcousticFeature.concatenate(buffers, keys=self._keys)
def pad(self, width: int):
    """
    Create a silent feature of the given width to be used as padding.

    :param width: passed to ``AcousticFeature.silent`` as the length
    :return: silent AcousticFeature restricted to ``self._keys``
    """
    feature_sizes = AcousticFeature.get_sizes(
        sampling_rate=self.wave_sampling_rate,
        order=self.order,
    )
    silent = AcousticFeature.silent(width, sizes=feature_sizes, keys=self._keys)
    return silent
def load_acoustic_feature(self, path: Path):
    """
    Load a saved acoustic feature.

    :param path: path handed to ``AcousticFeature.load``
    :return: the loaded feature
    """
    feature = AcousticFeature.load(path)
    return feature