def melcepstrum_extract(wav_list, args):
    """EXTRACT MEL CEPSTRUM.

    Every wav file is read, optionally passed through ``low_cut_filter``,
    analyzed with ``stft_mcep`` and the result is written as float32 to the
    ``/mcep`` dataset of an hdf5 file in ``args.hdf5dir``. The hdf5 file is
    named after the wav file with its extension replaced by ``.h5``.

    Args:
        wav_list (list): Paths of the wav files to process.
        args (Namespace): Settings. The attributes ``fs``, ``shiftms``,
            ``fftl``, ``mcep_dim``, ``mcep_alpha``, ``highpass_cutoff``,
            ``hdf5dir``, ``save_wav`` and ``wavdir`` are read.

    Note:
        The process is terminated with ``sys.exit(1)`` when the sampling
        frequency of a file differs from ``args.fs``.

    """
    n_wavs = len(wav_list)
    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, n_wavs))

        # load wavfile
        fs, x = wavfile.read(wav_name)
        if x.dtype != np.int16:
            logging.warning("wav file format is not 16 bit PCM.")

        # check sampling frequency before any filtering work is done
        if fs != args.fs:
            logging.error("sampling frequency is not matched.")
            sys.exit(1)

        # apply low cut filter
        x = np.array(x, dtype=np.float64)
        if args.highpass_cutoff != 0:
            x = low_cut_filter(x, fs, cutoff=args.highpass_cutoff)

        # extract features (frame shift converted from msec to samples)
        shiftl = int(args.shiftms * fs * 0.001)
        mcep = stft_mcep(x, args.fftl, shiftl, args.mcep_dim, args.mcep_alpha)

        # save to hdf5
        # splitext swaps only the real extension; str.replace would also
        # rewrite a ".wav" that appears in the middle of the file name
        wav_basename = os.path.basename(wav_name)
        hdf5name = os.path.join(
            args.hdf5dir, os.path.splitext(wav_basename)[0] + ".h5")
        write_hdf5(hdf5name, "/mcep", np.float32(mcep))

        # overwrite wav file
        if args.highpass_cutoff != 0 and args.save_wav:
            wavfile.write(
                os.path.join(args.wavdir, wav_basename), fs, np.int16(x))
def world_feature_extract(wav_list, args):
    """EXTRACT WORLD FEATURE VECTOR.

    Every wav file is read, optionally passed through ``low_cut_filter`` and
    analyzed with a ``FeatureExtractor`` configured for the "world" analyzer.
    The saved feature matrix is the concatenation, along axis 1, of
    ``[uv, low-pass filtered continuous f0, mcep, codeap]`` and is written
    to the ``/world`` dataset of an hdf5 file in ``args.hdf5dir``.

    Args:
        wav_list (list): Paths of the wav files to process.
        args (Namespace): Settings. The attributes ``fs``, ``shiftms``,
            ``minf0``, ``maxf0``, ``fftl``, ``mcep_dim``, ``mcep_alpha``,
            ``highpass_cutoff``, ``hdf5dir``, ``save_wav`` and ``wavdir``
            are read.

    Note:
        The process is terminated with ``sys.exit(1)`` when the sampling
        frequency of a file differs from ``args.fs``.

    """
    # define feature extractor
    feature_extractor = FeatureExtractor(
        analyzer="world",
        fs=args.fs,
        shiftms=args.shiftms,
        minf0=args.minf0,
        maxf0=args.maxf0,
        fftl=args.fftl)

    n_wavs = len(wav_list)
    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, n_wavs))

        # load wavfile
        fs, x = wavfile.read(wav_name)
        if x.dtype != np.int16:
            logging.warning("wav file format is not 16 bit PCM.")

        # check sampling frequency before any filtering work is done
        if fs != args.fs:
            logging.error("sampling frequency is not matched.")
            sys.exit(1)

        # apply low cut filter
        x = np.array(x, dtype=np.float64)
        if args.highpass_cutoff != 0:
            x = low_cut_filter(x, fs, cutoff=args.highpass_cutoff)

        # extract features
        f0, _, _ = feature_extractor.analyze(x)
        uv, cont_f0 = convert_continuos_f0(f0)
        # the f0 contour has one value per frame, so the filter is given
        # the number of frames per second as its sampling rate
        cont_f0_lpf = low_pass_filter(
            cont_f0, int(1.0 / (args.shiftms * 0.001)), cutoff=20)
        codeap = feature_extractor.codeap()
        mcep = feature_extractor.mcep(dim=args.mcep_dim, alpha=args.mcep_alpha)

        # concatenate (uv and f0 get a trailing axis to become columns)
        cont_f0_lpf = np.expand_dims(cont_f0_lpf, axis=-1)
        uv = np.expand_dims(uv, axis=-1)
        feats = np.concatenate([uv, cont_f0_lpf, mcep, codeap], axis=1)

        # save to hdf5
        # splitext swaps only the real extension; str.replace would also
        # rewrite a ".wav" that appears in the middle of the file name
        wav_basename = os.path.basename(wav_name)
        hdf5name = os.path.join(
            args.hdf5dir, os.path.splitext(wav_basename)[0] + ".h5")
        write_hdf5(hdf5name, "/world", feats)

        # overwrite wav file
        if args.highpass_cutoff != 0 and args.save_wav:
            wavfile.write(
                os.path.join(args.wavdir, wav_basename), fs, np.int16(x))
def test_preprocessing(feature_type):
    """Run feature extraction, statistics calculation and noise shaping.

    Five dummy wav files are generated under ``tmp/wav``, features of the
    requested type are extracted, statistics are calculated and, except for
    "melspc", noise shaping is applied. The ``tmp`` directory is removed at
    the end, also when one of the steps raises.

    Args:
        feature_type (str): Feature type handed to ``make_args``. "world"
            and "melspc" select their dedicated extractors; any other value
            falls through to ``melcepstrum_extract``.

    """
    # make arguments
    args = make_args(feature_type=feature_type)

    # prepare dummy wav files
    wavdir = "tmp/wav"
    if not os.path.exists(wavdir):
        os.makedirs(wavdir)

    try:
        for i in range(5):
            make_dummy_wav(wavdir + "/%d.wav" % i, 8000, args.fs)

        # feature extract
        wav_list = find_files(wavdir, "*.wav")
        if not os.path.exists(args.wavdir):
            os.makedirs(args.wavdir)
        if args.feature_type == "world":
            world_feature_extract(wav_list, args)
        elif args.feature_type == "melspc":
            melspectrogram_extract(wav_list, args)
        else:
            melcepstrum_extract(wav_list, args)

        # calc_stats
        file_list = find_files(args.hdf5dir, "*.h5")
        calc_stats(file_list, args)

        # noise shaping
        if feature_type != "melspc":
            wav_list = find_files(args.wavdir, "*.wav")
            if not os.path.exists(args.outdir):
                os.makedirs(args.outdir)
            if not check_hdf5(args.stats, "/mlsa/coef"):
                avg_mcep = read_hdf5(args.stats, args.feature_type + "/mean")
                if args.feature_type == "world":
                    # keep only the mel-cepstrum part of the world vector
                    avg_mcep = avg_mcep[args.mcep_dim_start:args.mcep_dim_end]
                mlsa_coef = convert_mcep_to_mlsa_coef(
                    avg_mcep, args.mag, args.mcep_alpha)
                write_hdf5(args.stats, "/mlsa/coef", mlsa_coef)
                write_hdf5(args.stats, "/mlsa/alpha", args.mcep_alpha)
            noise_shaping(wav_list, args)
    finally:
        # remove temporary files even if a step above failed, so that a
        # failing run does not leave stale data for the next one
        shutil.rmtree("tmp")
def melspectrogram_extract(wav_list, args):
    """EXTRACT MEL SPECTROGRAM.

    Every wav file is read, optionally passed through ``low_cut_filter``,
    scaled by the int16 full-scale value and converted to a log10 magnitude
    mel spectrogram with librosa. The transposed result is written as
    float32 to the ``/melspc`` dataset of an hdf5 file in ``args.hdf5dir``.

    Args:
        wav_list (list): Paths of the wav files to process.
        args (Namespace): Settings. The attributes ``fs``, ``shiftms``,
            ``fftl``, ``mspc_dim``, ``fmin``, ``fmax``, ``highpass_cutoff``,
            ``hdf5dir``, ``save_wav`` and ``wavdir`` are read. ``fmin`` and
            ``fmax`` may be None, in which case 0 and ``fs // 2`` are used.

    Note:
        The process is terminated with ``sys.exit(1)`` when the sampling
        frequency of a file differs from ``args.fs``.

    """
    n_wavs = len(wav_list)
    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)" % (wav_name, i + 1, n_wavs))

        # load wavfile
        fs, x = wavfile.read(wav_name)
        if x.dtype != np.int16:
            logging.warning("wav file format is not 16 bit PCM.")

        # check sampling frequency before any filtering work is done
        if fs != args.fs:
            logging.error("sampling frequency is not matched.")
            sys.exit(1)

        # apply low cut filter
        x = np.array(x, dtype=np.float64)
        if args.highpass_cutoff != 0:
            x = low_cut_filter(x, fs, cutoff=args.highpass_cutoff)

        # extract features
        x_norm = x / (np.iinfo(np.int16).max + 1)
        shiftl = int(args.shiftms * fs * 0.001)
        # the signal and the sampling rate are passed by keyword because
        # recent librosa releases no longer accept them positionally
        mspc = librosa.feature.melspectrogram(
            y=x_norm,
            sr=fs,
            n_fft=args.fftl,
            hop_length=shiftl,
            n_mels=args.mspc_dim,
            fmin=args.fmin if args.fmin is not None else 0,
            fmax=args.fmax if args.fmax is not None else fs // 2,
            power=1.0)
        # floor at EPS so that log10 never receives zero
        mspc = np.log10(np.maximum(EPS, mspc.T))

        # save to hdf5
        # splitext swaps only the real extension; str.replace would also
        # rewrite a ".wav" that appears in the middle of the file name
        wav_basename = os.path.basename(wav_name)
        hdf5name = os.path.join(
            args.hdf5dir, os.path.splitext(wav_basename)[0] + ".h5")
        write_hdf5(hdf5name, "/melspc", np.float32(mspc))

        # overwrite wav file
        if args.highpass_cutoff != 0 and args.save_wav:
            wavfile.write(
                os.path.join(args.wavdir, wav_basename), fs, np.int16(x))
def calc_stats(file_list, args):
    """CALCULATE STATISTICS.

    A ``StandardScaler`` is fitted incrementally on the features read from
    each hdf5 file, then its mean and scale are stored as float32 in
    ``args.stats`` under ``/<feature_type>/mean`` and
    ``/<feature_type>/scale``.

    Args:
        file_list (list): Paths of the hdf5 feature files.
        args (Namespace): Settings. The attributes ``feature_type`` and
            ``stats`` are read.

    """
    feat_key = "/" + args.feature_type
    n_files = len(file_list)
    scaler = StandardScaler()

    # accumulate the statistics one file at a time
    for idx, filename in enumerate(file_list, 1):
        logging.info("now processing %s (%d/%d)" % (filename, idx, n_files))
        scaler.partial_fit(read_hdf5(filename, feat_key))

    mean, scale = scaler.mean_, scaler.scale_

    # for world features the first dimension is the uv term written by
    # world_feature_extract; mean 0 / scale 1 leaves it as it is when the
    # statistics are applied
    if args.feature_type == "world":
        mean[0] = 0.0
        scale[0] = 1.0

    # write to hdf5
    write_hdf5(args.stats, feat_key + "/mean", np.float32(mean))
    write_hdf5(args.stats, feat_key + "/scale", np.float32(scale))
def main():
    """RUN NOISE SHAPING IN PARALLEL.

    Command line arguments are parsed, the list of wav files is collected
    from ``--waveforms`` (a directory searched for ``*.wav`` or a text file
    read with ``read_txt``), MLSA filter coefficients are computed from the
    mean stored in ``--stats`` unless ``/mlsa/coef`` already exists there,
    and ``noise_shaping`` is run in ``--n_jobs`` worker processes.

    Raises:
        NotImplementedError: If ``--feature_type`` is "melspc". This is
            raised before anything is written to the output directory or
            the statistics file.

    """
    parser = argparse.ArgumentParser(
        description="making feature file argsurations.")

    parser.add_argument(
        "--waveforms", default=None,
        help="directory or list of filename of input wavfile")
    parser.add_argument(
        "--stats", default=None,
        help="filename of hdf5 format")
    parser.add_argument(
        "--outdir", default=None,
        help="directory to save preprocessed wav file")
    parser.add_argument(
        "--fs", default=16000,
        type=int, help="Sampling frequency")
    parser.add_argument(
        "--shiftms", default=5,
        type=float, help="Frame shift in msec")
    parser.add_argument(
        "--feature_type", default="world", choices=["world", "mcep", "melspc"],
        type=str, help="feature type")
    parser.add_argument(
        "--mcep_dim_start", default=2,
        type=int, help="Start index of mel cepstrum")
    parser.add_argument(
        "--mcep_dim_end", default=27,
        type=int, help="End index of mel cepstrum")
    parser.add_argument(
        "--mcep_alpha", default=0.41,
        type=float, help="Alpha of mel cepstrum")
    parser.add_argument(
        "--mag", default=0.5,
        type=float, help="magnification of noise shaping")
    parser.add_argument(
        "--verbose", default=1,
        type=int, help="log message level")
    parser.add_argument(
        '--n_jobs', default=10,
        type=int, help="number of parallel jobs")
    parser.add_argument(
        '--inv', default=False, type=strtobool,
        help="if True, inverse filtering will be performed")

    args = parser.parse_args()

    # set log level (1: INFO, larger than 1: DEBUG, otherwise: WARNING)
    if args.verbose == 1:
        log_level = logging.INFO
    elif args.verbose > 1:
        log_level = logging.DEBUG
    else:
        log_level = logging.WARNING
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S')
    if log_level == logging.WARNING:
        logging.warning("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s" % (key, str(value)))

    # reject unsupported feature types up front, so that no MLSA coefficients
    # derived from mel spectrogram statistics end up in the stats file
    if args.feature_type == "melspc":
        # TODO(kan-bayashi): implement noise shaping using melspectrogram
        raise NotImplementedError("currently, support only world and mcep.")

    # read list
    if os.path.isdir(args.waveforms):
        file_list = sorted(find_files(args.waveforms, "*.wav"))
    else:
        file_list = read_txt(args.waveforms)
    logging.info("number of utterances = %d" % len(file_list))

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # divide list into one chunk per job
    file_lists = np.array_split(file_list, args.n_jobs)
    file_lists = [f_list.tolist() for f_list in file_lists]

    # calculate MLSA coef and save it
    if not check_hdf5(args.stats, "/mlsa/coef"):
        avg_mcep = read_hdf5(args.stats, args.feature_type + "/mean")
        if args.feature_type == "world":
            # keep only the mel-cepstrum part of the world feature vector
            avg_mcep = avg_mcep[args.mcep_dim_start:args.mcep_dim_end]
        mlsa_coef = convert_mcep_to_mlsa_coef(
            avg_mcep, args.mag, args.mcep_alpha)
        write_hdf5(args.stats, "/mlsa/coef", mlsa_coef)
        write_hdf5(args.stats, "/mlsa/alpha", args.mcep_alpha)

    # multi processing
    processes = []
    for f in file_lists:
        p = mp.Process(target=noise_shaping, args=(f, args,))
        p.start()
        processes.append(p)

    # wait for all process
    for p in processes:
        p.join()