def process_file(wav_dir, vad_dir, out_dir, file_name, model,
                 wav_suffix='.wav', vad_suffix='.lab.gz'):
    """ Extract i-vector from wav file.

    Reads ``os.path.join(wav_dir, file_name) + wav_suffix``, resamples it to
    8000 Hz when needed, optionally adds dither, computes MFCC features, keeps
    only the frames selected by the VAD and extracts one i-vector from them.
    When an i-vector is produced it is stored with ``np.save`` under
    ``os.path.join(out_dir, file_name)``; nothing is written otherwise.

        :param wav_dir: directory with wav files
        :type wav_dir: str
        :param vad_dir: directory with vad files
        :type vad_dir: str
        :param out_dir: output directory
        :type out_dir: str
        :param file_name: name of the file (without suffix, may contain sub-directories)
        :type file_name: str
        :param model: input models for i-vector extraction, a 9-tuple unpacked as
            (ubm_weights, ubm_means, ubm_covs, ubm_norm, gmm_model, numg, dimf, v, mvvt)
        :type model: tuple
        :param wav_suffix: suffix of wav files
        :type wav_suffix: str
        :param vad_suffix: suffix of vad files
        :type vad_suffix: str
        :raises GeneralException: when the input audio is not mono
    """
    loginfo('[wav2ivec.process_file] Processing file {} ...'.format(file_name))
    # Only part of the model tuple is needed here; the unpacking is kept at nine
    # elements so a malformed tuple still fails loudly.
    _, ubm_means, _, ubm_norm, gmm_model, numg, dimf, v, mvvt = model

    wav = os.path.join(wav_dir, file_name) + wav_suffix
    rate, sig = read(wav)
    if len(sig.shape) != 1:
        raise GeneralException(
            '[wav2ivec.process_file] Expected mono as input audio.')

    if rate != 8000:
        logwarning(
            '[wav2ivec.process_file] '
            'The input file is expected to be in 8000 Hz, got {} Hz instead, resampling.'
            .format(rate))
        # NOTE(review): assumes `signal` is scipy.signal. Its resample() takes the
        # number of OUTPUT SAMPLES, not the target rate, so the sample count has
        # to be scaled by 8000 / rate to preserve the duration of the recording.
        n_target = int(round(sig.shape[0] * 8000.0 / rate))
        sig = signal.resample(sig, n_target)

    if ADDDITHER > 0.0:
        loginfo('[wav2ivec.process_file] Adding dither ...')
        sig = features.add_dither(sig, ADDDITHER)

    fea = get_mfccs(sig)
    vad, _, _ = get_vad(vad_dir, file_name, vad_suffix, sig, fea)
    # Keep only the feature frames selected by the VAD.
    fea = fea[vad, ...]

    w = get_ivec(fea, numg, dimf, gmm_model, ubm_means, ubm_norm, v, mvvt)
    if w is None:
        # No i-vector was produced; nothing to store.
        return
    Tools.mkdir_p(os.path.join(out_dir, os.path.dirname(file_name)))
    np.save(os.path.join(out_dir, file_name), w)
def process_file(wav_dir, vad_dir, out_dir, file_name, model, min_size, max_size,
                 tolerance, wav_suffix='.wav', vad_suffix='.lab.gz'):
    """ Extract i-vectors from wav file.

    Loads ``os.path.join(wav_dir, file_name) + wav_suffix`` at 8000 Hz,
    optionally adds dither, computes MFCC features and the VAD, and extracts
    one i-vector per segment returned by ``get_segments``. All i-vectors are
    collected in an ``IvecSet`` which is saved as
    ``os.path.join(out_dir, '<file_name>.pkl')``.

        :param wav_dir: directory with wav files
        :type wav_dir: str
        :param vad_dir: directory with vad files
        :type vad_dir: str
        :param out_dir: output directory
        :type out_dir: str
        :param file_name: name of the file; when it contains several
            whitespace-separated tokens only the first one is used
        :type file_name: str
        :param model: input models for i-vector extraction, a 9-tuple whose items
            1, 3 and 4..8 are used as
            (ubm_means, ubm_norm, gmm_model, numg, dimf, v, mvvt)
        :type model: tuple
        :param min_size: minimal size of window in ms
        :type min_size: int
        :param max_size: maximal size of window in ms
        :type max_size: int
        :param tolerance: accept given number of frames as speech even when it is marked as silence
        :type tolerance: int
        :param wav_suffix: suffix of wav files
        :type wav_suffix: str
        :param vad_suffix: suffix of vad files
        :type vad_suffix: str
    """
    loginfo('[wav2ivecs.process_file] Processing file {} ...'.format(file_name))
    # The unpacking already binds everything needed; unused items are discarded.
    _, ubm_means, _, ubm_norm, gmm_model, numg, dimf, v, mvvt = model

    # A list entry may carry extra whitespace-separated columns; keep the name only.
    tokens = file_name.split()
    if len(tokens) > 1:
        file_name = tokens[0]

    wav = os.path.join(wav_dir, file_name) + wav_suffix
    rate = get_sr(wav)
    if rate != 8000:
        logwarning('[wav2ivec.process_file] '
                   'The input file is expected to be in 8000 Hz, got {} Hz '
                   'instead, resampling.'.format(rate))
    # The loader is asked for 8000 Hz directly, so no explicit resampling step here.
    _, sig = af_to_array(wav, target_sr=8000)

    # TODO: ADDDITHER is not defined in this block - confirm where it comes from.
    if ADDDITHER > 0.0:
        sig = features.add_dither(sig, ADDDITHER)

    fea = get_mfccs(sig)
    vad, _, _ = get_vad(vad_dir, file_name, vad_suffix, sig, fea)

    ivec_set = IvecSet()
    ivec_set.name = file_name
    for seg in get_segments(vad, min_size, max_size, tolerance):
        first, last = seg[0], seg[1]
        start, end = get_num_segments(first), get_num_segments(last)
        # The segment end is treated as inclusive here, hence the + 1.
        w = get_ivec(fea[first:last + 1], numg, dimf, gmm_model, ubm_means,
                     ubm_norm, v, mvvt)
        if w is None:
            continue
        ivec_set.add(w, start, end)

    Tools.mkdir_p(os.path.join(out_dir, os.path.dirname(file_name)))
    ivec_set.save(os.path.join(out_dir, '{}.pkl'.format(file_name)))
def process_file(wav_dir, vad_dir, out_dir, file_name, model, min_size, max_size,
                 tolerance, wav_suffix='.wav', vad_suffix='.lab.gz'):
    """ Extract i-vectors from wav file.

    Reads ``os.path.join(wav_dir, file_name) + wav_suffix``, resamples it to
    8000 Hz when needed, optionally adds dither, computes MFCC features and
    the VAD, and extracts one i-vector per segment returned by
    ``get_segments``. Segments whose boundaries fall outside the feature
    matrix are skipped with a warning. All i-vectors are collected in an
    ``IvecSet`` which is saved as ``os.path.join(out_dir, '<file_name>.pkl')``.

        :param wav_dir: directory with wav files
        :type wav_dir: str
        :param vad_dir: directory with vad files
        :type vad_dir: str
        :param out_dir: output directory
        :type out_dir: str
        :param file_name: name of the file; when it contains several
            whitespace-separated tokens only the first one is used
        :type file_name: str
        :param model: input models for i-vector extraction, a 9-tuple unpacked as
            (ubm_weights, ubm_means, ubm_covs, ubm_norm, gmm_model, numg, dimf, v, mvvt)
        :type model: tuple
        :param min_size: minimal size of window in ms
        :type min_size: int
        :param max_size: maximal size of window in ms
        :type max_size: int
        :param tolerance: accept given number of frames as speech even when it is marked as silence
        :type tolerance: int
        :param wav_suffix: suffix of wav files
        :type wav_suffix: str
        :param vad_suffix: suffix of vad files
        :type vad_suffix: str
        :raises GeneralException: when the input audio is not mono
    """
    # A list entry may carry extra whitespace-separated columns; the first token
    # is the actual file name.
    tokens = file_name.split()
    loginfo('[wav2ivecs.process_file] Processing file {} ...'.format(tokens[0]))
    # Only part of the model tuple is needed here; the unpacking is kept at nine
    # elements so a malformed tuple still fails loudly.
    _, ubm_means, _, ubm_norm, gmm_model, numg, dimf, v, mvvt = model
    if len(tokens) > 1:
        file_name = tokens[0]

    wav = os.path.join(wav_dir, file_name) + wav_suffix
    rate, sig = read(wav)
    if len(sig.shape) != 1:
        raise GeneralException(
            '[wav2ivec.process_file] Expected mono as input audio.')

    if rate != 8000:
        logwarning(
            '[wav2ivecs.process_file] '
            'The input file is expected to be in 8000 Hz, got {} Hz instead, resampling.'
            .format(rate))
        # NOTE(review): assumes `signal` is scipy.signal. Its resample() takes the
        # number of OUTPUT SAMPLES, not the target rate, so the sample count has
        # to be scaled by 8000 / rate to preserve the duration of the recording.
        n_target = int(round(sig.shape[0] * 8000.0 / rate))
        sig = signal.resample(sig, n_target)

    if ADDDITHER > 0.0:
        loginfo('[wav2ivecs.process_file] Adding dither ...')
        sig = features.add_dither(sig, ADDDITHER)

    fea = get_mfccs(sig)
    vad, _, _ = get_vad(vad_dir, file_name, vad_suffix, sig, fea)

    ivec_set = IvecSet()
    ivec_set.name = file_name
    last_frame = fea.shape[0] - 1
    for seg in get_segments(vad, min_size, max_size, tolerance):
        first, last = seg[0], seg[1]
        start, end = get_num_segments(first), get_num_segments(last)
        loginfo(
            '[wav2ivecs.process_file] Processing speech segment from {} ms to {} ms ...'
            .format(start, end))
        # Skip segments the VAD placed beyond the available feature frames.
        if first > last_frame or last > last_frame:
            logwarning(
                '[wav2ivecs.process_file] Unexpected features dimensionality - check VAD input or audio.'
            )
            continue
        w = get_ivec(fea[first:last], numg, dimf, gmm_model, ubm_means,
                     ubm_norm, v, mvvt)
        if w is None:
            continue
        ivec_set.add(w, start, end)

    Tools.mkdir_p(os.path.join(out_dir, os.path.dirname(file_name)))
    ivec_set.save(os.path.join(out_dir, '{}.pkl'.format(file_name)))