def get_vad(vad_dir, file_name, vad_suffix, sig, fea):
    """Run voice activity detection, or read precomputed VAD from a label file.

    :param vad_dir: directory with vad files; if None, VAD is computed from sig
    :type vad_dir: str
    :param file_name: name of the file
    :type file_name: str
    :param vad_suffix: suffix of vad files
    :type vad_suffix: str
    :param sig: input signal
    :type sig: numpy.array
    :param fea: MFCCs (used only to truncate the VAD to the feature length)
    :type fea: numpy.array
    :returns: VAD segments - list with boolean values
    :rtype: list
    """
    n_frames = len(fea)
    if vad_dir is None:
        loginfo('[wav2ivec.get_vad] Computing VAD ...')
        win_len = WINDOWSIZE / SOURCERATE
        win_lap = (WINDOWSIZE - TARGETRATE) / SOURCERATE
        return compute_vad(sig, win_length=win_len,
                           win_overlap=win_lap)[:n_frames]
    vad_path = os.path.join(vad_dir, file_name) + vad_suffix
    loginfo('[wav2ivec.get_vad] Loading VAD from file {} ...'.format(
        file_name))
    return load_vad_lab_as_bool_vec(vad_path)[:n_frames]
def process_file(wav_dir, vad_dir, out_dir, file_name, model,
                 wav_suffix='.wav', vad_suffix='.lab.gz'):
    """Extract a single i-vector from a wav file and save it as .npy.

    :param wav_dir: directory with wav files
    :type wav_dir: str
    :param vad_dir: directory with vad files
    :type vad_dir: str
    :param out_dir: output directory
    :type out_dir: str
    :param file_name: name of the file
    :type file_name: str
    :param model: input models for i-vector extraction
    :type model: tuple
    :param wav_suffix: suffix of wav files
    :type wav_suffix: str
    :param vad_suffix: suffix of vad files
    :type vad_suffix: str
    :raises GeneralException: if the input audio is not mono
    """
    loginfo('[wav2ivec.process_file] Processing file {} ...'.format(file_name))
    ubm_weights, ubm_means, ubm_covs, ubm_norm, gmm_model, numg, dimf, v, mvvt = model
    wav = os.path.join(wav_dir, file_name) + wav_suffix
    rate, sig = read(wav)
    if len(sig.shape) != 1:
        raise GeneralException(
            '[wav2ivec.process_file] Expected mono as input audio.')
    if rate != 8000:
        logwarning(
            '[wav2ivec.process_file] '
            'The input file is expected to be in 8000 Hz, got {} Hz instead, resampling.'
            .format(rate))
        # BUG FIX: scipy.signal.resample(x, num) interprets `num` as the total
        # number of output samples, not a sample rate.  The original call
        # collapsed the whole signal into 8000 samples; compute the correct
        # output length for an 8 kHz target rate instead.
        sig = signal.resample(sig, int(round(len(sig) * 8000.0 / rate)))
    if ADDDITHER > 0.0:
        loginfo('[wav2ivec.process_file] Adding dither ...')
        sig = features.add_dither(sig, ADDDITHER)
    fea = get_mfccs(sig)
    # NOTE(review): the get_vad in this module appears to return a single
    # boolean vector, not a 3-tuple — confirm which get_vad is in scope here.
    vad, n_regions, n_frames = get_vad(vad_dir, file_name, vad_suffix, sig, fea)
    fea = fea[vad, ...]
    w = get_ivec(fea, numg, dimf, gmm_model, ubm_means, ubm_norm, v, mvvt)
    if w is not None:
        Tools.mkdir_p(os.path.join(out_dir, os.path.dirname(file_name)))
        np.save(os.path.join(out_dir, file_name), w)
def get_mfccs(sig):
    """Extract MFCC features from a signal, add derivatives, reshape.

    :param sig: input signal
    :type sig: numpy.array
    :returns: MFCCs
    :rtype: numpy.array
    """
    loginfo('[wav2ivec.get_mfccs] Extracting MFCC features ...')
    # Window length and overlap in samples, shared by the filter bank and
    # the framing of the MFCC extractor.
    window = WINDOWSIZE / SOURCERATE
    noverlap = (WINDOWSIZE - TARGETRATE) / SOURCERATE
    fbank_mx = features.mel_fbank_mx(winlen_nfft=window,
                                     fs=fs,
                                     NUMCHANS=NUMCHANS,
                                     LOFREQ=LOFREQ,
                                     HIFREQ=HIFREQ)
    fea = features.mfcc_htk(sig,
                            window=window,
                            noverlap=noverlap,
                            fbank_mx=fbank_mx,
                            _0='first',
                            NUMCEPS=NUMCEPS,
                            RAWENERGY=RAWENERGY,
                            PREEMCOEF=PREEMCOEF,
                            CEPLIFTER=CEPLIFTER,
                            ZMEANSOURCE=ZMEANSOURCE,
                            ENORMALISE=ENORMALISE,
                            ESCALE=0.1,
                            SILFLOOR=50.0,
                            USEHAMMING=True)
    loginfo('[wav2ivec.get_mfccs] Adding derivatives ...')
    fea = features.add_deriv(fea, (deltawindow, accwindow))
    loginfo('[wav2ivec.get_mfccs] Reshaping to SFeaCat conventions ...')
    # Interleave static/delta/acc coefficients per channel.
    n = fea.shape[0]
    return fea.reshape(n, 3, -1).transpose((0, 2, 1)).reshape(n, -1)
def init(ubm_file, v_file):
    """Load the UBM and V matrix and precompute extraction models.

    :param ubm_file: path to UBM
    :type ubm_file: str
    :param v_file: path to v matrix
    :type v_file: str
    :returns: loaded and initialized models
    :rtype: tuple
    """
    loginfo('[wav2ivec.init] Loading UBM file ...')
    ubm_weights, ubm_means, ubm_covs = load_ubm(ubm_file)
    gmm_model = gmm.gmm_eval_prep(ubm_weights, ubm_means, ubm_covs)
    numg, dimf = ubm_means.shape
    # Diagonal covariances have the same width as the means; in that case
    # precompute the inverse standard deviations for stats normalization.
    ubm_norm = 1 / np.sqrt(ubm_covs) if ubm_covs.shape[1] == dimf else None
    loginfo('[wav2ivec.init] Loading V model file ...')
    v = np.load(v_file)
    mvvt = iv.compute_VtV(v, numg)
    return ubm_weights, ubm_means, ubm_covs, ubm_norm, gmm_model, numg, dimf, v, mvvt
def get_ivec(fea, numg, dimf, gmm_model, ubm_means, ubm_norm, v, mvvt):
    """Compute an i-vector from CMVN-normalized features.

    :param fea: MFCC features, one row per frame
    :param numg: number of Gaussians in the UBM
    :param dimf: feature dimensionality
    :param gmm_model: prepared GMM evaluation model
    :param ubm_means: UBM component means
    :param ubm_norm: inverse stddevs for stats normalization (or None)
    :param v: V matrix
    :param mvvt: precomputed V^T V products
    :returns: i-vector as a column
    """
    loginfo('[wav2ivec.get_ivec] Applying floating CMVN ...')
    fea = features.cmvn_floating(fea, cmvn_lc, cmvn_rc, unbiased=True)
    n_frames = fea.shape[0]
    total_ll = 0          # accumulated log-likelihood (diagnostic only)
    frames_seen = 0       # frames accumulated so far (diagnostic only)
    n = np.zeros(numg, dtype=np.float32)
    f = np.zeros((numg, dimf), dtype=np.float32)
    loginfo('[wav2ivec.get_ivec] Computing statistics ...')
    # Evaluate the GMM in chunks of up to 1000 frames to bound memory use.
    for chunk in split_seq(range(n_frames), 1000):
        block = fea[chunk, :]
        ll, n_acc, f_acc = gmm.gmm_eval(block, gmm_model, return_accums=1)
        total_ll += ll.sum()
        frames_seen += ll.shape[0]
        n += n_acc
        f += f_acc
    n, f = normalize_stats(n, f, ubm_means, ubm_norm)
    f = row(f.astype(v.dtype))
    n = row(n.astype(v.dtype))
    loginfo('[wav2ivec.get_ivec] Computing i-vector ...')
    return iv.estimate_i(n, f, v, mvvt).T
def main(argv):
    """Parse CLI arguments and run PLDA-based diarization.

    :param argv: command-line arguments (unused; argparse reads sys.argv)
    :returns: 0 on success, 1 on invalid argument combination
    :rtype: int
    """
    parser = argparse.ArgumentParser(
        'Run diarization on input data with PLDA model.')
    parser.add_argument('-l', '--input-list',
                        help='list of input files without suffix',
                        action='store', dest='input_list', type=str,
                        required=True)
    parser.add_argument(
        '--norm-list',
        help='list of input normalization files without suffix in .npy format',
        action='store', dest='norm_list', type=str, required=False)
    parser.add_argument(
        '--ivecs-dir',
        help='directory containing i-vectos using class IvecSet - pickle format',
        action='store', dest='ivecs_dir', type=str, required=True)
    parser.add_argument('--out-dir',
                        help='output directory for storing .rttm files',
                        action='store', dest='out_dir', type=str,
                        required=False)
    parser.add_argument('--reference',
                        help='reference rttm file for system scoring',
                        action='store', dest='reference', type=str,
                        required=False)
    parser.add_argument('--plda-model-dir',
                        help='directory with PLDA model files',
                        action='store', dest='plda_model_dir', type=str,
                        required=True)
    parser.add_argument('-j', '--num-cores',
                        help='number of processor cores to use',
                        action='store', dest='num_cores', type=int,
                        required=False)
    parser.set_defaults(norm_list=None)
    parser.set_defaults(num_cores=multiprocessing.cpu_count())
    args = parser.parse_args()
    if args.reference is None and args.out_dir is None:
        parser.print_help()
        sys.stderr.write(
            'at least one of --reference and --out-dir must be specified' +
            os.linesep)
        # BUG FIX: the original fell through and ran diarization with both
        # outputs disabled; bail out with a non-zero status instead.
        return 1
    loginfo('[diar.main] Setting {} processor cores for the MKL library ...'.
            format(args.num_cores))
    set_mkl(args.num_cores)
    diar = Diarization(args.input_list, args.norm_list, args.ivecs_dir,
                       args.out_dir, args.plda_model_dir)
    scores = diar.score()
    if args.reference is not None:
        diar.get_der(args.reference, scores)
    if args.out_dir is not None:
        diar.dump_rttm(scores)
    return 0
def main(argv):
    """Parse CLI arguments and run PLDA-based diarization (threaded variant).

    :param argv: command-line arguments (unused; argparse reads sys.argv)
    :returns: 0 on success, 1 on invalid argument combination
    :rtype: int
    """
    parser = argparse.ArgumentParser(
        'Run diarization on input data with PLDA model.')
    parser.add_argument('-l', '--input-list',
                        help='list of input files without suffix',
                        action='store', dest='input_list', type=str,
                        required=True)
    parser.add_argument(
        '--norm-list',
        help='list of input normalization files without suffix in .npy format',
        action='store', dest='norm_list', type=str, required=False)
    parser.add_argument(
        '--ivecs-dir',
        help='directory containing i-vectos using class IvecSet - pickle format',
        action='store', dest='ivecs_dir', type=str, required=True)
    parser.add_argument('--out-dir',
                        help='output directory for storing .rttm files',
                        action='store', dest='out_dir', type=str,
                        required=False)
    parser.add_argument('--reference',
                        help='reference rttm file for system scoring',
                        action='store', dest='reference', type=str,
                        required=False)
    parser.add_argument('--plda-model-dir',
                        help='directory with PLDA model files',
                        action='store', dest='plda_model_dir', type=str,
                        required=True)
    parser.add_argument('-j', '--num-cores',
                        help='number of processor cores to use',
                        action='store', dest='num_cores', type=int,
                        required=False)
    parser.set_defaults(norm_list=None)
    parser.set_defaults(num_cores=multiprocessing.cpu_count())
    args = parser.parse_args()
    if args.reference is None and args.out_dir is None:
        parser.print_help()
        sys.stderr.write(
            'at least one of --reference and --out-dir must be specified' +
            os.linesep)
        # BUG FIX: the original fell through and ran diarization with both
        # outputs disabled; bail out with a non-zero status instead.
        return 1
    loginfo('[diar.main] Using {} threads...'.format(args.num_cores))
    # TODO: Currently, num_cores has no effect. Consider refactoring the
    # code in lib.diarization so that clutering is done in parallel in
    # num_cores threads. This is probably a better idea than using
    # multithreading in the BLAS or with the actual k-means in sklearn
    # (passing n_jobs to KMeans will ensure that each initialization is
    # processed in a separate thread...well, not actually a thread...
    # stupid GIL...abuse of ellipses)
    diar = Diarization(args.input_list, args.norm_list, args.ivecs_dir,
                       args.out_dir, args.plda_model_dir)
    scores = diar.score()
    if args.reference is not None:
        diar.get_der(args.reference, scores)
    if args.out_dir is not None:
        diar.dump_rttm(scores)
    return 0
def process_file(wav_dir, vad_dir, out_dir, file_name, model, min_size,
                 max_size, tolerance, wav_suffix='.wav', vad_suffix='.lab.gz'):
    """Extract per-segment i-vectors from a wav file and save them as a pickle.

    :param wav_dir: directory with wav files
    :type wav_dir: str
    :param vad_dir: directory with vad files
    :type vad_dir: str
    :param out_dir: output directory
    :type out_dir: str
    :param file_name: name of the file
    :type file_name: str
    :param model: input models for i-vector extraction
    :type model: tuple
    :param min_size: minimal size of window in ms
    :type min_size: int
    :param max_size: maximal size of window in ms
    :type max_size: int
    :param tolerance: accept given number of frames as speech even when
        it is marked as silence
    :type tolerance: int
    :param wav_suffix: suffix of wav files
    :type wav_suffix: str
    :param vad_suffix: suffix of vad files
    :type vad_suffix: str
    """
    loginfo('[wav2ivecs.process_file] Processing file {} '
            '...'.format(file_name))
    # BUG FIX: the original re-read model[1], model[3] and model[4] into the
    # already-unpacked names; the redundant assignments are dropped.
    _, ubm_means, _, ubm_norm, gmm_model, numg, dimf, v, mvvt = model
    # List entries may carry extra whitespace-separated fields; keep the path.
    if len(file_name.split()) > 1:
        file_name = file_name.split()[0]
    wav = os.path.join(wav_dir, file_name) + wav_suffix
    rate = get_sr(wav)
    # BUG FIX: the guarding `if` had been commented out, so the resampling
    # warning fired even for files already at 8000 Hz.
    if rate != 8000:
        logwarning('[wav2ivec.process_file] '
                   'The input file is expected to be in 8000 Hz, got {} Hz '
                   'instead, resampling.'.format(rate))
    # af_to_array resamples to the target rate while loading.
    rate, sig = af_to_array(wav, target_sr=8000)
    if ADDDITHER > 0.0:  # TODO: Where is this defined?
        loginfo('[wav2ivecs.process_file] Adding dither ...')
        sig = features.add_dither(sig, ADDDITHER)
    fea = get_mfccs(sig)
    vad, n_regions, n_frames = get_vad(vad_dir, file_name, vad_suffix, sig,
                                       fea)
    ivec_set = IvecSet()
    ivec_set.name = file_name
    for seg in get_segments(vad, min_size, max_size, tolerance):
        start, end = get_num_segments(seg[0]), get_num_segments(seg[1])
        w = get_ivec(fea[seg[0]:seg[1] + 1], numg, dimf, gmm_model, ubm_means,
                     ubm_norm, v, mvvt)
        if w is None:
            continue
        ivec_set.add(w, start, end)
    Tools.mkdir_p(os.path.join(out_dir, os.path.dirname(file_name)))
    ivec_set.save(os.path.join(out_dir, '{}.pkl'.format(file_name)))
def main(argv):
    """Parse CLI arguments and extract i-vectors from a list of wav files.

    :param argv: command-line arguments (unused; argparse reads sys.argv)
    :returns: 0 on success
    :rtype: int
    """
    parser = argparse.ArgumentParser(
        'Extract i-vector from audio wav files 1:1.')
    parser.add_argument('-l', '--input-list',
                        help='list of input files without suffix',
                        action='store', dest='input_list', type=str,
                        required=True)
    parser.add_argument(
        '--audio-dir',
        help='directory with audio files in .wav format - 8000Hz, 16bit-s, 1c',
        action='store', dest='audio_dir', type=str, required=True)
    parser.add_argument('-wav-suffix',
                        help='wav file suffix',
                        action='store', dest='wav_suffix', type=str,
                        required=False)
    parser.add_argument(
        '--vad-dir',
        help='directory with lab files - Voice/Speech activity detection',
        action='store', dest='vad_dir', type=str, required=False)
    parser.add_argument('-vad-suffix',
                        help='Voice Activity Detector file suffix',
                        action='store', dest='vad_suffix', type=str,
                        required=False)
    parser.add_argument('--out-dir',
                        help='output directory for storing i-vectors',
                        action='store', dest='out_dir', type=str,
                        required=True)
    parser.add_argument('--ubm-file',
                        help='Universal Background Model file',
                        action='store', dest='ubm_file', type=str,
                        required=True)
    parser.add_argument('--v-file',
                        help='V Model file',
                        action='store', dest='v_file', type=str,
                        required=True)
    parser.add_argument('-j', '--num-cores',
                        help='number of processor cores to use',
                        action='store', dest='num_cores', type=int,
                        required=False)
    parser.set_defaults(num_cores=multiprocessing.cpu_count())
    parser.set_defaults(wav_suffix='.wav')
    parser.set_defaults(vad_suffix='.lab.gz')
    args = parser.parse_args()
    models = init(args.ubm_file, args.v_file)
    loginfo(
        '[wav2ivecs.main] Setting {} processor cores for the MKL library ...'.
        format(args.num_cores))
    set_mkl(args.num_cores)
    # BUG FIX: the input list file was opened without ever being closed;
    # use a context manager so the handle is released deterministically.
    with open(args.input_list) as list_file:
        files = [line.rstrip('\n') for line in list_file]
    for f in files:
        process_file(args.audio_dir, args.vad_dir, args.out_dir, f, models,
                     wav_suffix=args.wav_suffix, vad_suffix=args.vad_suffix)
    return 0
def process_file(wav_dir, vad_dir, out_dir, file_name, model, min_size,
                 max_size, tolerance, wav_suffix='.wav', vad_suffix='.lab.gz'):
    """Extract per-segment i-vectors from a wav file and save them as a pickle.

    :param wav_dir: directory with wav files
    :type wav_dir: str
    :param vad_dir: directory with vad files
    :type vad_dir: str
    :param out_dir: output directory
    :type out_dir: str
    :param file_name: name of the file
    :type file_name: str
    :param model: input models for i-vector extraction
    :type model: tuple
    :param min_size: minimal size of window in ms
    :type min_size: int
    :param max_size: maximal size of window in ms
    :type max_size: int
    :param tolerance: accept given number of frames as speech even when
        it is marked as silence
    :type tolerance: int
    :param wav_suffix: suffix of wav files
    :type wav_suffix: str
    :param vad_suffix: suffix of vad files
    :type vad_suffix: str
    :raises GeneralException: if the input audio is not mono
    """
    loginfo('[wav2ivecs.process_file] Processing file {} ...'.format(
        file_name.split()[0]))
    ubm_weights, ubm_means, ubm_covs, ubm_norm, gmm_model, numg, dimf, v, mvvt = model
    # List entries may carry extra whitespace-separated fields; keep the path.
    if len(file_name.split()) > 1:
        file_name = file_name.split()[0]
    wav = os.path.join(wav_dir, file_name) + wav_suffix
    rate, sig = read(wav)
    if len(sig.shape) != 1:
        raise GeneralException(
            '[wav2ivec.process_file] Expected mono as input audio.')
    if rate != 8000:
        logwarning(
            '[wav2ivecs.process_file] '
            'The input file is expected to be in 8000 Hz, got {} Hz instead, resampling.'
            .format(rate))
        # BUG FIX: scipy.signal.resample(x, num) interprets `num` as the total
        # number of output samples, not a sample rate.  The original call
        # collapsed the whole signal into 8000 samples; compute the correct
        # output length for an 8 kHz target rate instead.
        sig = signal.resample(sig, int(round(len(sig) * 8000.0 / rate)))
    if ADDDITHER > 0.0:
        loginfo('[wav2ivecs.process_file] Adding dither ...')
        sig = features.add_dither(sig, ADDDITHER)
    fea = get_mfccs(sig)
    vad, n_regions, n_frames = get_vad(vad_dir, file_name, vad_suffix, sig,
                                       fea)
    ivec_set = IvecSet()
    ivec_set.name = file_name
    for seg in get_segments(vad, min_size, max_size, tolerance):
        start, end = get_num_segments(seg[0]), get_num_segments(seg[1])
        loginfo(
            '[wav2ivecs.process_file] Processing speech segment from {} ms to {} ms ...'
            .format(start, end))
        if seg[0] > fea.shape[0] - 1 or seg[1] > fea.shape[0] - 1:
            # BUG FIX: the original log message was tagged with the wrong
            # module/function ('[norm.process_file]').
            logwarning(
                '[wav2ivecs.process_file] Unexpected features dimensionality - check VAD input or audio.'
            )
            continue
        # NOTE(review): a sibling implementation slices fea[seg[0]:seg[1] + 1]
        # (end-inclusive) — confirm which bound is intended here.
        w = get_ivec(fea[seg[0]:seg[1]], numg, dimf, gmm_model, ubm_means,
                     ubm_norm, v, mvvt)
        if w is None:
            continue
        ivec_set.add(w, start, end)
    Tools.mkdir_p(os.path.join(out_dir, os.path.dirname(file_name)))
    ivec_set.save(os.path.join(out_dir, '{}.pkl'.format(file_name)))