def evaluate_deepmine_from_xvecs(ds_eval, outfolder='./exp/example_xvecs'):
    """Score DeepMine trials from precomputed x-vectors and write answer files.

    Merges per-job scp files into one xvector.scp if needed, scores every
    model/eval-utterance pair by cosine similarity of length-normalized
    embeddings, and writes `answer_full.txt` / `answer.txt` into `outfolder`.
    """
    merged_scp = os.path.join(outfolder, 'xvector.scp')
    if not os.path.isfile(merged_scp):
        scp_parts = glob(os.path.join(outfolder, '*.scp'))
        assert len(scp_parts) != 0, 'No xvector scps found'
        with open(merged_scp, 'w+') as outfile:
            for part in scp_parts:
                with open(part) as infile:
                    for row in infile:
                        outfile.write(row)

    xvec_dict = odict_from_2_col(merged_scp)

    answer_col0 = []
    answer_col1 = []
    answer_col2 = []
    for i in tqdm(range(len(ds_eval))):
        model, enrol_utts, eval_utts, = ds_eval.get_item_utts(i)
        answer_col0.append([model for _ in range(len(eval_utts))])
        answer_col1.append(eval_utts)
        # Average the length-normalized enrolment embeddings into one model embedding.
        enrol_embeds = np.array(
            [read_vec_flt(xvec_dict[u]) for u in enrol_utts])
        model_embed = np.mean(normalize(enrol_embeds, axis=1),
                              axis=0).reshape(1, -1)
        test_embeds = normalize(
            np.array([read_vec_flt(xvec_dict[u]) for u in eval_utts]), axis=1)
        scores = cosine_similarity(model_embed, test_embeds).squeeze(0)
        assert len(scores) == len(eval_utts)
        answer_col2.append(scores)

    answer_col0 = np.concatenate(answer_col0)
    answer_col1 = np.concatenate(answer_col1)
    answer_col2 = np.concatenate(answer_col2)

    print('Writing results to file...')
    with open(os.path.join(outfolder, 'answer_full.txt'), 'w+') as fp:
        for m, ev, s in tqdm(zip(answer_col0, answer_col1, answer_col2)):
            fp.write('{} {} {}\n'.format(m, ev, s))
    with open(os.path.join(outfolder, 'answer.txt'), 'w+') as fp:
        for s in tqdm(answer_col2):
            fp.write('{}\n'.format(s))

    # Sanity check: answer rows must follow the official trial ordering.
    if (answer_col0 == np.array(ds_eval.models_eval)).all():
        print('model ordering matched')
    else:
        print('model ordering was not correct, need to fix before submission')
    if (answer_col1 == np.array(ds_eval.eval_utts)).all():
        print('eval utt ordering matched')
    else:
        print(
            'eval utt ordering was not correct, need to fix before submission')
def __init__(self, args, dictionary):
    """Set up the hybrid ASR task from parsed command-line ``args``."""
    super().__init__(args)
    self.dictionary = dictionary
    self.feat_in_channels = args.feat_in_channels
    self.specaugment_config = args.specaugment_config
    self.num_targets = args.num_targets
    # "valid_subset" is only present on training runs, so its existence
    # distinguishes training from inference.
    self.training_stage = hasattr(args, "valid_subset")

    # State-prior bookkeeping (only relevant for Xent training, used in models).
    self.initial_state_prior = None
    if args.initial_state_prior_file is not None:
        prior = kaldi_io.read_vec_flt(args.initial_state_prior_file)
        self.initial_state_prior = torch.from_numpy(prior)
        assert self.initial_state_prior.size(0) == self.num_targets, \
            "length of initial_state_prior ({}) != num_targets ({})".format(
                self.initial_state_prior.size(0), self.num_targets
            )
    self.state_prior_update_interval = args.state_prior_update_interval
    if self.state_prior_update_interval is None and self.initial_state_prior is not None:
        logger.info("state prior will not be updated during training")
    self.state_prior_update_smoothing = args.state_prior_update_smoothing
    # State posterior is accumulated here before being committed as the new prior.
    self.averaged_state_post = None

    # The following 4 options are for chunk-wise training/test (incl. Xent and LF-MMI).
    self.chunk_width = args.chunk_width
    self.chunk_left_context = args.chunk_left_context
    self.chunk_right_context = args.chunk_right_context
    self.label_delay = args.label_delay  # only for chunk-wise Xent training

    torch.backends.cudnn.deterministic = True
def __init__(self, cfg: SpeechRecognitionHybridConfig, dictionary, feat_dim):
    """Set up the hybrid ASR task from its dataclass config ``cfg``."""
    super().__init__(cfg)
    self.dictionary = dictionary
    self.feat_dim = feat_dim
    self.feat_in_channels = cfg.feat_in_channels
    self.num_targets = cfg.num_targets
    self.training_stage = (cfg.max_epoch > 0)  # a hack

    # State-prior bookkeeping (only relevant for Xent training, used in models).
    self.initial_state_prior = None
    if cfg.initial_state_prior_file is not None:
        prior = kaldi_io.read_vec_flt(cfg.initial_state_prior_file)
        self.initial_state_prior = torch.from_numpy(prior)
        assert (
            self.initial_state_prior.size(0) == self.num_targets
        ), "length of initial_state_prior ({}) != num_targets ({})".format(
            self.initial_state_prior.size(0), self.num_targets)
    self.state_prior_update_interval = cfg.state_prior_update_interval
    if self.state_prior_update_interval is None and self.initial_state_prior is not None:
        logger.info("state prior will not be updated during training")
    self.state_prior_update_smoothing = cfg.state_prior_update_smoothing
    # State posterior is accumulated here before being committed as the new prior.
    self.averaged_state_post = None

    # The following 4 options are for chunk-wise training/test (incl. Xent and LF-MMI).
    self.chunk_width = cfg.chunk_width
    self.chunk_left_context = cfg.chunk_left_context
    self.chunk_right_context = cfg.chunk_right_context
    self.label_delay = cfg.label_delay  # only for chunk-wise Xent training

    torch.backends.cudnn.deterministic = True
def fetch_llkprob_segment(wavid, ipath2prob_scp, seg=(0.0, math.inf),
                          win_len=0.025, hop_len=0.010):
    """Given wavid, return a log-likelihood probability segment from ipath2prob_scp.

    args:
        wavid -- string, id of an audio file
        ipath2prob_scp -- the path to llk_prob.scp; each wavid corresponds to a
            float vector of llk_prob (the prob of a specific GMM generating a frame)
        seg -- a tuple of (start_time, end_time) in seconds
        win_len -- window length in seconds (not used by the slicing itself)
        hop_len -- window shift in seconds
    return:
        vec -- llk_prob curve with numpy format, or None if wavid is not
            present in the scp (callers should check for this)
    """
    fd = kaldi_io.open_or_fd(ipath2prob_scp)
    try:
        for line in fd:
            (wid, path) = line.decode("utf-8").rstrip().split(' ', 1)
            if wavid != wid:
                continue
            vec = kaldi_io.read_vec_flt(path)  # np.array
            start_t, end_t = seg
            # Clip to the real duration; the second term is float by default.
            end_t = min(end_t, vec.shape[0] * hop_len)
            assert start_t < end_t and start_t >= 0.0, \
                "InputArg: seg {0} invalid".format(str(seg))
            start_f = int(start_t / hop_len)
            end_f = int(end_t / hop_len)
            return vec[start_f:end_f]
    finally:
        # BUG FIX: the descriptor/pipe from open_or_fd was never closed,
        # leaking a file handle per call. Close it unless open_or_fd handed
        # back the very object we passed in (mirrors kaldi_io usage elsewhere
        # in this codebase).
        if fd is not ipath2prob_scp:
            fd.close()
    return None  # wavid not found in the scp
def load_phn(f):
    """Load per-frame phone labels for file id ``f`` as a (1, n) array."""
    # Map the (augmentation-suffix-stripped) file id to its VAD vector.
    vad = kaldi_io.read_vec_flt(fid2vadfile[p_aug_remove.sub('', f)])  # File IDs
    labels = np.genfromtxt(phn_dir + f + ".hp")
    if (phn_vad_scp != "None"):
        # Keep only speech frames according to the VAD mask.
        labels = labels[np.where(vad == 1)]
    return labels.reshape(1, -1)
def load_kaldi_feats_segm_same_dur_plus_lab(rng, files, min_length, max_length,
                                            n_avl_samp, lab_dir, f_ids,
                                            vad_files, start_from_zero=False):
    """Load equal-duration random feature segments plus VAD-filtered labels.

    One segment length is drawn for the whole batch so the features can be
    stacked; each file then gets a uniformly random start offset.

    args:
        rng -- numpy-style RandomState used for all draws
        files -- kaldi feature specifiers, one per utterance
        min_length, max_length -- candidate bounds for the segment length (frames)
        n_avl_samp -- number of available frames per file
        lab_dir -- directory holding per-file ".hp" label files
        f_ids -- file ids used to locate the label files
        vad_files -- per-file VAD vector specifiers
        start_from_zero -- if True, always start at frame 0 (mainly for
            debugging/development)
    return:
        (data, lab) -- stacked feature array and matching label array
    """
    min_n_avl_samp = np.min(n_avl_samp)
    # Need to add 1 because the randint interval is [min_len, max_len).
    max_len = np.min([min_n_avl_samp + 1, max_length])
    n_sel_samp = rng.randint(min_length, max_len)  # not [min_len, max_len]

    start = []
    end = []
    vad = [kaldi_io.read_vec_flt(vf).astype(bool) for vf in vad_files]
    # Keep labels only for speech frames so label counts match n_avl_samp.
    lab = [
        np.genfromtxt(lab_dir + f + ".hp")[vad[i]]
        for i, f in enumerate(f_ids)
    ]
    assert (len(lab[0]) == n_avl_samp[0])
    for i, f in enumerate(files):
        # The start_from_zero option is mainly for debugging/development
        if start_from_zero:
            start.append(0)
        else:
            last_possible_start = n_avl_samp[i] - n_sel_samp
            # BUG FIX: rng.randint without a `size` argument returns a scalar,
            # so the original trailing "[0]" raised a TypeError (compare the
            # n_sel_samp draw above, which has no "[0]").
            # Interval is [0, last_possible_start + 1) = [0, last_possible_start].
            start.append(rng.randint(0, last_possible_start + 1))
        end.append(start[-1] + n_sel_samp)

    ff = [
        "xxx {}[{}:{},:]".format(files[i], start[i], end[i])
        for i in range(len(files))
    ]
    data = [rr[1] for rr in kaldi_io.read_mat_scp(ff)]
    data = np.stack(data, axis=0)
    lab = np.array([l[start[i]:end[i]] for i, l in enumerate(lab)])
    return data, lab
def _main(args, output_file):
    """Dump per-frame log-posteriors of an ASR model ensemble to Kaldi archives.

    Loads the model ensemble, optionally subtracts a log state prior (for
    cross-entropy trained systems), iterates the `gen_subset` dataset and
    pipes the resulting matrices through `copy-matrix` via kaldi_io.
    Supports both whole-utterance and chunk-wise dumping.
    """
    logging.basicConfig(
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
        stream=output_file,
    )
    logger = logging.getLogger("espresso.dump_posteriors")
    print_options_meaning_changes(args, logger)
    utils.import_user_module(args)
    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    logger.info(args)
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset split
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Load ensemble
    logger.info("loading model(s) from {}".format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args.path),
        arg_overrides=eval(args.model_overrides),
        task=task,
        suffix=getattr(args, "checkpoint_suffix", ""),
    )

    # Load state prior for cross-entropy trained systems decoding
    if args.state_prior_file is not None:
        prior = torch.from_numpy(kaldi_io.read_vec_flt(args.state_prior_file))
    else:
        # Empty list doubles as "collect per-model priors below".
        prior = []

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_()
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()
        # No prior file given: fall back to priors stored inside the models.
        if isinstance(prior, list) and getattr(model, "state_prior", None) is not None:
            prior.append(model.state_prior.unsqueeze(0))
    if isinstance(prior, list) and len(prior) > 0:
        prior = torch.cat(prior, 0).mean(0)  # average priors across models
        prior = prior / prior.sum()  # re-normalize
    elif isinstance(prior, list):
        prior = None
    if prior is not None:
        if args.fp16:
            prior = prior.half()
        if use_cuda:
            prior = prior.cuda()
        log_prior = prior.log()
    else:
        log_prior = None

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[
                model.max_positions() if hasattr(model, "encoder")
                else (None, model.max_positions())
                for model in models
            ]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=args.log_format,
        log_interval=args.log_interval,
        default_log_format=("tqdm" if not args.no_progress_bar else "none"),
    )

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(models, args)

    # Generate and dump
    num_sentences = 0
    chunk_width = getattr(task, "chunk_width", None)
    lprobs_wspecifier = "ark:| copy-matrix ark:- ark:-"
    with kaldi_io.open_or_fd(lprobs_wspecifier, "wb") as f:
        if chunk_width is None:  # normal dumping (i.e., no chunking)
            for sample in progress:
                sample = utils.move_to_cuda(sample) if use_cuda else sample
                if "net_input" not in sample:
                    continue
                gen_timer.start()
                lprobs, padding_mask = task.inference_step(
                    generator, models, sample)
                if log_prior is not None:
                    assert lprobs.size(-1) == log_prior.size(0)
                    lprobs = lprobs - log_prior
                # Per-utterance valid lengths, derived from the padding mask.
                out_lengths = (~padding_mask).long().sum(
                    dim=1).cpu() if padding_mask is not None else None
                num_processed_frames = sample["ntokens"]
                gen_timer.stop(num_processed_frames)
                num_sentences += sample["nsentences"]
                if out_lengths is not None:
                    for i in range(sample["nsentences"]):
                        length = out_lengths[i]
                        kaldi_io.write_mat(f,
                                           lprobs[i, :length, :].cpu().numpy(),
                                           key=sample["utt_id"][i])
                else:
                    for i in range(sample["nsentences"]):
                        kaldi_io.write_mat(f,
                                           lprobs[i, :, :].cpu().numpy(),
                                           key=sample["utt_id"][i])
        else:  # dumping chunks within the same utterance from left to right
            for sample in progress:  # sample is actually a list of batches
                sample = utils.move_to_cuda(sample) if use_cuda else sample
                utt_id = sample[0]["utt_id"]
                id = sample[0]["id"]
                whole_lprobs = None
                for i, chunk_sample in enumerate(sample):
                    if "net_input" not in chunk_sample:
                        continue
                    # All chunks in the list must belong to the same utterances.
                    assert chunk_sample["utt_id"] == utt_id and (
                        chunk_sample["id"] == id).all()
                    gen_timer.start()
                    lprobs, _ = task.inference_step(generator, models,
                                                    chunk_sample)
                    if log_prior is not None:
                        assert lprobs.size(-1) == log_prior.size(0)
                        lprobs = lprobs - log_prior
                    # Concatenate chunk outputs along the time axis.
                    if whole_lprobs is None:
                        whole_lprobs = lprobs.cpu()
                    else:
                        whole_lprobs = torch.cat((whole_lprobs, lprobs.cpu()),
                                                 1)
                    num_processed_frames = chunk_sample["ntokens"]
                    gen_timer.stop(num_processed_frames)
                    if i == len(sample) - 1:
                        num_sentences += len(utt_id)
                        for j in range(len(utt_id)):
                            truncated_length = models[0].output_lengths(
                                task.dataset(args.gen_subset).src_sizes[id[j]]
                            )  # length is after possible subsampling by the model
                            mat = whole_lprobs[j, :truncated_length, :]
                            kaldi_io.write_mat(f, mat.numpy(), key=utt_id[j])

    logger.info(
        "Dumped {} utterances ({} frames) in {:.1f}s ({:.2f} sentences/s, {:.2f} frames/s)"
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    return
from os.path import join
import sys
import kaldi_io
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Fit a 3-nearest-neighbour gender classifier on the x-vectors listed in
# <argv[1]>/xvector.scp and print its training-set accuracy.
args = sys.argv
feat_file = join(args[1], 'xvector.scp')
with open(feat_file) as f:
    lines = f.read().splitlines()
npts = len(lines)
# Probe the first vector to discover the feature dimensionality.
test_x = kaldi_io.read_vec_flt(lines[0].split()[1])
fdim = test_x.shape[0]
X = np.zeros((npts, fdim))
y = []
for idx, line in enumerate(lines):
    sp = line.split()
    X[idx, :] = kaldi_io.read_vec_flt(sp[1])
    # male/female is present in uttname
    y.append(sp[0].split('-')[2].split('_')[0])
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X, y)
# NOTE(review): score is computed on the training data itself, so this is a
# fit-quality sanity check rather than a generalization estimate.
print(neigh.score(X, y))
'{}/xvectors_sre/xvector_fullpaths.scp'.format(xvectors_base_path), '{}/xvectors_mx6/xvector_fullpaths.scp'.format(xvectors_base_path), '{}/xvectors_sre16_eval_enrollment/xvector_fullpaths.scp'.format(xvectors_base_path), '{}/xvectors_sre16_eval_test/xvector_fullpaths.scp'.format(xvectors_base_path), '{}/xvectors_sre16_eval_enrollment/spk_xvector.scp'.format(xvectors_base_path), '{}/xvectors_sre18_dev_enrollment/spk_xvector.scp'.format(xvectors_base_path), '{}/xvectors_sre18_dev_test/xvector_fullpaths.scp'.format(xvectors_base_path), '{}/xvectors_sre18_dev_enrollment/xvector_fullpaths.scp'.format(xvectors_base_path), '{}/xvectors_sre18_eval_test/xvector_fullpaths.scp'.format(xvectors_base_path), '{}/xvectors_sre18_eval_enrollment/xvector_fullpaths.scp'.format(xvectors_base_path), '{}/xvectors_sre18_eval_enrollment/spk_xvector.scp'.format(xvectors_base_path), '{}/xvectors_sre19_eval_test/xvector_fullpaths.scp'.format(xvectors_base_path), '{}/xvectors_sre19_eval_enrollment/xvector_fullpaths.scp'.format(xvectors_base_path), '{}/xvectors_sre19_eval_enrollment/spk_xvector.scp'.format(xvectors_base_path)]) mega_scp_dict = {} mega_xvec_dict = {} for fx in xvector_scp_list: subprocess.call(['sed','-i', 's| exp/xvector_nnet_1a| {}|g'.format(xvectors_base_path), fx]) with open(fx) as f: scp_list = f.readlines() scp_dict = {x.split(' ', 1)[0]: x.rstrip('\n').split(' ', 1)[1] for x in scp_list} xvec_dict = {x.split(' ', 1)[0]: kaldi_io.read_vec_flt(x.rstrip('\n').split(' ', 1)[1]) for x in scp_list} mega_scp_dict.update(scp_dict) mega_xvec_dict.update(xvec_dict) mega_scp = np.c_[np.asarray(list(mega_scp_dict.keys()))[:,np.newaxis], np.asarray(list(mega_scp_dict.values()))] np.savetxt('xvectors/mega_xvector_voxceleb_8k.scp', mega_scp, fmt='%s', delimiter=' ', comments='') pickle.dump(mega_xvec_dict, open('xvectors/mega_xvector_voxceleb_8k.pkl', 'wb'))
# mega_scp_dict = {} mega_xvec_dict = pickle.load( open('xvectors/mega_xvector_voices_voxceleb_16k.pkl', 'rb')) for fx in xvector_scp_list: subprocess.call([ 'sed', '-i', 's| {}| {}|g'.format(xv_path, xvectors_base_path), fx ]) with open(fx) as f: scp_list = f.readlines() scp_dict = { os.path.splitext(os.path.basename(x.split(' ', 1)[0]))[0]: x.rstrip('\n').split(' ', 1)[1] for x in scp_list } xvec_dict = { os.path.splitext(os.path.basename(x.split(' ', 1)[0]))[0]: kaldi_io.read_vec_flt(x.rstrip('\n').split(' ', 1)[1]) for x in scp_list } # mega_scp_dict.update(scp_dict) mega_xvec_dict.update(xvec_dict) # mega_scp = np.c_[np.asarray(list(mega_scp_dict.keys()))[:,np.newaxis], np.asarray(list(mega_scp_dict.values()))] # np.savetxt('xvectors/mega_xvector_voices_voxceleb_16k.scp', mega_scp, fmt='%s', delimiter=' ', comments='') pickle.dump( mega_xvec_dict, open('xvectors/mega_xvector_voices_voxceleb_16k.pkl', 'wb'))
def __getitem__(self, idx):
    """Return the x-vector stored at ``self.feat_list[idx]``."""
    # Reads a single float vector (x-vector); MFCCs/fbanks would need read_mat.
    return kaldi_io.read_vec_flt(self.feat_list[idx])
file_len(utt2spk_train_path) - len(fea_train)) print("Missing features number dev: ", file_len(utt2spk_dev_path) - len(fea_dev)) print("Missing features number eval: ", file_len(utt2spk_eval_path) - len(fea_eval)) if suffix: fea_train_spk = { k: m[0] for k, m in kaldi_io.read_mat_scp(replacement_xvectors_path + '/lda_spk_xvector_mat.scp') } replacement = fea_train_spk[replacement_key] print("Replacing with mean for class", replacement_key) else: replacement = kaldi_io.read_vec_flt(replacement_xvectors_path + '/lda_mean.vec') print("Replacing with mean all speaker xvector.") ########## replace missing xvectors ############## for utt in utts_dev: if utt not in fea_dev: fea_dev[utt] = replacement for utt in utts_eval: if utt not in fea_eval: fea_eval[utt] = replacement for utt in utts_train: if utt not in fea_train: fea_train[utt] = replacement
fc += 1 spk2gender[sp[0]] = 'f' else: continue else: if mc < MAX_MALE: mc += 1 spk2gender[sp[0]] = 'm' else: continue spk2utt[sp[0]] = utts spk_feats = [] for u in utts: utt_feat = kaldi_io.read_vec_flt(feats[u]) utt_feat = utt_feat[np.newaxis, :] #print(utt_feat.shape) spk_feats.append(utt_feat) spk_feats = np.array(spk_feats) spk_feats = spk_feats.squeeze() spk2featlen[sp[0]] = spk_feats.shape[0] print(spk_feats.shape) X.append(spk_feats) nspk = len(spk2gender.keys()) print("Number of speakers", nspk) labels = [] print("creating labels for silhouette score...")
# read utterance embeddings with open(sys.argv[1], 'r') as f: content = f.readlines() content = [x.strip() for x in content] # speaker to utterances mapping spk2mat = defaultdict(list) for line in content: (key, rxfile) = line.split() spk = key.split('-')[0] if spk in dev_test_spk.keys(): uttid = key.split('-')[1] + '_' + key.split('-')[2] if uttid not in dev_test_spk[spk]: continue spk2mat[spk].append(read_vec_flt(rxfile)) #for i in spk2mat.keys(): # if i in dev_test_spk.keys(): # print(len(spk2mat[i])) # create speaker embeddings out_file = sys.argv[2] ark_scp_output = 'ark:| copy-vector ark:- ark,scp:' + out_file + '.ark,' + out_file + '.scp' with open_or_fd(ark_scp_output, 'wb') as f: for spk, mat in spk2mat.items(): spk_emb = np.mean(mat, axis=0).reshape(-1, ) # get speaker embedding (vector) #print(spk_emb.shape) #print(spk)
def zscore_normalization(x, mean, std):
    # Standardize x; args.eps guards against division by zero.
    x = (x - mean) / (std + args.eps)
    return x


# Collect the enrollment speakers and test utterances named in the trials file
# (format per line: "<enroll_spk> <eval_utt> <target>").
trial_utts = set()
trial_spks = set()
with open(args.trials, 'r') as fptr:
    for line in fptr:
        enroll_spk, eval_utt, target = line.strip().split()
        trial_utts.add(eval_utt)
        trial_spks.add(enroll_spk)

# Read mean, enroll embedding and eval embedding
mean = kaldi_io.read_vec_flt(args.mean_file)
enroll_spks, enroll_feats = read_target_vector(args.enroll_scp, trial_spks)
eval_utts, eval_feats = read_target_vector(args.eval_scp, trial_utts)
if args.impostor_scp != '':
    # Impostor cohort: center then length-normalize.
    impostor_feats = read_impostor_vector(args.impostor_scp)
    impostor_feats = np.array(impostor_feats, dtype=np.float32)
    impostor_feats = impostor_feats - mean
    impostor_feats = preprocessing.normalize(impostor_feats, norm='l2')

# Convert data to numpy
enroll_feats = np.array(enroll_feats, dtype=np.float32)
eval_feats = np.array(eval_feats, dtype=np.float32)
enroll_spks = np.array(enroll_spks)
eval_utts = np.array(eval_utts)

# Subtract mean for enroll and eval embedding
enroll_feats = enroll_feats - mean
cwd = os.getcwd()
# Run the Kaldi recipe that extracts x-vectors for the given wav/rttm pair.
os.system('bash ' + cwd + '/embedding_extraction.sh ' + kaldiDir + ' ' +
          wavFile + ' ' + rttmFile)

############################################
## Convert the xvector.sh to numpy matrix
############################################
data = []  # np.array([])
file = open(cwd + '/tmpkaldidir/xvectors/xvector.scp', 'r')
x = file.readlines()
# NOTE(review): assumes 512-dimensional x-vectors — confirm against the extractor.
d = np.empty((len(x), 512))
for i in range(0, len(x)):
    a = x[i]
    d[i, :] = kaldi_io.read_vec_flt(a.strip().split()[1])
data.append(d)
data = np.concatenate(data)  ## x-vectors of a particular session
np.save('data.npy', data)

####################################################################################################################
# Latent embeddings extraction
####################################################################################################################
timestamp = np.load(
    timestamp + '.npy')  # Load the timestamp of the saved trained model
for j in range(3, len(timestamp)):  # Evaluate for the saved models (20k, 25k, 30k)
    timestamp1 = timestamp[j]  # Timestamp of the saved model
    latent = recon_enc(timestamp1, sampler, z_dim, beta_cycle_label,
                       beta_cycle_gen, data,
                       batch_size)  # model load and prediction
async def async_read_xvec(path):
    """Coroutine wrapper returning the x-vector stored at ``path``.

    NOTE(review): read_vec_flt runs synchronously inside the coroutine; it is
    not offloaded to a thread — confirm callers expect that.
    """
    vec = read_vec_flt(path)
    return vec
def _main(cfg, output_file):
    """Dump per-frame log-posteriors of an ASR model ensemble to Kaldi archives.

    Dataclass-config variant: loads the ensemble, optionally subtracts a log
    state prior (for cross-entropy trained systems), iterates the gen_subset
    dataset and pipes matrices through `copy-matrix` via kaldi_io. Supports
    both whole-utterance and chunk-wise dumping.
    """
    logging.basicConfig(
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=os.environ.get("LOGLEVEL", "INFO").upper(),
        stream=output_file,
    )
    logger = logging.getLogger("espresso.dump_posteriors")
    print_options_meaning_changes(cfg, logger)
    utils.import_user_module(cfg.common)
    if cfg.dataset.max_tokens is None and cfg.dataset.batch_size is None:
        cfg.dataset.max_tokens = 12000
    logger.info(cfg)

    # Fix seed for stochastic decoding
    if cfg.common.seed is not None and not cfg.generation.no_seed_provided:
        np.random.seed(cfg.common.seed)
        utils.set_torch_seed(cfg.common.seed)

    use_cuda = torch.cuda.is_available() and not cfg.common.cpu

    task = tasks.setup_task(cfg.task)

    overrides = ast.literal_eval(cfg.common_eval.model_overrides)

    # Load ensemble
    logger.info("loading model(s) from {}".format(cfg.common_eval.path))
    models, saved_cfg = checkpoint_utils.load_model_ensemble(
        utils.split_paths(cfg.common_eval.path),
        arg_overrides=overrides,
        task=task,
        suffix=cfg.checkpoint.checkpoint_suffix,
        strict=(cfg.checkpoint.checkpoint_shard_count == 1),
        num_shards=cfg.checkpoint.checkpoint_shard_count,
    )

    # loading the dataset should happen after the checkpoint has been loaded
    # so we can give it the saved task config
    task.load_dataset(cfg.dataset.gen_subset, task_cfg=saved_cfg.task)

    # Load state prior for cross-entropy trained systems decoding
    if cfg.generation.state_prior_file is not None:
        prior = torch.from_numpy(
            kaldi_io.read_vec_flt(cfg.generation.state_prior_file))
    else:
        # Empty list doubles as "collect per-model priors below".
        prior = []

    # Optimize ensemble for generation
    for model in models:
        if model is None:
            continue
        if cfg.common.fp16:
            model.half()
        if use_cuda and not cfg.distributed_training.pipeline_model_parallel:
            model.cuda()
        model.prepare_for_inference_(cfg)
        # No prior file given: fall back to priors stored inside the models.
        if isinstance(prior, list) and getattr(model, "state_prior", None) is not None:
            prior.append(model.state_prior.unsqueeze(0))
    if isinstance(prior, list) and len(prior) > 0:
        prior = torch.cat(prior, 0).mean(0)  # average priors across models
        prior = prior / prior.sum()  # re-normalize
    elif isinstance(prior, list):
        prior = None
    if prior is not None:
        if cfg.common.fp16:
            prior = prior.half()
        if use_cuda:
            prior = prior.cuda()
        log_prior = prior.log()
    else:
        log_prior = None

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(cfg.dataset.gen_subset),
        max_tokens=cfg.dataset.max_tokens,
        max_sentences=cfg.dataset.batch_size,
        max_positions=utils.resolve_max_positions(
            task.max_positions(), *[m.max_positions() for m in models]),
        ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=cfg.dataset.required_batch_size_multiple,
        seed=cfg.common.seed,
        num_shards=cfg.distributed_training.distributed_world_size,
        shard_id=cfg.distributed_training.distributed_rank,
        num_workers=cfg.dataset.num_workers,
        data_buffer_size=cfg.dataset.data_buffer_size,
    ).next_epoch_itr(shuffle=False)
    progress = progress_bar.progress_bar(
        itr,
        log_format=cfg.common.log_format,
        log_interval=cfg.common.log_interval,
        default_log_format=("tqdm"
                            if not cfg.common.no_progress_bar else "simple"),
    )

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(models, cfg.generation)

    # Generate and dump
    num_sentences = 0
    chunk_width = getattr(task, "chunk_width", None)
    lprobs_wspecifier = "ark:| copy-matrix ark:- ark:-"
    with kaldi_io.open_or_fd(lprobs_wspecifier, "wb") as f:
        if chunk_width is None:  # normal dumping (i.e., no chunking)
            for sample in progress:
                sample = utils.move_to_cuda(sample) if use_cuda else sample
                if "net_input" not in sample:
                    continue
                gen_timer.start()
                lprobs, padding_mask = task.inference_step(
                    generator, models, sample)
                if log_prior is not None:
                    assert lprobs.size(-1) == log_prior.size(0)
                    lprobs = lprobs - log_prior
                # Per-utterance valid lengths, derived from the padding mask.
                out_lengths = ((~padding_mask).long().sum(
                    dim=1).cpu() if padding_mask is not None else None)
                num_processed_frames = sample["ntokens"]
                gen_timer.stop(num_processed_frames)
                num_sentences += (sample["nsentences"]
                                  if "nsentences" in sample
                                  else sample["id"].numel())
                if out_lengths is not None:
                    for i in range(sample["nsentences"]):
                        length = out_lengths[i]
                        kaldi_io.write_mat(
                            f,
                            lprobs[i, :length, :].cpu().numpy(),
                            key=sample["utt_id"][i],
                        )
                else:
                    for i in range(sample["nsentences"]):
                        kaldi_io.write_mat(f,
                                           lprobs[i, :, :].cpu().numpy(),
                                           key=sample["utt_id"][i])
        else:  # dumping chunks within the same utterance from left to right
            for sample in progress:  # sample is actually a list of batches
                sample = utils.move_to_cuda(sample) if use_cuda else sample
                utt_id = sample[0]["utt_id"]
                id = sample[0]["id"]
                whole_lprobs = None
                for i, chunk_sample in enumerate(sample):
                    if "net_input" not in chunk_sample:
                        continue
                    # All chunks in the list must belong to the same utterances.
                    assert (chunk_sample["utt_id"] == utt_id
                            and (chunk_sample["id"] == id).all())
                    gen_timer.start()
                    lprobs, _ = task.inference_step(generator, models,
                                                    chunk_sample)
                    if log_prior is not None:
                        assert lprobs.size(-1) == log_prior.size(0)
                        lprobs = lprobs - log_prior
                    # Concatenate chunk outputs along the time axis.
                    if whole_lprobs is None:
                        whole_lprobs = lprobs.cpu()
                    else:
                        whole_lprobs = torch.cat((whole_lprobs, lprobs.cpu()),
                                                 1)
                    num_processed_frames = chunk_sample["ntokens"]
                    gen_timer.stop(num_processed_frames)
                    if i == len(sample) - 1:
                        num_sentences += len(utt_id)
                        for j in range(len(utt_id)):
                            truncated_length = models[0].output_lengths(
                                task.dataset(
                                    cfg.dataset.gen_subset).src_sizes[id[j]]
                            )  # length is after possible subsampling by the model
                            mat = whole_lprobs[j, :truncated_length, :]
                            kaldi_io.write_mat(f, mat.numpy(), key=utt_id[j])

    logger.info(
        "Dumped {:,} utterances ({} frames) in {:.1f}s ({:.2f} sentences/s, {:.2f} frames/s)"
        .format(
            num_sentences,
            gen_timer.n,
            gen_timer.sum,
            num_sentences / gen_timer.sum,
            1.0 / gen_timer.avg,
        ))
    return
lda_dim = int( sys.argv[12] ) # For VB-HMM, x-vectors are reduced to this dimensionality using LDA Fa = float( sys.argv[13]) # Parameter of VB-HMM (see VB_diarization.VB_diarization) Fb = float( sys.argv[14]) # Parameter of VB-HMM (see VB_diarization.VB_diarization) LoopP = float( sys.argv[15]) # Parameter of VB-HMM (see VB_diarization.VB_diarization) use_VB = True # False for using only AHC frm_shift = 0.01 # frame rate of MFCC features glob_tran = kaldi_io.read_mat( tran_mat_file) # x-vector whitening transformation glob_mean = kaldi_io.read_vec_flt(mean_vec_file) # x-vector centering vector kaldi_plda_train = kaldi_io.read_plda(plda_file) # out-of-domain PLDA model kaldi_plda_adapt = kaldi_io.read_plda( plda_adapt_file) # in-domain "adaptation" PLDA model segs_dict = read_xvector_timing_dict( segments_file) # segments file with x-vector timing information plda_train_mu, plda_train_tr, plda_train_psi = kaldi_plda_train plda_adapt_mu, plda_adapt_tr, plda_adapt_psi = kaldi_plda_adapt # Interpolate across-class, within-class and means of the two PLDA models with interpolation factor "alpha" plda_mu = alpha * plda_train_mu + (1.0 - alpha) * plda_adapt_mu W_train = np.linalg.inv(plda_train_tr.T.dot(plda_train_tr)) B_train = np.linalg.inv((plda_train_tr.T / plda_train_psi).dot(plda_train_tr)) W_adapt = np.linalg.inv(plda_adapt_tr.T.dot(plda_adapt_tr)) B_adapt = np.linalg.inv((plda_adapt_tr.T / plda_adapt_psi).dot(plda_adapt_tr))
def read_xvec(file):
    """Load a float x-vector from ``file`` via kaldi_io."""
    vec = kaldi_io.read_vec_flt(file)
    return vec
fd = open_or_fd(feats_path) feats = {} try: for line in fd: key, rxfile = line.decode().split(' ') feats[key] = read_mat(rxfile) finally: if fd is not feats_path: fd.close() vad_path = '/home/abbas/abbas/workspace/data/sre16/v2/data/iberspeech_dev2/vad.scp' fd = open_or_fd(vad_path) vads = {} try: for line in fd: key, rxfile = line.decode().split(' ') vads[key] = read_vec_flt(rxfile).astype(bool) finally: if fd is not vad_path: fd.close() print('diarizing the test dataset ...') hypothesis = {} metric = DiarizationErrorRate(collar=0.250, skip_overlap=True) for f in tqdm(feats): fname = f[5:-2] if fname == "LN24H-20151125": continue spkpath = 'dev2/hyp/' + fname + '.rttm' lab = np.loadtxt('dev2/spk/' + fname + '.spk',
def read_xvec(file):
    """Read and return the float vector (x-vector) stored at ``file``."""
    return read_vec_flt(file)