def increase_n_components(n_mixes, model_name, model_file, training_list, lab_dir,
                          target_model_dir, min_var_macro_file=None, n_states=1,
                          do_kmeans=False):
    '''Increase the number of GMM components.

    Writes an HHEd config file, an HHEd command file (the MU "mix-up" command)
    and a one-entry model list into target_model_dir, runs HHEd to split the
    mixtures, optionally re-initializes the components with k-means, and then
    retrains the resulting GMM for 30 iterations.

    NOTE(review): this definition is duplicated later in the file; the later
    one shadows this one at import time — confirm which copy is intended.

    :param n_mixes: target number of mixture components after mixing up
    :param model_name: name of the HMM/GMM definition to modify
    :param model_file: path to the current model definition file
    :param training_list: file listing the training feature files
    :param lab_dir: directory holding the HTK label (.lab) files
    :param target_model_dir: output directory for the mixed-up model
    :param min_var_macro_file: optional variance-floor macro file passed to training
    :param n_states: number of emitting states (1 for a plain GMM)
    :param do_kmeans: if True, re-initialize components with k-means before training
    '''
    hhed_cfg_file = os.path.join(target_model_dir, 'hhed.cfg')
    # 'with' guarantees the handles are closed even if a write fails.
    with open(hhed_cfg_file, 'w') as hhed_cfg:
        hhed_cfg.write('KEEPDISTINCT=TRUE')
    hhed_cmd_file = os.path.join(target_model_dir, 'hhed.cmd')
    with open(hhed_cmd_file, 'w') as hhed_cmd:
        if n_states > 1:
            # Mix up every emitting state (HTK numbers them 2..n_states+1).
            hhed_cmd.write('MU {0} {{{1}.state[2-{2}].mix}}\n'.format(
                str(n_mixes), model_name, str(n_states + 1)))
        else:
            hhed_cmd.write('MU {0} {{{1}.state[2].mix}}\n'.format(
                str(n_mixes), model_name))
    model_list_file = os.path.join(target_model_dir, 'model.list')
    with open(model_list_file, 'w') as model_list:
        model_list.write(model_name)
    model_dir = os.path.split(model_file)[0]
    hhed_args = ['-C', hhed_cfg_file, '-d', model_dir, '-M', target_model_dir,
                 hhed_cmd_file, model_list_file]
    hhed(hhed_args)
    if do_kmeans:  # idiomatic truth test instead of '== True'
        initialize_gmm.initialize_gmm_kmeans(model_name, training_list, lab_dir,
                                             target_model_dir, target_model_dir)
    train_gmm(model_file=os.path.join(target_model_dir, model_name),
              training_list=training_list, lab_dir=lab_dir,
              out_model_dir=target_model_dir, n_train_iterations=30,
              min_var_macro=min_var_macro_file)
def increase_n_components(
    n_mixes,
    model_name,
    model_file,
    training_list,
    lab_dir,
    target_model_dir,
    min_var_macro_file=None,
    n_states=1,
    do_kmeans=False,
):
    """Increase the number of GMM components.

    Writes an HHEd config file, an HHEd command file (the MU "mix-up"
    command) and a one-entry model list into ``target_model_dir``, runs HHEd
    to split the mixtures, optionally re-initializes the components with
    k-means, and then retrains the resulting GMM for 30 iterations.

    NOTE(review): this re-defines the function declared earlier in the file;
    this copy shadows the earlier one — confirm which is intended.

    Args:
        n_mixes: Target number of mixture components after mixing up.
        model_name: Name of the HMM/GMM definition to modify.
        model_file: Path to the current model definition file.
        training_list: File listing the training feature files.
        lab_dir: Directory holding the HTK label (.lab) files.
        target_model_dir: Output directory for the mixed-up model.
        min_var_macro_file: Optional variance-floor macro file for training.
        n_states: Number of emitting states (1 for a plain GMM).
        do_kmeans: If True, re-initialize components with k-means first.
    """
    hhed_cfg_file = os.path.join(target_model_dir, "hhed.cfg")
    # Context managers guarantee the handles are closed even if a write fails.
    with open(hhed_cfg_file, "w") as hhed_cfg:
        hhed_cfg.write("KEEPDISTINCT=TRUE")
    hhed_cmd_file = os.path.join(target_model_dir, "hhed.cmd")
    with open(hhed_cmd_file, "w") as hhed_cmd:
        if n_states > 1:
            # Mix up every emitting state (HTK numbers them 2..n_states+1).
            hhed_cmd.write("MU {0} {{{1}.state[2-{2}].mix}}\n".format(str(n_mixes), model_name, str(n_states + 1)))
        else:
            hhed_cmd.write("MU {0} {{{1}.state[2].mix}}\n".format(str(n_mixes), model_name))
    model_list_file = os.path.join(target_model_dir, "model.list")
    with open(model_list_file, "w") as model_list:
        model_list.write(model_name)
    model_dir = os.path.split(model_file)[0]
    hhed_args = ["-C", hhed_cfg_file, "-d", model_dir, "-M", target_model_dir, hhed_cmd_file, model_list_file]
    hhed(hhed_args)
    if do_kmeans:  # idiomatic truth test instead of '== True'
        initialize_gmm.initialize_gmm_kmeans(model_name, training_list, lab_dir, target_model_dir, target_model_dir)
    train_gmm(
        model_file=os.path.join(target_model_dir, model_name),
        training_list=training_list,
        lab_dir=lab_dir,
        out_model_dir=target_model_dir,
        n_train_iterations=30,
        min_var_macro=min_var_macro_file,
    )
def train_M_sized_gmm_classifier_incrementally(M, n_dims, feature_type, training_list,
                                               model_list, lab_dir, model_dir,
                                               apply_hlda=False, hlda_nuisance_dims=0,
                                               n_train_iterations=10):
    '''Train a GMM-based classifier with a known number of components.

    The assumption is that training files are HTK-formatted feature files and
    that there exist HTK-formatted transcription files as well, named in the
    same way as the feature files with just a .lab suffix. The lab_dir should
    correspond to a flat structure.

    Components are grown incrementally: each model starts from (at most) 2
    components, is k-means initialized, and is then repeatedly mixed up
    (4, 8, 16, ...) until the requested size is reached. All final models are
    gathered into a single "newMacros" MMF and jointly retrained.

    :param M: list of target component counts — either one per model, or a
              single-element list applied to every model
    :param n_dims: nominal feature dimensionality (NOTE(review): unused; the
                   dimensionality is read from the first feature file header)
    :param feature_type: HTK feature kind string passed to create_gmm
    :param training_list: file listing the training feature files
    :param model_list: file listing the model (class) names, one per line
    :param lab_dir: flat directory holding the .lab transcription files
    :param model_dir: root output directory for all intermediate/final models
    :param apply_hlda: if True (and hlda_nuisance_dims > 0), train an HLDA
                       transform after the GMMs are trained
    :param hlda_nuisance_dims: number of nuisance dimensions for HLDA
    :param n_train_iterations: re-estimation iterations for the joint training
    '''
    # Find the dimensionality of the feature vector from the first file's header.
    assert(os.path.exists(training_list))
    with open(training_list, 'r') as t_list:
        fea_file = t_list.readline().rstrip('\r\n')
    sampSize = read_htk_header(fea_file)[2]
    # HTK samples are 4-byte floats; integer division keeps this an int
    # under Python 3 (plain '/' would yield a float dimensionality).
    orig_n_dims = sampSize // 4
    assert(orig_n_dims < max_dimensions)

    # Find the model names.
    with open(model_list, 'r') as m_list:
        models = [line.rstrip('\r\n') for line in m_list]

    # Directory structure generation.
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    hmm0_dir = os.path.join(model_dir, 'hmm0')
    if not os.path.exists(hmm0_dir):
        os.makedirs(hmm0_dir)
    hmm1_dir = os.path.join(model_dir, 'hmm1')
    if not os.path.exists(hmm1_dir):
        os.makedirs(hmm1_dir)

    n_models = len(models)
    final_model_files = []
    for count in range(n_models):
        # A single-element M applies to every model; otherwise one entry per model.
        if len(M) > 1:
            n_comps = M[count]
        else:
            n_comps = M[0]
        initial_n_comps = 1
        if n_comps > 1:
            initial_n_comps = 2
        model_name = models[count]
        # Define the GMM as a trivial HMM with a single state in HTK format.
        create_gmm(orig_n_dims, initial_n_comps, feature_type, model_name,
                   covar_type, hmm0_dir)
        initialize_gmm.initialize_gmm_kmeans(model_name, training_list, lab_dir,
                                             hmm0_dir, hmm1_dir)
        if count == 0:
            # Estimate variance floors as percentage of the global variances
            # (only once — they are shared by all models).
            initialize_gmm.estimate_minimum_variances(
                training_list, os.path.join(hmm0_dir, models[count]), hmm0_dir)
            min_var_macro_file = os.path.join(hmm0_dir, 'vFloors')
        train_gmm.prepare_vfloor_macros_file(
            model_file=os.path.join(hmm1_dir, model_name),
            vfloors_file=min_var_macro_file)

        # Increase the number of mixtures by doubling: 4, 8, 16, ... <= n_comps.
        n_mixes = 4
        orig_model_dir = hmm1_dir
        target_model_dir = os.path.join(model_dir, 'hmm4')
        while n_mixes < n_comps + 1:
            if not os.path.exists(target_model_dir):
                os.makedirs(target_model_dir)
            model_file = os.path.join(orig_model_dir, model_name)
            train_gmm.increase_n_components(n_mixes, model_name, model_file,
                                            training_list, lab_dir,
                                            target_model_dir, min_var_macro_file)
            n_mixes *= 2
            orig_model_dir = target_model_dir
            target_model_dir = os.path.join(model_dir,
                                            'hmm{0}'.format(str(n_mixes)))
        final_model_files.append(os.path.join(orig_model_dir, model_name))

    # Gather all HMM definitions into a single mmf file.
    model_file = os.path.join(model_dir, "newMacros")
    with open(model_file, 'w') as mmf:
        for md in final_model_files:
            with open(md, 'r') as md_file:
                mmf.writelines(md_file.readlines())
            mmf.write('\n')

    train_gmm.train_gmm_set(model_list, training_list, lab_dir, model_dir,
                            model_file, n_train_iterations, min_var_macro_file)
    model_file = os.path.join(model_dir, "newMacros")
    # Keep a pre-HLDA copy of the jointly trained macros.
    shutil.copyfile(model_file, model_file + '_no_hlda')
    if apply_hlda:
        if hlda_nuisance_dims > 0:
            train_hlda(model_list, hlda_nuisance_dims, training_list, lab_dir,
                       model_dir, min_var_macro_file)