Example #1
0
def increase_n_components(n_mixes,
                          model_name,
                          model_file,
                          training_list,
                          lab_dir,
                          target_model_dir,
                          min_var_macro_file=None,
                          n_states=1,
                          do_kmeans=False):
    '''
    Increase the number of GMM components via HTK's HHEd "MU" (mixture-up)
    command, then retrain the enlarged model.

    :param n_mixes: target number of Gaussian mixture components
    :param model_name: name of the HMM/GMM definition (used in the HHEd
        command and in the model list file)
    :param model_file: path to the current model definition; its directory
        is handed to HHEd as the model source directory
    :param training_list: path to the list of HTK feature files used for
        retraining
    :param lab_dir: directory holding the matching .lab transcription files
    :param target_model_dir: output directory for the mixed-up, retrained model
    :param min_var_macro_file: optional variance-floor macro file passed
        through to training
    :param n_states: number of emitting states (HTK numbers them 2..n_states+1)
    :param do_kmeans: if True, re-initialize the enlarged GMM with k-means
        before retraining
    '''
    # HHEd config: keep model definitions in distinct files, not a single MMF.
    hhed_cfg_file = os.path.join(target_model_dir, 'hhed.cfg')
    with open(hhed_cfg_file, 'w') as hhed_cfg:
        hhed_cfg.write('KEEPDISTINCT=TRUE')

    # HHEd "mixture up" command; HTK emitting states are indexed from 2.
    hhed_cmd_file = os.path.join(target_model_dir, 'hhed.cmd')
    with open(hhed_cmd_file, 'w') as hhed_cmd:
        if n_states > 1:
            hhed_cmd.write('MU {0} {{{1}.state[2-{2}].mix}}\n'.format(
                str(n_mixes), model_name, str(n_states + 1)))
        else:
            hhed_cmd.write('MU {0} {{{1}.state[2].mix}}\n'.format(
                str(n_mixes), model_name))

    # HHEd operates on the models named in this list file.
    model_list_file = os.path.join(target_model_dir, 'model.list')
    with open(model_list_file, 'w') as model_list:
        model_list.write(model_name)

    model_dir = os.path.split(model_file)[0]
    hhed_args = [
        '-C', hhed_cfg_file, '-d', model_dir, '-M', target_model_dir,
        hhed_cmd_file, model_list_file
    ]
    hhed(hhed_args)

    if do_kmeans:
        initialize_gmm.initialize_gmm_kmeans(model_name, training_list,
                                             lab_dir, target_model_dir,
                                             target_model_dir)
    train_gmm(model_file=os.path.join(target_model_dir, model_name),
              training_list=training_list,
              lab_dir=lab_dir,
              out_model_dir=target_model_dir,
              n_train_iterations=30,
              min_var_macro=min_var_macro_file)
Example #2
0
def increase_n_components(
    n_mixes,
    model_name,
    model_file,
    training_list,
    lab_dir,
    target_model_dir,
    min_var_macro_file=None,
    n_states=1,
    do_kmeans=False,
):
    """
    Increase the number of GMM components via HTK's HHEd "MU" (mixture-up)
    command, then retrain the enlarged model.

    :param n_mixes: target number of Gaussian mixture components
    :param model_name: name of the HMM/GMM definition (used in the HHEd
        command and in the model list file)
    :param model_file: path to the current model definition; its directory
        is handed to HHEd as the model source directory
    :param training_list: path to the list of HTK feature files used for
        retraining
    :param lab_dir: directory holding the matching .lab transcription files
    :param target_model_dir: output directory for the mixed-up, retrained model
    :param min_var_macro_file: optional variance-floor macro file passed
        through to training
    :param n_states: number of emitting states (HTK numbers them 2..n_states+1)
    :param do_kmeans: if True, re-initialize the enlarged GMM with k-means
        before retraining
    """
    # HHEd config: keep model definitions in distinct files, not a single MMF.
    hhed_cfg_file = os.path.join(target_model_dir, "hhed.cfg")
    with open(hhed_cfg_file, "w") as hhed_cfg:
        hhed_cfg.write("KEEPDISTINCT=TRUE")

    # HHEd "mixture up" command; HTK emitting states are indexed from 2.
    hhed_cmd_file = os.path.join(target_model_dir, "hhed.cmd")
    with open(hhed_cmd_file, "w") as hhed_cmd:
        if n_states > 1:
            hhed_cmd.write("MU {0} {{{1}.state[2-{2}].mix}}\n".format(str(n_mixes), model_name, str(n_states + 1)))
        else:
            hhed_cmd.write("MU {0} {{{1}.state[2].mix}}\n".format(str(n_mixes), model_name))

    # HHEd operates on the models named in this list file.
    model_list_file = os.path.join(target_model_dir, "model.list")
    with open(model_list_file, "w") as model_list:
        model_list.write(model_name)

    model_dir = os.path.split(model_file)[0]
    hhed_args = ["-C", hhed_cfg_file, "-d", model_dir, "-M", target_model_dir, hhed_cmd_file, model_list_file]
    hhed(hhed_args)

    if do_kmeans:
        initialize_gmm.initialize_gmm_kmeans(model_name, training_list, lab_dir, target_model_dir, target_model_dir)
    train_gmm(
        model_file=os.path.join(target_model_dir, model_name),
        training_list=training_list,
        lab_dir=lab_dir,
        out_model_dir=target_model_dir,
        n_train_iterations=30,
        min_var_macro=min_var_macro_file,
    )
def train_M_sized_gmm_classifier_incrementally(M, n_dims, feature_type, training_list, model_list,
                                               lab_dir, model_dir, apply_hlda=False, hlda_nuisance_dims=0,
                                               n_train_iterations=10):
    '''
    Train a GMM-based classifier with a known number of components.

    The assumption is that training files are HTK-formatted feature files and that
    there exist HTK-formatted transcription files as well named in the same way
    as the feature files with just a .lab suffix. The lab_dir should correspond to
    a flat structure.

    :param M: list of target component counts; either one entry per model or a
        single entry shared by all models
    :param n_dims: requested dimensionality (the actual dimensionality is read
        from the first feature file's HTK header)
    :param feature_type: HTK feature kind passed to create_gmm
    :param training_list: path to the list of HTK feature files
    :param model_list: path to a file with one model (class) name per line
    :param lab_dir: flat directory of .lab transcription files
    :param model_dir: root output directory; hmmN subdirectories are created
        for each mixture-up stage
    :param apply_hlda: if True, run HLDA estimation after the final training
    :param hlda_nuisance_dims: number of nuisance dimensions for HLDA
    :param n_train_iterations: re-estimation iterations for the final joint
        training pass
    '''
    # Find the dimensionality of the feature vector from the first feature file
    assert(os.path.exists(training_list))
    with open(training_list, 'r') as t_list:
        fea_file = t_list.readline().rstrip('\r\n')
    sampSize = read_htk_header(fea_file)[2]
    # HTK sample size is in bytes, 4 bytes per float component. Use integer
    # division: a plain '/' would produce a float under Python 3.
    orig_n_dims = sampSize // 4
    assert(orig_n_dims < max_dimensions)

    # Read the model (class) names, one per line
    with open(model_list, 'r') as m_list:
        models = [line.rstrip('\r\n') for line in m_list]

    # Directory structure generation
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    hmm0_dir = os.path.join(model_dir, 'hmm0')
    if not os.path.exists(hmm0_dir):
        os.makedirs(hmm0_dir)
    hmm1_dir = os.path.join(model_dir, 'hmm1')
    if not os.path.exists(hmm1_dir):
        os.makedirs(hmm1_dir)

    # Set below on the first model; initialized here so an empty model list
    # cannot raise a NameError at the final training call.
    min_var_macro_file = None

    n_models = len(models)
    final_model_files = []
    for count in range(n_models):
        # Either one component count per model, or a single shared count.
        if len(M) > 1:
            n_comps = M[count]
        else:
            n_comps = M[0]

        # Start from 2 components when we will be mixing up, else stay at 1.
        initial_n_comps = 1
        if n_comps > 1:
            initial_n_comps = 2
        model_name = models[count]

        # Define the GMM as a trivial HMM with a single state in HTK format
        create_gmm(orig_n_dims, initial_n_comps, feature_type, model_name, covar_type, hmm0_dir)
        initialize_gmm.initialize_gmm_kmeans(model_name, training_list, lab_dir, hmm0_dir, hmm1_dir)

        if count == 0:
            # Estimate variance floors as percentage of the global variances (only once)
            initialize_gmm.estimate_minimum_variances(training_list, os.path.join(hmm0_dir, models[count]), hmm0_dir)
            min_var_macro_file = os.path.join(hmm0_dir, 'vFloors')
            train_gmm.prepare_vfloor_macros_file(model_file=os.path.join(hmm1_dir, model_name),
                                                 vfloors_file=min_var_macro_file)

        # Double the number of mixtures per stage (4, 8, 16, ...) until the
        # target component count is reached; each stage writes into hmmN.
        n_mixes = 4
        orig_model_dir = hmm1_dir
        target_model_dir = os.path.join(model_dir, 'hmm4')
        while n_mixes < n_comps + 1:
            if not os.path.exists(target_model_dir):
                os.makedirs(target_model_dir)
            model_file = os.path.join(orig_model_dir, model_name)
            train_gmm.increase_n_components(n_mixes, model_name, model_file, training_list, lab_dir,
                                            target_model_dir, min_var_macro_file)
            n_mixes *= 2
            orig_model_dir = target_model_dir
            target_model_dir = os.path.join(model_dir, 'hmm{0}'.format(str(n_mixes)))

        # orig_model_dir now points at the last stage that was actually trained.
        final_model_files.append(os.path.join(orig_model_dir, model_name))

    # Gather all HMM definitions into a single mmf file
    model_file = os.path.join(model_dir, "newMacros")
    with open(model_file, 'w') as mmf:
        for md in final_model_files:
            with open(md, 'r') as md_file:
                mmf.writelines(md_file.readlines())
            mmf.write('\n')

    # Joint re-estimation of all models, then keep a pre-HLDA copy.
    train_gmm.train_gmm_set(model_list, training_list, lab_dir, model_dir, model_file, n_train_iterations, min_var_macro_file)
    model_file = os.path.join(model_dir, "newMacros")
    shutil.copyfile(model_file, model_file + '_no_hlda')

    if apply_hlda:
        if hlda_nuisance_dims > 0:
            train_hlda(model_list, hlda_nuisance_dims, training_list, lab_dir, model_dir, min_var_macro_file)