Exemplo n.º 1
0
def mixdown_mono(model, root_dir, prev_dir, phone_list):
    """
    Run HHEd to mixdown monophones

    Writes an HHEd script (mix_down.hed) containing one `MD 12` command for
    sil/sp and one `MD 1` command per remaining phone, each addressing
    states 2 .. model.states-1, then runs HHEd on prev_dir/MMF.

    model      -- provides `states` and `local` (1: run through os.system,
                  anything else: hand the command to util.run)
    root_dir   -- directory under which the HMM-1-0 output dir is created
    prev_dir   -- directory holding the input MMF
    phone_list -- path of a file with one phone name per line

    Returns the output directory (root_dir/HMM-1-0).
    """

    output_dir = '%s/HMM-1-0' % root_dir
    util.create_new_dir(output_dir)

    ## Load the monophone list. Blank lines are dropped: they would
    ## otherwise produce an 'MD 1 {.state[...]}' command with no model name.
    with open(phone_list) as fh:
        phones = [line.strip() for line in fh]
    non_sil_phones = [p for p in phones if p and p not in ('sp', 'sil')]

    last_state = model.states - 1

    ## Make the hed script
    mixdown_hed = '%s/mix_down.hed' % output_dir
    with open(mixdown_hed, 'w') as fh:
        fh.write('MD 12 {(sil,sp).state[2-%d].mix}\n' % last_state)
        for phone in non_sil_phones:
            fh.write('MD 1 {%s.state[2-%d].mix}\n' % (phone, last_state))

    hhed_log = '%s/hhed_mixdown.log' % output_dir

    cmd = 'HHEd -A -D -T 1 -H %s/MMF -M %s' % (prev_dir, output_dir)
    cmd += ' %s %s > %s' % (mixdown_hed, phone_list, hhed_log)
    if model.local == 1:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    return output_dir
Exemplo n.º 2
0
def mixdown_mono(model, root_dir, prev_dir, phone_list):
    """
    Run HHEd to mixdown monophones

    An HHEd script is generated in <root_dir>/HMM-1-0 with `MD 12` for the
    sil/sp models and `MD 1` for every other phone read from phone_list
    (one name per line). HHEd is then run with prev_dir/MMF as input and
    the new directory as output; its stdout goes to hhed_mixdown.log.

    The command runs through os.system when model.local == 1 and through
    util.run otherwise. Returns the output directory path.
    """

    output_dir = '%s/HMM-1-0' %root_dir
    util.create_new_dir(output_dir)

    ## Read the monophones, closing the list file as soon as it is consumed.
    ## Empty entries are ignored so that a stray blank line cannot end up as
    ## a nameless model in the script.
    with open(phone_list) as list_fh:
        names = [entry.strip() for entry in list_fh.read().splitlines()]
    non_sil_phones = [n for n in names if n and n not in ('sp', 'sil')]

    ## Make the hed script
    final_emitting = model.states - 1
    script = ['MD 12 {(sil,sp).state[2-%d].mix}\n' %final_emitting]
    for phone in non_sil_phones:
        script.append('MD 1 {%s.state[2-%d].mix}\n' %(phone, final_emitting))

    mixdown_hed = '%s/mix_down.hed' %output_dir
    with open(mixdown_hed, 'w') as hed_fh:
        hed_fh.writelines(script)

    hhed_log = '%s/hhed_mixdown.log' %output_dir

    cmd  = 'HHEd -A -D -T 1 -H %s/MMF -M %s' %(prev_dir, output_dir)
    cmd += ' %s %s > %s' %(mixdown_hed, phone_list, hhed_log)
    if model.local == 1:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    return output_dir
Exemplo n.º 3
0
def init_tri_from_mono(model, root_dir, mono_dir, tri_mlf, mono_list,
                       tri_list):
    """
    Convert a monophone model and triphone mlf to triphones

    Writes an HHEd script (mktri.hed) with one `TI` command per entry of
    mono_list followed by `CL <tri_list>`, and runs HHEd on mono_dir/MMF.

    model     -- provides `local` (true: os.system, false: util.run)
    root_dir  -- created if needed; the HMM-0-0 output dir lives below it
    mono_dir  -- directory holding the input MMF
    tri_mlf   -- not read by this function; kept in the signature
    mono_list -- path of a file with one monophone per line
    tri_list  -- path handed to the `CL` command

    Returns the output directory (root_dir/HMM-0-0).
    """

    ## Create the xword directory and the current output directory
    output_dir = '%s/HMM-0-0' % root_dir
    util.create_new_dir(root_dir)
    util.create_new_dir(output_dir)

    mktri_hed = '%s/mktri.hed' % output_dir
    hhed_log = '%s/hhed_clone_mono.log' % output_dir

    ## Read the monophones up front so the list file is closed promptly.
    ## Blank lines are skipped; they would yield 'TI T_ {().transP}'.
    with open(mono_list) as fh:
        monos = [line.strip() for line in fh]
    monos = [m for m in monos if m]

    ## Create an HHEd script to clone monophones to triphones
    with open(mktri_hed, 'w') as fh:
        for mono in monos:
            fh.write('TI T_%s {(%s).transP}\n' % (mono, mono))
        fh.write('CL %s\n' % tri_list)

    ## Run HHEd to clone monophones and tie transition matrices
    cmd = 'HHEd -A -T 1 -H %s/MMF' % mono_dir
    cmd += ' -M %s' % output_dir
    cmd += ' %s %s > %s' % (mktri_hed, mono_list, hhed_log)

    if model.local:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    return output_dir
Exemplo n.º 4
0
def init_tri_from_mono(model, root_dir, mono_dir, tri_mlf, mono_list, tri_list):
    """
    Convert a monophone model and triphone mlf to triphones

    Generates <root_dir>/HMM-0-0/mktri.hed (a `TI` command for each
    monophone named in mono_list, then `CL <tri_list>`) and runs HHEd with
    it against mono_dir/MMF, logging to hhed_clone_mono.log.

    tri_mlf is accepted but not used here. The command is executed with
    os.system when model.local is true, otherwise through util.run.
    Returns the output directory path.
    """

    ## Create the xword directory and the current output directory
    output_dir = '%s/HMM-0-0' %root_dir
    util.create_new_dir(root_dir)
    util.create_new_dir(output_dir)

    mktri_hed = '%s/mktri.hed' %output_dir
    hhed_log = '%s/hhed_clone_mono.log' %output_dir

    ## Create an HHEd script to clone monophones to triphones.
    ## Both files are managed by `with` so neither handle is left open, and
    ## empty lines of the list are ignored instead of becoming a nameless
    ## 'TI' command.
    with open(mono_list) as src, open(mktri_hed, 'w') as dst:
        for line in src:
            mono = line.strip()
            if not mono:
                continue
            dst.write('TI T_%s {(%s).transP}\n' %(mono, mono))
        dst.write('CL %s\n' %tri_list)

    ## Run HHEd to clone monophones and tie transition matrices
    cmd  = 'HHEd -A -T 1 -H %s/MMF' %mono_dir
    cmd += ' -M %s' %output_dir
    cmd += ' %s %s > %s' %(mktri_hed, mono_list, hhed_log)

    if model.local:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    return output_dir
Exemplo n.º 5
0
def mixup(model, root_dir, prev_dir, model_list, mix_size, estimateVarFloor=0):
    """
    Build and run the HHEd script that mixes up to mix_size gaussians.

    The script and the HHEd output both go to <root_dir>/HMM-<mix_size>-0,
    which is also the return value. When estimateVarFloor is true the
    script starts with 'LS <prev_dir>/stats' and 'FA 0.1'.
    """

    output_dir = '%s/HMM-%d-%d' % (root_dir, mix_size, 0)
    util.create_new_dir(output_dir)

    ## Assemble the hed script line by line, then write it in one go
    mix_hed = '%s/mix_%d.hed' % (output_dir, mix_size)
    hed = open(mix_hed, 'w')

    script = []
    if estimateVarFloor:
        script.append('LS %s/stats\n' % prev_dir)
        script.append('FA 0.1\n')

    ## sil/sp are mixed up to twice mix_size, every model to mix_size
    last_state = model.states - 1
    script.append('MU %d {(sil,sp).state[2-%d].mix}\n' %
                  (2 * mix_size, last_state))
    script.append('MU %d {*.state[2-%d].mix}\n' % (mix_size, last_state))

    hed.writelines(script)
    hed.close()

    ## HHEd reads prev_dir/MMF, writes into output_dir, stdout to the log
    hhed_log = '%s/hhed_mix.log' % output_dir
    cmd = ''.join([
        'HHEd -A -D -T 1 -H %s/MMF -M %s' % (prev_dir, output_dir),
        ' %s %s > %s' % (mix_hed, model_list, hhed_log),
    ])

    if model.local != 1:
        util.run(cmd, output_dir)
    else:
        os.system(cmd)

    return output_dir
Exemplo n.º 6
0
def mixup(model, root_dir, prev_dir, model_list, mix_size, estimateVarFloor=0):
    """
    Run HHEd to initialize a mixup to mix_size gaussians

    Creates <root_dir>/HMM-<mix_size>-0, writes mix_<mix_size>.hed there
    and runs HHEd on prev_dir/MMF with that script. With a true
    estimateVarFloor, 'LS <prev_dir>/stats' and 'FA 0.1' come first in the
    script. Returns the directory that was created.
    """

    target_dir = '%s/HMM-%d-%d' %(root_dir, mix_size, 0)
    util.create_new_dir(target_dir)

    ## Make the hed script
    script_path = '%s/mix_%d.hed' %(target_dir, mix_size)
    out = open(script_path, 'w')

    ## Optional variance-floor preamble
    preamble = ''
    if estimateVarFloor:
        preamble = 'LS %s/stats\n' %prev_dir + 'FA 0.1\n'
    out.write(preamble)

    ## One MU command for sil/sp (2*mix_size) and one for all models
    emitting_end = model.states-1
    out.write('MU %d {(sil,sp).state[2-%d].mix}\n' %(2*mix_size, emitting_end))
    out.write('MU %d {*.state[2-%d].mix}\n' %(mix_size, emitting_end))
    out.close()

    log_path = '%s/hhed_mix.log' %target_dir

    hhed_cmd = 'HHEd -A -D -T 1 -H %s/MMF -M %s' %(prev_dir, target_dir)
    hhed_cmd = hhed_cmd + ' %s %s > %s' %(script_path, model_list, log_path)

    ## local == 1 means run right here, otherwise delegate to util.run
    run_here = (model.local == 1)
    if run_here:
        os.system(hhed_cmd)
    else:
        util.run(hhed_cmd, target_dir)

    return target_dir
Exemplo n.º 7
0
def diagonalize(model, output_dir, model_dir, model_list, mlf_file, mix_size):
    """
    Diagonalize output distributions

    Writes two files into output_dir -- an HTK config (config.diag, with
    HADAPT:TRANSKIND = SEMIT) and a one-class base class definition
    (global) -- and then calls run_iter once (iteration 0) with extra
    options pointing at them.

    model      -- passed to run_iter; `states` is read to find the last
                  state covered by the base class
    output_dir -- created here; receives the config and base class files
    model_dir, mlf_file, model_list, mix_size -- forwarded to run_iter

    Returns (hmm_dir, likelihood), the first and third values returned by
    run_iter.
    """
    util.create_new_dir(output_dir)

    diag_config = '%s/config.diag' %output_dir
    global_class = '%s/global' %output_dir

    with open(diag_config, 'w') as fh:
        fh.write('HADAPT:TRANSKIND = SEMIT\n')
        fh.write('HADAPT:USEBIAS = FALSE\n')
        fh.write('HADAPT:BASECLASS = global\n')
        fh.write('HADAPT:SPLITTHRESH = 0.0\n')
        fh.write('HADAPT:MAXXFORMITER = 100\n')
        fh.write('HADAPT:MAXSEMITIEDITER = 20\n')
        fh.write('HADAPT:TRACE = 61\n')
        fh.write('HMODEL:TRACE = 512\n')
        fh.write('HADAPT: SEMITIED2INPUTXFORM = TRUE\n')

    ## The class covers mixtures 1 .. 2*mix_size of states 2 .. states-1.
    ## The state range used to be hard-coded as 2-4; it is now derived from
    ## model.states like the other HHEd scripts, which yields the same
    ## 'state[2-4]' when model.states is 5.
    max_mix = 2 * mix_size
    last_state = model.states - 1
    with open(global_class, 'w') as fh:
        fh.write('~b "global"\n')
        fh.write('<MMFIDMASK> *\n')
        fh.write('<PARAMETERS> MIXBASE\n')
        fh.write('<NUMCLASSES> 1\n')
        fh.write('<CLASS> 1 {*.state[2-%d].mix[1-%d]}\n' %(last_state, max_mix))

    extra = ' -C %s -J %s -K %s/HMM-%d-0 -u stw' %(diag_config, output_dir, output_dir, mix_size)

    ## The middle value returned by run_iter is not needed here
    hmm_dir, _, likelihood = run_iter(model, output_dir, model_dir, mlf_file, model_list, mix_size, 0, extra)

    return hmm_dir, likelihood
Exemplo n.º 8
0
def tie_states(model, output_dir, model_dir, mono_list, tri_list, tied_list):
    """
    Tie HMM states using decision tree clustering

    Writes every triphone that can be built from the phones in mono_list to
    <model.exp>/all_tri.list, writes an HHEd script (tree.hed) into
    output_dir and runs HHEd with it on model_dir/MMF.

    model      -- provides `exp`, `tree_questions` (path of a file whose
                  text is copied verbatim into the script), `states` and
                  `local` (1: os.system, otherwise util.run)
    output_dir -- created here; receives tree.hed, trees and the HHEd log
    model_dir  -- directory holding the input MMF and the stats file
    mono_list  -- path of a file with one monophone per line
    tri_list   -- model list passed to HHEd on the command line
    tied_list  -- path given to the `CO` command

    Returns output_dir.
    """

    util.create_new_dir(output_dir)
    tree_hed = '%s/tree.hed' % output_dir
    all_tri_list = '%s/all_tri.list' % model.exp
    tree_output = '%s/trees' % output_dir
    hhed_log = '%s/hhed_cluster.log' % output_dir

    ## Decision tree parameters (arguments of the RO and TB commands)
    ro = 200
    tb = 750

    ## Load the monophones. Blank lines are dropped so they cannot turn
    ## into triphones with an empty phone name or nameless TB commands.
    with open(mono_list) as fh:
        phones = [line.strip() for line in fh]
    non_sp_phones = [p for p in phones if p and p not in ('sp', 'sil')]

    ## Create the full list of possible triphones
    with open(all_tri_list, 'w') as fh:
        fh.write('sp\n')
        fh.write('sil\n')
        for p1 in non_sp_phones:
            fh.write('sil-%s+sil\n' % p1)
            for p2 in non_sp_phones:
                fh.write('sil-%s+%s\n' % (p1, p2))
                fh.write('%s-%s+sil\n' % (p2, p1))
                for p3 in non_sp_phones:
                    fh.write('%s-%s+%s\n' % (p2, p1, p3))

    ## Read the question set with a managed handle
    with open(model.tree_questions) as fh:
        questions = fh.read()

    ## Set up decision tree clustering
    with open(tree_hed, 'w') as fh:
        fh.write('RO %d %s/stats\n' % (ro, model_dir))
        fh.write('TR 0\n')
        fh.write('%s\n' % questions)
        fh.write('TR 12\n')
        for p in non_sp_phones:
            ## States 2 .. model.states-1, i.e. all but the first and last
            for s in range(2, model.states):
                fh.write('TB %d "ST_%s_%d_" {(%s,*-%s+*,%s+*,*-%s).state[%d]}\n' %
                         (tb, p, s, p, p, p, p, s))
        fh.write('TR 1\n')
        fh.write('AU "%s"\n' % all_tri_list)
        fh.write('CO "%s"\n' % tied_list)
        fh.write('ST "%s"\n' % tree_output)

    ## Use HHEd to cluster
    cmd = 'HHEd -A -T 1 -H %s/MMF' % model_dir
    cmd += ' -M %s' % output_dir
    cmd += ' %s %s > %s' % (tree_hed, tri_list, hhed_log)

    if model.local == 1:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    return output_dir
Exemplo n.º 9
0
def tie_states(model, output_dir, model_dir, mono_list, tri_list, tied_list):
    """
    Tie HMM states using decision tree clustering

    Three steps: (1) enumerate all triphones over the non-silence phones
    of mono_list into <model.exp>/all_tri.list, (2) write tree.hed in
    output_dir (RO, the contents of model.tree_questions, one TB command
    per phone and state, then AU/CO/ST), (3) run HHEd on model_dir/MMF
    with tri_list as the model list, logging to hhed_cluster.log.

    HHEd is started with os.system when model.local == 1 and through
    util.run otherwise. Returns output_dir.
    """

    util.create_new_dir(output_dir)
    tree_hed = '%s/tree.hed' %output_dir
    all_tri_list = '%s/all_tri.list' %model.exp
    tree_output = '%s/trees' %output_dir
    hhed_log = '%s/hhed_cluster.log' %output_dir

    ## Decision tree parameters (arguments of the RO and TB commands)
    ro = 200
    tb = 750

    ## Read the monophone list through a context manager so the handle is
    ## released, and ignore empty entries (a blank line would otherwise be
    ## treated as a phone with an empty name).
    with open(mono_list) as list_fh:
        phones = [entry.strip() for entry in list_fh.read().splitlines()]
    non_sp_phones = [p for p in phones if p and p not in ('sp', 'sil')]

    ## Create the full list of possible triphones
    with open(all_tri_list, 'w') as tri_fh:
        emit = tri_fh.write
        emit('sp\n')
        emit('sil\n')
        for centre in non_sp_phones:
            emit('sil-%s+sil\n' %centre)
            for other in non_sp_phones:
                emit('sil-%s+%s\n' %(centre, other))
                emit('%s-%s+sil\n' %(other, centre))
                for right in non_sp_phones:
                    emit('%s-%s+%s\n' %(other, centre, right))

    ## Set up decision tree clustering
    with open(tree_hed, 'w') as hed_fh:
        hed_fh.write('RO %d %s/stats\n' %(ro, model_dir))
        hed_fh.write('TR 0\n')
        with open(model.tree_questions) as q_fh:
            hed_fh.write('%s\n' %q_fh.read())
        hed_fh.write('TR 12\n')
        ## range(2, model.states) yields states 2 .. model.states-1, the
        ## same values the former range(1, states+1)[1:-1] produced
        for p in non_sp_phones:
            for s in range(2, model.states):
                hed_fh.write('TB %d "ST_%s_%d_" {(%s,*-%s+*,%s+*,*-%s).state[%d]}\n' %(tb,p,s,p,p,p,p,s))
        hed_fh.write('TR 1\n')
        hed_fh.write('AU "%s"\n' %all_tri_list)
        hed_fh.write('CO "%s"\n' %tied_list)
        hed_fh.write('ST "%s"\n' %tree_output)

    ## Use HHEd to cluster
    cmd  = 'HHEd -A -T 1 -H %s/MMF' %model_dir
    cmd += ' -M %s' %output_dir
    cmd += ' %s %s > %s' %(tree_hed, tri_list, hhed_log)

    if model.local == 1:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    return output_dir
Exemplo n.º 10
0
def mono_to_tri(model, root_dir, mono_dir, phone_mlf, tri_mlf, mono_list,
                tri_list):
    """
    Convert a monophone model and phone mlf to triphones

    Runs two tools in sequence:
      1. HLEd with a generated script (mktri_cross.led) on phone_mlf,
         writing the new label file to tri_mlf (-i) and the list of
         triphones to tri_list (-n).
      2. HHEd with a generated script (mktri.hed: one `TI` command per
         monophone, then `CL <tri_list>`) on mono_dir/MMF.

    model     -- provides `local` (true: os.system, false: util.run)
    root_dir  -- created if needed; the HMM-0-0 output dir lives below it
    mono_list -- path of a file with one monophone per line

    Returns the output directory (root_dir/HMM-0-0).
    """

    ## Create the xword directory and the current output directory
    output_dir = '%s/HMM-0-0' % root_dir
    util.create_new_dir(root_dir)
    util.create_new_dir(output_dir)

    mktri_led = '%s/mktri_cross.led' % output_dir
    mktri_hed = '%s/mktri.hed' % output_dir
    hled_log = '%s/hled_make_tri.log' % output_dir
    hhed_log = '%s/hhed_clone_mono.log' % output_dir

    ## Create an HLEd script
    with open(mktri_led, 'w') as fh:
        fh.write('NB sp\n')
        fh.write('TC\n')
        fh.write('IT\n')
        fh.write('CH sil * sil *\n')
        fh.write('CH sp  * sp  *\n')
        fh.write('ME sil sil sil sil\n')
        fh.write('ME sil sil sil\n')
        fh.write('ME sil sp sil\n')

    ## Create a new alignment in tri_mlf and output used triphones to tri_list
    cmd = 'HLEd -A -n %s' % tri_list
    cmd += ' -i %s' % tri_mlf
    cmd += ' %s %s > %s' % (mktri_led, phone_mlf, hled_log)

    if model.local:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    ## Read the monophones with a managed handle. Blank lines are skipped;
    ## they would yield 'TI T_ {().transP}'.
    with open(mono_list) as fh:
        monos = [line.strip() for line in fh]
    monos = [m for m in monos if m]

    ## Create an HHEd script to clone monophones to triphones
    with open(mktri_hed, 'w') as fh:
        for mono in monos:
            fh.write('TI T_%s {(%s).transP}\n' % (mono, mono))
        fh.write('CL %s\n' % tri_list)

    ## Run HHEd to clone monophones and tie transition matrices
    cmd = 'HHEd -A -T 1 -H %s/MMF' % mono_dir
    cmd += ' -M %s' % output_dir
    cmd += ' %s %s > %s' % (mktri_hed, mono_list, hhed_log)

    if model.local:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    return output_dir
Exemplo n.º 11
0
def mono_to_tri(model, root_dir, mono_dir, phone_mlf, tri_mlf, mono_list, tri_list):
    """
    Convert a monophone model and phone mlf to triphones

    First HLEd is run on phone_mlf with a generated edit script; it writes
    the resulting labels to tri_mlf and the triphones it used to tri_list.
    Then HHEd is run on mono_dir/MMF with a generated script holding one
    `TI` command per entry of mono_list and a final `CL <tri_list>`.

    Scripts, logs and the HHEd output all go to <root_dir>/HMM-0-0, which
    is returned. Each command is executed with os.system when model.local
    is true and through util.run otherwise.
    """

    ## Create the xword directory and the current output directory
    output_dir = '%s/HMM-0-0' %root_dir
    util.create_new_dir(root_dir)
    util.create_new_dir(output_dir)

    mktri_led = '%s/mktri_cross.led' %output_dir
    mktri_hed = '%s/mktri.hed' %output_dir
    hled_log = '%s/hled_make_tri.log' %output_dir
    hhed_log = '%s/hhed_clone_mono.log' %output_dir

    ## Create an HLEd script
    led_commands = [
        'NB sp\n',
        'TC\n',
        'IT\n',
        'CH sil * sil *\n',
        'CH sp  * sp  *\n',
        'ME sil sil sil sil\n',
        'ME sil sil sil\n',
        'ME sil sp sil\n',
    ]
    with open(mktri_led, 'w') as led_fh:
        led_fh.writelines(led_commands)

    ## Create a new alignment in tri_mlf and output used triphones to tri_list
    cmd  = 'HLEd -A -n %s' %tri_list
    cmd += ' -i %s' %tri_mlf
    cmd += ' %s %s > %s' %(mktri_led, phone_mlf, hled_log)

    if model.local:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    ## Create an HHEd script to clone monophones to triphones.
    ## Both handles are closed by the `with` block, and empty lines of the
    ## list are ignored instead of becoming a nameless 'TI' command.
    with open(mono_list) as src, open(mktri_hed, 'w') as dst:
        for line in src:
            mono = line.strip()
            if not mono:
                continue
            dst.write('TI T_%s {(%s).transP}\n' %(mono, mono))
        dst.write('CL %s\n' %tri_list)

    ## Run HHEd to clone monophones and tie transition matrices
    cmd  = 'HHEd -A -T 1 -H %s/MMF' %mono_dir
    cmd += ' -M %s' %output_dir
    cmd += ' %s %s > %s' %(mktri_hed, mono_list, hhed_log)

    if model.local:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    return output_dir
Exemplo n.º 12
0
    def test(self,
             gaussians=1,
             iter=8,
             mmi=False,
             diag=False,
             xword_id='',
             output_dir=None):
        """
        Run the test pipeline: optional feature coding, then decoding.

        The two stages are switched by self.test_pipeline['coding'] and
        self.test_pipeline['test']. gaussians, iter, mmi, diag, xword_id
        and output_dir are passed through unchanged to self.decode.
        Progress is reported through log() on self.logfh.
        """

        ## Copy config file to the experiment dir; the handle is closed
        ## before anything else runs so the file is complete on disk
        config_output = '%s/config' % self.exp
        with open(config_output, 'w') as config_fh:
            self.config.write(config_fh)
        log(self.logfh, 'TESTING with config [%s]' % config_output)

        if self.test_pipeline['coding']:
            import coding
            coding_dir = '%s/Coding' % self.exp
            util.create_new_dir(coding_dir)
            count = coding.wav_to_mfc(self, coding_dir, self.mfc_list)
            log(self.logfh, 'CODING finished [%d files]' % count)

        if self.test_pipeline['test']:
            import dict_and_lm
            start_time = time.time()
            ## The model argument is this object, the same one handed to
            ## coding.wav_to_mfc above. The bare name `model` used here
            ## before is not defined in this method.
            num_utts, _ = dict_and_lm.make_mlf_from_transcripts(
                self,
                self.dict,
                self.setup,
                self.data,
                self.word_mlf,
                self.mfc_list,
                skip_oov=True)
            log(self.logfh,
                'wrote word mlf [%d utts] [%s]' % (num_utts, self.word_mlf))

            self.decode(self, self.mfc_list, self.word_mlf, self.lm,
                        gaussians, iter, mmi, diag, xword_id, output_dir)
            total_time = time.time() - start_time
            log(self.logfh,
                'TESTING finished; secs elapsed [%1.2f]' % total_time)
Exemplo n.º 13
0
    def test(self, gaussians=1, iter=8, mmi=False, diag=False, xword_id='', output_dir=None):
        """
        Write the config to the experiment dir and run the enabled test stages.

        'coding' stage: coding.wav_to_mfc into <self.exp>/Coding.
        'test' stage:   build the word mlf from the transcripts, then call
                        self.decode with the given gaussians, iter, mmi,
                        diag, xword_id and output_dir, and log the elapsed
                        time.
        Stages are enabled through self.test_pipeline.
        """

        ## Copy config file to the experiment dir (handle closed right away)
        config_output = '%s/config' %self.exp
        with open(config_output, 'w') as out:
            self.config.write(out)
        log(self.logfh, 'TESTING with config [%s]' %config_output)

        if self.test_pipeline['coding']:
            import coding
            coding_dir = '%s/Coding' %self.exp
            util.create_new_dir(coding_dir)
            count = coding.wav_to_mfc(self, coding_dir, self.mfc_list)
            log(self.logfh, 'CODING finished [%d files]' %count)

        if self.test_pipeline['test']:
            import dict_and_lm
            start_time = time.time()
            ## `self` replaces the undefined bare name `model`: this object
            ## is what the coding stage above already passes as the model.
            num_utts, _ = dict_and_lm.make_mlf_from_transcripts(self, self.dict, self.setup, self.data, self.word_mlf, self.mfc_list, skip_oov=True)
            log(self.logfh, 'wrote word mlf [%d utts] [%s]' %(num_utts, self.word_mlf))

            self.decode(self, self.mfc_list, self.word_mlf, self.lm, gaussians, iter, mmi, diag, xword_id, output_dir)
            total_time = time.time() - start_time
            log(self.logfh, 'TESTING finished; secs elapsed [%1.2f]' %total_time)
Exemplo n.º 14
0
def diagonalize(model, output_dir, model_dir, model_list, mlf_file, mix_size):
    """
    Diagonalize output distributions

    Creates output_dir, writes config.diag (HADAPT/HMODEL settings, with
    TRANSKIND = SEMIT) and a base class file named `global` into it, and
    runs a single run_iter pass (iteration number 0) whose extra options
    are '-C <config> -J <output_dir> -K <output_dir>/HMM-<mix_size>-0
    -u stw'.

    Returns a (hmm_dir, likelihood) pair taken from run_iter's result.
    """
    util.create_new_dir(output_dir)

    diag_config = '%s/config.diag' % output_dir
    global_class = '%s/global' % output_dir

    config_lines = [
        'HADAPT:TRANSKIND = SEMIT\n',
        'HADAPT:USEBIAS = FALSE\n',
        'HADAPT:BASECLASS = global\n',
        'HADAPT:SPLITTHRESH = 0.0\n',
        'HADAPT:MAXXFORMITER = 100\n',
        'HADAPT:MAXSEMITIEDITER = 20\n',
        'HADAPT:TRACE = 61\n',
        'HMODEL:TRACE = 512\n',
        'HADAPT: SEMITIED2INPUTXFORM = TRUE\n',
    ]
    with open(diag_config, 'w') as fh:
        fh.writelines(config_lines)

    ## One class holding mixtures 1 .. 2*mix_size of states
    ## 2 .. model.states-1. The upper state used to be the literal 4; taking
    ## it from the model keeps this in step with the HHEd scripts in this
    ## file and gives the identical line when model.states is 5.
    max_mix = 2 * mix_size
    class_lines = [
        '~b "global"\n',
        '<MMFIDMASK> *\n',
        '<PARAMETERS> MIXBASE\n',
        '<NUMCLASSES> 1\n',
        '<CLASS> 1 {*.state[2-%d].mix[1-%d]}\n' % (model.states - 1, max_mix),
    ]
    with open(global_class, 'w') as fh:
        fh.writelines(class_lines)

    extra = ' -C %s -J %s -K %s/HMM-%d-0 -u stw' % (diag_config, output_dir,
                                                    output_dir, mix_size)

    ## run_iter returns three values; the second is not used here
    hmm_dir, _, likelihood = run_iter(model, output_dir, model_dir, mlf_file,
                                      model_list, mix_size, 0, extra)

    return hmm_dir, likelihood
Exemplo n.º 15
0
    def train(self):

        ## Copy config file to the experiment dir
        config_output = '%s/config' %self.exp
        self.config.write(open(config_output, 'w'))
        log(self.logfh, 'TRAINING with config [%s]' %config_output)

        if self.train_pipeline['coding']:
            log(self.logfh, 'CODING started')
            import coding
            util.create_new_dir(self.coding_root)
            coding.create_config(self)
            count = coding.wav_to_mfc(self, self.coding_root, self.mfc_list)
            os.system('cp %s %s/mfc.list.original' %(self.mfc_list, self.misc))
            log(self.logfh, 'wrote mfc files [%d]' %count)
            log(self.logfh, 'CODING finished')

        if self.train_pipeline['lm']:
            log(self.logfh, 'MLF/LM/DICT started')
            import dict_and_lm
            phone_set = dict_and_lm.fix_cmu_dict(self.orig_dict, self.htk_dict)
            num_utts, words = dict_and_lm.make_mlf_from_transcripts(self, self.htk_dict, self.setup, self.data, self.word_mlf, self.mfc_list)
            log(self.logfh, 'wrote word mlf [%d utts] [%s]' %(num_utts, self.word_mlf))
            os.system('cp %s %s/mfc.list.filtered.by.dict' %(self.mfc_list, self.misc))
            num_entries = dict_and_lm.make_train_dict(self.htk_dict, self.train_dict, words)
            dict_and_lm.make_decode_dict(self.htk_dict, self.decode_dict, words)
            log(self.logfh, 'wrote training dictionary [%d entries] [%s]' %(num_entries, self.train_dict))

            util.create_new_dir(self.lm_dir)
            train_vocab = '%s/vocab' %self.lm_dir
            ppl = dict_and_lm.build_lm_from_mlf(self, self.word_mlf, self.train_dict, train_vocab, self.lm_dir, self.lm, self.lm_order)
            log(self.logfh, 'wrote lm [%s] training ppl [%1.2f]' %(self.lm, ppl))
            log(self.logfh, 'MLF/LM/DICT finished')
            
        if self.train_pipeline['flat_start']:
            log(self.logfh, 'FLAT START started')
            import init_hmm
            init_hmm.word_to_phone_mlf(self, self.train_dict, self.word_mlf, self.phone_mlf, self.phone_list)
            log(self.logfh, 'wrote phone mlf [%s]' %self.phone_mlf)

            os.system('cp %s %s/phone.mlf.from.dict' %(self.phone_mlf, self.misc))
            os.system('bzip2 -f %s/phone.mlf.from.dict' %self.misc)
            init_hmm.make_proto_hmm(self, self.mfc_list, self.proto_hmm)
            hmm_dir, num_mfcs = init_hmm.initialize_hmms(self, self.mono_root, self.mfc_list, self.phone_list, self.proto_hmm)
            log(self.logfh, 'initialized an HMM for each phone in [%s]' %hmm_dir)
            log(self.logfh, 'used [%d] mfc files to compute variance floor' %num_mfcs)

            import train_hmm
            for iter in range(1, self.initial_mono_iters+1):
                hmm_dir, k, L = train_hmm.run_iter(self, self.mono_root, hmm_dir, self.phone_mlf, self.phone_list, 1, iter, '')
                log(self.logfh, 'ran an iteration of BW in [%s] lik/fr [%1.4f]' %(hmm_dir, L))

            align_config = '%s/config.align' %self.mono_root
            fh = open(align_config, 'w')
            fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
            fh.close()
            
            align_dir = train_hmm.align(self, self.mono_root, self.mfc_list, hmm_dir, self.word_mlf, self.phone_mlf, self.phone_list, self.train_dict, align_config)
            log(self.logfh, 'aligned with model in [%s], wrote phone mlf [%s]' %(hmm_dir, self.phone_mlf))
            os.system('cp %s %s/mfc.list.filtered.by.mono.align' %(self.mfc_list, self.misc))

            os.system('cp %s %s/phone.mlf.from.mono.align' %(self.phone_mlf, self.misc))
            os.system('bzip2 -f %s/phone.mlf.from.mono.align' %self.misc)

            for iter in range(self.initial_mono_iters+1, self.initial_mono_iters+1+self.mono_iters):
                hmm_dir, k, L = train_hmm.run_iter(self, self.mono_root, hmm_dir, self.phone_mlf, self.phone_list, 1, iter, '')
                log(self.logfh, 'ran an iteration of BW in [%s] lik/fr [%1.4f]' %(hmm_dir, L))

            log(self.logfh, 'FLAT START finished')

        if self.train_pipeline['mixup_mono']:
            log(self.logfh, 'MIXUP MONO started')
            import train_hmm

            hmm_dir = '%s/HMM-%d-%d' %(self.mono_root, 1, self.initial_mono_iters+self.mono_iters)
            
            ## mixup everything
            for mix_size in self.mono_mixup_schedule:
                hmm_dir = train_hmm.mixup(self, self.mixup_mono_root, hmm_dir, self.phone_list, mix_size)
                log(self.logfh, 'mixed up to [%d] in [%s]' %(mix_size, hmm_dir))
                for iter in range(1, self.mono_iters+1):
                    hmm_dir, k, L = train_hmm.run_iter(self, self.mixup_mono_root, hmm_dir, self.phone_mlf, self.phone_list, mix_size, iter, '')
                    log(self.logfh, 'ran an iteration of BW in [%s] lik/fr [%1.4f]' %(hmm_dir, L))

            log(self.logfh, 'MIXUP MONO finished')

        if self.train_pipeline['mixdown_mono']:
            log(self.logfh, 'MIXDOWN MONO started')
            import train_hmm

            num_gaussians = self.mono_mixup_schedule[-1]
            hmm_dir = '%s/HMM-%d-%d' %(self.mixup_mono_root, num_gaussians, self.mono_iters)
            train_hmm.mixdown_mono(self, self.mixdown_mono_root, hmm_dir, self.phone_list)

            log(self.logfh, 'MIXDOWN MONO finished')

        if self.train_pipeline['mono_to_tri']:
            log(self.logfh, 'MONO TO TRI started')
            import train_hmm

            if self.train_pipeline['mixdown_mono']:
                mono_final_dir = '%s/HMM-1-0' %self.mixdown_mono_root
            else:
                mono_final_dir = '%s/HMM-%d-%d' %(self.mono_root, 1, self.initial_mono_iters+self.mono_iters)
                
            hmm_dir = train_hmm.mono_to_tri(self, self.xword_root, mono_final_dir, self.phone_mlf, self.tri_mlf, self.phone_list, self.tri_list)
            log(self.logfh, 'initialized triphone models in [%s]' %hmm_dir)
            log(self.logfh, 'created triphone mlf [%s]' %self.tri_mlf)

            os.system('cp %s %s/tri.mlf.from.mono.align' %(self.tri_mlf, self.misc))
            os.system('bzip2 -f %s/tri.mlf.from.mono.align' %self.misc)
            os.system('cp %s %s/tri.list.from.mono.align' %(self.tri_list, self.misc))

            for iter in range(1, self.initial_tri_iters+1):
                hmm_dir, k, L = train_hmm.run_iter(self, self.xword_root, hmm_dir, self.tri_mlf, self.tri_list, 1, iter, '')
                log(self.logfh, 'ran an iteration of BW in [%s] lik/fr [%1.4f]' %(hmm_dir, L))
            
            xword_tie_dir = '%s/HMM-%d-%d' %(self.xword_root, 1, self.initial_tri_iters+1)
            hmm_dir = train_hmm.tie_states_search(self, xword_tie_dir, hmm_dir, self.phone_list, self.tri_list, self.tied_list)
            log(self.logfh, 'tied states in [%s]' %hmm_dir)

            os.system('cp %s %s/tied.list.initial' %(self.tied_list, self.misc))

            hmm_dir = '%s/HMM-%d-%d' %(self.xword_root, 1, self.initial_tri_iters+1)
            for iter in range(self.initial_tri_iters+2, self.initial_tri_iters+1+self.tri_iters+1):
                hmm_dir, k, L = train_hmm.run_iter(self, self.xword_root, hmm_dir, self.tri_mlf, self.tied_list, 1, iter, '')
                log(self.logfh, 'ran an iteration of BW in [%s] lik/fr [%1.4f]' %(hmm_dir, L))

            log(self.logfh, 'MONO TO TRI finished')

        if self.train_pipeline['mixup_tri']:
            log(self.logfh, 'MIXUP TRI started')
            import train_hmm

            ## mixup everything
            start_gaussians = 1
            start_iter = self.initial_tri_iters+self.tri_iters+1
            hmm_dir = '%s/HMM-%d-%d' %(self.xword_root, start_gaussians, start_iter)
            for mix_size in self.tri_mixup_schedule:
                if mix_size==2:
                    hmm_dir = train_hmm.mixup(self, self.xword_root, hmm_dir, self.tied_list, mix_size, estimateVarFloor=1)
                else:
                    hmm_dir = train_hmm.mixup(self, self.xword_root, hmm_dir, self.tied_list, mix_size)
                log(self.logfh, 'mixed up to [%d] in [%s]' %(mix_size, hmm_dir))
                for iter in range(1, self.tri_iters_per_split+1):
                    hmm_dir, k, L = train_hmm.run_iter(self, self.xword_root, hmm_dir, self.tri_mlf, self.tied_list, mix_size, iter, '')
                    log(self.logfh, 'ran an iteration of BW in [%s] lik/fr [%1.4f]' %(hmm_dir, L))
            log(self.logfh, 'MIXUP TRI finished')

        if self.train_pipeline['align_with_xword']:
            log(self.logfh, 'XWORD ALIGN started')
            import train_hmm

            align_config = '%s/config.align' %self.xword_root
            train_hmm.make_hvite_xword_config(self, align_config, 'MFCC_0_D_A_Z')
            num_gaussians = self.tri_mixup_schedule[-1]
            iter_num = self.tri_iters_per_split
            hmm_dir = '%s/HMM-%d-%d' %(self.xword_root, num_gaussians, iter_num)
            realigned_mlf = '%s/raw_tri_xword_realigned.mlf' %self.misc

            # Use the original, mfc list that has prons for every word
            os.system('cp %s/mfc.list.filtered.by.dict %s' %(self.misc, self.mfc_list))
            
            align_dir = train_hmm.align(self, self.xword_root, self.mfc_list, hmm_dir, self.word_mlf, realigned_mlf, self.tied_list, self.train_dict, align_config)
            log(self.logfh, 'aligned with model in [%s], tri mlf [%s]' %(hmm_dir, realigned_mlf))

            # Because of state tying, the triphones in the mlf will only be
            # valid for this state tying. Strip down to monophones, the
            # correct triphones will be created later in mono_to_tri
            train_hmm.map_tri_to_mono(self, align_dir, realigned_mlf, self.phone_mlf)
            os.system('cp %s %s/phone.mlf.from.xword.align' %(self.phone_mlf, self.misc))
            os.system('bzip2 -f %s/phone.mlf.from.xword.align' %self.misc)
            os.system('bzip2 -f %s' %realigned_mlf)

            log(self.logfh, 'XWORD ALIGN finished')


        if self.train_pipeline['mono_to_tri_from_xword']:
            log(self.logfh, 'MONO TO TRI FROM XWORD started')
            import train_hmm

            #Assume that midown mono happened?
            mono_final_dir = '%s/HMM-1-0' %self.mixdown_mono_root

            hmm_dir = train_hmm.mono_to_tri(self, self.xword_1_root, mono_final_dir, self.phone_mlf, self.tri_mlf, self.phone_list, self.tri_list)
            log(self.logfh, 'initialized triphone models in [%s]' %hmm_dir)

            os.system('cp %s %s/tri.mlf.from.xword.align' %(self.tri_mlf, self.misc))
            os.system('bzip2 -f %s/tri.mlf.from.xword.align' %self.misc)
            os.system('cp %s %s/tri.list.from.xword.align' %(self.tri_list, self.misc))

            two_model_config = '%s/config.two_model' %self.xword_1_root
            fh = open(two_model_config, 'w')
            fh.write('ALIGNMODELMMF = %s/HMM-%d-%d/MMF\n' %(self.xword_root, self.tri_mixup_schedule[-1], self.tri_iters_per_split))
            fh.write('ALIGNHMMLIST = %s\n' %self.tied_list)
            fh.close()

            # Do one pass of two-model re-estimation
            extra = ' -C %s' %two_model_config
            hmm_dir, k, L = train_hmm.run_iter(self, self.xword_1_root, hmm_dir, self.tri_mlf, self.tri_list, 1, 1, extra)
            log(self.logfh, 'ran an iteration of BW in [%s] lik/fr [%1.4f]' %(hmm_dir, L))
            
            xword_tie_dir = '%s/HMM-1-2' %self.xword_1_root
            hmm_dir = train_hmm.tie_states_search(self, xword_tie_dir, hmm_dir, self.phone_list, self.tri_list, self.tied_list)
            log(self.logfh, 'tied states in [%s]' %hmm_dir)

            os.system('cp %s %s/tied.list.second' %(self.tied_list, self.misc))

            hmm_dir = '%s/HMM-1-2' %self.xword_1_root
            for iter in range(3, self.tri_iters+3):
                hmm_dir, k, L = train_hmm.run_iter(self, self.xword_1_root, hmm_dir, self.tri_mlf, self.tied_list, 1, iter, '')
                log(self.logfh, 'ran an iteration of BW in [%s] lik/fr [%1.4f]' %(hmm_dir, L))

            log(self.logfh, 'MONO TO TRI FROM XWORD finished')

        if self.train_pipeline['mixup_tri_2']:
            log(self.logfh, 'MIXUP TRI 2 started')
            import train_hmm

            ## mixup everything
            start_gaussians = 1
            start_iter = self.tri_iters+2
            hmm_dir = '%s/HMM-%d-%d' %(self.xword_1_root, start_gaussians, start_iter)
            for mix_size in self.tri_mixup_schedule:
                if mix_size==2:
                    hmm_dir = train_hmm.mixup(self, self.xword_1_root, hmm_dir, self.tied_list, mix_size, estimateVarFloor=1)
                else:
                    hmm_dir = train_hmm.mixup(self, self.xword_1_root, hmm_dir, self.tied_list, mix_size)
                log(self.logfh, 'mixed up to [%d] in [%s]' %(mix_size, hmm_dir))
                for iter in range(1, self.tri_iters_per_split+1):
                    hmm_dir, k, L = train_hmm.run_iter(self, self.xword_1_root, hmm_dir, self.tri_mlf, self.tied_list, mix_size, iter, '')
                    log(self.logfh, 'ran an iteration of BW in [%s] lik/fr [%1.4f]' %(hmm_dir, L))
            log(self.logfh, 'MIXUP TRI 2 finished')
            
        if self.train_pipeline['diag']:
            log(self.logfh, 'DIAG started')
            import train_hmm
 
            num_gaussians = self.tri_mixup_schedule[-1]
            iter_num = self.tri_iters_per_split

            if self.train_pipeline['mixup_tri_2']:
                seed_dir = '%s/HMM-%d-%d' %(self.xword_1_root, num_gaussians, iter_num)
            else:
                seed_dir = '%s/HMM-%d-%d' %(self.xword_root, num_gaussians, iter_num)
            hmm_dir, L = train_hmm.diagonalize(self, self.diag_root, seed_dir, self.tied_list, self.tri_mlf, num_gaussians)
            log(self.logfh, 'ran diag in [%s] lik/fr [%1.4f]' %(hmm_dir, L))
            
            for iter in range(1, self.tri_iters_per_split+1):
                hmm_dir, k, L = train_hmm.run_iter(self, self.diag_root, hmm_dir, self.tri_mlf, self.tied_list, num_gaussians, iter, '')
                log(self.logfh, 'ran an iteration of BW in [%s] lik/fr [%1.4f]' %(hmm_dir, L))

            log(self.logfh, 'DIAG finished')
            
        if self.train_pipeline['mmi']:
            log(self.logfh, 'DISCRIM started')
            
            ## Common items
            import mmi
            mmi_dir = '%s/MMI' %self.exp
            util.create_new_dir(mmi_dir)
            mfc_list_mmi = '%s/mfc.list' %mmi_dir
            os.system('cp %s %s' %(self.mfc_list, mfc_list_mmi))

            ## Create weak LM
            import dict_and_lm
            train_vocab = '%s/vocab' %self.lm_dir
            lm_order = 2
            target_ppl_ratio = 8
            ppl = dict_and_lm.build_lm_from_mlf(self, self.word_mlf, self.train_dict, train_vocab, self.lm_dir, self.mmi_lm, lm_order, target_ppl_ratio)
            log(self.logfh, 'wrote lm for mmi [%s] training ppl [%1.2f]' %(self.mmi_lm, ppl))

            ## Create decoding lattices for every utterance
            lattice_dir = '%s/Denom/Lat_word' %mmi_dir
            util.create_new_dir(lattice_dir)
            num_gaussians = self.tri_mixup_schedule[-1]
            iter_num = self.tri_iters_per_split

            if self.train_pipeline['diag']:
                model_dir = '%s/HMM-%d-%d' %(self.diag_root, num_gaussians, iter_num)
            elif self.train_pipeline['mixup_tri_2']:
                model_dir = '%s/HMM-%d-%d' %(self.xword_1_root, num_gaussians, iter_num)
            else:
                model_dir = '%s/HMM-%d-%d' %(self.xword_root, num_gaussians, iter_num)
            mmi.decode_to_lattices(model, lattice_dir, model_dir, mfc_list_mmi, self.mmi_lm, self.decode_dict,
                                   self.tied_list, self.word_mlf)
            log(self.logfh, 'generated training lattices in [%s]' %lattice_dir)

            ## Prune and determinize lattices
            pruned_lattice_dir = '%s/Denom/Lat_prune' %mmi_dir
            util.create_new_dir(pruned_lattice_dir)
            mmi.prune_lattices(model, lattice_dir, pruned_lattice_dir, self.decode_dict)
            log(self.logfh, 'pruned lattices in [%s]' %pruned_lattice_dir)

            ## Phone-mark lattices
            phone_lattice_dir = '%s/Denom/Lat_phone' %mmi_dir
            util.create_new_dir(phone_lattice_dir)
            mmi.phonemark_lattices(model, pruned_lattice_dir, phone_lattice_dir, model_dir, mfc_list_mmi,
                                   self.mmi_lm, self.decode_dict, self.tied_list)
            log(self.logfh, 'phone-marked lattices in [%s]' %phone_lattice_dir)

            ## Create numerator word lattices
            num_lattice_dir = '%s/Num/Lat_word' %mmi_dir
            util.create_new_dir(num_lattice_dir)
            mmi.create_num_lattices(model, num_lattice_dir, self.mmi_lm, self.decode_dict, self.word_mlf)
            log(self.logfh, 'generated numerator lattices in [%s]' %num_lattice_dir)

            ## Phone-mark numerator lattices
            num_phone_lattice_dir = '%s/Num/Lat_phone' %mmi_dir
            util.create_new_dir(num_phone_lattice_dir)
            mmi.phonemark_lattices(model, num_lattice_dir, num_phone_lattice_dir, model_dir, mfc_list_mmi,
                                   self.mmi_lm, self.decode_dict, self.tied_list)
            log(self.logfh, 'phone-marked numerator lattices in [%s]' %num_phone_lattice_dir)

            ## Add LM scores to numerator phone lattices
            num_phone_lm_lattice_dir = '%s/Num/Lat_phone_lm' %mmi_dir
            util.create_new_dir(num_phone_lm_lattice_dir)
            mmi.add_lm_lattices(model, num_phone_lattice_dir, num_phone_lm_lattice_dir, self.decode_dict, self.mmi_lm)
            log(self.logfh, 'added LM scores to numerator lattices in [%s]' %num_phone_lm_lattice_dir)

            ## Modified Baum-Welch estimation
            root_dir = '%s/Models' %mmi_dir
            util.create_new_dir(root_dir)
            mmi_iters = 12
            mix_size = num_gaussians
            for iter in range(1, mmi_iters+1):
                model_dir = mmi.run_iter(model, model_dir, num_phone_lm_lattice_dir, phone_lattice_dir, root_dir,
                                         self.tied_list, mfc_list_mmi, mix_size, iter)
                log(self.logfh, 'ran an iteration of Modified BW in [%s]' %model_dir)

            log(self.logfh, 'DISCRIM finished')
Exemplo n.º 16
0
def tie_states_search(model, output_dir, model_dir, mono_list, tri_list, tied_list):
    """
    Tie HMM states using decision tree clustering

    HHEd is run repeatedly with a different TB threshold until the number
    of lines containing '<MEAN>' in the clustered MMF is within 1% of
    model.triphone_states. The threshold is bisected between a lower and
    an upper bound (initially 100 and 10000).

    model      -- experiment object; reads exp, dt_ro, dt_tb, states,
                  tree_questions, triphone_states, local and logfh
    output_dir -- directory (re)created for tree.hed, trees, the HHEd log
                  and the clustered MMF
    model_dir  -- directory holding the input MMF and stats files
    mono_list  -- file with one phone per line
    tri_list   -- list file given as the final HHEd argument
    tied_list  -- path written into the CO command

    Returns output_dir. Calls sys.exit(1) when the goal has not been
    reached after more than 50 attempts.
    """

    util.create_new_dir(output_dir)
    tree_hed = '%s/tree.hed' %output_dir
    tree_output = '%s/trees' %output_dir
    hhed_log = '%s/hhed_cluster.log' %output_dir
    all_tri_list = '%s/all_tri.list' %model.exp

    ## Decision tree parameters
    ro = model.dt_ro
    tb = model.dt_tb
    tb_min = 100.0
    tb_max = 10000.0

    ## Create the full list of possible triphones
    with open(mono_list) as fh:
        phones = fh.read().splitlines()
    non_sp_phones = [p for p in phones if p not in ['sp', 'sil']]
    with open(all_tri_list, 'w') as fh:
        fh.write('sp\n')
        fh.write('sil\n')
        for p1 in non_sp_phones:
            fh.write('sil-%s+sil\n' %p1)
            for p2 in non_sp_phones:
                fh.write('sil-%s+%s\n' %(p1, p2))
                fh.write('%s-%s+sil\n' %(p2, p1))
                for p3 in non_sp_phones:
                    fh.write('%s-%s+%s\n' %(p2, p1, p3))

    ## The question set is the same for every attempt, so read it once
    with open(model.tree_questions) as fh:
        questions = fh.read()

    ## Search over tb arguments to get the right number states
    attempts = 0
    prev_tb = 0
    while True:

        os.system('rm -f %s %s %s' %(tree_hed, tree_output, hhed_log))

        ## Set up decision tree clustering
        ## NOTE(review): %d truncates tb to an integer although the search
        ## below bisects in floats - confirm this is intended
        with open(tree_hed, 'w') as fh:
            fh.write('RO %d %s/stats\n' %(ro, model_dir))
            fh.write('TR 0\n')
            fh.write('%s\n' %questions)
            fh.write('TR 12\n')
            for p in non_sp_phones:
                ## States 2 .. model.states-1 (first and last are skipped)
                for s in range(2, model.states):
                    fh.write('TB %d "ST_%s_%d_" {(%s,*-%s+*,%s+*,*-%s).state[%d]}\n' %(tb,p,s,p,p,p,p,s))
            fh.write('TR 1\n')
            fh.write('AU "%s"\n' %all_tri_list)
            fh.write('CO "%s"\n' %tied_list)
            fh.write('ST "%s"\n' %tree_output)

        ## Use HHEd to cluster
        cmd  = 'HHEd -A -T 1 -H %s/MMF' %model_dir
        cmd += ' -M %s' %output_dir
        cmd += ' %s %s > %s' %(tree_hed, tri_list, hhed_log)

        if model.local == 1: os.system(cmd)
        else: util.run(cmd, output_dir)
        num_states = int(os.popen('grep -c "<MEAN>" %s/MMF' %output_dir).read().strip())

        ## Close enough: within 1% of the requested number of states
        if abs(float(num_states - model.triphone_states)/model.triphone_states) <= 0.01:
            util.log_write(model.logfh, ' current states [%d] tb [%1.2f]' %(num_states, tb))
            break

        ## The threshold has stopped moving, so further attempts are pointless
        if abs(prev_tb - tb) <= 0.01:
            util.log_write(model.logfh, ' Could not converge. Stopping. Current states [%d] tb [%1.2f]' %(num_states,tb))
            break

        attempts += 1
        prev_tb = tb
        if num_states < model.triphone_states:
            ## Too few states: move tb down, the old value becomes the upper bound
            tb = (tb_min + tb) / 2
            tb_max = prev_tb
        else:
            ## Too many states: move tb up, the old value becomes the lower bound
            tb = (tb_max + tb) / 2
            tb_min = prev_tb
        util.log_write(model.logfh, ' [%d] goal [%d] current states [%d] tb [%1.2f] -> [%1.2f] [%1.1f %1.1f]' %(attempts, model.triphone_states, num_states, prev_tb, tb, tb_min, tb_max))

        if attempts > 50:
            util.log_write(model.logfh, ' Goal not reached after 50 tries. Exiting.')
            ## Giving up is a failure, so do not report success to the shell
            sys.exit(1)

    return output_dir
Exemplo n.º 17
0
def initialize_hmms(model, root_dir, mfc_list, mono_list, proto_hmm):
    """
    Compute mean and variance of each feature across all utterances and
    set all Gaussians in the prototype HMM to have the same mean and variance

    A random sample of the utterances in mfc_list (fraction
    model.var_floor_fraction) is given to HCompV. The
    <BEGINHMM>...<ENDHMM> section of the resulting prototype is written
    once per phone in mono_list, and HHEd then applies the sil/sp
    transition and tying edits to produce <root_dir>/HMM-0-0/MMF.

    Returns (output_dir, num_mfcs_for_hcompv).
    Raises ValueError if the prototype found in the output directory has
    no <BEGINHMM>...<ENDHMM> section.
    """

    output_dir = '%s/HMM-0-0' % root_dir
    util.create_new_dir(root_dir)
    util.create_new_dir(output_dir)
    cmd_log = '%s/hcompv.log' % output_dir

    ## Sample from the full mfc list to reduce computation
    sampled_mfc_list = '%s/mfc_sample.list' % output_dir
    with open(mfc_list) as fh:
        mfcs = fh.read().splitlines()
    random.shuffle(mfcs)
    mfc_frac = model.var_floor_fraction
    num_mfcs_for_hcompv = int(mfc_frac * len(mfcs))
    with open(sampled_mfc_list, 'w') as fh:
        for mfc in mfcs[:num_mfcs_for_hcompv]:
            fh.write('%s\n' % mfc)

    cmd = 'HCompV -A -T 1 -m'
    cmd += ' -C %s' % model.mfc_config
    cmd += ' -f 0.01'
    cmd += ' -S %s' % sampled_mfc_list
    cmd += ' -M %s' % output_dir
    cmd += ' %s > %s' % (proto_hmm, cmd_log)

    if model.local == 1: os.system(cmd)
    else: util.run(cmd, output_dir)

    ## Copy the initial HMM for each monophone
    ## NOTE(review): this reads 'proto_hmm' from output_dir, i.e. it assumes
    ## the prototype passed in has that file name - confirm with callers
    trained_proto = '%s/proto_hmm' % output_dir
    hmm_defs_init = '%s/init.mmf' % output_dir
    with open(trained_proto) as fh:
        match = re.search('<BEGINHMM>.*<ENDHMM>', fh.read(), re.DOTALL)
    if match is None:
        raise ValueError('no <BEGINHMM>...<ENDHMM> section in [%s]' %
                         trained_proto)
    hmm = match.group()

    with open(mono_list) as fh:
        phones = [line.strip() for line in fh]
    with open(hmm_defs_init, 'w') as fh:
        for phone in phones:
            fh.write('~h "%s"\n' % phone)
            fh.write('%s\n' % hmm)

    ## Create mmf header file
    ## TODO: get rid of this?
    mmf_header = '%s/header.mmf' % output_dir
    cmd = 'head -3 %s | cat - %s/vFloors > %s' % (trained_proto, output_dir,
                                                  mmf_header)
    os.system(cmd)

    ## Fix sp and silence models
    cleanup_config = '%s/cleanup_init.hed' % output_dir
    with open(cleanup_config, 'w') as fh:
        fh.write('AT 4 2 0.2 {sil.transP}\n')
        fh.write('AT 2 4 0.2 {sil.transP}\n')
        fh.write('AT 1 5 0.3 {sp.transP}\n')
        fh.write('TI silsp_2 {sil.state[2],sp.state[2]}\n')
        fh.write('TI silsp_3 {sil.state[3],sp.state[3]}\n')
        fh.write('TI silsp_4 {sil.state[4],sp.state[4]}\n')

    hmm_defs_final = '%s/MMF' % output_dir
    cmd_log = '%s/hhed_sil.log' % output_dir
    cmd = 'HHEd -A -D -T 1 -d %s' % output_dir
    cmd += ' -H %s -H %s' % (mmf_header, hmm_defs_init)
    cmd += ' -M %s' % output_dir
    cmd += ' -w %s' % hmm_defs_final
    cmd += ' %s %s > %s' % (cleanup_config, mono_list, cmd_log)
    os.system(cmd)

    return output_dir, num_mfcs_for_hcompv
Exemplo n.º 18
0
def run_iter(model, root_dir, prev_dir, mlf_file, model_list, mix_size, iter,
             extra):
    """
    Run an iteration of Baum-Welch training using HERest

    The mfc list (<model.exp>/mfc.list) is split into chunks, one HERest
    accumulation command is run per chunk (sequentially when
    model.local == 1, otherwise through util.run_parallel), and a final
    HERest call with -p 0 combines the accumulators into a new MMF in
    <root_dir>/HMM-<mix_size>-<iter>.

    extra is appended verbatim to the -C config argument.

    Returns (output_dir, num_models, likelihood): num_models is the number
    of lines containing '<MEAN>' in the new MMF, likelihood is the last
    token of the 'aver' line(s) found in herest.log.
    """

    output_dir = '%s/HMM-%d-%d' % (root_dir, mix_size, iter)
    util.create_new_dir(output_dir)

    mfc_list = '%s/mfc.list' % model.exp
    utts_per_split = max(250, (1 + (model.setup_length / 200)))

    ## HERest parameters
    min_train_examples = 0
    prune_thresh = 250
    prune_inc = 150
    prune_limit = 2000

    def herest(list_file, split_num, extra):
        ## Split lists are named mfc.list.NNNN; anything with fewer than
        ## three dot-separated parts (e.g. herest.list) logs to 'acc'
        try:
            log_id = os.path.basename(list_file).split('.')[2]
        except IndexError:
            log_id = 'acc'
        cmd = '%s -D -A -T 1 -m %d' % (HEREST_CMD, min_train_examples)
        cmd += ' -t %d %d %d' % (prune_thresh, prune_inc, prune_limit)
        cmd += ' -s %s/stats' % output_dir
        cmd += ' -C %s%s' % (model.mfc_config, extra)
        cmd += ' -I %s' % mlf_file
        cmd += ' -H %s/MMF' % prev_dir
        cmd += ' -p %d' % split_num
        cmd += ' -S %s' % list_file
        cmd += ' -M %s %s >> %s/herest.%s.log' % (output_dir, model_list,
                                                  output_dir, log_id)
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list,
                                            output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HERest commands, numbering the splits from 1
    inputs = os.popen('ls %s/mfc.list.*' % output_dir).read().splitlines()
    cmds = [herest(list_file, split_num, extra)
            for split_num, list_file in enumerate(inputs, 1)]

    ## Non-parallel case
    if model.local == 1:
        for cmd in cmds:
            print(cmd)
            ## Read the pipe (as align does) so the command's output is
            ## shown instead of the repr of the pipe object
            print(os.popen(cmd).read())

    ## Parallel case: one command per line in cmds_file
    else:
        cmds_file = '%s/herest.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds:
                fh.write('%s\n' % cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Gather the created .acc files
    acc_file = '%s/herest.list' % output_dir
    os.system('ls %s/HER*.acc > %s' % (output_dir, acc_file))

    ## Combine acc files into a new HMM; the per-split log redirect is
    ## dropped in favour of herest.log
    cmd = herest(acc_file, 0, extra)
    cmd = cmd.split('>>')[0]
    cmd += ' >> %s/herest.log' % output_dir
    if model.local == 1: os.system(cmd)
    else: util.run(cmd, output_dir)

    ## Clean up (herest.log does not match herest.*.log, so it stays
    ## uncompressed for the stats below)
    os.system('rm -f %s/mfc.list.* %s/HER*.acc' % (output_dir, output_dir))
    os.system('bzip2 %s/herest.*.log %s/run-command*.log' %
              (output_dir, output_dir))

    ## Get a few stats
    num_models = int(
        os.popen('grep "<MEAN>" %s/MMF -c' % output_dir).read().strip())
    likelihood = float(
        os.popen('cat %s/herest.log | grep aver' %
                 output_dir).read().strip().split()[-1])

    return output_dir, num_models, likelihood
Exemplo n.º 19
0
def align(model, root_dir, mfc_list, model_dir, word_mlf, new_mlf, model_list,
          dict, align_config):
    """
    Create a new alignment based on a model and the word alignment with HVite

    mfc_list is split into chunks, HVite is run on each chunk
    (sequentially when model.local == 1, otherwise through
    util.run_parallel), and HLEd merges the per-chunk outputs into new_mlf
    while collapsing adjacent sil/sp labels.

    Side effect: mfc_list is rewritten in place so that it only keeps
    utterances that have a '.lab' entry in new_mlf; the original list is
    saved as <root_dir>/Align/mfc_old.list.

    Returns the Align output directory.
    """

    output_dir = '%s/Align' % root_dir
    util.create_new_dir(output_dir)
    utts_per_split = max(100, (1 + (model.setup_length / 200)))

    ## Copy old mfc list
    os.system('cp %s %s/mfc_old.list' % (mfc_list, output_dir))

    ## HVite parameters
    prune_thresh = 250

    def hvite(list_file, out_mlf):
        #-o SWT
        cmd = 'HVite -D -A -T 1 -b silence -a -m -y lab '
        cmd += '-t %d' % prune_thresh
        cmd += ' -C %s' % align_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -i %s' % out_mlf
        cmd += ' -I %s' % word_mlf
        cmd += ' -S %s' % list_file
        cmd += ' %s %s' % (dict, model_list)
        cmd += ' >> %s.hvite.log' % out_mlf
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list,
                                            output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HVite commands: mfc.list.NNNN -> align.output.NNNN
    inputs = os.popen('ls %s/mfc.list.*' % output_dir).read().splitlines()
    outputs = [list_file.replace('mfc.list', 'align.output')
               for list_file in inputs]
    cmds = [hvite(list_file, out_mlf)
            for list_file, out_mlf in zip(inputs, outputs)]

    if model.local == 1:
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        cmds_file = '%s/hvite.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds:
                fh.write('%s\n' % cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Merge and fix silences
    ## TODO: -s file_list
    merge_sil = '%s/merge_sp_sil.led' % output_dir
    with open(merge_sil, 'w') as fh:
        fh.write('ME sil sp sil\n')
        fh.write('ME sil sil sil\n')
        fh.write('ME sp sil sil\n')

    cmd = 'HLEd -D -A -T 1 -i %s %s %s >> %s/hled.log' % (
        new_mlf, merge_sil, ' '.join(outputs), output_dir)

    if model.local == 1: os.system(cmd)
    else: util.run(cmd, output_dir)

    ## Prune failed alignments from the mfc list
    ## Utterance ids are the file base names up to the first '.'
    label_lines = os.popen(r'grep "\.lab" %s' % new_mlf).read().splitlines()
    mlf_labels = set(os.path.basename(s).split('.')[0] for s in label_lines)
    with open(mfc_list) as fh:
        mfc_paths = fh.read().splitlines()

    bad_count = 0
    with open(mfc_list, 'w') as fh:
        for mfc in mfc_paths:
            utt_id = os.path.basename(mfc).split('.')[0]

            ## Check for missing transcriptions
            if utt_id in mlf_labels:
                fh.write(mfc + '\n')
                continue
            if model.verbose > 0:
                util.log_write(model.logfh,
                               'removed bad alignment [%s]' % utt_id)
            bad_count += 1
    util.log_write(model.logfh, 'removed alignments [%d]' % bad_count)

    ## Clean up
    os.system('rm -f %s/mfc.list.* %s/align.output.*' %
              (output_dir, output_dir))
    return output_dir
Exemplo n.º 20
0
def align(model, root_dir, mfc_list, model_dir, word_mlf, new_mlf, model_list, dict, align_config):
    """
    Create a new alignment based on a model and the word alignment with HVite

    mfc_list is split into chunks, HVite is run on each chunk
    (sequentially when model.local == 1, otherwise through
    util.run_parallel), and HLEd merges the per-chunk outputs into new_mlf
    while collapsing adjacent sil/sp labels.

    Side effect: mfc_list is rewritten in place so that it only keeps
    utterances that have a '.lab' entry in new_mlf; the original list is
    saved as <root_dir>/Align/mfc_old.list.

    Returns the Align output directory.
    """

    output_dir = '%s/Align' %root_dir
    util.create_new_dir(output_dir)
    utts_per_split = max(100, (1 + (model.setup_length / 200)))

    ## Copy old mfc list
    os.system('cp %s %s/mfc_old.list' %(mfc_list, output_dir))

    ## HVite parameters
    prune_thresh = 250

    def hvite(list_file, out_mlf):
        #-o SWT
        cmd  = 'HVite -D -A -T 1 -b silence -a -m -y lab '
        cmd += '-t %d' %prune_thresh
        cmd += ' -C %s' %align_config
        cmd += ' -H %s/MMF' %model_dir
        cmd += ' -i %s' %out_mlf
        cmd += ' -I %s' %word_mlf
        cmd += ' -S %s' %list_file
        cmd += ' %s %s' %(dict, model_list)
        cmd += ' >> %s.hvite.log' %out_mlf
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' %(utts_per_split, mfc_list, output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HVite commands: mfc.list.NNNN -> align.output.NNNN
    cmds = []
    outputs = []
    for list_file in os.popen('ls %s/mfc.list.*' %output_dir).read().splitlines():
        out_mlf = list_file.replace('mfc.list', 'align.output')
        outputs.append(out_mlf)
        cmds.append(hvite(list_file, out_mlf))

    if model.local == 1:
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        cmds_file = '%s/hvite.commands' %output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds: fh.write('%s\n' %cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Merge and fix silences
    ## TODO: -s file_list
    merge_sil = '%s/merge_sp_sil.led' %output_dir
    with open(merge_sil, 'w') as fh:
        fh.write('ME sil sp sil\n')
        fh.write('ME sil sil sil\n')
        fh.write('ME sp sil sil\n')

    cmd = 'HLEd -D -A -T 1 -i %s %s %s >> %s/hled.log' %(new_mlf, merge_sil, ' '.join(outputs), output_dir)

    if model.local == 1: os.system(cmd)
    else: util.run(cmd, output_dir)

    ## Prune failed alignments from the mfc list
    ## Utterance ids are the file base names up to the first '.'
    bad_count = 0
    label_lines = os.popen(r'grep "\.lab" %s' %new_mlf).read().splitlines()
    mlf_labels = set(os.path.basename(s).split('.')[0] for s in label_lines)
    with open(mfc_list) as fh:
        mfc_paths = fh.read().splitlines()
    with open(mfc_list, 'w') as fh:
        for mfc in mfc_paths:
            utt_id = os.path.basename(mfc).split('.')[0]

            ## Check for missing transcriptions
            if utt_id not in mlf_labels:
                if model.verbose > 0: util.log_write(model.logfh, 'removed bad alignment [%s]' %utt_id)
                bad_count += 1
            else: fh.write(mfc + '\n')
    util.log_write(model.logfh, 'removed alignments [%d]' %bad_count)

    ## Clean up
    os.system('rm -f %s/mfc.list.* %s/align.output.*' %(output_dir, output_dir))
    return output_dir
Exemplo n.º 21
0
def tie_states_search(model, output_dir, model_dir, mono_list, tri_list,
                      tied_list):
    """
    Tie HMM states using decision tree clustering

    HHEd is run repeatedly with a different TB threshold until the number
    of lines containing '<MEAN>' in the clustered MMF is within 1% of
    model.triphone_states. The threshold is bisected between a lower and
    an upper bound (initially 100 and 10000).

    Returns output_dir. Calls sys.exit(1) when the goal has not been
    reached after more than 50 attempts.
    """

    util.create_new_dir(output_dir)
    tree_hed = '%s/tree.hed' % output_dir
    tree_output = '%s/trees' % output_dir
    hhed_log = '%s/hhed_cluster.log' % output_dir
    all_tri_list = '%s/all_tri.list' % model.exp

    ## Decision tree parameters
    ro = model.dt_ro
    tb = model.dt_tb
    tb_min = 100.0
    tb_max = 10000.0

    ## Create the full list of possible triphones
    with open(mono_list) as fh:
        phones = fh.read().splitlines()
    non_sp_phones = [p for p in phones if p not in ['sp', 'sil']]
    with open(all_tri_list, 'w') as fh:
        fh.write('sp\n')
        fh.write('sil\n')
        for p1 in non_sp_phones:
            fh.write('sil-%s+sil\n' % p1)
            for p2 in non_sp_phones:
                fh.write('sil-%s+%s\n' % (p1, p2))
                fh.write('%s-%s+sil\n' % (p2, p1))
                for p3 in non_sp_phones:
                    fh.write('%s-%s+%s\n' % (p2, p1, p3))

    ## The question set is the same for every attempt, so read it once
    with open(model.tree_questions) as fh:
        questions = fh.read()

    ## Search over tb arguments to get the right number states
    attempts = 0
    prev_tb = 0
    while True:

        os.system('rm -f %s %s %s' % (tree_hed, tree_output, hhed_log))

        ## Set up decision tree clustering
        ## NOTE(review): %d truncates tb to an integer although the search
        ## below bisects in floats - confirm this is intended
        with open(tree_hed, 'w') as fh:
            fh.write('RO %d %s/stats\n' % (ro, model_dir))
            fh.write('TR 0\n')
            fh.write('%s\n' % questions)
            fh.write('TR 12\n')
            for p in non_sp_phones:
                ## States 2 .. model.states-1 (first and last are skipped)
                for s in range(2, model.states):
                    fh.write(
                        'TB %d "ST_%s_%d_" {(%s,*-%s+*,%s+*,*-%s).state[%d]}\n'
                        % (tb, p, s, p, p, p, p, s))
            fh.write('TR 1\n')
            fh.write('AU "%s"\n' % all_tri_list)
            fh.write('CO "%s"\n' % tied_list)
            fh.write('ST "%s"\n' % tree_output)

        ## Use HHEd to cluster
        cmd = 'HHEd -A -T 1 -H %s/MMF' % model_dir
        cmd += ' -M %s' % output_dir
        cmd += ' %s %s > %s' % (tree_hed, tri_list, hhed_log)

        if model.local == 1: os.system(cmd)
        else: util.run(cmd, output_dir)
        num_states = int(
            os.popen('grep -c "<MEAN>" %s/MMF' % output_dir).read().strip())

        ## Close enough: within 1% of the requested number of states
        relative_error = (float(num_states - model.triphone_states) /
                          model.triphone_states)
        if abs(relative_error) <= 0.01:
            util.log_write(
                model.logfh,
                ' current states [%d] tb [%1.2f]' % (num_states, tb))
            break

        ## The threshold has stopped moving, so further attempts are pointless
        if abs(prev_tb - tb) <= 0.01:
            util.log_write(
                model.logfh,
                ' Could not converge. Stopping. Current states [%d] tb [%1.2f]'
                % (num_states, tb))
            break

        attempts += 1
        prev_tb = tb
        if num_states < model.triphone_states:
            ## Too few states: move tb down, the old value becomes the upper bound
            tb = (tb_min + tb) / 2
            tb_max = prev_tb
        else:
            ## Too many states: move tb up, the old value becomes the lower bound
            tb = (tb_max + tb) / 2
            tb_min = prev_tb
        util.log_write(
            model.logfh,
            ' [%d] goal [%d] current states [%d] tb [%1.2f] -> [%1.2f] [%1.1f %1.1f]'
            % (attempts, model.triphone_states, num_states, prev_tb, tb,
               tb_min, tb_max))

        if attempts > 50:
            util.log_write(model.logfh,
                           ' Goal not reached after 50 tries. Exiting.')
            ## Giving up is a failure, so do not report success to the shell
            sys.exit(1)

    return output_dir
Exemplo n.º 22
0
def run_iter(model, root_dir, prev_dir, mlf_file, model_list, mix_size, iter, extra):
    """
    Run an iteration of Baum-Welch training using HERest

    The mfc list (<model.exp>/mfc.list) is split into chunks, one HERest
    accumulation command is run per chunk (sequentially when
    model.local == 1, otherwise through util.run_parallel), and a final
    HERest call with -p 0 combines the accumulators into a new MMF in
    <root_dir>/HMM-<mix_size>-<iter>.

    extra is appended verbatim to the -C config argument.

    Returns (output_dir, num_models, likelihood): num_models is the number
    of lines containing '<MEAN>' in the new MMF, likelihood is the last
    token of the 'aver' line(s) found in herest.log.
    """

    output_dir = '%s/HMM-%d-%d' %(root_dir, mix_size, iter)
    util.create_new_dir(output_dir)

    mfc_list = '%s/mfc.list' %model.exp
    utts_per_split = max(250, (1 + (model.setup_length / 200)))

    ## HERest parameters
    min_train_examples = 0
    prune_thresh = 250
    prune_inc = 150
    prune_limit = 2000

    def herest(list_file, split_num, extra):
        ## Split lists are named mfc.list.NNNN; anything with fewer than
        ## three dot-separated parts (e.g. herest.list) logs to 'acc'
        try: log_id = os.path.basename(list_file).split('.')[2]
        except IndexError: log_id = 'acc'
        cmd  = '%s -D -A -T 1 -m %d' %(HEREST_CMD, min_train_examples)
        cmd += ' -t %d %d %d' %(prune_thresh, prune_inc, prune_limit)
        cmd += ' -s %s/stats' %output_dir
        cmd += ' -C %s%s' %(model.mfc_config, extra)
        cmd += ' -I %s' %mlf_file
        cmd += ' -H %s/MMF' %prev_dir
        cmd += ' -p %d' %split_num
        cmd += ' -S %s' %list_file
        cmd += ' -M %s %s >> %s/herest.%s.log' %(output_dir, model_list, output_dir, log_id)
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' %(utts_per_split, mfc_list, output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HERest commands, numbering the splits from 1
    cmds = []
    inputs = os.popen('ls %s/mfc.list.*' %output_dir).read().splitlines()
    for split_num, list_file in enumerate(inputs, 1):
        cmds.append(herest(list_file, split_num, extra))

    ## Non-parallel case
    if model.local == 1:
        for cmd in cmds:
            print(cmd)
            ## Read the pipe (as align does) so the command's output is
            ## shown instead of the repr of the pipe object
            print(os.popen(cmd).read())

    ## Parallel case: one command per line in cmds_file
    else:
        cmds_file = '%s/herest.commands' %output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds: fh.write('%s\n' %cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Gather the created .acc files
    acc_file = '%s/herest.list' %output_dir
    os.system('ls %s/HER*.acc > %s' %(output_dir, acc_file))

    ## Combine acc files into a new HMM; the per-split log redirect is
    ## dropped in favour of herest.log
    cmd = herest(acc_file, 0, extra)
    cmd = cmd.split('>>')[0]
    cmd += ' >> %s/herest.log' %output_dir
    if model.local == 1: os.system(cmd)
    else: util.run(cmd, output_dir)

    ## Clean up (herest.log does not match herest.*.log, so it stays
    ## uncompressed for the stats below)
    os.system('rm -f %s/mfc.list.* %s/HER*.acc' %(output_dir, output_dir))
    os.system('bzip2 %s/herest.*.log %s/run-command*.log' %(output_dir, output_dir))

    ## Get a few stats
    num_models = int(os.popen('grep "<MEAN>" %s/MMF -c' %output_dir).read().strip())
    likelihood = float(os.popen('cat %s/herest.log | grep aver' %output_dir).read().strip().split()[-1])

    return output_dir, num_models, likelihood
Exemplo n.º 23
0
def initialize_hmms(model, root_dir, mfc_list, mono_list, proto_hmm):
    """
    Compute mean and variance of each feature across all utterances and
    set all Gaussians in the prototype HMM to have the same mean and variance

    A random sample of the utterances in mfc_list (fraction
    model.var_floor_fraction) is given to HCompV. The
    <BEGINHMM>...<ENDHMM> section of the resulting prototype is written
    once per phone in mono_list, and HHEd then applies the sil/sp
    transition and tying edits to produce <root_dir>/HMM-0-0/MMF.

    Returns (output_dir, num_mfcs_for_hcompv).
    Raises ValueError if the prototype found in the output directory has
    no <BEGINHMM>...<ENDHMM> section.
    """

    output_dir = '%s/HMM-0-0' %root_dir
    util.create_new_dir(root_dir)
    util.create_new_dir(output_dir)
    cmd_log = '%s/hcompv.log' %output_dir

    ## Sample from the full mfc list to reduce computation
    sampled_mfc_list = '%s/mfc_sample.list' %output_dir
    with open(mfc_list) as fh:
        mfcs = fh.read().splitlines()
    random.shuffle(mfcs)
    mfc_frac = model.var_floor_fraction
    num_mfcs_for_hcompv = int(mfc_frac * len(mfcs))
    with open(sampled_mfc_list, 'w') as fh:
        for mfc in mfcs[:num_mfcs_for_hcompv]:
            fh.write('%s\n' %mfc)

    cmd  = 'HCompV -A -T 1 -m'
    cmd += ' -C %s' %model.mfc_config
    cmd += ' -f 0.01'
    cmd += ' -S %s' %sampled_mfc_list
    cmd += ' -M %s' %output_dir
    cmd += ' %s > %s' %(proto_hmm, cmd_log)

    if model.local == 1: os.system(cmd)
    else: util.run(cmd, output_dir)

    ## Copy the initial HMM for each monophone
    ## NOTE(review): this reads 'proto_hmm' from output_dir, i.e. it assumes
    ## the prototype passed in has that file name - confirm with callers
    trained_proto = '%s/proto_hmm' %output_dir
    hmm_defs_init = '%s/init.mmf' %output_dir
    with open(trained_proto) as fh:
        match = re.search('<BEGINHMM>.*<ENDHMM>', fh.read(), re.DOTALL)
    if match is None:
        raise ValueError('no <BEGINHMM>...<ENDHMM> section in [%s]' %trained_proto)
    hmm = match.group()

    with open(mono_list) as fh:
        phones = [line.strip() for line in fh]
    with open(hmm_defs_init, 'w') as fh:
        for phone in phones:
            fh.write('~h "%s"\n' %phone)
            fh.write('%s\n' %hmm)

    ## Create mmf header file
    ## TODO: get rid of this?
    mmf_header = '%s/header.mmf' %output_dir
    cmd = 'head -3 %s | cat - %s/vFloors > %s' %(trained_proto, output_dir, mmf_header)
    os.system(cmd)

    ## Fix sp and silence models
    cleanup_config = '%s/cleanup_init.hed' %output_dir
    with open(cleanup_config, 'w') as fh:
        fh.write('AT 4 2 0.2 {sil.transP}\n')
        fh.write('AT 2 4 0.2 {sil.transP}\n')
        fh.write('AT 1 5 0.3 {sp.transP}\n')
        fh.write('TI silsp_2 {sil.state[2],sp.state[2]}\n')
        fh.write('TI silsp_3 {sil.state[3],sp.state[3]}\n')
        fh.write('TI silsp_4 {sil.state[4],sp.state[4]}\n')

    hmm_defs_final = '%s/MMF' %output_dir
    cmd_log = '%s/hhed_sil.log' %output_dir
    cmd  = 'HHEd -A -D -T 1 -d %s' %output_dir
    cmd += ' -H %s -H %s' %(mmf_header, hmm_defs_init)
    cmd += ' -M %s' %output_dir
    cmd += ' -w %s' %hmm_defs_final
    cmd += ' %s %s > %s' %(cleanup_config, mono_list, cmd_log)
    os.system(cmd)

    return output_dir, num_mfcs_for_hcompv
Exemplo n.º 24
0
def run_iter(model, model_dir, num_lattice_dir, den_lattice_dir, root_dir, model_list, mfc_list, mix_size, iter):
    """
    Run an iteration of modified Baum-Welch training using HMMIRest

    A config file (with MPE = TRUE) is written to the output directory,
    mfc_list is split into chunks, one HMMIRest accumulation command is
    run per chunk (sequentially when model.local == 1, otherwise through
    util.run_parallel), and a final HMMIRest call with -p 0 and -u mv
    combines the accumulators.

    model_dir       -- directory holding the input MMF
    num_lattice_dir -- lattice directory passed with -q
    den_lattice_dir -- lattice directory passed with -r

    Returns the output directory <root_dir>/HMMI-<mix_size>-<iter>.
    """

    output_dir = '%s/HMMI-%d-%d' %(root_dir, mix_size, iter)
    util.create_new_dir(output_dir)
    utts_per_split = max(250, (1 + (model.setup_length / 200)))

    ## Create a config file to use with HMMIRest
    hmmirest_config = '%s/hmmirest.config' %output_dir
    with open(hmmirest_config, 'w') as fh:
        #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n')
        fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
        fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
        fh.write('RAWMITFORMAT = T\n')
        fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
        #fh.write('HMMDEFOFILTER = "gzip -c > $.gz"\n')
        #fh.write('HMMDEFFILTER = "gunzip -c < $.gz"\n')
        fh.write('HMMIREST: LATMASKNUM = */%%%?????.???\n')
        fh.write('HMMIREST: LATMASKDEN = */%%%?????.???\n')
        #fh.write('HMMIREST: LATMASKNUM =  */%%%%%%%%/???????????????????.???\n')
        #fh.write('HMMIREST: LATMASKDEN =  */%%%%%%%%/???????????????????.???\n')
        fh.write('HFBLAT: LATPROBSCALE = 0.06667\n')
        fh.write('HMMIREST: E = 2.0\n')
        fh.write('ISMOOTHTAU = 50\n')
        fh.write('MPE = TRUE\n')
        #fh.write('MWE = TRUE\n')

    def hmmirest(list_file, split_num):
        cmd  = 'HMMIRest -A -D -T 1 -C %s' %hmmirest_config
        cmd += ' -H %s/MMF' %model_dir
        cmd += ' -q %s' %num_lattice_dir
        cmd += ' -r %s' %den_lattice_dir
        ## Split 0 is the combining pass; only that pass gets -u mv
        if split_num == 0:
            cmd += ' -u mv'
        cmd += ' -p %d' %split_num
        cmd += ' -S %s' %list_file
        cmd += ' -M %s %s' %(output_dir, model_list)
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' %(utts_per_split, mfc_list, output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HMMIRest commands, numbering the splits from 1
    cmds = []
    inputs = os.popen('ls %s/mfc.list.*' %output_dir).read().splitlines()
    for split_num, list_file in enumerate(inputs, 1):
        cmds.append(hmmirest(list_file, split_num))

    ## Non-parallel case
    if model.local == 1:
        for cmd in cmds:
            print(cmd)
            ## These commands are not redirected, so the pipe must be
            ## read: it drains the command's output and shows it instead
            ## of the repr of the pipe object
            print(os.popen(cmd).read())

    ## Parallel case: one command per line in cmds_file
    else:
        cmds_file = '%s/hmmirest.commands' %output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds: fh.write('%s\n' %cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Gather the created .acc files
    acc_file = '%s/hmmirest.list' %output_dir
    os.system('ls %s/HDR*.acc* > %s' %(output_dir, acc_file))

    ## Combine acc files into a new HMM
    cmd = hmmirest(acc_file, 0)
    cmd += ' >> %s/hmmirest.log' %output_dir
    if model.local == 1: os.system(cmd)
    else: util.run(cmd, output_dir)

    ## Clean up (currently disabled, so the split lists and accumulator
    ## files are left in output_dir)
    #os.system('rm -f %s/mfc.list.* %s/HER*.acc' %(output_dir, output_dir))

    return output_dir
Exemplo n.º 25
0
def run_iter(model, model_dir, num_lattice_dir, den_lattice_dir, root_dir,
             model_list, mfc_list, mix_size, iter):
    """
    Run an iteration of lattice-based re-estimation using HMMIRest

    The MFC list is split into chunks, one HMMIRest job is built per chunk
    (-p 1, -p 2, ...), the jobs are run either locally or through
    util.run_parallel, and a final -p 0 pass is run over the list of
    HDR*.acc* files found in the output directory.

    Arguments:
      model           -- object providing setup_length, local and jobs
      model_dir       -- directory containing the input MMF file (-H)
      num_lattice_dir -- directory passed to HMMIRest with -q
      den_lattice_dir -- directory passed to HMMIRest with -r
      root_dir        -- parent of the directory created for this iteration
      model_list      -- HMM list given as the last HMMIRest argument
      mfc_list        -- file that is split line-wise into per-job lists
      mix_size, iter  -- numbers used to name the output directory

    Returns the output directory, <root_dir>/HMMI-<mix_size>-<iter>.
    """

    output_dir = '%s/HMMI-%d-%d' % (root_dir, mix_size, iter)
    util.create_new_dir(output_dir)
    ## Explicit floor division keeps the split size integral for '%d' below
    utts_per_split = max(250, (1 + (model.setup_length // 200)))

    ## Create a config file to use with HMMIRest
    hmmirest_config = '%s/hmmirest.config' % output_dir
    with open(hmmirest_config, 'w') as fh:
        #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n')
        fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
        fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
        fh.write('RAWMITFORMAT = T\n')
        fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
        #fh.write('HMMDEFOFILTER = "gzip -c > $.gz"\n')
        #fh.write('HMMDEFFILTER = "gunzip -c < $.gz"\n')
        ## These strings are written verbatim (no % formatting is applied)
        fh.write('HMMIREST: LATMASKNUM = */%%%?????.???\n')
        fh.write('HMMIREST: LATMASKDEN = */%%%?????.???\n')
        #fh.write('HMMIREST: LATMASKNUM =  */%%%%%%%%/???????????????????.???\n')
        #fh.write('HMMIREST: LATMASKDEN =  */%%%%%%%%/???????????????????.???\n')
        fh.write('HFBLAT: LATPROBSCALE = 0.06667\n')
        fh.write('HMMIREST: E = 2.0\n')
        fh.write('ISMOOTHTAU = 50\n')
        fh.write('MPE = TRUE\n')
        #fh.write('MWE = TRUE\n')

    def hmmirest(script_file, split_num):
        """
        Build one HMMIRest command line reading its inputs from script_file.
        split_num is passed as -p; the -u mv option is added only for 0.
        """
        cmd = 'HMMIRest -A -D -T 1 -C %s' % hmmirest_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -q %s' % num_lattice_dir
        cmd += ' -r %s' % den_lattice_dir
        if split_num == 0:
            cmd += ' -u mv'
        cmd += ' -p %d' % split_num
        cmd += ' -S %s' % script_file
        cmd += ' -M %s %s' % (output_dir, model_list)
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list,
                                            output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HMMIRest commands, numbering the splits from 1
    cmds = []
    split_lists = os.popen('ls %s/mfc.list.*' % output_dir).read().splitlines()
    split_num = 0
    for split_list in split_lists:
        split_num += 1
        cmds.append(hmmirest(split_list, split_num))

    ## Non-parallel case
    if model.local == 1:
        for cmd in cmds:
            print(cmd)
            ## Read the pipe to the end: this shows the job's output and
            ## makes sure the job has finished writing before the
            ## accumulator files are collected below
            print(os.popen(cmd).read())

    ## Parallel case: one command per line in cmds_file
    else:
        cmds_file = '%s/hmmirest.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds:
                fh.write('%s\n' % cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Gather the created .acc files
    acc_file = '%s/hmmirest.list' % output_dir
    os.system('ls %s/HDR*.acc* > %s' % (output_dir, acc_file))

    ## Combine acc files into a new HMM
    cmd = hmmirest(acc_file, 0)
    cmd += ' >> %s/hmmirest.log' % output_dir
    if model.local == 1:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    ## Clean up
    #os.system('rm -f %s/mfc.list.* %s/HER*.acc' %(output_dir, output_dir))

    return output_dir
Exemplo n.º 26
0
    def decode(self,
               model,
               mfc_list,
               gold_mlf,
               lm_file,
               gaussians,
               iter,
               mmi=False,
               diag=False,
               xword_id='',
               output_dir=None):
        """
        Decode the utterances listed in mfc_list, score the result against
        gold_mlf with HResults and return the word error rate (100 - Acc).

        The model file is chosen under model.exp from the flags:
        MMI/Models/HMMI-<gaussians>-<iter>/MMF if mmi, else
        Diag/HMM-<gaussians>-<iter>/MMF if diag, else
        Xword<xword_id>/HMM-<gaussians>-<iter>/MMF.  HVite is used when
        self.decode_func == 'hvite', HDecode otherwise.

        Reads from self: exp, beam, decode_func, insertion_penalty, lm_scale,
        dict, local, jobs.  Reads from model: exp, verbose.

        Side effects: creates <output_dir>/decode, sets self.decode_config to
        the config file written there, and leaves the per-split lists and
        outputs in that directory.

        Raises IndexError if the HResults log is empty or the re-run output
        has no 'Acc=' field.
        """

        if mmi:
            model_file = '%s/MMI/Models/HMMI-%d-%d/MMF' % (model.exp,
                                                           gaussians, iter)
        elif diag:
            model_file = '%s/Diag/HMM-%d-%d/MMF' % (model.exp, gaussians, iter)
        else:
            model_file = '%s/Xword%s/HMM-%d-%d/MMF' % (model.exp, xword_id,
                                                       gaussians, iter)
        model_list = '%s/tied.list' % model.exp

        ## NOTE: '/decode' is appended unconditionally, so the default ends up
        ## as <self.exp>/decode/decode; kept as is so existing paths stay valid
        if not output_dir:
            output_dir = '%s/decode' % self.exp
        output_dir = '%s/decode' % output_dir
        util.create_new_dir(output_dir)
        results_log = '%s/hresults.log' % output_dir
        output_mlf = '%s/decoded.mlf' % output_dir

        def hvite(split_list, output):
            """Build the HVite command for one split of the MFC list."""
            cmd = 'HVite -D -A -T 1 -l "*" '
            cmd += '-t %f ' % self.beam
            cmd += '-C %s ' % self.decode_config
            cmd += '-H %s ' % model_file
            cmd += '-S %s ' % split_list
            cmd += '-i %s ' % output
            cmd += '-w %s ' % lm_file
            cmd += '-p %f ' % self.insertion_penalty
            cmd += '-s %f ' % self.lm_scale
            cmd += '%s %s' % (self.dict, model_list)
            return cmd

        ## HDecode parameters
        utts_per_split = 5
        block_size = 1
        word_end_beam = 150.0
        max_model = 0

        def hdecode(split_list, output):
            """Build the HDecode command for one split of the MFC list."""
            cmd = 'HDecode -D -A -V -T 9 -o M -C %s' % self.decode_config
            cmd += ' -H %s' % model_file
            cmd += ' -k %d' % block_size
            cmd += ' -t %f 100.0' % self.beam
            cmd += ' -v %f 115.0' % word_end_beam
            cmd += ' -u %d' % max_model
            cmd += ' -s %f' % self.lm_scale
            cmd += ' -p %f' % self.insertion_penalty
            cmd += ' -w %s' % lm_file
            cmd += ' -S %s' % split_list
            cmd += ' -i %s' % output
            cmd += ' %s %s' % (self.dict, model_list)
            if model.verbose > 0:
                cmd += ' >%s/%s.log' % (output_dir,
                                        os.path.basename(split_list))
            return cmd

        ## Split up MFC list with unix split
        cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list,
                                                output_dir, 'mfc.list.')
        os.system(cmd)

        ## Create appropriate config file (left empty for any other
        ## decode_func value)
        self.decode_config = '%s/%s.config' % (output_dir, self.decode_func)
        with open(self.decode_config, 'w') as fh:
            if self.decode_func == 'hvite':
                fh.write('FORCECXTEXP = T\n')
                fh.write('ALLOWXWRDEXP = T\n')
            elif self.decode_func == 'hdecode':
                #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n')
                fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
                fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
                fh.write('RAWMITFORMAT = T\n')
                fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
                fh.write('STARTWORD = <s>\n')
                fh.write('ENDWORD = </s>\n')

        ## Create the HVite/HDecode commands
        cmds = []
        outputs = []
        split_lists = os.popen('ls %s/mfc.list.*' %
                               output_dir).read().splitlines()
        for split_list in split_lists:
            output = split_list.replace('mfc.list', 'align.output')
            outputs.append(output)
            if self.decode_func == 'hvite':
                cmds.append(hvite(split_list, output))
            else:
                cmds.append(hdecode(split_list, output))

        if self.local == 1:
            for cmd in cmds:
                print(cmd)
                print(os.popen(cmd).read())
        else:
            cmds_file = '%s/hvite.commands' % output_dir
            with open(cmds_file, 'w') as fh:
                for cmd in cmds:
                    fh.write('%s\n' % cmd)
            util.run_parallel(cmds_file, self.jobs, output_dir)
            #os.system('rm -f %s' %cmds_file)

        ## Merge outputs.  os.system blocks until the shell command is done,
        ## so the merged MLF is complete before HResults reads it.
        os.system('rm -f %s' % output_mlf)
        os.system('cat %s | grep -v "<" - > %s' %
                  (' '.join(outputs), output_mlf))

        ## Evaluate
        cmd = 'HResults -h -n -A -T 1 -c'
        cmd += ' -I %s' % gold_mlf
        cmd += ' %s %s > %s' % (model_list, output_mlf, results_log)
        os.system(cmd)
        with open(results_log) as fh:
            results_text = fh.read()
        print(results_text)

        ## The first line of the log is treated as the HResults command line
        ## (presumably the echo produced by -A); it is re-run without -h to
        ## get output that contains an 'Acc=' field
        rerun_cmd = results_text.splitlines()[0].replace('-h ', '')
        rerun_output = os.popen(rerun_cmd).read()
        ## Accept a leading '-': a negative accuracy would otherwise be
        ## captured as an empty string and make float() fail
        accuracy = float(re.findall(r'Acc=(-?[0-9.]*)', rerun_output)[0])
        raw_wer = 100 - accuracy

        ## The split lists and per-split outputs are left in place: the
        ## removal below followed the return statement and was never reached.
        ## Note its 'mfc.list.*' pattern would also match the per-split .log
        ## files written when model.verbose > 0.
        #os.system('rm -f %s/mfc.list.* %s/align.output.*' %
        #          (output_dir, output_dir))

        return raw_wer
Exemplo n.º 27
0
    def decode(self, model, mfc_list, gold_mlf, lm_file, gaussians, iter, mmi=False, diag=False, xword_id='', output_dir=None):
        """
        Run HVite or HDecode over the utterances in mfc_list, evaluate the
        decoded MLF against gold_mlf with HResults, and return the word error
        rate computed as 100 - Acc.

        Model selection (all under model.exp): mmi picks
        MMI/Models/HMMI-<gaussians>-<iter>/MMF, diag picks
        Diag/HMM-<gaussians>-<iter>/MMF, otherwise
        Xword<xword_id>/HMM-<gaussians>-<iter>/MMF is used.  The decoder is
        HVite when self.decode_func == 'hvite' and HDecode in every other case.

        Attributes read from self: exp, beam, decode_func, insertion_penalty,
        lm_scale, dict, local, jobs.  Attributes read from model: exp, verbose.

        Side effects: creates <output_dir>/decode, stores the path of the
        config file written there in self.decode_config, and leaves the
        per-split lists and outputs in that directory.

        Raises IndexError if the HResults log is empty or the re-run output
        contains no 'Acc=' field.
        """

        if mmi:
            model_file = '%s/MMI/Models/HMMI-%d-%d/MMF' %(model.exp, gaussians, iter)
        elif diag:
            model_file = '%s/Diag/HMM-%d-%d/MMF' %(model.exp, gaussians, iter)
        else:
            model_file = '%s/Xword%s/HMM-%d-%d/MMF' %(model.exp, xword_id, gaussians, iter)
        model_list = '%s/tied.list' %model.exp

        ## NOTE: '/decode' is appended in both cases, so the default becomes
        ## <self.exp>/decode/decode; left unchanged to keep existing paths
        if not output_dir:
            output_dir = '%s/decode' %self.exp
        output_dir = '%s/decode' %output_dir
        util.create_new_dir(output_dir)
        results_log = '%s/hresults.log' %output_dir
        output_mlf = '%s/decoded.mlf' %output_dir

        def hvite(split_list, output):
            """Return the HVite command line for one split of the MFC list."""
            cmd  = 'HVite -D -A -T 1 -l "*" '
            cmd += '-t %f ' %self.beam
            cmd += '-C %s ' %self.decode_config
            cmd += '-H %s ' %model_file
            cmd += '-S %s ' %split_list
            cmd += '-i %s ' %output
            cmd += '-w %s ' %lm_file
            cmd += '-p %f ' %self.insertion_penalty
            cmd += '-s %f ' %self.lm_scale
            cmd += '%s %s' %(self.dict, model_list)
            return cmd

        ## HDecode parameters
        utts_per_split = 5
        block_size = 1
        word_end_beam = 150.0
        max_model = 0

        def hdecode(split_list, output):
            """Return the HDecode command line for one split of the MFC list."""
            cmd  = 'HDecode -D -A -V -T 9 -o M -C %s' %self.decode_config
            cmd += ' -H %s' %model_file
            cmd += ' -k %d' %block_size
            cmd += ' -t %f 100.0' %self.beam
            cmd += ' -v %f 115.0' %word_end_beam
            cmd += ' -u %d' %max_model
            cmd += ' -s %f' %self.lm_scale
            cmd += ' -p %f' %self.insertion_penalty
            cmd += ' -w %s' %lm_file
            cmd += ' -S %s' %split_list
            cmd += ' -i %s' %output
            cmd += ' %s %s' %(self.dict, model_list)
            if model.verbose > 0:
                cmd += ' >%s/%s.log' %(output_dir, os.path.basename(split_list))
            return cmd

        ## Split up MFC list with unix split
        cmd = 'split -a 4 -d -l %d %s %s/%s' %(utts_per_split, mfc_list, output_dir, 'mfc.list.')
        os.system(cmd)

        ## Create appropriate config file (stays empty for any other
        ## decode_func value)
        self.decode_config = '%s/%s.config' %(output_dir, self.decode_func)
        with open(self.decode_config, 'w') as fh:
            if self.decode_func == 'hvite':
                fh.write('FORCECXTEXP = T\n')
                fh.write('ALLOWXWRDEXP = T\n')
            elif self.decode_func == 'hdecode':
                #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n')
                fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
                fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
                fh.write('RAWMITFORMAT = T\n')
                fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
                fh.write('STARTWORD = <s>\n')
                fh.write('ENDWORD = </s>\n')

        ## Create the HVite/HDecode commands
        cmds = []
        outputs = []
        split_lists = os.popen('ls %s/mfc.list.*' %output_dir).read().splitlines()
        for split_list in split_lists:
            output = split_list.replace('mfc.list', 'align.output')
            outputs.append(output)
            if self.decode_func == 'hvite':
                cmds.append(hvite(split_list, output))
            else:
                cmds.append(hdecode(split_list, output))

        if self.local == 1:
            for cmd in cmds:
                print(cmd)
                print(os.popen(cmd).read())
        else:
            cmds_file = '%s/hvite.commands' %output_dir
            with open(cmds_file, 'w') as fh:
                for cmd in cmds:
                    fh.write('%s\n' %cmd)
            util.run_parallel(cmds_file, self.jobs, output_dir)
            #os.system('rm -f %s' %cmds_file)

        ## Merge outputs.  os.system returns only after the shell command has
        ## finished, so decoded.mlf is fully written before it is scored.
        os.system('rm -f %s' %output_mlf)
        os.system('cat %s | grep -v "<" - > %s' %(' '.join(outputs), output_mlf))

        ## Evaluate
        cmd  = 'HResults -h -n -A -T 1 -c'
        cmd += ' -I %s' %gold_mlf
        cmd += ' %s %s > %s' %(model_list, output_mlf, results_log)
        os.system(cmd)
        with open(results_log) as fh:
            results_text = fh.read()
        print(results_text)

        ## The log's first line is used as the HResults command line
        ## (presumably the -A echo); re-running it without -h yields output
        ## that carries an 'Acc=' field
        rerun_cmd = results_text.splitlines()[0].replace('-h ', '')
        rerun_output = os.popen(rerun_cmd).read()
        ## A leading '-' is allowed so a negative accuracy parses instead of
        ## being captured as '' and failing in float()
        accuracy = float(re.findall(r'Acc=(-?[0-9.]*)', rerun_output)[0])
        raw_wer = 100 - accuracy

        ## Split lists and per-split outputs are kept: the removal below came
        ## after the return statement and never ran.  Its 'mfc.list.*' pattern
        ## would also match the .log files written when model.verbose > 0.
        #os.system('rm -f %s/mfc.list.* %s/align.output.*' %(output_dir, output_dir))

        return raw_wer
Exemplo n.º 28
0
    def train(self):

        ## Copy config file to the experiment dir
        config_output = '%s/config' % self.exp
        self.config.write(open(config_output, 'w'))
        log(self.logfh, 'TRAINING with config [%s]' % config_output)

        if self.train_pipeline['coding']:
            log(self.logfh, 'CODING started')
            import coding
            util.create_new_dir(self.coding_root)
            coding.create_config(self)
            count = coding.wav_to_mfc(self, self.coding_root, self.mfc_list)
            os.system('cp %s %s/mfc.list.original' %
                      (self.mfc_list, self.misc))
            log(self.logfh, 'wrote mfc files [%d]' % count)
            log(self.logfh, 'CODING finished')

        if self.train_pipeline['lm']:
            log(self.logfh, 'MLF/LM/DICT started')
            import dict_and_lm
            phone_set = dict_and_lm.fix_cmu_dict(self.orig_dict, self.htk_dict)
            num_utts, words = dict_and_lm.make_mlf_from_transcripts(
                self, self.htk_dict, self.setup, self.data, self.word_mlf,
                self.mfc_list)
            log(self.logfh,
                'wrote word mlf [%d utts] [%s]' % (num_utts, self.word_mlf))
            os.system('cp %s %s/mfc.list.filtered.by.dict' %
                      (self.mfc_list, self.misc))
            num_entries = dict_and_lm.make_train_dict(self.htk_dict,
                                                      self.train_dict, words)
            dict_and_lm.make_decode_dict(self.htk_dict, self.decode_dict,
                                         words)
            log(
                self.logfh, 'wrote training dictionary [%d entries] [%s]' %
                (num_entries, self.train_dict))

            util.create_new_dir(self.lm_dir)
            train_vocab = '%s/vocab' % self.lm_dir
            ppl = dict_and_lm.build_lm_from_mlf(self, self.word_mlf,
                                                self.train_dict, train_vocab,
                                                self.lm_dir, self.lm,
                                                self.lm_order)
            log(self.logfh,
                'wrote lm [%s] training ppl [%1.2f]' % (self.lm, ppl))
            log(self.logfh, 'MLF/LM/DICT finished')

        if self.train_pipeline['flat_start']:
            log(self.logfh, 'FLAT START started')
            import init_hmm
            init_hmm.word_to_phone_mlf(self, self.train_dict, self.word_mlf,
                                       self.phone_mlf, self.phone_list)
            log(self.logfh, 'wrote phone mlf [%s]' % self.phone_mlf)

            os.system('cp %s %s/phone.mlf.from.dict' %
                      (self.phone_mlf, self.misc))
            os.system('bzip2 -f %s/phone.mlf.from.dict' % self.misc)
            init_hmm.make_proto_hmm(self, self.mfc_list, self.proto_hmm)
            hmm_dir, num_mfcs = init_hmm.initialize_hmms(
                self, self.mono_root, self.mfc_list, self.phone_list,
                self.proto_hmm)
            log(self.logfh,
                'initialized an HMM for each phone in [%s]' % hmm_dir)
            log(self.logfh,
                'used [%d] mfc files to compute variance floor' % num_mfcs)

            import train_hmm
            for iter in range(1, self.initial_mono_iters + 1):
                hmm_dir, k, L = train_hmm.run_iter(self, self.mono_root,
                                                   hmm_dir, self.phone_mlf,
                                                   self.phone_list, 1, iter,
                                                   '')
                log(
                    self.logfh,
                    'ran an iteration of BW in [%s] lik/fr [%1.4f]' %
                    (hmm_dir, L))

            align_config = '%s/config.align' % self.mono_root
            fh = open(align_config, 'w')
            fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
            fh.close()

            align_dir = train_hmm.align(self, self.mono_root, self.mfc_list,
                                        hmm_dir, self.word_mlf, self.phone_mlf,
                                        self.phone_list, self.train_dict,
                                        align_config)
            log(
                self.logfh,
                'aligned with model in [%s], wrote phone mlf [%s]' %
                (hmm_dir, self.phone_mlf))
            os.system('cp %s %s/mfc.list.filtered.by.mono.align' %
                      (self.mfc_list, self.misc))

            os.system('cp %s %s/phone.mlf.from.mono.align' %
                      (self.phone_mlf, self.misc))
            os.system('bzip2 -f %s/phone.mlf.from.mono.align' % self.misc)

            for iter in range(self.initial_mono_iters + 1,
                              self.initial_mono_iters + 1 + self.mono_iters):
                hmm_dir, k, L = train_hmm.run_iter(self, self.mono_root,
                                                   hmm_dir, self.phone_mlf,
                                                   self.phone_list, 1, iter,
                                                   '')
                log(
                    self.logfh,
                    'ran an iteration of BW in [%s] lik/fr [%1.4f]' %
                    (hmm_dir, L))

            log(self.logfh, 'FLAT START finished')

        if self.train_pipeline['mixup_mono']:
            log(self.logfh, 'MIXUP MONO started')
            import train_hmm

            hmm_dir = '%s/HMM-%d-%d' % (
                self.mono_root, 1, self.initial_mono_iters + self.mono_iters)

            ## mixup everything
            for mix_size in self.mono_mixup_schedule:
                hmm_dir = train_hmm.mixup(self, self.mixup_mono_root, hmm_dir,
                                          self.phone_list, mix_size)
                log(self.logfh,
                    'mixed up to [%d] in [%s]' % (mix_size, hmm_dir))
                for iter in range(1, self.mono_iters + 1):
                    hmm_dir, k, L = train_hmm.run_iter(self,
                                                       self.mixup_mono_root,
                                                       hmm_dir, self.phone_mlf,
                                                       self.phone_list,
                                                       mix_size, iter, '')
                    log(
                        self.logfh,
                        'ran an iteration of BW in [%s] lik/fr [%1.4f]' %
                        (hmm_dir, L))

            log(self.logfh, 'MIXUP MONO finished')

        if self.train_pipeline['mixdown_mono']:
            log(self.logfh, 'MIXDOWN MONO started')
            import train_hmm

            num_gaussians = self.mono_mixup_schedule[-1]
            hmm_dir = '%s/HMM-%d-%d' % (self.mixup_mono_root, num_gaussians,
                                        self.mono_iters)
            train_hmm.mixdown_mono(self, self.mixdown_mono_root, hmm_dir,
                                   self.phone_list)

            log(self.logfh, 'MIXDOWN MONO finished')

        if self.train_pipeline['mono_to_tri']:
            log(self.logfh, 'MONO TO TRI started')
            import train_hmm

            if self.train_pipeline['mixdown_mono']:
                mono_final_dir = '%s/HMM-1-0' % self.mixdown_mono_root
            else:
                mono_final_dir = '%s/HMM-%d-%d' % (self.mono_root, 1,
                                                   self.initial_mono_iters +
                                                   self.mono_iters)

            hmm_dir = train_hmm.mono_to_tri(self, self.xword_root,
                                            mono_final_dir, self.phone_mlf,
                                            self.tri_mlf, self.phone_list,
                                            self.tri_list)
            log(self.logfh, 'initialized triphone models in [%s]' % hmm_dir)
            log(self.logfh, 'created triphone mlf [%s]' % self.tri_mlf)

            os.system('cp %s %s/tri.mlf.from.mono.align' %
                      (self.tri_mlf, self.misc))
            os.system('bzip2 -f %s/tri.mlf.from.mono.align' % self.misc)
            os.system('cp %s %s/tri.list.from.mono.align' %
                      (self.tri_list, self.misc))

            for iter in range(1, self.initial_tri_iters + 1):
                hmm_dir, k, L = train_hmm.run_iter(self, self.xword_root,
                                                   hmm_dir, self.tri_mlf,
                                                   self.tri_list, 1, iter, '')
                log(
                    self.logfh,
                    'ran an iteration of BW in [%s] lik/fr [%1.4f]' %
                    (hmm_dir, L))

            xword_tie_dir = '%s/HMM-%d-%d' % (self.xword_root, 1,
                                              self.initial_tri_iters + 1)
            hmm_dir = train_hmm.tie_states_search(self, xword_tie_dir, hmm_dir,
                                                  self.phone_list,
                                                  self.tri_list,
                                                  self.tied_list)
            log(self.logfh, 'tied states in [%s]' % hmm_dir)

            os.system('cp %s %s/tied.list.initial' %
                      (self.tied_list, self.misc))

            hmm_dir = '%s/HMM-%d-%d' % (self.xword_root, 1,
                                        self.initial_tri_iters + 1)
            for iter in range(self.initial_tri_iters + 2,
                              self.initial_tri_iters + 1 + self.tri_iters + 1):
                hmm_dir, k, L = train_hmm.run_iter(self, self.xword_root,
                                                   hmm_dir, self.tri_mlf,
                                                   self.tied_list, 1, iter, '')
                log(
                    self.logfh,
                    'ran an iteration of BW in [%s] lik/fr [%1.4f]' %
                    (hmm_dir, L))

            log(self.logfh, 'MONO TO TRI finished')

        if self.train_pipeline['mixup_tri']:
            log(self.logfh, 'MIXUP TRI started')
            import train_hmm

            ## mixup everything
            start_gaussians = 1
            start_iter = self.initial_tri_iters + self.tri_iters + 1
            hmm_dir = '%s/HMM-%d-%d' % (self.xword_root, start_gaussians,
                                        start_iter)
            for mix_size in self.tri_mixup_schedule:
                if mix_size == 2:
                    hmm_dir = train_hmm.mixup(self,
                                              self.xword_root,
                                              hmm_dir,
                                              self.tied_list,
                                              mix_size,
                                              estimateVarFloor=1)
                else:
                    hmm_dir = train_hmm.mixup(self, self.xword_root, hmm_dir,
                                              self.tied_list, mix_size)
                log(self.logfh,
                    'mixed up to [%d] in [%s]' % (mix_size, hmm_dir))
                for iter in range(1, self.tri_iters_per_split + 1):
                    hmm_dir, k, L = train_hmm.run_iter(self, self.xword_root,
                                                       hmm_dir, self.tri_mlf,
                                                       self.tied_list,
                                                       mix_size, iter, '')
                    log(
                        self.logfh,
                        'ran an iteration of BW in [%s] lik/fr [%1.4f]' %
                        (hmm_dir, L))
            log(self.logfh, 'MIXUP TRI finished')

        if self.train_pipeline['align_with_xword']:
            log(self.logfh, 'XWORD ALIGN started')
            import train_hmm

            align_config = '%s/config.align' % self.xword_root
            train_hmm.make_hvite_xword_config(self, align_config,
                                              'MFCC_0_D_A_Z')
            num_gaussians = self.tri_mixup_schedule[-1]
            iter_num = self.tri_iters_per_split
            hmm_dir = '%s/HMM-%d-%d' % (self.xword_root, num_gaussians,
                                        iter_num)
            realigned_mlf = '%s/raw_tri_xword_realigned.mlf' % self.misc

            # Use the original, mfc list that has prons for every word
            os.system('cp %s/mfc.list.filtered.by.dict %s' %
                      (self.misc, self.mfc_list))

            align_dir = train_hmm.align(self, self.xword_root, self.mfc_list,
                                        hmm_dir, self.word_mlf, realigned_mlf,
                                        self.tied_list, self.train_dict,
                                        align_config)
            log(
                self.logfh, 'aligned with model in [%s], tri mlf [%s]' %
                (hmm_dir, realigned_mlf))

            # Because of state tying, the triphones in the mlf will only be
            # valid for this state tying. Strip down to monophones, the
            # correct triphones will be created later in mono_to_tri
            train_hmm.map_tri_to_mono(self, align_dir, realigned_mlf,
                                      self.phone_mlf)
            os.system('cp %s %s/phone.mlf.from.xword.align' %
                      (self.phone_mlf, self.misc))
            os.system('bzip2 -f %s/phone.mlf.from.xword.align' % self.misc)
            os.system('bzip2 -f %s' % realigned_mlf)

            log(self.logfh, 'XWORD ALIGN finished')

        if self.train_pipeline['mono_to_tri_from_xword']:
            log(self.logfh, 'MONO TO TRI FROM XWORD started')
            import train_hmm

            #Assume that midown mono happened?
            mono_final_dir = '%s/HMM-1-0' % self.mixdown_mono_root

            hmm_dir = train_hmm.mono_to_tri(self, self.xword_1_root,
                                            mono_final_dir, self.phone_mlf,
                                            self.tri_mlf, self.phone_list,
                                            self.tri_list)
            log(self.logfh, 'initialized triphone models in [%s]' % hmm_dir)

            os.system('cp %s %s/tri.mlf.from.xword.align' %
                      (self.tri_mlf, self.misc))
            os.system('bzip2 -f %s/tri.mlf.from.xword.align' % self.misc)
            os.system('cp %s %s/tri.list.from.xword.align' %
                      (self.tri_list, self.misc))

            two_model_config = '%s/config.two_model' % self.xword_1_root
            fh = open(two_model_config, 'w')
            fh.write('ALIGNMODELMMF = %s/HMM-%d-%d/MMF\n' %
                     (self.xword_root, self.tri_mixup_schedule[-1],
                      self.tri_iters_per_split))
            fh.write('ALIGNHMMLIST = %s\n' % self.tied_list)
            fh.close()

            # Do one pass of two-model re-estimation
            extra = ' -C %s' % two_model_config
            hmm_dir, k, L = train_hmm.run_iter(self, self.xword_1_root,
                                               hmm_dir, self.tri_mlf,
                                               self.tri_list, 1, 1, extra)
            log(self.logfh,
                'ran an iteration of BW in [%s] lik/fr [%1.4f]' % (hmm_dir, L))

            xword_tie_dir = '%s/HMM-1-2' % self.xword_1_root
            hmm_dir = train_hmm.tie_states_search(self, xword_tie_dir, hmm_dir,
                                                  self.phone_list,
                                                  self.tri_list,
                                                  self.tied_list)
            log(self.logfh, 'tied states in [%s]' % hmm_dir)

            os.system('cp %s %s/tied.list.second' %
                      (self.tied_list, self.misc))

            hmm_dir = '%s/HMM-1-2' % self.xword_1_root
            for iter in range(3, self.tri_iters + 3):
                hmm_dir, k, L = train_hmm.run_iter(self, self.xword_1_root,
                                                   hmm_dir, self.tri_mlf,
                                                   self.tied_list, 1, iter, '')
                log(
                    self.logfh,
                    'ran an iteration of BW in [%s] lik/fr [%1.4f]' %
                    (hmm_dir, L))

            log(self.logfh, 'MONO TO TRI FROM XWORD finished')

        if self.train_pipeline['mixup_tri_2']:
            log(self.logfh, 'MIXUP TRI 2 started')
            import train_hmm

            ## mixup everything
            start_gaussians = 1
            start_iter = self.tri_iters + 2
            hmm_dir = '%s/HMM-%d-%d' % (self.xword_1_root, start_gaussians,
                                        start_iter)
            for mix_size in self.tri_mixup_schedule:
                if mix_size == 2:
                    hmm_dir = train_hmm.mixup(self,
                                              self.xword_1_root,
                                              hmm_dir,
                                              self.tied_list,
                                              mix_size,
                                              estimateVarFloor=1)
                else:
                    hmm_dir = train_hmm.mixup(self, self.xword_1_root, hmm_dir,
                                              self.tied_list, mix_size)
                log(self.logfh,
                    'mixed up to [%d] in [%s]' % (mix_size, hmm_dir))
                for iter in range(1, self.tri_iters_per_split + 1):
                    hmm_dir, k, L = train_hmm.run_iter(self, self.xword_1_root,
                                                       hmm_dir, self.tri_mlf,
                                                       self.tied_list,
                                                       mix_size, iter, '')
                    log(
                        self.logfh,
                        'ran an iteration of BW in [%s] lik/fr [%1.4f]' %
                        (hmm_dir, L))
            log(self.logfh, 'MIXUP TRI 2 finished')

        if self.train_pipeline['diag']:
            log(self.logfh, 'DIAG started')
            import train_hmm

            num_gaussians = self.tri_mixup_schedule[-1]
            iter_num = self.tri_iters_per_split

            if self.train_pipeline['mixup_tri_2']:
                seed_dir = '%s/HMM-%d-%d' % (self.xword_1_root, num_gaussians,
                                             iter_num)
            else:
                seed_dir = '%s/HMM-%d-%d' % (self.xword_root, num_gaussians,
                                             iter_num)
            hmm_dir, L = train_hmm.diagonalize(self, self.diag_root, seed_dir,
                                               self.tied_list, self.tri_mlf,
                                               num_gaussians)
            log(self.logfh, 'ran diag in [%s] lik/fr [%1.4f]' % (hmm_dir, L))

            for iter in range(1, self.tri_iters_per_split + 1):
                hmm_dir, k, L = train_hmm.run_iter(self, self.diag_root,
                                                   hmm_dir, self.tri_mlf,
                                                   self.tied_list,
                                                   num_gaussians, iter, '')
                log(
                    self.logfh,
                    'ran an iteration of BW in [%s] lik/fr [%1.4f]' %
                    (hmm_dir, L))

            log(self.logfh, 'DIAG finished')

        ## DISCRIM (MMI) stage (runs only when enabled in
        ## self.train_pipeline). In order: build an LM from the training
        ## word MLF, create and post-process lattices under <exp>/MMI/Denom
        ## and <exp>/MMI/Num, then run a fixed number of mmi.run_iter
        ## passes under <exp>/MMI/Models.
        ##
        ## NOTE(review): every mmi.* call below passes a bare name `model`
        ## as its first argument, whereas the train_hmm / dict_and_lm calls
        ## in this method pass `self` in that position. `model` is not
        ## assigned anywhere in the visible part of this method -- confirm
        ## it is defined in an enclosing/global scope, or whether `self`
        ## was intended.
        if self.train_pipeline['mmi']:
            log(self.logfh, 'DISCRIM started')

            ## Common items
            import mmi
            mmi_dir = '%s/MMI' % self.exp
            util.create_new_dir(mmi_dir)
            ## Work on a private copy of the feature list inside mmi_dir.
            ## NOTE(review): the copy goes through the shell and the exit
            ## status of os.system is not checked.
            mfc_list_mmi = '%s/mfc.list' % mmi_dir
            os.system('cp %s %s' % (self.mfc_list, mfc_list_mmi))

            ## Create weak LM
            ## lm_order and target_ppl_ratio are hard-coded here and passed
            ## straight to build_lm_from_mlf; the returned value is logged
            ## as the training perplexity.
            import dict_and_lm
            train_vocab = '%s/vocab' % self.lm_dir
            lm_order = 2
            target_ppl_ratio = 8
            ppl = dict_and_lm.build_lm_from_mlf(self, self.word_mlf,
                                                self.train_dict, train_vocab,
                                                self.lm_dir, self.mmi_lm,
                                                lm_order, target_ppl_ratio)
            log(
                self.logfh, 'wrote lm for mmi [%s] training ppl [%1.2f]' %
                (self.mmi_lm, ppl))

            ## Create decoding lattices for every utterance
            lattice_dir = '%s/Denom/Lat_word' % mmi_dir
            util.create_new_dir(lattice_dir)
            num_gaussians = self.tri_mixup_schedule[-1]
            iter_num = self.tri_iters_per_split

            ## Seed acoustic model: HMM-<num_gaussians>-<iter_num> under the
            ## root of the latest stage enabled in self.train_pipeline
            ## ('diag' first, then 'mixup_tri_2', else the xword root).
            ## Only the flags are consulted, not the directories.
            if self.train_pipeline['diag']:
                model_dir = '%s/HMM-%d-%d' % (self.diag_root, num_gaussians,
                                              iter_num)
            elif self.train_pipeline['mixup_tri_2']:
                model_dir = '%s/HMM-%d-%d' % (self.xword_1_root, num_gaussians,
                                              iter_num)
            else:
                model_dir = '%s/HMM-%d-%d' % (self.xword_root, num_gaussians,
                                              iter_num)
            mmi.decode_to_lattices(model, lattice_dir, model_dir, mfc_list_mmi,
                                   self.mmi_lm, self.decode_dict,
                                   self.tied_list, self.word_mlf)
            log(self.logfh,
                'generated training lattices in [%s]' % lattice_dir)

            ## Prune and determinize lattices
            pruned_lattice_dir = '%s/Denom/Lat_prune' % mmi_dir
            util.create_new_dir(pruned_lattice_dir)
            mmi.prune_lattices(model, lattice_dir, pruned_lattice_dir,
                               self.decode_dict)
            log(self.logfh, 'pruned lattices in [%s]' % pruned_lattice_dir)

            ## Phone-mark lattices
            ## Input is the pruned Denom lattices; model_dir is still the
            ## seed model chosen above.
            phone_lattice_dir = '%s/Denom/Lat_phone' % mmi_dir
            util.create_new_dir(phone_lattice_dir)
            mmi.phonemark_lattices(model, pruned_lattice_dir,
                                   phone_lattice_dir, model_dir, mfc_list_mmi,
                                   self.mmi_lm, self.decode_dict,
                                   self.tied_list)
            log(self.logfh,
                'phone-marked lattices in [%s]' % phone_lattice_dir)

            ## Create numerator word lattices
            num_lattice_dir = '%s/Num/Lat_word' % mmi_dir
            util.create_new_dir(num_lattice_dir)
            mmi.create_num_lattices(model, num_lattice_dir, self.mmi_lm,
                                    self.decode_dict, self.word_mlf)
            log(self.logfh,
                'generated numerator lattices in [%s]' % num_lattice_dir)

            ## Phone-mark numerator lattices
            ## Same call as for the Denom lattices, applied to the Num word
            ## lattices with the same seed model_dir.
            num_phone_lattice_dir = '%s/Num/Lat_phone' % mmi_dir
            util.create_new_dir(num_phone_lattice_dir)
            mmi.phonemark_lattices(model, num_lattice_dir,
                                   num_phone_lattice_dir, model_dir,
                                   mfc_list_mmi, self.mmi_lm, self.decode_dict,
                                   self.tied_list)
            log(
                self.logfh, 'phone-marked numerator lattices in [%s]' %
                num_phone_lattice_dir)

            ## Add LM scores to numerator phone lattices
            num_phone_lm_lattice_dir = '%s/Num/Lat_phone_lm' % mmi_dir
            util.create_new_dir(num_phone_lm_lattice_dir)
            mmi.add_lm_lattices(model, num_phone_lattice_dir,
                                num_phone_lm_lattice_dir, self.decode_dict,
                                self.mmi_lm)
            log(
                self.logfh, 'added LM scores to numerator lattices in [%s]' %
                num_phone_lm_lattice_dir)

            ## Modified Baum-Welch estimation
            ## The iteration count is hard-coded. Each pass is given the
            ## model_dir returned by the previous pass (the seed model on
            ## the first pass), together with the LM-scored numerator
            ## lattices and the phone-marked Denom lattices.
            ## NOTE(review): the loop variable `iter` shadows the builtin
            ## of the same name.
            root_dir = '%s/Models' % mmi_dir
            util.create_new_dir(root_dir)
            mmi_iters = 12
            mix_size = num_gaussians
            for iter in range(1, mmi_iters + 1):
                model_dir = mmi.run_iter(model, model_dir,
                                         num_phone_lm_lattice_dir,
                                         phone_lattice_dir, root_dir,
                                         self.tied_list, mfc_list_mmi,
                                         mix_size, iter)
                log(self.logfh,
                    'ran an iteration of Modified BW in [%s]' % model_dir)

            log(self.logfh, 'DISCRIM finished')