예제 #1
0
파일: mmi.py 프로젝트: Debanjan1234/pyhtk
def add_lm_lattices(model, lattice_dir, output_dir, dict, lm):
    """
    Add language-model scores to the lattices in lattice_dir with HLRescore.

    The HLRescore config file, the lattice list, the optional per-split logs
    and the rescored lattices are all written under output_dir.

    model: settings object; .verbose, .local and .jobs are read here
    lattice_dir: directory searched (via util.get_files) for lattice files
    output_dir: directory that receives every file created by this step
    dict: dictionary, passed as the last HLRescore argument
    lm: language model, passed to HLRescore with -n
    """

    sys.stderr.write('adding LM scores to numerator lattices\n')

    ## Create a config file to use with HLRescore
    hlrescore_config = '%s/hlrescore.config' % output_dir
    config_lines = (
        #'HLANGMODFILTER = "gunzip -c $.gz"\n',
        'HNETFILTER = "gunzip -c < $.gz"\n',
        'HNETOFILTER = "gzip -c > $.gz"\n',
        'RAWMITFORMAT = T\n',
        'HLRESCORE: FIXBADLATS = TRUE\n',
        'HLRESCORE: STARTWORD = <s>\n',
        'HLRESCORE: ENDWORD = </s>\n',
    )
    with open(hlrescore_config, 'w') as fh:
        fh.writelines(config_lines)

    ## HLRescore parameters
    grammar_scale = 15.0
    trans_penalty = 0.0

    def hlrescore(list_file, path):
        ## Build one HLRescore command for the lattices named in list_file;
        ## path is the sub-directory shared by the input and output lattices
        parts = [
            'HLRescore -A -D -T 1 -w -c -q tvaldm -C %s' % hlrescore_config,
            '-S %s' % list_file,
            '-L %s/%s/' % (lattice_dir, path),
            '-l %s/%s/' % (output_dir, path),
            '-s %f' % grammar_scale,
            '-p %f' % trans_penalty,
            '-n %s' % lm,
            '%s' % dict,
        ]
        if model.verbose > 0:
            parts.append('>%s/%s.log' % (output_dir, os.path.basename(list_file)))
        return ' '.join(parts)

    ## Split up lattice list (entries are written without the '.gz' text)
    lattice_list = '%s/lattice.list' % output_dir
    with open(lattice_list, 'w') as fh:
        lattice_files = util.get_files(lattice_dir, r'.*\.lat')
        fh.write('\n'.join(name.replace('.gz', '') for name in lattice_files))
    split_lattice = SplitList(output_dir, lattice_list, by_path=True)

    ## Create the HLRescore commands, one per split, making sure the
    ## matching output sub-directory exists
    cmds = []
    for list_file in split_lattice.get_files():
        key = split_lattice.get_key(list_file)
        new_output = '%s/%s' % (output_dir, key)
        if not os.path.isdir(new_output):
            os.makedirs(new_output)
        cmds.append(hlrescore(list_file, key))

    if model.local == 1:
        ## Run sequentially, echoing each command and its captured stdout
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        ## Parallel case: one command per line in cmds_file
        cmds_file = '%s/hlrescore.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            fh.writelines('%s\n' % cmd for cmd in cmds)
        util.run_parallel(cmds_file, model.jobs, output_dir)
예제 #2
0
def create_num_lattices(model, output_dir, lm, dict, word_mlf):
    """
    Create numerator word lattices from word_mlf with HLRescore.

    model: settings object; .verbose, .local, .jobs and .split_path_letters
        are read here
    output_dir: directory receiving the config, label list, logs and lattices
    lm: accepted for interface compatibility; not used by this function
    dict: dictionary, passed as the last HLRescore argument
    word_mlf: label file passed to HLRescore with -I; its lines containing
        ".lab" form the work list that is split across commands
    """

    sys.stderr.write('Creating numerator word lattices\n')

    ## Create a config file to use with HLRescore
    hlrescore_config = '%s/hlrescore.config' % output_dir
    config_lines = (
        #'HLANGMODFILTER = "gunzip -c $.gz"\n',
        'HNETFILTER = "gunzip -c < $.gz"\n',
        'HNETOFILTER = "gzip -c > $.gz"\n',
        'RAWMITFORMAT = T\n',
        'HLRESCORE: FIXBADLATS = TRUE\n',
        'HLRESCORE: STARTWORD = <s>\n',
        'HLRESCORE: ENDWORD = </s>\n',
    )
    with open(hlrescore_config, 'w') as fh:
        fh.writelines(config_lines)

    def hlrescore(list_file, output):
        ## Build one HLRescore command for the labels named in list_file
        parts = [
            'HLRescore -A -D -T 1 -w -f -q tvalqr -C %s' % hlrescore_config,
            '-S %s' % list_file,
            '-I %s' % word_mlf,
            '-l %s/' % output,
            '%s' % dict,
        ]
        if model.verbose > 0:
            parts.append('>%s/%s.log' % (output_dir, os.path.basename(list_file)))
        return ' '.join(parts)

    ## Split the word mlf labels to create inputs for HLRescore
    label_list = '%s/labels.list' % output_dir
    os.system('grep "\\.lab" %s > %s' % (word_mlf, label_list))
    split_label = SplitList(output_dir,
                            label_list,
                            by_path=False,
                            by_letters=model.split_path_letters)

    ## Create the HLRescore commands, one per split, making sure the
    ## matching output directory exists
    cmds = []
    for list_file in split_label.get_files():
        output = '%s/%s' % (output_dir, split_label.get_key(list_file))
        if not os.path.isdir(output):
            os.makedirs(output)
        cmds.append(hlrescore(list_file, output))

    if model.local == 1:
        ## Run sequentially, echoing each command and its captured stdout
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        ## Parallel case: one command per line in cmds_file
        cmds_file = '%s/hlrescore.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            fh.writelines('%s\n' % cmd for cmd in cmds)
        util.run_parallel(cmds_file, model.jobs, output_dir)
예제 #3
0
파일: mmi.py 프로젝트: Debanjan1234/pyhtk
def create_num_lattices(model, output_dir, lm, dict, word_mlf):
    """
    Create numerator word lattices from word_mlf with HLRescore.

    model: settings object; .verbose, .local, .jobs and .split_path_letters
        are read here
    output_dir: directory receiving the config, label list, logs and lattices
    lm: accepted for interface compatibility; not used by this function
    dict: dictionary, passed as the last HLRescore argument
    word_mlf: label file passed to HLRescore with -I; its lines containing
        ".lab" form the work list that is split across commands
    """

    sys.stderr.write('Creating numerator word lattices\n')

    ## Create a config file to use with HLRescore
    hlrescore_config = '%s/hlrescore.config' % output_dir
    with open(hlrescore_config, 'w') as fh:
        #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n')
        fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
        fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
        fh.write('RAWMITFORMAT = T\n')
        fh.write('HLRESCORE: FIXBADLATS = TRUE\n')
        fh.write('HLRESCORE: STARTWORD = <s>\n')
        fh.write('HLRESCORE: ENDWORD = </s>\n')

    def hlrescore(list_file, output):
        ## Build one HLRescore command for the labels named in list_file
        cmd = 'HLRescore -A -D -T 1 -w -f -q tvalqr -C %s' % hlrescore_config
        cmd += ' -S %s' % list_file
        cmd += ' -I %s' % word_mlf
        cmd += ' -l %s/' % output
        cmd += ' %s' % dict
        if model.verbose > 0:
            ## Capture this command's stdout in a log named after its list
            cmd += ' >%s/%s.log' % (output_dir, os.path.basename(list_file))
        return cmd

    ## Split the word mlf labels to create inputs for HLRescore
    label_list = '%s/labels.list' % output_dir
    os.system('grep "\\.lab" %s > %s' % (word_mlf, label_list))
    split_label = SplitList(output_dir, label_list, by_path=False,
                            by_letters=model.split_path_letters)

    ## Create the HLRescore commands, one per split, making sure the
    ## matching output directory exists
    cmds = []
    for list_file in split_label.get_files():
        output = '%s/%s' % (output_dir, split_label.get_key(list_file))
        if not os.path.isdir(output):
            os.makedirs(output)
        cmds.append(hlrescore(list_file, output))

    if model.local == 1:
        ## Run sequentially, echoing each command and its captured stdout
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        ## Parallel case: one command per line in cmds_file
        cmds_file = '%s/hlrescore.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            fh.writelines('%s\n' % cmd for cmd in cmds)
        util.run_parallel(cmds_file, model.jobs, output_dir)
예제 #4
0
def run_iter(model, model_dir, num_lattice_dir, den_lattice_dir, root_dir,
             model_list, mfc_list, mix_size, iter):
    """
    Run an iteration of modified Baum-Welch training using HMMIRest

    The MFC list is split into chunks, one HMMIRest command is built per
    chunk (-p 1, 2, ...), and a final HMMIRest call with -p 0 is run on the
    list of accumulator files gathered in hmmirest.list.

    model: settings object; .setup_length, .local and .jobs are read here
    model_dir: directory holding the input MMF (passed with -H)
    num_lattice_dir, den_lattice_dir: passed to HMMIRest with -q and -r
    root_dir: parent directory of the per-iteration output directory
    model_list: trailing HMMIRest argument
    mfc_list: list file that is split into per-command inputs
    mix_size, iter: integers used only to name the output directory

    Returns the output directory <root_dir>/HMMI-<mix_size>-<iter>.
    """

    output_dir = '%s/HMMI-%d-%d' % (root_dir, mix_size, iter)
    util.create_new_dir(output_dir)
    utts_per_split = max(250, (1 + (model.setup_length / 200)))

    ## Create a config file to use with HMMIRest
    ## (none of these strings is %-formatted, so '%%%' is written literally)
    hmmirest_config = '%s/hmmirest.config' % output_dir
    config_lines = (
        #'HLANGMODFILTER = "gunzip -c $.gz"\n',
        'HNETFILTER = "gunzip -c < $.gz"\n',
        'HNETOFILTER = "gzip -c > $.gz"\n',
        'RAWMITFORMAT = T\n',
        'HPARM: TARGETKIND = MFCC_0_D_A_Z\n',
        #'HMMDEFOFILTER = "gzip -c > $.gz"\n',
        #'HMMDEFFILTER = "gunzip -c < $.gz"\n',
        'HMMIREST: LATMASKNUM = */%%%?????.???\n',
        'HMMIREST: LATMASKDEN = */%%%?????.???\n',
        #'HMMIREST: LATMASKNUM =  */%%%%%%%%/???????????????????.???\n',
        #'HMMIREST: LATMASKDEN =  */%%%%%%%%/???????????????????.???\n',
        'HFBLAT: LATPROBSCALE = 0.06667\n',
        'HMMIREST: E = 2.0\n',
        'ISMOOTHTAU = 50\n',
        'MPE = TRUE\n',
        #'MWE = TRUE\n',
    )
    with open(hmmirest_config, 'w') as fh:
        fh.writelines(config_lines)

    def hmmirest(list_file, split_num):
        ## Build one HMMIRest command; split_num 0 is used for the final
        ## pass over the accumulator list and additionally gets '-u mv'
        cmd = 'HMMIRest -A -D -T 1 -C %s' % hmmirest_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -q %s' % num_lattice_dir
        cmd += ' -r %s' % den_lattice_dir
        if split_num == 0:
            cmd += ' -u mv'
        cmd += ' -p %d' % split_num
        cmd += ' -S %s' % list_file
        cmd += ' -M %s %s' % (output_dir, model_list)
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list,
                                            output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HMMIRest commands; splits are numbered from 1
    inputs = os.popen('ls %s/mfc.list.*' % output_dir).read().splitlines()
    cmds = [hmmirest(list_file, split_num)
            for split_num, list_file in enumerate(inputs, 1)]

    ## Non-parallel case
    if model.local == 1:
        for cmd in cmds:
            print(cmd)
            ## Read the pipe so the command's output is shown and the
            ## command has finished before the next one starts
            print(os.popen(cmd).read())

    ## Parallel case: one command per line in cmds_file
    else:
        cmds_file = '%s/hmmirest.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            fh.writelines('%s\n' % cmd for cmd in cmds)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Gather the created .acc files
    acc_file = '%s/hmmirest.list' % output_dir
    os.system('ls %s/HDR*.acc* > %s' % (output_dir, acc_file))

    ## Combine acc files into a new HMM
    cmd = hmmirest(acc_file, 0)
    cmd += ' >> %s/hmmirest.log' % output_dir
    if model.local == 1:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    ## Clean up
    #os.system('rm -f %s/mfc.list.* %s/HER*.acc' %(output_dir, output_dir))

    return output_dir
예제 #5
0
def add_lm_lattices(model, lattice_dir, output_dir, dict, lm):
    """
    Add language-model scores to the lattices in lattice_dir with HLRescore.

    The HLRescore config file, the lattice list, the optional per-split logs
    and the rescored lattices are all written under output_dir.

    model: settings object; .verbose, .local and .jobs are read here
    lattice_dir: directory searched (via util.get_files) for lattice files
    output_dir: directory that receives every file created by this step
    dict: dictionary, passed as the last HLRescore argument
    lm: language model, passed to HLRescore with -n
    """

    sys.stderr.write('adding LM scores to numerator lattices\n')

    ## Create a config file to use with HLRescore
    hlrescore_config = '%s/hlrescore.config' % output_dir
    with open(hlrescore_config, 'w') as fh:
        #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n')
        fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
        fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
        fh.write('RAWMITFORMAT = T\n')
        fh.write('HLRESCORE: FIXBADLATS = TRUE\n')
        fh.write('HLRESCORE: STARTWORD = <s>\n')
        fh.write('HLRESCORE: ENDWORD = </s>\n')

    ## HLRescore parameters
    grammar_scale = 15.0
    trans_penalty = 0.0

    def hlrescore(list_file, path):
        ## Build one HLRescore command for the lattices named in list_file;
        ## path is the sub-directory shared by the input and output lattices
        cmd = 'HLRescore -A -D -T 1 -w -c -q tvaldm -C %s' % hlrescore_config
        cmd += ' -S %s' % list_file
        cmd += ' -L %s/%s/' % (lattice_dir, path)
        cmd += ' -l %s/%s/' % (output_dir, path)
        cmd += ' -s %f' % grammar_scale
        cmd += ' -p %f' % trans_penalty
        cmd += ' -n %s' % lm
        cmd += ' %s' % dict
        if model.verbose > 0:
            ## Capture this command's stdout in a log named after its list
            cmd += ' >%s/%s.log' % (output_dir, os.path.basename(list_file))
        return cmd

    ## Split up lattice list (entries are written without the '.gz' text)
    lattice_list = '%s/lattice.list' % output_dir
    with open(lattice_list, 'w') as fh:
        lattice_files = util.get_files(lattice_dir, r'.*\.lat')
        fh.write('\n'.join(name.replace('.gz', '') for name in lattice_files))
    split_lattice = SplitList(output_dir, lattice_list, by_path=True)

    ## Create the HLRescore commands, one per split, making sure the
    ## matching output sub-directory exists
    cmds = []
    for list_file in split_lattice.get_files():
        key = split_lattice.get_key(list_file)
        new_output = '%s/%s' % (output_dir, key)
        if not os.path.isdir(new_output):
            os.makedirs(new_output)
        cmds.append(hlrescore(list_file, key))

    if model.local == 1:
        ## Run sequentially, echoing each command and its captured stdout
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        ## Parallel case: one command per line in cmds_file
        cmds_file = '%s/hlrescore.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            fh.writelines('%s\n' % cmd for cmd in cmds)
        util.run_parallel(cmds_file, model.jobs, output_dir)
예제 #6
0
def phonemark_lattices(model, lattice_dir, output_dir, model_dir, mfc_list, lm,
                       dict, model_list):
    """
    Phone-mark the lattices in lattice_dir with HDecode.mod, then drop from
    mfc_list every entry for which no output lattice was found.

    model: settings object; .verbose, .local, .jobs and .logfh are read here
    lattice_dir: directory of input lattices (passed with -L per split)
    output_dir: directory receiving config, logs, lattices and mfc_old.list
    model_dir: directory holding the MMF passed with -H
    mfc_list: list file that is split by path and later REWRITTEN in place;
        the original content is kept as <output_dir>/mfc_old.list
    lm: accepted for interface compatibility; not used by this function
    dict, model_list: trailing HDecode.mod arguments
    """

    sys.stderr.write('Phonemarking lattices\n')

    ## Create a config file to use with HDecode
    hdecode_config = '%s/hdecode.config' % output_dir
    config_lines = (
        #'HLANGMODFILTER = "gunzip -c $.gz"\n',
        'HNETFILTER = "gunzip -c < $.gz"\n',
        'HNETOFILTER = "gzip -c > $.gz"\n',
        'RAWMITFORMAT = T\n',
        'HPARM: TARGETKIND = MFCC_0_D_A_Z\n',
        'GCFREQ = 50\n',
        'HLAT:TRACE = 19\n',
        'HLVNET:TRACE = 1\n',
        'HLVREC:TRACE = 1\n',
        'HLVLM:TRACE = 1\n',
        'LATPRUNEBEAM = 500.0\n',
        'MAXLMLA = 3.0\n',
        'BUILDLATSENTEND = T\n',
        'FORCELATOUT = F\n',
        'STARTWORD = <s>\n',
        'ENDWORD = </s>\n',
    )
    with open(hdecode_config, 'w') as fh:
        fh.writelines(config_lines)

    ## HDecode parameters
    block_size = 5
    beam = 200.0
    lm_scale = 15.0
    word_insertion_penalty = 0.0

    def hdecode_mod(list_file, path):
        ## Input lattices live in <lattice_dir>/<path>/; if that directory
        ## does not exist, fall back to the same name without underscores
        input_dir = '%s/%s/' % (lattice_dir, path)
        if not os.path.isdir(input_dir):
            input_dir = '%s/%s/' % (lattice_dir, path.replace('_', ''))
        parts = [
            'HDecode.mod -A -D -V -T 9 -q tvaldm -z lat -X lat -C %s' % hdecode_config,
            '-H %s/MMF' % model_dir,
            '-k %d' % block_size,
            '-t %f' % beam,
            '-s %f' % lm_scale,
            '-p %f' % word_insertion_penalty,
            '-w',  # %s' %lm
            '-S %s' % list_file,
            '-l %s/%s/' % (output_dir, path),
            '-L %s' % input_dir,
            '%s %s' % (dict, model_list),
        ]
        if model.verbose > 0:
            parts.append('>%s/%s.log' % (output_dir, os.path.basename(list_file)))
        return ' '.join(parts)

    ## Split up MFC list
    split_mfc = SplitList(output_dir, mfc_list, by_path=True)

    ## Create the HDecode commands, one per split, making sure the matching
    ## output sub-directory exists
    cmds = []
    for list_file in split_mfc.get_files():
        key = split_mfc.get_key(list_file)
        new_output = '%s/%s' % (output_dir, key)
        if not os.path.isdir(new_output):
            os.makedirs(new_output)
        cmds.append(hdecode_mod(list_file, key))

    if model.local == 1:
        ## Run sequentially, echoing each command and its captured stdout
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        ## Parallel case: one command per line in cmds_file
        cmds_file = '%s/hdecode_mod.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            fh.writelines('%s\n' % cmd for cmd in cmds)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Copy old mfc list
    old_mfc_list = '%s/mfc_old.list' % output_dir
    os.system('cp %s %s' % (mfc_list, old_mfc_list))

    ## Prune bad lats from the mfc list
    ## (ids are the file names up to the first '.'; a set keeps the
    ## per-line membership test constant-time)
    lat_ids = set(
        os.path.basename(f).split('.')[0]
        for f in util.get_files(output_dir, r'.*\.lat'))
    bad_count = 0
    ## Open the backup first: if the copy failed this raises before
    ## mfc_list is truncated
    with open(old_mfc_list) as old_fh:
        with open(mfc_list, 'w') as fh:
            for mfc in old_fh:
                utt_id = os.path.basename(mfc.strip()).split('.')[0]

                ## Check for missing lattices
                if utt_id in lat_ids:
                    fh.write(mfc)
                    continue
                if model.verbose > 1:
                    util.log_write(model.logfh,
                                   'removed bad lat [%s]' % utt_id)
                bad_count += 1
    util.log_write(model.logfh, 'removed bad lats [%d]' % bad_count)
예제 #7
0
def wav_to_mfc(model, output_dir, mfc_list):
    """
    Use HCopy to code each wav in the setup file. HCopy takes a config
    file (-C) and an input (-S). The input is a file where each line
    looks like:
    <wav file> <mfc file>

    The setup file (model.setup, optionally gzipped) is read line by line;
    the first two whitespace-separated fields of each line are the wav path
    and the HCopy config for that wav. Entries are grouped into list files
    of consecutive lines sharing one config, and a new list file is also
    started whenever the running line count is a multiple of 500.

    Blank setup lines are skipped and not counted; an empty setup file
    produces no HCopy commands and an empty mfc_list.

    model: settings object; .setup, .data, .mfc_config, .local and .jobs
        are read here
    output_dir: directory for the temporary hcopy.list.* files
    mfc_list: path of the file that receives one created MFC path per line

    Returns the number of setup entries processed.
    """
    def hcopy(config, list_file):
        cmd = 'HCopy -A -T 1 -C %s -C %s -S %s' % (model.mfc_config, config,
                                                   list_file)
        return cmd

    lines_per_split = 500

    if model.setup.endswith('gz'):
        setup_fh = gzip.open(model.setup)
    else:
        setup_fh = open(model.setup)

    ## Read the setup file into batches of (config, [(wav, mfc), ...])
    count = 0
    mfcs = []
    batches = []
    try:
        for line in setup_fh:
            fields = line.split()
            if not fields:
                continue
            count += 1
            wav, config = fields[0:2]
            if not os.path.isfile(wav):
                sys.stderr.write('missing [%s]\n' % wav)
            mfc = get_mfc_name_from_wav(wav, model.data)
            mfcs.append(mfc)

            ## Start a new batch for the first entry, at every multiple of
            ## lines_per_split, and whenever the config changes
            if (not batches or count % lines_per_split == 0
                    or config != batches[-1][0]):
                batches.append((config, []))
            batches[-1][1].append((wav, mfc))
    finally:
        setup_fh.close()

    ## Create list files for HCopy <wav file> <mfc file>
    cmds = []
    for index, (config, pairs) in enumerate(batches):
        list_file = '%s/hcopy.list.%d' % (output_dir, index)
        with open(list_file, 'w') as fh:
            fh.writelines('%s %s\n' % pair for pair in pairs)
        cmds.append(hcopy(config, list_file))

    ## Non-parallel case
    if model.local == 1:
        for cmd in cmds:
            os.system(cmd)

    ## Parallel case: one command per line in cmds_file
    elif cmds:
        cmds_file = '%s/hcopy.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            fh.writelines('%s\n' % cmd for cmd in cmds)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Create a file listing all created MFCs
    with open(mfc_list, 'w') as fh:
        fh.writelines('%s\n' % mfc for mfc in mfcs)

    ## Clean up
    os.system('rm -f %s/hcopy.list.*' % output_dir)
    return count
예제 #8
0
파일: test.py 프로젝트: Debanjan1234/pyhtk
    def decode(self, model, mfc_list, gold_mlf, lm_file, gaussians, iter, mmi=False, diag=False, xword_id='', output_dir=None):
        """
        Decode mfc_list with HVite or HDecode (chosen by self.decode_func),
        score the merged output against gold_mlf with HResults, and return
        100 minus the reported Acc value.

        model: settings object; .exp and .verbose are read here
        mfc_list: list file that is split into chunks of 5 lines
        gold_mlf: reference labels, passed to HResults with -I
        lm_file: passed to the decoder with -w
        gaussians, iter: integers selecting the model directory
        mmi, diag, xword_id: select which model tree the MMF is taken from
            (MMI/Models/HMMI-*, Diag/HMM-* or Xword<xword_id>/HMM-*)
        output_dir: base directory; when omitted '<self.exp>/decode' is
            used. In both cases a further 'decode' sub-directory is
            appended and created with util.create_new_dir.

        Side effect: sets self.decode_config to the config file written
        here. Raises IndexError if the HResults log is empty or no
        'Acc=' figure is found in the re-run output.
        """

        if mmi:
            model_file = '%s/MMI/Models/HMMI-%d-%d/MMF' % (model.exp, gaussians, iter)
        elif diag:
            model_file = '%s/Diag/HMM-%d-%d/MMF' % (model.exp, gaussians, iter)
        else:
            model_file = '%s/Xword%s/HMM-%d-%d/MMF' % (model.exp, xword_id, gaussians, iter)
        model_list = '%s/tied.list' % model.exp

        if not output_dir:
            output_dir = '%s/decode' % self.exp
        output_dir = '%s/decode' % output_dir
        util.create_new_dir(output_dir)
        results_log = '%s/hresults.log' % output_dir
        output_mlf = '%s/decoded.mlf' % output_dir

        ## Both builders read self.decode_config, which is assigned further
        ## down, before either builder is called
        def hvite(list_file, output):
            cmd = 'HVite -D -A -T 1 -l "*" '
            cmd += '-t %f ' % self.beam
            cmd += '-C %s ' % self.decode_config
            cmd += '-H %s ' % model_file
            cmd += '-S %s ' % list_file
            cmd += '-i %s ' % output
            cmd += '-w %s ' % lm_file
            cmd += '-p %f ' % self.insertion_penalty
            cmd += '-s %f ' % self.lm_scale
            cmd += '%s %s' % (self.dict, model_list)
            return cmd

        ## HDecode parameters
        utts_per_split = 5
        block_size = 1
        word_end_beam = 150.0
        max_model = 0

        def hdecode(list_file, output):
            cmd = 'HDecode -D -A -V -T 9 -o M -C %s' % self.decode_config
            cmd += ' -H %s' % model_file
            cmd += ' -k %d' % block_size
            cmd += ' -t %f 100.0' % self.beam
            cmd += ' -v %f 115.0' % word_end_beam
            cmd += ' -u %d' % max_model
            cmd += ' -s %f' % self.lm_scale
            cmd += ' -p %f' % self.insertion_penalty
            cmd += ' -w %s' % lm_file
            cmd += ' -S %s' % list_file
            cmd += ' -i %s' % output
            cmd += ' %s %s' % (self.dict, model_list)
            if model.verbose > 0:
                cmd += ' >%s/%s.log' % (output_dir, os.path.basename(list_file))
            return cmd

        ## Split up MFC list with unix split
        cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list, output_dir, 'mfc.list.')
        os.system(cmd)

        ## Create appropriate config file
        ## (any other decode_func value leaves the config file empty)
        self.decode_config = '%s/%s.config' % (output_dir, self.decode_func)
        with open(self.decode_config, 'w') as fh:
            if self.decode_func == 'hvite':
                fh.write('FORCECXTEXP = T\n')
                fh.write('ALLOWXWRDEXP = T\n')
            elif self.decode_func == 'hdecode':
                #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n')
                fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
                fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
                fh.write('RAWMITFORMAT = T\n')
                fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
                fh.write('STARTWORD = <s>\n')
                fh.write('ENDWORD = </s>\n')

        ## Create the HVite/HDecode commands
        build = hvite if self.decode_func == 'hvite' else hdecode
        cmds = []
        outputs = []
        inputs = os.popen('ls %s/mfc.list.*' % output_dir).read().splitlines()
        for list_file in inputs:
            output = list_file.replace('mfc.list', 'align.output')
            outputs.append(output)
            cmds.append(build(list_file, output))

        if self.local == 1:
            for cmd in cmds:
                print(cmd)
                print(os.popen(cmd).read())
        else:
            cmds_file = '%s/hvite.commands' % output_dir
            with open(cmds_file, 'w') as fh:
                fh.writelines('%s\n' % cmd for cmd in cmds)
            util.run_parallel(cmds_file, self.jobs, output_dir)
            #os.system('rm -f %s' %cmds_file)

        ## Merge outputs
        ## (os.system blocks until the shell finishes, so the merged MLF is
        ## complete before HResults reads it)
        os.system('rm -f %s' % output_mlf)
        os.system('cat %s | grep -v "<" - > %s' % (' '.join(outputs), output_mlf))

        ## Evaluate
        cmd = 'HResults -h -n -A -T 1 -c'
        cmd += ' -I %s' % gold_mlf
        cmd += ' %s %s > %s' % (model_list, output_mlf, results_log)
        os.system(cmd)
        with open(results_log) as fh:
            results_text = fh.read()
        print(results_text)

        ## The first log line is treated as the command line (presumably
        ## echoed by HResults -A); it is re-run without -h to parse Acc
        rerun_cmd = results_text.splitlines()[0].replace('-h ', '')
        accuracy = re.findall(r'Acc=([0-9.]*)', os.popen(rerun_cmd).read())[0]
        raw_wer = 100 - float(accuracy)

        ## Clean up (previously placed after the return and never executed)
        os.system('rm -f %s/mfc.list.* %s/align.output.*' % (output_dir, output_dir))
        return raw_wer
예제 #9
0
    def decode(self,
               model,
               mfc_list,
               gold_mlf,
               lm_file,
               gaussians,
               iter,
               mmi=False,
               diag=False,
               xword_id='',
               output_dir=None):
        """
        Decode mfc_list with HVite or HDecode (chosen by self.decode_func),
        score the merged output against gold_mlf with HResults, and return
        100 minus the reported Acc value.

        model: settings object; .exp and .verbose are read here
        mfc_list: list file that is split into chunks of 5 lines
        gold_mlf: reference labels, passed to HResults with -I
        lm_file: passed to the decoder with -w
        gaussians, iter: integers selecting the model directory
        mmi, diag, xword_id: select which model tree the MMF is taken from
            (MMI/Models/HMMI-*, Diag/HMM-* or Xword<xword_id>/HMM-*)
        output_dir: base directory; when omitted '<self.exp>/decode' is
            used. In both cases a further 'decode' sub-directory is
            appended and created with util.create_new_dir.

        Side effect: sets self.decode_config to the config file written
        here. Raises IndexError if the HResults log is empty or no
        'Acc=' figure is found in the re-run output.
        """

        if mmi:
            model_file = '%s/MMI/Models/HMMI-%d-%d/MMF' % (model.exp,
                                                           gaussians, iter)
        elif diag:
            model_file = '%s/Diag/HMM-%d-%d/MMF' % (model.exp, gaussians, iter)
        else:
            model_file = '%s/Xword%s/HMM-%d-%d/MMF' % (model.exp, xword_id,
                                                       gaussians, iter)
        model_list = '%s/tied.list' % model.exp

        if not output_dir:
            output_dir = '%s/decode' % self.exp
        output_dir = '%s/decode' % output_dir
        util.create_new_dir(output_dir)
        results_log = '%s/hresults.log' % output_dir
        output_mlf = '%s/decoded.mlf' % output_dir

        ## Both builders read self.decode_config, which is assigned further
        ## down, before either builder is called
        def hvite(list_file, output):
            cmd = 'HVite -D -A -T 1 -l "*" '
            cmd += '-t %f ' % self.beam
            cmd += '-C %s ' % self.decode_config
            cmd += '-H %s ' % model_file
            cmd += '-S %s ' % list_file
            cmd += '-i %s ' % output
            cmd += '-w %s ' % lm_file
            cmd += '-p %f ' % self.insertion_penalty
            cmd += '-s %f ' % self.lm_scale
            cmd += '%s %s' % (self.dict, model_list)
            return cmd

        ## HDecode parameters
        utts_per_split = 5
        block_size = 1
        word_end_beam = 150.0
        max_model = 0

        def hdecode(list_file, output):
            cmd = 'HDecode -D -A -V -T 9 -o M -C %s' % self.decode_config
            cmd += ' -H %s' % model_file
            cmd += ' -k %d' % block_size
            cmd += ' -t %f 100.0' % self.beam
            cmd += ' -v %f 115.0' % word_end_beam
            cmd += ' -u %d' % max_model
            cmd += ' -s %f' % self.lm_scale
            cmd += ' -p %f' % self.insertion_penalty
            cmd += ' -w %s' % lm_file
            cmd += ' -S %s' % list_file
            cmd += ' -i %s' % output
            cmd += ' %s %s' % (self.dict, model_list)
            if model.verbose > 0:
                cmd += ' >%s/%s.log' % (output_dir,
                                        os.path.basename(list_file))
            return cmd

        ## Split up MFC list with unix split
        cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list,
                                                output_dir, 'mfc.list.')
        os.system(cmd)

        ## Create appropriate config file
        ## (any other decode_func value leaves the config file empty)
        self.decode_config = '%s/%s.config' % (output_dir, self.decode_func)
        with open(self.decode_config, 'w') as fh:
            if self.decode_func == 'hvite':
                fh.write('FORCECXTEXP = T\n')
                fh.write('ALLOWXWRDEXP = T\n')
            elif self.decode_func == 'hdecode':
                #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n')
                fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
                fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
                fh.write('RAWMITFORMAT = T\n')
                fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
                fh.write('STARTWORD = <s>\n')
                fh.write('ENDWORD = </s>\n')

        ## Create the HVite/HDecode commands
        build = hvite if self.decode_func == 'hvite' else hdecode
        cmds = []
        outputs = []
        inputs = os.popen('ls %s/mfc.list.*' % output_dir).read().splitlines()
        for list_file in inputs:
            output = list_file.replace('mfc.list', 'align.output')
            outputs.append(output)
            cmds.append(build(list_file, output))

        if self.local == 1:
            for cmd in cmds:
                print(cmd)
                print(os.popen(cmd).read())
        else:
            cmds_file = '%s/hvite.commands' % output_dir
            with open(cmds_file, 'w') as fh:
                fh.writelines('%s\n' % cmd for cmd in cmds)
            util.run_parallel(cmds_file, self.jobs, output_dir)
            #os.system('rm -f %s' %cmds_file)

        ## Merge outputs
        ## (os.system blocks until the shell finishes, so the merged MLF is
        ## complete before HResults reads it)
        os.system('rm -f %s' % output_mlf)
        os.system('cat %s | grep -v "<" - > %s' %
                  (' '.join(outputs), output_mlf))

        ## Evaluate
        cmd = 'HResults -h -n -A -T 1 -c'
        cmd += ' -I %s' % gold_mlf
        cmd += ' %s %s > %s' % (model_list, output_mlf, results_log)
        os.system(cmd)
        with open(results_log) as fh:
            results_text = fh.read()
        print(results_text)

        ## The first log line is treated as the command line (presumably
        ## echoed by HResults -A); it is re-run without -h to parse Acc
        rerun_cmd = results_text.splitlines()[0].replace('-h ', '')
        accuracy = re.findall(r'Acc=([0-9.]*)',
                              os.popen(rerun_cmd).read())[0]
        raw_wer = 100 - float(accuracy)

        ## Clean up (previously placed after the return and never executed)
        os.system('rm -f %s/mfc.list.* %s/align.output.*' %
                  (output_dir, output_dir))
        return raw_wer
예제 #10
0
    """predicate for whether or not to keep a card"""
    return card['rarity'] != 'Basic Land'


def download(card):
    """Download the image for `card` to imgs/<multiverseid>.jpg."""
    multiverse_id = card['multiverseid']
    image_url = BASE_URL.format(multiverse_id)
    target_path = 'imgs/{}.jpg'.format(multiverse_id)
    download_image(image_url, target_path)


def download_image(url, path):
    """
    Stream the response body of `url` into the file at `path`.

    Only a 200 response is saved; for any other status a message is
    printed and nothing is written.
    """
    response = requests.get(url, stream=True)
    if response.status_code != 200:
        print('failed to download:', url)
        # response.raise_for_status()
        return

    with open(path, 'wb') as out:
        # Iterating the response yields the body in chunks
        for chunk in response:
            out.write(chunk)


if __name__ == '__main__':
    # Ids already present in imgs/ (file names with '.jpg' removed); a set
    # keeps the per-card membership test constant-time
    downloaded = {f.replace('.jpg', '') for f in os.listdir('imgs')}

    # Cards that pass keep() and whose key is not among the downloaded ids
    to_download = [
        card for mid, card in cards.items()
        if keep(card) and mid not in downloaded
    ]
    print('REMAINING:', len(to_download))

    util.run_parallel(to_download, download)
예제 #11
0
def align(model, root_dir, mfc_list, model_dir, word_mlf, new_mlf, model_list,
          dict, align_config):
    """
    Create a new alignment based on a model and the word alignment with HVite

    The MFC list is split into chunks, HVite is run on each chunk, the
    per-chunk outputs are merged into new_mlf with HLEd (applying the
    sil/sp merge script written here), and mfc_list is then REWRITTEN in
    place without the entries that have no ".lab" line in new_mlf.

    model: settings object; .setup_length, .local, .jobs, .verbose and
        .logfh are read here
    root_dir: parent of the Align output directory
    mfc_list: list file to align; a copy is kept as Align/mfc_old.list
    model_dir: directory holding the MMF passed with -H
    word_mlf: label file passed to HVite with -I
    new_mlf: output label file written by HLEd
    model_list, dict: trailing HVite arguments
    align_config: config file passed to HVite with -C

    Returns the output directory <root_dir>/Align.
    """

    output_dir = '%s/Align' % root_dir
    util.create_new_dir(output_dir)
    utts_per_split = max(100, (1 + (model.setup_length / 200)))

    ## Copy old mfc list
    os.system('cp %s %s/mfc_old.list' % (mfc_list, output_dir))

    ## HVite parameters
    prune_thresh = 250

    def hvite(list_file, output):
        #-o SWT
        cmd = 'HVite -D -A -T 1 -b silence -a -m -y lab '
        cmd += '-t %d' % prune_thresh
        cmd += ' -C %s' % align_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -i %s' % output
        cmd += ' -I %s' % word_mlf
        cmd += ' -S %s' % list_file
        cmd += ' %s %s' % (dict, model_list)
        cmd += ' >> %s.hvite.log' % output
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list,
                                            output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HVite commands; each split mfc.list.NNNN writes its
    ## labels to align.output.NNNN
    inputs = os.popen('ls %s/mfc.list.*' % output_dir).read().splitlines()
    outputs = [name.replace('mfc.list', 'align.output') for name in inputs]
    cmds = [hvite(name, out) for name, out in zip(inputs, outputs)]

    if model.local == 1:
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        cmds_file = '%s/hvite.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            fh.writelines('%s\n' % cmd for cmd in cmds)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Merge and fix silences
    ## TODO: -s file_list
    merge_sil = '%s/merge_sp_sil.led' % output_dir
    with open(merge_sil, 'w') as fh:
        fh.write('ME sil sp sil\n')
        fh.write('ME sil sil sil\n')
        fh.write('ME sp sil sil\n')

    cmd = 'HLEd -D -A -T 1 -i %s %s %s >> %s/hled.log' % (
        new_mlf, merge_sil, ' '.join(outputs), output_dir)

    if model.local == 1:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    ## Prune failed alignments from the mfc list
    ## (ids are the file names up to the first '.')
    bad_count = 0
    mlf_labels = os.popen('grep "\\.lab" %s' % new_mlf).read().splitlines()
    aligned_ids = set(os.path.basename(s).split('.')[0] for s in mlf_labels)
    with open(mfc_list) as fh:
        mfc_entries = fh.read().splitlines()
    with open(mfc_list, 'w') as fh:
        for mfc in mfc_entries:
            utt_id = os.path.basename(mfc).split('.')[0]

            ## Check for missing transcriptions
            if utt_id in aligned_ids:
                fh.write(mfc + '\n')
                continue
            if model.verbose > 0:
                util.log_write(model.logfh,
                               'removed bad alignment [%s]' % utt_id)
            bad_count += 1
    util.log_write(model.logfh, 'removed alignments [%d]' % bad_count)

    ## Clean up
    os.system('rm -f %s/mfc.list.* %s/align.output.*' %
              (output_dir, output_dir))
    return output_dir
예제 #12
0
def run_iter(model, root_dir, prev_dir, mlf_file, model_list, mix_size, iter,
             extra):
    """
    Run an iteration of Baum-Welch training using HERest

    The MFC list (<model.exp>/mfc.list) is split into chunks, one HERest
    command is built per chunk (-p 1, 2, ...), and a final HERest call with
    -p 0 is run on the list of HER*.acc files gathered in herest.list.

    model: settings object; .exp, .setup_length, .mfc_config, .local and
        .jobs are read here
    root_dir: parent directory of the per-iteration output directory
    prev_dir: directory holding the input MMF (passed with -H)
    mlf_file: label file passed to HERest with -I
    model_list: trailing HERest argument
    mix_size, iter: integers used only to name the output directory
    extra: string appended directly after the -C config path

    Returns a tuple (output_dir, num_models, likelihood): num_models is the
    number of lines containing "<MEAN>" in the new MMF, and likelihood is
    the last whitespace-separated field of the herest.log output matching
    "aver".
    """

    output_dir = '%s/HMM-%d-%d' % (root_dir, mix_size, iter)
    util.create_new_dir(output_dir)

    mfc_list = '%s/mfc.list' % model.exp
    utts_per_split = max(250, (1 + (model.setup_length / 200)))

    ## HERest parameters
    min_train_examples = 0
    prune_thresh = 250
    prune_inc = 150
    prune_limit = 2000

    def herest(list_file, split_num, extra):
        ## The log is named after the third dotted field of the list file
        ## name (e.g. the NNNN of mfc.list.NNNN); names with fewer fields
        ## log under 'acc'
        try:
            log_id = os.path.basename(list_file).split('.')[2]
        except IndexError:
            log_id = 'acc'
        cmd = '%s -D -A -T 1 -m %d' % (HEREST_CMD, min_train_examples)
        cmd += ' -t %d %d %d' % (prune_thresh, prune_inc, prune_limit)
        cmd += ' -s %s/stats' % output_dir
        cmd += ' -C %s%s' % (model.mfc_config, extra)
        cmd += ' -I %s' % mlf_file
        cmd += ' -H %s/MMF' % prev_dir
        cmd += ' -p %d' % split_num
        cmd += ' -S %s' % list_file
        #cmd += ' -M %s %s' %(output_dir, model_list)
        cmd += ' -M %s %s >> %s/herest.%s.log' % (output_dir, model_list,
                                                  output_dir, log_id)
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list,
                                            output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HERest commands; splits are numbered from 1
    inputs = os.popen('ls %s/mfc.list.*' % output_dir).read().splitlines()
    cmds = [herest(list_file, split_num, extra)
            for split_num, list_file in enumerate(inputs, 1)]

    ## Non-parallel case
    if model.local == 1:
        for cmd in cmds:
            print(cmd)
            ## Read the pipe so the command has finished before the next
            ## one starts (its stdout is redirected to the per-split log)
            print(os.popen(cmd).read())

    ## Parallel case: one command per line in cmds_file
    else:
        cmds_file = '%s/herest.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            fh.writelines('%s\n' % cmd for cmd in cmds)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Gather the created .acc files
    acc_file = '%s/herest.list' % output_dir
    os.system('ls %s/HER*.acc > %s' % (output_dir, acc_file))

    ## Combine acc files into a new HMM
    ## (the per-split log redirect is replaced by herest.log)
    cmd = herest(acc_file, 0, extra)
    cmd = cmd.split('>>')[0]
    cmd += ' >> %s/herest.log' % output_dir
    if model.local == 1:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    ## Clean up
    os.system('rm -f %s/mfc.list.* %s/HER*.acc' % (output_dir, output_dir))
    os.system('bzip2 %s/herest.*.log %s/run-command*.log' %
              (output_dir, output_dir))

    ## Get a few stats
    mean_count = os.popen('grep "<MEAN>" %s/MMF -c' % output_dir).read()
    num_models = int(mean_count.strip())
    aver_lines = os.popen('cat %s/herest.log | grep aver' %
                          output_dir).read()
    likelihood = float(aver_lines.strip().split()[-1])

    return output_dir, num_models, likelihood
예제 #13
0
파일: mmi.py 프로젝트: Debanjan1234/pyhtk
def decode_to_lattices(model, output_dir, model_dir, mfc_list, lm, dict, model_list, gold_mlf):
    """
    Decode the utterances in mfc_list to lattices with HDecode, then score
    the recognition output with HResults.

    Side effects (all under output_dir unless noted):
      - hdecode.config is written and passed to every HDecode call
      - mfc_list is split with SplitList(by_path=True); each split gets its
        own output directory and, when model.verbose > 0, its own log file
      - mfc_list itself is rewritten so that it only keeps entries whose id
        matches a produced .lat file; the previous list is kept as
        mfc_old.list
      - every .rec file found is merged into train_recog.mlf, dropping lines
        that contain <s> or </s>
      - the HResults report is written to results.log and printed

    Arguments:
      model      -- reads .verbose, .local, .jobs and .logfh
      output_dir -- working directory; it is not created here
      model_dir  -- directory holding the MMF passed to HDecode -H
      mfc_list   -- list of feature files; overwritten with the pruned list
      lm         -- language model passed to HDecode -w
      dict       -- dictionary file (the name shadows the builtin but is part
                    of the signature, so it is kept)
      model_list -- HMM list file
      gold_mlf   -- reference MLF passed to HResults -I

    Returns None.
    """

    sys.stderr.write('Decoding to lattices\n')
    output_mlf = '%s/train_recog.mlf' %output_dir
    results_log = '%s/results.log' %output_dir

    ## Create a config file to use with HDecode
    hdecode_config = '%s/hdecode.config' %output_dir
    with open(hdecode_config, 'w') as fh:
        fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
        fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
        fh.write('RAWMITFORMAT = T\n')
        fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
        fh.write('GCFREQ = 50\n')
        fh.write('HLAT:TRACE = 19\n')
        fh.write('HLVNET:TRACE = 1\n')
        fh.write('HLVREC:TRACE = 1\n')
        fh.write('HLVLM:TRACE = 1\n')
        fh.write('LATPRUNEBEAM = 500.0\n')
        fh.write('MAXLMLA = 3.0\n')
        fh.write('BUILDLATSENTEND = T\n')
        fh.write('FORCELATOUT = F\n')
        fh.write('STARTWORD = <s>\n')
        fh.write('ENDWORD = </s>\n')

    ## HDecode parameters
    block_size = 5
    beam = 150.0
    word_end_beam = 125.0
    max_model = 10000
    lm_scale = 15.0
    word_insertion_penalty = 0.0

    def hdecode(split_file, lat_dir):
        ## Build the HDecode command line for one split of the mfc list
        cmd  = 'HDecode -A -D -V -T 9 -o M -z lat -C %s' %hdecode_config
        cmd += ' -H %s/MMF' %model_dir
        cmd += ' -k %d' %block_size
        cmd += ' -t %f 100.0' %beam
        cmd += ' -v %f 115.0' %word_end_beam
        cmd += ' -u %d' %max_model
        cmd += ' -s %f' %lm_scale
        cmd += ' -p %f' %word_insertion_penalty
        cmd += ' -w %s' %lm
        cmd += ' -S %s' %split_file
        cmd += ' -l %s/' %lat_dir
        cmd += ' %s %s' %(dict, model_list)
        if model.verbose > 0: cmd += ' >%s/%s.log' %(output_dir, os.path.basename(split_file))
        return cmd

    ## Split up MFC list
    split_mfc = SplitList(output_dir, mfc_list, by_path=True)

    ## Create the HDecode commands, one output directory per split
    cmds = []
    for split_file in split_mfc.get_files():
        lat_dir = '%s/%s' %(output_dir, split_mfc.get_key(split_file))
        if not os.path.isdir(lat_dir): os.makedirs(lat_dir)
        cmds.append(hdecode(split_file, lat_dir))

    if model.local == 1:
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        cmds_file = '%s/hdecode.commands' %output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds: fh.write('%s\n' %cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Copy old mfc list
    old_mfc_list = '%s/mfc_old.list' %output_dir
    os.system('cp %s %s' %(mfc_list, old_mfc_list))

    ## Prune bad lats from the mfc list. The ids go into a set so that each
    ## membership test is constant time instead of a scan over every lattice.
    ## NOTE(review): util.get_files is assumed to return the paths under
    ## output_dir that match the regex -- confirm against util.
    lat_ids = set(os.path.basename(f).split('.')[0]
                  for f in util.get_files(output_dir, r'.*\.lat'))
    bad_count = 0
    with open(old_mfc_list) as old_fh:
        with open(mfc_list, 'w') as fh:
            for mfc in old_fh:
                utt_id = os.path.basename(mfc.strip()).split('.')[0]
                if utt_id in lat_ids:
                    fh.write(mfc)
                    continue

                ## No lattice was produced for this utterance
                if model.verbose > 1: util.log_write(model.logfh, 'removed bad lat [%s]' %utt_id)
                bad_count += 1
    util.log_write(model.logfh, 'removed bad lats [%d]' %bad_count)

    ## Create an MLF from the recognition output. Any previous MLF is removed
    ## first (the original shelled out to 'rm -f' for this).
    outputs = util.get_files(output_dir, r'.*\.rec')
    if os.path.lexists(output_mlf): os.remove(output_mlf)
    with open(output_mlf, 'w') as fh:
        fh.write('#!MLF!#\n')
        for rec_file in outputs:
            fh.write('"%s"\n' %rec_file)
            with open(rec_file) as rec_fh:
                for line in rec_fh:
                    ## Sentence boundary markers are not copied to the MLF
                    if '<s>' in line or '</s>' in line: continue
                    fh.write(line)
            fh.write('.\n')

    ## Evaluate
    cmd  = 'HResults -h -n -A -T 1'
    cmd += ' -I %s' %gold_mlf
    cmd += ' %s %s > %s' %(model_list, output_mlf, results_log)
    os.system(cmd)
    ## The shell redirection above creates results.log even if HResults fails
    with open(results_log) as fh:
        print(fh.read())
예제 #14
0
파일: mmi.py 프로젝트: Debanjan1234/pyhtk
def run_iter(model, model_dir, num_lattice_dir, den_lattice_dir, root_dir, model_list, mfc_list, mix_size, iter):
    """
    Run an iteration of modified Baum-Welch training using HMMIRest

    A fresh directory root_dir/HMMI-<mix_size>-<iter> is created and an
    HMMIRest config is written into it. mfc_list is split into chunks with
    unix split, and HMMIRest is run once per chunk with -p 1, -p 2, ...
    (serially when model.local == 1, otherwise via util.run_parallel).
    Finally the HDR*.acc* files found in the directory are listed in
    hmmirest.list and HMMIRest is run once more with -p 0 and -u mv on that
    list, logging to hmmirest.log.

    Arguments:
      model           -- reads .setup_length, .local and .jobs
      model_dir       -- directory holding the input MMF
      num_lattice_dir -- passed to HMMIRest -q
      den_lattice_dir -- passed to HMMIRest -r
      root_dir        -- parent of the per-iteration output directory
      model_list      -- HMM list file
      mfc_list        -- list of feature files to train on
      mix_size, iter  -- integers used to name the output directory (iter
                         shadows the builtin but is part of the signature)

    Returns the output directory path.
    """

    output_dir = '%s/HMMI-%d-%d' %(root_dir, mix_size, iter)
    util.create_new_dir(output_dir)
    ## Floor division keeps the Python 2 integer result explicit
    utts_per_split = max(250, (1 + (model.setup_length // 200)))

    ## Create a config file to use with HMMIRest
    hmmirest_config = '%s/hmmirest.config' %output_dir
    with open(hmmirest_config, 'w') as fh:
        #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n')
        fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
        fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
        fh.write('RAWMITFORMAT = T\n')
        fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
        #fh.write('HMMDEFOFILTER = "gzip -c > $.gz"\n')
        #fh.write('HMMDEFFILTER = "gunzip -c < $.gz"\n')
        ## These strings are written verbatim (no % formatting is applied)
        fh.write('HMMIREST: LATMASKNUM = */%%%?????.???\n')
        fh.write('HMMIREST: LATMASKDEN = */%%%?????.???\n')
        #fh.write('HMMIREST: LATMASKNUM =  */%%%%%%%%/???????????????????.???\n')
        #fh.write('HMMIREST: LATMASKDEN =  */%%%%%%%%/???????????????????.???\n')
        fh.write('HFBLAT: LATPROBSCALE = 0.06667\n')
        fh.write('HMMIREST: E = 2.0\n')
        fh.write('ISMOOTHTAU = 50\n')
        fh.write('MPE = TRUE\n')
        #fh.write('MWE = TRUE\n')

    def hmmirest(list_file, split_num):
        ## split_num 0 is the final combine call, which additionally gets
        ## '-u mv'; every other value is a per-chunk run
        cmd  = 'HMMIRest -A -D -T 1 -C %s' %hmmirest_config
        cmd += ' -H %s/MMF' %model_dir
        cmd += ' -q %s' %num_lattice_dir
        cmd += ' -r %s' %den_lattice_dir
        if split_num == 0:
            cmd += ' -u mv'
        cmd += ' -p %d' %split_num
        cmd += ' -S %s' %list_file
        cmd += ' -M %s %s' %(output_dir, model_list)
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' %(utts_per_split, mfc_list, output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the per-chunk HMMIRest commands, numbered from 1
    inputs = os.popen('ls %s/mfc.list.*' %output_dir).read().splitlines()
    cmds = [hmmirest(list_file, split_num)
            for split_num, list_file in enumerate(inputs, 1)]

    ## Non-parallel case
    if model.local == 1:
        for cmd in cmds:
            print(cmd)
            ## Read the pipe to the end. Printing the bare popen object only
            ## shows its repr and drops the pipe without consuming
            ## HMMIRest's output, which these commands do not redirect.
            print(os.popen(cmd).read())

    ## Parallel case: one command per line in cmds_file
    else:
        cmds_file = '%s/hmmirest.commands' %output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds: fh.write('%s\n' %cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Gather the created .acc files
    acc_file = '%s/hmmirest.list' %output_dir
    os.system('ls %s/HDR*.acc* > %s' %(output_dir, acc_file))

    ## Combine acc files into a new HMM
    cmd = hmmirest(acc_file, 0)
    cmd += ' >> %s/hmmirest.log' %output_dir
    if model.local == 1: os.system(cmd)
    else: util.run(cmd, output_dir)

    ## Clean up is currently disabled: the split lists and accumulator files
    ## are left in output_dir.
    #os.system('rm -f %s/mfc.list.* %s/HDR*.acc*' %(output_dir, output_dir))

    return output_dir
예제 #15
0
def decode_to_lattices(model, output_dir, model_dir, mfc_list, lm, dict,
                       model_list, gold_mlf):
    """
    Decode the utterances in mfc_list to lattices with HDecode, then score
    the recognition output with HResults.

    What is written (under output_dir unless noted):
      - hdecode.config, the config handed to every HDecode call
      - one sub-directory per SplitList key, plus a per-split log when
        model.verbose > 0
      - mfc_old.list, a copy of the incoming mfc_list; mfc_list itself is
        then rewritten with only the entries whose id has a .lat file
      - train_recog.mlf, built from every .rec file found, without the
        lines containing <s> or </s>
      - results.log, the HResults report, which is also printed

    Arguments:
      model      -- reads .verbose, .local, .jobs and .logfh
      output_dir -- working directory; it is not created here
      model_dir  -- directory holding the MMF passed to HDecode -H
      mfc_list   -- list of feature files; overwritten with the pruned list
      lm         -- language model passed to HDecode -w
      dict       -- dictionary file (shadows the builtin; kept because it is
                    part of the signature)
      model_list -- HMM list file
      gold_mlf   -- reference MLF passed to HResults -I

    Returns None.
    """

    sys.stderr.write('Decoding to lattices\n')
    output_mlf = '%s/train_recog.mlf' % output_dir
    results_log = '%s/results.log' % output_dir

    ## Create a config file to use with HDecode
    hdecode_config = '%s/hdecode.config' % output_dir
    config_lines = [
        'HNETFILTER = "gunzip -c < $.gz"',
        'HNETOFILTER = "gzip -c > $.gz"',
        'RAWMITFORMAT = T',
        'HPARM: TARGETKIND = MFCC_0_D_A_Z',
        'GCFREQ = 50',
        'HLAT:TRACE = 19',
        'HLVNET:TRACE = 1',
        'HLVREC:TRACE = 1',
        'HLVLM:TRACE = 1',
        'LATPRUNEBEAM = 500.0',
        'MAXLMLA = 3.0',
        'BUILDLATSENTEND = T',
        'FORCELATOUT = F',
        'STARTWORD = <s>',
        'ENDWORD = </s>',
    ]
    with open(hdecode_config, 'w') as fh:
        for config_line in config_lines:
            fh.write(config_line + '\n')

    ## HDecode parameters
    block_size = 5
    beam = 150.0
    word_end_beam = 125.0
    max_model = 10000
    lm_scale = 15.0
    word_insertion_penalty = 0.0

    def hdecode(split_file, lat_dir):
        ## Build the HDecode command line for one split of the mfc list
        cmd = 'HDecode -A -D -V -T 9 -o M -z lat -C %s' % hdecode_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -k %d' % block_size
        cmd += ' -t %f 100.0' % beam
        cmd += ' -v %f 115.0' % word_end_beam
        cmd += ' -u %d' % max_model
        cmd += ' -s %f' % lm_scale
        cmd += ' -p %f' % word_insertion_penalty
        cmd += ' -w %s' % lm
        cmd += ' -S %s' % split_file
        cmd += ' -l %s/' % lat_dir
        cmd += ' %s %s' % (dict, model_list)
        if model.verbose > 0:
            cmd += ' >%s/%s.log' % (output_dir, os.path.basename(split_file))
        return cmd

    ## Split up MFC list
    split_mfc = SplitList(output_dir, mfc_list, by_path=True)

    ## Create the HDecode commands, one output directory per split
    cmds = []
    for split_file in split_mfc.get_files():
        lat_dir = '%s/%s' % (output_dir, split_mfc.get_key(split_file))
        if not os.path.isdir(lat_dir):
            os.makedirs(lat_dir)
        cmds.append(hdecode(split_file, lat_dir))

    if model.local == 1:
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        cmds_file = '%s/hdecode.commands' % output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds:
                fh.write('%s\n' % cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Copy old mfc list
    old_mfc_list = '%s/mfc_old.list' % output_dir
    os.system('cp %s %s' % (mfc_list, old_mfc_list))

    ## Prune bad lats from the mfc list. A set makes each lookup constant
    ## time; the list used before was scanned once per utterance.
    ## NOTE(review): util.get_files is assumed to return the paths under
    ## output_dir that match the regex -- confirm against util.
    lat_ids = set(
        os.path.basename(f).split('.')[0]
        for f in util.get_files(output_dir, r'.*\.lat'))
    bad_count = 0
    with open(old_mfc_list) as old_fh:
        with open(mfc_list, 'w') as fh:
            for mfc in old_fh:
                utt_id = os.path.basename(mfc.strip()).split('.')[0]

                ## Keep the entry only when a lattice exists for it
                if utt_id in lat_ids:
                    fh.write(mfc)
                else:
                    if model.verbose > 1:
                        util.log_write(model.logfh,
                                       'removed bad lat [%s]' % utt_id)
                    bad_count += 1
    util.log_write(model.logfh, 'removed bad lats [%d]' % bad_count)

    ## Create an MLF from the recognition output. Any previous MLF is removed
    ## first (the original shelled out to 'rm -f' for this).
    outputs = util.get_files(output_dir, r'.*\.rec')
    if os.path.lexists(output_mlf):
        os.remove(output_mlf)
    with open(output_mlf, 'w') as fh:
        fh.write('#!MLF!#\n')
        for rec_file in outputs:
            fh.write('"%s"\n' % rec_file)
            with open(rec_file) as rec_fh:
                for line in rec_fh:
                    ## Sentence boundary markers are not copied to the MLF
                    if '<s>' in line or '</s>' in line:
                        continue
                    fh.write(line)
            fh.write('.\n')

    ## Evaluate
    cmd = 'HResults -h -n -A -T 1'
    cmd += ' -I %s' % gold_mlf
    cmd += ' %s %s > %s' % (model_list, output_mlf, results_log)
    os.system(cmd)
    ## The shell redirection above creates results.log even if HResults fails
    with open(results_log) as fh:
        print(fh.read())
예제 #16
0
def run_iter(model, root_dir, prev_dir, mlf_file, model_list, mix_size, iter, extra):
    """
    Run an iteration of Baum-Welch training using HERest

    A fresh directory root_dir/HMM-<mix_size>-<iter> is created. The list
    model.exp/mfc.list is split into chunks with unix split and HERest is run
    once per chunk with -p 1, -p 2, ... (serially when model.local == 1,
    otherwise via util.run_parallel), each chunk logging to
    herest.<chunk suffix>.log. The HER*.acc files found afterwards are listed
    in herest.list and HERest is run once more with -p 0 on that list,
    logging to herest.log. Split lists and .acc files are then deleted and
    the per-chunk and run-command logs are compressed with bzip2.

    Arguments:
      model      -- reads .exp, .setup_length, .mfc_config, .local and .jobs
      root_dir   -- parent of the per-iteration output directory
      prev_dir   -- directory holding the input MMF
      mlf_file   -- label file passed to HERest -I
      model_list -- HMM list file
      mix_size, iter -- integers used to name the output directory (iter
                        shadows the builtin but is part of the signature)
      extra      -- string appended directly after model.mfc_config in the
                    -C argument

    Returns (output_dir, num_models, likelihood): num_models is the number
    of lines containing <MEAN> in the new MMF, likelihood is the last
    whitespace-separated token of the 'aver' line(s) in herest.log.
    """

    output_dir = '%s/HMM-%d-%d' %(root_dir, mix_size, iter)
    util.create_new_dir(output_dir)

    mfc_list = '%s/mfc.list' %model.exp
    ## Floor division keeps the Python 2 integer result explicit
    utts_per_split = max(250, (1 + (model.setup_length // 200)))

    ## HERest parameters
    min_train_examples = 0
    prune_thresh = 250
    prune_inc = 150
    prune_limit = 2000

    def herest(list_file, split_num, log_name):
        ## Build one HERest command whose stdout is appended to
        ## output_dir/log_name
        cmd  = '%s -D -A -T 1 -m %d' %(HEREST_CMD, min_train_examples)
        cmd += ' -t %d %d %d' %(prune_thresh, prune_inc, prune_limit)
        cmd += ' -s %s/stats' %output_dir
        cmd += ' -C %s%s' %(model.mfc_config, extra)
        cmd += ' -I %s' %mlf_file
        cmd += ' -H %s/MMF' %prev_dir
        cmd += ' -p %d' %split_num
        cmd += ' -S %s' %list_file
        cmd += ' -M %s %s' %(output_dir, model_list)
        cmd += ' >> %s/%s' %(output_dir, log_name)
        return cmd

    def chunk_log_name(list_file):
        ## 'mfc.list.0007' -> 'herest.0007.log'. A name with fewer than three
        ## dot-separated parts falls back to 'herest.acc.log'; only that
        ## case is caught, so unrelated errors are no longer hidden.
        try: log_id = os.path.basename(list_file).split('.')[2]
        except IndexError: log_id = 'acc'
        return 'herest.%s.log' %log_id

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' %(utts_per_split, mfc_list, output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the per-chunk HERest commands, numbered from 1
    inputs = os.popen('ls %s/mfc.list.*' %output_dir).read().splitlines()
    cmds = [herest(list_file, split_num, chunk_log_name(list_file))
            for split_num, list_file in enumerate(inputs, 1)]

    ## Non-parallel case
    if model.local == 1:
        for cmd in cmds:
            print(cmd)
            ## Read the pipe instead of printing the popen object's repr.
            ## stdout is redirected to the chunk log, so this is normally
            ## empty.
            print(os.popen(cmd).read())

    ## Parallel case: one command per line in cmds_file
    else:
        cmds_file = '%s/herest.commands' %output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds: fh.write('%s\n' %cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Gather the created .acc files
    acc_file = '%s/herest.list' %output_dir
    os.system('ls %s/HER*.acc > %s' %(output_dir, acc_file))

    ## Combine acc files into a new HMM. The log name is passed in directly
    ## rather than cutting a per-chunk command at its '>>'.
    cmd = herest(acc_file, 0, 'herest.log')
    if model.local == 1: os.system(cmd)
    else: util.run(cmd, output_dir)

    ## Clean up. The bzip2 glob herest.*.log does not match herest.log, which
    ## is still needed below.
    os.system('rm -f %s/mfc.list.* %s/HER*.acc' %(output_dir, output_dir))
    os.system('bzip2 %s/herest.*.log %s/run-command*.log' %(output_dir, output_dir))

    ## Get a few stats
    num_models = int(os.popen('grep "<MEAN>" %s/MMF -c' %output_dir).read().strip())
    ## NOTE(review): raises IndexError when herest.log has no 'aver' line,
    ## e.g. after a failed combine step.
    likelihood = float(os.popen('cat %s/herest.log | grep aver' %output_dir).read().strip().split()[-1])

    return output_dir, num_models, likelihood
예제 #17
0
def wav_to_mfc(model, output_dir, mfc_list):
    """
    Use HCopy to code each wav in the setup file. HCopy takes a config
    file (-C) and an input (-S). The input is a file where each line
    looks like:
    <wav file> <mfc file>

    model.setup is read line by line (through gzip when its name ends in
    'gz'). The first two whitespace-separated fields of each line are the
    wav path and the HCopy config to use for it; blank lines are skipped.
    Entries are written to output_dir/hcopy.list.<n>, and a new list is
    started whenever the config changes or the entry count reaches a
    multiple of 500, so every list is coded with a single config. The HCopy
    commands run serially when model.local == 1 and through
    util.run_parallel otherwise.

    Arguments:
      model      -- reads .setup, .data, .mfc_config, .local and .jobs
      output_dir -- directory for the temporary hcopy.list.* files
      mfc_list   -- path of the file that receives one mfc path per entry

    Returns the number of setup entries processed. An empty setup file
    yields 0, runs no command and writes an empty mfc_list.
    """

    def hcopy(config, list_file):
        return 'HCopy -A -T 1 -C %s -C %s -S %s' %(model.mfc_config, config, list_file)

    ## Create list files for HCopy <wav file> <mfc file>
    lines_per_split = 500
    count = 0
    prev_config = ''
    cmds = []
    mfcs = []
    list_file = '%s/hcopy.list.0' %output_dir

    if model.setup.endswith('gz'): setup_fh = gzip.open(model.setup)
    else: setup_fh = open(model.setup)

    ## try/finally rather than 'with': fh is re-bound every time a new list
    ## is started, and both handles must be closed on any exit path
    try:
        fh = open(list_file, 'w')
        try:
            for line in setup_fh:
                fields = line.strip().split()
                if not fields: continue
                count += 1
                wav, config = fields[0:2]
                ## A missing wav is reported but still listed, as before
                if not os.path.isfile(wav): sys.stderr.write('missing [%s]\n' %wav)
                mfc = get_mfc_name_from_wav(wav, model.data)
                mfcs.append(mfc)

                ## Close the current list before this entry is written
                if count > 1 and (count % lines_per_split == 0 or config != prev_config):
                    cmds.append(hcopy(prev_config, list_file))
                    fh.close()
                    list_file = '%s/hcopy.list.%d' %(output_dir, len(cmds))
                    fh = open(list_file, 'w')

                fh.write('%s %s\n' %(wav, mfc))
                prev_config = config
        finally:
            fh.close()
    finally:
        setup_fh.close()

    ## Command for the last list. With no entries there is nothing to code;
    ## the original hit an unbound 'config' here.
    if count > 0: cmds.append(hcopy(prev_config, list_file))

    ## Non-parallel case
    if model.local == 1:
        for cmd in cmds: os.system(cmd)

    ## Parallel case: one command per line in cmds_file
    elif cmds:
        cmds_file = '%s/hcopy.commands' %output_dir
        with open(cmds_file, 'w') as fh:
            for cmd in cmds: fh.write('%s\n' %cmd)
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Create a file listing all created MFCs
    with open(mfc_list, 'w') as fh:
        for mfc in mfcs:
            fh.write('%s\n' %mfc)

    ## Clean up
    os.system('rm -f %s/hcopy.list.*' %output_dir)
    return count
예제 #18
0
def align(model, root_dir, mfc_list, model_dir, word_mlf, new_mlf, model_list, dict, align_config):
    """
    Realign the data listed in mfc_list with HVite and write the merged
    result to new_mlf.

    Work happens in a fresh root_dir/Align directory: the mfc list is copied
    there as mfc_old.list, split into chunks, and HVite is run once per chunk
    (serially when model.local == 1, otherwise via util.run_parallel). HLEd
    then merges the per-chunk outputs into new_mlf using a small script of
    ME commands for sil/sp. Finally mfc_list is rewritten in place, keeping
    only entries whose id appears among the ".lab" lines of new_mlf.

    Returns the Align directory path.
    """

    output_dir = '%s/Align' %root_dir
    util.create_new_dir(output_dir)
    utts_per_split = max(100, (1 + (model.setup_length / 200)))

    ## Keep a copy of the incoming mfc list
    os.system('cp %s %s/mfc_old.list' %(mfc_list, output_dir))

    ## HVite parameters
    prune_thresh = 250

    def hvite(chunk_list, chunk_mlf):
        ## Assemble the HVite call for one chunk; stdout is appended to
        ## <chunk_mlf>.hvite.log
        pieces = [
            'HVite -D -A -T 1 -b silence -a -m -y lab ',
            '-t %d' %prune_thresh,
            ' -C %s' %align_config,
            ' -H %s/MMF' %model_dir,
            ' -i %s' %chunk_mlf,
            ' -I %s' %word_mlf,
            ' -S %s' %chunk_list,
            ' %s %s' %(dict, model_list),
            ' >> %s.hvite.log' %chunk_mlf,
        ]
        return ''.join(pieces)

    ## Split up MFC list with unix split
    os.system('split -a 4 -d -l %d %s %s/%s' %(utts_per_split, mfc_list, output_dir, 'mfc.list.'))

    ## One HVite command and one output name per chunk
    inputs = os.popen('ls %s/mfc.list.*' %output_dir).read().splitlines()
    outputs = [chunk.replace('mfc.list', 'align.output') for chunk in inputs]
    cmds = [hvite(chunk, target) for chunk, target in zip(inputs, outputs)]

    if model.local != 1:
        cmds_file = '%s/hvite.commands' %output_dir
        fh = open(cmds_file, 'w')
        fh.writelines(['%s\n' %cmd for cmd in cmds])
        fh.close()
        util.run_parallel(cmds_file, model.jobs, output_dir)
    else:
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())

    ## Merge and fix silences
    ## TODO: -s file_list
    merge_sil = '%s/merge_sp_sil.led' %output_dir
    fh = open(merge_sil, 'w')
    for rule in ('ME sil sp sil\n', 'ME sil sil sil\n', 'ME sp sil sil\n'):
        fh.write(rule)
    fh.close()

    all_outputs = ' '.join(outputs)
    cmd = 'HLEd -D -A -T 1 -i %s %s %s >> %s/hled.log' %(new_mlf, merge_sil, all_outputs, output_dir)

    if model.local == 1:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    ## Prune failed alignments from the mfc list
    lab_lines = os.popen('grep "\.lab" %s' %new_mlf).read().splitlines()
    aligned_ids = set()
    for lab_line in lab_lines:
        aligned_ids.add(os.path.basename(lab_line).split('.')[0])

    entries = open(mfc_list).read().splitlines()
    bad_count = 0
    fh = open(mfc_list, 'w')
    for entry in entries:
        utt_id = os.path.basename(entry).split('.')[0]
        if utt_id in aligned_ids:
            fh.write(entry + '\n')
            continue

        ## No label for this utterance in the merged MLF
        if model.verbose > 0: util.log_write(model.logfh, 'removed bad alignment [%s]' %utt_id)
        bad_count += 1
    fh.close()
    util.log_write(model.logfh, 'removed alignments [%d]' %bad_count)

    ## Clean up
    os.system('rm -f %s/mfc.list.* %s/align.output.*' %(output_dir, output_dir))
    return output_dir
예제 #19
0
  
  # NOTE(review): the enclosing function header is not part of this excerpt;
  # path, movie_type, diag_post, debug, plot and get_j_vals are defined
  # outside the lines shown here.
  # LOAD FILES
  files = io.get_dumps_list(path)
  # Abort with exit status 1 when no dump files were found
  if len(files) == 0:
      util.warn("INVALID PATH TO DUMP FOLDER")
      sys.exit(1)

  # Directory named after the movie type; presumably created by
  # util.make_dir to hold the frames -- TODO confirm
  frame_dir = "frames_"+movie_type
  util.make_dir(frame_dir)

  # The header is loaded from the first dump only; the geometry is loaded
  # using that header
  hdr = io.load_hdr(files[0])
  geom = io.load_geom(hdr, path)

  jmin, jmax = get_j_vals(geom)
  #print("jmin: {} jmax: {}".format(jmin, jmax))

  if diag_post:
    # Load fluxes from post-analysis: more flexible
    # NOTE(review): unpickles "eht_out.p" from the current working
    # directory; pickle.load must only be used on trusted files
    diag = pickle.load(open("eht_out.p", 'rb'))
  else:
    # Load diagnostics from HARM itself
    diag = io.load_log(path)

  # nthreads is computed in both modes but only used by the parallel branch
  nthreads = util.calc_nthreads(hdr, pad=0.3)
  if debug:
    # Run sequentially to make backtraces work
    for i in range(len(files)):
      plot(i)
  else:
    # plot is called with a dump index in the debug branch above;
    # presumably run_parallel calls it for indices 0..len(files)-1
    # -- TODO confirm
    util.run_parallel(plot, len(files), nthreads)
예제 #20
0
파일: mmi.py 프로젝트: Debanjan1234/pyhtk
def phonemark_lattices(model, lattice_dir, output_dir, model_dir, mfc_list, lm, dict, model_list):

    sys.stderr.write('Phonemarking lattices\n')

    ## Create a config file to use with HDecode
    hdecode_config = '%s/hdecode.config' %output_dir
    fh = open(hdecode_config, 'w')
    #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n')
    fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
    fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
    fh.write('RAWMITFORMAT = T\n')
    fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
    fh.write('GCFREQ = 50\n')
    fh.write('HLAT:TRACE = 19\n')
    fh.write('HLVNET:TRACE = 1\n')
    fh.write('HLVREC:TRACE = 1\n')
    fh.write('HLVLM:TRACE = 1\n')
    fh.write('LATPRUNEBEAM = 500.0\n')
    fh.write('MAXLMLA = 3.0\n')
    fh.write('BUILDLATSENTEND = T\n')
    fh.write('FORCELATOUT = F\n')
    fh.write('STARTWORD = <s>\n')
    fh.write('ENDWORD = </s>\n')
    fh.close()
    
    ## HDecode parameters
    utts_per_split = 100
    block_size = 5
    beam = 200.0
    lm_scale = 15.0
    word_insertion_penalty = 0.0

    def hdecode_mod(input, path):
        input_dir = '%s/%s/' %(lattice_dir, path)
        if not os.path.isdir(input_dir):
            input_dir = '%s/%s/' %(lattice_dir, path.replace('_', ''))
        cmd  = 'HDecode.mod -A -D -V -T 9 -q tvaldm -z lat -X lat -C %s' %hdecode_config
        cmd += ' -H %s/MMF' %model_dir
        cmd += ' -k %d' %block_size
        cmd += ' -t %f' %beam
        cmd += ' -s %f' %lm_scale
        cmd += ' -p %f' %word_insertion_penalty
        cmd += ' -w' # %s' %lm
        cmd += ' -S %s' %input
        cmd += ' -l %s/%s/' %(output_dir, path)
        cmd += ' -L %s' %input_dir
        cmd += ' %s %s' %(dict, model_list)
        if model.verbose > 0: cmd += ' >%s/%s.log' %(output_dir, os.path.basename(input))
        return cmd

    ## Split up MFC list with unix split
    split_mfc = SplitList(output_dir, mfc_list, by_path=True)

    ## Create the HDecode commands
    cmds = []
    inputs = split_mfc.get_files()
    for input in inputs:
        key = split_mfc.get_key(input)
        new_output = '%s/%s' %(output_dir, key)
        if not os.path.isdir(new_output): os.makedirs(new_output)
        
        cmds.append(hdecode_mod(input, key))

    if model.local == 1:
        for cmd in cmds:
            print cmd
            print os.popen(cmd).read()
    else:
        cmds_file = '%s/hdecode_mod.commands' %output_dir
        fh = open(cmds_file, 'w')
        for cmd in cmds: fh.write('%s\n' %cmd)
        fh.close()
        util.run_parallel(cmds_file, model.jobs, output_dir)
        
    ## Copy old mfc list
    old_mfc_list = '%s/mfc_old.list' %output_dir
    os.system('cp %s %s' %(mfc_list, old_mfc_list))
        
    ## Prune bad lats from the mfc list
    lat_ids = [os.path.basename(f).split('.')[0] for f in util.get_files(output_dir, r'.*\.lat')]
    bad_count = 0
    fh = open(mfc_list, 'w')
    for mfc in open(old_mfc_list):
        id = os.path.basename(mfc.strip()).split('.')[0]

        ## Check for missing transcriptions
        if id not in lat_ids:
            if model.verbose > 1: util.log_write(model.logfh, 'removed bad lat [%s]' %id)
            bad_count += 1
        else: fh.write(mfc)
    fh.close()
    util.log_write(model.logfh, 'removed bad lats [%d]' %bad_count)