def add_lm_lattices(model, lattice_dir, output_dir, dict, lm):
    """Attach LM scores to the numerator lattices with HLRescore.

    Writes an HLRescore config into output_dir, lists the lattices under
    lattice_dir, splits that list by path, and runs one HLRescore command
    per split — serially when model.local == 1, otherwise in parallel.
    """
    sys.stderr.write('adding LM scores to numerator lattices\n')

    ## Write the HLRescore configuration file
    hlrescore_config = '%s/hlrescore.config' % output_dir
    config_fh = open(hlrescore_config, 'w')
    config_fh.writelines([
        #'HLANGMODFILTER = "gunzip -c $.gz"\n',
        'HNETFILTER = "gunzip -c < $.gz"\n',
        'HNETOFILTER = "gzip -c > $.gz"\n',
        'RAWMITFORMAT = T\n',
        'HLRESCORE: FIXBADLATS = TRUE\n',
        'HLRESCORE: STARTWORD = <s>\n',
        'HLRESCORE: ENDWORD = </s>\n',
    ])
    config_fh.close()

    ## HLRescore parameters
    grammar_scale = 15.0
    trans_penalty = 0.0

    def hlrescore(input, path):
        # Assemble one HLRescore invocation for a single lattice split
        parts = ['HLRescore -A -D -T 1 -w -c -q tvaldm -C %s' % hlrescore_config]
        parts.append(' -S %s' % input)
        parts.append(' -L %s/%s/' % (lattice_dir, path))
        parts.append(' -l %s/%s/' % (output_dir, path))
        parts.append(' -s %f' % grammar_scale)
        parts.append(' -p %f' % trans_penalty)
        parts.append(' -n %s' % lm)
        parts.append(' %s' % dict)
        if model.verbose > 0:
            parts.append(' >%s/%s.log' % (output_dir, os.path.basename(input)))
        return ''.join(parts)

    ## Write the list of lattice files (with .gz stripped) and split it
    lattice_list = '%s/lattice.list' % output_dir
    lat_names = [f.replace('.gz', '') for f in util.get_files(lattice_dir, r'.*\.lat')]
    list_fh = open(lattice_list, 'w')
    list_fh.write('\n'.join(lat_names))
    list_fh.close()
    split_lattice = SplitList(output_dir, lattice_list, by_path=True)

    ## One HLRescore command per split, creating output dirs as needed
    cmds = []
    for input in split_lattice.get_files():
        key = split_lattice.get_key(input)
        new_output = '%s/%s' % (output_dir, key)
        if not os.path.isdir(new_output):
            os.makedirs(new_output)
        cmds.append(hlrescore(input, key))

    if model.local == 1:
        ## Serial execution: echo each command and its output
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        ## Parallel execution: one command per line in a commands file
        cmds_file = '%s/hlrescore.commands' % output_dir
        cmd_fh = open(cmds_file, 'w')
        cmd_fh.writelines('%s\n' % c for c in cmds)
        cmd_fh.close()
        util.run_parallel(cmds_file, model.jobs, output_dir)
def create_num_lattices(model, output_dir, lm, dict, word_mlf):
    """Create numerator word lattices from the word MLF with HLRescore.

    Writes an HLRescore config, extracts the ".lab" entries from word_mlf,
    splits them into chunks, and runs one HLRescore command per chunk —
    serially when model.local == 1, otherwise in parallel.
    """
    sys.stderr.write('Creating numerator word lattices\n')

    ## Write the HLRescore configuration file
    hlrescore_config = '%s/hlrescore.config' % output_dir
    config_fh = open(hlrescore_config, 'w')
    config_fh.writelines([
        #'HLANGMODFILTER = "gunzip -c $.gz"\n',
        'HNETFILTER = "gunzip -c < $.gz"\n',
        'HNETOFILTER = "gzip -c > $.gz"\n',
        'RAWMITFORMAT = T\n',
        'HLRESCORE: FIXBADLATS = TRUE\n',
        'HLRESCORE: STARTWORD = <s>\n',
        'HLRESCORE: ENDWORD = </s>\n',
    ])
    config_fh.close()

    def hlrescore(input, output):
        # Assemble one HLRescore invocation for a single label-list chunk
        parts = ['HLRescore -A -D -T 1 -w -f -q tvalqr -C %s' % hlrescore_config]
        parts.append(' -S %s' % input)
        parts.append(' -I %s' % word_mlf)
        parts.append(' -l %s/' % output)
        parts.append(' %s' % dict)
        if model.verbose > 0:
            parts.append(' >%s/%s.log' % (output_dir, os.path.basename(input)))
        return ''.join(parts)

    ## Split the word mlf labels to create inputs for HLRescore
    label_list = '%s/labels.list' % output_dir
    os.system('grep "\.lab" %s > %s' % (word_mlf, label_list))
    split_label = SplitList(output_dir, label_list, by_path=False, by_letters=model.split_path_letters)

    ## One HLRescore command per split, creating output dirs as needed
    cmds = []
    for input in split_label.get_files():
        output = '%s/%s' % (output_dir, split_label.get_key(input))
        if not os.path.isdir(output):
            os.makedirs(output)
        cmds.append(hlrescore(input, output))

    if model.local == 1:
        ## Serial execution: echo each command and its output
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        ## Parallel execution: one command per line in a commands file
        cmds_file = '%s/hlrescore.commands' % output_dir
        cmd_fh = open(cmds_file, 'w')
        cmd_fh.writelines('%s\n' % c for c in cmds)
        cmd_fh.close()
        util.run_parallel(cmds_file, model.jobs, output_dir)
def create_num_lattices(model, output_dir, lm, dict, word_mlf): sys.stderr.write('Creating numerator word lattices\n') ## Create a config file to use with HLRescore hlrescore_config = '%s/hlrescore.config' %output_dir fh = open(hlrescore_config, 'w') #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n') fh.write('HNETFILTER = "gunzip -c < $.gz"\n') fh.write('HNETOFILTER = "gzip -c > $.gz"\n') fh.write('RAWMITFORMAT = T\n') fh.write('HLRESCORE: FIXBADLATS = TRUE\n') fh.write('HLRESCORE: STARTWORD = <s>\n') fh.write('HLRESCORE: ENDWORD = </s>\n') fh.close() def hlrescore(input, output): cmd = 'HLRescore -A -D -T 1 -w -f -q tvalqr -C %s' %hlrescore_config cmd += ' -S %s' %input cmd += ' -I %s' %word_mlf cmd += ' -l %s/' %output cmd += ' %s' %dict if model.verbose > 0: cmd += ' >%s/%s.log' %(output_dir, os.path.basename(input)) return cmd ## Split the word mlf labels to create inputs for HLRescore label_list = '%s/labels.list' %output_dir cmd = 'grep "\.lab" %s > %s' %(word_mlf, label_list) os.system(cmd) split_label = SplitList(output_dir, label_list, by_path=False, by_letters=model.split_path_letters) ## Create the HLRescore commands cmds = [] inputs = split_label.get_files() for input in inputs: output = '%s/%s' %(output_dir, split_label.get_key(input)) if not os.path.isdir(output): os.makedirs(output) cmds.append(hlrescore(input, output)) if model.local == 1: for cmd in cmds: print cmd print os.popen(cmd).read() else: cmds_file = '%s/hlrescore.commands' %output_dir fh = open(cmds_file, 'w') for cmd in cmds: fh.write('%s\n' %cmd) fh.close() util.run_parallel(cmds_file, model.jobs, output_dir)
def run_iter(model, model_dir, num_lattice_dir, den_lattice_dir, root_dir, model_list, mfc_list, mix_size, iter):
    """
    Run an iteration of modified Baum-Welch training using HMMIRest.

    Splits mfc_list into chunks, runs one HMMIRest accumulation pass per
    chunk (locally or in parallel), then combines the .acc files into a
    new HMM. Returns the directory containing the updated model.
    """
    output_dir = '%s/HMMI-%d-%d' % (root_dir, mix_size, iter)
    util.create_new_dir(output_dir)
    utts_per_split = max(250, (1 + (model.setup_length / 200)))

    ## Create a config file to use with HMMIRest
    hmmirest_config = '%s/hmmirest.config' % output_dir
    fh = open(hmmirest_config, 'w')
    #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n')
    fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
    fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
    fh.write('RAWMITFORMAT = T\n')
    fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
    #fh.write('HMMDEFOFILTER = "gzip -c > $.gz"\n')
    #fh.write('HMMDEFFILTER = "gunzip -c < $.gz"\n')
    fh.write('HMMIREST: LATMASKNUM = */%%%?????.???\n')
    fh.write('HMMIREST: LATMASKDEN = */%%%?????.???\n')
    #fh.write('HMMIREST: LATMASKNUM = */%%%%%%%%/???????????????????.???\n')
    #fh.write('HMMIREST: LATMASKDEN = */%%%%%%%%/???????????????????.???\n')
    fh.write('HFBLAT: LATPROBSCALE = 0.06667\n')
    fh.write('HMMIREST: E = 2.0\n')
    fh.write('ISMOOTHTAU = 50\n')
    fh.write('MPE = TRUE\n')
    #fh.write('MWE = TRUE\n')
    fh.close()

    def hmmirest(input, split_num):
        # Build one HMMIRest command; split_num 0 is the final
        # combine/update pass and additionally passes '-u mv'.
        cmd = 'HMMIRest -A -D -T 1 -C %s' % hmmirest_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -q %s' % num_lattice_dir
        cmd += ' -r %s' % den_lattice_dir
        if split_num == 0:
            cmd += ' -u mv'
        cmd += ' -p %d' % split_num
        cmd += ' -S %s' % input
        cmd += ' -M %s %s' % (output_dir, model_list)
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list, output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HMMIRest commands, one per split (numbered from 1)
    cmds = []
    inputs = os.popen('ls %s/mfc.list.*' % output_dir).read().splitlines()
    split_num = 0
    for input in inputs:
        split_num += 1
        cmds.append(hmmirest(input, split_num))

    ## Non-parallel case
    if model.local == 1:
        for cmd in cmds:
            print(cmd)
            # BUG FIX: read() the pipe so the command output is shown and
            # the subprocess is drained (previously printed the pipe object).
            print(os.popen(cmd).read())
    ## Parallel case: one command per line in cmds_file
    else:
        cmds_file = '%s/hmmirest.commands' % output_dir
        fh = open(cmds_file, 'w')
        for cmd in cmds:
            fh.write('%s\n' % cmd)
        fh.close()
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Gather the created .acc files
    acc_file = '%s/hmmirest.list' % output_dir
    os.system('ls %s/HDR*.acc* > %s' % (output_dir, acc_file))

    ## Combine acc files into a new HMM (split number 0)
    cmd = hmmirest(acc_file, 0)
    cmd += ' >> %s/hmmirest.log' % output_dir
    if model.local == 1:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    ## Clean up
    #os.system('rm -f %s/mfc.list.* %s/HER*.acc' %(output_dir, output_dir))
    return output_dir
def add_lm_lattices(model, lattice_dir, output_dir, dict, lm): sys.stderr.write('adding LM scores to numerator lattices\n') ## Create a config file to use with HLRescore hlrescore_config = '%s/hlrescore.config' % output_dir fh = open(hlrescore_config, 'w') #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n') fh.write('HNETFILTER = "gunzip -c < $.gz"\n') fh.write('HNETOFILTER = "gzip -c > $.gz"\n') fh.write('RAWMITFORMAT = T\n') fh.write('HLRESCORE: FIXBADLATS = TRUE\n') fh.write('HLRESCORE: STARTWORD = <s>\n') fh.write('HLRESCORE: ENDWORD = </s>\n') fh.close() ## HLRescore parameters grammar_scale = 15.0 trans_penalty = 0.0 def hlrescore(input, path): cmd = 'HLRescore -A -D -T 1 -w -c -q tvaldm -C %s' % hlrescore_config cmd += ' -S %s' % input cmd += ' -L %s/%s/' % (lattice_dir, path) cmd += ' -l %s/%s/' % (output_dir, path) cmd += ' -s %f' % grammar_scale cmd += ' -p %f' % trans_penalty cmd += ' -n %s' % lm cmd += ' %s' % dict if model.verbose > 0: cmd += ' >%s/%s.log' % (output_dir, os.path.basename(input)) return cmd ## Split up lattice list lattice_list = '%s/lattice.list' % output_dir fh = open(lattice_list, 'w') remove_gz = lambda x: x.replace('.gz', '') files = map(remove_gz, util.get_files(lattice_dir, r'.*\.lat')) fh.write('\n'.join(files)) fh.close() split_lattice = SplitList(output_dir, lattice_list, by_path=True) ## Create the HLRescore commands cmds = [] inputs = split_lattice.get_files() for input in inputs: key = split_lattice.get_key(input) new_output = '%s/%s' % (output_dir, key) if not os.path.isdir(new_output): os.makedirs(new_output) cmds.append(hlrescore(input, key)) if model.local == 1: for cmd in cmds: print cmd print os.popen(cmd).read() else: cmds_file = '%s/hlrescore.commands' % output_dir fh = open(cmds_file, 'w') for cmd in cmds: fh.write('%s\n' % cmd) fh.close() util.run_parallel(cmds_file, model.jobs, output_dir)
def phonemark_lattices(model, lattice_dir, output_dir, model_dir, mfc_list, lm, dict, model_list):
    """Phone-mark existing word lattices with HDecode.mod, then drop
    utterances whose lattice was not produced from the MFC list."""
    sys.stderr.write('Phonemarking lattices\n')

    ## Create a config file to use with HDecode
    hdecode_config = '%s/hdecode.config' % output_dir
    fh = open(hdecode_config, 'w')
    #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n')
    fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
    fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
    fh.write('RAWMITFORMAT = T\n')
    fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
    fh.write('GCFREQ = 50\n')
    fh.write('HLAT:TRACE = 19\n')
    fh.write('HLVNET:TRACE = 1\n')
    fh.write('HLVREC:TRACE = 1\n')
    fh.write('HLVLM:TRACE = 1\n')
    fh.write('LATPRUNEBEAM = 500.0\n')
    fh.write('MAXLMLA = 3.0\n')
    fh.write('BUILDLATSENTEND = T\n')
    fh.write('FORCELATOUT = F\n')
    fh.write('STARTWORD = <s>\n')
    fh.write('ENDWORD = </s>\n')
    fh.close()

    ## HDecode parameters
    utts_per_split = 100
    block_size = 5
    beam = 200.0
    lm_scale = 15.0
    word_insertion_penalty = 0.0

    def hdecode_mod(input, path):
        # Input lattices live under lattice_dir/<path>; fall back to the
        # same key with underscores stripped if that directory is missing.
        input_dir = '%s/%s/' % (lattice_dir, path)
        if not os.path.isdir(input_dir):
            input_dir = '%s/%s/' % (lattice_dir, path.replace('_', ''))
        cmd = 'HDecode.mod -A -D -V -T 9 -q tvaldm -z lat -X lat -C %s' % hdecode_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -k %d' % block_size
        cmd += ' -t %f' % beam
        cmd += ' -s %f' % lm_scale
        cmd += ' -p %f' % word_insertion_penalty
        cmd += ' -w' # %s' %lm
        cmd += ' -S %s' % input
        cmd += ' -l %s/%s/' % (output_dir, path)
        cmd += ' -L %s' % input_dir
        cmd += ' %s %s' % (dict, model_list)
        if model.verbose > 0:
            # Redirect the tool's stdout to a per-input log file
            cmd += ' >%s/%s.log' % (output_dir, os.path.basename(input))
        return cmd

    ## Split up MFC list with unix split
    split_mfc = SplitList(output_dir, mfc_list, by_path=True)

    ## Create the HDecode commands, one per split, making output dirs
    cmds = []
    inputs = split_mfc.get_files()
    for input in inputs:
        key = split_mfc.get_key(input)
        new_output = '%s/%s' % (output_dir, key)
        if not os.path.isdir(new_output):
            os.makedirs(new_output)
        cmds.append(hdecode_mod(input, key))

    if model.local == 1:
        # Serial execution: echo each command and its output
        for cmd in cmds:
            print cmd
            print os.popen(cmd).read()
    else:
        # Parallel execution: one command per line in a commands file
        cmds_file = '%s/hdecode_mod.commands' % output_dir
        fh = open(cmds_file, 'w')
        for cmd in cmds:
            fh.write('%s\n' % cmd)
        fh.close()
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Copy old mfc list
    old_mfc_list = '%s/mfc_old.list' % output_dir
    os.system('cp %s %s' % (mfc_list, old_mfc_list))

    ## Prune bad lats from the mfc list: keep only utterances whose id
    ## matches a produced .lat file
    lat_ids = [os.path.basename(f).split('.')[0] for f in util.get_files(output_dir, r'.*\.lat')]
    bad_count = 0
    fh = open(mfc_list, 'w')
    for mfc in open(old_mfc_list):
        id = os.path.basename(mfc.strip()).split('.')[0]
        ## Check for missing transcriptions
        if id not in lat_ids:
            if model.verbose > 1:
                util.log_write(model.logfh, 'removed bad lat [%s]' % id)
            bad_count += 1
        else:
            fh.write(mfc)
    fh.close()
    util.log_write(model.logfh, 'removed bad lats [%d]' % bad_count)
def wav_to_mfc(model, output_dir, mfc_list):
    """
    Use HCopy to code each wav in the setup file.

    HCopy takes a config file (-C) and an input (-S).
    The input is a file where each line looks like: <wav file> <mfc file>

    Writes the list of produced MFC names to mfc_list and returns the
    number of setup lines processed.
    """
    def hcopy(config, input):
        # Two -C options: the global MFC config plus the per-utterance config
        cmd = 'HCopy -A -T 1 -C %s -C %s -S %s' % (model.mfc_config, config, input)
        return cmd

    ## Create list files for HCopy <wav file> <mfc file>
    lines_per_split = 500
    count = 0
    prev_config = ''
    cmds = []
    mfcs = []
    file = '%s/hcopy.list.0' % output_dir
    fh = open(file, 'w')
    # The setup file may be gzip-compressed
    if model.setup.endswith('gz'):
        setup_reader = lambda x: gzip.open(x)
    else:
        setup_reader = lambda x: open(x)
    for line in setup_reader(model.setup):
        count += 1
        [wav, config] = line.strip().split()[0:2]
        if not os.path.isfile(wav):
            # Warn about a missing wav but keep going
            sys.stderr.write('missing [%s]\n' % wav)
        mfc = get_mfc_name_from_wav(wav, model.data)
        mfcs.append(mfc)
        # Rotate to a new list file every lines_per_split lines, or whenever
        # the per-utterance config changes; the closed file becomes one
        # HCopy command using the previous config.
        if count > 1 and (count % lines_per_split == 0 or config != prev_config):
            cmds.append(hcopy(prev_config, file))
            fh.close()
            file = '%s/hcopy.list.%d' % (output_dir, len(cmds))
            fh = open(file, 'w')
        fh.write('%s %s\n' % (wav, mfc))
        prev_config = config
    # Final (possibly partial) list file
    cmds.append(hcopy(config, file))
    fh.close()

    ## Non-parallel case
    if model.local == 1:
        for cmd in cmds:
            os.system(cmd)
    ## Parallel case: one command per line in cmds_file
    else:
        cmds_file = '%s/hcopy.commands' % output_dir
        fh = open(cmds_file, 'w')
        for cmd in cmds:
            fh.write('%s\n' % cmd)
        fh.close()
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Create a file listing all created MFCs
    fh = open(mfc_list, 'w')
    for mfc in mfcs:
        fh.write('%s\n' % mfc)
    fh.close()

    ## Clean up
    os.system('rm -f %s/hcopy.list.*' % output_dir)
    return count
def decode(self, model, mfc_list, gold_mlf, lm_file, gaussians, iter, mmi=False, diag=False, xword_id='', output_dir=None):
    """Decode mfc_list with HVite or HDecode (per self.decode_func) and
    score the result against gold_mlf with HResults.

    mmi/diag/xword_id select which trained model file (MMF) to decode with.
    Returns the raw word error rate, i.e. 100 - Acc as reported by HResults.
    """
    ## Pick the model file for the requested training flavor
    if mmi:
        model_file = '%s/MMI/Models/HMMI-%d-%d/MMF' % (model.exp, gaussians, iter)
    elif diag:
        model_file = '%s/Diag/HMM-%d-%d/MMF' % (model.exp, gaussians, iter)
    else:
        model_file = '%s/Xword%s/HMM-%d-%d/MMF' % (model.exp, xword_id, gaussians, iter)
    model_list = '%s/tied.list' % model.exp
    if not output_dir:
        output_dir = '%s/decode' % self.exp
    output_dir = '%s/decode' % output_dir
    util.create_new_dir(output_dir)
    results_log = '%s/hresults.log' % output_dir
    output_mlf = '%s/decoded.mlf' % output_dir

    def hvite(input, output):
        # Build one HVite command for one MFC-list split
        cmd = 'HVite -D -A -T 1 -l "*" '
        cmd += '-t %f ' % self.beam
        cmd += '-C %s ' % self.decode_config
        cmd += '-H %s ' % model_file
        cmd += '-S %s ' % input
        cmd += '-i %s ' % output
        cmd += '-w %s ' % lm_file
        cmd += '-p %f ' % self.insertion_penalty
        cmd += '-s %f ' % self.lm_scale
        cmd += '%s %s' % (self.dict, model_list)
        return cmd

    ## HDecode parameters
    utts_per_split = 5
    block_size = 1
    word_end_beam = 150.0
    max_model = 0

    def hdecode(input, output):
        # Build one HDecode command for one MFC-list split
        cmd = 'HDecode -D -A -V -T 9 -o M -C %s' % self.decode_config
        cmd += ' -H %s' % model_file
        cmd += ' -k %d' % block_size
        cmd += ' -t %f 100.0' % self.beam
        cmd += ' -v %f 115.0' % word_end_beam
        cmd += ' -u %d' % max_model
        cmd += ' -s %f' % self.lm_scale
        cmd += ' -p %f' % self.insertion_penalty
        cmd += ' -w %s' % lm_file
        cmd += ' -S %s' % input
        cmd += ' -i %s' % output
        cmd += ' %s %s' % (self.dict, model_list)
        if model.verbose > 0:
            cmd += ' >%s/%s.log' % (output_dir, os.path.basename(input))
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list, output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create appropriate config file for the chosen decoder
    self.decode_config = '%s/%s.config' % (output_dir, self.decode_func)
    fh = open(self.decode_config, 'w')
    if self.decode_func == 'hvite':
        fh.write('FORCECXTEXP = T\n')
        fh.write('ALLOWXWRDEXP = T\n')
    elif self.decode_func == 'hdecode':
        #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n')
        fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
        fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
        fh.write('RAWMITFORMAT = T\n')
        fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
        fh.write('STARTWORD = <s>\n')
        fh.write('ENDWORD = </s>\n')
    fh.close()

    ## Create the HVite/HDecode commands
    cmds = []
    outputs = []
    inputs = os.popen('ls %s/mfc.list.*' % output_dir).read().splitlines()
    for input in inputs:
        output = input.replace('mfc.list', 'align.output')
        outputs.append(output)
        if self.decode_func == 'hvite':
            cmds.append(hvite(input, output))
        else:
            cmds.append(hdecode(input, output))

    if self.local == 1:
        # Serial execution: echo each command and its output
        for cmd in cmds:
            print(cmd)
            print(os.popen(cmd).read())
    else:
        # Parallel execution: one command per line in a commands file
        cmds_file = '%s/hvite.commands' % output_dir
        fh = open(cmds_file, 'w')
        for cmd in cmds:
            fh.write('%s\n' % cmd)
        fh.close()
        util.run_parallel(cmds_file, self.jobs, output_dir)
        #os.system('rm -f %s' %cmds_file)

    ## Merge outputs, dropping MLF header/markup lines
    os.popen('rm -f %s' % output_mlf)
    os.popen('cat %s | grep -v "<" - > %s' % (' '.join(outputs), output_mlf))

    ## Evaluate with HResults
    cmd = 'HResults -h -n -A -T 1 -c'
    cmd += ' -I %s' % gold_mlf
    cmd += ' %s %s > %s' % (model_list, output_mlf, results_log)
    os.system(cmd)
    print(os.popen('cat ' + results_log).read())
    # Re-run the logged HResults command without -h and parse Acc=... from it
    cmd = open(results_log).read().splitlines()[0]
    raw_wer = 100 - float(re.findall(r'Acc=([0-9.]*)', os.popen(cmd.replace('-h ', '')).read())[0].split('=')[-1])

    ## BUG FIX: this cleanup used to sit after the return statement and
    ## therefore never executed; run it before returning.
    os.system('rm -f %s/mfc.list.* %s/align.output.*' % (output_dir, output_dir))
    return raw_wer
def decode(self, model, mfc_list, gold_mlf, lm_file, gaussians, iter, mmi=False, diag=False, xword_id='', output_dir=None):
    """Decode mfc_list with HVite or HDecode (per self.decode_func) and
    score the result against gold_mlf with HResults.

    Returns the raw word error rate (100 - Acc as reported by HResults).
    """
    ## Pick the model file for the requested training flavor
    if mmi:
        model_file = '%s/MMI/Models/HMMI-%d-%d/MMF' % (model.exp, gaussians, iter)
    elif diag:
        model_file = '%s/Diag/HMM-%d-%d/MMF' % (model.exp, gaussians, iter)
    else:
        model_file = '%s/Xword%s/HMM-%d-%d/MMF' % (model.exp, xword_id, gaussians, iter)
    model_list = '%s/tied.list' % model.exp
    if not output_dir:
        output_dir = '%s/decode' % self.exp
    output_dir = '%s/decode' % output_dir
    util.create_new_dir(output_dir)
    results_log = '%s/hresults.log' % output_dir
    output_mlf = '%s/decoded.mlf' % output_dir

    def hvite(input, output):
        # Build one HVite command for one MFC-list split
        cmd = 'HVite -D -A -T 1 -l "*" '
        cmd += '-t %f ' % self.beam
        cmd += '-C %s ' % self.decode_config
        cmd += '-H %s ' % model_file
        cmd += '-S %s ' % input
        cmd += '-i %s ' % output
        cmd += '-w %s ' % lm_file
        cmd += '-p %f ' % self.insertion_penalty
        cmd += '-s %f ' % self.lm_scale
        cmd += '%s %s' % (self.dict, model_list)
        return cmd

    ## HDecode parameters
    utts_per_split = 5
    block_size = 1
    word_end_beam = 150.0
    max_model = 0

    def hdecode(input, output):
        # Build one HDecode command for one MFC-list split
        cmd = 'HDecode -D -A -V -T 9 -o M -C %s' % self.decode_config
        cmd += ' -H %s' % model_file
        cmd += ' -k %d' % block_size
        cmd += ' -t %f 100.0' % self.beam
        cmd += ' -v %f 115.0' % word_end_beam
        cmd += ' -u %d' % max_model
        cmd += ' -s %f' % self.lm_scale
        cmd += ' -p %f' % self.insertion_penalty
        cmd += ' -w %s' % lm_file
        cmd += ' -S %s' % input
        cmd += ' -i %s' % output
        cmd += ' %s %s' % (self.dict, model_list)
        if model.verbose > 0:
            cmd += ' >%s/%s.log' % (output_dir, os.path.basename(input))
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list, output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create appropriate config file for the chosen decoder
    self.decode_config = '%s/%s.config' % (output_dir, self.decode_func)
    fh = open(self.decode_config, 'w')
    if self.decode_func == 'hvite':
        fh.write('FORCECXTEXP = T\n')
        fh.write('ALLOWXWRDEXP = T\n')
    elif self.decode_func == 'hdecode':
        #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n')
        fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
        fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
        fh.write('RAWMITFORMAT = T\n')
        fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
        fh.write('STARTWORD = <s>\n')
        fh.write('ENDWORD = </s>\n')
    fh.close()

    ## Create the HVite/HDecode commands
    cmds = []
    outputs = []
    inputs = os.popen('ls %s/mfc.list.*' % output_dir).read().splitlines()
    for input in inputs:
        output = input.replace('mfc.list', 'align.output')
        outputs.append(output)
        if self.decode_func == 'hvite':
            cmds.append(hvite(input, output))
        else:
            cmds.append(hdecode(input, output))

    if self.local == 1:
        # Serial execution: echo each command and its output
        for cmd in cmds:
            print cmd
            print os.popen(cmd).read()
    else:
        # Parallel execution: one command per line in a commands file
        cmds_file = '%s/hvite.commands' % output_dir
        fh = open(cmds_file, 'w')
        for cmd in cmds:
            fh.write('%s\n' % cmd)
        fh.close()
        util.run_parallel(cmds_file, self.jobs, output_dir)
        #os.system('rm -f %s' %cmds_file)

    ## Merge outputs, dropping MLF header/markup lines
    os.popen('rm -f %s' % output_mlf)
    os.popen('cat %s | grep -v "<" - > %s' % (' '.join(outputs), output_mlf))

    ## Evaluate with HResults
    cmd = 'HResults -h -n -A -T 1 -c'
    cmd += ' -I %s' % gold_mlf
    cmd += ' %s %s > %s' % (model_list, output_mlf, results_log)
    os.system(cmd)
    print os.popen('cat ' + results_log).read()
    # Re-run the logged HResults command without -h and parse Acc=... from it
    cmd = open(results_log).read().splitlines()[0]
    raw_wer = 100 - float(
        re.findall(r'Acc=([0-9.]*)', os.popen(cmd.replace('-h ', '')).read())[0].split('=')[-1])
    return raw_wer
    # NOTE(review): unreachable — this cleanup sits after the return and
    # never runs; confirm whether the temp files should actually be removed.
    os.system('rm -f %s/mfc.list.* %s/align.output.*' % (output_dir, output_dir))
    """predicate for whether or not to keep a card"""
    # Everything except basic lands is kept
    return card['rarity'] != 'Basic Land'


def download(card):
    """Download the image for one card, named after its multiverse id."""
    mid = card['multiverseid']
    download_image(BASE_URL.format(mid), 'imgs/{}.jpg'.format(mid))


def download_image(url, path):
    """Stream url to path; on a non-200 status, print a message
    instead of raising (raise_for_status is deliberately disabled)."""
    res = requests.get(url, stream=True)
    if res.status_code == 200:
        with open(path, 'wb') as f:
            # Write the response in chunks rather than buffering it whole
            for chunk in res:
                f.write(chunk)
    else:
        print('failed to download:', url)
        # res.raise_for_status()


if __name__ == '__main__':
    ## Skip any image already present in imgs/
    downloaded = [f.replace('.jpg', '') for f in os.listdir('imgs')]
    to_download = []
    for mid, card in cards.items():
        if keep(card) and mid not in downloaded:
            to_download.append(card)
    print('REMAINING:', len(to_download))
    util.run_parallel(to_download, download)
def align(model, root_dir, mfc_list, model_dir, word_mlf, new_mlf, model_list, dict, align_config):
    """
    Create a new alignment based on a model and the word alignment with HVite

    Writes the merged alignment to new_mlf, drops utterances that failed to
    align from mfc_list, and returns the alignment output directory.
    """
    output_dir = '%s/Align' % root_dir
    util.create_new_dir(output_dir)
    utts_per_split = max(100, (1 + (model.setup_length / 200)))

    ## Copy old mfc list
    os.system('cp %s %s/mfc_old.list' % (mfc_list, output_dir))

    ## HVite parameters
    prune_thresh = 250

    def hvite(input, output):
        #-o SWT
        # Build one forced-alignment HVite command, appending its output
        # to a per-split log
        cmd = 'HVite -D -A -T 1 -b silence -a -m -y lab '
        cmd += '-t %d' % prune_thresh
        cmd += ' -C %s' % align_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -i %s' % output
        cmd += ' -I %s' % word_mlf
        cmd += ' -S %s' % input
        cmd += ' %s %s' % (dict, model_list)
        cmd += ' >> %s.hvite.log' % output
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list, output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HVite commands, one per split
    cmds = []
    outputs = []
    inputs = os.popen('ls %s/mfc.list.*' % output_dir).read().splitlines()
    for input in inputs:
        output = input.replace('mfc.list', 'align.output')
        outputs.append(output)
        cmds.append(hvite(input, output))

    if model.local == 1:
        # Serial execution: echo each command and its output
        for cmd in cmds:
            print cmd
            print os.popen(cmd).read()
    else:
        # Parallel execution: one command per line in a commands file
        cmds_file = '%s/hvite.commands' % output_dir
        fh = open(cmds_file, 'w')
        for cmd in cmds:
            fh.write('%s\n' % cmd)
        fh.close()
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Merge and fix silences with an HLEd edit script
    ## TODO: -s file_list
    merge_sil = '%s/merge_sp_sil.led' % output_dir
    fh = open(merge_sil, 'w')
    fh.write('ME sil sp sil\n')
    fh.write('ME sil sil sil\n')
    fh.write('ME sp sil sil\n')
    fh.close()
    cmd = 'HLEd -D -A -T 1 -i %s %s %s >> %s/hled.log' % (
        new_mlf, merge_sil, ' '.join(outputs), output_dir)
    if model.local == 1:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    ## Prune failed alignments from the mfc list: keep only ids that
    ## appear as ".lab" entries in the merged MLF
    bad_count = 0
    mlf_labels = os.popen('grep "\.lab" %s' % new_mlf).read().splitlines()
    mlf_labels = set([os.path.basename(s).split('.')[0] for s in mlf_labels])
    mfc_labels = open(mfc_list).read().splitlines()
    fh = open(mfc_list, 'w')
    for mfc in mfc_labels:
        id = os.path.basename(mfc).split('.')[0]
        ## Check for missing transcriptions
        if id not in mlf_labels:
            if model.verbose > 0:
                util.log_write(model.logfh, 'removed bad alignment [%s]' % id)
            bad_count += 1
        else:
            fh.write(mfc + '\n')
    fh.close()
    util.log_write(model.logfh, 'removed alignments [%d]' % bad_count)

    ## Clean up
    os.system('rm -f %s/mfc.list.* %s/align.output.*' % (output_dir, output_dir))
    return output_dir
def run_iter(model, root_dir, prev_dir, mlf_file, model_list, mix_size, iter, extra):
    """
    Run an iteration of Baum-Welch training using HERest.

    Splits the experiment MFC list, runs one HERest accumulation pass per
    split (locally or in parallel), then combines the .acc files into a new
    HMM. Returns (output_dir, num_models, likelihood).
    """
    output_dir = '%s/HMM-%d-%d' % (root_dir, mix_size, iter)
    util.create_new_dir(output_dir)
    mfc_list = '%s/mfc.list' % model.exp
    utts_per_split = max(250, (1 + (model.setup_length / 200)))

    ## HERest parameters
    min_train_examples = 0
    prune_thresh = 250
    prune_inc = 150
    prune_limit = 2000

    def herest(input, split_num, extra):
        # The per-split log id comes from the input name (mfc.list.NNNN);
        # the combine pass passes a plain file name and falls back to 'acc'.
        try:
            log_id = os.path.basename(input).split('.')[2]
        except IndexError:
            # BUG FIX: was a bare except, which could hide unrelated errors
            log_id = 'acc'
        cmd = '%s -D -A -T 1 -m %d' % (HEREST_CMD, min_train_examples)
        cmd += ' -t %d %d %d' % (prune_thresh, prune_inc, prune_limit)
        cmd += ' -s %s/stats' % output_dir
        cmd += ' -C %s%s' % (model.mfc_config, extra)
        cmd += ' -I %s' % mlf_file
        cmd += ' -H %s/MMF' % prev_dir
        cmd += ' -p %d' % split_num
        cmd += ' -S %s' % input
        #cmd += ' -M %s %s' %(output_dir, model_list)
        cmd += ' -M %s %s >> %s/herest.%s.log' % (output_dir, model_list, output_dir, log_id)
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' % (utts_per_split, mfc_list, output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HERest commands, one per split (numbered from 1)
    cmds = []
    inputs = os.popen('ls %s/mfc.list.*' % output_dir).read().splitlines()
    split_num = 0
    for input in inputs:
        split_num += 1
        cmds.append(herest(input, split_num, extra))

    ## Non-parallel case
    if model.local == 1:
        for cmd in cmds:
            print(cmd)
            # BUG FIX: read() the pipe so the command output is shown and
            # the subprocess is drained (previously printed the pipe object).
            print(os.popen(cmd).read())
    ## Parallel case: one command per line in cmds_file
    else:
        cmds_file = '%s/herest.commands' % output_dir
        fh = open(cmds_file, 'w')
        for cmd in cmds:
            fh.write('%s\n' % cmd)
        fh.close()
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Gather the created .acc files
    acc_file = '%s/herest.list' % output_dir
    os.system('ls %s/HER*.acc > %s' % (output_dir, acc_file))

    ## Combine acc files into a new HMM (split number 0); strip the
    ## per-split log redirection and log to herest.log instead
    cmd = herest(acc_file, 0, extra)
    cmd = cmd.split('>>')[0]
    cmd += ' >> %s/herest.log' % output_dir
    if model.local == 1:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    ## Clean up
    os.system('rm -f %s/mfc.list.* %s/HER*.acc' % (output_dir, output_dir))
    os.system('bzip2 %s/herest.*.log %s/run-command*.log' % (output_dir, output_dir))

    ## Get a few stats: model count from the MMF, likelihood from the log
    num_models = int(
        os.popen('grep "<MEAN>" %s/MMF -c' % output_dir).read().strip())
    likelihood = float(
        os.popen('cat %s/herest.log | grep aver' % output_dir).read().strip().split()[-1])
    return output_dir, num_models, likelihood
def decode_to_lattices(model, output_dir, model_dir, mfc_list, lm, dict, model_list, gold_mlf):
    """Decode the training data to word lattices with HDecode, prune
    utterances that produced no lattice from mfc_list, then build an MLF
    from the .rec outputs and score it against gold_mlf with HResults."""
    sys.stderr.write('Decoding to lattices\n')
    output_mlf = '%s/train_recog.mlf' %output_dir
    results_log = '%s/results.log' %output_dir

    ## Create a config file to use with HDecode
    hdecode_config = '%s/hdecode.config' %output_dir
    fh = open(hdecode_config, 'w')
    #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n')
    fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
    fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
    fh.write('RAWMITFORMAT = T\n')
    fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
    fh.write('GCFREQ = 50\n')
    fh.write('HLAT:TRACE = 19\n')
    fh.write('HLVNET:TRACE = 1\n')
    fh.write('HLVREC:TRACE = 1\n')
    fh.write('HLVLM:TRACE = 1\n')
    fh.write('LATPRUNEBEAM = 500.0\n')
    fh.write('MAXLMLA = 3.0\n')
    fh.write('BUILDLATSENTEND = T\n')
    fh.write('FORCELATOUT = F\n')
    fh.write('STARTWORD = <s>\n')
    fh.write('ENDWORD = </s>\n')
    fh.close()

    ## HDecode parameters
    utts_per_split = 100
    block_size = 5
    beam = 150.0
    word_end_beam = 125.0
    max_model = 10000
    lm_scale = 15.0
    word_insertion_penalty = 0.0

    def hdecode(input, output):
        # Build one HDecode command for one MFC-list split
        cmd = 'HDecode -A -D -V -T 9 -o M -z lat -C %s' %hdecode_config
        cmd += ' -H %s/MMF' %model_dir
        cmd += ' -k %d' %block_size
        cmd += ' -t %f 100.0' %beam
        cmd += ' -v %f 115.0' %word_end_beam
        cmd += ' -u %d' %max_model
        cmd += ' -s %f' %lm_scale
        cmd += ' -p %f' %word_insertion_penalty
        cmd += ' -w %s' %lm
        cmd += ' -S %s' %input
        cmd += ' -l %s/' %output
        cmd += ' %s %s' %(dict, model_list)
        if model.verbose > 0:
            # Redirect the tool's stdout to a per-input log file
            cmd += ' >%s/%s.log' %(output_dir, os.path.basename(input))
        return cmd

    ## Split up MFC list
    split_mfc = SplitList(output_dir, mfc_list, by_path=True)

    ## Create the HDecode commands, one per split, making output dirs
    cmds = []
    inputs = split_mfc.get_files()
    for input in inputs:
        output = '%s/%s' %(output_dir, split_mfc.get_key(input))
        if not os.path.isdir(output):
            os.makedirs(output)
        cmds.append(hdecode(input, output))

    if model.local == 1:
        # Serial execution: echo each command and its output
        for cmd in cmds:
            print cmd
            print os.popen(cmd).read()
    else:
        # Parallel execution: one command per line in a commands file
        cmds_file = '%s/hdecode.commands' %output_dir
        fh = open(cmds_file, 'w')
        for cmd in cmds:
            fh.write('%s\n' %cmd)
        fh.close()
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Copy old mfc list
    old_mfc_list = '%s/mfc_old.list' %output_dir
    os.system('cp %s %s' %(mfc_list, old_mfc_list))

    ## Prune bad lats from the mfc list: keep only utterances whose id
    ## matches a produced .lat file
    lat_ids = [os.path.basename(f).split('.')[0] for f in util.get_files(output_dir, r'.*\.lat')]
    bad_count = 0
    fh = open(mfc_list, 'w')
    for mfc in open(old_mfc_list):
        id = os.path.basename(mfc.strip()).split('.')[0]
        ## Check for missing transcriptions
        if id not in lat_ids:
            if model.verbose > 1:
                util.log_write(model.logfh, 'removed bad lat [%s]' %id)
            bad_count += 1
        else:
            fh.write(mfc)
    fh.close()
    util.log_write(model.logfh, 'removed bad lats [%d]' %bad_count)

    ## Create an MLF from the recognition output
    outputs = util.get_files(output_dir, r'.*\.rec')
    os.popen('rm -f %s' %output_mlf)
    fh = open(output_mlf, 'w')
    fh.write('#!MLF!#\n')
    for output in outputs:
        fh.write('"%s"\n' %output)
        for line in open(output):
            # Skip sentence-boundary tokens
            if '<s>' in line or '</s>' in line:
                continue
            fh.write(line)
        fh.write('.\n')
    fh.close()

    ## Evaluate with HResults
    cmd = 'HResults -h -n -A -T 1'
    cmd += ' -I %s' %gold_mlf
    cmd += ' %s %s > %s' %(model_list, output_mlf, results_log)
    os.system(cmd)
    print os.popen('cat ' + results_log).read()
def run_iter(model, model_dir, num_lattice_dir, den_lattice_dir, root_dir, model_list, mfc_list, mix_size, iter):
    """
    Run an iteration of modified Baum-Welch training using HMMIRest
    """
    output_dir = '%s/HMMI-%d-%d' %(root_dir, mix_size, iter)
    util.create_new_dir(output_dir)
    utts_per_split = max(250, (1 + (model.setup_length / 200)))

    ## Create a config file to use with HMMIRest
    hmmirest_config = '%s/hmmirest.config' %output_dir
    fh = open(hmmirest_config, 'w')
    #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n')
    fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
    fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
    fh.write('RAWMITFORMAT = T\n')
    fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
    #fh.write('HMMDEFOFILTER = "gzip -c > $.gz"\n')
    #fh.write('HMMDEFFILTER = "gunzip -c < $.gz"\n')
    fh.write('HMMIREST: LATMASKNUM = */%%%?????.???\n')
    fh.write('HMMIREST: LATMASKDEN = */%%%?????.???\n')
    #fh.write('HMMIREST: LATMASKNUM = */%%%%%%%%/???????????????????.???\n')
    #fh.write('HMMIREST: LATMASKDEN = */%%%%%%%%/???????????????????.???\n')
    fh.write('HFBLAT: LATPROBSCALE = 0.06667\n')
    fh.write('HMMIREST: E = 2.0\n')
    fh.write('ISMOOTHTAU = 50\n')
    fh.write('MPE = TRUE\n')
    #fh.write('MWE = TRUE\n')
    fh.close()

    def hmmirest(input, split_num):
        # Build one HMMIRest command; split number 0 is the final
        # combine/update pass and additionally passes '-u mv'
        cmd = 'HMMIRest -A -D -T 1 -C %s' %hmmirest_config
        cmd += ' -H %s/MMF' %model_dir
        cmd += ' -q %s' %num_lattice_dir
        cmd += ' -r %s' %den_lattice_dir
        if split_num == 0:
            cmd += ' -u mv'
        cmd += ' -p %d' %split_num
        cmd += ' -S %s' %input
        cmd += ' -M %s %s' %(output_dir, model_list)
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' %(utts_per_split, mfc_list, output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HMMIRest commands, one per split (numbered from 1)
    cmds = []
    inputs = os.popen('ls %s/mfc.list.*' %output_dir).read().splitlines()
    split_num = 0
    for input in inputs:
        split_num += 1
        cmds.append(hmmirest(input, split_num))

    ## Non-parallel case
    if model.local == 1:
        for cmd in cmds:
            print cmd
            # NOTE(review): sibling functions use os.popen(cmd).read() here;
            # without .read() this prints the pipe object, not its output —
            # confirm whether that was intended.
            print os.popen(cmd)
    ## Parallel case: one command per line in cmds_file
    else:
        cmds_file = '%s/hmmirest.commands' %output_dir
        fh = open(cmds_file, 'w')
        for cmd in cmds:
            fh.write('%s\n' %cmd)
        fh.close()
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Gather the created .acc files
    acc_file = '%s/hmmirest.list' %output_dir
    os.system('ls %s/HDR*.acc* > %s' %(output_dir, acc_file))

    ## Combine acc files into a new HMM (split number 0)
    cmd = hmmirest(acc_file, 0)
    cmd += ' >> %s/hmmirest.log' %output_dir
    if model.local == 1:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    ## Clean up
    #os.system('rm -f %s/mfc.list.* %s/HER*.acc' %(output_dir, output_dir))
    return output_dir
def decode_to_lattices(model, output_dir, model_dir, mfc_list, lm, dict, model_list, gold_mlf):
    """
    Decode the training data into word lattices with HDecode, then score
    the recognition output.

    Steps: write an HDecode config, run one HDecode job per MFC-list
    split (locally or via util.run_parallel), prune utterances that
    produced no lattice from mfc_list (rewritten in place; the original
    is saved as mfc_old.list), assemble the .rec outputs into an MLF,
    and evaluate it against gold_mlf with HResults.

    NOTE: 'dict' parameter shadows the builtin; kept for interface
    compatibility.
    """
    sys.stderr.write('Decoding to lattices\n')

    output_mlf = '%s/train_recog.mlf' % output_dir
    results_log = '%s/results.log' % output_dir

    ## Create a config file to use with HDecode
    hdecode_config = '%s/hdecode.config' % output_dir
    fh = open(hdecode_config, 'w')
    #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n')
    fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
    fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
    fh.write('RAWMITFORMAT = T\n')
    fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
    fh.write('GCFREQ = 50\n')
    fh.write('HLAT:TRACE = 19\n')
    fh.write('HLVNET:TRACE = 1\n')
    fh.write('HLVREC:TRACE = 1\n')
    fh.write('HLVLM:TRACE = 1\n')
    fh.write('LATPRUNEBEAM = 500.0\n')
    fh.write('MAXLMLA = 3.0\n')
    fh.write('BUILDLATSENTEND = T\n')
    fh.write('FORCELATOUT = F\n')
    fh.write('STARTWORD = <s>\n')
    fh.write('ENDWORD = </s>\n')
    fh.close()

    ## HDecode parameters
    utts_per_split = 100
    block_size = 5
    beam = 150.0
    word_end_beam = 125.0
    max_model = 10000
    lm_scale = 15.0
    word_insertion_penalty = 0.0

    def hdecode(input, output):
        # Build one HDecode command line: input is a file listing MFCs,
        # output is the directory lattices are written to (-l).
        cmd = 'HDecode -A -D -V -T 9 -o M -z lat -C %s' % hdecode_config
        cmd += ' -H %s/MMF' % model_dir
        cmd += ' -k %d' % block_size
        cmd += ' -t %f 100.0' % beam
        cmd += ' -v %f 115.0' % word_end_beam
        cmd += ' -u %d' % max_model
        cmd += ' -s %f' % lm_scale
        cmd += ' -p %f' % word_insertion_penalty
        cmd += ' -w %s' % lm
        cmd += ' -S %s' % input
        cmd += ' -l %s/' % output
        cmd += ' %s %s' % (dict, model_list)
        if model.verbose > 0:
            cmd += ' >%s/%s.log' % (output_dir, os.path.basename(input))
        return cmd

    ## Split up MFC list
    split_mfc = SplitList(output_dir, mfc_list, by_path=True)

    ## Create the HDecode commands, one per split, each with its own
    ## output subdirectory
    cmds = []
    inputs = split_mfc.get_files()
    for input in inputs:
        output = '%s/%s' % (output_dir, split_mfc.get_key(input))
        if not os.path.isdir(output):
            os.makedirs(output)
        cmds.append(hdecode(input, output))

    if model.local == 1:
        for cmd in cmds:
            print cmd
            print os.popen(cmd).read()
    else:
        cmds_file = '%s/hdecode.commands' % output_dir
        fh = open(cmds_file, 'w')
        for cmd in cmds:
            fh.write('%s\n' % cmd)
        fh.close()
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Copy old mfc list before rewriting it in place below
    old_mfc_list = '%s/mfc_old.list' % output_dir
    os.system('cp %s %s' % (mfc_list, old_mfc_list))

    ## Prune bad lats from the mfc list: keep only utterances for which
    ## a .lat file was actually produced
    lat_ids = [ os.path.basename(f).split('.')[0] for f in util.get_files(output_dir, r'.*\.lat') ]
    bad_count = 0
    fh = open(mfc_list, 'w')
    for mfc in open(old_mfc_list):
        id = os.path.basename(mfc.strip()).split('.')[0]
        ## Check for missing transcriptions
        if id not in lat_ids:
            if model.verbose > 1:
                util.log_write(model.logfh, 'removed bad lat [%s]' % id)
            bad_count += 1
        else:
            fh.write(mfc)
    fh.close()
    util.log_write(model.logfh, 'removed bad lats [%d]' % bad_count)

    ## Create an MLF from the recognition output, dropping the
    ## sentence-boundary tokens
    outputs = util.get_files(output_dir, r'.*\.rec')
    # NOTE(review): os.popen here is fire-and-forget; the open('w')
    # below truncates the file anyway
    os.popen('rm -f %s' % output_mlf)
    fh = open(output_mlf, 'w')
    fh.write('#!MLF!#\n')
    for output in outputs:
        fh.write('"%s"\n' % output)
        for line in open(output):
            if '<s>' in line or '</s>' in line:
                continue
            fh.write(line)
        fh.write('.\n')
    fh.close()

    ## Evaluate against the gold transcription and print the results
    cmd = 'HResults -h -n -A -T 1'
    cmd += ' -I %s' % gold_mlf
    cmd += ' %s %s > %s' % (model_list, output_mlf, results_log)
    os.system(cmd)
    print os.popen('cat ' + results_log).read()
def run_iter(model, root_dir, prev_dir, mlf_file, model_list, mix_size, iter, extra): """ Run an iteration of Baum-Welch training using HERest """ output_dir = '%s/HMM-%d-%d' %(root_dir, mix_size, iter) util.create_new_dir(output_dir) mfc_list = '%s/mfc.list' %model.exp utts_per_split = max(250, (1 + (model.setup_length / 200))) ## HERest parameters min_train_examples = 0 prune_thresh = 250 prune_inc = 150 prune_limit = 2000 def herest(input, split_num, extra): try: log_id = os.path.basename(input).split('.')[2] except: log_id = 'acc' cmd = '%s -D -A -T 1 -m %d' %(HEREST_CMD, min_train_examples) cmd += ' -t %d %d %d' %(prune_thresh, prune_inc, prune_limit) cmd += ' -s %s/stats' %output_dir cmd += ' -C %s%s' %(model.mfc_config, extra) cmd += ' -I %s' %mlf_file cmd += ' -H %s/MMF' %prev_dir cmd += ' -p %d' %split_num cmd += ' -S %s' %input #cmd += ' -M %s %s' %(output_dir, model_list) cmd += ' -M %s %s >> %s/herest.%s.log' %(output_dir, model_list, output_dir, log_id) return cmd ## Split up MFC list with unix split cmd = 'split -a 4 -d -l %d %s %s/%s' %(utts_per_split, mfc_list, output_dir, 'mfc.list.') os.system(cmd) ## Create the HERest commands cmds = [] inputs = os.popen('ls %s/mfc.list.*' %output_dir).read().splitlines() split_num = 0 for input in inputs: split_num += 1 cmds.append(herest(input, split_num, extra)) ## Non-parallel case if model.local == 1: for cmd in cmds: print cmd print os.popen(cmd) ## Parallel case: one command per line in cmds_file else: cmds_file = '%s/herest.commands' %output_dir fh = open(cmds_file, 'w') for cmd in cmds: fh.write('%s\n' %cmd) fh.close() util.run_parallel(cmds_file, model.jobs, output_dir) ## Gather the created .acc files acc_file = '%s/herest.list' %output_dir os.system('ls %s/HER*.acc > %s' %(output_dir, acc_file)) ## Combine acc files into a new HMM cmd = herest(acc_file, 0, extra) cmd = cmd.split('>>')[0] cmd += ' >> %s/herest.log' %output_dir if model.local == 1: os.system(cmd) else: util.run(cmd, output_dir) ## Clean 
up os.system('rm -f %s/mfc.list.* %s/HER*.acc' %(output_dir, output_dir)) os.system('bzip2 %s/herest.*.log %s/run-command*.log' %(output_dir, output_dir)) ## Get a few stats num_models = int(os.popen('grep "<MEAN>" %s/MMF -c' %output_dir).read().strip()) likelihood = float(os.popen('cat %s/herest.log | grep aver' %output_dir).read().strip().split()[-1]) return output_dir, num_models, likelihood
def wav_to_mfc(model, output_dir, mfc_list):
    """
    Use HCopy to code each wav in the setup file.

    HCopy takes a config file (-C) and an input (-S). The input is a
    file where each line looks like: <wav file> <mfc file>

    The setup file is chunked into hcopy.list.N files; a new chunk is
    started every lines_per_split lines or whenever the per-line config
    changes, since each HCopy invocation uses a single config.
    Writes one MFC path per line to mfc_list and returns the number of
    setup lines processed.
    """

    def hcopy(config, input):
        # One HCopy invocation: global MFC config plus the per-chunk config
        cmd = 'HCopy -A -T 1 -C %s -C %s -S %s' %(model.mfc_config, config, input)
        return cmd

    ## Create list files for HCopy <wav file> <mfc file>
    lines_per_split = 500
    count = 0
    prev_config = ''
    cmds = []
    mfcs = []
    file = '%s/hcopy.list.0' %output_dir
    fh = open(file, 'w')
    # Setup file may be gzipped
    if model.setup.endswith('gz'):
        setup_reader = lambda x: gzip.open(x)
    else:
        setup_reader = lambda x: open(x)
    for line in setup_reader(model.setup):
        count += 1
        [wav, config] = line.strip().split()[0:2]
        if not os.path.isfile(wav):
            # Warn only; the mfc is still listed below
            sys.stderr.write('missing [%s]\n' %wav)
        mfc = get_mfc_name_from_wav(wav, model.data)
        mfcs.append(mfc)
        # Close out the current chunk BEFORE writing this line when the
        # chunk is full or the config changed (the finished chunk uses
        # prev_config, which is this point still the old config)
        if count > 1 and (count % lines_per_split == 0 or config != prev_config):
            cmds.append(hcopy(prev_config, file))
            fh.close()
            file = '%s/hcopy.list.%d' %(output_dir, len(cmds))
            fh = open(file, 'w')
        fh.write('%s %s\n' %(wav, mfc))
        prev_config = config
    # Flush the final (possibly partial) chunk
    cmds.append(hcopy(config, file))
    fh.close()

    ## Non-parallel case
    if model.local == 1:
        for cmd in cmds:
            os.system(cmd)
    ## Parallel case: one command per line in cmds_file
    else:
        cmds_file = '%s/hcopy.commands' %output_dir
        fh = open(cmds_file, 'w')
        for cmd in cmds:
            fh.write('%s\n' %cmd)
        fh.close()
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Create a file listing all created MFCs
    fh = open(mfc_list, 'w')
    for mfc in mfcs:
        fh.write('%s\n' %mfc)
    fh.close()

    ## Clean up
    os.system('rm -f %s/hcopy.list.*' %output_dir)
    return count
def align(model, root_dir, mfc_list, model_dir, word_mlf, new_mlf, model_list, dict, align_config):
    """
    Create a new alignment based on a model and the word alignment with
    HVite.

    Runs one HVite forced-alignment job per MFC-list split, merges the
    per-split outputs into new_mlf with HLEd (normalizing sp/sil
    sequences), then rewrites mfc_list in place to drop utterances that
    failed to align. Returns the working directory.

    NOTE: 'dict' parameter shadows the builtin; kept for interface
    compatibility.
    """

    output_dir = '%s/Align' %root_dir
    util.create_new_dir(output_dir)
    utts_per_split = max(100, (1 + (model.setup_length / 200)))

    ## Copy old mfc list (mfc_list itself is rewritten below)
    os.system('cp %s %s/mfc_old.list' %(mfc_list, output_dir))

    ## HVite parameters
    prune_thresh = 250

    def hvite(input, output):
        # Build one HVite forced-alignment command; output is both the
        # output MLF (-i) and the basename of the per-split log
        #-o SWT
        cmd = 'HVite -D -A -T 1 -b silence -a -m -y lab '
        cmd += '-t %d' %prune_thresh
        cmd += ' -C %s' %align_config
        cmd += ' -H %s/MMF' %model_dir
        cmd += ' -i %s' %output
        cmd += ' -I %s' %word_mlf
        cmd += ' -S %s' %input
        cmd += ' %s %s' %(dict, model_list)
        cmd += ' >> %s.hvite.log' %output
        return cmd

    ## Split up MFC list with unix split
    cmd = 'split -a 4 -d -l %d %s %s/%s' %(utts_per_split, mfc_list, output_dir, 'mfc.list.')
    os.system(cmd)

    ## Create the HVite commands, one per split
    cmds = []
    outputs = []
    inputs = os.popen('ls %s/mfc.list.*' %output_dir).read().splitlines()
    for input in inputs:
        output = input.replace('mfc.list', 'align.output')
        outputs.append(output)
        cmds.append(hvite(input, output))

    if model.local == 1:
        for cmd in cmds:
            print cmd
            print os.popen(cmd).read()
    else:
        cmds_file = '%s/hvite.commands' %output_dir
        fh = open(cmds_file, 'w')
        for cmd in cmds:
            fh.write('%s\n' %cmd)
        fh.close()
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Merge and fix silences: collapse sp/sil sequences into a single sil
    ## TODO: -s file_list
    merge_sil = '%s/merge_sp_sil.led' %output_dir
    fh = open(merge_sil, 'w')
    fh.write('ME sil sp sil\n')
    fh.write('ME sil sil sil\n')
    fh.write('ME sp sil sil\n')
    fh.close()

    cmd = 'HLEd -D -A -T 1 -i %s %s %s >> %s/hled.log' %(new_mlf, merge_sil, ' '.join(outputs), output_dir)

    if model.local == 1:
        os.system(cmd)
    else:
        util.run(cmd, output_dir)

    ## Prune failed alignments from the mfc list: keep only utterances
    ## whose id appears as a .lab entry in the merged MLF
    bad_count = 0
    mlf_labels = os.popen('grep "\.lab" %s' %new_mlf).read().splitlines()
    mlf_labels = set([os.path.basename(s).split('.')[0] for s in mlf_labels])
    mfc_labels = open(mfc_list).read().splitlines()
    fh = open(mfc_list, 'w')
    for mfc in mfc_labels:
        id = os.path.basename(mfc).split('.')[0]
        ## Check for missing transcriptions
        if id not in mlf_labels:
            if model.verbose > 0:
                util.log_write(model.logfh, 'removed bad alignment [%s]' %id)
            bad_count += 1
        else:
            fh.write(mfc + '\n')
    fh.close()
    util.log_write(model.logfh, 'removed alignments [%d]' %bad_count)

    ## Clean up
    os.system('rm -f %s/mfc.list.* %s/align.output.*' %(output_dir, output_dir))
    return output_dir
# LOAD FILES
files = io.get_dumps_list(path)
if len(files) == 0:
    util.warn("INVALID PATH TO DUMP FOLDER")
    sys.exit(1)

frame_dir = "frames_"+movie_type
util.make_dir(frame_dir)

hdr = io.load_hdr(files[0])
geom = io.load_geom(hdr, path)
jmin, jmax = get_j_vals(geom)
#print("jmin: {} jmax: {}".format(jmin, jmax))

if diag_post:
    # Load fluxes from post-analysis: more flexible
    # FIX: close the pickle file deterministically (was
    # pickle.load(open(...)), which leaks the handle).
    # NOTE(review): pickle is only safe on trusted, locally-produced
    # analysis output; do not point this at untrusted files.
    with open("eht_out.p", 'rb') as pfile:
        diag = pickle.load(pfile)
else:
    # Load diagnostics from HARM itself
    diag = io.load_log(path)

nthreads = util.calc_nthreads(hdr, pad=0.3)

if debug:
    # Run sequentially to make backtraces work
    for i in range(len(files)):
        plot(i)
else:
    util.run_parallel(plot, len(files), nthreads)
def phonemark_lattices(model, lattice_dir, output_dir, model_dir, mfc_list, lm, dict, model_list):
    """
    Add phone-level marking to existing word lattices with HDecode.mod.

    Reads input lattices from lattice_dir (-L), writes marked lattices
    under output_dir (-l), one job per MFC-list split, then rewrites
    mfc_list in place to drop utterances that produced no lattice (the
    original list is saved as mfc_old.list).

    NOTE: 'dict' parameter shadows the builtin; kept for interface
    compatibility. The 'lm' parameter is currently unused -- the -w
    argument is commented out below.
    """
    sys.stderr.write('Phonemarking lattices\n')

    ## Create a config file to use with HDecode
    hdecode_config = '%s/hdecode.config' %output_dir
    fh = open(hdecode_config, 'w')
    #fh.write('HLANGMODFILTER = "gunzip -c $.gz"\n')
    fh.write('HNETFILTER = "gunzip -c < $.gz"\n')
    fh.write('HNETOFILTER = "gzip -c > $.gz"\n')
    fh.write('RAWMITFORMAT = T\n')
    fh.write('HPARM: TARGETKIND = MFCC_0_D_A_Z\n')
    fh.write('GCFREQ = 50\n')
    fh.write('HLAT:TRACE = 19\n')
    fh.write('HLVNET:TRACE = 1\n')
    fh.write('HLVREC:TRACE = 1\n')
    fh.write('HLVLM:TRACE = 1\n')
    fh.write('LATPRUNEBEAM = 500.0\n')
    fh.write('MAXLMLA = 3.0\n')
    fh.write('BUILDLATSENTEND = T\n')
    fh.write('FORCELATOUT = F\n')
    fh.write('STARTWORD = <s>\n')
    fh.write('ENDWORD = </s>\n')
    fh.close()

    ## HDecode parameters
    utts_per_split = 100
    block_size = 5
    beam = 200.0
    lm_scale = 15.0
    word_insertion_penalty = 0.0

    def hdecode_mod(input, path):
        # Input lattices live under lattice_dir/<path>/; fall back to the
        # underscore-stripped key if that directory does not exist
        input_dir = '%s/%s/' %(lattice_dir, path)
        if not os.path.isdir(input_dir):
            input_dir = '%s/%s/' %(lattice_dir, path.replace('_', ''))
        cmd = 'HDecode.mod -A -D -V -T 9 -q tvaldm -z lat -X lat -C %s' %hdecode_config
        cmd += ' -H %s/MMF' %model_dir
        cmd += ' -k %d' %block_size
        cmd += ' -t %f' %beam
        cmd += ' -s %f' %lm_scale
        cmd += ' -p %f' %word_insertion_penalty
        cmd += ' -w' # %s' %lm
        cmd += ' -S %s' %input
        cmd += ' -l %s/%s/' %(output_dir, path)
        cmd += ' -L %s' %input_dir
        cmd += ' %s %s' %(dict, model_list)
        if model.verbose > 0:
            cmd += ' >%s/%s.log' %(output_dir, os.path.basename(input))
        return cmd

    ## Split up MFC list with unix split
    split_mfc = SplitList(output_dir, mfc_list, by_path=True)

    ## Create the HDecode commands, one per split, each with its own
    ## output subdirectory
    cmds = []
    inputs = split_mfc.get_files()
    for input in inputs:
        key = split_mfc.get_key(input)
        new_output = '%s/%s' %(output_dir, key)
        if not os.path.isdir(new_output):
            os.makedirs(new_output)
        cmds.append(hdecode_mod(input, key))

    if model.local == 1:
        for cmd in cmds:
            print cmd
            print os.popen(cmd).read()
    else:
        cmds_file = '%s/hdecode_mod.commands' %output_dir
        fh = open(cmds_file, 'w')
        for cmd in cmds:
            fh.write('%s\n' %cmd)
        fh.close()
        util.run_parallel(cmds_file, model.jobs, output_dir)

    ## Copy old mfc list before rewriting it in place below
    old_mfc_list = '%s/mfc_old.list' %output_dir
    os.system('cp %s %s' %(mfc_list, old_mfc_list))

    ## Prune bad lats from the mfc list: keep only utterances for which
    ## a .lat file was actually produced
    lat_ids = [os.path.basename(f).split('.')[0] for f in util.get_files(output_dir, r'.*\.lat')]
    bad_count = 0
    fh = open(mfc_list, 'w')
    for mfc in open(old_mfc_list):
        id = os.path.basename(mfc.strip()).split('.')[0]
        ## Check for missing transcriptions
        if id not in lat_ids:
            if model.verbose > 1:
                util.log_write(model.logfh, 'removed bad lat [%s]' %id)
            bad_count += 1
        else:
            fh.write(mfc)
    fh.close()
    util.log_write(model.logfh, 'removed bad lats [%d]' %bad_count)