def normalize(target, source, env):
    """
    Takes the combined IV and OOV results and removes keywords not in the kwlist,
    then runs the F4DE denormalization script on the filtered result.

    sources: results XML, kwlist XML
    NEEDS WORK! CONVERT TO BUILDER!
    """
    tmpfile_fid, tmpfile_name = tempfile.mkstemp()
    # mkstemp returns an OPEN descriptor; close it immediately so it isn't leaked
    # (we only need the name -- ElementTree writes to it by path below)
    os.close(tmpfile_fid)
    res_xml = et.parse(meta_open(source[0].rstr()))
    # Map (prefix, zero-stripped numeric id) -> canonical "prefix-id" for every
    # keyword id found in the kwlist file
    kw_ids = {(a, b.lstrip("0")) : "%s-%s" % (a, b)
              for a, b in [x.split("-")
                           for x in set([x.get("kwid")
                                         for x in et.parse(meta_open(source[1].rstr())).getiterator("kw")])]}
    elems = [x for x in res_xml.getiterator("detected_termlist")]
    for e in elems:
        a, b = e.get("termid").split("-")
        b = b.lstrip("0")
        if (a, b) not in kw_ids:
            # term is not in the keyword list: drop it from the results
            res_xml.getroot().remove(e)
        else:
            # rewrite the termid to its canonical (zero-padded) form
            e.set("termid", kw_ids[(a, b)])
    res_xml.write(tmpfile_name)
    stdout, stderr, success = run_command(env.subst("${PYTHON} ${F4DENORMALIZATIONPY} ${SOURCE} ${TARGET}",
                                                    target=target, source=tmpfile_name))
    os.remove(tmpfile_name)
    if not success:
        # print() form is valid in both Python 2 and 3 (original used the
        # Python-2-only statement form); failure is still reported, not returned
        print(stderr)
    return None
def query_to_phone_fst(target, source, env):
    """
    Convert a query list to phone FSTs with the IBM 'query2phonefst' binary.

    Usage: query2phonefst [-opts] [outputdir] [querylist]
      -d file      dictionary file
      -s file      use external phone table specified
      -O file      file containing prons for oovs, output of l2s system
      -l file      file to output list of all fsts corresponding to queries
      -I int       ignore (print empty fst-file) if query has less than <int> phones
      -t double    if specified, tag oovs with soft threshold indicated
      -u           if specified, ignore weight of alternative prons
      -g           add gamma penalty for query length p = p^gamma (gamma=1/length-phone)
      -w           if specified, query is represented as one arc per word, not converted to phones
      -p p2pfile   p2pfile, to allow for fuzziness in query (default: no p2p)
      -n nbest     if p2pfile, this limits number of paths retained after composing query with p2p
    """
    args = source[-1].read()
    try:
        os.makedirs(args["OUTDIR"])
    except OSError:
        # directory already exists -- the original bare `except: pass` also
        # hid KeyError etc.; only the expected makedirs failure is ignored now
        pass
    command = env.subst("${QUERY2PHONEFST} -p ${SOURCES[0]} -s ${SOURCES[1]} -d ${SOURCES[2]} -l ${TARGETS[0]} -n %(n)d -I %(I)d %(OUTDIR)s ${SOURCES[3]}" % args,
                        target=target, source=source)
    stdout, stderr, success = run_command(command,
                                          env={"LD_LIBRARY_PATH" : env.subst(env["LIBRARY_OVERLAY"])},
                                          stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if not success:
        return stderr
    return None
def run_asr_experiment_torque(target, source, env):
    """
    Run an ASR experiment as a Torque (PBS) array job, blocking until it finishes.

    sources: [..., construct script, run script, args dict (last)]
    target[0] is written with a timestamp on completion.
    Returns None on success, or combined stdout+stderr of the construct step on failure.
    """
    args = source[-1].read()
    # First run the "construct" script locally to set up the experiment
    construct_command = env.subst("${ATTILA_INTERPRETER} ${SOURCES[1].abspath}", source=source)
    out, err, success = run_command(construct_command)
    if not success:
        return out + err
    # Per-job stdout/stderr directories (default under the experiment path)
    stdout = env.Dir(args.get("stdout", args["path"])).Dir("stdout").rstr()
    stderr = env.Dir(args.get("stderr", args["path"])).Dir("stderr").rstr()
    if not os.path.exists(stdout):
        os.makedirs(stdout)
    if not os.path.exists(stderr):
        os.makedirs(stderr)
    # $${PBS_ARRAYID} escapes to a literal ${PBS_ARRAYID} so each array task
    # picks up its own job index at runtime
    command = env.subst("${ATTILA_INTERPRETER} ${SOURCES[2].abspath} -n ${TORQUE_JOBS_PER_SCONS_INSTANCE} -j $${PBS_ARRAYID} -w ${ACOUSTIC_WEIGHT} -l 1", source=source)
    interval = args.get("interval", 10)  # polling period in seconds
    job = torque.Job(args.get("name", "scons"),
                     commands=[command],
                     path=args["path"],
                     stdout_path=stdout,
                     stderr_path=stderr,
                     array=args.get("array", 0),
                     other=args.get("other", ["#PBS -W group_list=yeticcls"]),
                     )
    if env["HAS_TORQUE"]:
        job.submit(commit=True)
        # Poll the queue until our job id disappears from the active job list
        while job.job_id in [x[0] for x in torque.get_jobs(True)]:
            logging.debug("sleeping...")
            time.sleep(interval)
    else:
        # Dry-run mode: no Torque server available
        logging.info("no Torque server, but I would submit:\n%s" % (job))
    # Touch the target with a timestamp so SCons considers the build done
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write(time.asctime() + "\n")
    return None
def normalize_sum_to_one(target, source, env):
    """
    Apply IBM's sum-to-one score normalization (Java tool) to the source file.

    NEEDS WORK! CONVERT TO BUILDER!
    """
    cmd = env.subst("java -cp ${JAVA_NORM} normalization.ApplySumToOneNormalization ${SOURCE} ${TARGET}",
                    target=target, source=source)
    _, err, ok = run_command(cmd)
    return None if ok else err
def merge_scores(target, source, env):
    """
    Merge scores with the sum-post-norm Perl script and write its stdout to the target.

    NEEDS WORK! CONVERT TO BUILDER!
    """
    stdout, stderr, success = run_command(env.subst("${MERGESCORESSUMPOSTNORMPL} ${SOURCES[0]}",
                                                    target=target, source=source),
                                          env={"LD_LIBRARY_PATH" : env.subst("${LIBRARY_OVERLAY}")})
    if not success:
        return stderr
    # context manager guarantees the target handle is flushed and closed
    # (the original leaked it via meta_open(...).write(...))
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write(stdout)
    return None
def fst_compile(target, source, env):
    """
    Compile an FST using OpenFST's binary 'fstcompile'.

    sources: symbol table, FST text file; target gets fstcompile's stdout.
    """
    command = env.subst("${FSTCOMPILE} --isymbols=${SOURCES[0]} --osymbols=${SOURCES[0]} ${SOURCES[1]}",
                        target=target, source=source)
    stdout, stderr, success = run_command(command,
                                          env={"LD_LIBRARY_PATH" : env.subst(env["LIBRARY_OVERLAY"])},
                                          stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if not success:
        return stderr
    # context manager guarantees the target handle is flushed and closed
    # (the original leaked it via meta_open(...).write(...))
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write(stdout)
    return None
def build_pad_fst(target, source, env):
    """
    Build a pad FST from a symbol table with the IBM 'buildpadfst' binary.

    Usage: buildpadfst [symtable_file] [output_fst_file]
    """
    cmd = env.subst("${BUILDPADFST} ${SOURCE} ${TARGET}", target=target, source=source)
    _, err, ok = run_command(cmd,
                             env={"LD_LIBRARY_PATH" : env.subst(env["LIBRARY_OVERLAY"])},
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
    return None if ok else err
def run_asr_experiment(target, source, env):
    """
    Run an ASR experiment locally: run the construct script, then launch
    LOCAL_JOBS_PER_SCONS_INSTANCE parallel decode jobs and wait for them.

    Returns None on success, or an error string on failure.
    """
    args = source[-1].read()
    construct_command = env.subst("${ATTILA_INTERPRETER} ${SOURCES[1].abspath}", source=source)
    out, err, success = run_command(construct_command)
    if not success:
        return out + err
    command = env.subst("${ATTILA_INTERPRETER} ${SOURCES[2].abspath} -n ${LOCAL_JOBS_PER_SCONS_INSTANCE} -j %d -w ${ACOUSTIC_WEIGHT} -l 1", source=source)
    procs = [subprocess.Popen(shlex.split(command % i)) for i in range(env["LOCAL_JOBS_PER_SCONS_INSTANCE"])]
    # wait for every job and surface non-zero exit codes -- the original
    # discarded p.wait()'s return value, so failed decodes went unnoticed
    codes = [p.wait() for p in procs]
    failures = [c for c in codes if c != 0]
    if failures:
        return "ASR job(s) exited with non-zero status: %s" % (failures)
    return None
def ibm_train_language_model(target, source, env):
    """
    Train an n-gram language model with IBM Attila's LM tools.

    sources: text file, vocabulary file, n (model order).
    The LM name is the target path minus its last two extensions.
    Returns None on success, or the failing command's output on failure.
    """
    text_file = source[0].rstr()
    vocab_file = source[1].rstr()
    n = source[2].read()
    temp_dir = tempfile.mkdtemp()
    try:
        prefix = os.path.join(temp_dir, "temp")
        # first create count files
        cmd = "${ATTILA_PATH}/tools/lm_64/CountNGram -n %d %s %s %s" % (n, text_file, vocab_file, prefix)
        out, err, success = run_command(env.subst(cmd))
        if not success:
            # the original ignored this flag and then crashed in cleanup
            # when the count files were never produced
            return out + err
        # build LM
        lm = ".".join(target[0].rstr().split(".")[0:-2])
        cmd = "${ATTILA_PATH}/tools/lm_64/BuildNGram.sh -n %d -arpabo %s %s" % (n, prefix, lm)
        out, err, success = run_command(env.subst(cmd),
                                        env={"SFCLMTOOLS" : env.subst("${ATTILA_PATH}/tools/lm_64")})
        if not success:
            return out + err
    finally:
        # clean up count files and the temp dir even on failure
        shutil.rmtree(temp_dir, ignore_errors=True)
    return None
def merge(target, source, env):
    """
    Combines the output of several searches.

    input: XML files (<term>)
    sources: [term list, result files..., args dict (last)]
    NEEDS WORK! CONVERT TO BUILDER!
    """
    args = source[-1].read()
    stdout, stderr, success = run_command(env.subst("${PRINTQUERYTERMLISTPRL} -prefix=KW%(LANGUAGE_ID)s- -padlength=%(PADLENGTH)d ${SOURCES[0]}" % args,
                                                    target=target, source=source),
                                          env={"LD_LIBRARY_PATH" : env.subst("${LIBRARY_OVERLAY}")})
    # NOTE(review): `success` of the term-list step is not checked here,
    # matching the original behavior -- confirm whether it should abort
    # all four writes use context managers; the original leaked the handles
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write(stdout)
    with meta_open(target[1].rstr(), "w") as ofd:
        ofd.write("\n".join([x.rstr() for x in source[1:-1]]))
    if args["MODE"] == "merge-atwv":
        return "merge-atwv option not supported!"
    merge_search_from_par_index = "${MERGESEARCHFROMPARINDEXPRL} -force-decision=\"YES\" ${TARGETS[0]} ${TARGETS[1]}"
    stdout, stderr, success = run_command(env.subst(merge_search_from_par_index, target=target, source=source),
                                          env={"LD_LIBRARY_PATH" : env.subst("${LIBRARY_OVERLAY}")})
    with meta_open(target[2].rstr(), "w") as ofd:
        ofd.write(stdout)
    # the original wrote "\n".join(stdout.split("\n")), which is exactly stdout
    with meta_open(target[3].rstr(), "w") as ofd:
        ofd.write(stdout)
    return None
def run_g2p(target, source, env):
    """
    Apply a trained Sequitur g2p model to a pronunciation-list's words,
    writing the generated pronunciations to the target.

    sources: pronunciation list, g2p model.
    Returns None on success, the tool's stderr on failure.
    """
    with temp_file() as tfname, meta_open(source[0].rstr()) as pl_fd:
        # strip pronunciations and "(n)" variant markers to get the bare words
        vocabulary = set(line.split()[0].split("(")[0] for line in pl_fd)
        with meta_open(tfname, "w") as word_fd:
            word_fd.write("\n".join(vocabulary))
        cmd = env.subst("%s %s/bin/g2p.py --model %s --encoding=%s --apply %s --variants-mass=%f --variants-number=%d" %
                        (env["PYTHON"], env["OVERLAY"], source[1].rstr(), "utf-8", tfname, .9, 4))
        out, err, success = run_command(cmd,
                                        env={"PYTHONPATH" : env.subst("${OVERLAY}/lib/python2.7/site-packages")},
                                        )
    if not success:
        return err
    with meta_open(target[0].rstr(), "w") as out_fd:
        out_fd.write(out)
    return None
def build_index(target, source, env):
    """
    Creates an index of files listed in the input, using the IBM binary 'buildindex'.

    Usage: buildindex [-opts] [lattice_list] [output_file]
      -f file   filter fst (default: none)
      -p        push costs
      -J int    job-batch (for parallel run)
      -N int    total number of jobs (for parallel run)
      -v        verbose: all debug output printed to stderr
    """
    cmd = env.subst("${BUILDINDEX} -p ${SOURCE} ${TARGET}", target=target, source=source)
    _, err, ok = run_command(cmd,
                             env={"LD_LIBRARY_PATH" : env.subst(env["LIBRARY_OVERLAY"])},
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
    return None if ok else err
def train_pronunciation_model(target, source, env):
    """
    Train a Sequitur g2p pronunciation model from a dictionary file, fed via stdin.

    sources: training dictionary, devel percentage, optional previous model
    (triggers --ramp-up).  Equivalent of:
      g2p.py --train - --devel 5% --model test.model2 --ramp-up --write-model test.model3
    """
    train_fname = source[0].rstr()
    dev_percent = source[1].read()
    if len(source) == 3:
        # ramp up from a previously trained model
        cmd = "${SEQUITUR_PATH}/bin/g2p.py --train - --devel %d%% --write-model %s --ramp-up --model %s" % (dev_percent, target[0].rstr(), source[2].rstr())
    else:
        cmd = "${SEQUITUR_PATH}/bin/g2p.py --train - --devel %d%% --write-model %s" % (dev_percent, target[0].rstr())
    # keep only real lexicon entries, reformatted as "word pron..." lines
    entry_pattern = r"^(\S+)\(\d+\) (\S+) \[ wb \] (.*) \[ wb \]$"
    with open(train_fname) as ifd:
        kept = [re.sub(entry_pattern, r"\1 \2 \3", raw.strip())
                for raw in ifd
                if "REJ" not in raw and raw[0] != "<" and "SIL" not in raw]
    data = "\n".join(kept)
    out, err, success = run_command(env.subst(cmd),
                                    env={"PYTHONPATH" : env.subst("${SEQUITUR_PATH}/lib/python2.7/site-packages")},
                                    data=data)
    if success:
        return None
    return err
def score(target, source, env):
    """
    Score a KWS result list with NIST F4DE's BABEL13_Scorer and copy the
    scorer's output tree over the target directory.

    sources: [KWS list XML, ..., args dict (last)]; args may supply EXPID.
    Returns None on success, combined stderr+stdout on failure.
    NEEDS WORK! CONVERT TO BUILDER!
    """
    args = source[-1].read()
    with temp_dir("kws_work") as work_dir, temp_dir("kws_out") as out_dir:
        cmd = env.subst("${PERL} ${F4DE}/bin/BABEL13_Scorer -XmllintBypass -sys ${SOURCE} -dbDir ${INDUS_DB} -comp %s -res %s -exp %s" %
                        (work_dir, out_dir, args.get("EXPID", "KWS13_IBM_babel106b-v0.2g_conv-dev_BaDev_KWS_FullLP_BaseLR_NTAR_p-test-STO_1")),
                        source=source)
        stdout, stderr, success = run_command(cmd,
                                              env={"LD_LIBRARY_PATH" : env.subst("${LIBRARY_OVERLAY}"),
                                                   "F4DE_BASE" : env.subst(env["F4DE"]),
                                                   "PERL5LIB" : env.subst("$PERL_LIBRARIES"),
                                                   "PATH" : ":".join([env.subst("${OVERLAY}/bin")] + os.environ["PATH"].split(":"))})
        if not success:
            return stderr + stdout
        # replace the target directory wholesale with the scorer's output tree
        shutil.rmtree(os.path.dirname(target[0].rstr()), ignore_errors=False)
        shutil.copytree(out_dir, os.path.dirname(target[0].rstr()))
    # (the original carried a large block of dead, syntactically-broken
    # commented-out KWSEval invocation code after its return; removed)
    return None
def standard_search(target, source, env):
    """
    Run the IBM 'stdsearch' binary over an index to produce a result list.

    Usage: stdsearch [-opts] [result_file] [query_file]
      -d file     data file [data.list]: lines of "utt_name start_time fst_path"
      -f filt     filter fst (default: none)
      -i fst      index fst (default: index.fst)
      -n N        return N-best results (default: all)
      -p fst      pad fst (default: fspad.fst)
      -s symbols  arc symbols (default: word.list)
      -t thresh   min score for YES (overrides term-spec-threshold)
      -T t/f      true=text queries (default), false=fst queries
      -J/-N int   job batch / total jobs (parallel run)
      -a string   title on results list
      -b string   prefix on termid
      -m string   termid numerical formatting string
      -O          don't optimize
    """
    # expected source layout (unused here, kept as documentation of the inputs)
    data_list, isym, idx, pad, queryph = source[0:5]
    args = source[-1].read()
    if source[-2].stat().st_size == 0:
        # empty query file: emit an empty stdlist and report success
        with meta_open(target[0].rstr(), "w") as ofd:
            ofd.write("""<stdlist termlist_filename="std.xml" indexing_time="68.51" language="english" index_size="" system_id="" />\n""")
        return None
    cmd = env.subst("${STDSEARCH} -F ${TARGET} -i ${SOURCES[2]} -b KW%(LANGUAGE_ID)s- -s ${SOURCES[1]} -p ${SOURCES[3]} -d ${SOURCES[0]} -a %(TITLE)s -m %(PRECISION)s ${SOURCES[4]}" % args,
                    target=target, source=source)
    _, err, ok = run_command(cmd,
                             env={"LD_LIBRARY_PATH" : env.subst(env["LIBRARY_OVERLAY"])},
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
    return None if ok else err
def score_results(target, source, env):
    """
    Merge per-file CTM hypotheses into one CTM and score it against a reference
    transcript with NIST sclite.

    sources: CTM directory, reference transcript (STM format).
    Output (all.ctm plus sclite reports) goes to target[0]'s directory.
    Returns None on success, combined output on failure.
    """
    ctm_path = source[0].rstr()
    transcript = source[1].rstr()
    out_path = os.path.dirname(target[0].rstr())
    # Get a list of IDs from the reference. All must appear in the CTM output
    spkD = set()
    with codecs.open(transcript, "rb", encoding="utf-8") as f:
        for line in f:
            if line.startswith(";;"):
                # STM comment line
                continue
            spkD.add(line.split()[0])
    # skip eval data
    isEval = re.compile("/eval/")
    # Merge and clean up CTM: drop silence and sentence/hesitation markers
    skipD = frozenset([u"~SIL", u"<s>", u"</s>", u"<HES>", u"<hes>"])
    ctmL = []
    for file_ in glob(pjoin(ctm_path, "*.ctm")):
        with codecs.open(file_, "rb", encoding="utf-8") as ctmF:
            for line in ctmF:
                uttid, pcm, beg, dur, token = line.split()
                if isEval.search(pcm):
                    continue
                # drop the token's last 4 chars -- presumably a fixed-width
                # suffix/tag appended by the decoder; TODO confirm CTM format
                token = token[:-4]
                if token in skipD:
                    continue
                # speaker id is the uttid up to the first '#'
                idx = uttid.find("#")
                spk = uttid[:idx]
                spkD.discard(spk)
                ctmL.append((spk, float(beg), dur, token))
    ctmL.sort()
    # add in missing speakers (placeholder "@" token so sclite sees every id)
    for spk in spkD:
        bisect.insort(ctmL, (spk, 0.0, "0.0", "@"))
    with codecs.open(pjoin(out_path, "all.ctm"), "wb", encoding="utf-8") as outF:
        for ctm in sorted(ctmL):
            outF.write("%s 1 %7.3f %s %s\n" % ctm)
    args = {"SCLITE" : env["SCLITE_BINARY"],
            "TRANSCRIPT" : transcript,
            "TRANSCRIPT_FORMAT" : "stm",
            "HYPOTHESIS" : os.path.abspath(pjoin(out_path, "all.ctm")),
            "HYPOTHESIS_FORMAT" : "ctm",
            "ENCODING" : "utf-8",
            "OUTPUT_NAME" : "babel",
            "OUTPUT_ROOT" : os.path.abspath(out_path),
            "OUTPUT_TYPES" : "all dtl sgml",
            }
    # Run scoring
    cmd = env.subst("%(SCLITE)s -r %(TRANSCRIPT)s %(TRANSCRIPT_FORMAT)s -O %(OUTPUT_ROOT)s -h %(HYPOTHESIS)s %(HYPOTHESIS_FORMAT)s -n %(OUTPUT_NAME)s -o %(OUTPUT_TYPES)s -e %(ENCODING)s -D -F" % args)
    out, err, success = run_command(cmd)
    if not success:
        return out + err
    return None