def split_words(target, source, env, words):
    """Partition words by whether the MILA morphological analyzer knows them.

    The words are written, space-separated, to ``file.txt`` in a temporary
    directory.  That directory is run through ``tokenizer.jar`` and the
    result through ``morphAnalyzer.jar`` (both found under ``${MILA_PATH}``).
    The ``file.xml`` produced by the analyzer is then scanned: the
    ``surface`` attribute of each ``token`` element goes into the "bad" set
    when the token contains at least one ``unknown`` element, and into the
    "good" set otherwise.

    Args:
        target: Not used by this function.
        source: Not used by this function.
        env: Object whose ``subst`` method expands ``${MILA_PATH}``
            (presumably an SCons environment -- confirm against callers).
        words: Iterable of strings to analyze.

    Returns:
        A ``(good, bad)`` tuple of sets of surface forms.
    """
    good, bad = set(), set()
    mila_path = env.subst("${MILA_PATH}")
    java = ["java", "-Xmx4024M", "-jar"]

    with temp_dir(remove=True) as raw, temp_dir(remove=True) as tokenized, temp_dir(remove=True) as analyzed:
        with meta_open(os.path.join(raw, "file.txt"), "w") as ofd:
            ofd.write(" ".join(words))

        # Build each argv as a list instead of splitting a formatted string,
        # so paths containing whitespace are passed through intact.
        stages = [
            [env.subst("${MILA_PATH}/tokenizer.jar"), raw, tokenized],
            [env.subst("${MILA_PATH}/morphAnalyzer.jar"), "false", tokenized, analyzed],
        ]
        for stage_args in stages:
            proc = Popen(java + stage_args, cwd=mila_path, stdout=PIPE, stderr=PIPE)
            # communicate() drains both pipes and waits for the tool to exit;
            # the captured output is not inspected.
            proc.communicate()

        with meta_open(os.path.join(analyzed, "file.xml")) as ifd:
            tree = et.parse(ifd)

        # iter() replaces getiterator(), which newer ElementTree versions
        # no longer provide.
        for token in tree.iter("token"):
            surface = token.get("surface")
            has_unknown = any(True for _ in token.iter("unknown"))
            if has_unknown:
                bad.add(surface)
            else:
                good.add(surface)
    return (good, bad)
def train_language_model(target, source, env):
    """Train an n-gram language model using a plain text transcript.

    Uses IBM's compiled LM tools that ship with Attila.  This can also be used
    on a segmented transcript, in which case the n-grams are over morphs
    rather than words.

    Sources: transcript file, n
    Targets: language model file

    Returns:
        None on success; otherwise the ``err`` value reported by
        ``run_command`` for the tool that failed.
    """
    text_file = source[0].rstr()
    n = source[1].read()
    with temp_dir() as prefix_dir, temp_file() as vocab_file, temp_file(suffix=".txt") as sentence_file, meta_open(text_file) as text_fd:
        # Strip each line before wrapping it.  Lines read from the file keep
        # their trailing newline, which would otherwise end up in front of
        # "</s>" and split every sentence across two lines in the output.
        sentences = ["<s> %s </s>" % line.strip() for line in text_fd]

        # Collect the vocabulary incrementally; summing lists of tokens is
        # quadratic in the size of the transcript.
        vocabulary = set(["<s>", "</s>", "<UNK>"])
        for sentence in sentences:
            vocabulary.update(sentence.split())

        with meta_open(vocab_file, "w") as ofd:
            ofd.write("\n".join(vocabulary))
        with meta_open(sentence_file, "w") as ofd:
            ofd.write("\n".join(sentences))

        # Step 1: count n-grams into files sharing the "counts" prefix.
        prefix = os.path.join(prefix_dir, "counts")
        cmd = "${ATTILA_PATH}/tools/lm_64/CountNGram -n %d %s %s %s" % (n, sentence_file, vocab_file, prefix)
        out, err, success = run_command(env.subst(cmd))
        if not success:
            return err

        # Step 2: build the model.  The output name handed to the script is
        # the target path minus its last two dot-separated components
        # (presumably the script appends its own suffixes -- confirm).
        lm = ".".join(target[0].rstr().split(".")[0:-2])
        cmd = "${ATTILA_PATH}/tools/lm_64/BuildNGram.sh -n %d -arpabo %s %s" % (n, prefix, lm)
        out, err, success = run_command(env.subst(cmd), env={"SFCLMTOOLS" : env.subst("${ATTILA_PATH}/tools/lm_64")})
        if not success:
            return err
    return None
def score(target, source, env):
    """Run the F4DE BABEL13 scorer and copy its results to the target directory.

    TODO: needs work; convert to a builder.

    The last source is read as a mapping of options; its optional ``EXPID``
    entry is passed to the scorer as the experiment id.  The scorer runs with
    temporary work and result directories.  On success the directory holding
    ``target[0]`` is replaced by a copy of the result directory.

    Args:
        target: Target nodes; only the directory of ``target[0]`` is used.
        source: Source nodes; ``${SOURCE}`` is expanded by ``env.subst`` and
            ``source[-1]`` holds the option mapping.
        env: Object providing ``subst`` and item access
            (presumably an SCons environment -- confirm against callers).

    Returns:
        None on success; otherwise the scorer's stderr followed by its
        stdout.
    """
    args = source[-1].read()
    expid = args.get("EXPID", "KWS13_IBM_babel106b-v0.2g_conv-dev_BaDev_KWS_FullLP_BaseLR_NTAR_p-test-STO_1")
    result_dir = os.path.dirname(target[0].rstr())

    # Put the overlay's bin directory ahead of the inherited PATH; tolerate an
    # unset or empty PATH instead of raising KeyError.
    inherited_path = os.environ.get("PATH", "")
    search_path = [env.subst("${OVERLAY}/bin")]
    if inherited_path:
        search_path += inherited_path.split(":")

    with temp_dir("kws_work") as work_dir, temp_dir("kws_out") as out_dir:
        cmd = env.subst("${PERL} ${F4DE}/bin/BABEL13_Scorer -XmllintBypass -sys ${SOURCE} -dbDir ${INDUS_DB} -comp %s -res %s -exp %s" % (work_dir, out_dir, expid), source=source)
        stdout, stderr, success = run_command(cmd, env={"LD_LIBRARY_PATH" : env.subst("${LIBRARY_OVERLAY}"),
                                                        "F4DE_BASE" : env.subst(env["F4DE"]),
                                                        "PERL5LIB" : env.subst("$PERL_LIBRARIES"),
                                                        "PATH" : ":".join(search_path)})
        if not success:
            return stderr + stdout

        # copytree() needs to create the destination itself, so clear any
        # previous results first.  Only attempt removal when the directory
        # exists; real removal failures still raise.
        if os.path.isdir(result_dir):
            shutil.rmtree(result_dir, ignore_errors=False)
        shutil.copytree(out_dir, result_dir)
    return None