Пример #1
0
def split_words(target, source, env, words):
    """Partition words by whether the MILA morphological analyzer recognizes them.

    The words are written, space-separated, to a scratch file, which is run through
    ${MILA_PATH}/tokenizer.jar and then ${MILA_PATH}/morphAnalyzer.jar.  The analyzer's
    "file.xml" output is scanned: a token that contains any "unknown" element is
    collected as bad, every other token as good.

    Arguments:
      target, source: not used by this function
      env: construction environment, used to expand ${MILA_PATH}
      words: iterable of strings to analyze

    Returns: (good, bad), two sets of token "surface" attribute values
    """
    good, bad = set(), set()
    mila_path = env.subst("${MILA_PATH}")
    with temp_dir(remove=True) as raw, temp_dir(remove=True) as tokenized, temp_dir(remove=True) as analyzed:
        with meta_open(os.path.join(raw, "file.txt"), "w") as ofd:
            ofd.write(" ".join(words))

        # Each stage reads one scratch directory and writes the next.  The argument
        # vectors are built as lists so that a path containing whitespace stays a
        # single argument (splitting a formatted command string would break it up).
        stages = [
            ["java", "-Xmx4024M", "-jar", "%s/tokenizer.jar" % mila_path, raw, tokenized],
            ["java", "-Xmx4024M", "-jar", "%s/morphAnalyzer.jar" % mila_path, "false", tokenized, analyzed],
        ]
        for argv in stages:
            proc = Popen(argv, cwd=mila_path, stdout=PIPE, stderr=PIPE)
            # communicate() waits for the process and drains both pipes; the output and
            # exit status are not inspected here.
            proc.communicate()

        with meta_open(os.path.join(analyzed, "file.xml")) as ifd:
            tree = et.parse(ifd)
            # iter() replaces getiterator(), which was removed in Python 3.9.
            for token in tree.iter("token"):
                word = token.get("surface")
                if next(token.iter("unknown"), None) is None:
                    good.add(word)
                else:
                    bad.add(word)
    return (good, bad)
Пример #2
0
def train_language_model(target, source, env):
    """Train an n-gram language model using a plain text transcript.

    Uses IBM's compiled LM tools that ship with Attila.  This can also be used on a segmented transcript,
    in which case the n-grams are over morphs rather than words.

    Each transcript line is wrapped in "<s> ... </s>" markers and written to a temporary sentence file;
    the vocabulary (every whitespace-separated token plus "<s>", "</s>" and "<UNK>") goes to a temporary
    vocabulary file.  CountNGram is run on those, then BuildNGram.sh on the resulting counts.

    Sources: transcript file, n
    Targets: language model file

    Returns: None on success, otherwise the stderr text of the command that failed.
    """
    text_file = source[0].rstr()
    n = source[1].read()
    with temp_dir() as prefix_dir, temp_file() as vocab_file, temp_file(suffix=".txt") as sentence_file, meta_open(text_file) as text_fd:
        # Strip the line terminator before wrapping; otherwise the newline ends up inside the
        # sentence and "</s>" is written on a line of its own.
        sentences = ["<s> %s </s>" % (line.rstrip("\r\n")) for line in text_fd]

        # Accumulate the vocabulary in place rather than concatenating lists with sum(),
        # which copies the growing list once per sentence.
        words = {"<s>", "</s>", "<UNK>"}
        for sentence in sentences:
            words.update(sentence.split())

        with meta_open(vocab_file, "w") as ofd:
            ofd.write("\n".join(words))
        with meta_open(sentence_file, "w") as ofd:
            ofd.write("\n".join(sentences))

        prefix = os.path.join(prefix_dir, "counts")
        cmd = "${ATTILA_PATH}/tools/lm_64/CountNGram -n %d %s %s %s" % (n, sentence_file, vocab_file, prefix)
        out, err, success = run_command(env.subst(cmd))
        if not success:
            return err

        # The output name handed to BuildNGram.sh is the target path minus its last two
        # dot-separated components (presumably the script appends them itself -- confirm).
        lm = ".".join(target[0].rstr().split(".")[0:-2])
        cmd = "${ATTILA_PATH}/tools/lm_64/BuildNGram.sh -n %d -arpabo %s %s" % (n, prefix, lm)
        out, err, success = run_command(env.subst(cmd), env={"SFCLMTOOLS" : env.subst("${ATTILA_PATH}/tools/lm_64")})
        if not success:
            return err

    return None
Пример #3
0
def score(target, source, env):
    """Run F4DE's BABEL13_Scorer and copy its result directory over the target's directory.

    NEEDS WORK!
    CONVERT TO BUILDER!

    The scorer is run with temporary work and result directories.  On success, the
    directory containing target[0] is replaced wholesale by the scorer's result directory.

    Sources: system output (expanded as ${SOURCE}), ..., and last a value whose read() returns a
             mapping that may supply "EXPID"
    Targets: a file inside the directory that receives the scorer results

    Returns: None on success, otherwise the scorer's stderr followed by its stdout.
    """
    args = source[-1].read()
    target_dir = os.path.dirname(target[0].rstr())

    with temp_dir("kws_work") as work_dir, temp_dir("kws_out") as out_dir:
        cmd = env.subst("${PERL} ${F4DE}/bin/BABEL13_Scorer -XmllintBypass -sys ${SOURCE} -dbDir ${INDUS_DB} -comp %s -res %s -exp %s" % (work_dir, out_dir, args.get("EXPID", "KWS13_IBM_babel106b-v0.2g_conv-dev_BaDev_KWS_FullLP_BaseLR_NTAR_p-test-STO_1")), source=source)
        stdout, stderr, success = run_command(cmd, env={"LD_LIBRARY_PATH" : env.subst("${LIBRARY_OVERLAY}"),
                                                        "F4DE_BASE" : env.subst(env["F4DE"]),
                                                        "PERL5LIB" : env.subst("$PERL_LIBRARIES"),
                                                        "PATH" : ":".join([env.subst("${OVERLAY}/bin")] + os.environ["PATH"].split(":"))})
        if not success:
            return stderr + stdout

        # copytree() requires that the destination does not exist, so clear it first.
        # Only attempt the removal when there is something to remove; a genuine failure
        # to delete still raises.
        if os.path.exists(target_dir):
            shutil.rmtree(target_dir, ignore_errors=False)
        shutil.copytree(out_dir, target_dir)
    return None