Example #1
def segmented_pronunciations(target, source, env):
    """Takes a word pronunciation file and a word segmentation file and creates a "segmented word pronunciation file" suitable for training G2P

    Sources: pronunciation file, word segmentation file
    Targets: segmented word pronunciation file, morph list file
    """
    sep = "+"
    data = {}
    segs = {}
    morphs = set()
    w2w = {}
    with meta_open(source[0].rstr()) as pron_fd, meta_open(source[1].rstr()) as seg_fd:
        for l in pron_fd:
            word, pron = re.match(r"^(\S+)\(\d+\) (.*)$", l.strip().replace(" [ wb ]", "")).groups()
            if env.get("LOWER_CASE"):
                word = word.lower()
            data[word] = data.get(word, []) + [pron]
        vals = [[y.strip(sep) for y in x.split()] for x in seg_fd]
        if env.get("LOWER_CASE"):
            vals = [[y.lower() for y in x] for x in vals]
        segs = {sep.join(x) : x for x in vals}
        w2s = {"".join(x) : sep.join(x) for x in vals}
        for m in segs.values():
            if len(m) == 1:
                morphs.add(m[0])
            if len(m) >= 2:
                # first morph gets a trailing separator, the final morph a leading one,
                # and internal morphs are marked on both sides
                morphs.add("%s%s" % (m[0], sep))
                morphs.add("%s%s" % (sep, m[-1]))
                for x in m[1:-1]:
                    morphs.add("%s%s%s" % (sep, x, sep))
    with meta_open(target[0].rstr(), "w") as seg_ofd, meta_open(target[1].rstr(), "w") as morph_ofd:
        seg_ofd.write("\n".join(sum([["%s %s" % (w2s.get(k, k), p) for p in v] for k, v in data.iteritems()], [])))
        morph_ofd.write("\n".join(morphs))
    return None
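For reference, the morph-marking convention produced above: in a multi-morph word the first morph carries a trailing "+", the last a leading "+", and internal morphs both. A minimal sketch, using an illustrative segmentation that is not taken from the project data:

def mark_morphs(morphs, sep="+"):
    # reproduce the boundary-marking scheme of segmented_pronunciations
    if len(morphs) == 1:
        return list(morphs)
    return ([morphs[0] + sep]
            + [sep + m + sep for m in morphs[1:-1]]
            + [sep + morphs[-1]])

# mark_morphs(["un", "break", "able"]) -> ["un+", "+break+", "+able"]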
Example #2
def split_words(target, source, env, words):
    #print len(words)
    good, bad = set(), set()
    r = True
    with temp_dir(remove=r) as raw, temp_dir(remove=r) as tokenized, temp_dir(remove=r) as analyzed:
        with meta_open(os.path.join(raw, "file.txt"), "w") as ofd:
            ofd.write(" ".join(words)) #.encode("utf-8"))
        cmd = env.subst("java -Xmx4024M -jar ${MILA_PATH}/tokenizer.jar %s %s" % (raw, tokenized))
        pid = Popen(cmd.split(), cwd=env.subst("${MILA_PATH}"), stdout=PIPE, stderr=PIPE)
        out, err = pid.communicate()
        #print out, err
        cmd = env.subst("java -Xmx4024M -jar ${MILA_PATH}/morphAnalyzer.jar false %s %s" % (tokenized, analyzed))
        pid = Popen(cmd.split(), cwd=env.subst("${MILA_PATH}"), stdout=PIPE, stderr=PIPE)
        out, err = pid.communicate()
        #print out, err
        with meta_open(os.path.join(analyzed, "file.xml")) as ifd:
            xml = et.parse(ifd)
            for token in xml.getiterator("token"):
                word = token.get("surface")
                unk = [x for x in token.getiterator("unknown")]
                if len(unk) == 0:
                    good.add(word)
                else:
                    bad.add(word)
    return (good, bad)
Example #3
def alter_iv_oov(target, source, env):
    """
    NEEDS WORK!
    If the vocabulary has been expanded, some OOV terms are now IV.
    """
    iv_q, oov_q, iv, term_map, kw_file, w2w_file = source
    with meta_open(iv_q.rstr()) as iv_q_fd, meta_open(oov_q.rstr()) as oov_q_fd, meta_open(iv.rstr()) as iv_fd, meta_open(term_map.rstr()) as term_map_fd, meta_open(kw_file.rstr()) as kw_file_fd, meta_open(w2w_file.rstr()) as w2w_fd:
        iv_queries = [x.strip() for x in iv_q_fd]
        oov_queries = [x.strip() for x in oov_q_fd]
        iv_words = [x.strip().split("(")[0] for x in iv_fd]
        oov_to_iv_indices = [i for i, q in enumerate(oov_queries) if all([x in iv_words for x in q.split()])]
        oov_to_oov_indices = enumerate([i for i, q in enumerate(oov_queries) if not all([x in iv_words for x in q.split()])])
        new_iv_queries = iv_queries + [oov_queries[i] for i in oov_to_iv_indices]
        new_oov_queries = [x for i, x in enumerate(oov_queries) if i not in oov_to_iv_indices]
        old_mapping = {(y[0], int(y[2])) : y[1] for y in [x.strip().split() for x in term_map_fd]}
        new_mapping = old_mapping.copy()
        for i, old_oov_num in enumerate(oov_to_iv_indices):
            x = old_mapping[("oov", old_oov_num + 1)]
            new_iv_num = len(iv_queries) + i + 1
            del new_mapping[("oov", old_oov_num + 1)]
            new_mapping[("iv", new_iv_num)] = x
        for new_oov, old_oov in oov_to_oov_indices:
            x = old_mapping[("oov", old_oov + 1)]
            del new_mapping[("oov", old_oov + 1)]
            new_mapping[("oov", new_oov + 1)] = x
        #old_xml = et.fromstring(kw_file_fd.read())
        new_w2w = [" ".join(y) for y in set([tuple(x.split()) for x in w2w_fd if len(x.split()) == 5] + [("0", "0", x, x, "0") for x in iv_words])]
        open(target[0].rstr(), "w").write("\n".join(new_iv_queries) + "\n")
        open(target[1].rstr(), "w").write("\n".join(new_oov_queries) + "\n")
        #open(target[2].rstr(), "w").write(open(term_map.rstr()).read())
        open(target[2].rstr(), "w").write("\n".join(["%s %s %0.5d" % (s, on, n) for (s, n), on in sorted(new_mapping.iteritems(), lambda x, y : cmp(x[1], y[1]))]))
        open(target[3].rstr(), "w").write(kw_file_fd.read())
        open(target[4].rstr(), "w").write("\n".join(new_w2w) + "\n0\n")
    return None
Example #4
def strip_logging(target, source, env):
    for t, s in zip(target, source):
        with meta_open(s.rstr()) as ifd:
            lines = [l for l in ifd if "logger.fin" not in l and "assert" not in l]
            with meta_open(t.rstr(), "w") as ofd:
                ofd.write("".join(lines))
    return None
Example #5
def query_files(target, source, env):
    # Sources: keyword xml file, IV pronunciation file, language id, vocabulary to remove (optional)
    # Targets: IV query file, OOV query file, keyword map file, w2w file, rewritten keyword xml file
    remove_vocab = source[-1].read()
    with meta_open(source[0].rstr()) as kw_fd, meta_open(source[1].rstr()) as iv_fd:
        keyword_xml = et.parse(kw_fd)
        keywords = set([(x.get("kwid"), x.find("kwtext").text.lower()) for x in keyword_xml.getiterator("kw")])
        #print list(keywords)[0][1].split()
        vocab = [x.decode("utf-8") for x in Pronunciations(iv_fd).get_words()]
        #print list(vocab)[0].split()
        #set([x.split()[1].strip().decode("utf-8") for x in iv_fd])
        if remove_vocab:
            remove_vocab = Vocabulary(meta_open(remove_vocab)).get_words()
        else:
            remove_vocab = []
        iv_keywords = sorted([(int(tag.split("-")[-1]), tag, term) for tag, term in keywords if all([y in vocab for y in term.split()]) and term not in remove_vocab])
        oov_keywords = sorted([(int(tag.split("-")[-1]), tag, term) for tag, term in keywords if any([y not in vocab for y in term.split()])])
        language_id = source[-2].read()
        with meta_open(target[0].rstr(), "w") as iv_ofd, meta_open(target[1].rstr(), "w") as oov_ofd, meta_open(target[2].rstr(), "w") as map_ofd, meta_open(target[3].rstr(), "w") as w2w_ofd, meta_open(target[4].rstr(), "w") as kw_ofd:
            iv_ofd.write("\n".join([x[2].encode("utf-8") for x in iv_keywords]))
            oov_ofd.write("\n".join([x[2].encode("utf-8") for x in oov_keywords]))
            map_ofd.write("\n".join(["%s %.5d %.5d" % x for x in 
                                     sorted([("iv", gi, li) for li, (gi, tag, term) in enumerate(iv_keywords, 1)] + 
                                            [("oov", gi, li) for li, (gi, tag, term) in enumerate(oov_keywords, 1)], lambda x, y : cmp(x[1], y[1]))]))
            w2w_ofd.write("\n".join([("0 0 %s %s 0" % (x.encode("utf-8"), x.encode("utf-8"))) for x in vocab if x != "VOCAB_NIL_WORD"] + ["0"]))
            for x in keyword_xml.getiterator("kw"):
                x.set("kwid", "KW%s-%s" % (language_id, x.get("kwid").split("-")[-1]))
            keyword_xml.write(kw_ofd) #.write(et.tostring(keyword_xml.))
    return None
Example #6
def probability_list_to_vocabulary(target, source, env):
    with meta_open(source[0].rstr()) as ifd:
        probs = ProbabilityList(ifd)
    with meta_open(target[0].rstr(), "w") as ofd:
        vocab = Vocabulary.from_set(probs.get_words())
        ofd.write(vocab.format())
    return None
Example #7
def perform_search(target, source, env):
    """Searches for each query term in the index.

    Sources: index file, phone symbol file, index symbol file, fst header, keyword 1, keyword 2 ...
    Targets: search result file
    """
    index, phone_symbols, index_symbols, fst_header = source[0:4]
    terms = source[4:]
    prune = env.get("PRUNE")
    results = []
    for f in terms:
        with meta_open(f.rstr()) as ifd:
            for fname in ifd.getnames():
                key = fname.split(".")[0]
                e = codecs.getreader("utf-8")
                query = ifd.extractfile(fname).read()
                cmd = env.subst("fstcompose - ${SOURCES[0]} | fstprune -weight=${PRUNE} - | fstrmepsilon | fstprint -isymbols=${SOURCES[1]}  -osymbols=${SOURCES[2]} -  | cat ${SOURCES[3]} - | bin/FsmOp -out-cost - -n-best 50000 -gen | perl bin/process.1.pl - 100 1e-40 %s | sort -k 5 -gr | perl ${CN_KWS_SCRIPTS}/clean_result.words.pl -" % key, source=source, target=target)
                pid = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
                out, err = pid.communicate(query)
                if not re.match(r"^\s*$", out):
                    for line in out.strip().split("\n"):
                        toks = line.split()
                        toks[3] = key
                        results.append(" ".join(toks))
    with meta_open(target[0].rstr(), "w", None) as ofd:
        ofd.write("\n".join(results).strip() + "\n")            
    return None
Example #8
def freqs_to_tab(target, source, env):
    items = {}
    data = {}
    tempdata = {}
    totals = {}
    avs = {}
    for f in source[0].children():
        subject = os.path.basename(f.rstr()).split("_")[0]
        tempdata[subject] = {}
        for r in csv.reader(meta_open(f.rstr()), delimiter="\t"):
            if r[1] == "COUNT":
                continue
            tempdata[subject][r[0]] = r[1]
            totals[subject] = totals.get(subject, 0) + int(r[1])
            items[r[0]] = items.get(r[0], 0) + int(r[1])
    for s, wordcounts in tempdata.iteritems():
        data[s] = {}
        for w, c in wordcounts.iteritems():
            data[s][w] = float(tempdata[s].get(w, 0)) / float(totals[s])
            avs[w] = avs.get(w, []) + [data[s][w]]
    # items = sorted([x for x, y in items.iteritems() if y > 20000 and not re.match("^.*\W.*$", x)])
    items = sorted([x for x, y in items.iteritems()], lambda x, y: cmp(sum(avs[y]), sum(avs[x])))[0:min(2000, len(items))]
    fd = csv.writer(meta_open(target[0].rstr(), "w"), delimiter="\t")
    fd.writerow(["SUBJECT"] + ["'%s'" % esc_weka(x) for x in items])
    for k, v in data.iteritems():
        fd.writerow(["'%s'" % esc_weka(k)] + [v.get(x, 0) for x in items])
    return None
Example #9
def prepare_segmentations_for_release(target, source, env):
    """Intended to produce files suitable for shipping directly to partners.

    This is not well-planned, and should probably not be used as-is, but I think
    it would be useful to incorporate something automatic into the build system
    to make our deliverables more consistent.

    Sources: segmentation file 1, word file 1, segmentation file 2, word file 2 ...
    Targets: deliverable file 1, deliverable file 2 ...
    """
    nag = env.get("NON_ACOUSTIC_GRAPHEMES")
    rx_str = "^(%s)+$" % ("|".join([unichr(int(x, base=16)) for x in env.get("NON_ACOUSTIC_GRAPHEMES")]))
    rx = re.compile(rx_str)
    for (seg_file, word_file), out in zip(pairs(source, 2), target):
        with meta_open(seg_file.rstr()) as ifd:
            data = [line.strip().split() for line in ifd]
            morphs = {"".join([x.strip("+") for x in ms]) : ms for ms in data}
        with meta_open(word_file.rstr()) as ifd:
            lines = [l.strip().split() for l in ifd if "_" not in l]
            
        for words in lines:
            for word in sum([x.split("-") for x in words], []):
                if word != "" and word not in morphs and "_" not in word and "<" not in word and not re.match(r"^\d+$", word):
                    return "%s, %s, %s" % (seg_file, word_file, word)
                    
        with meta_open(out.rstr(), "w") as ofd:
            for morph, seg in sorted(morphs.iteritems()):
                ofd.write("%s\t%s\n" % (morph, " ".join(seg)))

    return None
Example #10
def conllish_to_xml(target, source, env):
    with meta_open(source[0].rstr()) as ifd:
        sentences = [[(w, t, []) for w, t in [re.split(r"\s+", x) for x in s.split("\n") if not re.match(r"^\s*$", x)]] for s in re.split(r"\n\n", ifd.read(), flags=re.M)]
    data = DataSet.from_sentences(sentences)
    with meta_open(target[0].rstr(), "w") as ofd:
        data.write(ofd)
    return None
Example #11
def apply_morfessor(target, source, env):
    """Applies a trained Morfessor model to an unseen word list.

    Sources: morfessor model file, word list file
    Targets: segmented word list
    """
    parser = get_default_argparser()
    args = parser.parse_args([])
    io = MorfessorIO(encoding=args.encoding,
                     compound_separator=args.cseparator,
                     atom_separator=args.separator)
    model = io.read_binary_model_file(source[0].rstr())
    words = []
    terms = {}
    for fname in source[1:]:
        try:
            with meta_open(fname.rstr(), enc=None) as ifd:
                for t in et.parse(ifd).getiterator("kw"):
                    text = list(t.getiterator("kwtext"))[0].text
                    words += text.strip().split()
        except:
            # plain word-list file: accumulate the first token of each line
            with meta_open(fname.rstr()) as ifd:
                words += [l.strip().split()[0] for l in ifd]
    words = set(sum([w.strip("-").split("-") for w in words if "_" not in w], []))
    for w in words:
        toks, score = model.viterbi_segment(w)
        if len(toks) >= 2:
            toks = ["%s+" % toks[0]] + ["+%s+" % t for t in toks[1:-1]] + ["+%s" % toks[-1]]
        terms[w] = toks
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write(("\n".join(sorted(["%s" % (" ".join(v)) for k, v in terms.iteritems()]))) + "\n")
    return None
Example #12
def train_language_model(target, source, env):
    """Train an n-gram language model using a plain text transcript.

    Uses IBM's compiled LM tools that ship with Attila.  This can also be used on a segmented transcript,
    in which case the n-grams are over morphs rather than words.

    Sources: transcript file, n
    Targets: language model file
    """
    text_file = source[0].rstr()
    n = source[1].read()
    with temp_dir() as prefix_dir, temp_file() as vocab_file, temp_file(suffix=".txt") as sentence_file, meta_open(text_file) as text_fd:
        sentences = ["<s> %s </s>" % (l) for l in text_fd]
        words =  set(sum([s.split() for s in sentences], []) + ["<s>", "</s>", "<UNK>"])
        with meta_open(vocab_file, "w") as ofd:
            ofd.write("\n".join(words))
        with meta_open(sentence_file, "w") as ofd:
            ofd.write("\n".join(sentences))
        prefix = os.path.join(prefix_dir, "counts")
        cmd = "${ATTILA_PATH}/tools/lm_64/CountNGram -n %d %s %s %s" % (n, sentence_file, vocab_file, prefix)
        out, err, success = run_command(env.subst(cmd))
        if not success:
            return err
        
        lm = ".".join(target[0].rstr().split(".")[0:-2])
        cmd = "${ATTILA_PATH}/tools/lm_64/BuildNGram.sh -n %d -arpabo %s %s" % (n, prefix, lm)
        out, err, success = run_command(env.subst(cmd), env={"SFCLMTOOLS" : env.subst("${ATTILA_PATH}/tools/lm_64")})
        if not success:
            return err
        
    return None
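All of these functions use the SCons builder-action signature (target, source, env), with scalar arguments such as the n-gram order passed in as Value nodes (hence source[1].read() above). A minimal sketch of how such a function might be attached to an environment; the builder name and file names here are illustrative, not taken from the original build system:

# Hypothetical registration of the builder above (names are illustrative).
from SCons.Builder import Builder

def add_builders(env):
    env.Append(BUILDERS={"TrainLanguageModel": Builder(action=train_language_model)})
    # usage in an SConscript:
    # env.TrainLanguageModel("work/lm.arpabo.gz", ["work/transcript.txt", env.Value(3)])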
Example #13
def gaussier_morph(target, source, env):
    if isinstance(source[0], Value):
        words = source[0].read()
    else:
        words = set([x.text for x in et.parse(meta_open(source[0].rstr())).getiterator("f") if x.attrib["name"] == "morph" and x.text and len(x.text) > 4])
    
    pairs = gaussier.suffix_pairs(words, min_psimilarity=int(env["MIN_PSIM"]), min_occurrence=int(env["MIN_OCCURRENCES"]))
    print len(pairs)
    #if len(pairs) > 0:
    #    dv, words = gaussier.similarity_matrix(pairs)
        #print pairs
    #try:
    rel_fams = dict([(i, [r + s for s in x[0] for r in x[1]]) for i, x in enumerate(pairs.iteritems())])
    #gaussier.relational_families(dv, words)
    #        print rel_fams
    p = Proteus(fams=rel_fams,
                title="""Gaussier Morphology, min. p-similarity=%s, unique words=%s,
                min. occurrences=%s, clustering=(method=%s, threshold=%s)""" % (env["MIN_PSIM"],
                                                                                env["WORD_COUNT"],
                                                                                env["MIN_OCCURRENCES"],
                                                                                env["CLUSTERING_METHOD"],
                                                                                env["CLUSTERING_THRESHOLD"]))
        
    fd = meta_open(target[0].rstr(), 'w')
    fd.write(
        """<?xml version="1.0" encoding="utf-8"?>
        <?xml-stylesheet href="morphology.xsl" type="text/xsl"?>"""
        )
    p.write(fd)
    #except:
    #    pass
    return None
Example #14
def create_data_list(target, source, env):
    """
    NEEDS WORK!
    Creates the master list of lattice transformations.
    """
    args = source[-1].read()
    data = {}
    for line in meta_open(source[0].rstr()):
        toks = line.split()
        bn = os.path.basename(toks[2])
        data[toks[0]] = data.get(toks[0], {})
        data[toks[0]][toks[1]] = (bn, toks[4], toks[5])
    ofd = meta_open(target[0].rstr(), "w")
    for lattice_file in glob(os.path.join(args["LATTICE_DIR"], "*")):
        bn = os.path.basename(lattice_file)
        path = os.path.join(env["BASE_PATH"], "lattices")
        uttname, delim, uttnum = re.match(r"(.*)([^\w])(\d+)\.%s$" % (args["oldext"]), bn).groups()
        try:
            name, time, timeend = data[uttname][uttnum]
            newname = os.path.abspath(os.path.join(path, "%s%s%s.%s" % (uttname, delim, uttnum, args["ext"])))
            ofd.write("%s %s %s %s %s.osym %s\n" % (os.path.splitext(name)[0], time, timeend, newname, newname, os.path.abspath(lattice_file)))
        except:
            return "lattice file not found in database: %s (are you sure your database file matches your lattice directory?)" % bn
    ofd.close()
    return None
Example #15
def index_to_symbol_tables(target, source, env):
    """Create symbol and transducer symbol tables based on an index file.

    Sources: index file
    Targets: symbol file, bsymbol file
    """
    osyms = set()
    bsyms = set()
    with meta_open(source[0].rstr()) as ifd:
        for line in ifd:
            toks = line.strip().split()
            if len(toks) == 1:
                continue
            isym = toks[2]
            osym = toks[3]
            osyms.add(osym)
            if not (isym == "<epsilon>" and osym == "<epsilon>"):
                bsym = "%s:%s" % (isym, osym)
                # drop the epsilon side of mixed epsilon/symbol pairs
                bsym = bsym.replace("<epsilon>:", "").replace(":<epsilon>", "")
                bsyms.add(bsym)

    with meta_open(target[0].rstr(), "w") as osym_ofd:
        osym_ofd.write("<epsilon> 0\n")
        for i, osym in enumerate(osyms):
            osym_ofd.write("%s %d\n" % (osym, i + 1))
    with meta_open(target[1].rstr(), "w") as bsym_ofd:
        bsym_ofd.write("<epsilon> 0\n")
        for i, bsym in enumerate(bsyms):
            bsym_ofd.write("%s %d\n" % (bsym, i + 1))
    return None
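The tables written above follow the usual OpenFst text symbol-table layout (one "symbol id" pair per line), with <epsilon> reserved as id 0. A minimal sketch on hypothetical symbols:

# Sketch of the symbol-table layout written above (hypothetical symbols).
osyms = ["AX", "B"]
table = ["<epsilon> 0"] + ["%s %d" % (s, i + 1) for i, s in enumerate(osyms)]
# table == ["<epsilon> 0", "AX 1", "B 2"]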
Example #16
def rasp_parse(target, source, env):
    """
    Parse one file, one sentence per line.
    """
    pid = Popen(["/bin/sh", "/home/tom/parsers/rasp/scripts/rasp.sh"], cwd="/home/tom/parsers/rasp", stdin=PIPE, stdout=PIPE, stderr=PIPE)
    out, err = pid.communicate("\n".join([meta_open(f.rstr()).read() for f in source[0:-1]]))
    meta_open(target[0].rstr(), "w").write(out)
    return None
Example #17
def pronunciations_to_vocab_dict(target, source, env):
    """Convert a pronunciation file to a vocabulary file (IBM format).

    Sources: pronunciation file, dictionary_file, boolean
    Targets: vocabulary file, pronunciations_file
    """
    graphemic = source[-1].read()
    prons = {}
    with meta_open(source[0].rstr()) as ifd:
        for l in ifd:
            try:
                morph, num, prob, phones = l.strip().split("\t")
            except:
                try:
                    morph, num, prob = l.strip().split("\t")
                except:
                    try:
                        morph, phones = l.strip().split("\t")
                        num = "1"
                    except:
                        morph = l.strip()
                        phones = "SIL"
                        num = "1"
            num = int(num) + 1
            prons["%s(%.2d)" % (morph, num)] = (morph, phones.split())
    with meta_open(target[0].rstr(), "w") as vocab_ofd, meta_open(target[1].rstr(), "w") as dict_ofd:
        wb = ["[", "wb", "]"]
        wb = []
        for w, (m, p) in prons.iteritems():
            if not graphemic:
                if len(p) == 1:
                    p = p + wb
                else:
                    p = [p[0]] + wb + p[1:] + wb
            dict_ofd.write("%s %s\n" % (w, " ".join(p)))
            vocab_ofd.write("%s %s\n" % (w, m))
        vocab_ofd.write("""<s>(01) <s>
</s>(01) </s>
~SIL(01) VOCAB_NIL_WORD 1.4771
~SIL(02) VOCAB_NIL_WORD 1.4771
~SIL(03) VOCAB_NIL_WORD 1.4771
""")
        if graphemic:
            dict_ofd.write("""<s>(01) SIL
</s>(01) SIL
~SIL(01) SIL
~SIL(02) NS
~SIL(03) VN
""")
        else:
            dict_ofd.write("""<s>(01) SIL
</s>(01) SIL
~SIL(01) SIL
~SIL(02) NS
~SIL(03) VN
""")
    return None
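Pronunciation variants above are keyed as word(NN), where NN is the 1-based, zero-padded variant number. A minimal sketch with hypothetical entries:

# Sketch of the variant keying used above (entries are hypothetical).
prons = {}
for morph, num, phones in [("kitabu", "0", "k i t a b u"), ("kitabu", "1", "k i t a a b u")]:
    prons["%s(%.2d)" % (morph, int(num) + 1)] = (morph, phones.split())
# keys: "kitabu(01)" and "kitabu(02)"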
Example #18
def create_subset(target, source, env):
    amount = source[1].read()
    method = source[2].read()
    with meta_open(source[0].rstr()) as ifd:
        data = DataSet.from_stream(ifd)
    subset = data.get_subset(range(amount))
    with meta_open(target[0].rstr(), "w") as ofd:
        subset.write(ofd)
    return None
Example #19
def evaluate_morphology(target, source, env):
    with meta_open(source[0].rstr()) as gold_fd, meta_open(source[1].rstr()) as pred_fd:
        gold = DataSet.from_stream(gold_fd)
        pred = DataSet.from_stream(pred_fd)
        gold_analyses = gold.get_analyses()
        pred_analyses = pred.get_analyses()
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write("%f\n" % 1.0)
    return None
Example #20
def scf_sents(target, source, env):
    fd = meta_open(target[0].rstr(), "w")
    regexes = [re.compile("[^V]*%s\S+V.*" % x) for x in source[0].read()]
    for i, f in enumerate(env["FILES"]):
        logging.info("%d %s", i, f)
        for j, s in enumerate(re.split(r"\n\s*\n", meta_open(f).read())):
            if any([x.match(s) for x in regexes]):
                fd.write("%s %d\n" % (f, j) + s.strip() + "\n\n\n")
    return None
Example #21
def lattice_list(target, source, env):
    """
    Creates a file that's simply a list of the lattices in the given directory (absolute paths, one per line).
    """
    lattice_dir = source[1].read()
    if not os.path.exists(lattice_dir):
        return "No such directory: %s" % lattice_dir
    meta_open(target[0].rstr(), "w").write("\n".join([os.path.abspath(x) for x in glob(os.path.join(lattice_dir, "*"))]) + "\n")
    return None
Example #22
def conll_to_data(target, source, env):
    args = source[-1].read()
    if args.get("verbs"):
        keep_verbs = [x.split()[0] for x in meta_open(args["verbs"])]
    else:
        keep_verbs = []
    lookups = dict([(x, {}) for x in ["verb_to_id", "gr_to_id", "lemma_to_id"]])
    data = dict([(x, []) for x in ["instance_starts", "instance_verbs", "instance_lengths", "instance_grs"]])
    for fname in args.get("inputs", []):
        this_verb = None
        if "0parsed" in fname:
            this_verb = re.match(r".*0parsed\.(.*?)\..*", fname).group(1)
            if len(keep_verbs) > 0 and this_verb not in keep_verbs:
                continue
        fd = meta_open(fname)
        text = fd.read()
        fd.close()
        for stext in sentence_rx.split(text.strip()):
            try:
                sent = Sentence(stext)
            except:
                continue
            for verb in [x for x in sent if x.pos.startswith("V") and x.gr not in ["auxpass", "cop"] and (not keep_verbs or x.lemma in keep_verbs) and (not this_verb or this_verb == x.lemma)]:
                try:
                    if verb.head():
                        if verb.head().pos[0] in fpos:
                            grs = ["%s(%s-%s, %s)" % (verb.gr, verb.head().pos, verb.head().lemma, verb.pos)]
                        else:
                            grs = ["%s(%s, %s)" % (verb.gr, verb.head().pos, verb.pos)]
                        lemmas = [verb.head().lemma]
                    else:
                        grs = []
                        lemmas = []
                except:
                    continue
                for tok in sent:
                    if tok.head_index == verb.index:
                        if tok.pos[0] in fpos:
                            grs.append("%s(%s, %s-%s)" % (tok.gr, verb.pos, tok.pos, tok.lemma))
                        else:
                            grs.append("%s(%s, %s)" % (tok.gr, verb.pos, tok.pos))
                        lemmas.append(tok.lemma)
                if len(grs) == 0:
                    continue
                lookups["verb_to_id"][verb.lemma] = lookups["verb_to_id"].get(verb.lemma, len(lookups["verb_to_id"]) + 1)
                data["instance_starts"].append(len(data["instance_grs"]) + 1)
                data["instance_verbs"].append(lookups["verb_to_id"][verb.lemma])
                data["instance_lengths"].append(len(grs))
                for gr in grs:
                    lookups["gr_to_id"][gr] = lookups["gr_to_id"].get(gr, len(lookups["gr_to_id"]) + 1)
                    data["instance_grs"].append(lookups["gr_to_id"][gr])
    data["verbs"] = len(lookups["verb_to_id"])
    data["grs"] = len(lookups["gr_to_id"])
    data["ns"] = max(data["instance_lengths"])
    pickle.dump((data, lookups), meta_open(target[0].rstr(), "w"))
    return None
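The lookup tables above rely on a first-seen id assignment idiom: each new verb or grammatical relation receives the next 1-based id. A minimal sketch:

# Sketch of the id-assignment idiom used above.
ids = {}
for token in ["give", "take", "give"]:
    ids[token] = ids.get(token, len(ids) + 1)
# ids == {"give": 1, "take": 2}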
Example #23
def top_words(target, source, env):
    args = source[-1].read()
    with meta_open(source[0].rstr()) as words_ifd, meta_open(source[1].rstr()) as pron_ifd:
        top = ProbabilityList(words_ifd).get_top_n(args["COUNT"])
        prons = Pronunciations(pron_ifd)
        prons.filter_by(top)
    with meta_open(target[0].rstr(), "w") as words_ofd, meta_open(target[1].rstr(), "w") as pron_ofd:
        words_ofd.write(top.format())
        pron_ofd.write(prons.format())
    return None
Example #24
def fst_compile(target, source, env):
    """
    Compile an FST using OpenFST's binary 'fstcompile'.
    """
    command = env.subst("${FSTCOMPILE} --isymbols=${SOURCES[0]} --osymbols=${SOURCES[0]} ${SOURCES[1]}", target=target, source=source)
    stdout, stderr, success = run_command(command, env={"LD_LIBRARY_PATH" : env.subst(env["LIBRARY_OVERLAY"])}, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if not success:
        return stderr
    meta_open(target[0].rstr(), "w").write(stdout)
    return None
Example #25
def merge_scores(target, source, env):
    """
    NEEDS WORK!
    CONVERT TO BUILDER!
    """
    stdout, stderr, success = run_command(env.subst("${MERGESCORESSUMPOSTNORMPL} ${SOURCES[0]}", target=target, source=source), env={"LD_LIBRARY_PATH" : env.subst("${LIBRARY_OVERLAY}")})
    if not success:
        return stderr
    meta_open(target[0].rstr(), "w").write(stdout)
    return None
Example #26
def cluster_verbs(target, source, env):
    args = source[-1].read()
    verbs, samples = pickle.load(meta_open(source[0].rstr()))
    samples = samples.sum(2)
    data = numpy.transpose(samples.T / samples.sum(1))
    res = stats.kmeans(numpy2ri(data), centers=args.get("clusters", 20)) #data[args["matrix"]].shape[0] / 10)
    ofd = meta_open(target[0].rstr(), "w")
    for c in set(res.rx2("cluster")):
        ofd.write(" ".join([verbs[i] for i, a in enumerate(res.rx2("cluster")) if a == c]) + "\n")
    return None
Example #27
def plot_lsa(target, source, env):
    args = source[-1].read()
    words, codebook, distortion, whitened = cPickle.load(meta_open(source[0].rstr()))
    words = [x.get("_NAME") for x in words]
    assignments = zip(vq(whitened, codebook)[0], words)
    clusters = dict([(x, [y[1] for y in assignments if y[0] == x]) for x in set([y[0] for y in assignments])])
    fd = meta_open(target[0].rstr(), "w")
    for k, v in clusters.iteritems():
        fd.write("%s\n\t%s\n\n" % (k, "\n\t".join(v).encode("utf-8")))
    return None
Example #28
def evaluate_scfs(target, source, env):
    args = source[-1].read()
    raw = tempfile.NamedTemporaryFile(dir=env["TEMP_DIR"], delete=False, prefix="tlraw")
    traw = tempfile.NamedTemporaryFile(dir=env["TEMP_DIR"], delete=False, prefix="tltraw")
    tout = tempfile.NamedTemporaryFile(dir=env["TEMP_DIR"], delete=False, prefix="tlout")
    lexicon = tempfile.NamedTemporaryFile(dir=env["TEMP_DIR"], delete=False, prefix="tllex")
    gold = tempfile.NamedTemporaryFile(dir=env["TEMP_DIR"], delete=False, prefix="tlgold")
    if source[1].rstr().endswith(".tgz"):
        tf = tarfile.open(source[1].rstr(), "r:gz")
        traw.write(tf.extractfile([x for x in tf.getnames() if x.endswith("count.scf")][0]).read())
    else:
        traw.write(meta_open(source[1].rstr()).read())
    traw.close()
    raw_v = set([l.split()[0] for l in meta_open(traw.name)])
    lex_v = set([l.split()[0] for l in meta_open(source[0].rstr())])
    gold_v = set([l.split()[0] for l in meta_open(source[2].rstr()) if len(l.split()) == 1])
    if "FILTER_BY" in args:
        #filt_v = set([l.split()[0] for l in meta_open(args["FILTER_BY"]) if len(l.split()) == 1])
        filt_v = set([l.strip() for l in meta_open(args["FILTER_BY"])])
        verbs = set.intersection(raw_v, lex_v, gold_v, filt_v)
    else:
        verbs = set.intersection(raw_v, lex_v, gold_v)
    rx = re.compile("^(%s)\s+" % ("|".join(verbs)))
    for l in [l for l in meta_open(traw.name) if rx.match(l)]:
        verb, scf, freq, count = l.strip().split()
        scf = scf.split("_")[0].lstrip("0")
        raw.write("%s %s %s %s\n" % (verb, scf, freq, count))
    for l in [l for l in meta_open(source[0].rstr()) if rx.match(l)]:
        verb, scf, freq, count = l.strip().split()
        scf = scf.split("_")[0].lstrip("0")
        lexicon.write("%s %s %s %s\n" % (verb, scf, freq, count))
    gold.write(meta_open(source[2].rstr()).read())
    gold.close()
    raw.close()
    lexicon.close()
    pid = Popen(["%s/scripts/eval-scf-counts.pl" % env["SUBCAT_2009"], lexicon.name, tout.name, "-raw", raw.name, "-gold", gold.name])
    pid.communicate()
    text = open(tout.name).read()
    xml = et.TreeBuilder()
    xml.start("xml", {})
    for k, v in [x for x in args.iteritems() if not x[0].startswith("_")]:
        xml.start(k, {})
        xml.data(str(v))
        xml.end(k)
    xml.start("text", {})
    xml.data(text)
    xml.end("text")
    xml.end("xml")
    meta_open(target[0].rstr(), "w").write(et.tostring(xml.close()))
    os.remove(lexicon.name)
    os.remove(gold.name)
    os.remove(raw.name)
    os.remove(traw.name)
    os.remove(tout.name)
    return None
Example #29
def graphemic_pronunciations(target, source, env):
    """Convert a list of words into a list of graphemic pronunciations.

    Sources: word list file
    Targets: graphemic pronunciation file
    """
    with meta_open(source[0].rstr()) as ifd:
        items = [x.strip() for x in ifd]
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write("\n".join(["%s\t%s" % (w, " ".join(["u%.4x" % (ord(c)) for c in w if unicodedata.category(c)[0] == "L" and c not in [unichr(1100), unichr(1098)]])) for w in items]))
    return None
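Each letter above is rendered as "u" followed by its four-digit hexadecimal code point. A minimal sketch on a hypothetical word:

# Sketch of the per-letter encoding used above (hypothetical input).
word = u"cat"
print(" ".join(["u%.4x" % ord(c) for c in word]))  # u0063 u0061 u0074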
Example #30
def rtm_to_data(target, source, env):
    sentences = []
    with meta_open(source[0].rstr()) as ifd:
        for sentence in ifd:
            words = [w for w in sentence.split()[5:] if w not in ["(())", "IGNORE_TIME_SEGMENT_IN_SCORING"]]
            if len(words) > 0:
                sentences.append(words)
    dataset = DataSet.from_sentences([[(w, None, []) for w in s] for s in sentences])
    with meta_open(target[0].rstr(), "w") as ofd:
        dataset.write(ofd)
    return None