示例#1
0
def to_heb(translit):
    """Convert a transliterated string via ``Transcription.to_hebrew``.

    The input is first passed through ``Transcription.suffix_and_finales``;
    only the first element of its result is converted, the rest is dropped.
    """
    trimmed = Transcription.suffix_and_finales(translit)[0]
    return Transcription.to_hebrew(trimmed)
示例#2
0
def get_kq(kq_file):
    """Read ketiv-qere annotations from a file and match them to word nodes.

    Each input line is expected to hold a verse label in its first 10
    characters, followed by whitespace-separated fields of which the first
    two are the ketiv and the qere. Verse labels are resolved to verse nodes
    through the module-level ``vlab2vnode`` mapping; within each verse the
    ketivs are matched, in order of occurrence, against the words whose
    ``g_word`` feature contains ``*`` (keyed by their ``g_cons`` value, or
    ``"."`` when that value is empty).

    Progress and diagnostics are reported through ``msg`` and ``print``; at
    most ``error_limit`` individual mismatches are printed per category.

    Args:
        kq_file: Path of the ketiv-qere annotation file.

    Returns:
        A list of ``(word_node, qere, qere_trailer)`` tuples, where the qere
        and its trailer have been converted with ``Transcription.to_hebrew``.

    Raises:
        ValueError: If a line has fewer than two fields after the label.
    """
    msg("Reading Ketiv-Qere data")

    info = collections.defaultdict(list)     # verse node -> annotations
    not_found = set()                        # verse labels absent from index
    missing = collections.defaultdict(list)  # annotations without a text word
    missed = collections.defaultdict(list)   # text words without annotation

    error_limit = 10

    ln = 0
    # The context manager guarantees the file is closed even when a
    # malformed line makes the unpacking below raise.
    with open(kq_file) as kq_handle:
        for line in kq_handle:
            ln += 1
            vlab = line[0:10]
            fields = line.rstrip("\n")[10:].split()
            (ketiv, qere) = fields[0:2]
            (qtrim, qtrailer) = Transcription.suffix_and_finales(qere)
            vnode = vlab2vnode.get(vlab)
            if vnode is None:
                not_found.add(vlab)
                continue
            info[vnode].append((ketiv, qtrim, qtrailer))
    msg("Read {} ketiv-qere annotations".format(ln))

    data = []
    for vnode, annotations in info.items():
        # Candidate words of this verse, grouped by their consonantal form.
        wlookup = collections.defaultdict(list)
        # Index of the last occurrence consumed per ketiv (-1: none yet).
        wvisited = collections.defaultdict(lambda: -1)
        for w in L.d("word", vnode):
            gw = F.g_word.v(w)
            if "*" in gw:
                gw = F.g_cons.v(w)
                if gw == "":
                    gw = "."
                wlookup[gw].append(w)
        for (ketiv, qere, qtrailer) in annotations:
            wvisited[ketiv] += 1
            windex = wvisited[ketiv]
            # .get() avoids creating an empty entry in the defaultdict.
            ws = wlookup.get(ketiv)
            if ws is None or windex > len(ws) - 1:
                missing[vnode].append((windex, ketiv, qere))
                continue
            data.append(
                (
                    ws[windex],
                    Transcription.to_hebrew(qere),
                    Transcription.to_hebrew(qtrailer),
                )
            )
        for ketiv, ws in wlookup.items():
            if ketiv not in wvisited or len(ws) - 1 > wvisited[ketiv]:
                # Number of words in the text left without an annotation.
                unmatched = len(ws) - (wvisited.get(ketiv, -1) + 1)
                missed[vnode].append((unmatched, ketiv))
    msg("Parsed {} ketiv-qere annotations".format(len(data)))

    if not_found:
        msg("Could not find {} verses: {}".format(len(not_found), sorted(not_found)))
    else:
        msg("All verses entries found in index")
    if missing:
        msg("Could not locate ketivs in the text: {} verses".format(len(missing)))
        e = 0
        for vnode in sorted(missing):
            if e > error_limit:
                break
            vlab = F.label.v(vnode)
            for (windex, ketiv, qere) in missing[vnode]:
                e += 1
                if e > error_limit:
                    break
                print("NOT IN TEXT: {:<10} {:<20} #{} {}".format(vlab, ketiv, windex, qere))
    else:
        msg("All ketivs found in the text")
    if missed:
        # Report the size of `missed` here, not of `missing`.
        msg("Could not lookup qeres in the data: {} verses".format(len(missed)))
        e = 0
        for vnode in sorted(missed):
            if e > error_limit:
                break
            vlab = F.label.v(vnode)
            for (windex, ketiv) in missed[vnode]:
                e += 1
                if e > error_limit:
                    break
                print("NOT IN DATA: {:<10} {:<20} #{}".format(vlab, ketiv, windex))
    else:
        msg("All ketivs found in the data")
    return data