def to_heb(translit):
    """Return the Hebrew rendering of the transliterated string *translit*.

    The input is first passed through ``Transcription.suffix_and_finales``;
    only the first element of that result is converted with
    ``Transcription.to_hebrew``, the rest is discarded.
    """
    trimmed = Transcription.suffix_and_finales(translit)[0]
    return Transcription.to_hebrew(trimmed)
def get_kq(kq_file):
    """Read ketiv-qere annotations from *kq_file* and link them to word nodes.

    Every line of the file is read as a 10-character verse label followed by
    whitespace-separated fields, of which the first two are taken as the
    ketiv and the qere.  The qere is split by
    ``Transcription.suffix_and_finales`` into a trimmed form and a trailer.

    Per verse, the n-th annotation for a given ketiv is matched with the n-th
    word in that verse carrying the same ketiv form.  Mismatches in either
    direction are collected and a limited number of them is printed.

    Relies on the module-level names ``msg``, ``vlab2vnode``, ``L``, ``F``
    and ``Transcription``.

    Args:
        kq_file: path of the ketiv-qere annotation file.

    Returns:
        A list of ``(word_node, qere, qere_trailer)`` tuples, where qere and
        trailer have been converted with ``Transcription.to_hebrew``.
    """
    msg("Reading Ketiv-Qere data")

    info = collections.defaultdict(list)     # verse node -> annotations
    not_found = set()                        # verse labels absent from index
    missing = collections.defaultdict(list)  # annotations without a word
    missed = collections.defaultdict(list)   # marked words without annotation
    error_limit = 10                         # max. lines printed per report

    ln = 0
    # The context manager closes the file even when a line fails to parse.
    with open(kq_file) as kq_handle:
        for ln, line in enumerate(kq_handle, 1):
            vlab = line[0:10]
            fields = line.rstrip("\n")[10:].split()
            (ketiv, qere) = fields[0:2]
            (qtrim, qtrailer) = Transcription.suffix_and_finales(qere)
            vnode = vlab2vnode.get(vlab)
            if vnode is None:
                not_found.add(vlab)
                continue
            info[vnode].append((ketiv, qtrim, qtrailer))
    msg("Read {} ketiv-qere annotations".format(ln))

    data = []
    for vnode in info:
        # ketiv form -> word nodes in this verse, in the order L.d yields them
        wlookup = collections.defaultdict(list)
        # ketiv form -> index of the last occurrence consumed (-1: none yet)
        wvisited = collections.defaultdict(lambda: -1)

        for w in L.d("word", vnode):
            gw = F.g_word.v(w)
            # NOTE(review): only words whose g_word contains "*" are indexed;
            # indexing every word would make the "missed" report below list
            # all ordinary words as well -- confirm against the original
            # layout of this function.
            if "*" in gw:
                gw = F.g_cons.v(w)
                if gw == "":
                    gw = "."
                wlookup[gw].append(w)

        for (ketiv, qere, qtrailer) in info[vnode]:
            wvisited[ketiv] += 1
            windex = wvisited[ketiv]
            ws = wlookup.get(ketiv)
            if ws is None or windex >= len(ws):
                missing[vnode].append((windex, ketiv, qere))
                continue
            data.append((
                ws[windex],
                Transcription.to_hebrew(qere),
                Transcription.to_hebrew(qtrailer),
            ))

        # Indexed words that no annotation has claimed.
        for (ketiv, ws) in wlookup.items():
            n_used = wvisited.get(ketiv, -1) + 1
            if n_used < len(ws):
                missed[vnode].append((len(ws) - n_used, ketiv))

    msg("Parsed {} ketiv-qere annotations".format(len(data)))

    if not_found:
        msg("Could not find {} verses: {}".format(
            len(not_found), sorted(not_found),
        ))
    else:
        msg("All verses entries found in index")

    if missing:
        msg("Could not locate ketivs in the text: {} verses".format(
            len(missing),
        ))
        e = 0
        for vnode in sorted(missing):
            if e > error_limit:
                break
            vlab = F.label.v(vnode)
            for (windex, ketiv, qere) in missing[vnode]:
                e += 1
                if e > error_limit:
                    break
                print("NOT IN TEXT: {:<10} {:<20} #{} {}".format(
                    vlab, ketiv, windex, qere,
                ))
    else:
        msg("All ketivs found in the text")

    if missed:
        # Report the size of `missed` here (the original printed the size of
        # `missing`, i.e. the count belonging to the previous report).
        msg("Could not lookup qeres in the data: {} verses".format(
            len(missed),
        ))
        e = 0
        for vnode in sorted(missed):
            if e > error_limit:
                break
            vlab = F.label.v(vnode)
            for (n_unused, ketiv) in missed[vnode]:
                e += 1
                if e > error_limit:
                    break
                print("NOT IN DATA: {:<10} {:<20} #{}".format(
                    vlab, ketiv, n_unused,
                ))
    else:
        msg("All ketivs found in the data")

    return data