def parseMorph():
    """Parse the morphology database into word-level features.

    Walks the global `morphDb` (sura -> aya -> group -> word ->
    (form, tag, featureStr)) and fills the global `wordFeatures` dict,
    keyed by (sura, aya, group, word), with:

    * `ascii`: the transliterated form as found in the source;
    * `unicode`: the Arabic rendering of the form via `tr.to_arabic`;
    * every feature produced by `parseMorphItem` for the tag/feature string.

    Labels that `parseMorphItem` could not interpret accumulate in the
    global `unknowns` set.  Finally a diagnostic report is printed, based
    on the globals `unknownFeatures` and `unknownPerFeat` (presumably
    filled as a side effect of `parseMorphItem` — defined elsewhere).
    """
    print('Parsing morphological data')
    global unknowns

    for (sura, suraData) in morphDb.items():
        for (aya, ayaData) in suraData.items():
            for (group, groupData) in ayaData.items():
                for (word, (form, tag, featureStr)) in groupData.items():
                    # Fetch/create the feature dict for this word once,
                    # instead of repeating the setdefault lookup for every
                    # single feature assignment.
                    features = wordFeatures.setdefault(
                        (sura, aya, group, word), {}
                    )
                    features['ascii'] = form
                    features['unicode'] = tr.to_arabic(form)
                    (theseFeatures, theseUnknowns) = parseMorphItem(
                        tag, featureStr
                    )
                    features.update(theseFeatures)
                    unknowns |= theseUnknowns

    # Report feature names that were not recognized at all
    if unknownFeatures:
        feats = ' '.join(unknownFeatures)
        print(f'\tUnknown features: {feats}')
    else:
        print('\tAll features known')

    # Report known features that carried unrecognized values
    if unknownPerFeat:
        for feat in sorted(unknownPerFeat):
            vals = ' '.join(sorted(unknownPerFeat[feat]))
            print(f'\tUnknown: {feat}: {vals}')
    if unknowns:
        vals = ' '.join(sorted(unknowns))
        print(f'\tUnknown labels: {vals}')
    if not unknownPerFeat and not unknowns:
        print('\tAll feature values known')
    print('Done')
def readData():
    """Read the Quran metadata XML and fill the section lookup tables.

    Reads the file at the global `DATA_PATH` and scans it for
    self-closing metadata elements (`<sura .../>`, `<juz .../>`,
    `<quarter .../>`, `<manzil .../>`, `<ruku .../>`, `<page .../>`,
    `<sajda .../>`).  Fills the global dicts:

    * `suraFeatures[index]`: name (Arabic / ascii / transcribed / English),
      type, and optionally the revelation `order`;
    * `sectionStart[(sura, aya)]` / `sectionEnd[(sura, aya)]`: which
      sections begin/end at that verse boundary.

    Attribute parsing relies on the global `attRe` regex; transliteration
    on the global `tr`.
    """
    print('Reading sura metadata')

    def _tagRe(tag):
        # All metadata elements are self-closing: <tag key="value" ... />
        return re.compile(rf'<{tag}(.*?)/>')

    # Fix: open with an explicit encoding; the file contains Arabic
    # sura names, and the platform default encoding is not reliable.
    with open(DATA_PATH, encoding='utf-8') as fh:
        data = fh.read()

    suras = _tagRe('sura').findall(data)
    for sura in suras:
        atts = dict(attRe.findall(sura))
        sI = int(atts.get('index', 0))
        suraFeatures[sI] = {
            'name': atts.get('name', ''),
            'nameAscii': tr.from_arabic(atts.get('name', '')),
            'nameTrans': atts.get('tname', ''),
            'name@en': atts.get('ename', ''),
            'type': atts.get('type', ''),
        }
        if 'order' in atts:
            suraFeatures[sI]['order'] = int(atts['order'])
    print(f'Read features for {len(suras)} suras')

    # (section name in our data, XML tag name, extra attributes to keep)
    for (sectionName, tag, info) in (
        ('juz', 'juz', ()),
        ('hizb', 'quarter', ()),
        ('manzil', 'manzil', ()),
        ('ruku', 'ruku', ()),
        ('page', 'page', ()),
        ('sajda', 'sajda', ('type', )),
    ):
        sections = _tagRe(tag).findall(data)
        for section in sections:
            atts = dict(attRe.findall(section))
            sI = int(atts.get('index', 0))
            sura = int(atts.get('sura', 0))
            aya = int(atts.get('aya', 0))
            features = {k: atts[k] for k in info}
            # Each section (except the first) starting at (sura, aya)
            # also ends the previous section of the same kind there.
            if sI > 1:
                sectionEnd.setdefault((sura, aya), []).append(
                    (sectionName, sI - 1))
            sectionStart.setdefault((sura, aya), []).append(
                (sectionName, sI, features))
def director(cv):
    """Read tsv data fields.

    This is a function that does the work as indicated in the
    [walker conversion engine of Text-Fabric](https://annotation.github.io/text-fabric/tf/convert/walker.html)

    See `fusus.convert` for a description of the fields in the TSV files.
    """
    stops = U.stops
    errors = collections.defaultdict(set)

    # cur/prev hold the currently open / previously seen section node per
    # level; the number of levels (nSec) is derived from this list.
    cur = [None, None, None, None]
    prev = [None, None, None, None]
    nSec = len(prev)

    data = []

    # First pass: read the TSV into memory, converting numeric columns.
    # Column layout differs between OCR and non-OCR sources; "?" in a box
    # coordinate column becomes None.
    # NOTE(review): exact column semantics documented in fusus.convert —
    # confirm there.
    with open(SRC_FILE) as fh:
        next(fh)  # skip the header line
        for line in fh:
            row = tuple(line.rstrip("\n").split("\t"))
            page = int(row[0])
            # optional page filter: process only the requested pages
            if pageNums is not None and page not in pageNums:
                continue
            if OCRED:
                row = (
                    page,
                    int(row[1]),
                    row[2],
                    int(row[3]),
                    *(None if c == "?" else int(c) for c in row[4:8]),
                    int(row[8]),
                    *row[9:11],
                )
            else:
                row = (
                    page,
                    *(int(c) for c in row[1:4]),
                    row[4],
                    *(None if c == "?" else int(c) for c in row[5:9]),
                    *row[9:11],
                )
            data.append(row)

    # index of the first bounding-box coordinate within a row
    boxL = nSec if OCRED else nSec + 1

    if HAS_TOC:
        toc = getToc(data)
        # open an initial "front" piece before the first toc entry
        curPiece = cv.node("piece")
        cv.feature(curPiece, n=1, title="front")

    curSentence = cv.node("sentence")
    nSentence = 1
    cv.feature(curSentence, n=nSentence)

    for (r, fields) in enumerate(data):
        if HAS_TOC:
            page = fields[0]
            # a page that begins a toc entry closes all open section
            # nodes, the running sentence, and the current piece
            if page in toc and page != prev[0]:
                for i in reversed(range(nSec)):
                    cv.terminate(cur[i])
                cv.terminate(curSentence)
                cv.terminate(curPiece)
                nSentence = 1
                curSentence = cv.node("sentence")
                cv.feature(curSentence, n=nSentence)
                (n, np, title) = toc[page]
                curPiece = cv.node("piece")
                cv.feature(curPiece, n=n, title=title)
                if np is not None:
                    cv.feature(curPiece, np=np)

        # When a section value changes at level i, terminate levels
        # i..nSec-1 (deepest first) and open fresh nodes for them.
        for i in range(nSec):
            if fields[i] != prev[i]:
                for j in reversed(range(i, nSec)):
                    cv.terminate(cur[j])
                for j in range(i, nSec):
                    cn = cv.node(TYPE_MAP[j])
                    cur[j] = cn
                    # feature name depends on the level and source kind
                    if OCRED and j == 2:
                        cv.feature(cn, b=fields[j])
                    elif OCRED and j == 3 or not OCRED and j == 1:
                        cv.feature(cn, ln=fields[j])
                    else:
                        cv.feature(cn, n=fields[j])
                    # non-OCR rows carry an extra value in the column
                    # right after the section columns
                    if not OCRED and j == nSec - 1:
                        cv.feature(cn, dir=fields[nSec])
                break

        for i in range(nSec):
            prev[i] = fields[i]

        letters = fields[-2]
        punc = fields[-1]

        # derived transcriptions of the letters and the punctuation
        lettersp = Tr.asciiFromArabic(letters) if letters else ""
        lettersn = Tr.latinFromArabic(letters) if letters else ""
        letterst = Tr.standardFromArabic(letters) if letters else ""
        punca = Tr.asciiFromArabic(punc) if punc else ""

        s = cv.slot()
        cv.feature(
            s,
            boxl=fields[boxL],
            boxt=fields[boxL + 1],
            boxr=fields[boxL + 2],
            boxb=fields[boxL + 3],
            letters=letters,
            lettersp=lettersp,
            lettersn=lettersn,
            letterst=letterst,
        )
        cv.feature(s, punc=punc, punca=punca)

        # a stop character in the punctuation ends the current sentence
        if any(c in stops for c in punc):
            cv.terminate(curSentence)
            curSentence = cv.node("sentence")
            nSentence += 1
            cv.feature(curSentence, n=nSentence)

        if OCRED:
            cv.feature(s, confidence=fields[-3])

    # close everything that is still open
    cv.terminate(curSentence)
    for i in reversed(range(nSec)):
        if cur[i]:
            cv.terminate(cur[i])
    if HAS_TOC:
        cv.terminate(curPiece)

    # declare metadata even for features that never occurred in the data
    for feat in featureMeta:
        if not cv.occurs(feat):
            cv.meta(feat)

    if errors:
        for kind in sorted(errors):
            instances = sorted(errors[kind])
            nInstances = len(instances)
            showInstances = instances[0:20]
            print(f"ERROR {kind}: {nInstances} x")
            print(", ".join(showInstances))
from tf.fabric import Fabric

# Version of the generated TF data set
VERSION = "0.2"

# Location of the peshitta repo within the local github clone tree
GH_BASE = os.path.expanduser("~/github")
ORG = "etcbc"
REPO = "peshitta"
SOURCE_DIR = f"source/{VERSION}"
PLAIN_DIR = f"plain/{VERSION}"
TF_DIR = f"tf/{VERSION}"
SOURCE_PATH = f"{GH_BASE}/{ORG}/{REPO}/{SOURCE_DIR}"
PLAIN_PATH = f"{GH_BASE}/{ORG}/{REPO}/{PLAIN_DIR}"
TF_PATH = f"{GH_BASE}/{ORG}/{REPO}/{TF_DIR}"

# Shared transliteration helper (class defined elsewhere in this project)
TR = Transcription()

# Book acronyms in canonical order; the string continues beyond this chunk.
allAcrosSeq = """ Gn Ex Lv Nm Dt Jb Jos Jd Sm1 Sm2 Ps Rg1
def toHeb(translit):
    """Render a transliterated string in Hebrew script.

    Strips the suffix/finales information from `translit` first (taking
    the bare form that `Transcription.suffix_and_finales` returns in its
    first slot), then maps that form to Hebrew characters.
    """
    bareForm = Transcription.suffix_and_finales(translit)[0]
    return Transcription.toHebrew(bareForm)
def director(cv):
    """Walk the Quran morphology database and feed it to Text-Fabric.

    Director function for the Text-Fabric walker conversion: builds sura,
    aya, group, word (slot) and lex nodes from the global `morphDb`, plus
    the section nodes (juz, hizb, ...) recorded in the global
    `sectionStart` / `sectionEnd` tables.  Prints a report of unknown
    features/values at the end (globals `unknownFeatures`,
    `unknownPerFeat`, `unknowns` — presumably filled by `parseMorphItem`).
    """
    print('Parsing morphological data')
    global unknowns

    # lemma -> its lex node, so repeated lemmas resume the same node
    lemmaIndex = {}
    # (sectionName, number) -> currently open section node
    sectionIndex = {}

    for (sura, suraData) in morphDb.items():
        curSura = cv.node('sura')
        cv.feature(curSura, number=sura)
        theseSuraFeatures = suraFeatures.get(sura, None)
        if theseSuraFeatures:
            cv.feature(curSura, **theseSuraFeatures)
        for (aya, ayaData) in suraData.items():
            curAya = cv.node('aya')
            cv.feature(curAya, number=aya)
            # one translation feature per available language
            transFeatures = {
                f'translation@{lang}': trans[(sura, aya)]
                for (lang, trans) in translations.items()
            }
            cv.feature(curAya, **transFeatures)
            # close sections that end at this verse boundary ...
            for s in sectionEnd.get((sura, aya), []):
                curSection = sectionIndex[s]
                cv.terminate(curSection)
                del sectionIndex[s]
            # ... and open the ones that start here
            for (sName, sI, sFeatures) in sectionStart.get((sura, aya), []):
                curSection = cv.node(sName)
                cv.feature(curSection, number=sI, **sFeatures)
                sectionIndex[(sName, sI)] = curSection
            nAya = len(ayaData)
            for (ig, (group, groupData)) in enumerate(ayaData.items()):
                curGroup = cv.node('group')
                cv.feature(curGroup, number=group)
                nGroup = len(groupData)
                for (iw, (word, (form, tag, featureStr))) in enumerate(
                        groupData.items()):
                    (theseFeatures, theseUnknowns) = parseMorphItem(
                        tag, featureStr)
                    lemma = theseFeatures.get('lemma', None)
                    # lex nodes: one per lemma, resumed on every
                    # re-occurrence so the node spans all its slots
                    if lemma:
                        thisLemma = lemmaIndex.get(lemma, None)
                        if thisLemma:
                            cv.resume(thisLemma)
                        else:
                            thisLemma = cv.node('lex')
                            lemmaIndex[lemma] = thisLemma
                            cv.feature(thisLemma, lemma=lemma)
                    curWord = cv.slot()
                    if lemma:
                        cv.terminate(thisLemma)
                    cv.feature(
                        curWord,
                        ascii=form,
                        unicode=tr.to_arabic(form),
                        # words inside a group concatenate without space;
                        # a space follows only the last word of a group
                        # that is not the last group of the aya
                        space=' '
                        if iw == nGroup - 1 and ig != nAya - 1
                        else '',
                        number=word,
                    )
                    cv.feature(curWord, **theseFeatures)
                    unknowns |= theseUnknowns
                cv.terminate(curGroup)
            cv.terminate(curAya)
        cv.terminate(curSura)

    # close any sections still open after the last verse
    for curSection in sectionIndex.values():
        cv.terminate(curSection)

    # Report feature names that were not recognized at all
    if unknownFeatures:
        feats = ' '.join(unknownFeatures)
        print(f'\tUnknown features: {feats}')
    else:
        print(f'\tAll features known')

    # Report known features that carried unrecognized values
    if (unknownPerFeat):
        for feat in sorted(unknownPerFeat):
            vals = ' '.join(sorted(unknownPerFeat[feat]))
            print(f'\tUnknown: {feat}: {vals}')
    if unknowns:
        vals = ' '.join(sorted(unknowns))
        print(f'\tUnknown labels: {vals}')
    if not unknownPerFeat and not unknowns:
        print(f'\tAll feature values known')
    print(f'Done')
# Read the ketiv-qere annotation file and attach the annotations
# to their verse nodes (via the global `nodeFromLabel` lookup).
error_limit = 10
kqFile = '{}/ketivqere.txt'.format(thisSource)
# NOTE(review): handle is closed manually below; a `with` block would be
# safer if an exception occurs mid-read.
kqHandle = open(kqFile)
ln = 0
can = 0
cur_label = None
for line in kqHandle:
    ln += 1
    can += 1
    # fixed-width layout: first 10 chars are the verse label,
    # the rest is whitespace-separated fields (ketiv, qere, ...)
    vlab = line[0:10]
    fields = line.rstrip('\n')[10:].split()
    (ketiv, qere) = fields[0:2]
    # split the qere into its bare form and trailing material
    (qtrim, qtrailer) = Transcription.suffix_and_finales(qere)
    vnode = nodeFromLabel.get(vlab, None)
    # NOTE(review): `== None` works here but `is None` is the idiom
    if vnode == None:
        # unknown verse label: remember it and skip the line
        notFound.add(vlab)
        continue
    verseInfo[vnode].append((ketiv, qtrim, qtrailer))
kqHandle.close()
utils.caption(0, '\tRead {} ketiv-qere annotations'.format(ln))

# In[10]:

# Second pass over the collected verse info (continues beyond this chunk)
data = []
for vnode in verseInfo:
    wlookup = collections.defaultdict(lambda: [])
    wvisited = collections.defaultdict(lambda: -1)