def parseMorph():
    """Collect per-word features from ``morphDb`` into ``wordFeatures``.

    Walks the nested ``morphDb`` mapping (sura -> aya -> group -> word).
    For every word, the entry of ``wordFeatures`` keyed by
    ``(sura, aya, group, word)`` receives:

    * ``ascii``: the form exactly as stored in ``morphDb``;
    * ``unicode``: the result of ``tr.to_arabic(form)``;
    * every key/value pair in the feature mapping returned by
      ``parseMorphItem(tag, featureStr)``.

    The second value returned by ``parseMorphItem`` is merged into the
    module-level ``unknowns``.  When the walk is finished, a summary of
    ``unknownFeatures``, ``unknownPerFeat`` and ``unknowns`` is printed.

    Takes no arguments and returns ``None``; all input and output goes
    through module-level names and standard output.
    """
    # ``unknowns |= ...`` rebinds the name, hence the global declaration.
    global unknowns

    print('Parsing morphological data')

    for (suraNr, ayas) in morphDb.items():
        for (ayaNr, groups) in ayas.items():
            for (groupNr, words) in groups.items():
                for (wordNr, (form, tag, featureStr)) in words.items():
                    # One dict per word; create it on first sight.
                    entry = wordFeatures.setdefault(
                        (suraNr, ayaNr, groupNr, wordNr), {}
                    )
                    entry['ascii'] = form
                    entry['unicode'] = tr.to_arabic(form)

                    (parsed, unseen) = parseMorphItem(tag, featureStr)
                    # Parsed features are written last, so they win over
                    # 'ascii'/'unicode' should the names ever coincide.
                    for (name, value) in parsed.items():
                        entry[name] = value
                    unknowns |= unseen

    # --- report on anything the parser did not recognise ---
    if not unknownFeatures:
        print('\tAll features known')
    else:
        featNames = ' '.join(unknownFeatures)
        print(f'\tUnknown features: {featNames}')

    if unknownPerFeat:
        for featName in sorted(unknownPerFeat):
            badValues = ' '.join(sorted(unknownPerFeat[featName]))
            print(f'\tUnknown: {featName}: {badValues}')

    if unknowns:
        badLabels = ' '.join(sorted(unknowns))
        print(f'\tUnknown labels: {badLabels}')

    if not (unknownPerFeat or unknowns):
        print('\tAll feature values known')

    print('Done')
def director(cv):
    """Drive ``cv`` through ``morphDb`` to emit nodes, slots and features.

    ``cv`` must provide ``node``, ``slot``, ``feature``, ``resume`` and
    ``terminate`` (presumably Text-Fabric's conversion walker -- confirm
    against the caller).

    Structure produced, following the nesting of ``morphDb``:

    * one ``sura`` node per sura, with ``number`` plus any entries found
      in ``suraFeatures`` for that sura;
    * one ``aya`` node per aya, with ``number`` and one
      ``translation@<lang>`` feature per entry of ``translations``;
    * section nodes, terminated and opened at the ayas listed in
      ``sectionEnd`` and ``sectionStart``; sections still open after the
      last sura are terminated at the end;
    * one ``group`` node per word group, with ``number``;
    * one slot per word, with ``ascii``, ``unicode``, ``space``,
      ``number`` and the features returned by ``parseMorphItem``;
    * one ``lex`` node per distinct lemma; it is resumed around each later
      slot carrying the same lemma so that slot is linked to it as well.

    Unknown labels reported by ``parseMorphItem`` are merged into the
    module-level ``unknowns`` and a summary is printed at the end.
    Returns ``None``.
    """
    # ``unknowns |= ...`` rebinds the name, hence the global declaration.
    global unknowns

    print('Parsing morphological data')

    lemmaIndex = {}    # lemma -> its 'lex' node
    sectionIndex = {}  # (section name, section number) -> open section node

    for (sura, suraData) in morphDb.items():
        suraNode = cv.node('sura')
        cv.feature(suraNode, number=sura)
        extraSuraFeatures = suraFeatures.get(sura, None)
        if extraSuraFeatures:
            cv.feature(suraNode, **extraSuraFeatures)

        for (aya, ayaData) in suraData.items():
            ayaNode = cv.node('aya')
            cv.feature(ayaNode, number=aya)
            transFeatures = {
                f'translation@{lang}': trans[(sura, aya)]
                for (lang, trans) in translations.items()
            }
            cv.feature(ayaNode, **transFeatures)

            # Close the sections registered as ending here before any new
            # ones are opened.
            for sectionKey in sectionEnd.get((sura, aya), []):
                cv.terminate(sectionIndex.pop(sectionKey))

            for (sName, sI, sFeatures) in sectionStart.get((sura, aya), []):
                sectionNode = cv.node(sName)
                cv.feature(sectionNode, number=sI, **sFeatures)
                sectionIndex[(sName, sI)] = sectionNode

            lastGroupPos = len(ayaData) - 1
            for (ig, (group, groupData)) in enumerate(ayaData.items()):
                groupNode = cv.node('group')
                cv.feature(groupNode, number=group)

                lastWordPos = len(groupData) - 1
                for (iw, (word, (form, tag, featureStr))) in enumerate(
                    groupData.items()
                ):
                    (parsed, unseen) = parseMorphItem(tag, featureStr)

                    lemma = parsed.get('lemma', None)
                    if lemma:
                        lexNode = lemmaIndex.get(lemma, None)
                        if lexNode:
                            # Known lemma: re-open its node so the slot
                            # created below is linked to it.
                            cv.resume(lexNode)
                        else:
                            lexNode = cv.node('lex')
                            lemmaIndex[lemma] = lexNode
                            # NOTE(review): the lemma feature is assigned
                            # when the lex node is created; re-assigning it
                            # on resume would store the same value.
                            cv.feature(lexNode, lemma=lemma)

                    wordSlot = cv.slot()

                    if lemma:
                        cv.terminate(lexNode)

                    # A space follows the last word of a group, except for
                    # the last group of the aya.
                    endsGroup = iw == lastWordPos
                    endsAya = ig == lastGroupPos
                    cv.feature(
                        wordSlot,
                        ascii=form,
                        unicode=tr.to_arabic(form),
                        space=' ' if endsGroup and not endsAya else '',
                        number=word,
                    )
                    # Kept as a separate call: merging it with the one above
                    # would raise on any duplicate keyword.
                    cv.feature(wordSlot, **parsed)
                    unknowns |= unseen

                cv.terminate(groupNode)
            cv.terminate(ayaNode)
        cv.terminate(suraNode)

    # Sections that were opened but never listed as ended.
    for openSection in sectionIndex.values():
        cv.terminate(openSection)

    # --- report on anything the parser did not recognise ---
    if not unknownFeatures:
        print('\tAll features known')
    else:
        featNames = ' '.join(unknownFeatures)
        print(f'\tUnknown features: {featNames}')

    if unknownPerFeat:
        for featName in sorted(unknownPerFeat):
            badValues = ' '.join(sorted(unknownPerFeat[featName]))
            print(f'\tUnknown: {featName}: {badValues}')

    if unknowns:
        badLabels = ' '.join(sorted(unknowns))
        print(f'\tUnknown labels: {badLabels}')

    if not (unknownPerFeat or unknowns):
        print('\tAll feature values known')

    print('Done')