예제 #1
0
def parseMorph():
    """Fill ``wordFeatures`` from ``morphDb`` and print a summary of unknowns.

    Walks the nested ``morphDb`` mapping (sura -> aya -> group -> word). For
    every word, the entry of ``wordFeatures`` keyed by
    ``(sura, aya, group, word)`` receives:

    * ``ascii``: the form exactly as stored in ``morphDb``;
    * ``unicode``: the result of ``tr.to_arabic`` applied to that form;
    * every feature returned by ``parseMorphItem(tag, featureStr)``
      (these are written last, so they win on a key clash).

    The unknown labels returned by ``parseMorphItem`` are merged into the
    module-level ``unknowns``. Finally a report based on ``unknownFeatures``,
    ``unknownPerFeat`` and ``unknowns`` is printed.
    """
    # ``unknowns`` is updated with ``|=``, which rebinds the name.
    global unknowns

    print('Parsing morphological data')

    for (sura, ayas) in morphDb.items():
        for (aya, groups) in ayas.items():
            for (group, words) in groups.items():
                for (word, (form, tag, featureStr)) in words.items():
                    # One lookup per word; all features land in this dict.
                    entry = wordFeatures.setdefault(
                        (sura, aya, group, word), {}
                    )
                    entry['ascii'] = form
                    entry['unicode'] = tr.to_arabic(form)
                    (parsed, missing) = parseMorphItem(tag, featureStr)
                    entry.update(parsed)
                    unknowns |= missing

    # Report on feature names that were not recognised.
    if not unknownFeatures:
        print('\tAll features known')
    else:
        featureNames = ' '.join(unknownFeatures)
        print(f'\tUnknown features: {featureNames}')

    # Report on unrecognised values, per feature and as loose labels.
    if unknownPerFeat:
        for featName in sorted(unknownPerFeat):
            badValues = ' '.join(sorted(unknownPerFeat[featName]))
            print(f'\tUnknown: {featName}: {badValues}')
    if unknowns:
        badLabels = ' '.join(sorted(unknowns))
        print(f'\tUnknown labels: {badLabels}')
    if not (unknownPerFeat or unknowns):
        print('\tAll feature values known')

    print('Done')
예제 #2
0
파일: tfFromMorph.py 프로젝트: q-ran/quran
def director(cv):
    """Drive ``cv`` through ``morphDb``, emitting nodes, slots and features.

    ``cv`` must provide ``node``, ``slot``, ``feature``, ``resume`` and
    ``terminate`` (presumably a Text-Fabric conversion walker -- confirm
    against the caller).

    For each sura, aya and group in ``morphDb`` a node of that type is
    created, given a ``number`` feature, and terminated once its contents
    have been emitted. Suras additionally get the features found in
    ``suraFeatures``; ayas get one ``translation@<lang>`` feature per entry
    in ``translations``. Sections listed in ``sectionEnd`` for an aya are
    terminated, and sections listed in ``sectionStart`` are opened, before
    the words of that aya are emitted; sections still open at the end are
    terminated after the last sura.

    Each word becomes a slot carrying ``ascii``, ``unicode``, ``space``,
    ``number`` and the features from ``parseMorphItem``. When a word has a
    lemma, a single ``lex`` node per distinct lemma is created (or resumed)
    around the slot so that the slot is linked to it.

    Unknown labels reported by ``parseMorphItem`` are merged into the
    module-level ``unknowns`` and a summary is printed at the end.
    """
    # ``unknowns`` is updated with ``|=``, which rebinds the name.
    global unknowns

    print('Parsing morphological data')

    lexNodes = {}  # lemma -> its 'lex' node
    openSections = {}  # (section name, section number) -> open node

    for (sura, suraData) in morphDb.items():
        suraNode = cv.node('sura')
        cv.feature(suraNode, number=sura)
        extraSuraFeatures = suraFeatures.get(sura)
        if extraSuraFeatures:
            cv.feature(suraNode, **extraSuraFeatures)

        for (aya, ayaData) in suraData.items():
            ayaKey = (sura, aya)
            ayaNode = cv.node('aya')
            cv.feature(ayaNode, number=aya)
            cv.feature(
                ayaNode,
                **{
                    f'translation@{lang}': trans[ayaKey]
                    for (lang, trans) in translations.items()
                },
            )

            # Close sections that end here before opening the new ones.
            for sectionKey in sectionEnd.get(ayaKey, []):
                cv.terminate(openSections.pop(sectionKey))
            for (sName, sI, sFeatures) in sectionStart.get(ayaKey, []):
                sectionNode = cv.node(sName)
                cv.feature(sectionNode, number=sI, **sFeatures)
                openSections[(sName, sI)] = sectionNode

            lastGroupIndex = len(ayaData) - 1
            for (ig, (group, groupData)) in enumerate(ayaData.items()):
                groupNode = cv.node('group')
                cv.feature(groupNode, number=group)
                lastWordIndex = len(groupData) - 1

                for (iw, (word, morph)) in enumerate(groupData.items()):
                    (form, tag, featureStr) = morph
                    (features, missing) = parseMorphItem(tag, featureStr)

                    # The lex node must be active while the slot is made,
                    # so the slot ends up linked to it.
                    lemma = features.get('lemma')
                    lexNode = None
                    if lemma:
                        lexNode = lexNodes.get(lemma)
                        if lexNode:
                            cv.resume(lexNode)
                        else:
                            lexNode = cv.node('lex')
                            lexNodes[lemma] = lexNode
                        cv.feature(lexNode, lemma=lemma)
                    wordSlot = cv.slot()
                    if lemma:
                        cv.terminate(lexNode)

                    # A space follows the last word of a group, except for
                    # the last group of the aya.
                    closesGroup = iw == lastWordIndex
                    closesAya = ig == lastGroupIndex
                    if closesGroup and not closesAya:
                        trailer = ' '
                    else:
                        trailer = ''
                    cv.feature(
                        wordSlot,
                        ascii=form,
                        unicode=tr.to_arabic(form),
                        space=trailer,
                        number=word,
                    )
                    cv.feature(wordSlot, **features)
                    unknowns |= missing

                cv.terminate(groupNode)
            cv.terminate(ayaNode)
        cv.terminate(suraNode)

    # Sections that were never explicitly ended are closed at the very end.
    for sectionNode in openSections.values():
        cv.terminate(sectionNode)

    # Report on feature names that were not recognised.
    if not unknownFeatures:
        print('\tAll features known')
    else:
        featureNames = ' '.join(unknownFeatures)
        print(f'\tUnknown features: {featureNames}')

    # Report on unrecognised values, per feature and as loose labels.
    if unknownPerFeat:
        for featName in sorted(unknownPerFeat):
            badValues = ' '.join(sorted(unknownPerFeat[featName]))
            print(f'\tUnknown: {featName}: {badValues}')
    if unknowns:
        badLabels = ' '.join(sorted(unknowns))
        print(f'\tUnknown labels: {badLabels}')
    if not (unknownPerFeat or unknowns):
        print('\tAll feature values known')

    print('Done')