Пример #1
0
def highlight(txt, extra, fieldDict, field, mod_field):
    '''Wrap each morpheme found in ``txt`` in a ``morphHighlight`` <span>.

    When a field is marked with the 'focusMorph' command, every morpheme in
    the field text is wrapped in a <span> whose attributes describe it:
    ``mtype`` (mature / known / seen / unknown, from the configured maturity
    thresholds), ``priority`` (whether it is in the priority database),
    ``frequency`` (whether it appears in the frequency list) and ``mat``
    (the highest maturity among its matching locations, 0 if none).

    Args:
        txt: Field text to annotate.
        extra: Not used by this function.
        fieldDict: Mapping for the note; its 'Tags' entry (a whitespace
            separated string) and 'Type' entry are used to select the filter.
        field: Not used by this function.
        mod_field: Not used by this function.

    Returns:
        The annotated text, or ``txt`` unchanged when no filter or no
        morphemizer applies.
    '''
    from .util import getFilterByTagsAndType
    from .morphemizer import getMorphemizerByName
    from .morphemes import getMorphemes

    # must avoid formatting a smaller morph that is contained in a bigger morph
    # => do largest subs first and don't sub anything already in <span>
    def nonSpanSub(sub, repl, string):
        return ''.join(
            piece if piece.startswith('<span')
            else re.sub(sub, repl, piece, flags=re.IGNORECASE)
            for piece in re.split('(<span.*?</span>)', string))

    # The frequency list is optional: start empty so the lookups below are
    # well-defined even when the file is missing or unreadable.
    frequencyWords = set()
    frequencyListPath = cfg1('path_frequency')
    try:
        with codecs.open(frequencyListPath, 'r', 'utf-8') as f:
            # Only the first tab-separated column of each line is the word.
            frequencyWords = {line.strip().split('\t')[0] for line in f}
    except (IOError, OSError, ValueError):
        # Missing/unreadable file or undecodable content: the user has no
        # usable frequency list, so nothing gets flagged as frequent.
        pass

    priorityDb = main.MorphDb(cfg1('path_priority'), ignoreErrors=True).db
    tags = fieldDict['Tags'].split()
    noteFilter = getFilterByTagsAndType(fieldDict['Type'], tags)
    if noteFilter is None:
        return txt
    morphemizer = getMorphemizerByName(noteFilter['Morphemizer'])
    if morphemizer is None:
        return txt
    ms = getMorphemes(morphemizer, txt, tags)

    # Loop-invariant configuration, looked up once.
    matureThreshold = cfg1('threshold_mature')
    knownThreshold = cfg1('threshold_known')
    seenThreshold = cfg1('threshold_seen')
    spanTemplate = '<span class="morphHighlight" mtype="{mtype}" priority="{priority}" frequency="{frequency}" mat="{mat}">\\1</span>'

    for m in sorted(ms, key=lambda x: len(x.inflected),
                    reverse=True):  # largest subs first
        locs = allDb().getMatchingLocs(m)
        mat = max(loc.maturity for loc in locs) if locs else 0

        if mat >= matureThreshold:
            mtype = 'mature'
        elif mat >= knownThreshold:
            mtype = 'known'
        elif mat >= seenThreshold:
            mtype = 'seen'
        else:
            mtype = 'unknown'

        priority = 'true' if m in priorityDb else 'false'

        focusMorphString = m.show().split()[0]
        frequency = 'true' if focusMorphString in frequencyWords else 'false'

        repl = spanTemplate.format(
            mtype=mtype, priority=priority, frequency=frequency, mat=mat)
        # Escape the morpheme so regex metacharacters in it are matched
        # literally instead of being interpreted as pattern syntax.
        txt = nonSpanSub('(%s)' % re.escape(m.inflected), repl, txt)
    return txt
Пример #2
0
def highlight(txt: str, field, filter: str, ctx) -> str:
    """Wrap each morpheme found in ``txt`` in a ``morphHighlight`` <span>.

    When a field is marked with the 'focusMorph' command, every morpheme in
    the field text is wrapped in a <span> whose attributes describe it:
    ``mtype`` (mature / known / seen / unknown), ``priority`` (whether it is
    in the priority database), ``frequency`` (whether it appears in the
    frequency list) and ``mat`` (the highest maturity among its matching
    locations, 0 if none). Proper nouns are reported as 'mature' when the
    'Option_ProperNounsAlreadyKnown' setting is enabled.

    Args:
        txt: Field text to annotate.
        field: Not used by this function.
        filter: Name of the filter being applied; anything other than
            "morphHighlight" leaves ``txt`` untouched.
        ctx: Object whose ``note()`` method returns the note being rendered.

    Returns:
        The annotated text, or ``txt`` unchanged when the filter name does
        not match or no note filter / morphemizer applies.
    """
    if filter != "morphHighlight":
        return txt

    from .util import getFilter
    from .morphemizer import getMorphemizerByName
    from .morphemes import getMorphemes

    # must avoid formatting a smaller morph that is contained in a bigger morph
    # => do largest subs first and don't sub anything already in <span>
    def nonSpanSub(sub, repl, string):
        return ''.join(
            piece if piece.startswith('<span')
            else re.sub(sub, repl, piece, flags=re.IGNORECASE)
            for piece in re.split('(<span.*?</span>)', string))

    # The frequency list is optional; an empty set means nothing is flagged.
    frequency_words = set()
    frequency_list_path = cfg('path_frequency')
    try:
        with codecs.open(frequency_list_path, encoding='utf-8') as f:
            # Only the first tab-separated column of each line is the word.
            frequency_words = {line.strip().split('\t')[0] for line in f}
    except (OSError, ValueError):
        # Missing/unreadable file or undecodable content: treat as no list.
        pass

    priority_db = main.MorphDb(cfg('path_priority'), ignoreErrors=True).db

    note = ctx.note()
    tags = note.stringTags()
    # Kept separate from the ``filter`` parameter, which is the filter *name*.
    note_filter = getFilter(note)
    if note_filter is None:
        return txt
    morphemizer = getMorphemizerByName(note_filter['Morphemizer'])
    if morphemizer is None:
        return txt

    ms = getMorphemes(morphemizer, txt, tags)

    # Loop-invariant configuration, looked up once.
    proper_nouns_known = cfg('Option_ProperNounsAlreadyKnown')
    mature_threshold = cfg('threshold_mature')
    known_threshold = cfg('threshold_known')
    seen_threshold = cfg('threshold_seen')
    span_template = '<span class="morphHighlight" mtype="{mtype}" priority="{priority}" frequency="{frequency}" mat="{mat}">\\1</span>'

    for m in sorted(ms, key=lambda x: len(x.inflected),
                    reverse=True):  # largest subs first
        locs = allDb().getMatchingLocs(m)
        mat = max(loc.maturity for loc in locs) if locs else 0

        if proper_nouns_known and m.isProperNoun():
            mtype = 'mature'
        elif mat >= mature_threshold:
            mtype = 'mature'
        elif mat >= known_threshold:
            mtype = 'known'
        elif mat >= seen_threshold:
            mtype = 'seen'
        else:
            mtype = 'unknown'

        priority = 'true' if m in priority_db else 'false'

        focus_morph_string = m.show().split()[0]
        frequency = 'true' if focus_morph_string in frequency_words else 'false'

        repl = span_template.format(
            mtype=mtype, priority=priority, frequency=frequency, mat=mat)
        # Escape the morpheme so regex metacharacters in it are matched
        # literally instead of being interpreted as pattern syntax.
        txt = nonSpanSub('(%s)' % re.escape(m.inflected), repl, txt)
    return txt