def highlight(txt, extra, fieldDict, field, mod_field):
    """Wrap every morpheme of ``txt`` in a ``<span class="morphHighlight">``.

    When a field is marked with the 'focusMorph' command, we format it by
    wrapping all the morphemes in <span>s with attributes set to its maturity.
    Each span carries four attributes:

    * ``mtype``     -- 'mature', 'known', 'seen' or 'unknown', chosen by
      comparing the morpheme's highest maturity with the configured
      ``threshold_mature`` / ``threshold_known`` / ``threshold_seen`` values.
    * ``priority``  -- 'true' when the morpheme is in the priority database.
    * ``frequency`` -- 'true' when the first token of ``m.show()`` is listed
      in the frequency file (first tab-separated column of each line).
    * ``mat``       -- the highest maturity found (0 when there is no match).

    Args:
        txt: The field text to annotate.
        extra: Not used by this function.
        fieldDict: Mapping read for its 'Tags' (whitespace-separated) and
            'Type' entries.
        field: Not used by this function.
        mod_field: Not used by this function.

    Returns:
        The annotated text, or ``txt`` unchanged when no filter or no
        morphemizer applies.
    """
    from .util import getFilterByTagsAndType
    from .morphemizer import getMorphemizerByName
    from .morphemes import getMorphemes

    # must avoid formatting a smaller morph that is contained in a bigger morph
    # => do largest subs first and don't sub anything already in <span>
    def sub_outside_spans(literal, repl, string):
        # The morpheme is matched as literal text: escaping keeps regex
        # metacharacters inside it from being interpreted as pattern syntax.
        pattern = '(%s)' % re.escape(literal)
        return ''.join(
            piece if piece.startswith('<span')
            else re.sub(pattern, repl, piece, flags=re.IGNORECASE)
            for piece in re.split('(<span.*?</span>)', string))

    tags = fieldDict['Tags'].split()
    note_filter = getFilterByTagsAndType(fieldDict['Type'], tags)
    if note_filter is None:
        return txt
    morphemizer = getMorphemizerByName(note_filter['Morphemizer'])
    if morphemizer is None:
        return txt

    # The files are only read once we know there is something to highlight.
    # A set gives constant-time membership tests inside the loop below.
    try:
        with codecs.open(cfg1('path_frequency'), 'r', 'utf-8') as f:
            frequent = {line.strip().split('\t')[0] for line in f}
    except (OSError, ValueError):
        # User does not have a (readable) frequency.txt: nothing is frequent.
        frequent = set()

    priority_db = main.MorphDb(cfg1('path_priority'), ignoreErrors=True).db

    # Thresholds do not change while we iterate, so look them up once.
    threshold_mature = cfg1('threshold_mature')
    threshold_known = cfg1('threshold_known')
    threshold_seen = cfg1('threshold_seen')

    ms = getMorphemes(morphemizer, txt, tags)

    # largest subs first
    for m in sorted(ms, key=lambda x: len(x.inflected), reverse=True):
        if not m.inflected:
            # An empty pattern would match at every position of the text.
            continue

        locs = allDb().getMatchingLocs(m)
        mat = max(loc.maturity for loc in locs) if locs else 0

        if mat >= threshold_mature:
            mtype = 'mature'
        elif mat >= threshold_known:
            mtype = 'known'
        elif mat >= threshold_seen:
            mtype = 'seen'
        else:
            mtype = 'unknown'

        priority = 'true' if m in priority_db else 'false'
        frequency = 'true' if m.show().split()[0] in frequent else 'false'

        repl = ('<span class="morphHighlight" mtype="{mtype}" '
                'priority="{priority}" frequency="{frequency}" '
                'mat="{mat}">\\1</span>').format(
            mtype=mtype, priority=priority, frequency=frequency, mat=mat)
        txt = sub_outside_spans(m.inflected, repl, txt)

    return txt
def highlight(txt: str, field, filter: str, ctx) -> str:
    """Wrap every morpheme of ``txt`` in a ``<span class="morphHighlight">``.

    When a field is marked with the 'focusMorph' command, we format it by
    wrapping all the morphemes in <span>s with attributes set to its maturity.
    Each span carries four attributes:

    * ``mtype``     -- 'mature', 'known', 'seen' or 'unknown', chosen by
      comparing the morpheme's highest maturity with the configured
      ``threshold_mature`` / ``threshold_known`` / ``threshold_seen`` values.
      Proper nouns are reported as 'mature' when the
      ``Option_ProperNounsAlreadyKnown`` setting is truthy.
    * ``priority``  -- 'true' when the morpheme is in the priority database.
    * ``frequency`` -- 'true' when the first token of ``m.show()`` is listed
      in the frequency file (first tab-separated column of each line).
    * ``mat``       -- the highest maturity found (0 when there is no match).

    Args:
        txt: The field text to annotate.
        field: Not used by this function.
        filter: Name of the requested filter; anything other than
            "morphHighlight" leaves ``txt`` untouched.
        ctx: Object whose ``note()`` method yields the note being rendered.

    Returns:
        The annotated text, or ``txt`` unchanged when the filter name does
        not match or no note filter / morphemizer applies.
    """
    if filter != "morphHighlight":
        return txt

    from .util import getFilter
    from .morphemizer import getMorphemizerByName
    from .morphemes import getMorphemes

    # must avoid formatting a smaller morph that is contained in a bigger morph
    # => do largest subs first and don't sub anything already in <span>
    def sub_outside_spans(literal, repl, string):
        # The morpheme is matched as literal text: escaping keeps regex
        # metacharacters inside it from being interpreted as pattern syntax.
        pattern = '(%s)' % re.escape(literal)
        return ''.join(
            piece if piece.startswith('<span')
            else re.sub(pattern, repl, piece, flags=re.IGNORECASE)
            for piece in re.split('(<span.*?</span>)', string))

    note = ctx.note()
    tags = note.stringTags()
    # Kept in its own local so the ``filter`` argument is not rebound to an
    # unrelated object.
    note_filter = getFilter(note)
    if note_filter is None:
        return txt
    morphemizer = getMorphemizerByName(note_filter['Morphemizer'])
    if morphemizer is None:
        return txt

    # The files are only read once we know there is something to highlight.
    # A set gives constant-time membership tests inside the loop below.
    try:
        with codecs.open(cfg('path_frequency'), encoding='utf-8') as f:
            frequent = {line.strip().split('\t')[0] for line in f}
    except (OSError, ValueError):
        # No (readable) frequency file: nothing is flagged as frequent.
        frequent = set()

    priority_db = main.MorphDb(cfg('path_priority'), ignoreErrors=True).db

    # Settings do not change while we iterate, so look them up once.
    proper_nouns_known = cfg('Option_ProperNounsAlreadyKnown')
    threshold_mature = cfg('threshold_mature')
    threshold_known = cfg('threshold_known')
    threshold_seen = cfg('threshold_seen')

    ms = getMorphemes(morphemizer, txt, tags)

    # largest subs first
    for m in sorted(ms, key=lambda x: len(x.inflected), reverse=True):
        if not m.inflected:
            # An empty pattern would match at every position of the text.
            continue

        locs = allDb().getMatchingLocs(m)
        mat = max(loc.maturity for loc in locs) if locs else 0

        if proper_nouns_known and m.isProperNoun():
            mtype = 'mature'
        elif mat >= threshold_mature:
            mtype = 'mature'
        elif mat >= threshold_known:
            mtype = 'known'
        elif mat >= threshold_seen:
            mtype = 'seen'
        else:
            mtype = 'unknown'

        priority = 'true' if m in priority_db else 'false'
        frequency = 'true' if m.show().split()[0] in frequent else 'false'

        repl = ('<span class="morphHighlight" mtype="{mtype}" '
                'priority="{priority}" frequency="{frequency}" '
                'mat="{mat}">\\1</span>').format(
            mtype=mtype, priority=priority, frequency=frequency, mat=mat)
        txt = sub_outside_spans(m.inflected, repl, txt)

    return txt