def parse_ucd():
    """Fill the Unicode lookup tables from HTML5 entities and UnicodeData.txt.

    Writes to tables defined outside this function:

    * ``word_search_map`` -- lower-cased word -> set of code points,
    * ``name_map`` -- code point -> character name,
    * ``class_maps`` -- general category -> set of code points,
    * ``not_assigned`` -- every code point seen here is discarded from it,
    * ``marks`` -- code points whose category starts with ``'M'``.
    """

    def add_word(w, c):
        # Code points up to 32, 127 and 128-159 are never made searchable.
        excluded = c <= 32 or c == 127 or 128 <= c <= 159
        if not excluded:
            word_search_map[w.lower()].add(c)

    # Entity names that expand to exactly one character become search words.
    for entity_name, expansion in html5.items():
        if len(expansion) == 1:
            add_word(entity_name.rstrip(';'), ord(expansion))
    word_search_map['nnbsp'].add(0x202f)

    # Start of a "<..., First>" range that is waiting for its closing record.
    range_start = None
    for record in get_data('ucd/UnicodeData.txt'):
        fields = [field.strip() for field in record.split(';')]
        cp = int(fields[0], 16)
        label = fields[1]
        if label:
            name_map[cp] = label
            for token in label.lower().split():
                add_word(token, cp)
        category = fields[2]
        members = class_maps.setdefault(category, set())

        if range_start is None:
            if label.endswith(', First>'):
                # Remember where the range opens; the next record closes it.
                range_start = cp
                continue
            span = (cp, )
        else:
            span = range(range_start, cp + 1)
            range_start = None

        is_mark = category.startswith('M')
        for member in span:
            members.add(member)
            not_assigned.discard(member)
            if is_mark:
                marks.add(member)
def parse_ucd() -> None:
    """Fill the Unicode lookup tables from three data sources.

    Sources, in order: the HTML5 named entities, ``ucd/UnicodeData.txt``
    (read through ``get_data``) and ``nerd-fonts-glyphs.txt`` (opened
    relative to the current working directory).

    Writes to tables defined outside this function: ``word_search_map``,
    ``name_map``, ``class_maps``, ``not_assigned``, ``marks`` and
    ``all_symbols``.

    Raises:
        OSError: if ``nerd-fonts-glyphs.txt`` cannot be opened.
        ValueError: if a code point field is not valid hexadecimal, or a
            glyph line has fewer than three space-separated fields.
    """

    def add_word(w: str, c: int) -> None:
        # Code points up to 32, 127 and 128-159 are never made searchable.
        if c <= 32 or c == 127 or 128 <= c <= 159:
            return
        # Single-character words are not indexed.
        if len(w) > 1:
            word_search_map[w.lower()].add(c)

    # Start of a "<..., First>" range that is waiting for its closing record.
    first: Optional[int] = None
    for word, c in html5.items():
        if len(c) == 1:
            add_word(word.rstrip(';'), ord(c))
    word_search_map['nnbsp'].add(0x202f)
    for line in get_data('ucd/UnicodeData.txt'):
        parts = [x.strip() for x in line.split(';')]
        codepoint = int(parts[0], 16)
        # Fall back to field 10 when the primary name is empty or is the
        # '<control>' placeholder.
        # NOTE(review): field 10 is presumably the legacy Unicode 1.0 name
        # column of UnicodeData.txt -- confirm against UAX #44.
        name = parts[1] or parts[10]
        if name == '<control>':
            name = parts[10]
        if name:
            name_map[codepoint] = name
            for word in name.lower().split():
                add_word(word, codepoint)
        category = parts[2]
        s = class_maps.setdefault(category, set())
        desc = parts[1]
        codepoints: Union[Tuple[int, ...], Iterable[int]] = (codepoint, )
        if first is None:
            if desc.endswith(', First>'):
                # Range opener: remember it and wait for the closing record.
                first = codepoint
                continue
        else:
            # Range closer: expand to every code point in the range.
            codepoints = range(first, codepoint + 1)
            first = None
        for codepoint in codepoints:
            s.add(codepoint)
            not_assigned.discard(codepoint)
            if category.startswith('M'):
                marks.add(codepoint)
            elif category.startswith('S'):
                all_symbols.add(codepoint)

    # Explicit encoding so parsing does not depend on the platform locale.
    with open('nerd-fonts-glyphs.txt', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            # Each data line is "<hex code> <category> <name...>".
            code, category, name = line.split(' ', 2)
            codepoint = int(code, 16)
            # Names already recorded from UnicodeData.txt take precedence.
            if name and codepoint not in name_map:
                name_map[codepoint] = name.upper()
                for word in name.lower().split():
                    add_word(word, codepoint)

    # Some common synonyms
    word_search_map['bee'] |= word_search_map['honeybee']
    word_search_map['lambda'] |= word_search_map['lamda']
    word_search_map['lamda'] |= word_search_map['lambda']
    word_search_map['diamond'] |= word_search_map['gem']
def encode(string):
    """Return *string* with characters replaced by HTML5 named entities.

    The entity delimiters ``&`` and ``;`` are escaped first (to ``&amp;`` and
    ``&semi;``). After that, every value in ``html.entities.html5`` whose
    name ends with ``;`` and starts with a lower-case letter is replaced by
    ``'&' + name``, in dictionary order.

    Args:
        string: The text to encode.

    Returns:
        The encoded text.
    """
    # Escape both delimiters in a single pass. Two chained ``replace`` calls
    # would let the second one re-escape the ';' or '&' the first one just
    # emitted.
    string = string.translate({ord('&'): '&amp;', ord(';'): '&semi;'})
    for k, v in html5.items():
        # Names containing 'amp' or 'semi' are skipped: the delimiters are
        # already escaped, and replacing them again would corrupt the
        # entities inserted so far.
        if k[-1] == ';' and k[0].islower() and 'amp' not in k and 'semi' not in k:
            string = string.replace(v, '&' + k)
    return string
from hashlib import md5 import pymysql import re from typing import Union,Sequence import os from html.entities import html5 as htmlentities from fastapi.exceptions import HTTPException import sys htmlentities=dict(map(reversed,htmlentities.items())) for each in tuple(htmlentities.keys()): if len(each)>1: htmlentities.pop(each) continue if len(hex(ord(each)))>6: htmlentities.pop(each) pat="".join(htmlentities.keys()) pat=re.escape(pat) try: from config import * except: print(f"No config found! Initlizing...") db_host,db_user,db_passwd,db_db=[input(f"Setup MySQL {each}:") for each in ("Host","User","Password","DataBase")] host_ip=input("Setup Web Host IP:") host_port=input("Setup Web Host Port:") with open(sys.path[0]+"/config.py","w")as f: f.write(f'db_host="{db_host}"\ndb_user="******"\ndb_passwd="{db_passwd}"\ndb_db="{db_db}"\nhost_ip="{host_ip}"\nhost_port={host_port}') fake_db = {"*****@*****.**": {"nickname":"admin","password":md5(b"123456").hexdigest(),"favor":None,"avator":"data:image/jpg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD//gA7Q1JFQVRPUjogZ2QtanBlZyB2MS4wICh1c2luZyBJSkcgSlBFRyB2ODApLCBxdWFsaXR5ID0gOTAK/9sAQwADAgIDAgIDAwMDBAMDBAUIBQUEBAUKBwcGCAwKDAwLCgsLDQ4SEA0OEQ4LCxAWEBETFBUVFQwPFxgWFBgSFBUU/9sAQwEDBAQFBAUJBQUJFA0LDRQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQU/8AAEQgAZABkAwEiAAIRAQMRAf/EAB8AAAEFAQEBAQEBAAAAAAAAAAABAgMEBQYHCAkKC//EALUQAAIBAwMCBAMFBQQEAAABfQECAwAEEQUSITFBBhNRYQcicRQygZGhCCNCscEVUtHwJDNicoIJChYXGBkaJSYnKCkqNDU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6g4SFhoeIiYqSk5SVlpeYmZqio6Slpqeoqaqys7S1tre4ubrCw8TFxsfIycrS09TV1tfY2drh4uPk5ebn6Onq8fLz9PX29/j5+v/EAB8BAAMBAQEBAQEBAQEAAAAAAAABAgMEBQYHCAkKC//EALURAAIBAgQEAwQHBQQEAAECdwABAgMRBAUhMQYSQVEHYXETIjKBCBRCkaGxwQkjM1LwFWJy0QoWJDThJfEXGBkaJicoKSo1Njc4OTpDREVGR0hJSlNUVVZXWFlaY2RlZmdoaWpzdHV2d3h5eoKDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uLj5OXm5+jp6vLz9PX29/j5+v/aAAwDAQACEQMRAD8A/VGilx/nFJ/npQBwnxY+JEPw80R7h7i3tTsLvc3TYjgQdWPqewHf9D8val+2Ek9t59hr+p
X9tuJW4g0rzYSenIUA7fyIqb9vXwvrms+KPDl08UsvheC1mDYUmIXRUiJpAOynkE8Dd7mr3wN+M/hv4deDYtL1fQ47KwjjUR3qMp28cq+eVOeefXIJ7dUYpQ5rXO6EEoKVrs6v4Aftdaf491m30HWRHYXN4pNncCQtDM4I+VWOQN2RgbiQQRnla5f47fETUYRqkcM2wxyXRck4OUVwvXsEMbfiK+J/2nvjr4I1L4sprPw4t5bSFTIb2YAIl1OeFdQPvEYzvIBOR1wDXEN+1R4svFnF7eDVQxDI96vmSIQMcNnLcEjnPHHTitI09eZG8aMb8y08j6D8UeGrTRP2hPAVv4Du7i+vZJUaSeMn968TRF5fUBi0uSeqjnoa/U633iCMSEGTaNx98c1+OX7N/wC1xpnwz8aW1/deGbPVb64QxG5vbny2gAPCxuUwmexJI57V+rPwq+L+gfF3QItR0iSSCYoGmsLpQs8B/wBoZII9GBIORzWdZPQwxKbt2XU7f8aKMf5xRj/OK5ThDcMkZGfTNFVvsCm+NyWYnaFC54GM8479atf56UlcBM0UuP8AOKKYBg+tRTJKzxFHCqG+cEdR/nFSY96Me9AHm/x/+I2m/DD4cX+r6lZQ6ln93DaXChkkfaW+YHsApPvgDIzX48/Hr43av4nlkS2S20xblyZIdNto4I4k4wihVz1Dckknk9CAP0x/b5jD/Cy3UYeWSR4lQj+8F+Ye4YL+Bavxn13VJr++lPmHYH2Ko4BAGAfrzXZRStc9PDxSp83VmQqOSZCM9+akgOyVODgdvwrSs7I3QCBX3DDBVXII5zk/XFdZo/ww1PVrNr+OIfZwOWY4xwCePYY/OnKooayZ206E6r91HFxusUju0SyblIAbPGR1GD1FfSf7PH7R+r+A9ZtdQtpFeGwxusYFEZjh6Ps7FT1IPfnI+bd5HN8NL+xhlkuIyY0jjlDDkhHDHP4Y59DWRo88ujalaz/NGVwQ4GSAf5j29Mil7WNS9mdH1adO3tFoz96vhH8UNK+LXgyx13TJ0bzUHmxDho25HIPIzj/ODXa4PrX5xfsU+Mdb8I67Y6a7BtI1ORlsp0P7lnwCYyf4SRjg/rha/Ri2mFzBHKuQHUMAevIrlkrM8HE0fY1HHoS4PrRg+tJj3ox71Jyi496KTb70UAHFH5UvNJzQB8wf8FBXuLL4Lw6jaoJJbO8EmO33SQD9WCj8q/IXWLGG4124FpEwspZjLax45CP90Y9eg+or91f2gfCtp4u+EniO0vYhLDHaPPtzg/IN+QexG3IODyBwelfjn4y8GK73FjZW8guLN28veuMrnpkEj3rqpP3bHt4K04WfQwfCvhlyZsRFmCxoMdtxP+Ir3/wvoRttPs7CCNp2LHCBMtIzN6d88DFcp8KE01fs9tdxXMEz/u5VmQAJInBUjrgHH5ivc/hX4e+IXjL4/wBjp2iGDwd4M0yRJ47lrSG5m1NIyhkZ2OSiuC4G07l3BSeTjw8ROU5ON7W7n1cZww9PmUbq1zN+IPwrFol1YPGYL1rGISbByhaPdtIPoXOc+hrx34yfAq70DRdE1kWpt4b2BZUhGP3KHO1GAJI+XBycZ3e1fp3qnwdtdb8aNq1ysa2oKYjHLNtA5P8AnjAr538KfsgQfD7xf4r8UeJLy81bVdfvpYPtcs67ZYJBJKRsXgBPLjCggAYPGMBfPw1aSej1/M51mNKqkn21+fRfmzF/4J3aY+r22tabfRs8Vs8dzAZBnyZ4zzkHoTuX644xivvy3KKWiH3l5Ix0zzXzN+zd4Vj+GHizxBawWyuLtY5HKDBaIHHmx9sDIDJ1AwRnofp/nrXvQk5RTZ8pmTi8TJx20/IPyo/KjmoLO8F4jssciBXZP3i7ScHBIz29+9XfoeYT0Uc0UwEd/LRmPQDNZVp4ggkWVmdtoOQShGATgD861sUm0elAGLrk+m6xo93p9zKTBe27ROArZKOpB7ema8N1D9knwLqPjv
8A4Su2DGzU759HEX7tnI4w2QVXuRg9xkDgfQqTRGVgJA3OMDoD6fWq2o6pb2s8NpKjk3Hygp2zx160XfQ1hUlD4WfOnxu/Zx0bxn4YFz4b0qy0TX7NxNbTW0IhWQKuDE23joFwcZBRe2a8i/Zl+M1x4d1240q6laK3JJktJOhdeHVR/CwGSB/wHpgj6P8AGXjibwt4sg07C3mlq/lPbSNtcsV3BsjGcc8EYPTvmvBLvR01P4jaJJpCxL4gsp45IJoIhG0qq6/uZQOCu3OGxkDIzjIrzsTQ543ue9hcUvZvD11dSV15H16usT4uXk0e/hhgBbzW8phIOfuhXJP4gdfrjHv9DufFTQXeqB7G0t8zW1nFIC5JUjfIRkE4YgKMgZJJJI26MnixZdHuXfNneQu8bxx4kI2tyVOMHI5GR3GR2r56/az+JfiH4UfDfR9X0jUZFSO7FrdSw4PnRSKfJfvt4VgSMcjjqKypUaMZrl1b2PNoxnUlyppPY9Q0fRtQ8M+NtPESLJBM7qjsOCmDuA/Tj2r2QZr51+AP7SPhb4/+DbOzXVIdO8VWWxpbd3VJNynHmoOAQe4HqRjBFe+WU1zdWdtJKFglx++UcgEcHHtn9K9VKxz4jmUrTWq0L3NGDVbTb+LVLJLmBg8b5AZTkHBIyPbiq9rDfpqUzSyBrY52j8eKZzGjzRQRRQAZ9qZ5okTdFiQZxwffmnkEjFULOE6bHcIpaYL86oBz9P0oAzdO1OLStVl0u7dIHlkL2u84EoPYepGDxXNfE3xYng/U9P1C+vXstHRHElwqF0ilypHmAAnay5AOODU/xR8NL458OWkVvAYtaikS6sHml8k28g5DOQCcdiACenTqPnn41eF/FmuaBNbeKUvbGxLCOWe0uPMWTK5ADNnjg8bcDv2rSCUrM2got3bOTv8A4qJ8VPiJ4i1bTOdGtClvbXB4LyODucDsNq9Dzhge9df8OdRi8M+LLSaG0855UljidznZKQD5rnqcKsg/4FjjqPAvCl7o/wAJILiwhjvptOup/tNyZplmnjcDG8KFGVxjOPY17VoPinw1qmnW1/p+sWs8kf7wFZQMcYII6g4JGDWk6cZwcWVWlKNS8VpbT7j2eS9VIjGGZ3LFnkbqzEkk/Ukk1wt54bsfidpOseBddkL2cgayjLDd5UcpDQuo/vRyFBn+6cV88+Nf2/NO8OX17Yaf4SvL2eCV4PPuLyOOIsrFc/JvyOM4yKk/Zu/aXufiz8UhaanBbWWp3ciJZwWoO2RcYKcnJI+9n2PpSlRikmlsc9CFenebPDvFHwI8WfAbxqft8eq6dFZTeUNb0fcjJkcZK9UbBKn+IAjqrIvtHw6+L1/rGoWVl4p8d+IfEOgkhZYEvWjDLn7sndvr/Kv0b8R+FNF8V2ywazptrqMIBULcRhuGxkfQ4GR3rBv/AIW+A9RhGjt4e0aJ47fEcNvbRRSwxfdBTaAVXPHHFKNRdUenLGKpFKcde50nhr+zh4f04aR5baWIEFsYjlfLAAXB+laX4V4X8NNab4TeK9X8C30pn0C2niOn3x6wmcFhE4HQZBGemSOmePclUhmbcSD2PQVnJWZ58lZjvwoo5oqSBarynypg54UrtLenpn9anz7UHntQByd7u+1SFn35OQwPUdqWK9lQgFjInQo/II9Oa3ZNJtZfMITDt/ED0NY13p8lm53DKZwH9a0umgPlL9sH9lTUfF2i/wDCWeAIR9rtVMt1otvlXkGeXgx/EMZ2cZx8vPB/PDUtJ8RzJO0UD6hEjFJ4ljPmxMOCskf3lYEdR+JzX7d29y9s4KNj1HY18pftZfssP4/urjxv8PraODxikipqGmFlij1AcYkUlgBJgjv8wPZhye1ULJnXSnd2kfmU9/eQ/wCjzF0jJwUnyQPXrz+XNU9JkvbC9gvrSV7Ke3lEsU8LssgcYK7SD2I4I/OvUfiR4I8Y6Zri6Z4q0HWNKuYzkQXT8HjqmRtI9xn616F8Gf
2QvG3xRvIPs3h+fSdF3gS6xrA8uFEIzlF6yZ9FGM4yRXUqqS1OiSS1voevfs3ft6+NzqNt4f8AEGlDxLahFVJ4ZT9oU8A5J+/1HHXPf0+xvCvinV/HPia+uNQ8Kat4SjaBIrTUpUAkli+8yOuMphiT1wcj0rK+E3wX8J/A7w5Fp+jW8epaiJWmfVLqBPNDlQvyHblVGOFye/PJrtHvrueWM+a7uPlXFcspJu8Ucc7PVFqT4YaXFpd9b2yEz37q9ze3TGSVsHOST39OgFdNDczJqEdokZe3WEEznu2cYzWJBbaw9lND5Rfzhhmnbp9Oa6LTbX7FYwQE7mRQpI9agxbb3LPPvRQTRSEKKM0UUAMjiSNmKqFLHLY7mklcqjHAOB3ooqXotAK82nwS/MYwDjqvFYOo6fCdRWDafLu7N/MB7FCoUj0P7w8+w9KKK48X/Cf9dTSm/eOWOvX9qkKJcswwrgyAMQSAepFP8FX8/i+xm/tRzcF4EPHy7TlhkY6fdB+tFFcdaTdPc9KaSV0tTrfDWmW93pFldzRiSWWBHbd0yQCTirunXOdZv7NYokihCMuxcEkjnNFFevHZHly3ZfEjfa2T+HZn9ag0qLyllO933SE/Mc4ooqN5L5kl4nFFFFbAf//Z"}}
'&dgr;': 'δ', '&ggr;': 'γ', '&Sgr;': 'Σ', '&rgr;': 'ρ', '&zgr;': 'ζ', '&Circlesolid;': '◯', '\': ' ' } utf_encodings = { i[0]: str(i[1].encode('UTF-8')).replace('b', '').replace('\\', '').replace('\'', '') for i in entity_characters_mapping.items() } html5_entity_mappings = {f'&{key}': value for key, value in html5.items()} utf_encodings = {**html5_entity_mappings, **utf_encodings} def map_entity_characters(string, mapping_dictionary): mapped_string = string patent_entity_chars = list(set(re.findall(r"&\w+;", string))) for entity in patent_entity_chars: try: mapped_string = re.sub(entity, mapping_dictionary[entity], mapped_string) except: mapped_string = re.sub(entity, entity.replace('&', '"').replace(';', '"'), mapped_string) return mapped_string
# Some of the named characters appear both with and without a trailing semicolon; # see https://docs.python.org/3.9/library/html.entities.html#html.entities.html5 if not named_character.endswith(";"): return False if len(unicode_character) != 1: return False # The character should be escaped (and therefore included in the dict) if it's is not ASCII. # Might have to tweak the condition below, e.g. by checking if the character is part of `string.printable` instead. should_escape = ord(unicode_character) >= 128 return should_escape ESCAPE_UNICODE_TO_HTML5 = { unicode_character: f"&{named_character}" for named_character, unicode_character in html5.items() if _should_include_escape_entry(unicode_character, named_character) } def escape_to_named_characters(string: str): return "".join(ESCAPE_UNICODE_TO_HTML5.get(c, c) for c in string) def block_join(object_collection, sep="<b>•</b>", multiline=True): if len(object_collection) == 0: return "" tag = '<div style="display: inline-block; white-space: nowrap;">' if multiline: return format_html_join(
def _populate_class_variables(): """Initialize variables used by this class to manage the plethora of HTML5 named entities. This function returns a 3-tuple containing two dictionaries and a regular expression: unicode_to_name - A mapping of Unicode strings like "⦨" to entity names like "angmsdaa". When a single Unicode string has multiple entity names, we try to choose the most commonly-used name. name_to_unicode: A mapping of entity names like "angmsdaa" to Unicode strings like "⦨". named_entity_re: A regular expression matching (almost) any Unicode string that corresponds to an HTML5 named entity. """ unicode_to_name = {} name_to_unicode = {} short_entities = set() long_entities_by_first_character = defaultdict(set) for name_with_semicolon, character in sorted(html5.items()): # "It is intentional, for legacy compatibility, that many # code points have multiple character reference names. For # example, some appear both with and without the trailing # semicolon, or with different capitalizations." # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references # # The parsers are in charge of handling (or not) character # references with no trailing semicolon, so we remove the # semicolon whenever it appears. if name_with_semicolon.endswith(';'): name = name_with_semicolon[:-1] else: name = name_with_semicolon # When parsing HTML, we want to recognize any known named # entity and convert it to a sequence of Unicode # characters. if name not in name_to_unicode: name_to_unicode[name] = character # When _generating_ HTML, we want to recognize special # character sequences that _could_ be converted to named # entities. unicode_to_name[character] = name # We also need to build a regular expression that lets us # _find_ those characters in output strings so we can # replace them. # # This is tricky, for two reasons. 
if (len(character) == 1 and ord(character) < 128 and character not in '<>&'): # First, it would be annoying to turn single ASCII # characters like | into named entities like # |. The exceptions are <>&, which we _must_ # turn into named entities to produce valid HTML. continue if len(character) > 1 and all(ord(x) < 128 for x in character): # We also do not want to turn _combinations_ of ASCII # characters like 'fj' into named entities like 'fj', # though that's more debateable. continue # Second, some named entities have a Unicode value that's # a subset of the Unicode value for some _other_ named # entity. As an example, \u2267' is ≧, # but '\u2267\u0338' is ≧̸. Our regular # expression needs to match the first two characters of # "\u2267\u0338foo", but only the first character of # "\u2267foo". # # In this step, we build two sets of characters that # _eventually_ need to go into the regular expression. But # we won't know exactly what the regular expression needs # to look like until we've gone through the entire list of # named entities. if len(character) == 1: short_entities.add(character) else: long_entities_by_first_character[character[0]].add(character) # Now that we've been through the entire list of entities, we # can create a regular expression that matches any of them. particles = set() for short in short_entities: long_versions = long_entities_by_first_character[short] if not long_versions: particles.add(short) else: ignore = "".join([x[1] for x in long_versions]) # This finds, e.g. \u2267 but only if it is _not_ # followed by \u0338. particles.add("%s(?![%s])" % (short, ignore)) for long_entities in list(long_entities_by_first_character.values()): for long_entity in long_entities: particles.add(long_entity) re_definition = "(%s)" % "|".join(particles) # If an entity shows up in both html5 and codepoint2name, it's # likely that HTML5 gives it several different names, such as # 'rsquo' and 'rsquor'. 
When converting Unicode characters to # named entities, the codepoint2name name should take # precedence where possible, since that's the more easily # recognizable one. for codepoint, name in list(codepoint2name.items()): character = chr(codepoint) unicode_to_name[character] = name return unicode_to_name, name_to_unicode, re.compile(re_definition)