Пример #1
0
def parse_ucd():
    """Fill the module-level Unicode tables from HTML5 entities and UnicodeData.txt.

    Side effects only (nothing is returned):

    * ``word_search_map[word]`` collects the codepoints whose entity name
      or name field contains ``word`` (lower-cased).
    * ``name_map[codepoint]`` receives the non-empty name field of each
      UnicodeData.txt record.
    * ``class_maps[category]`` collects the codepoints of each category,
      with ``<..., First>`` / following-record pairs expanded to the full
      inclusive range.
    * Every codepoint seen is discarded from ``not_assigned``; those whose
      category starts with ``M`` are added to ``marks``.
    """
    def add_word(w, c):
        # Codepoints 0-32, 127 and 128-159 are never indexed.
        if c > 32 and c != 127 and not 128 <= c <= 159:
            word_search_map[w.lower()].add(c)

    # Index single-character HTML5 entities by their name (without ';').
    for entity, text in html5.items():
        if len(text) == 1:
            add_word(entity.rstrip(';'), ord(text))
    word_search_map['nnbsp'].add(0x202f)

    range_start = None  # set while waiting for the record that closes a range
    for line in get_data('ucd/UnicodeData.txt'):
        fields = [field.strip() for field in line.split(';')]
        codepoint = int(fields[0], 16)
        name = fields[1]
        if name:
            name_map[codepoint] = name
            for word in name.lower().split():
                add_word(word, codepoint)
        category = fields[2]
        members = class_maps.setdefault(category, set())

        if range_start is not None:
            # This record closes the range opened by the previous one.
            covered = range(range_start, codepoint + 1)
            range_start = None
        elif name.endswith(', First>'):
            # Remember the start; the next record supplies the end.
            range_start = codepoint
            continue
        else:
            covered = (codepoint, )

        is_mark = category.startswith('M')
        for cp in covered:
            members.add(cp)
            not_assigned.discard(cp)
            if is_mark:
                marks.add(cp)
Пример #2
0
def parse_ucd() -> None:
    """Fill the module-level Unicode tables from HTML5 entities, UCD and nerd-font data.

    Side effects only:

    * ``word_search_map[word]`` collects codepoints whose names contain
      ``word`` (lower-cased, words of length 1 are ignored).
    * ``name_map[codepoint]`` receives a name for each UnicodeData.txt
      record (field 1, or field 10 when field 1 is empty or ``<control>``),
      plus upper-cased names from ``nerd-fonts-glyphs.txt`` for codepoints
      that have no name yet.
    * ``class_maps[category]`` collects codepoints per category, with
      ``<..., First>`` / following-record pairs expanded to the inclusive
      range.
    * Seen codepoints are discarded from ``not_assigned``; categories
      starting with ``M`` go to ``marks``, those starting with ``S`` to
      ``all_symbols``.
    * A few search synonyms are merged at the end.
    """
    def add_word(w: str, c: int) -> None:
        # Codepoints 0-32, 127 and 128-159 are never indexed, nor are
        # one-letter words.
        excluded = c <= 32 or c == 127 or 128 <= c <= 159
        if not excluded and len(w) > 1:
            word_search_map[w.lower()].add(c)

    # Index single-character HTML5 entities by their name (without ';').
    for entity, text in html5.items():
        if len(text) == 1:
            add_word(entity.rstrip(';'), ord(text))
    word_search_map['nnbsp'].add(0x202f)

    range_start: Optional[int] = None  # set while a First/Last range is open
    for line in get_data('ucd/UnicodeData.txt'):
        fields = [field.strip() for field in line.split(';')]
        codepoint = int(fields[0], 16)
        desc = fields[1]
        name = desc
        if not name or name == '<control>':
            # Fall back to field 10 when field 1 carries no usable name.
            name = fields[10]
        if name:
            name_map[codepoint] = name
            for word in name.lower().split():
                add_word(word, codepoint)
        category = fields[2]
        members = class_maps.setdefault(category, set())

        codepoints: Union[Tuple[int, ...], Iterable[int]]
        if range_start is not None:
            # This record closes the range opened by the previous one.
            codepoints = range(range_start, codepoint + 1)
            range_start = None
        elif desc.endswith(', First>'):
            # Remember the start; the next record supplies the end.
            range_start = codepoint
            continue
        else:
            codepoints = (codepoint, )

        is_mark = category.startswith('M')
        is_symbol = category.startswith('S')
        for cp in codepoints:
            members.add(cp)
            not_assigned.discard(cp)
            if is_mark:
                marks.add(cp)
            elif is_symbol:
                all_symbols.add(cp)

    # Each useful line is "<hex code> <category> <name...>".
    with open('nerd-fonts-glyphs.txt') as f:
        for raw in f:
            entry = raw.strip()
            if entry and not entry.startswith('#'):
                code, _glyph_category, glyph_name = entry.split(' ', 2)
                glyph = int(code, 16)
                # Names already recorded from the UCD take precedence.
                if glyph_name and glyph not in name_map:
                    name_map[glyph] = glyph_name.upper()
                    for word in glyph_name.lower().split():
                        add_word(word, glyph)

    # Some common synonyms
    for target, source in (
        ('bee', 'honeybee'),
        ('lambda', 'lamda'),
        ('lamda', 'lambda'),
        ('diamond', 'gem'),
    ):
        word_search_map[target] |= word_search_map[source]
Пример #3
0
def encode(string):
    """Return ``string`` with characters replaced by HTML5 named entities.

    ``&`` and ``;`` become ``&amp;`` and ``&semi;``. Every other value in
    ``html5`` whose entity name ends with ``;`` and starts with a
    lower-case letter is replaced by ``&name``; when several names share
    a value, the first one in ``html5`` iteration order is used.

    Args:
        string: Text to encode.

    Returns:
        The encoded text; ``html.unescape`` turns it back into ``string``.
    """
    # '&' and ';' delimit entities themselves, so both are escaped in a
    # single pass: escaping them one after the other would re-escape the
    # delimiter that the first replacement just inserted.
    string = string.translate({ord('&'): '&amp;', ord(';'): '&semi;'})
    for name, value in html5.items():
        # Only semicolon-terminated names that start lower-case are used.
        if not name.endswith(';') or not name[0].islower():
            continue
        # Already escaped above; replacing them again would corrupt the
        # entity markup produced so far.
        if value in ('&', ';'):
            continue
        string = string.replace(value, '&' + name)
    return string
Пример #4
0
from hashlib import md5
import pymysql
import re
from typing import Union,Sequence
import os
from html.entities import html5 as htmlentities
from fastapi.exceptions import HTTPException
import sys

# Invert the html5 table into character -> entity name. Only values that
# are a single character with a code point of at most 0xFFFF are kept;
# when several names share a character, the name seen last wins.
htmlentities = {
    char: name
    for name, char in htmlentities.items()
    if len(char) == 1 and ord(char) <= 0xFFFF
}
# All retained characters concatenated, then regex-escaped.
pat = re.escape("".join(htmlentities))

try:
    from config import *
except ImportError:
    # No config module yet: ask for the settings interactively and persist
    # them as config.py next to the script (sys.path[0]). Only ImportError
    # is handled so that Ctrl-C or an error inside an existing config.py
    # is not mistaken for "no config".
    print("No config found! Initlizing...")
    db_host, db_user, db_passwd, db_db = [
        input(f"Setup MySQL {each}:")
        for each in ("Host", "User", "Password", "DataBase")
    ]
    host_ip = input("Setup Web Host IP:")
    # Converted right away so the value has the same type on this run as
    # when it is later imported from config.py; a non-numeric port raises
    # ValueError before anything is written.
    host_port = int(input("Setup Web Host Port:"))
    with open(os.path.join(sys.path[0], "config.py"), "w", encoding="utf-8") as f:
        # repr() quoting keeps the generated file valid Python even when a
        # value contains quotes or backslashes.
        f.write(
            f"db_host={db_host!r}\n"
            f"db_user={db_user!r}\n"
            f"db_passwd={db_passwd!r}\n"
            f"db_db={db_db!r}\n"
            f"host_ip={host_ip!r}\n"
            f"host_port={host_port}"
        )


fake_db = {"*****@*****.**": {"nickname":"admin","password":md5(b"123456").hexdigest(),"favor":None,"avator":"data:image/jpg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD//gA7Q1JFQVRPUjogZ2QtanBlZyB2MS4wICh1c2luZyBJSkcgSlBFRyB2ODApLCBxdWFsaXR5ID0gOTAK/9sAQwADAgIDAgIDAwMDBAMDBAUIBQUEBAUKBwcGCAwKDAwLCgsLDQ4SEA0OEQ4LCxAWEBETFBUVFQwPFxgWFBgSFBUU/9sAQwEDBAQFBAUJBQUJFA0LDRQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQU/8AAEQgAZABkAwEiAAIRAQMRAf/EAB8AAAEFAQEBAQEBAAAAAAAAAAABAgMEBQYHCAkKC//EALUQAAIBAwMCBAMFBQQEAAABfQECAwAEEQUSITFBBhNRYQcicRQygZGhCCNCscEVUtHwJDNicoIJChYXGBkaJSYnKCkqNDU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6g4SFhoeIiYqSk5SVlpeYmZqio6Slpqeoqaqys7S1tre4ubrCw8TFxsfIycrS09TV1tfY2drh4uPk5ebn6Onq8fLz9PX29/j5+v/EAB8BAAMBAQEBAQEBAQEAAAAAAAABAgMEBQYHCAkKC//EALURAAIBAgQEAwQHBQQEAAECdwABAgMRBAUhMQYSQVEHYXETIjKBCBRCkaGxwQkjM1LwFWJy0QoWJDThJfEXGBkaJicoKSo1Njc4OTpDREVGR0hJSlNUVVZXWFlaY2RlZmdoaWpzdHV2d3h5eoKDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uLj5OXm5+jp6vLz9PX29/j5+v/aAAwDAQACEQMRAD8A/VGilx/nFJ/npQBwnxY+JEPw80R7h7i3tTsLvc3TYjgQdWPqewHf9D8val+2Ek9t59hr+pX9tuJW4g0rzYSenIUA7fyIqb9vXwvrms+KPDl08UsvheC1mDYUmIXRUiJpAOynkE8Dd7mr3wN+M/hv4deDYtL1fQ47KwjjUR3qMp28cq+eVOeefXIJ7dUYpQ5rXO6EEoKVrs6v4Aftdaf491m30HWRHYXN4pNncCQtDM4I+VWOQN2RgbiQQRnla5f47fETUYRqkcM2wxyXRck4OUVwvXsEMbfiK+J/2nvjr4I1L4sprPw4t5bSFTIb2YAIl1OeFdQPvEYzvIBOR1wDXEN+1R4svFnF7eDVQxDI96vmSIQMcNnLcEjnPHHTitI09eZG8aMb8y08j6D8UeGrTRP2hPAVv4Du7i+vZJUaSeMn968TRF5fUBi0uSeqjnoa/U633iCMSEGTaNx98c1+OX7N/wC1xpnwz8aW1/deGbPVb64QxG5vbny2gAPCxuUwmexJI57V+rPwq+L+gfF3QItR0iSSCYoGmsLpQs8B/wBoZII9GBIORzWdZPQwxKbt2XU7f8aKMf5xRj/OK5ThDcMkZGfTNFVvsCm+NyWYnaFC54GM8479atf56UlcBM0UuP8AOKKYBg+tRTJKzxFHCqG+cEdR/nFSY96Me9AHm/x/+I2m/DD4cX+r6lZQ6ln93DaXChkkfaW+YHsApPvgDIzX48/Hr43av4nlkS2S20xblyZIdNto4I4k4wihVz1Dckknk9CAP0x/b5jD/Cy3UYeWSR4lQj+8F+Ye4YL+Bavxn13VJr++lPmHYH2Ko4BAGAfrzXZRStc9PDxSp83VmQqOSZCM9+akgOyVODgdvwrSs7I3QCBX3DDBVXII5zk/XFdZo/ww1PVrNr+OIfZwOWY4xwCePYY/OnKooayZ206E6r91HFxusUju0Syb
lIAbPGR1GD1FfSf7PH7R+r+A9ZtdQtpFeGwxusYFEZjh6Ps7FT1IPfnI+bd5HN8NL+xhlkuIyY0jjlDDkhHDHP4Y59DWRo88ujalaz/NGVwQ4GSAf5j29Mil7WNS9mdH1adO3tFoz96vhH8UNK+LXgyx13TJ0bzUHmxDho25HIPIzj/ODXa4PrX5xfsU+Mdb8I67Y6a7BtI1ORlsp0P7lnwCYyf4SRjg/rha/Ri2mFzBHKuQHUMAevIrlkrM8HE0fY1HHoS4PrRg+tJj3ox71Jyi496KTb70UAHFH5UvNJzQB8wf8FBXuLL4Lw6jaoJJbO8EmO33SQD9WCj8q/IXWLGG4124FpEwspZjLax45CP90Y9eg+or91f2gfCtp4u+EniO0vYhLDHaPPtzg/IN+QexG3IODyBwelfjn4y8GK73FjZW8guLN28veuMrnpkEj3rqpP3bHt4K04WfQwfCvhlyZsRFmCxoMdtxP+Ir3/wvoRttPs7CCNp2LHCBMtIzN6d88DFcp8KE01fs9tdxXMEz/u5VmQAJInBUjrgHH5ivc/hX4e+IXjL4/wBjp2iGDwd4M0yRJ47lrSG5m1NIyhkZ2OSiuC4G07l3BSeTjw8ROU5ON7W7n1cZww9PmUbq1zN+IPwrFol1YPGYL1rGISbByhaPdtIPoXOc+hrx34yfAq70DRdE1kWpt4b2BZUhGP3KHO1GAJI+XBycZ3e1fp3qnwdtdb8aNq1ysa2oKYjHLNtA5P8AnjAr538KfsgQfD7xf4r8UeJLy81bVdfvpYPtcs67ZYJBJKRsXgBPLjCggAYPGMBfPw1aSej1/M51mNKqkn21+fRfmzF/4J3aY+r22tabfRs8Vs8dzAZBnyZ4zzkHoTuX644xivvy3KKWiH3l5Ix0zzXzN+zd4Vj+GHizxBawWyuLtY5HKDBaIHHmx9sDIDJ1AwRnofp/nrXvQk5RTZ8pmTi8TJx20/IPyo/KjmoLO8F4jssciBXZP3i7ScHBIz29+9XfoeYT0Uc0UwEd/LRmPQDNZVp4ggkWVmdtoOQShGATgD861sUm0elAGLrk+m6xo93p9zKTBe27ROArZKOpB7ema8N1D9knwLqPjv8A4Su2DGzU759HEX7tnI4w2QVXuRg9xkDgfQqTRGVgJA3OMDoD6fWq2o6pb2s8NpKjk3Hygp2zx160XfQ1hUlD4WfOnxu/Zx0bxn4YFz4b0qy0TX7NxNbTW0IhWQKuDE23joFwcZBRe2a8i/Zl+M1x4d1240q6laK3JJktJOhdeHVR/CwGSB/wHpgj6P8AGXjibwt4sg07C3mlq/lPbSNtcsV3BsjGcc8EYPTvmvBLvR01P4jaJJpCxL4gsp45IJoIhG0qq6/uZQOCu3OGxkDIzjIrzsTQ543ue9hcUvZvD11dSV15H16usT4uXk0e/hhgBbzW8phIOfuhXJP4gdfrjHv9DufFTQXeqB7G0t8zW1nFIC5JUjfIRkE4YgKMgZJJJI26MnixZdHuXfNneQu8bxx4kI2tyVOMHI5GR3GR2r56/az+JfiH4UfDfR9X0jUZFSO7FrdSw4PnRSKfJfvt4VgSMcjjqKypUaMZrl1b2PNoxnUlyppPY9Q0fRtQ8M+NtPESLJBM7qjsOCmDuA/Tj2r2QZr51+AP7SPhb4/+DbOzXVIdO8VWWxpbd3VJNynHmoOAQe4HqRjBFe+WU1zdWdtJKFglx++UcgEcHHtn9K9VKxz4jmUrTWq0L3NGDVbTb+LVLJLmBg8b5AZTkHBIyPbiq9rDfpqUzSyBrY52j8eKZzGjzRQRRQAZ9qZ5okTdFiQZxwffmnkEjFULOE6bHcIpaYL86oBz9P0oAzdO1OLStVl0u7dIHlkL2u84EoPYepGDxXNfE3xYng/U9P1C+vXstHRHElwqF0ilypHmAAnay5AOODU/xR8NL458OWkVvAYtaikS6sHml8k28g5DOQCcdiACenTqPnn41eF/FmuaBNbeKUvb
GxLCOWe0uPMWTK5ADNnjg8bcDv2rSCUrM2got3bOTv8A4qJ8VPiJ4i1bTOdGtClvbXB4LyODucDsNq9Dzhge9df8OdRi8M+LLSaG0855UljidznZKQD5rnqcKsg/4FjjqPAvCl7o/wAJILiwhjvptOup/tNyZplmnjcDG8KFGVxjOPY17VoPinw1qmnW1/p+sWs8kf7wFZQMcYII6g4JGDWk6cZwcWVWlKNS8VpbT7j2eS9VIjGGZ3LFnkbqzEkk/Ukk1wt54bsfidpOseBddkL2cgayjLDd5UcpDQuo/vRyFBn+6cV88+Nf2/NO8OX17Yaf4SvL2eCV4PPuLyOOIsrFc/JvyOM4yKk/Zu/aXufiz8UhaanBbWWp3ciJZwWoO2RcYKcnJI+9n2PpSlRikmlsc9CFenebPDvFHwI8WfAbxqft8eq6dFZTeUNb0fcjJkcZK9UbBKn+IAjqrIvtHw6+L1/rGoWVl4p8d+IfEOgkhZYEvWjDLn7sndvr/Kv0b8R+FNF8V2ywazptrqMIBULcRhuGxkfQ4GR3rBv/AIW+A9RhGjt4e0aJ47fEcNvbRRSwxfdBTaAVXPHHFKNRdUenLGKpFKcde50nhr+zh4f04aR5baWIEFsYjlfLAAXB+laX4V4X8NNab4TeK9X8C30pn0C2niOn3x6wmcFhE4HQZBGemSOmePclUhmbcSD2PQVnJWZ58lZjvwoo5oqSBarynypg54UrtLenpn9anz7UHntQByd7u+1SFn35OQwPUdqWK9lQgFjInQo/II9Oa3ZNJtZfMITDt/ED0NY13p8lm53DKZwH9a0umgPlL9sH9lTUfF2i/wDCWeAIR9rtVMt1otvlXkGeXgx/EMZ2cZx8vPB/PDUtJ8RzJO0UD6hEjFJ4ljPmxMOCskf3lYEdR+JzX7d29y9s4KNj1HY18pftZfssP4/urjxv8PraODxikipqGmFlij1AcYkUlgBJgjv8wPZhye1ULJnXSnd2kfmU9/eQ/wCjzF0jJwUnyQPXrz+XNU9JkvbC9gvrSV7Ke3lEsU8LssgcYK7SD2I4I/OvUfiR4I8Y6Zri6Z4q0HWNKuYzkQXT8HjqmRtI9xn616F8Gf2QvG3xRvIPs3h+fSdF3gS6xrA8uFEIzlF6yZ9FGM4yRXUqqS1OiSS1voevfs3ft6+NzqNt4f8AEGlDxLahFVJ4ZT9oU8A5J+/1HHXPf0+xvCvinV/HPia+uNQ8Kat4SjaBIrTUpUAkli+8yOuMphiT1wcj0rK+E3wX8J/A7w5Fp+jW8epaiJWmfVLqBPNDlQvyHblVGOFye/PJrtHvrueWM+a7uPlXFcspJu8Ucc7PVFqT4YaXFpd9b2yEz37q9ze3TGSVsHOST39OgFdNDczJqEdokZe3WEEznu2cYzWJBbaw9lND5Rfzhhmnbp9Oa6LTbX7FYwQE7mRQpI9agxbb3LPPvRQTRSEKKM0UUAMjiSNmKqFLHLY7mklcqjHAOB3ooqXotAK82nwS/MYwDjqvFYOo6fCdRWDafLu7N/MB7FCoUj0P7w8+w9KKK48X/Cf9dTSm/eOWOvX9qkKJcswwrgyAMQSAepFP8FX8/i+xm/tRzcF4EPHy7TlhkY6fdB+tFFcdaTdPc9KaSV0tTrfDWmW93pFldzRiSWWBHbd0yQCTirunXOdZv7NYokihCMuxcEkjnNFFevHZHly3ZfEjfa2T+HZn9ag0qLyllO933SE/Mc4ooqN5L5kl4nFFFFbAf//Z"}}
Пример #5
0
    '&dgr;': 'δ',
    '&ggr;': 'γ',
    '&Sgr;': 'Σ',
    '&rgr;': 'ρ',
    '&zgr;': 'ζ',
    '&Circlesolid;': '◯',
    '&bsol;': ' '
}

# Map each custom entity to the repr of its UTF-8 bytes with every 'b',
# backslash and single quote removed.
# NOTE(review): removing every 'b' also drops the hex digit 'b' inside
# escapes such as \xb4 -- confirm whether only the b'' prefix was meant.
utf_encodings = {
    entity: str(character.encode('UTF-8'))
    .replace('b', '')
    .replace('\\', '')
    .replace('\'', '')
    for entity, character in entity_characters_mapping.items()
}
# html5 keys carry no leading '&'; add it so they look like the custom keys.
html5_entity_mappings = {'&' + key: value for key, value in html5.items()}
# Custom entries take precedence over the html5 ones on key clashes.
utf_encodings = {**html5_entity_mappings, **utf_encodings}


def map_entity_characters(string, mapping_dictionary):
    """Replace every ``&name;`` entity in ``string`` using ``mapping_dictionary``.

    Args:
        string: Text that may contain entities of the form ``&\\w+;``.
        mapping_dictionary: Mapping from the full entity text (including
            ``&`` and ``;``) to its replacement string.

    Returns:
        The text with each known entity replaced by its mapped value and
        each unknown entity ``&name;`` replaced by ``"name"``.
    """
    def _replacement(match):
        entity = match.group(0)
        try:
            return mapping_dictionary[entity]
        except KeyError:
            # Unknown entity: keep its name, wrapped in double quotes.
            return '"' + entity[1:-1] + '"'

    # A callable replacement is inserted verbatim, so mapped values that
    # contain backslashes are not parsed as regex replacement templates.
    # The single left-to-right pass also means inserted text is never
    # rescanned, which keeps the result independent of entity order.
    return re.sub(r"&\w+;", _replacement, string)
Пример #6
0
    # Some of the named characters appear both with and without a trailing semicolon;
    # see https://docs.python.org/3.9/library/html.entities.html#html.entities.html5
    if not named_character.endswith(";"):
        return False

    if len(unicode_character) != 1:
        return False
    # The character should be escaped (and therefore included in the dict) if it's is not ASCII.
    # Might have to tweak the condition below, e.g. by checking if the character is part of `string.printable` instead.
    should_escape = ord(unicode_character) >= 128
    return should_escape


# Character -> "&name" lookup built from every html5 entry accepted by
# _should_include_escape_entry; when several names share a character, the
# one iterated last is kept.
ESCAPE_UNICODE_TO_HTML5 = {
    char: "&" + entity_name
    for entity_name, char in html5.items()
    if _should_include_escape_entry(char, entity_name)
}


def escape_to_named_characters(string: str):
    """Return ``string`` with each character that has an entry in
    ESCAPE_UNICODE_TO_HTML5 replaced by that entry; other characters are
    copied unchanged."""
    pieces = []
    for char in string:
        pieces.append(ESCAPE_UNICODE_TO_HTML5.get(char, char))
    return "".join(pieces)


def block_join(object_collection, sep="<b>&bull;</b>", multiline=True):
    if len(object_collection) == 0:
        return ""

    tag = '<div style="display: inline-block; white-space: nowrap;">'
    if multiline:
        return format_html_join(
Пример #7
0
    def _populate_class_variables():
        """Initialize variables used by this class to manage the plethora of
        HTML5 named entities.

        This function returns a 3-tuple containing two dictionaries
        and a regular expression:

        unicode_to_name - A mapping of Unicode strings like "⦨" to
        entity names like "angmsdaa". When a single Unicode string has
        multiple entity names, we try to choose the most commonly-used
        name.

        name_to_unicode: A mapping of entity names like "angmsdaa" to 
        Unicode strings like "⦨".

        named_entity_re: A regular expression matching (almost) any
        Unicode string that corresponds to an HTML5 named entity.
        """
        unicode_to_name = {}
        name_to_unicode = {}

        short_entities = set()
        long_entities_by_first_character = defaultdict(set)
        
        for name_with_semicolon, character in sorted(html5.items()):
            # "It is intentional, for legacy compatibility, that many
            # code points have multiple character reference names. For
            # example, some appear both with and without the trailing
            # semicolon, or with different capitalizations."
            # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
            #
            # The parsers are in charge of handling (or not) character
            # references with no trailing semicolon, so we remove the
            # semicolon whenever it appears.
            if name_with_semicolon.endswith(';'):
                name = name_with_semicolon[:-1]
            else:
                name = name_with_semicolon

            # When parsing HTML, we want to recognize any known named
            # entity and convert it to a sequence of Unicode
            # characters.
            if name not in name_to_unicode:
                name_to_unicode[name] = character

            # When _generating_ HTML, we want to recognize special
            # character sequences that _could_ be converted to named
            # entities.
            unicode_to_name[character] = name

            # We also need to build a regular expression that lets us
            # _find_ those characters in output strings so we can
            # replace them.
            #
            # This is tricky, for two reasons.

            if (len(character) == 1 and ord(character) < 128
                and character not in '<>&'):
                # First, it would be annoying to turn single ASCII
                # characters like | into named entities like
                # &verbar;. The exceptions are <>&, which we _must_
                # turn into named entities to produce valid HTML.
                continue

            if len(character) > 1 and all(ord(x) < 128 for x in character):
                # We also do not want to turn _combinations_ of ASCII
                # characters like 'fj' into named entities like '&fjlig;',
                # though that's more debateable.
                continue

            # Second, some named entities have a Unicode value that's
            # a subset of the Unicode value for some _other_ named
            # entity.  As an example, \u2267' is &GreaterFullEqual;,
            # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
            # expression needs to match the first two characters of
            # "\u2267\u0338foo", but only the first character of
            # "\u2267foo".
            #
            # In this step, we build two sets of characters that
            # _eventually_ need to go into the regular expression. But
            # we won't know exactly what the regular expression needs
            # to look like until we've gone through the entire list of
            # named entities.
            if len(character) == 1:
                short_entities.add(character)
            else:
                long_entities_by_first_character[character[0]].add(character)

        # Now that we've been through the entire list of entities, we
        # can create a regular expression that matches any of them.
        particles = set()
        for short in short_entities:
            long_versions = long_entities_by_first_character[short]
            if not long_versions:
                particles.add(short)
            else:
                ignore = "".join([x[1] for x in long_versions])
                # This finds, e.g. \u2267 but only if it is _not_
                # followed by \u0338.
                particles.add("%s(?![%s])" % (short, ignore))
        
        for long_entities in list(long_entities_by_first_character.values()):
            for long_entity in long_entities:
                particles.add(long_entity)

        re_definition = "(%s)" % "|".join(particles)
                
        # If an entity shows up in both html5 and codepoint2name, it's
        # likely that HTML5 gives it several different names, such as
        # 'rsquo' and 'rsquor'. When converting Unicode characters to
        # named entities, the codepoint2name name should take
        # precedence where possible, since that's the more easily
        # recognizable one.
        for codepoint, name in list(codepoint2name.items()):
            character = chr(codepoint)
            unicode_to_name[character] = name

        return unicode_to_name, name_to_unicode, re.compile(re_definition)