Пример #1
0
def parse_ucd():
    """Fill the lookup tables defined outside this function from Unicode data.

    Two sources are read:

    * ``html5``: every entity whose value is a single character is indexed in
      ``word_search_map`` under the entity name (trailing ``;`` removed).
    * ``ucd/UnicodeData.txt`` (obtained through ``get_data``): each
      ``;``-separated record supplies a hexadecimal code point (field 0), a
      name (field 1) and a general category (field 2).  Names are stored in
      ``name_map`` and every word of a name is indexed in ``word_search_map``.
      Code points are added to the per-category set in ``class_maps``, removed
      from ``not_assigned`` and, for categories starting with ``M``, added to
      ``marks``.

    A record whose name ends with ``, First>`` opens a range; the following
    record closes it and every code point in between (inclusive) is treated
    as belonging to the closing record's category.
    """
    def add_word(word, cp):
        # Index only code points above the space character that are neither
        # DEL (127) nor in the 128-159 block.
        if 32 < cp < 127 or cp > 159:
            word_search_map[word.lower()].add(cp)

    for entity, text in html5.items():
        if len(text) == 1:
            add_word(entity.rstrip(';'), ord(text))
    word_search_map['nnbsp'].add(0x202f)

    range_start = None
    for line in get_data('ucd/UnicodeData.txt'):
        fields = [field.strip() for field in line.split(';')]
        codepoint = int(fields[0], 16)
        name = fields[1]
        if name:
            name_map[codepoint] = name
            for token in name.lower().split():
                add_word(token, codepoint)
        category = fields[2]
        members = class_maps.setdefault(category, set())
        if range_start is None:
            if name.endswith(', First>'):
                # Remember where the range begins; the next record ends it.
                range_start = codepoint
                continue
            covered = (codepoint, )
        else:
            covered = range(range_start, codepoint + 1)
            range_start = None
        is_mark = category.startswith('M')
        for cp in covered:
            members.add(cp)
            not_assigned.discard(cp)
            if is_mark:
                marks.add(cp)
Пример #2
0
def parse_ucd() -> None:
    """Fill the lookup tables defined outside this function.

    Sources, in the order they are processed:

    * ``html5``: single-character entities are indexed in ``word_search_map``
      under the entity name with trailing ``;`` removed.
    * ``ucd/UnicodeData.txt`` (via ``get_data``): field 0 is the hexadecimal
      code point, field 1 the name, field 2 the category.  When field 1 is
      empty or ``<control>``, field 10 is used as the name instead.  Names go
      into ``name_map``; their words into ``word_search_map``.  Code points
      are added to the category's set in ``class_maps``, dropped from
      ``not_assigned`` and added to ``marks`` (category ``M*``) or
      ``all_symbols`` (category ``S*``).  A record whose field 1 ends with
      ``, First>`` opens a range that the next record closes (inclusive).
    * ``nerd-fonts-glyphs.txt``: non-blank, non-``#`` lines of the form
      ``<hex code> <category> <name>``; names are added (upper-cased) only for
      code points that do not have a name yet.

    Finally a few synonym entries in ``word_search_map`` are merged.
    """
    def add_word(w: str, c: int) -> None:
        # Skip code points up to and including space, DEL and 128-159, and
        # do not index single-character words.
        excluded = c <= 32 or c == 127 or 128 <= c <= 159
        if not excluded and len(w) > 1:
            word_search_map[w.lower()].add(c)

    for entity, text in html5.items():
        if len(text) == 1:
            add_word(entity.rstrip(';'), ord(text))
    word_search_map['nnbsp'].add(0x202f)

    range_start: Optional[int] = None
    for record in get_data('ucd/UnicodeData.txt'):
        fields = [field.strip() for field in record.split(';')]
        codepoint = int(fields[0], 16)
        desc = fields[1]
        # Fall back to field 10 when the primary name is missing or is the
        # generic '<control>' placeholder.
        name = fields[10] if desc in ('', '<control>') else desc
        if name:
            name_map[codepoint] = name
            for token in name.lower().split():
                add_word(token, codepoint)
        category = fields[2]
        members = class_maps.setdefault(category, set())
        covered: Union[Tuple[int, ...], Iterable[int]]
        if range_start is None:
            if desc.endswith(', First>'):
                # Start of a range; the next record supplies its end.
                range_start = codepoint
                continue
            covered = (codepoint, )
        else:
            covered = range(range_start, codepoint + 1)
            range_start = None
        is_mark = category.startswith('M')
        is_symbol = category.startswith('S')
        for cp in covered:
            members.add(cp)
            not_assigned.discard(cp)
            if is_mark:
                marks.add(cp)
            elif is_symbol:
                all_symbols.add(cp)

    with open('nerd-fonts-glyphs.txt') as glyph_file:
        for raw in glyph_file:
            entry = raw.strip()
            if not entry or entry.startswith('#'):
                continue
            code, _glyph_category, glyph_name = entry.split(' ', 2)
            glyph_cp = int(code, 16)
            # Names already taken from UnicodeData.txt win over glyph names.
            if glyph_name and glyph_cp not in name_map:
                name_map[glyph_cp] = glyph_name.upper()
                for token in glyph_name.lower().split():
                    add_word(token, glyph_cp)

    # Some common synonyms: merge the source word's code points into the
    # target word's set.  Order matters for the lambda/lamda pair.
    synonyms = (
        ('bee', 'honeybee'),
        ('lambda', 'lamda'),
        ('lamda', 'lambda'),
        ('diamond', 'gem'),
    )
    for target, source in synonyms:
        word_search_map[target] |= word_search_map[source]
Пример #3
0
def encode(string):
    """Return ``string`` with characters replaced by HTML5 named entities.

    ``&`` becomes ``&amp;`` and ``;`` becomes ``&semi;``.  After that, for
    every entry of ``html5`` whose name ends with ``;``, starts with a
    lowercase letter and does not contain ``amp`` or ``semi``, each
    occurrence of the entry's value is replaced by ``&`` followed by the
    entry's name.

    Args:
        string: The text to encode.

    Returns:
        The encoded text.
    """
    # '&' and ';' delimit every entity reference, so they have to be escaped
    # in one pass.  Escaping them one after the other would re-escape the ';'
    # that terminates the freshly inserted '&amp;' (giving '&amp&semi;').
    string = string.translate({ord('&'): '&amp;', ord(';'): '&semi;'})
    for k, v in html5.items():
        if (k.endswith(';') and k[0].islower()
                and 'amp' not in k and 'semi' not in k):
            string = string.replace(v, '&' + k)
    return string
Пример #4
0
from hashlib import md5
import pymysql
import re
from typing import Union,Sequence
import os
from html.entities import html5 as htmlentities
from fastapi.exceptions import HTTPException
import sys

# Invert the HTML5 entity table into a character -> entity-name lookup.
# Only single characters whose code point fits in four hex digits
# (<= U+FFFF) are kept; when several names share a character, the name
# listed last in the table wins.
htmlentities = {
    char: name
    for name, char in htmlentities.items()
    if len(char) == 1 and ord(char) <= 0xFFFF
}
# All remaining characters concatenated and escaped for use inside a regex.
pat = re.escape("".join(htmlentities))

# Load the settings from config.py.  When that module cannot be imported
# (typically the first run), ask for the values interactively and write them
# to config.py next to the entry script.
try:
    from config import *
except ImportError:
    # Only a missing/unimportable config triggers the setup; any other error
    # (e.g. a syntax error inside an existing config.py) is left to surface.
    print("No config found! Initlizing...")
    db_host,db_user,db_passwd,db_db=[input(f"Setup MySQL {each}:") for each in ("Host","User","Password","DataBase")]
    host_ip=input("Setup Web Host IP:")
    # Stored as an int so the value has the same type on this run as it will
    # have when read back from config.py; non-numeric input raises ValueError
    # instead of producing an unloadable config file.
    host_port=int(input("Setup Web Host Port:"))
    with open(sys.path[0]+"/config.py","w",encoding="utf-8")as f:
        # !r writes each value as a valid Python literal, so quotes or
        # backslashes in a value cannot corrupt the generated module.
        f.write(f'db_host={db_host!r}\ndb_user={db_user!r}\ndb_passwd={db_passwd!r}\ndb_db={db_db!r}\nhost_ip={host_ip!r}\nhost_port={host_port}')


# Hard-coded user record; the password is stored as the MD5 hex digest of
# b"123456".  NOTE(review): presumably a development placeholder — confirm
# before relying on it.
fake_db = {"*****@*****.**": {"nickname":"admin","password":md5(b"123456").hexdigest(),"favor":None,"avator":""}}
Пример #5
0
    '&dgr;': 'δ',
    '&ggr;': 'γ',
    '&Sgr;': 'Σ',
    '&rgr;': 'ρ',
    '&zgr;': 'ζ',
    '&Circlesolid;': '◯',
    '&bsol;': ' '
}

# For every custom entity, a textual dump of the UTF-8 bytes of its
# character: the repr of the bytes object with the leading ``b`` prefix,
# backslashes and single quotes removed (e.g. b'\xce\xb4' -> 'xcexb4').
# Only the FIRST 'b' is dropped: removing every 'b' would also delete the
# hex digit 'b' inside escapes such as '\xb4' and any literal letter 'b'.
utf_encodings = {
    entity: repr(char.encode('UTF-8')).replace('b', '', 1)
                                      .replace('\\', '')
                                      .replace('\'', '')
    for entity, char in entity_characters_mapping.items()
}
# The standard HTML5 table, with '&' prefixed to each name so the keys have
# the same '&name' shape as the custom entities.
html5_entity_mappings = {f'&{key}': value for key, value in html5.items()}
# Custom entries take precedence over the HTML5 ones on key collisions.
utf_encodings = {**html5_entity_mappings, **utf_encodings}


def map_entity_characters(string, mapping_dictionary):
    """Replace ``&name;`` entity references in ``string``.

    Every distinct substring matching ``&\\w+;`` is replaced, everywhere it
    occurs, by ``mapping_dictionary[entity]``.  Entities missing from the
    mapping are replaced by their name wrapped in double quotes
    (``&foo;`` -> ``"foo"``).

    Args:
        string: Text that may contain entity references.
        mapping_dictionary: Maps a full entity reference (including ``&`` and
            ``;``) to its replacement text.

    Returns:
        The text with all matched entity references substituted.
    """
    mapped_string = string
    for entity in set(re.findall(r"&\w+;", string)):
        try:
            replacement = mapping_dictionary[entity]
        except KeyError:
            # Unknown entity: keep its name visible, quoted.
            replacement = entity.replace('&', '"').replace(';', '"')
        # Plain str.replace inserts the replacement literally.  re.sub would
        # interpret it as a template, so a value such as a lone backslash
        # raised an error and values like '\n' or '\1' were rewritten.
        mapped_string = mapped_string.replace(entity, replacement)
    return mapped_string
Пример #6
0
    # Some of the named characters appear both with and without a trailing semicolon;
    # see https://docs.python.org/3.9/library/html.entities.html#html.entities.html5
    if not named_character.endswith(";"):
        return False

    if len(unicode_character) != 1:
        return False
    # The character should be escaped (and therefore included in the dict) if it is not ASCII.
    # Might have to tweak the condition below, e.g. by checking if the character is part of `string.printable` instead.
    should_escape = ord(unicode_character) >= 128
    return should_escape


# Lookup from a Unicode character to its "&name;" reference, built from the
# html5 entries accepted by _should_include_escape_entry.  When several
# accepted names share a character, the one listed last in html5 wins.
ESCAPE_UNICODE_TO_HTML5 = dict(
    (char, f"&{entity_name}")
    for entity_name, char in html5.items()
    if _should_include_escape_entry(char, entity_name)
)


def escape_to_named_characters(string: str):
    """Return ``string`` with each character that has an entry in
    ``ESCAPE_UNICODE_TO_HTML5`` replaced by that entry.

    Characters without an entry are copied unchanged.
    """
    pieces = []
    for char in string:
        pieces.append(ESCAPE_UNICODE_TO_HTML5.get(char, char))
    return "".join(pieces)


def block_join(object_collection, sep="<b>&bull;</b>", multiline=True):
    if len(object_collection) == 0:
        return ""

    tag = '<div style="display: inline-block; white-space: nowrap;">'
    if multiline:
        return format_html_join(
Пример #7
0
    def _populate_class_variables():
        """Build the tables and the pattern used to deal with HTML5 named
        entities.

        Returns a 3-tuple ``(unicode_to_name, name_to_unicode,
        named_entity_re)``:

        unicode_to_name: maps a Unicode string such as "⦨" to an entity
        name such as "angmsdaa". When a string has several names, the
        name from codepoint2name is used if there is one; otherwise
        the name that sorts last in html5 is used.

        name_to_unicode: maps an entity name such as "angmsdaa"
        (without trailing semicolon) to a Unicode string such as "⦨".

        named_entity_re: a compiled regular expression with one
        capturing group that matches (almost) any Unicode string
        corresponding to an HTML5 named entity.
        """
        name_to_unicode = {}
        unicode_to_name = {}

        # Raw material for the regular expression, gathered while
        # walking the entity table: single characters, and longer
        # strings grouped by the character they start with.
        single_chars = set()
        multi_by_lead = defaultdict(set)

        for listed_name, text in sorted(html5.items()):
            # The HTML spec deliberately gives many code points several
            # reference names, e.g. with and without the trailing
            # semicolon or with different capitalization
            # (https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references).
            # Handling a missing semicolon is the parser's job, so the
            # tables are keyed by the bare name.
            if listed_name.endswith(';'):
                name = listed_name[:-1]
            else:
                name = listed_name

            # Parsing direction (name -> text): the first spelling
            # seen keeps its value.
            name_to_unicode.setdefault(name, text)

            # Generating direction (text -> name): later names
            # overwrite earlier ones.
            unicode_to_name[text] = name

            # Decide whether this text should be findable by the
            # regular expression used when generating output.
            if len(text) == 1:
                # A lone ASCII character such as | should not be
                # rewritten as &verbar;. Only <, > and & have to be
                # turned into entities to produce valid HTML.
                if ord(text) < 128 and text not in '<>&':
                    continue
                single_chars.add(text)
            elif len(text) > 1 and all(ord(ch) < 128 for ch in text):
                # Pure-ASCII sequences such as 'fj' (&fjlig;) are left
                # alone as well, though that's more debatable.
                continue
            else:
                # Some texts are a prefix of another entity's text:
                # '\u2267' is &GreaterFullEqual; while '\u2267\u0338'
                # is &NotGreaterFullEqual;. The pattern can only be
                # assembled once every entity has been seen, so for
                # now just remember the longer text under its first
                # character.
                multi_by_lead[text[0]].add(text)

        # Every entity has been seen; assemble the alternatives.
        alternatives = set()
        for char in single_chars:
            longer = multi_by_lead[char]
            if longer:
                # Match the character only when it is NOT followed by
                # a character that would complete a longer entity,
                # e.g. \u2267 not followed by \u0338.
                followers = "".join(seq[1] for seq in longer)
                alternatives.add("%s(?![%s])" % (char, followers))
            else:
                alternatives.add(char)

        for group in multi_by_lead.values():
            alternatives.update(group)

        pattern = "(%s)" % "|".join(alternatives)

        # A character present in both html5 and codepoint2name probably
        # has several HTML5 names (such as 'rsquo' and 'rsquor'). The
        # codepoint2name name is the more recognizable one, so it
        # takes precedence when converting characters to entities.
        for codepoint, name in codepoint2name.items():
            unicode_to_name[chr(codepoint)] = name

        return unicode_to_name, name_to_unicode, re.compile(pattern)