Example #1
def parse_iamcal_emoji_data_json(path):
	# Derive a human-readable version stamp from the data file's modification time.
	modification = time.gmtime(os.path.getmtime(path))
	emoji_version = time.strftime('%A %d %B %Y', modification)
	print(emoji_version, modification)
	source_info['emoji_version'] = emoji_version

	input_list = json_load(path)
	emoji_map = {}
	non_qualified = {}
	input_map = {}
	for info in input_list:
		# Take the fully qualified code point when it is a single code point;
		# otherwise fall back to the non-qualified form, and skip entries that
		# are still multi-code-point sequences.
		code = info['unified']
		unified = True
		if '-' in code:
			code = info['non_qualified']
			if not code or '-' in code:
				continue
			unified = False

		ch = int(code, 16)
		assert ch > 0x80, info
		character = chr(ch)
		code = 'U+' + code
		name = info['name'].title().strip()
		if not name:
			name = UnicodeData.getCharacterName(character)
		short_name = info['short_name']
		short_names = info['short_names']
		assert short_name in short_names
		for sequence in short_names:
			if sequence in input_map:
				assert character == input_map[sequence], (sequence, info)
			else:
				input_map[sequence] = character
			if unified:
				emoji_map[sequence] = {
					'code': code,
					'character': character,
					'sequence': sequence,
					'name': name
				}
			else:
				non_qualified[sequence] = {
					'code': code,
					'character': character,
					'sequence': sequence,
					'name': name
				}

	return emoji_map, non_qualified
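
A hypothetical driver for the function above, assuming an emoji.json file in the iamcal/emoji-data format; json_load and source_info here are stand-ins for the module-level helpers the snippet relies on, and the project's UnicodeData helper is still needed if an entry arrives with an empty name.

import json
import os
import time

source_info = {}  # stand-in for the module-level dict the function populates

def json_load(path):
	# Plain JSON loader; the project's own helper may do more.
	with open(path, encoding='utf-8') as f:
		return json.load(f)

emoji_map, non_qualified = parse_iamcal_emoji_data_json('emoji.json')
print(len(emoji_map), 'fully qualified and', len(non_qualified), 'non-qualified short names')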
Example #2
    def parse(self, fileName):
        UnicodeDatas = []
        range_start = re.compile(r"<.*,.*>")

        with open(fileName, "r") as ifile:
            for line in ifile:
                line = line.rstrip()
                tokens = line.split(";")

                if range_start.match(tokens[1]):
                    # A "<..., First>" entry: the next line holds the matching
                    # "<..., Last>" entry that closes the code point range.
                    line = next(ifile)
                    endRange = int(line.split(";")[0], 16)

                    # Expand the range into one entry per code point.
                    # Note: range() as written stops one short of endRange.
                    for i in range(int(tokens[0], 16), endRange):
                        tokens[0] = "{0:x}".format(i)
                        UnicodeDatas.append(UnicodeData.UnicodeData(tokens))
                else:
                    UnicodeDatas.append(UnicodeData.UnicodeData(tokens))

        return UnicodeDatas
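
The regular expression check above handles the range convention in UnicodeData.txt, where large blocks appear as a pair of "<..., First>" and "<..., Last>" lines rather than one line per code point. A small stand-alone sketch of that pair format and the expansion arithmetic; the field layout mirrors UnicodeData.txt, and the exact end point of the block depends on the Unicode version:

# Illustrative "<..., First>" / "<..., Last>" pair in UnicodeData.txt field layout.
sample = [
    "3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;",
    "4DBF;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;",
]
first = int(sample[0].split(";")[0], 16)
last = int(sample[1].split(";")[0], 16)
print("range covers U+%04X..U+%04X, %d code points" % (first, last, last - first + 1))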
Example #3
def GenerateUnicodeControlCharacters():
    # for kUnicodeControlCharacterTable in Edit.c
    ucc_table = [
        "\u200E",  # U+200E	LRM		Left-to-right mark
        "\u200F",  # U+200F	RLM		Right-to-left mark
        "\u200D",  # U+200D	ZWJ		Zero width joiner
        "\u200C",  # U+200C	ZWNJ	Zero width non-joiner
        "\u202A",  # U+202A	LRE		Start of left-to-right embedding
        "\u202B",  # U+202B	RLE		Start of right-to-left embedding
        "\u202D",  # U+202D	LRO		Start of left-to-right override
        "\u202E",  # U+202E	RLO		Start of right-to-left override
        "\u202C",  # U+202C	PDF		Pop directional formatting
        "\u206E",  # U+206E	NADS	National digit shapes substitution
        "\u206F",  # U+206F	NODS	Nominal (European) digit shapes
        "\u206B",  # U+206B	ASS		Activate symmetric swapping
        "\u206A",  # U+206A	ISS		Inhibit symmetric swapping
        "\u206D",  # U+206D	AAFS	Activate Arabic form shaping
        "\u206C",  # U+206C	IAFS	Inhibit Arabic form shaping
        "\u001E",  # U+001E	RS		Record Separator (Block separator)
        "\u001F",  # U+001F	US		Unit Separator (Segment separator)
        "\u2028",  # U+2028	LS		Line Separator
        "\u2029",  # U+2029	PS		Paragraph Separator
        "\u200B",  # U+200B	ZWSP	Zero width space
        "\u2060",  # U+2060	WJ		Word joiner
        "\u2066",  # U+2066	LRI		Left-to-right isolate
        "\u2067",  # U+2067	RLI		Right-to-left isolate
        "\u2068",  # U+2068	FSI		First strong isolate
        "\u2069",  # U+2069	PDI		Pop directional isolate
        "\u061C",  # U+061C	ALM		Arabic letter mark
    ]

    print('UnicodeControlCharacters:')
    for ucc in ucc_table:
        utf8bytes = ucc.encode('utf-8')
        utf8str = ''.join(f'\\x{b:02x}' for b in utf8bytes)
        print(utf8str, f'U+{ord(ucc):04X}', unicodedata.category(ucc),
              UnicodeData.getCharacterName(ucc))
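
A minimal sketch of running the generator above outside its project: unicodedata comes from the standard library, and the UnicodeData class below is only a stand-in for the project's own helper module, using unicodedata.name() for the lookup.

import unicodedata

class UnicodeData:
    # Stand-in for the project's UnicodeData helper; falls back to the
    # code point notation when the character has no name in the UCD.
    @staticmethod
    def getCharacterName(ch):
        return unicodedata.name(ch, 'U+%04X' % ord(ch))

GenerateUnicodeControlCharacters()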
Example #4
# SUCH DAMAGE. 

import re
import string
import sys

import generate
import UnicodeData
import util

if len(sys.argv) != 4:
    print ("usage: %s UnicodeData.txt"
           " CompositionExclusions-3.2.0.txt out-dir" % sys.argv[0])
    sys.exit(1)

ud = UnicodeData.read(sys.argv[1])

def sortedKeys(d):
    """Return a sorted list of the keys of a dict"""
    keys = d.keys()
    keys.sort()
    return keys

trans = dict([(k, [re.sub('<[a-zA-Z]+>', '', v[4]), v[0]])
              for k,v in ud.items() if v[4]])

maxLength = 0
for v in trans.values():
    maxLength = max(maxLength, len(v[0].split()))

normalize_h = generate.Header('%s/normalize_table.h' % sys.argv[3])
Example #5
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 
# SUCH DAMAGE. 

import re
import string
import sys

import generate
import UnicodeData

if len(sys.argv) != 3:
    print "usage: %s UnicodeData.txt out-dir" % sys.argv[0]
    sys.exit(1)

ud = UnicodeData.read(sys.argv[1])

trans = {}
for k,v in ud.items():
    if int(v[2]) != 0 :
        trans[k] = [int(v[2]), v[1]]

# trans = [(x[0], int(x[3]), x[1]) for x in UnicodeData.read() if int(x[3]) != 0]

combining_h = generate.Header('%s/combining_table.h' % sys.argv[2])
combining_c = generate.Implementation('%s/combining_table.c' % sys.argv[2])

combining_h.file.write(
'''
#include <krb5-types.h>
Example #6
# SUCH DAMAGE.

import re
import string
import sys

import generate
import UnicodeData
import util

if len(sys.argv) != 4:
    print ("usage: %s UnicodeData.txt"
           " CompositionExclusions-3.2.0.txt out-dir" % sys.argv[0])
    sys.exit(1)

ud = UnicodeData.read(sys.argv[1])


def sortedKeys(d):
    """Return a sorted list of the keys of a dict"""
    keys = d.keys()
    keys.sort()
    return keys


trans = dict([(k, [re.sub('<[a-zA-Z]+>', '', v[4]), v[0]])
              for k, v in ud.items() if v[4]])

maxLength = 0
for v in trans.values():
    maxLength = max(maxLength, len(v[0].split()))
Example #7
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.

import re
import string
import sys

import generate
import UnicodeData

if len(sys.argv) != 3:
    print "usage: %s UnicodeData.txt out-dir" % sys.argv[0]
    sys.exit(1)

ud = UnicodeData.read(sys.argv[1])

trans = {}
for k, v in ud.items():
    if int(v[2]) != 0:
        trans[k] = [int(v[2]), v[1]]

# trans = [(x[0], int(x[3]), x[1]) for x in UnicodeData.read() if int(x[3]) != 0]

combining_h = generate.Header('%s/combining_table.h' % sys.argv[2])
combining_c = generate.Implementation('%s/combining_table.c' % sys.argv[2])

combining_h.file.write('''
#include <krb5-types.h>

struct translation {
Example #8
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import UnicodeData
import struct

if len(sys.argv) != 2:
    print >> sys.stderr, "Usage: %s <unicode_data_dir>" % sys.argv[0]
    print >> sys.stderr, "Takes UnicodeData.txt, PropList.txt ... and turns into optimized table files"
    sys.exit(99)

unicode_data_dir = sys.argv[1]
UnicodeData.read(unicode_data_dir)

last_codepoint = max(UnicodeData.data.keys())

print "Last codepoint: %d" % last_codepoint

## Generate codepoint->script mapping
script_name_to_code_mapping = {
    "Adlam": 1,
    "Ahom": 2,
    "Anatolian_Hieroglyphs": 3,
    "Arabic": 4,
    "Armenian": 5,
    "Avestan": 6,
    "Balinese": 7,
    "Bamum": 8,
    "Bassa_Vah": 9,
    "Batak": 10,
    "Bengali": 11,