def parse_iamcal_emoji_data_json(path):
    """Build short-name -> emoji maps from an iamcal emoji-data JSON file.

    Returns a pair of dicts (emoji_map, non_qualified), each mapping a
    short name to an entry dict with 'code', 'character', 'sequence' and
    'name' keys.
    """
    # Stamp the data source with a human readable date taken from the
    # file's modification time.
    mtime = time.gmtime(os.path.getmtime(path))
    emoji_version = time.strftime('%A %d %B %Y', mtime)
    print(emoji_version, mtime)
    source_info['emoji_version'] = emoji_version

    emoji_map = {}      # short name -> entry for fully qualified emoji
    non_qualified = {}  # short name -> entry for non-qualified emoji
    seen = {}           # short name -> character, detects conflicting aliases
    for item in json_load(path):
        hex_code = item['unified']
        unified = True
        if '-' in hex_code:
            # Multi-codepoint sequence: fall back to the non-qualified
            # form, and skip the record when that is missing or is
            # itself a multi-codepoint sequence.
            hex_code = item['non_qualified']
            if not hex_code or '-' in hex_code:
                continue
            unified = False
        codepoint = int(hex_code, 16)
        assert codepoint > 0x80, item
        character = chr(codepoint)
        display_code = 'U+' + hex_code
        name = item['name'].title().strip()
        if not name:
            name = UnicodeData.getCharacterName(character)
        short_name = item['short_name']
        short_names = item['short_names']
        assert short_name in short_names
        for alias in short_names:
            if alias in seen:
                # The same short name must always denote the same character.
                assert character == seen[alias], (alias, item)
            else:
                seen[alias] = character
                entry = {
                    'code': display_code,
                    'character': character,
                    'sequence': alias,
                    'name': name
                }
                if unified:
                    emoji_map[alias] = entry
                else:
                    non_qualified[alias] = entry
    return emoji_map, non_qualified
def parse(self, fileName): UnicodeDatas = [] with open(fileName, "r") as ifile: for line in ifile: line = line.rstrip() tokens = line.split(";") p = re.compile(r"<.*,.*>") if p.match(tokens[1]): line = next(ifile) endRange = int(line.split(";")[0], 16) for i in range(int(tokens[0], 16), endRange): tokens[0] = "{0:x}".format(i) UnicodeDatas.append(UnicodeData.UnicodeData(tokens)) else: UnicodeDatas.append(UnicodeData.UnicodeData(tokens)) return UnicodeDatas
def GenerateUnicodeControlCharacters():
    """Print UTF-8 bytes, codepoint, category and name for each Unicode
    control character used by kUnicodeControlCharacterTable in Edit.c."""
    # Order matters: it mirrors kUnicodeControlCharacterTable in Edit.c.
    control_characters = (
        "\u200E",  # U+200E LRM   Left-to-right mark
        "\u200F",  # U+200F RLM   Right-to-left mark
        "\u200D",  # U+200D ZWJ   Zero width joiner
        "\u200C",  # U+200C ZWNJ  Zero width non-joiner
        "\u202A",  # U+202A LRE   Start of left-to-right embedding
        "\u202B",  # U+202B RLE   Start of right-to-left embedding
        "\u202D",  # U+202D LRO   Start of left-to-right override
        "\u202E",  # U+202E RLO   Start of right-to-left override
        "\u202C",  # U+202C PDF   Pop directional formatting
        "\u206E",  # U+206E NADS  National digit shapes substitution
        "\u206F",  # U+206F NODS  Nominal (European) digit shapes
        "\u206B",  # U+206B ASS   Activate symmetric swapping
        "\u206A",  # U+206A ISS   Inhibit symmetric swapping
        "\u206D",  # U+206D AAFS  Activate Arabic form shaping
        "\u206C",  # U+206C IAFS  Inhibit Arabic form shaping
        "\u001E",  # U+001E RS    Record Separator (Block separator)
        "\u001F",  # U+001F US    Unit Separator (Segment separator)
        "\u2028",  # U+2028 LS    Line Separator
        "\u2029",  # U+2029 PS    Paragraph Separator
        "\u200B",  # U+200B ZWSP  Zero width space
        "\u2060",  # U+2060 WJ    Word joiner
        "\u2066",  # U+2066 LRI   Left-to-right isolate
        "\u2067",  # U+2067 RLI   Right-to-left isolate
        "\u2068",  # U+2068 FSI   First strong isolate
        "\u2069",  # U+2069 PDI   Pop directional isolate
        "\u061C",  # U+061C ALM   Arabic letter mark
    )
    print('UnicodeControlCharacters:')
    for ch in control_characters:
        escaped = ''.join('\\x%02x' % byte for byte in ch.encode('utf-8'))
        print(escaped, 'U+%04X' % ord(ch), unicodedata.category(ch),
              UnicodeData.getCharacterName(ch))
# SUCH DAMAGE. import re import string import sys import generate import UnicodeData import util if len(sys.argv) != 4: print "usage: %s UnicodeData.txt" " CompositionExclusions-3.2.0.txt out-dir" % sys.argv[0] sys.exit(1) ud = UnicodeData.read(sys.argv[1]) def sortedKeys(d): """Return a sorted list of the keys of a dict""" keys = d.keys() keys.sort() return keys trans = dict([(k, [re.sub('<[a-zA-Z]+>', '', v[4]), v[0]]) for k,v in ud.items() if v[4]]) maxLength = 0 for v in trans.values(): maxLength = max(maxLength, len(v[0].split())) normalize_h = generate.Header('%s/normalize_table.h' % sys.argv[3])
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. import re import string import sys import generate import UnicodeData if len(sys.argv) != 3: print "usage: %s UnicodeData.txt out-dir" % sys.argv[0] sys.exit(1) ud = UnicodeData.read(sys.argv[1]) trans = {} for k,v in ud.items(): if int(v[2]) != 0 : trans[k] = [int(v[2]), v[1]] # trans = [(x[0], int(x[3]), x[1]) for x in UnicodeData.read() if int(x[3]) != 0] combining_h = generate.Header('%s/combining_table.h' % sys.argv[2]) combining_c = generate.Implementation('%s/combining_table.c' % sys.argv[2]) combining_h.file.write( ''' #include <krb5-types.h>
# SUCH DAMAGE. import re import string import sys import generate import UnicodeData import util if len(sys.argv) != 4: print "usage: %s UnicodeData.txt" " CompositionExclusions-3.2.0.txt out-dir" % sys.argv[0] sys.exit(1) ud = UnicodeData.read(sys.argv[1]) def sortedKeys(d): """Return a sorted list of the keys of a dict""" keys = d.keys() keys.sort() return keys trans = dict([(k, [re.sub('<[a-zA-Z]+>', '', v[4]), v[0]]) for k, v in ud.items() if v[4]]) maxLength = 0 for v in trans.values(): maxLength = max(maxLength, len(v[0].split()))
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. import re import string import sys import generate import UnicodeData if len(sys.argv) != 3: print "usage: %s UnicodeData.txt out-dir" % sys.argv[0] sys.exit(1) ud = UnicodeData.read(sys.argv[1]) trans = {} for k, v in ud.items(): if int(v[2]) != 0: trans[k] = [int(v[2]), v[1]] # trans = [(x[0], int(x[3]), x[1]) for x in UnicodeData.read() if int(x[3]) != 0] combining_h = generate.Header('%s/combining_table.h' % sys.argv[2]) combining_c = generate.Implementation('%s/combining_table.c' % sys.argv[2]) combining_h.file.write(''' #include <krb5-types.h> struct translation {
#!/usr/bin/python # -*- coding: utf-8 -*- import sys import UnicodeData import struct if len(sys.argv) != 2: print >> sys.stderr, "Usage: %s <unicode_data_dir>" % sys.argv[0] print >> sys.stderr, "Takes UnicodeData.txt, PropList.txt ... and turns into optimized table files" sys.exit(99) unicode_data_dir = sys.argv[1] UnicodeData.read(unicode_data_dir) last_codepoint = max(UnicodeData.data.keys()) print "Last codepoint: %d" % last_codepoint ## Generate codepoint->script mapping script_name_to_code_mapping = { "Adlam": 1, "Ahom": 2, "Anatolian_Hieroglyphs": 3, "Arabic": 4, "Armenian": 5, "Avestan": 6, "Balinese": 7, "Bamum": 8, "Bassa_Vah": 9, "Batak": 10, "Bengali": 11,