Python parse_ucd_data 예제들, ucd.parse_ucd_data Python 예제들

예제 #1

0

파일 보기

파일: categories.py 프로젝트: marcismajors/espeak-ng

# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.

import os
import sys
import ucd

# Command-line inputs: argv[1] is the directory passed to ucd.parse_ucd_data;
# argv[2] is stored as ucd_version (not used in the visible part of the script).
ucd_rootdir = sys.argv[1]
ucd_version = sys.argv[2]

# Maps each code point to the 'GeneralCategory' value of its UnicodeData record.
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
    # A record's 'CodePoint' field is iterated; every code point it yields
    # receives the record's category.
    for codepoint in data['CodePoint']:
        unicode_chars[codepoint] = data['GeneralCategory']
# Optional: when --with-csur is on the command line, the 'Klingon' data set
# under data/csur is loaded too (presumably ConScript Unicode Registry data --
# confirm). Entries for an already-seen code point overwrite the earlier value.
if '--with-csur' in sys.argv:
    for csur in ['Klingon']:
        for data in ucd.parse_ucd_data('data/csur', csur):
            for codepoint in data['CodePoint']:
                unicode_chars[codepoint] = data['GeneralCategory']

# This map is a combination of the information in the UnicodeData and Blocks
# data files. It is intended to reduce the number of character tables that
# need to be generated.
category_sets = [
    (ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('00D800..00DFFF'), 'Cs', 'Surrogates'),
    (ucd.CodeRange('00E000..00F7FF'), 'Co', 'Private Use Area'),

예제 #2

0

파일 보기

파일: scripts.py 프로젝트: davidweenink/espeak-ng

# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.

import os
import sys
import ucd

# Command-line inputs: argv[1] is the directory passed to ucd.parse_ucd_data;
# argv[2] is stored as ucd_version (not used in the visible part of the script).
ucd_rootdir = sys.argv[1]
ucd_version = sys.argv[2]

# Maps each code point to the 'Script' value of the record that covers it.
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
	# 'Scripts' records expose their code points through the 'Range' field.
	for codepoint in data['Range']:
		unicode_chars[codepoint] = data['Script']
# Optional: when --with-csur is on the command line, the 'Klingon' data set
# under data/csur is loaded too. Those records are read through 'CodePoint'
# rather than 'Range', and they overwrite entries for already-seen code points.
if '--with-csur' in sys.argv:
	for csur in ['Klingon']:
		for data in ucd.parse_ucd_data('data/csur', csur):
			for codepoint in data['CodePoint']:
				unicode_chars[codepoint] = data['Script']

# This map is a combination of the information in the UnicodeData and Blocks
# data files. It is intended to reduce the number of character tables that
# need to be generated.
script_sets = [
	(ucd.CodeRange('000000..00D7FF'), None,   'Multiple Blocks'),
	(ucd.CodeRange('00D800..00F7FF'), 'Zzzz', 'Surrogates / Private Use Area'),
	(ucd.CodeRange('00F800..02FAFF'), None,   'Multiple Blocks'),

예제 #3

0

파일 보기

import os
import sys
import ucd

# argv[1] is the directory passed to ucd.parse_ucd_data for the main UCD files;
# the emoji and CSUR directories are fixed relative paths.
ucd_rootdir = sys.argv[1]
emoji_rootdir = 'data/emoji'
csur_rootdir = 'data/csur'

# Code point object built from '0000'; not referenced in the visible portion.
null = ucd.CodePoint('0000')

# (directory, file name) pairs of the property files merged into the table.
properties = [(ucd_rootdir, 'PropList'),
              (ucd_rootdir, 'DerivedCoreProperties'),
              (emoji_rootdir, 'emoji-data'), ('data/espeak-ng', 'PropList')]

# Maps each code point to its record; property flags and the script name are
# added to that record by the loops below.
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
    for codepoint in data['CodePoint']:
        # NOTE(review): the record object itself is stored (no copy), so all
        # code points yielded by one record share it -- confirm this is intended.
        unicode_chars[codepoint] = data
for propdir, propfile in properties:
    for data in ucd.parse_ucd_data(propdir, propfile):
        for codepoint in data['Range']:
            try:
                # Flag the property on the existing record.
                unicode_chars[codepoint][data['Property']] = 1
            except KeyError:
                # Code point had no UnicodeData record: create a minimal one
                # holding only the code point, then flag the property.
                unicode_chars[codepoint] = {'CodePoint': codepoint}
                unicode_chars[codepoint][data['Property']] = 1
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
    for codepoint in data['Range']:
        # Unlike the property loop there is no KeyError fallback here, so the
        # code point must already have an entry in unicode_chars.
        unicode_chars[codepoint]['Script'] = data['Script']
if '--with-csur' in sys.argv:
    for csur in ['Klingon']:

예제 #4

0

파일 보기

파일: printdata.py 프로젝트: CMB/espeak-ng

# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.

import os
import sys
import ucd

# argv[1] is the directory passed to ucd.parse_ucd_data for the UCD files.
ucd_rootdir = sys.argv[1]
# Path constant for the CSUR data directory.
csur_rootdir = 'data/csur'

# Maps each code point to its UnicodeData record, extended with a
# 'Properties' list and a 'Script' entry by the loops below.
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
	for codepoint in data['CodePoint']:
		# NOTE(review): the record object itself is stored (no copy), so code
		# points yielded by one record share it and its 'Properties' list.
		unicode_chars[codepoint] = data
		unicode_chars[codepoint]['Properties'] = []
for data in ucd.parse_ucd_data(ucd_rootdir, 'PropList'):
	# Only the 'White_Space' property is recorded; other PropList rows are skipped.
	if data['Property'] in ['White_Space']:
		for codepoint in data['Range']:
			unicode_chars[codepoint]['Properties'].append(data['Property'])
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
	for codepoint in data['Range']:
		# Requires an existing entry: there is no fallback for code points
		# missing from unicode_chars.
		unicode_chars[codepoint]['Script'] = data['Script']
# Optional: when --with-csur is on the command line, walk the 'Klingon' data
# set from the CSUR directory as well.
if '--with-csur' in sys.argv:
	for csur in ['Klingon']:
		# Use the csur_rootdir constant defined earlier in this script rather
		# than repeating the 'data/csur' literal.
		for data in ucd.parse_ucd_data(csur_rootdir, csur):
			for codepoint in data['CodePoint']:
				# Records without a 'TitleCase' entry default to the code point itself.
				if 'TitleCase' not in data:
					data['TitleCase'] = codepoint

예제 #5

0

파일 보기

파일: printdata.py 프로젝트: pettarin/espeak-ng

import sys
import ucd

# argv[1] is the directory passed to ucd.parse_ucd_data for the UCD files.
ucd_rootdir = sys.argv[1]
# Path constant for the CSUR data directory.
csur_rootdir = 'data/csur'

# Code point object built from '0000'; not referenced in the visible portion.
null = ucd.CodePoint('0000')

# (directory, file name) pairs of the property files merged into the table.
properties = [
    (ucd_rootdir, 'PropList'),
    (ucd_rootdir, 'DerivedCoreProperties'),
    ('data/espeak-ng', 'PropList')
]

# Maps each code point to its record; property flags and the script name are
# added to that record by the loops below.
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
	for codepoint in data['CodePoint']:
		# NOTE(review): the record object itself is stored (no copy), so all
		# code points yielded by one record share it -- confirm this is intended.
		unicode_chars[codepoint] = data
for propdir, propfile in properties:
	for data in ucd.parse_ucd_data(propdir, propfile):
		for codepoint in data['Range']:
			try:
				# Flag the property on the existing record.
				unicode_chars[codepoint][data['Property']] = 1
			except KeyError:
				# Code point had no UnicodeData record: create a minimal one
				# holding only the code point, then flag the property.
				unicode_chars[codepoint] = {'CodePoint': codepoint}
				unicode_chars[codepoint][data['Property']] = 1
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
	for codepoint in data['Range']:
		# Unlike the property loop there is no KeyError fallback here, so the
		# code point must already have an entry in unicode_chars.
		unicode_chars[codepoint]['Script'] = data['Script']
if '--with-csur' in sys.argv:
	for csur in ['Klingon']:

예제 #6

0

파일 보기

파일: case.py 프로젝트: CMB/espeak-ng

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.

import os
import sys
import ucd

# Command-line inputs: argv[1] is the directory passed to ucd.parse_ucd_data;
# argv[2] is stored as ucd_version.
ucd_rootdir = sys.argv[1]
ucd_version = sys.argv[2]

# Reference code point ('0000') that case fields are compared against.
null = ucd.CodePoint('0000')
# Maps a record's 'CodePoint' value to its (lower, upper, title) case triple;
# records whose three case fields all equal `null` are left out.
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
	case_mappings = (data['LowerCase'], data['UpperCase'], data['TitleCase'])
	if any(mapping != null for mapping in case_mappings):
		unicode_chars[data['CodePoint']] = case_mappings

if __name__ == '__main__':
	sys.stdout.write("""/* Unicode Case Conversion
 *
 * Copyright (C) 2012-2016 Reece H. Dunn
 *
 * This file is part of ucd-tools.
 *
 * ucd-tools is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *

예제 #7

0

파일 보기

파일: categories.py 프로젝트: CMB/espeak-ng

# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.

import os
import sys
import ucd

# Command-line inputs: argv[1] is the directory passed to ucd.parse_ucd_data;
# argv[2] is stored as ucd_version (not used in the visible part of the script).
ucd_rootdir = sys.argv[1]
ucd_version = sys.argv[2]

# Maps each code point to the 'GeneralCategory' value of its UnicodeData record.
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
	# A record's 'CodePoint' field is iterated; every code point it yields
	# receives the record's category.
	for codepoint in data['CodePoint']:
		unicode_chars[codepoint] = data['GeneralCategory']
# Optional: when --with-csur is on the command line, the 'Klingon' data set
# under data/csur is loaded too; its entries overwrite earlier values for the
# same code point.
if '--with-csur' in sys.argv:
	for csur in ['Klingon']:
		for data in ucd.parse_ucd_data('data/csur', csur):
			for codepoint in data['CodePoint']:
				unicode_chars[codepoint] = data['GeneralCategory']

# This map is a combination of the information in the UnicodeData and Blocks
# data files. It is intended to reduce the number of character tables that
# need to be generated.
category_sets = [
	(ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'),
	(ucd.CodeRange('00D800..00DFFF'), 'Cs', 'Surrogates'),
	(ucd.CodeRange('00E000..00F7FF'), 'Co', 'Private Use Area'),

예제 #8

0

파일 보기

# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.

import os
import sys
import ucd

# Command-line inputs: argv[1] is the directory passed to ucd.parse_ucd_data;
# argv[2] is stored as ucd_version (not used in the visible part of the script).
ucd_rootdir = sys.argv[1]
ucd_version = sys.argv[2]

# Maps each code point to the 'Script' value of the record that covers it.
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
    # 'Scripts' records expose their code points through the 'Range' field.
    for codepoint in data['Range']:
        unicode_chars[codepoint] = data['Script']
# Optional: when --with-csur is on the command line, the 'Klingon' data set
# under data/csur is loaded too. Those records are read through 'CodePoint'
# rather than 'Range', and they overwrite entries for already-seen code points.
if '--with-csur' in sys.argv:
    for csur in ['Klingon']:
        for data in ucd.parse_ucd_data('data/csur', csur):
            for codepoint in data['CodePoint']:
                unicode_chars[codepoint] = data['Script']

# This map is a combination of the information in the UnicodeData and Blocks
# data files. It is intended to reduce the number of character tables that
# need to be generated.
script_sets = [
    (ucd.CodeRange('000000..00D7FF'), None, 'Multiple Blocks'),
    (ucd.CodeRange('00D800..00F7FF'), 'Zzzz', 'Surrogates / Private Use Area'),
    (ucd.CodeRange('00F800..02FAFF'), None, 'Multiple Blocks'),

예제 #9

0

파일 보기

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.

import os
import sys
import ucd

# Command-line inputs: argv[1] is the directory passed to ucd.parse_ucd_data;
# argv[2] is stored as ucd_version.
ucd_rootdir = sys.argv[1]
ucd_version = sys.argv[2]

# Reference code point ('0000') that case fields are compared against.
null = ucd.CodePoint('0000')
# Maps a record's 'CodePoint' value to its (lower, upper, title) case triple;
# records whose three case fields all equal `null` are left out.
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
    case_mappings = (data['LowerCase'], data['UpperCase'], data['TitleCase'])
    if any(mapping != null for mapping in case_mappings):
        unicode_chars[data['CodePoint']] = case_mappings

if __name__ == '__main__':
    sys.stdout.write("""/* Unicode Case Conversion
 *
 * Copyright (C) 2012-2018 Reece H. Dunn
 *
 * This file is part of ucd-tools.
 *
 * ucd-tools is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by

예제 #10

0

파일 보기

# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ucd-tools.  If not, see <http://www.gnu.org/licenses/>.

import os
import sys
import ucd

# argv[1] is the directory passed to ucd.parse_ucd_data for the UCD files.
ucd_rootdir = sys.argv[1]
# Path constant for the CSUR data directory.
csur_rootdir = 'data/csur'

# Maps each code point to its UnicodeData record, extended with a
# 'Properties' list and a 'Script' entry by the loops below.
unicode_chars = {}
for data in ucd.parse_ucd_data(ucd_rootdir, 'UnicodeData'):
    for codepoint in data['CodePoint']:
        # NOTE(review): the record object itself is stored (no copy), so code
        # points yielded by one record share it and its 'Properties' list.
        unicode_chars[codepoint] = data
        unicode_chars[codepoint]['Properties'] = []
for data in ucd.parse_ucd_data(ucd_rootdir, 'PropList'):
    # Only the 'White_Space' property is recorded; other PropList rows are skipped.
    if data['Property'] in ['White_Space']:
        for codepoint in data['Range']:
            unicode_chars[codepoint]['Properties'].append(data['Property'])
for data in ucd.parse_ucd_data(ucd_rootdir, 'Scripts'):
    for codepoint in data['Range']:
        # Requires an existing entry: there is no fallback for code points
        # missing from unicode_chars.
        unicode_chars[codepoint]['Script'] = data['Script']
# Optional: when --with-csur is on the command line, walk the 'Klingon' data
# set from the CSUR directory as well.
if '--with-csur' in sys.argv:
    for csur in ['Klingon']:
        # Use the csur_rootdir constant defined earlier in this script rather
        # than repeating the 'data/csur' literal.
        for data in ucd.parse_ucd_data(csur_rootdir, csur):
            for codepoint in data['CodePoint']:
                # Records without a 'TitleCase' entry default to the code point itself.
                if 'TitleCase' not in data:
                    data['TitleCase'] = codepoint