def _print_detailed(cps, inverted_target=None): last_block = None for cp in sorted(cps): block = unicode_data.block(cp) if block != last_block: print ' %s' % block last_block = block script = unicode_data.script(cp) extensions = unicode_data.script_extensions(cp) - set([script]) if extensions: extensions = ' (%s)' % ','.join(sorted(extensions)) else: extensions = '' if not inverted_target: extra = '' elif cp not in inverted_target: extra = ' !missing' else: scripts = sorted(inverted_target[cp]) if len(scripts) > 3: script_text = ', '.join(scripts[:3]) + '... ' + scripts[-1] else: script_text = ', '.join(scripts) extra = ' (in %s)' % script_text print ' %6s %4s %2s %3s %s%s%s' % ( '%04x' % cp, script, unicode_data.category(cp), unicode_data.age(cp), unicode_data.name(cp, ''), extensions, extra)
def _print_detailed(cps, inverted_target=None): last_block = None undefined_start = -1 undefined_end = -1 def show_undefined(start, end): if start >= 0: if end > start: print(" %04x-%04x Zzzz <%d undefined>" % (start, end, end - start - 1)) else: print(" %04x Zzzz <1 undefined>" % start) for cp in sorted(cps): block = unicode_data.block(cp) if block != last_block or (undefined_end > -1 and cp > undefined_end + 1): show_undefined(undefined_start, undefined_end) undefined_start, undefined_end = -1, -1 if block != last_block: print(" %s" % block) last_block = block script = unicode_data.script(cp) if script == "Zzzz": if undefined_start >= 0: undefined_end = cp else: undefined_start, undefined_end = cp, cp continue show_undefined(undefined_start, undefined_end) undefined_start, undefined_end = -1, -1 extensions = unicode_data.script_extensions(cp) - {script} if extensions: extensions = " (script %s)" % ", ".join(sorted(extensions)) else: extensions = "" if not inverted_target: extra = "" elif cp not in inverted_target: extra = " !missing" else: scripts = sorted(inverted_target[cp]) if len(scripts) > 3: script_text = ", ".join(scripts[:3]) + "... " + scripts[-1] else: script_text = ", ".join(scripts) extra = " (font %s)" % script_text print(" %6s %4s %2s %3s %s%s%s" % ( "%04x" % cp, script, unicode_data.category(cp), unicode_data.age(cp), unicode_data.name(cp, ""), extensions, extra, )) show_undefined(undefined_start, undefined_end)
def _print_detailed(cps, inverted_target=None): last_block = None undefined_start = -1 undefined_end = -1 def show_undefined(start, end): if start >= 0: if end > start: print ' %04x-%04x Zzzz <%d undefined>' % ( start, end, end - start - 1) else: print ' %04x Zzzz <1 undefined>' % start for cp in sorted(cps): block = unicode_data.block(cp) if block != last_block or (undefined_end > -1 and cp > undefined_end + 1): show_undefined(undefined_start, undefined_end) undefined_start, undefined_end = -1, -1 if block != last_block: print ' %s' % block last_block = block script = unicode_data.script(cp) if script == 'Zzzz': if undefined_start >= 0: undefined_end = cp else: undefined_start, undefined_end = cp, cp continue show_undefined(undefined_start, undefined_end) undefined_start, undefined_end = -1, -1 extensions = unicode_data.script_extensions(cp) - set([script]) if extensions: extensions = ' (%s)' % ','.join(sorted(extensions)) else: extensions = '' if not inverted_target: extra = '' elif cp not in inverted_target: extra = ' !missing' else: scripts = sorted(inverted_target[cp]) if len(scripts) > 3: script_text = ', '.join(scripts[:3]) + '... ' + scripts[-1] else: script_text = ', '.join(scripts) extra = ' (in %s)' % script_text print ' %6s %4s %2s %3s %s%s%s' % ( '%04x' % cp, script, unicode_data.category(cp), unicode_data.age(cp), unicode_data.name(cp, ''), extensions, extra) show_undefined(undefined_start, undefined_end)
def _add_text(chars, text): skip = False for i, cp in enumerate(text): if cp == '{': skip = True continue if cp == '}': skip = False continue if not skip: if cp == ' ': continue script = unicode_data.script(cp) if script == 'Zyyy': chars.add(cp)
def _add_text(chars, text): skip = False for i, cp in enumerate(text): if cp == "{": skip = True continue if cp == "}": skip = False continue if not skip: if cp == " ": continue script = unicode_data.script(cp) if script == "Zyyy": chars.add(cp)
def get_scripts(text):
    """Return the scripts used in text, plus its common-script characters.

    Args:
      text: UTF-8 encoded bytes, or text that is already decoded.

    Returns:
      A (scripts, zyyy_chars) tuple.  scripts is the set of script codes
      found, excluding "Zyyy" and "Zinh".  zyyy_chars holds the characters
      whose script is "Zyyy": those that compare below "\u00fe" are stored
      as the character itself, the rest as their integer code point.
    """
    # ignore these chars, we assume they are ok in any script
    exclusions = {0x00, 0x0A, 0x0D, 0x20, 0xA0, 0xFEFF}
    zyyy_chars = set()
    scripts = set()
    # Only bytes need decoding; already-decoded text has no usable decode().
    ustr = text.decode("utf8") if isinstance(text, bytes) else text
    for cp in ustr:
        if ord(cp) in exclusions:
            continue
        script = unicode_data.script(cp)
        if script == "Zyyy":  # common/undetermined
            zyyy_chars.add(cp if cp < "\u00fe" else ord(cp))
        elif script != "Zinh":  # inherited
            scripts.add(script)
    return scripts, zyyy_chars
def get_script_histogram(utext):
    """Map each script in utext to [count, set of characters seen].

    utext is a unicode string.  A fixed set of whitespace and control
    characters (NUL, LF, CR, space, NBSP, U+FEFF) is ignored, and so are
    characters whose script is "Zinh".
    """
    ignored = {0x00, 0x0A, 0x0D, 0x20, 0xA0, 0xFEFF}
    histogram = {}
    for ch in utext:
        if ord(ch) in ignored:
            continue
        script = unicode_data.script(ch)
        if script == "Zinh":
            continue
        entry = histogram.get(script)
        if entry is None:
            # First sighting of this script.
            histogram[script] = [1, {ch}]
        else:
            entry[0] += 1
            entry[1].add(ch)
    return histogram
def test_script(self):
    """Check script() against known code point / script pairs."""
    expectations = (
        ('Latn', 0xA794),
        ('Zzzz', 0xE006),
    )
    for expected, cp in expectations:
        self.assertEqual(expected, unicode_data.script(cp))