示例#1
0
def _print_detailed(cps, inverted_target=None):
  last_block = None
  for cp in sorted(cps):
    block = unicode_data.block(cp)
    if block != last_block:
      print '    %s' % block
      last_block = block
    script = unicode_data.script(cp)
    extensions = unicode_data.script_extensions(cp) - set([script])
    if extensions:
      extensions = ' (%s)' % ','.join(sorted(extensions))
    else:
      extensions = ''
    if not inverted_target:
      extra = ''
    elif cp not in inverted_target:
      extra = ' !missing'
    else:
      scripts = sorted(inverted_target[cp])
      if len(scripts) > 3:
        script_text = ', '.join(scripts[:3]) + '... ' + scripts[-1]
      else:
        script_text = ', '.join(scripts)
      extra = ' (in %s)' % script_text
    print '    %6s %4s %2s %3s %s%s%s' % (
        '%04x' % cp,
        script,
        unicode_data.category(cp),
        unicode_data.age(cp),
        unicode_data.name(cp, ''),
        extensions,
        extra)
示例#2
0
def _print_detailed(cps, inverted_target=None):
    last_block = None
    undefined_start = -1
    undefined_end = -1

    def show_undefined(start, end):
        if start >= 0:
            if end > start:
                print("      %04x-%04x Zzzz <%d undefined>" %
                      (start, end, end - start - 1))
            else:
                print("      %04x Zzzz <1 undefined>" % start)

    for cp in sorted(cps):
        block = unicode_data.block(cp)
        if block != last_block or (undefined_end > -1
                                   and cp > undefined_end + 1):
            show_undefined(undefined_start, undefined_end)
            undefined_start, undefined_end = -1, -1
            if block != last_block:
                print("    %s" % block)
                last_block = block
        script = unicode_data.script(cp)
        if script == "Zzzz":
            if undefined_start >= 0:
                undefined_end = cp
            else:
                undefined_start, undefined_end = cp, cp
            continue

        show_undefined(undefined_start, undefined_end)
        undefined_start, undefined_end = -1, -1
        extensions = unicode_data.script_extensions(cp) - {script}
        if extensions:
            extensions = " (script %s)" % ", ".join(sorted(extensions))
        else:
            extensions = ""
        if not inverted_target:
            extra = ""
        elif cp not in inverted_target:
            extra = " !missing"
        else:
            scripts = sorted(inverted_target[cp])
            if len(scripts) > 3:
                script_text = ", ".join(scripts[:3]) + "... " + scripts[-1]
            else:
                script_text = ", ".join(scripts)
            extra = " (font %s)" % script_text
        print("    %6s %4s %2s %3s %s%s%s" % (
            "%04x" % cp,
            script,
            unicode_data.category(cp),
            unicode_data.age(cp),
            unicode_data.name(cp, ""),
            extensions,
            extra,
        ))
    show_undefined(undefined_start, undefined_end)
示例#3
0
def _print_detailed(cps, inverted_target=None):
  last_block = None
  undefined_start = -1
  undefined_end = -1
  def show_undefined(start, end):
    if start >= 0:
      if end > start:
        print '      %04x-%04x Zzzz <%d undefined>' % (
            start, end, end - start - 1)
      else:
        print '      %04x Zzzz <1 undefined>' % start

  for cp in sorted(cps):
    block = unicode_data.block(cp)
    if block != last_block or (undefined_end > -1 and cp > undefined_end + 1):
      show_undefined(undefined_start, undefined_end)
      undefined_start, undefined_end = -1, -1
      if block != last_block:
        print '    %s' % block
        last_block = block
    script = unicode_data.script(cp)
    if script == 'Zzzz':
      if undefined_start >= 0:
        undefined_end = cp
      else:
        undefined_start, undefined_end = cp, cp
      continue

    show_undefined(undefined_start, undefined_end)
    undefined_start, undefined_end = -1, -1
    extensions = unicode_data.script_extensions(cp) - set([script])
    if extensions:
      extensions = ' (%s)' % ','.join(sorted(extensions))
    else:
      extensions = ''
    if not inverted_target:
      extra = ''
    elif cp not in inverted_target:
      extra = ' !missing'
    else:
      scripts = sorted(inverted_target[cp])
      if len(scripts) > 3:
        script_text = ', '.join(scripts[:3]) + '... ' + scripts[-1]
      else:
        script_text = ', '.join(scripts)
      extra = ' (in %s)' % script_text
    print '    %6s %4s %2s %3s %s%s%s' % (
        '%04x' % cp,
        script,
        unicode_data.category(cp),
        unicode_data.age(cp),
        unicode_data.name(cp, ''),
        extensions,
        extra)
  show_undefined(undefined_start, undefined_end)
示例#4
0
def _add_text(chars, text):
  skip = False
  for i, cp in enumerate(text):
    if cp == '{':
      skip = True
      continue
    if cp == '}':
      skip = False
      continue
    if not skip:
      if cp == ' ':
        continue
      script = unicode_data.script(cp)
      if script == 'Zyyy':
        chars.add(cp)
示例#5
0
def _add_text(chars, text):
    skip = False
    for i, cp in enumerate(text):
        if cp == "{":
            skip = True
            continue
        if cp == "}":
            skip = False
            continue
        if not skip:
            if cp == " ":
                continue
            script = unicode_data.script(cp)
            if script == "Zyyy":
                chars.add(cp)
示例#6
0
def get_scripts(text):
    """Collect the scripts used by UTF-8 encoded text.

    text is decoded as UTF-8 and a few whitespace/control code points, assumed
    to be acceptable in any script, are skipped.

    Returns a (scripts, zyyy_chars) pair.  scripts is the set of script codes
    seen, leaving out "Zyyy" and "Zinh".  zyyy_chars holds the characters
    whose script is "Zyyy": the character itself when it compares below
    U+00FE, otherwise its integer code point.
    """
    ignored = {0x00, 0x0A, 0x0D, 0x20, 0xA0, 0xFEFF}
    found_scripts = set()
    common_chars = set()
    for ch in text.decode("utf8"):
        code = ord(ch)
        if code in ignored:
            continue
        tag = unicode_data.script(ch)
        if tag == "Zyyy":
            # Common/undetermined script: remember the character instead.
            common_chars.add(ch if ch < "\u00fe" else code)
        elif tag != "Zinh":
            # Inherited characters contribute nothing to either set.
            found_scripts.add(tag)
    return found_scripts, common_chars
示例#7
0
def get_script_histogram(utext):
    """Build a per-script histogram of the characters in utext.

    utext is a unicode string.  A few whitespace/control code points are
    skipped, as are characters whose script is "Zinh" (inherited).

    Returns a dict mapping each script code to a two-item list
    [count, chars]: how many characters had that script, and the set of
    distinct characters that did.
    """
    skipped = {0x00, 0x0A, 0x0D, 0x20, 0xA0, 0xFEFF}
    histogram = {}
    for ch in utext:
        if ord(ch) in skipped:
            continue
        tag = unicode_data.script(ch)
        if tag == "Zinh":
            continue
        # Create the [count, chars] entry on first sight, then update it.
        entry = histogram.setdefault(tag, [0, set()])
        entry[0] += 1
        entry[1].add(ch)
    return histogram
 def test_script(self):
     """Check script() against known values for U+A794 and U+E006."""
     # (code point, expected script code); checked in order.
     cases = (
         (0xA794, 'Latn'),
         (0xE006, 'Zzzz'),
     )
     for code_point, expected in cases:
         self.assertEqual(expected, unicode_data.script(code_point))
示例#9
0
 def test_script(self):
     """Verify that script() returns 'Latn' for U+A794 and 'Zzzz' for U+E006."""
     expected_scripts = [('Latn', 0xA794), ('Zzzz', 0xE006)]
     for wanted, code_point in expected_scripts:
         actual = unicode_data.script(code_point)
         self.assertEqual(wanted, actual)