예제 #1
0
def detectScript(txt):
    """Return a list holding one resolved script value per character of txt.

    Each character starts with whatever ``script(c)`` reports. Characters
    whose script is in ``UNKNOWN_SCRIPT`` are then resolved from their
    neighbours in three passes:

    1. Forward: take over the already-resolved script of the preceding
       character. The first character, and any character that is in
       ``MIRRORED`` with category ``"Pe"``, is left as ``None`` instead.
    2. Backward: every ``None`` receives the script of the nearest resolved
       character that follows it.
    3. Forward: anything still ``None`` (nothing resolved follows it)
       receives the nearest preceding script, or ``"Zxxx"`` when there is
       none at all.

    The returned list has the same length as ``txt`` and contains no
    ``None`` entries.
    """
    scripts = [script(c) for c in txt]

    # Pass 1: let unknown-script characters inherit from the left.
    for idx, char in enumerate(txt):
        if scripts[idx] not in UNKNOWN_SCRIPT:
            continue
        # scripts[idx - 1] was already resolved earlier in this same pass.
        inherited = scripts[idx - 1] if idx else None
        cat = category(char)
        if cat == "Pe" and char in MIRRORED:
            # Leave unresolved here; the later passes decide its script.
            inherited = None
        scripts[idx] = inherited

    # Pass 2: walk right-to-left so each unresolved entry takes the script
    # of the _next_ resolved character.
    following = None
    for idx in reversed(range(len(txt))):
        current = scripts[idx]
        if current is None:
            scripts[idx] = following
        else:
            following = current

    # Pass 3: unresolved entries at the end of the string have nothing
    # after them, so fall back to the preceding script.
    preceding = "Zxxx"  # last resort
    for idx, current in enumerate(scripts):
        if current is None:
            scripts[idx] = preceding
        else:
            preceding = current

    assert None not in scripts

    return scripts
예제 #2
0
def unicodeScriptDirection(uv):
    """Return the horizontal direction of the script of code point ``uv``.

    ``uv`` is an integer code point. When its script is one of
    ``DFLT_SCRIPTS`` the result is ``None``; otherwise it is whatever
    ``unicodedata.script_horizontal_direction`` returns for that script.
    """
    scriptCode = unicodedata.script(chr(uv))
    return (
        None
        if scriptCode in DFLT_SCRIPTS
        else unicodedata.script_horizontal_direction(scriptCode)
    )
예제 #3
0
def test_script():
    """Check ``unicodedata.script`` against a table of known code points.

    The single string-literal case is asserted on its own; every other
    case is a ``(code point, expected script code)`` pair that is converted
    with ``unichr`` and checked in table order, so the first mismatch
    raises ``AssertionError``.
    """
    assert unicodedata.script("a") == "Latn"

    expectations = (
        # special code points
        (0, "Zyyy"),
        (0x0378, "Zzzz"),
        (0x10FFFF, "Zzzz"),
        # these were randomly sampled, one character per script
        (0x1E918, 'Adlm'),
        (0x1170D, 'Ahom'),
        (0x145A0, 'Hluw'),
        (0x0607, 'Arab'),
        (0x056C, 'Armn'),
        (0x10B27, 'Avst'),
        (0x1B41, 'Bali'),
        (0x168AD, 'Bamu'),
        (0x16ADD, 'Bass'),
        (0x1BE5, 'Batk'),
        (0x09F3, 'Beng'),
        (0x11C5B, 'Bhks'),
        (0x3126, 'Bopo'),
        (0x1103B, 'Brah'),
        (0x2849, 'Brai'),
        (0x1A0A, 'Bugi'),
        (0x174E, 'Buhd'),
        (0x18EE, 'Cans'),
        (0x102B7, 'Cari'),
        (0x1053D, 'Aghb'),
        (0x11123, 'Cakm'),
        (0xAA1F, 'Cham'),
        (0xAB95, 'Cher'),
        (0x1F0C7, 'Zyyy'),
        (0x2C85, 'Copt'),
        (0x12014, 'Xsux'),
        (0x1082E, 'Cprt'),
        (0xA686, 'Cyrl'),
        (0x10417, 'Dsrt'),
        (0x093E, 'Deva'),
        (0x1BC4B, 'Dupl'),
        (0x1310C, 'Egyp'),
        (0x1051C, 'Elba'),
        (0x2DA6, 'Ethi'),
        (0x10AD, 'Geor'),
        (0x2C52, 'Glag'),
        (0x10343, 'Goth'),
        (0x11371, 'Gran'),
        (0x03D0, 'Grek'),
        (0x0AAA, 'Gujr'),
        (0x0A4C, 'Guru'),
        (0x23C9F, 'Hani'),
        (0xC259, 'Hang'),
        (0x1722, 'Hano'),
        (0x108F5, 'Hatr'),
        (0x05C2, 'Hebr'),
        (0x1B072, 'Hira'),
        (0x10847, 'Armi'),
        (0x033A, 'Zinh'),
        (0x10B66, 'Phli'),
        (0x10B4B, 'Prti'),
        (0xA98A, 'Java'),
        (0x110B2, 'Kthi'),
        (0x0CC6, 'Knda'),
        (0x3337, 'Kana'),
        (0xA915, 'Kali'),
        (0x10A2E, 'Khar'),
        (0x17AA, 'Khmr'),
        (0x11225, 'Khoj'),
        (0x112B6, 'Sind'),
        (0x0ED7, 'Laoo'),
        (0xAB3C, 'Latn'),
        (0x1C48, 'Lepc'),
        (0x1923, 'Limb'),
        (0x1071D, 'Lina'),
        (0x100EC, 'Linb'),
        (0xA4E9, 'Lisu'),
        (0x10284, 'Lyci'),
        (0x10926, 'Lydi'),
        (0x11161, 'Mahj'),
        (0x0D56, 'Mlym'),
        (0x0856, 'Mand'),
        (0x10AF0, 'Mani'),
        (0x11CB0, 'Marc'),
        (0x11D28, 'Gonm'),
        (0xABDD, 'Mtei'),
        (0x1E897, 'Mend'),
        (0x109B0, 'Merc'),
        (0x10993, 'Mero'),
        (0x16F5D, 'Plrd'),
        (0x1160B, 'Modi'),
        (0x18A8, 'Mong'),
        (0x16A48, 'Mroo'),
        (0x1128C, 'Mult'),
        (0x105B, 'Mymr'),
        (0x108AF, 'Nbat'),
        (0x19B3, 'Talu'),
        (0x1143D, 'Newa'),
        (0x07F4, 'Nkoo'),
        (0x1B192, 'Nshu'),
        (0x169C, 'Ogam'),
        (0x1C56, 'Olck'),
        (0x10CE9, 'Hung'),
        (0x10316, 'Ital'),
        (0x10A93, 'Narb'),
        (0x1035A, 'Perm'),
        (0x103D5, 'Xpeo'),
        (0x10A65, 'Sarb'),
        (0x10C09, 'Orkh'),
        (0x0B60, 'Orya'),
        (0x104CF, 'Osge'),
        (0x104A8, 'Osma'),
        (0x16B12, 'Hmng'),
        (0x10879, 'Palm'),
        (0x11AF1, 'Pauc'),
        (0xA869, 'Phag'),
        (0x10909, 'Phnx'),
        (0x10B81, 'Phlp'),
        (0xA941, 'Rjng'),
        (0x16C3, 'Runr'),
        (0x0814, 'Samr'),
        (0xA88C, 'Saur'),
        (0x111C8, 'Shrd'),
        (0x1045F, 'Shaw'),
        (0x115AD, 'Sidd'),
        (0x1D8C0, 'Sgnw'),
        (0x0DB9, 'Sinh'),
        (0x110F9, 'Sora'),
        (0x11A60, 'Soyo'),
        (0x1B94, 'Sund'),
        (0xA81F, 'Sylo'),
        (0x0740, 'Syrc'),
        (0x1714, 'Tglg'),
        (0x1761, 'Tagb'),
        (0x1965, 'Tale'),
        (0x1A32, 'Lana'),
        (0xAA86, 'Tavt'),
        (0x116A5, 'Takr'),
        (0x0B8E, 'Taml'),
        (0x1754D, 'Tang'),
        (0x0C40, 'Telu'),
        (0x07A4, 'Thaa'),
        (0x0E42, 'Thai'),
        (0x0F09, 'Tibt'),
        (0x2D3A, 'Tfng'),
        (0x114B0, 'Tirh'),
        (0x1038B, 'Ugar'),
        (0xA585, 'Vaii'),
        (0x118CF, 'Wara'),
        (0xA066, 'Yiii'),
        (0x11A31, 'Zanb'),
    )
    for codepoint, expected in expectations:
        assert unicodedata.script(unichr(codepoint)) == expected
예제 #4
0
def test_script():
    """Verify the script code reported for a fixed set of code points.

    The plain string ``"a"`` is checked directly. All remaining cases are
    listed as ``(code point, script code)`` rows and checked one after
    another, in order, through ``unichr``; the first row that disagrees
    raises ``AssertionError``.
    """

    def scriptOf(codepoint):
        # Look up the script code of the character for an integer code point.
        return unicodedata.script(unichr(codepoint))

    assert unicodedata.script("a") == "Latn"

    rows = [
        # special code points
        (0, "Zyyy"),
        (0x0378, "Zzzz"),
        (0x10FFFF, "Zzzz"),
        # these were randomly sampled, one character per script
        (0x1E918, 'Adlm'),
        (0x1170D, 'Ahom'),
        (0x145A0, 'Hluw'),
        (0x0607, 'Arab'),
        (0x056C, 'Armn'),
        (0x10B27, 'Avst'),
        (0x1B41, 'Bali'),
        (0x168AD, 'Bamu'),
        (0x16ADD, 'Bass'),
        (0x1BE5, 'Batk'),
        (0x09F3, 'Beng'),
        (0x11C5B, 'Bhks'),
        (0x3126, 'Bopo'),
        (0x1103B, 'Brah'),
        (0x2849, 'Brai'),
        (0x1A0A, 'Bugi'),
        (0x174E, 'Buhd'),
        (0x18EE, 'Cans'),
        (0x102B7, 'Cari'),
        (0x1053D, 'Aghb'),
        (0x11123, 'Cakm'),
        (0xAA1F, 'Cham'),
        (0xAB95, 'Cher'),
        (0x1F0C7, 'Zyyy'),
        (0x2C85, 'Copt'),
        (0x12014, 'Xsux'),
        (0x1082E, 'Cprt'),
        (0xA686, 'Cyrl'),
        (0x10417, 'Dsrt'),
        (0x093E, 'Deva'),
        (0x1BC4B, 'Dupl'),
        (0x1310C, 'Egyp'),
        (0x1051C, 'Elba'),
        (0x2DA6, 'Ethi'),
        (0x10AD, 'Geor'),
        (0x2C52, 'Glag'),
        (0x10343, 'Goth'),
        (0x11371, 'Gran'),
        (0x03D0, 'Grek'),
        (0x0AAA, 'Gujr'),
        (0x0A4C, 'Guru'),
        (0x23C9F, 'Hani'),
        (0xC259, 'Hang'),
        (0x1722, 'Hano'),
        (0x108F5, 'Hatr'),
        (0x05C2, 'Hebr'),
        (0x1B072, 'Hira'),
        (0x10847, 'Armi'),
        (0x033A, 'Zinh'),
        (0x10B66, 'Phli'),
        (0x10B4B, 'Prti'),
        (0xA98A, 'Java'),
        (0x110B2, 'Kthi'),
        (0x0CC6, 'Knda'),
        (0x3337, 'Kana'),
        (0xA915, 'Kali'),
        (0x10A2E, 'Khar'),
        (0x17AA, 'Khmr'),
        (0x11225, 'Khoj'),
        (0x112B6, 'Sind'),
        (0x0ED7, 'Laoo'),
        (0xAB3C, 'Latn'),
        (0x1C48, 'Lepc'),
        (0x1923, 'Limb'),
        (0x1071D, 'Lina'),
        (0x100EC, 'Linb'),
        (0xA4E9, 'Lisu'),
        (0x10284, 'Lyci'),
        (0x10926, 'Lydi'),
        (0x11161, 'Mahj'),
        (0x0D56, 'Mlym'),
        (0x0856, 'Mand'),
        (0x10AF0, 'Mani'),
        (0x11CB0, 'Marc'),
        (0x11D28, 'Gonm'),
        (0xABDD, 'Mtei'),
        (0x1E897, 'Mend'),
        (0x109B0, 'Merc'),
        (0x10993, 'Mero'),
        (0x16F5D, 'Plrd'),
        (0x1160B, 'Modi'),
        (0x18A8, 'Mong'),
        (0x16A48, 'Mroo'),
        (0x1128C, 'Mult'),
        (0x105B, 'Mymr'),
        (0x108AF, 'Nbat'),
        (0x19B3, 'Talu'),
        (0x1143D, 'Newa'),
        (0x07F4, 'Nkoo'),
        (0x1B192, 'Nshu'),
        (0x169C, 'Ogam'),
        (0x1C56, 'Olck'),
        (0x10CE9, 'Hung'),
        (0x10316, 'Ital'),
        (0x10A93, 'Narb'),
        (0x1035A, 'Perm'),
        (0x103D5, 'Xpeo'),
        (0x10A65, 'Sarb'),
        (0x10C09, 'Orkh'),
        (0x0B60, 'Orya'),
        (0x104CF, 'Osge'),
        (0x104A8, 'Osma'),
        (0x16B12, 'Hmng'),
        (0x10879, 'Palm'),
        (0x11AF1, 'Pauc'),
        (0xA869, 'Phag'),
        (0x10909, 'Phnx'),
        (0x10B81, 'Phlp'),
        (0xA941, 'Rjng'),
        (0x16C3, 'Runr'),
        (0x0814, 'Samr'),
        (0xA88C, 'Saur'),
        (0x111C8, 'Shrd'),
        (0x1045F, 'Shaw'),
        (0x115AD, 'Sidd'),
        (0x1D8C0, 'Sgnw'),
        (0x0DB9, 'Sinh'),
        (0x110F9, 'Sora'),
        (0x11A60, 'Soyo'),
        (0x1B94, 'Sund'),
        (0xA81F, 'Sylo'),
        (0x0740, 'Syrc'),
        (0x1714, 'Tglg'),
        (0x1761, 'Tagb'),
        (0x1965, 'Tale'),
        (0x1A32, 'Lana'),
        (0xAA86, 'Tavt'),
        (0x116A5, 'Takr'),
        (0x0B8E, 'Taml'),
        (0x1754D, 'Tang'),
        (0x0C40, 'Telu'),
        (0x07A4, 'Thaa'),
        (0x0E42, 'Thai'),
        (0x0F09, 'Tibt'),
        (0x2D3A, 'Tfng'),
        (0x114B0, 'Tirh'),
        (0x1038B, 'Ugar'),
        (0xA585, 'Vaii'),
        (0x118CF, 'Wara'),
        (0xA066, 'Yiii'),
        (0x11A31, 'Zanb'),
    ]
    for codepoint, wanted in rows:
        assert scriptOf(codepoint) == wanted
예제 #5
0
def script(value):
    """Return the script name for the integer code point ``value``.

    The code point is converted to a character, its script code is looked
    up with ``unicodedata.script``, and that code is passed to
    ``unicodedata.script_name`` with ``"Unknown"`` as the default.
    """
    code = unicodedata.script(chr(value))
    return unicodedata.script_name(code, default="Unknown")
예제 #6
0
    cps_to_value = dict[range, str]()
    for line in path.read_text().splitlines():

        if not (content := line.partition("#")[0].strip()):
            continue

        field_0, value = [i.strip() for i in content.split(";")]
        start, _, stop = field_0.partition("..")

        cps_to_value[range(int(start, 16), int(stop or start, 16) + 1)] = value

    script_to_value_to_cps = dict[str, dict[str, list[int]]]()
    for cp, value in sorted(
        (i, v) for k, v in cps_to_value.items() for i in k):
        value_to_cps = script_to_value_to_cps.setdefault(
            unicodedata.script_name(unicodedata.script(chr(cp))), {})
        value_to_cps.setdefault(value, list()).append(cp)

    # yaml.add_representer(
    #     int,
    #     lambda dumper, data: dumper.represent_scalar("tag:yaml.org,2002:int", f"0x{data:04X}"),
    # )
    path = directory / "tests" / (property_name + ".yaml")
    with path.open("w") as f:
        yaml.dump(
            {
                k: {
                    value: [{
                        i: unicodedata.name(chr(i))
                    } for i in cps]
                    for value, cps in v.items()
예제 #7
0
    ],
                               stdin=subprocess.PIPE)
    process.communicate(
        json.dumps(font, ensure_ascii=False, separators=(',', ':')).encode())


# Command-line arguments: the font base name, followed by two integer scale
# values. Only the ratio scaleEn / scaleZh is applied to glyphs below; both
# values are also passed to ResolveFileName for the output name.
baseName = sys.argv[1]
scaleZh = int(sys.argv[2])
scaleEn = int(sys.argv[3])

# Load the source font and remove its GSUB, GPOS and GDEF entries.
# NOTE(review): `del` raises KeyError if an entry is missing — confirm every
# input font carries all three.
font = ReadFont("src/font/{}.otf".format(baseName))
del font['GSUB']
del font['GPOS']
del font['GDEF']

# Gc is defined outside this view; presumably it prunes unused glyphs — TODO confirm.
Gc(font)

# Map each glyph name to a truthy value when at least one code point that
# cmap assigns to it has script "Hani"; once set, the flag stays set.
scriptMap = {}
for cp, name in font['cmap'].items():
    # cmap keys are converted with int() before use.
    cp = int(cp)
    isZh = script(chr(cp)) == "Hani"
    scriptMap[name] = scriptMap.get(name) or isZh

# Every glyph that is not flagged as "Hani" (including glyphs absent from
# cmap) is passed to Transform with scaleEn / scaleZh in the first and
# fourth numeric slots and 0 elsewhere.
# NOTE(review): presumably a uniform scale matrix with zero offset; the
# meaning of the trailing True is not visible here — confirm against Transform.
for name, glyph in font['glyf'].items():
    if not scriptMap.get(name):
        Transform(glyph, scaleEn / scaleZh, 0, 0, scaleEn / scaleZh, 0, 0,
                  True)

# Write the result under build/, using the file name produced by
# ResolveFileName from the base name and both scale values.
WriteFont(font,
          "build/{}.otf".format(ResolveFileName(baseName, scaleZh, scaleEn)))