Exemplo n.º 1
0
 def gbk_to_sjis(exc):
     """Pick a SHIFT-JIS-encodable replacement for a character that failed to encode.

     Has the shape of a ``codecs`` error handler: it receives the encode
     error and returns ``(replacement, position_to_resume_at)``.

     Replacement strategy, in order:

     1. a mapping already present in ``cdict``;
     2. a random pick among the characters ``xpy.py2hz`` returns for the
        character's pinyin (and for the pinyin minus its last character)
        that ``encode(..., 'SHIFT-JIS')`` accepts -- the pick is stored in
        ``cdict`` and the code point is recorded in ``guess_chars``;
     3. spaces covering the failed span, with the code point recorded in
        ``except_chars``.

     Anything that is not a ``UnicodeEncodeError`` is re-raised unchanged.
     """
     if not isinstance(exc, UnicodeEncodeError):
         raise exc

     start, resume_at = exc.start, exc.end
     failed = exc.object[start:resume_at]
     codepoint = ord(failed)

     # Known mapping: reuse it.
     if codepoint in cdict:
         return chr(cdict[codepoint]), resume_at

     reading = xpy.get_pinyin(failed)
     candidates = []
     if reading:
         # Second key drops the last character of the pinyin string
         # (looks like a looser lookup -- confirm against xpinyin).
         for key in (reading, reading[:-1]):
             for candidate in xpy.py2hz(key):
                 try:
                     encode(candidate, 'SHIFT-JIS')
                 except UnicodeEncodeError:
                     continue
                 candidates.append(candidate)

     if not candidates:
         # Nothing usable: blank out the span and remember the character.
         except_chars.add(codepoint)
         return ' ' * (resume_at - start), resume_at

     replacement = random.choice(candidates)
     # Cache the guess so the same character maps the same way next time.
     cdict[codepoint] = ord(replacement)
     guess_chars.add(codepoint)
     return replacement, resume_at
Exemplo n.º 2
0
 def gbk_to_sjis(exc):
     """Pick a SHIFT-JIS-encodable replacement for a character that failed to encode.

     Has the shape of a ``codecs`` error handler: it receives the encode
     error and returns ``(replacement, position_to_resume_at)``.

     Replacement strategy, in order:

     1. a mapping already present in ``cdict``;
     2. a random pick among the characters ``xpy.py2hz`` returns for the
        character's pinyin (and for the pinyin minus its last character)
        that ``encode(..., 'SHIFT-JIS')`` accepts -- the pick is stored in
        ``cdict`` and the code point is recorded in ``guess_chars``;
     3. spaces covering the failed span, with the code point recorded in
        ``except_chars``.

     Anything that is not a ``UnicodeEncodeError`` is re-raised unchanged.
     """
     if not isinstance(exc, UnicodeEncodeError):
         raise exc

     start, resume_at = exc.start, exc.end
     failed = exc.object[start:resume_at]
     codepoint = ord(failed)

     # Known mapping: reuse it.
     if codepoint in cdict:
         return chr(cdict[codepoint]), resume_at

     reading = xpy.get_pinyin(failed)
     candidates = []
     if reading:
         # Second key drops the last character of the pinyin string
         # (looks like a looser lookup -- confirm against xpinyin).
         for key in (reading, reading[:-1]):
             for candidate in xpy.py2hz(key):
                 try:
                     encode(candidate, 'SHIFT-JIS')
                 except UnicodeEncodeError:
                     continue
                 candidates.append(candidate)

     if not candidates:
         # Nothing usable: blank out the span and remember the character.
         except_chars.add(codepoint)
         return ' ' * (resume_at - start), resume_at

     replacement = random.choice(candidates)
     # Cache the guess so the same character maps the same way next time.
     cdict[codepoint] = ord(replacement)
     guess_chars.add(codepoint)
     return replacement, resume_at
Exemplo n.º 3
0
def hzconvert(text, from_, to_, method='auto', chardict=None):
    """Encode *text* to SHIFT-JIS, substituting characters the codec rejects.

    Only ``from_='gbk'``, ``to_='sjis'``, ``method='auto'`` is implemented;
    any other combination fails the assertion at the top.

    For every character SHIFT-JIS cannot encode, a replacement is chosen:

    1. the mapping in ``chconv.Chinese2Kanji_Table`` (extended with the
       usable entries of *chardict*);
    2. otherwise a random pick among the characters ``xpy.py2hz`` returns
       for the character's pinyin (and for the pinyin minus its last
       character) that can be encoded to SHIFT-JIS;
    3. otherwise spaces covering the failed span.

    Both groups of characters (unencodable, guessed) are printed before
    returning.

    Args:
        text: String to encode.
        from_: Source encoding name; must be ``'gbk'``.
        to_: Target encoding name; must be ``'sjis'``.
        method: Conversion method; must be ``'auto'``.
        chardict: Optional mapping of single character -> replacement
            character. Entries whose replacement cannot be encoded to
            SHIFT-JIS are skipped. ``None`` (the default) adds nothing.

    Returns:
        The result of ``text.encode('SHIFT-JIS', errors='gbk_to_sjis')``.

    Raises:
        AssertionError: If the from_/to_/method combination is unsupported.
        UnicodeError: Re-raised from the encode call after printing the
            offending slice.
    """
    assert from_ == 'gbk' and to_ == 'sjis' and method == 'auto'

    from zhtools import chconv, xpinyin

    def sjis_encodable(candidate):
        # True when the file's encode() helper accepts the character.
        try:
            encode(candidate, 'SHIFT-JIS')
        except UnicodeEncodeError:
            return False
        return True

    # NOTE: this is the table object owned by chconv, not a copy; every
    # write below (user overrides and cached guesses) is visible to other
    # users of that table.
    cdict = chconv.Chinese2Kanji_Table

    # Fix: chardict defaults to None, which previously crashed on .items().
    for k, v in (chardict or {}).items():
        if sjis_encodable(v):
            cdict[ord(k)] = ord(v)

    xpy = xpinyin.Pinyin()
    guess_chars = set()
    except_chars = set()

    def gbk_to_sjis(exc):
        # codecs error handler: returns (replacement, position to resume at).
        if not isinstance(exc, UnicodeEncodeError):
            raise exc
        newpos = exc.end
        char = exc.object[exc.start:exc.end]
        c = ord(char)
        if c in cdict:
            return chr(cdict[c]), newpos

        pinyin = xpy.get_pinyin(char)
        ok = []
        if pinyin:
            # Second key drops the last character of the pinyin string
            # (looks like a looser lookup -- confirm against xpinyin).
            for key in (pinyin, pinyin[:-1]):
                ok.extend(hz for hz in xpy.py2hz(key) if sjis_encodable(hz))

        if ok:
            newchar = random.choice(ok)
            # Cache the guess so the same character maps consistently.
            cdict[c] = ord(newchar)
            guess_chars.add(c)
            return newchar, newpos

        # No candidate: blank out the span and remember the character.
        except_chars.add(c)
        return ' ' * (newpos - exc.start), newpos

    def show(codepoints):
        # Print the characters for the given code points (py2/py3 aware).
        if py3k:
            print(''.join(chr(c) for c in codepoints))
        else:
            print(encode(UEMPTY.join(chr(c) for c in codepoints)))

    codecs.register_error('gbk_to_sjis', gbk_to_sjis)
    try:
        text = text.encode('SHIFT-JIS', errors='gbk_to_sjis')
    except UnicodeError as exc:
        # Show the slice that still could not be encoded, then propagate.
        char = exc.object[exc.start:exc.end]
        print(char)
        raise

    print('These chars cannot encode to shift-jis:')
    show(except_chars)
    print('These chars can be guessed by pinyin:')
    show(guess_chars)
    return text
Exemplo n.º 4
0
def hzconvert(text, from_, to_, method='auto', chardict=None):
    """Encode *text* to SHIFT-JIS, substituting characters the codec rejects.

    Only ``from_='gbk'``, ``to_='sjis'``, ``method='auto'`` is implemented;
    any other combination fails the assertion at the top.

    For every character SHIFT-JIS cannot encode, a replacement is chosen:

    1. the mapping in ``chconv.Chinese2Kanji_Table`` (extended with the
       usable entries of *chardict*);
    2. otherwise a random pick among the characters ``xpy.py2hz`` returns
       for the character's pinyin (and for the pinyin minus its last
       character) that can be encoded to SHIFT-JIS;
    3. otherwise spaces covering the failed span.

    Both groups of characters (unencodable, guessed) are printed before
    returning.

    Args:
        text: String to encode.
        from_: Source encoding name; must be ``'gbk'``.
        to_: Target encoding name; must be ``'sjis'``.
        method: Conversion method; must be ``'auto'``.
        chardict: Optional mapping of single character -> replacement
            character. Entries whose replacement cannot be encoded to
            SHIFT-JIS are skipped. ``None`` (the default) adds nothing.

    Returns:
        The result of ``text.encode('SHIFT-JIS', errors='gbk_to_sjis')``.

    Raises:
        AssertionError: If the from_/to_/method combination is unsupported.
        UnicodeError: Re-raised from the encode call after printing the
            offending slice.
    """
    assert from_ == 'gbk' and to_ == 'sjis' and method == 'auto'

    from zhtools import chconv, xpinyin

    def sjis_encodable(candidate):
        # True when the file's encode() helper accepts the character.
        try:
            encode(candidate, 'SHIFT-JIS')
        except UnicodeEncodeError:
            return False
        return True

    # NOTE: this is the table object owned by chconv, not a copy; every
    # write below (user overrides and cached guesses) is visible to other
    # users of that table.
    cdict = chconv.Chinese2Kanji_Table

    # Fix: chardict defaults to None, which previously crashed on .items().
    for k, v in (chardict or {}).items():
        if sjis_encodable(v):
            cdict[ord(k)] = ord(v)

    xpy = xpinyin.Pinyin()
    guess_chars = set()
    except_chars = set()

    def gbk_to_sjis(exc):
        # codecs error handler: returns (replacement, position to resume at).
        if not isinstance(exc, UnicodeEncodeError):
            raise exc
        newpos = exc.end
        char = exc.object[exc.start:exc.end]
        c = ord(char)
        if c in cdict:
            return chr(cdict[c]), newpos

        pinyin = xpy.get_pinyin(char)
        ok = []
        if pinyin:
            # Second key drops the last character of the pinyin string
            # (looks like a looser lookup -- confirm against xpinyin).
            for key in (pinyin, pinyin[:-1]):
                ok.extend(hz for hz in xpy.py2hz(key) if sjis_encodable(hz))

        if ok:
            newchar = random.choice(ok)
            # Cache the guess so the same character maps consistently.
            cdict[c] = ord(newchar)
            guess_chars.add(c)
            return newchar, newpos

        # No candidate: blank out the span and remember the character.
        except_chars.add(c)
        return ' ' * (newpos - exc.start), newpos

    def show(codepoints):
        # Print the characters for the given code points (py2/py3 aware).
        if py3k:
            print(''.join(chr(c) for c in codepoints))
        else:
            print(encode(UEMPTY.join(chr(c) for c in codepoints)))

    codecs.register_error('gbk_to_sjis', gbk_to_sjis)
    try:
        text = text.encode('SHIFT-JIS', errors='gbk_to_sjis')
    except UnicodeError as exc:
        # Show the slice that still could not be encoded, then propagate.
        char = exc.object[exc.start:exc.end]
        print(char)
        raise

    print('These chars cannot encode to shift-jis:')
    show(except_chars)
    print('These chars can be guessed by pinyin:')
    show(guess_chars)
    return text