def gbk_to_sjis(exc):
    """Codec error handler substituting characters SHIFT-JIS cannot encode.

    Replacement lookup order:

    1. the ``cdict`` code-point table;
    2. a random pick among the SHIFT-JIS-encodable candidates that
       ``xpy.py2hz`` yields for the character's pinyin and for the pinyin
       with its last character removed (the pick is stored in ``cdict`` and
       the code point is recorded in ``guess_chars``);
    3. spaces, one per character of the failing range (the code point is
       recorded in ``except_chars``).

    Returns a ``(replacement, resume_position)`` tuple.  Any exception that
    is not a ``UnicodeEncodeError`` is re-raised unchanged.
    """
    if not isinstance(exc, UnicodeEncodeError):
        raise exc

    resume_at = exc.end
    bad_char = exc.object[exc.start:resume_at]
    code_point = ord(bad_char)

    # Known mapping: either preloaded or remembered from an earlier guess.
    if code_point in cdict:
        return chr(cdict[code_point]), resume_at

    reading = xpy.get_pinyin(bad_char)
    candidates = []
    if reading:
        # Full reading first, then the reading minus its last character.
        for key in (reading, reading[:-1]):
            for candidate in xpy.py2hz(key):
                try:
                    encode(candidate, 'SHIFT-JIS')
                except UnicodeEncodeError:
                    continue
                candidates.append(candidate)

    if candidates:
        chosen = random.choice(candidates)
        cdict[code_point] = ord(chosen)
        guess_chars.add(code_point)
        return chosen, resume_at

    # Nothing usable: blank out the failing range.
    except_chars.add(code_point)
    return ' ' * (resume_at - exc.start), resume_at
def hzconvert(text, from_, to_, method='auto', chardict=None):
    """Encode ``text`` to SHIFT-JIS, substituting characters it cannot hold.

    Characters the SHIFT-JIS codec rejects are replaced by a codec error
    handler (registered under the name ``'gbk_to_sjis'``) which tries, in
    order: the ``chconv.Chinese2Kanji_Table`` mapping extended with
    ``chardict``, a random SHIFT-JIS-encodable candidate found through the
    character's pinyin, and finally spaces.

    Args:
        text: the string to encode.
        from_: must be ``'gbk'``.
        to_: must be ``'sjis'``.
        method: must be ``'auto'``.
        chardict: optional mapping of source character to replacement
            character.  Pairs whose replacement cannot be encoded to
            SHIFT-JIS are skipped.  ``None`` (the default) means no extra
            mappings.

    Returns:
        The result of ``text.encode('SHIFT-JIS', ...)``.

    Raises:
        AssertionError: if ``from_``/``to_``/``method`` are not the
            supported combination.
        UnicodeError: re-raised from the encode call after printing the
            offending slice.

    Side effects: registers the ``'gbk_to_sjis'`` codec error handler and
    prints the characters that were dropped and those that were guessed.
    """
    assert from_ == 'gbk' and to_ == 'sjis' and method == 'auto'
    from zhtools import chconv, xpinyin

    # NOTE: this binds the table object held by chconv, not a copy, so the
    # entries written below (and by the handler) stay in that table.
    cdict = chconv.Chinese2Kanji_Table

    # chardict defaults to None; previously that crashed on .items().
    for k, v in (chardict or {}).items():
        try:
            encode(v, 'SHIFT-JIS')
        except UnicodeEncodeError:
            continue
        cdict[ord(k)] = ord(v)

    xpy = xpinyin.Pinyin()
    guess_chars = set()
    except_chars = set()

    def gbk_to_sjis(exc):
        """Return ``(replacement, resume_position)`` for an encode failure."""
        if not isinstance(exc, UnicodeEncodeError):
            raise exc
        newpos = exc.end
        char = exc.object[exc.start:exc.end]
        c = ord(char)
        if c in cdict:
            return chr(cdict[c]), newpos

        pinyin = xpy.get_pinyin(char)
        ok = []
        if pinyin:
            # Full pinyin first, then the pinyin minus its last character.
            for key in (pinyin, pinyin[:-1]):
                for newchar in xpy.py2hz(key):
                    try:
                        encode(newchar, 'SHIFT-JIS')
                    except UnicodeEncodeError:
                        continue
                    ok.append(newchar)
        if ok:
            newchar = random.choice(ok)
            # Remember the guess so the same character maps consistently.
            cdict[c] = ord(newchar)
            guess_chars.add(c)
            return newchar, newpos

        except_chars.add(c)
        return ' ' * (newpos - exc.start), newpos

    def _report(code_points):
        """Print the characters for ``code_points`` on one line."""
        chars = (chr(c) for c in code_points)
        if py3k:
            print(''.join(chars))
        else:
            print(encode(UEMPTY.join(chars)))

    codecs.register_error('gbk_to_sjis', gbk_to_sjis)
    # from zhtools import langconv
    # text = langconv.Converter('zh-hant').convert(text)
    try:
        text = text.encode('SHIFT-JIS', errors='gbk_to_sjis')
    except UnicodeError as exc:
        # Show the slice that still failed before propagating the error.
        char = exc.object[exc.start:exc.end]
        print(char)
        raise

    print('These chars cannot encode to shift-jis:')
    _report(except_chars)
    print('These chars can be guessed by pinyin:')
    _report(guess_chars)
    return text