def gbk_to_sjis(exc):
    """Codec error handler that substitutes a Shift-JIS-encodable character.

    Written to be used as a ``codecs`` error callback: it receives the
    encoding exception and returns ``(replacement, resume_position)``.

    The names ``cdict``, ``xpy``, ``encode``, ``guess_chars`` and
    ``except_chars`` are not defined here; they are resolved from the
    surrounding scope.

    Resolution order for the failed character:

    1. a known mapping in ``cdict`` (ordinal -> ordinal);
    2. a random pick among the characters returned by ``xpy.py2hz`` for the
       character's pinyin, and for the pinyin minus its last character, that
       ``encode(..., 'SHIFT-JIS')`` accepts; the pick is stored in ``cdict``
       and the ordinal is recorded in ``guess_chars``;
    3. otherwise the ordinal is recorded in ``except_chars`` and the failed
       span is replaced by spaces of the same length.

    Any exception that is not a ``UnicodeEncodeError`` is re-raised as is.
    """
    if not isinstance(exc, UnicodeEncodeError):
        raise exc

    resume_at = exc.end
    failed = exc.object[exc.start:resume_at]
    code = ord(failed)

    # 1. Already known (static table entry or an earlier guess).
    if code in cdict:
        return chr(cdict[code]), resume_at

    # 2. Collect same-reading characters that Shift-JIS can represent.
    candidates = []
    reading = xpy.get_pinyin(failed)
    if reading:
        # presumably reading[:-1] drops a trailing tone marker -- TODO confirm
        for key in (reading, reading[:-1]):
            for candidate in xpy.py2hz(key):
                try:
                    encode(candidate, 'SHIFT-JIS')
                except UnicodeEncodeError:
                    continue
                candidates.append(candidate)

    # 3. Nothing usable: remember the character and blank it out.
    if not candidates:
        except_chars.add(code)
        return ' ' * (resume_at - exc.start), resume_at

    chosen = random.choice(candidates)
    cdict[code] = ord(chosen)  # remember the guess for later occurrences
    guess_chars.add(code)
    return chosen, resume_at
def hzconvert(text, from_, to_, method='auto', chardict=None):
    """Encode *text* as Shift-JIS, substituting characters Shift-JIS lacks.

    Parameters:
        text: the string to encode (its ``encode`` method is called).
        from_, to_, method: only ``'gbk'``, ``'sjis'`` and ``'auto'`` are
            accepted; anything else fails the assertion below.
        chardict: optional mapping of single characters to replacement
            characters. Entries whose replacement cannot be encoded as
            Shift-JIS are skipped. ``None`` (the default) means no extra
            overrides.

    Returns:
        The result of ``text.encode('SHIFT-JIS', errors='gbk_to_sjis')``.

    Raises:
        AssertionError: for an unsupported from_/to_/method combination.
        UnicodeError: re-raised from the encode call after the offending
            slice has been printed.

    Side effects:
        * registers (or replaces) the ``'gbk_to_sjis'`` codecs error handler;
        * adds entries to ``chconv.Chinese2Kanji_Table`` in place, both the
          *chardict* overrides and the pinyin-based guesses;
        * prints the characters that could not be encoded and the characters
          that were guessed.
    """
    assert from_ == 'gbk' and to_ == 'sjis' and method == 'auto'
    from zhtools import chconv, xpinyin

    # NOTE(review): this is the module-level table itself, not a copy, so the
    # overrides and guesses added below are visible to later calls. Kept as is
    # because callers may rely on guesses staying stable between calls.
    cdict = chconv.Chinese2Kanji_Table

    # chardict defaults to None; calling .items() on it unconditionally made
    # every call without overrides fail with AttributeError.
    for k, v in (chardict or {}).items():
        try:
            encode(v, 'SHIFT-JIS')
        except UnicodeEncodeError:
            continue  # replacement itself is not encodable; ignore the entry
        cdict[ord(k)] = ord(v)

    xpy = xpinyin.Pinyin()
    guess_chars = set()   # ordinals replaced by a pinyin-based guess
    except_chars = set()  # ordinals for which no replacement was found

    def gbk_to_sjis(exc):
        """Error handler: return (replacement, resume position) for *exc*."""
        if not isinstance(exc, UnicodeEncodeError):
            raise exc
        newpos = exc.end
        char = exc.object[exc.start:exc.end]
        c = ord(char)
        if c in cdict:
            return chr(cdict[c]), newpos

        ok = []
        pinyin = xpy.get_pinyin(char)
        if pinyin:
            # Try the full pinyin first, then the pinyin minus its last
            # character (presumably a tone marker -- TODO confirm).
            for key in (pinyin, pinyin[:-1]):
                for newchar in xpy.py2hz(key):
                    try:
                        encode(newchar, 'SHIFT-JIS')
                    except UnicodeEncodeError:
                        continue
                    ok.append(newchar)
        if ok:
            newchar = random.choice(ok)
            cdict[c] = ord(newchar)  # reuse the same guess from now on
            guess_chars.add(c)
            return newchar, newpos

        except_chars.add(c)
        # Keep the text length stable by blanking out the failed span.
        return ' ' * (newpos - exc.start), newpos

    def report(codes):
        """Print the characters for *codes* on one line (py2/py3 aware)."""
        if py3k:
            print(''.join(chr(c) for c in codes))
        else:
            print(encode(UEMPTY.join(chr(c) for c in codes)))

    codecs.register_error('gbk_to_sjis', gbk_to_sjis)
    # Disabled pre-pass kept for reference:
    # from zhtools import langconv
    # text = langconv.Converter('zh-hant').convert(text)
    try:
        text = text.encode('SHIFT-JIS', errors='gbk_to_sjis')
    except UnicodeError as exc:
        char = exc.object[exc.start:exc.end]
        print(char)
        raise

    print('These chars cannot encode to shift-jis:')
    report(except_chars)
    print('These chars can be guessed by pinyin:')
    report(guess_chars)
    return text
# -*- coding: utf-8 -*-
from portable import chr, to_unicode, UEMPTY, py3k

# Translation table indexed by ASCII code point (0..127).
# Printable ASCII 0x21..0x7E has a fullwidth counterpart at a fixed offset of
# 65248 (0xFEE0). Control characters and DEL have none, so they map to
# themselves; applying the offset to them produced unrelated code points
# (for example '\n' became U+FEEA).
_fullwide_map = [
    chr(65248 + i) if 33 <= i <= 126 else chr(i)
    for i in range(128)
]
# NOTE(review): as written, a space maps to the character in this literal;
# if an ideographic space (U+3000) was intended here, confirm and adjust.
_fullwide_map[32] = to_unicode(' ')
_fullwide_map = UEMPTY.join(_fullwide_map)


def get_widechar_converter(excepts=None):
    r'''
    Return a function that converts ASCII characters in a string to their
    fullwidth forms.

    excepts: optional iterable of characters that must be left unchanged.
    Characters outside the ASCII range are never converted, so listing them
    in excepts has no effect.

    >>> f = get_widechar_converter(r'/\@')
    >>> s = 'wc是@厕所的意思.../'
    >>> print(f(s) if py3k else f(s.decode('utf-8')).encode('utf-8'))
    ｗｃ是@厕所的意思．．．/
    '''
    if not excepts:
        table = _fullwide_map
    else:
        slots = list(_fullwide_map)
        for char in excepts:
            code = ord(char)
            # The table only covers 0..127; anything above is already left
            # untouched by translate(), so there is no slot to override.
            if code < len(slots):
                slots[code] = char
        table = UEMPTY.join(slots)
    return lambda s: s.translate(table)


if __name__ == '__main__':
    import doctest
    doctest.testmod()
# -*- coding: utf-8 -*-
from portable import chr, to_unicode, UEMPTY, py3k

# Translation table indexed by ASCII code point (0..127).
# Printable ASCII 0x21..0x7E has a fullwidth counterpart at a fixed offset of
# 65248 (0xFEE0). Control characters and DEL have none, so they map to
# themselves; applying the offset to them produced unrelated code points
# (for example '\n' became U+FEEA).
_fullwide_map = [
    chr(65248 + i) if 33 <= i <= 126 else chr(i)
    for i in range(128)
]
# NOTE(review): as written, a space maps to the character in this literal;
# if an ideographic space (U+3000) was intended here, confirm and adjust.
_fullwide_map[32] = to_unicode(' ')
_fullwide_map = UEMPTY.join(_fullwide_map)


def get_widechar_converter(excepts=None):
    r'''
    Return a function that converts ASCII characters in a string to their
    fullwidth forms.

    excepts: optional iterable of characters that must be left unchanged.
    Characters outside the ASCII range are never converted, so listing them
    in excepts has no effect.

    >>> f = get_widechar_converter(r'/\@')
    >>> s = 'wc是@厕所的意思.../'
    >>> print(f(s) if py3k else f(s.decode('utf-8')).encode('utf-8'))
    ｗｃ是@厕所的意思．．．/
    '''
    if not excepts:
        table = _fullwide_map
    else:
        slots = list(_fullwide_map)
        for char in excepts:
            code = ord(char)
            # The table only covers 0..127; anything above is already left
            # untouched by translate(), so there is no slot to override.
            if code < len(slots):
                slots[code] = char
        table = UEMPTY.join(slots)
    return lambda s: s.translate(table)


if __name__ == '__main__':
    import doctest
    doctest.testmod()