def token_indexing(idx, encoding_type, return_type, max_len=8):
    """Map a word id to the character-level wubi or pinyin encoding of that word.

    The word is looked up in ``dict_word['idx2word']``, encoded either as its
    first wubi code or its first pinyin reading, then right-padded with the
    pad character '。' to ``max_len`` characters.

    :param idx: integer word id into ``dict_word['idx2word']``
        (NOTE(review): the original docstring described (seq_len, batch_size)
        tensors, but the code operates on a single scalar id — confirm callers)
    :param encoding_type: either ``'wubi'`` or ``'pinyin'``
    :param return_type: ``'tokens'`` to get character ids, anything else to
        get the boolean length mask
    :param max_len: fixed padded encoding length (default 8, as before)
    :return: list of ``max_len`` character ids, or a list of ``max_len`` bools
        where True marks real (non-pad) encoding positions
    :raises NotImplementedError: for an unknown ``encoding_type``
    :raises AssertionError: if the raw encoding is longer than ``max_len``
    """
    word = dict_word['idx2word'][idx]
    # '<eos>' has no encoding of its own; map it to the pad character.
    if word == '<eos>':
        word = '。'

    # Select the encoding function result and the matching char vocabulary;
    # the padding / lookup / mask logic below is shared by both branches
    # (the original duplicated it per branch).
    if encoding_type == 'wubi':
        encoded = wubi(word)
        encoding = encoded[0] if encoded else word
        char2idx = dict_wubi['char2idx']
    elif encoding_type == 'pinyin':
        encoded = pinyin(word)
        encoding = encoded[0][0] if encoded else word
        char2idx = dict_pinyin['char2idx']
    else:
        raise NotImplementedError

    # Right-pad with '。' to the fixed length; over-long encodings trip the assert.
    full_encoding = (encoding if len(encoding) == max_len
                     else encoding + '。' * (max_len - len(encoding)))
    assert len(full_encoding) == max_len, full_encoding

    tokens = [char2idx[ch] for ch in full_encoding]
    # True for positions holding real encoding characters, False for padding.
    length = [i < len(encoding) for i in range(len(tokens))]
    return tokens if return_type == 'tokens' else length
def wubi_convert(word):
    """Build the abbreviated wubi code of a multi-character word.

    Follows the standard input-method abbreviation scheme:
    2 chars -> first two code letters of each; 3 chars -> one letter from the
    first two plus two from the last; 4+ chars -> one letter each from the
    first three characters and the final character. Shorter words yield ''.
    """
    codes = wubi(word)
    n = len(word)
    if n == 2:
        return codes[0][:2] + codes[1][:2]
    if n == 3:
        return codes[0][:1] + codes[1][:1] + codes[2][:2]
    if n >= 4:
        return codes[0][:1] + codes[1][:1] + codes[2][:1] + codes[-1][:1]
    return ''
def convert_to_wubi(text):
    """Return the wubi codes of *text* joined by single spaces."""
    codes = wubi(text)
    return ' '.join(codes)
def convert_wubi(s):
    """Concatenate the wubi codes of *s* into one unseparated string."""
    return ''.join(wubi(s))