def test_custom_pinyin_dict():
    """Check that a custom single-character entry yields 'ju2' for '桔'.

    The first check runs before the custom entry is loaded; a failure there
    is tolerated on purpose. The second check, after loading, must hold.
    """
    char = '桔'
    expected = ['ju2']
    try:
        assert lazy_pinyin(char, style=TONE2) == expected
    except AssertionError:
        # The built-in reading may differ before the override is loaded.
        pass
    load_single_dict({ord(char): 'jú,jié'})
    assert lazy_pinyin(char, style=TONE2) == expected
def rectify():
    """Load reading overrides for five common polyphonic characters.

    All overrides are passed to ``pypinyin.load_single_dict`` in one call,
    keyed by code point.
    """
    readings_by_char = {
        '的': 'de,di',
        '地': 'de,di',
        '了': 'le,liao',
        '着': 'zhe,zhuo',
        '还': 'hai,huan',
    }
    pypinyin.load_single_dict(
        {ord(char): readings for char, readings in readings_by_char.items()}
    )
def __init__(self):
    """Create the helper objects and register custom pinyin overrides."""
    self.textReformer = text_reformer.TextReformer()
    self.pinyin_part = pinyin_utility.load_pinyin_part_dict()
    self.wordPart = word_part.WordPart()

    # 0x55ef is the code point of the character "嗯".
    single_overrides = {0x55ef: u"en"}
    phrase_overrides = {
        u'嗯哪': [[u'en'], [u'na']],
        u'人生何处不相逢': [
            [u'ren'],
            [u'sheng'],
            [u'he'],
            [u'chu'],
            [u'bu'],
            [u'xiang'],
            [u'feng'],
        ],
    }
    pypinyin.load_single_dict(pinyin_dict=single_overrides)
    pypinyin.load_phrases_dict(phrases_dict=phrase_overrides)
def hz2pinyin():
    """Generate the pinyin files for the dev, test and train splits.

    For each split, every line of ``pyin_hz_data/<split>.hz`` is stripped,
    converted with ``S(..., separator='', strict=False)`` and written as one
    line of ``pyin_hz_data/<split>.pyin``.

    Both the input and the output file are opened in a ``with`` block so the
    handles are closed even if the conversion raises.
    """
    load_single_dict({ord('嗯'): 'en'})
    for split in ('dev', 'test', 'train'):
        target_path = 'pyin_hz_data/{}.pyin'.format(split)
        source_path = 'pyin_hz_data/{}.hz'.format(split)
        # The output is opened first, as before, so it is created/truncated
        # before the input is read.
        with open(target_path, 'w') as target, open(source_path) as source:
            for line in source:
                converted = S(line.strip(), separator='', strict=False)
                target.write('{}\n'.format(converted))
def py(s):
    """Return the first pinyin letters of ``s`` joined into one string.

    NOTE(review): the original docstring described the result as an
    *uppercase* abbreviation, but no case conversion is applied here --
    confirm which is intended.
    """
    # Reorder the readings of "长" so that "cháng" comes first.
    load_single_dict({ord('长'): 'cháng,zhǎng'})
    initials = lazy_pinyin(s, style=Style.FIRST_LETTER)
    return ''.join(initials)
def active():
    """Activate the custom readings for polyphonic single characters.

    Passes ``SINGLE_DICTIONARY`` (defined outside this block) to
    ``load_single_dict``.
    """
    load_single_dict(SINGLE_DICTIONARY)
def pypinyin_fix():
    """Register reading overrides for a fixed list of phrases and characters.

    Every phrase and every single character is loaded through its own
    ``load_phrases_dict`` / ``load_single_dict`` call, phrases first, in the
    order listed below.
    """
    phrase_readings = (
        ("哪些", ("na", "xie")),
        ("哪个", ("na", "ge")),
        ("那些", ("na", "xie")),
        ("白干", ("bai", "gan")),
        ("寻思", ("xun", "si")),
        ("清寒", ("qing", "han")),
        ("补齐", ("bu", "qi")),
        ("添砖加瓦", ("tian", "zhuan", "jia", "wa")),
        ("敬业乐群", ("jing", "ye", "le", "qun")),
        ("物竞天择", ("wu", "jing", "tian", "ze")),
        ("心存疑虑", ("xin", "cun", "yi", "lv")),
        ("避免麻烦", ("bi", "mian", "ma", "fan")),
        ("叶落归根", ("ye", "luo", "gui", "gen")),
        ("地动山摇", ("di", "dong", "shan", "yao")),
    )
    char_readings = (
        ("帧", "zhen"),
        ("霰", "xian"),
        ("珩", "heng"),
        ("嗯", "en"),
        ("嗲", "dia"),
        ("豉", "chi"),
        ("聒", "guo"),
    )

    for phrase, syllables in phrase_readings:
        # One single-reading list per character of the phrase.
        pypinyin.load_phrases_dict({phrase: [[syllable] for syllable in syllables]})

    for char, reading in char_readings:
        pypinyin.load_single_dict({ord(char): reading})
使用其他分词模块: 安装分词模块,比如 pip install snownlp ; 使用经过分词处理的 字符串列表 作参数: >> from pypinyin import lazy_pinyin, TONE2 >> from snownlp import SnowNLP >> hans = u'音乐123' >> hans_seg = SnowNLP(hans).words # 分词处理 >> hans_seg [u'\u97f3\u4e50', u'123'] >> lazy_pinyin(hans_seg, style=TONE2) [u'yi1n', u'yue4', u'123'] 自定义拼音库 如果对结果不满意,可以通过 load_single_dict() 或 load_phrases_dict() 以自定义拼音库的方式修正结果: 安装了 jieba 分词模块并且支持分词的词组 >> from pypinyin import lazy_pinyin, load_phrases_dict, TONE2 >> hans = u'桔子' >> lazy_pinyin(hans, style=TONE2) [u'jie2', u'zi3'] >> load_phrases_dict({u'桔子': [[u'jú'], [u'zǐ']]}) >> lazy_pinyin(hans, style=TONE2) [u'ju2', u'zi3'] 未安装 jieba 分词模块 and/or 不支持分词的词组 >> from pypinyin import lazy_pinyin, load_phrases_dict, TONE2, load_single_dict >> hans = u'还没' >> lazy_pinyin(hans, style=TONE2)
#coding=utf-8
import sys
import codecs
reload(sys)
sys.setdefaultencoding('utf8')
import win32com.client
app = win32com.client.Dispatch('Indesign.Application')
from pypinyin import pinyin, lazy_pinyin, load_single_dict

myTitles = app.FindGrep()

# Override the readings that were classified wrongly (0x7684 is a code point).
load_single_dict({0x7684: u'de,d\xed'})


# Build the pinyin string for a piece of text.
def addPinyin(sometext):
    """Return the first reading of each item from ``pinyin()``, space separated."""
    readings = pinyin(sometext, heteronym=True)
    joined = u' '.join(entry[0] for entry in readings)
    # Drop trailing whitespace.
    return joined.rstrip()


# Attach the pinyin to every found text as its ruby string.
for myT in myTitles:
    myT.Texts[0].RubyFlag = True
    myT.Texts[0].RubyString = addPinyin(myT.Texts[0].Contents)
def tunepypinyin():
    """Load reading overrides for three characters, one call per character."""
    overrides = (
        (u"的", u"de0,di2"),
        (u"得", u"de2,dei3,de0"),
        (u"了", u"le0,liao3"),
    )
    for char, readings in overrides:
        load_single_dict({ord(char): readings})
def test_custom_pinyin_dict_tone2():
    """Check a custom entry loaded with ``style='tone2'``.

    After loading 'ce4,si4' for '桔', ``lazy_pinyin`` in TONE2 style must
    return 'ce4' and the default ``pinyin`` call must return 'cè'.
    """
    char = '桔'
    load_single_dict({ord(char): 'ce4,si4'}, style='tone2')
    assert lazy_pinyin(char, style=TONE2) == ['ce4']
    assert pinyin(char) == [['cè']]
def load_luna_dict():
    """Load readings parsed from the Rime luna_pinyin dictionary into pypinyin.

    Steps:

    1. Parse ``./rime-luna-pinyin/luna_pinyin.dict.yaml`` line by line with
       ``PATTERN_RIME_DICT_ITEM`` and collect the items per word.
    2. Convert each word with ``_TO_SIMPLIFIED_CHINESE``; when the converted
       form differs, collect a copy of the item under that form, summing
       ``percent`` for items that end up with the same pinyin.
    3. Let the converted entries override the original ones, drop items with
       ``percent < 5``, and build a single-character dict and a phrases dict.
    4. Strip all but the first reading from the built-in ``PINYIN_DICT`` and
       ``PHRASES_DICT`` (modified in place), then load the new dicts.
    """
    single_dict = {}
    phrases_dict = {}

    # luna_pinyin is a dictionary designed for traditional characters; the
    # simplified characters in it are treated as inherited characters whose
    # shape was borrowed by mainland simplification and are annotated with
    # "archaic" readings. Using it directly on zhwiki text that contains
    # simplified characters gives terrible results, so OpenCC is used here to
    # try to work around the problem.
    luna_dict = {}
    luna_dict_simple = {}
    # The dictionary is read as UTF-8 explicitly so parsing does not depend
    # on the platform's default encoding.
    with open('./rime-luna-pinyin/luna_pinyin.dict.yaml', mode='r',
              encoding='utf-8') as f:
        for line in f:
            match = PATTERN_RIME_DICT_ITEM.match(line)
            if not match:
                continue

            item = Dict(match.groupdict())
            # The ``words`` field is only used to trace where an item came
            # from while debugging.
            word = item['word']
            item.pop('word')
            item.words = word
            item.percent = float(item.percent) if item.percent is not None else 100

            if luna_dict.get(word) is None:
                luna_dict[word] = [item]
            else:
                # Polyphonic word: keep every reading.
                luna_dict[word].append(item)

            word_simple = _TO_SIMPLIFIED_CHINESE.convert(word)
            if word == word_simple:
                continue

            item_simple = Dict(item)
            if luna_dict_simple.get(word_simple) is None:
                luna_dict_simple[word_simple] = [item_simple]
                continue

            # Several traditional forms can map to one simplified form with
            # the same reading; in that case the frequencies are summed.
            for exist_item in luna_dict_simple[word_simple]:
                if exist_item.pinyin == item_simple.pinyin:
                    exist_item.percent += item_simple.percent
                    exist_item.words += item_simple.words
                    break
            else:
                # No existing item has this reading yet.
                luna_dict_simple[word_simple].append(item_simple)

    # Override the traditional-form readings with the simplified-form ones;
    # this replaces most of the "archaic" readings mentioned above.
    luna_dict.update(luna_dict_simple)

    for word, items in luna_dict.items():
        for item in items:
            if item.percent < 5:
                # Skip low-frequency readings.
                continue

            if len(word) == 1:
                codePoint = ord(word)
                if single_dict.get(codePoint) is None:
                    single_dict[codePoint] = item.pinyin
                else:
                    single_dict[codePoint] = f'{single_dict[codePoint]},{item.pinyin}'
                continue

            syllables = item.pinyin.split(' ')
            if phrases_dict.get(word) is None:
                phrases_dict[word] = [[syllable] for syllable in syllables]
            elif len(phrases_dict[word]) == len(syllables):
                for readings, syllable in zip(phrases_dict[word], syllables):
                    readings.append(syllable)
            else:
                # logging.warn is deprecated and removed in Python 3.13.
                logging.warning(f'invalid pinyin: {word} -> {item}')

    # Remove the alternative readings from the built-in single-character dict.
    for word, pinyins in PINYIN_DICT.items():
        pinyin_list = pinyins.split(',')
        if len(pinyin_list) > 1:
            PINYIN_DICT[word] = pinyin_list[0]

    # Remove the alternative readings from the built-in phrases dict.
    for phrases in PHRASES_DICT.values():
        for p in phrases:
            while len(p) > 1:
                p.pop()

    # Load the luna dictionaries.
    load_single_dict(single_dict)
    load_phrases_dict(phrases_dict)
#coding=utf-8 import pypinyin import word_part import numpy as np import pinyin_utility pypinyin.load_single_dict(pinyin_dict={0x55ef: u"en"}) pypinyin.load_phrases_dict( phrases_dict={ u'嗯哪': [[u'en'], [u'na']], u'人生何处不相逢': [[u'ren'], [u'sheng'], [u'he'], [u'chu'], [u'bu'], [u'xiang'], [u'feng']] }) pinyin_part_dict = pinyin_utility.load_pinyin_part_dict() wp = word_part.WordPart() def generate_vocab_vec(): raw_vocab_list, raw_vocab, size1 = load_w2v('../../model/raw.txt') pinyin_vocab_list, pinyin_vocab, size2 = load_w2v('../../model/pinyin.txt') part_vocab_list, part_vocab, size3 = load_w2v('../../model/part.txt') fw1 = open('../../model/vocab.txt', 'w') fw2 = open('../../model/vec.txt', 'w') size = size1 + size2 + size3 fw2.write(str(len(raw_vocab_list)) + ' ' + str(size) + '\n') fw1.write('UNK\n') unk_list = [0] * size unk = np.asarray(unk_list, dtype=np.float32).tolist() fw2.write(' '.join([str(i) for i in unk]) + '\n') for word in raw_vocab: word_pinyin = pypinyin.lazy_pinyin(word, errors=lambda x: u'ng') try:
#coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import win32com.client
import codecs
app = win32com.client.Dispatch('Indesign.Application')
from pypinyin import pinyin, lazy_pinyin, load_single_dict

# Read the script source; the with block closes the file handle afterwards.
with codecs.open('C:\\Users\\jjp\\py4id\\readTitle.jsx', 'r', 'utf-8') as myfile:
    ff = myfile.read()
# NOTE(review): 1246973031 looks like InDesign's JavaScript script-language
# enum value -- confirm against the InDesign scripting reference.
myTitles = app.DoScript(ff, 1246973031)

# load_single_dict expects {code point: 'reading1,reading2'}. The previous
# phrases-style argument ({u'的': [[...], [...]]}) was keyed by the character
# itself, so it could never match a code-point lookup.
load_single_dict({ord(u'的'): u'de,d\xed'})


# Build the pinyin string for a piece of text.
def addPinyin(sometext):
    """Return the first reading of each item from ``pinyin()``, space separated.

    The result is also printed. Trailing whitespace is stripped.
    """
    readings = pinyin(sometext, heteronym=True)
    ruby = u' '.join(entry[0] for entry in readings).rstrip()
    print(ruby)
    return ruby


# Attach the pinyin to every returned text as its ruby string.
for myT in myTitles:
    myT.Texts[0].RubyFlag = True
    myT.Texts[0].RubyString = addPinyin(myT.Texts[0].Contents)
print(pinyin('中心', style=Style.TONE2, heteronym=True))
print(lazy_pinyin('中心'))
print(pinyin('翟偲翀', heteronym=True))

from pypinyin import load_phrases_dict, load_single_dict

# load_phrases_dict({'步履蹒跚': [['bù'], ['lǚ'], ['pán'], ['shān']]})
# load_single_dict({ord('蹒'): 'pán'})
# print(pinyin('步履蹒跚'))

# Build custom dictionaries from 'sur_pinyin.dict'. Each line holds a word and
# its readings separated by a tab; multi-character words carry one reading per
# character separated by '/'.
single_sur_dict = {}
phrases_sur_dict = {}
with open('sur_pinyin.dict', 'r') as fr:
    for line in fr:
        words = line.strip().split('\t')
        if not words[0]:
            # Blank line: ord('') below would raise TypeError.
            continue
        if len(words[0]) > 1:
            phrases_sur_dict[words[0]] = [[spy] for spy in words[1].split('/')]
        else:
            single_sur_dict[ord(words[0])] = words[1]

load_phrases_dict(phrases_sur_dict)
load_single_dict(single_sur_dict)
print(pinyin('翟偲翀'))
from pypinyin import Style, lazy_pinyin, load_phrases_dict, load_single_dict

# Phrase override: print the reading before and after loading a custom phrase.
hans = '桔子'
hans1 = lazy_pinyin(hans, style=Style.TONE2)
print(hans1)
# Register the phrase "桔子", deliberately with a wrong reading.
load_phrases_dict({hans: [['jié'], ['zǐ']]})
hans2 = lazy_pinyin(hans, style=Style.TONE2)
print(hans2)

# Single-character override: print the reading before and after reordering.
hanm = '还没'
hanm1 = lazy_pinyin(hanm, style=Style.TONE2)
print(hanm1)
# Change the order of the readings of "还".
load_single_dict({ord('还'): 'hái,huán'})
hanm2 = lazy_pinyin(hanm, style=Style.TONE2)
print(hanm2)
next_2_word = seg[index + 2][0] if (next_pos == 'a' and next_2_word not in not_in_after_2 and next_2_pos != 'uv') or (index > 0 and seg[index - 1][1] in before_de): py = 'de' elif (index == 0 or seg[index - 1][0] != '不') and (next_word + next_2_word) not in not_in_2_next and next_word not in not_in_2_next\ and ((next_pos in after_dei) or (next_2_pos in after_2_dei)): py = 'děi' return py print('Loading Pinyin Converter...') from pypinyin import pinyin, lazy_pinyin, load_phrases_dict, Style, load_single_dict print('Loading custom dictionary...') load_phrases_dict(phrases_dict) load_single_dict(pinyin_dict) print('Finalizing...') add_apostrophe = lambda x: re.sub(r'^([aoeāáǎàōóǒòēéěè])', r"'\1", x) add_apostrophe = np.vectorize(add_apostrophe) import opencc converter = opencc.OpenCC('s2tw.json') converter_t2s = opencc.OpenCC('t2s.json') print('Hanzi2Phonetics is ready.') # if segmentation_package == 'hanlp': # def to_pinyin(x): # x, beginning_list = breakdown(x.translate(punctuations_cn2py)) # ans = [] # seg, tag = segment(x) # for (seg_part, tag_part, is_beginning_of_sentence) in zip(s, t, beginning_list): # l = len(seg_part)
# coding:gbk import json import os import re # from xpinyin import Pinyin from pypinyin import lazy_pinyin, load_single_dict wd = dict() # word_list py = dict() load_single_dict({ord('帧'): 'zhen'}) load_single_dict({ord('嗯'): 'en'}) load_single_dict({ord('嗲'): 'dia'}) load_single_dict({ord('芎'): 'xiong'}) load_single_dict({ord('菹'): 'zu'}) load_single_dict({ord('呒'): 'fu'}) load_single_dict({ord('丬'): 'pan'}) load_single_dict({ord('嬷'): 'mo'}) load_single_dict({ord('珩'): 'heng'}) load_single_dict({ord('砉'): 'hua'}) load_single_dict({ord('碡'): 'zhou'}) load_single_dict({ord('聒'): 'guo'}) load_single_dict({ord('蚵'): 'ke'}) load_single_dict({ord('豉'): 'chi'}) load_single_dict({ord('霰'): 'xian'}) def load_pyhz_list(pinyin2hanzi_path): pinyin2hanzi = open(pinyin2hanzi_path) hanzi = open("D://project/Aiwork/lib/一二级汉字表.txt").read()
#!/usr/bin/env python3 # -*- coding: utf-8 -*- import os import re import sys import collections os.environ['PYPINYIN_NO_DICT_COPY'] = '1' import pypinyin import terra_pinyin pypinyin.load_single_dict(terra_pinyin.pinyin_dict) pypinyin.load_phrases_dict(terra_pinyin.phrases_dict) RE_UCJK = re.compile('([\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff' '\U00020000-\U0002A6DF\U0002A700-\U0002B73F' '\U0002B740-\U0002B81F\U0002B820-\U0002CEAF' '\U0002F800-\U0002FA1F]+)') with open(sys.argv[1], 'w', encoding='utf-8') as w1, \ open(sys.argv[2], 'w', encoding='utf-8') as w2, \ open(sys.argv[3], 'w', encoding='utf-8') as w3: for ln in sys.stdin.buffer: for seg in RE_UCJK.findall(ln.decode('utf-8', 'ignore')): pinyins = pypinyin.pinyin(seg, style=pypinyin.Style.NORMAL, errors='ignore') length = len(seg) if length != len(pinyins):
from pypinyin import pinyin, Style, load_single_dict import json import os SAVE_DIR = os.path.dirname(__file__) pinyin2num_dict_fp = os.path.join(SAVE_DIR, 'pinyin2num_dict.json') # 拼音-->数字 字典。 PinYinTable_fp = os.path.join( SAVE_DIR, 'PinYinTable_modern.csv') # 拼音与音素原始表,若不存在上述文件,则根据此表生成 if os.path.basename(PinYinTable_fp) == 'PinYinTable_classic.csv': load_single_dict({ ord('嗯'): 'en2', ord('哟'): 'you4', }) else: load_single_dict({ ord('嗯'): 'ng2', }) def prepare_pinyinbase(): tables = [] with open(PinYinTable_fp, 'r', encoding='utf8') as f: lines = f.read().split('\n') for line in lines: tables.append(line.split(',')) pinyin2num_dict = {} k = 0 for i in range(1, len(tables)):