def prefixs_for_term(self, term):
    """Get prefixs for TERM.

    For the lower-cased term and for every token returned by
    ``self.normalize(term)``, each leading slice is collected together with
    two lower-cased pinyin spellings of that slice: one built with
    ``pypinyin.FIRST_LETTER`` and one with ``pypinyin.NORMAL``. Only the
    first reading of every item returned by ``pinyin`` is used.

    :param term: text to build prefixes for.
    :return: list of unique prefixes, in no particular order.
    """
    prefixs = set()

    def collect(text):
        # Add every leading slice of ``text`` and its two pinyin spellings.
        for end in range(1, len(text) + 1):
            word = text[:end]
            prefixs.add(word)
            for style in (pypinyin.FIRST_LETTER, pypinyin.NORMAL):
                readings = pinyin(word, style=style)
                prefixs.add(''.join(r[0] for r in readings).lower())

    # Normalization
    term = term.lower()

    # Prefixs for the whole term, then for each normalized token.
    collect(term)
    for token in self.normalize(term):
        collect(token)
    return list(prefixs)
def get_pinyin(self):
    """Combine the heteronym pinyin of ``self.names[0]`` and ``self.names[1]``.

    Both names are converted with ``pypinyin.NORMAL`` and ``heteronym=True``;
    each result is passed through ``self.combination`` with an empty
    separator, and the two outcomes are combined again with a space.

    NOTE(review): ``self`` is passed explicitly as the first argument of
    ``self.combination`` -- confirm that helper's signature.

    :return: whatever ``self.combination`` returns for the final call.
    """
    surname_readings = pinyin(self.names[0], style=pypinyin.NORMAL, heteronym=True)
    given_readings = pinyin(self.names[1], style=pypinyin.NORMAL, heteronym=True)
    parts = [
        self.combination(self, surname_readings, ''),
        self.combination(self, given_readings, ''),
    ]
    return self.combination(self, parts, ' ')
def test_zh_and_en():
    """Mixed Chinese and English input."""
    # Chinese followed by English letters: the Latin run may come back either
    # as one item or split into single letters, so both shapes are accepted.
    text = '中心' + 'abc'
    chinese_part = [['zh\u014dng'], ['x\u012bn']]
    try:
        assert pinyin(text) == chinese_part + [['abc']]
    except AssertionError:
        assert pinyin(text) == chinese_part + [['a'], ['b'], ['c']]
def test_errors_callable():
    """``errors`` accepts a plain function as well as a callable object."""
    def foobar(chars):
        return 'a' * len(chars)

    class Foobar(object):
        def __call__(self, chars):
            return 'a' * len(chars)

    n = 5
    text = 'あ' * n
    expected = [['a' * n]]

    from_function = pinyin(text, errors=foobar)
    assert from_function == expected
    from_object = pinyin(text, errors=Foobar())
    assert from_object == expected
def test_others():
    """Edge cases: empty input, single characters, text without pinyin."""
    cases = (
        # empty string
        ('', []),
        # a single Chinese character
        ('營', [['y\xedng']]),
        # several Chinese characters
        ('中国人', [['zh\u014dng'], ['gu\xf3'], ['r\xe9n']]),
        # Japanese
        ('の', [['\u306e']]),
        # a Chinese code point without a reading / not assigned yet
        ('\u9fff', [['\u9fff']]),
    )
    for text, expected in cases:
        assert pinyin(text) == expected
def get_homophones_by_char(input_char):
    """
    Return the characters that share the toneless reading of a character.

    The first ``pypinyin.NORMAL`` reading of ``input_char`` is computed once
    and compared with the first reading of every code point scanned.

    :param input_char: text whose first reading is the one to match.
    :return: list of matching single-character strings, in code point order.
    """
    # Loop-invariant: look the target reading up once, not once per code point.
    target = pinyin(input_char, style=pypinyin.NORMAL)[0][0]

    # The CJK Unified Ideographs range scanned is 0x4E00-0x9FA5, i.e. the
    # commonly quoted 20902 Chinese characters.
    result = []
    for code in range(0x4e00, 0x9fa6):
        char = chr(code)
        if pinyin([char], style=pypinyin.NORMAL)[0][0] == target:
            result.append(char)
    return result
def test_custom_style_with_decorator():
    """A style registered through ``@register`` is applied by ``pinyin``."""
    style_value = 'test_custom_style_with_decorator'

    @register(style_value)
    def func(pinyin, **kwargs):
        return pinyin + str(len(pinyin))

    hans = '北京'
    # Expected value: the default output with ``func`` applied to each reading.
    expected_pinyin_s = []
    for readings in pinyin(hans):
        expected_pinyin_s.append([func(reading) for reading in readings])

    assert pinyin(hans, style=style_value) == expected_pinyin_s
def lang_zh(text):
    """Segment TEXT line by line and pair every word with its pinyin.

    Each line (split on ``'\\n'``) is cut with ``jieba.cut``; every resulting
    word becomes ``[word, pinyin]`` where the pinyin is the first reading of
    each syllable joined with ``'``. ``style=0`` is passed to pypinyin as-is
    (presumably the NORMAL style -- confirm against the pypinyin constants).

    :param text: input text, possibly containing several lines.
    :return: list (one entry per line) of lists of ``[word, pinyin]`` pairs.
    """
    annotated_lines = []
    for line in text.split('\n'):
        pairs = []
        for word in jieba.cut(line):
            syllables = pypinyin.pinyin(word, style=0)
            pairs.append([word, "'".join(s[0] for s in syllables)])
        annotated_lines.append(pairs)
    return annotated_lines
def _du(self, _request, _rdata):
    """Fill ``_rdata`` with the device user named by ``_request["user_uuid"]``.

    The user is loaded with ``redis_hash_to_dict``. On a missing parameter or
    unknown user an error code is set through ``self.setErrorCode`` and the
    method returns without touching ``_rdata``. Otherwise ``_rdata`` receives
    the user's fields plus ``pinyinname0`` (full pinyin from ``lazy_pinyin``)
    and ``pinyinname1`` (``pypinyin.INITIALS`` of every syllable), both
    computed from ``user_fullname``.

    A user without a full name gets empty pinyin strings instead of raising,
    and a user record without a ``user_password`` field no longer raises
    ``KeyError`` when the password is stripped.

    :param _request: mapping with ``user_uuid`` and optional ``return_password``.
    :param _rdata: dict updated in place with the response data.
    :return: None
    """
    if "user_uuid" not in _request:
        self.setErrorCode(API_ERR.NO_PARA)
        logging.error("Error for no para: %s.", (str(_request)))
        return

    _o = redis_hash_to_dict(self.application.redis, DeviceUser, _request["user_uuid"])
    logging.info(_o)
    if _o is None:
        self.setErrorCode(API_ERR.NO_OBJECT)
        logging.error("Error for no user uuid: %s." % (_request["user_uuid"]))
        return

    # The password is not returned by default.
    return_password = False
    if "return_password" in _request:
        return_password = _request["return_password"]
    if not return_password:
        _o.pop("user_password", None)

    _fn = _o.get("user_fullname")
    if _fn is not None and not isinstance(_fn, unicode):
        _fn = _fn.decode("utf-8")

    _rdata.update(_o)
    if _fn:
        _rdata["pinyinname0"] = "".join(lazy_pinyin(_fn))
        _rdata["pinyinname1"] = "".join(
            itertools.chain.from_iterable(pinyin(_fn, style=pypinyin.INITIALS)))
    else:
        # No full name stored: nothing to transliterate.
        _rdata["pinyinname0"] = ""
        _rdata["pinyinname1"] = ""
    return
def _du(self):
    """Answer a device-user detail request read from ``self.request.body``.

    The JSON body must carry ``user_uuid``; the user is loaded with
    ``redis_hash_to_dict``. On a missing parameter or unknown user an error
    code is set through ``self.setErrorCode`` and the method returns early.
    Otherwise the dict from ``self.getReturnData()`` receives the user's
    fields, ``pinyinname0`` (full pinyin from ``lazy_pinyin``),
    ``pinyinname1`` (``pypinyin.INITIALS`` of every syllable) and a ``team``
    entry loaded for the ``app_uuid`` found in the ``team`` configuration.

    A user without a full name gets empty pinyin strings instead of raising,
    and a user record without a ``user_password`` field no longer raises
    ``KeyError`` when the password is stripped.

    :return: None
    """
    _request = json.loads(self.request.body)

    _user_uuid = _request.get("user_uuid")
    if not _user_uuid:
        self.setErrorCode(API_ERR.NO_PARA)
        return

    _o = redis_hash_to_dict(self.application.redis, DeviceUser, _user_uuid)
    if not _o:
        self.setErrorCode(API_ERR.NO_OBJECT)
        return

    # The password is not returned by default.
    return_password = False
    if "return_password" in _request:
        return_password = _request["return_password"]
    if not return_password:
        _o.pop("user_password", None)

    _fn = _o.get("user_fullname")
    if _fn is not None and not isinstance(_fn, unicode):
        _fn = _fn.decode("utf-8")

    _rdata = self.getReturnData()
    _rdata.update(_o)
    if _fn:
        _rdata["pinyinname0"] = "".join(lazy_pinyin(_fn))
        _rdata["pinyinname1"] = "".join(
            itertools.chain.from_iterable(pinyin(_fn, style=pypinyin.INITIALS)))
    else:
        # No full name stored: nothing to transliterate.
        _rdata["pinyinname0"] = ""
        _rdata["pinyinname1"] = ""

    _app_uuid = _get_config().get("team").get("app_uuid")
    _o = redis_hash_to_dict(self.application.redis, AppInfo, _app_uuid)
    _rdata.update({"team": _o})
    return
def get_following_users(self, user):
    """Yield-based lookup of the users USER follows, sorted by name pinyin.

    The ``following`` list is read from ``self._db.followers``; every truthy
    entry is resolved with ``self.find_user(entry, True)``. The result is
    delivered through ``gen.Return`` (presumably this generator is wrapped
    as a coroutine by a decorator outside this view -- confirm).

    :param user: value matched against the ``user`` field.
    :return: (via ``gen.Return``) list sorted by the TONE2 pinyin of each
        entry's ``real_name``; entries without one sort as the empty string.
    """
    def name_key(entry):
        name = ("real_name" in entry and entry["real_name"]) or ""
        return pinyin(to_unicode(name), style=TONE2)

    doc = yield self._db.followers.find_one({"user": user}, {"_id": 0, "following": 1})
    ret = []
    if doc and "following" in doc:
        ret = yield [self.find_user(uid, True) for uid in doc["following"] if uid]
    raise gen.Return(sorted(ret, key=name_key))
def _get_pinyin_all(existing_combinations, characters):
    """
    Get all first-letter pinyin combinations of some Chinese characters.

    ``pinyin`` returns nested lists such as ``[['a'], ['b']]``, so every
    reading of the leading character is appended to every combination
    accumulated so far; the characters are consumed one at a time from the
    front until none is left.

    :param existing_combinations: combinations for already processed characters.
    :param characters: characters still to be combined.
    :return: a list of combinations (each a list of first letters) covering
             every heteronym reading; ``existing_combinations`` itself when
             ``characters`` is empty.
    """
    combinations = existing_combinations
    remaining = characters
    while True:
        head, remaining = remaining[0:1], remaining[1:]
        if len(head) == 0:
            return combinations

        # With nothing accumulated yet, extend the (empty) container itself.
        bases = combinations if len(combinations) > 0 else [combinations]
        extended = []
        for readings in pinyin(head, style=pypinyin.FIRST_LETTER, heteronym=True):
            for letter in readings:
                for base in bases:
                    candidate = base[:]
                    candidate.append(letter)
                    extended.append(candidate)
        combinations = extended
def addPinyin(sometext):
    """Print and return the space-separated pinyin of SOMETEXT.

    Only the first reading of every item returned by ``pinyin`` is used,
    although the conversion is requested with ``heteronym=True``.

    :param sometext: text to convert.
    :return: unicode string of readings separated by single spaces, with
        trailing whitespace stripped.
    """
    readings = pinyin(sometext, heteronym=True)
    line = u''.join(reading[0] + u' ' for reading in readings).rstrip()
    print(line)
    return line
def test_errors():
    """Behaviour of the ``errors`` argument for text without pinyin."""
    cases = (
        ('啊', {'style': TONE2}, [['a']]),
        ('啊a', {'style': TONE2}, [['a'], ['a']]),
        # non-Chinese characters that have no pinyin
        ('⺁', {'style': TONE2}, [['\u2e81']]),
        ('⺁', {'style': TONE2, 'errors': 'ignore'}, []),
        ('⺁', {'style': TONE2, 'errors': 'replace'}, [['2e81']]),
        ('⺁⺁', {'style': TONE2, 'errors': 'replace'}, [['2e812e81']]),
        ('⺁⺁', {'style': TONE2, 'errors': lambda x: ['a' for _ in x]},
         [['a'], ['a']]),
        ('⺁⺁', {'style': TONE2, 'errors': lambda x: [['a', 'b'], ['b', 'c']]},
         [['a'], ['b']]),
        ('⺁⺁', {'style': TONE2, 'heteronym': True,
                'errors': lambda x: [['a', 'b'], ['b', 'c']]},
         [['a', 'b'], ['b', 'c']]),
        # Chinese characters that have no pinyin
        ('鿅', {'style': TONE2}, [['\u9fc5']]),
        ('鿅', {'style': TONE2, 'errors': 'ignore'}, []),
        ('鿅', {'style': TONE2, 'errors': '233'}, []),
        ('鿅', {'style': TONE2, 'errors': 'replace'}, [['9fc5']]),
        ('鿅', {'style': TONE2, 'errors': lambda x: ['a']}, [['a']]),
        ('鿅', {'style': TONE2, 'errors': lambda x: None}, []),
        ('鿅鿅', {'style': TONE2, 'errors': lambda x: ['a' for _ in x]},
         [['a'], ['a']]),
        ('鿅鿅', {'style': TONE2, 'errors': lambda x: [['a', 'b']]},
         [['a'], ['a']]),
        ('鿅鿅', {'style': TONE2, 'heteronym': True,
                'errors': lambda x: [['a', 'b']]},
         [['a', 'b'], ['a', 'b']]),
    )
    for text, options, expected in cases:
        assert pinyin(text, **options) == expected
def add_term(term, weight):
    """Insert TERM into the node structure, or add WEIGHT if it exists.

    The term is split with ``term2words``; the longest existing prefix is
    found with ``max_prefix_match``. Missing words are appended with
    ``new_node``; for a single-character word whose code point lies in
    19904..40895, pinyin nodes are pushed for every heteronym reading: the
    full reading, its first letter, and ``ch``/``sh``/``zh`` when the reading
    starts with one of them.

    :param term: text to add.
    :param weight: weight added to the node that ends the term.
    :return: None
    """
    words, types = term2words(term)
    if len(words) == 0:  # avoid '......'
        return

    # max prefix match
    level, node_id = max_prefix_match(words, types)
    if level == len(words):
        # The whole word sequence exists already: only update its node.
        # May lead to a parent weight bigger than the weight sum of all children.
        add_weight(node_id, weight)
        return

    for word in words[level:]:
        # insert normal node
        parent = node_id
        node_id = new_node(word, parent)
        if not (len(word) == 1 and 19904 <= ord(word) <= 40895):
            continue

        # insert pinyin nodes
        readings = pypinyin.pinyin(word, style=pypinyin.NORMAL, heteronym=True)
        for py in readings[0]:
            # complete pinyin, then its first letter
            push_pinyin_node(parent, node_id, py)
            push_pinyin_node(parent, node_id, py[0])
            if py[0] in ('c', 's', 'z') and py[1] == 'h':
                push_pinyin_node(parent, node_id, py[:2])
    add_weight(node_id, weight)
def addPinyin(sometext):
    """Return the space-separated pinyin of SOMETEXT, passed through ``strB2Q``.

    Only the first reading of every item returned by ``pinyin`` is used,
    although the conversion is requested with ``heteronym=True``.

    :param sometext: text to convert.
    :return: result of ``strB2Q`` applied to the joined readings.
    """
    readings = pinyin(sometext, heteronym=True)
    joined = u''.join(reading[0] + u' ' for reading in readings)
    # strip the trailing space
    return strB2Q(joined.rstrip())
def get_university_by_province(self, province, need_pinyin=True):
    """Return the universities recorded for PROVINCE.

    :param province: key looked up in ``self._university_of_province``.
    :param need_pinyin: when true, return ``{"university", "pinyin"}`` dicts
        in stored order; otherwise return the names sorted by TONE2 pinyin.
    :return: list as described above, or ``[]`` for an unknown province.
    """
    if province not in self._university_of_province:
        return []

    universities = self._university_of_province[province]
    if not need_pinyin:
        return sorted(universities, key=lambda name: pinyin(to_unicode(name), style=TONE2))
    return [{"university": name, "pinyin": self.to_pinyin(name)} for name in universities]
def get_pinyin(text):
    """Return the TONE3 pinyin of TEXT as one space-separated string.

    Only the first reading of every item returned by ``pinyin`` is used.
    The previous implementation compared its accumulator with ``None``
    (always true for a string), so the "first item" branch never ran and
    every non-empty result began with a spurious space; joining avoids it.

    :param text: text to convert.
    :return: readings separated by single spaces; ``''`` for empty output.
    """
    return ' '.join(item[0] for item in pinyin(text, style=pypinyin.TONE3))
def test_pinyin():
    """``pinyin`` output for the default call and for each positional style."""
    hans = u'中心'
    toned = [[u'zh\u014dng'], [u'x\u012bn']]

    assert pinyin(hans) == toned
    assert pinyin(hans + 'abc') == toned + [['abc']]

    by_style = (
        (pypinyin.STYLE_NORMAL, [[u'zhong'], [u'xin']]),
        (pypinyin.STYLE_TONE, [[u'zh\u014dng'], [u'x\u012bn']]),
        (pypinyin.STYLE_TONE2, [[u'zho1ng'], [u'xi1n']]),
        (pypinyin.STYLE_INITIALS, [['zh'], ['x']]),
        (pypinyin.STYLE_FIRST_LETTER, [[u'z'], [u'x']]),
    )
    for style, expected in by_style:
        assert pinyin(hans, style) == expected

    assert pinyin(hans, heteronym=True) == [[u'zh\u014dng', u'zh\xf2ng'],
                                            [u'x\u012bn']]
def getFirstCase(str):
    """Classify the first character of STR and return its index letter.

    :param str: non-empty text (indexing the first character raises on empty
        input).
    :return: upper-cased first pinyin letter for a Chinese character, the
        upper-cased character for a letter, ``retutn_alphabet(char)`` for a
        digit, and the literal ``"WARNING"`` otherwise.
    """
    head = str[0]
    if is_chinese(head):
        first_reading = pinyin(head)[0][0]
        return first_reading[0].upper()
    if is_alphabet(head):
        return head.upper()
    if is_number(head):
        return retutn_alphabet(head)
    return "WARNING"
def name2pinyin(name):
    """Decode NAME from UTF-8 and return its joined pinyin via ``safestr``.

    ``4`` is passed as the second positional argument of ``pinyin``
    (presumably the FIRST_LETTER style -- confirm against pypinyin).

    :param name: UTF-8 encoded byte string.
    :return: ``safestr`` of the first reading of every item, concatenated.
    """
    decoded = name.decode('utf-8')
    letters = [reading[0] for reading in pinyin(decoded, 4)]
    return safestr("".join(letters))
def test_seg_jieba():
    """Pinyin of jieba-segmented input and of mixed Chinese/Latin phrases."""
    import jieba

    hans_seg = list(jieba.cut('音乐'))
    assert pinyin(hans_seg, style=TONE2) == [['yi1n'], ['yue4']]

    # fixed phrases that mix Chinese and English
    phrases = (
        ('黄山B股', [['hua2ng'], ['sha1n'], ['B'], ['gu3']]),
        ('A股', [['A'], ['gu3']]),
        ('阿Q', [['a1'], ['Q']]),
        ('B超', [['B'], ['cha1o']]),
        ('AB超C', [['A'], ['B'], ['cha1o'], ['C']]),
        ('AB阿C', [['AB'], ['a1'], ['C']]),
        ('维生素C', [['we2i'], ['she1ng'], ['su4'], ['C']]),
    )
    for phrase, expected in phrases:
        assert pinyin(phrase, style=TONE2) == expected
def main():
    """
    Print and return the pinyin of the first command-line argument.

    /anaconda3/bin/python han2pinyin.py '我们爱世界'
    python3 han2pinyin.py '与会' #ok
    """
    result = pinyin(sys.argv[1])
    print(result)
    return result
def test_zh_and_en():
    """Mixed Chinese and English input."""
    # Chinese followed by English: the Latin run stays whole only when jieba
    # is available.
    hans = '中心'
    if has_module('jieba'):
        latin_part = [['abc']]
    else:
        latin_part = [['a'], ['b'], ['c']]
    assert pinyin(hans + 'abc') == [['zh\u014dng'], ['x\u012bn']] + latin_part

    # fixed phrases that mix Chinese and English
    phrases = (
        ('黄山B股', [['hua2ng'], ['sha1n'], ['B'], ['gu3']]),
        ('A股', [['A'], ['gu3']]),
        ('阿Q', [['a1'], ['Q']]),
        ('B超', [['B'], ['cha1o']]),
        ('AB超C', [['A'], ['B'], ['cha1o'], ['C']]),
    )
    for phrase, expected in phrases:
        assert pinyin(phrase, style=TONE2) == expected

    if has_module('jieba'):
        expected = [['AB'], ['a1'], ['C']]
    else:
        expected = [['A'], ['B'], ['a1'], ['C']]
    assert pinyin('AB阿C', style=TONE2) == expected

    assert pinyin('维生素C', style=TONE2) == [['we2i'], ['she1ng'], ['su4'], ['C']]
def rename(filepath):
    """Prefix the ``.jpg`` files in FILEPATH with a name and a running number.

    Regular files whose name contains ``.jpg`` after the first character are
    collected with their toneless pinyin, sorted with the key ``py``, and
    renamed to ``<name>_<n>_<original name>`` with ``n`` starting at 1.

    NOTE(review): ``py`` is not defined in this block -- presumably a
    module-level key function; verify.

    :param filepath: directory whose files are renamed in place.
    :return: None
    """
    candidates = []
    for entry in os.listdir(filepath):
        if not os.path.isfile(os.path.join(filepath, entry)):
            continue
        if entry.find('.jpg') <= 0:
            continue
        print(entry)
        candidates.append({
            'file': entry,
            'by': pypinyin.pinyin(entry, style=pypinyin.NORMAL),
        })

    for index, item in enumerate(sorted(candidates, key=py), 1):
        print(item['by'])
        source = os.path.join(filepath, item['file'])
        target = os.path.join(filepath, '%s_%d_%s' % (u'赵妮', index, item['file']))
        os.rename(source, target)
def get_homophones_by_pinyin(input_pinyin):
    """
    Return the characters whose first TONE2 reading equals a given pinyin.

    :param input_pinyin: reading in TONE2 form, e.g. ``zho1ng`` for 中.
    :return: list of matching single-character strings, in code point order.
    """
    # The CJK Unified Ideographs range scanned is 0x4E00-0x9FA5, i.e. the
    # commonly quoted 20902 Chinese characters.
    return [
        chr(code)
        for code in range(0x4e00, 0x9fa6)
        if pinyin([chr(code)], style=pypinyin.TONE2)[0][0] == input_pinyin
    ]
def ranking_function(output_prob_tree, cx, cy):
    """Score candidate ``cy`` against ``cx`` as a product of three terms.

    The terms are a tonal-pattern score, a part-of-speech score and an
    output-probability score, each offset by 0.001.

    NOTE(review): the code indexes the results of ``map`` and ``zip`` and
    calls a bare ``reduce``, which matches Python 2 semantics -- confirm the
    target interpreter.

    :param output_prob_tree: nested mapping indexed as ``[x][y]`` with the
        paired items of ``cx`` and ``cy``.
    :param cx: first sequence (presumably the characters of one line -- confirm).
    :param cy: second sequence, paired position by position with ``cx``.
    :return: ``pingze_score * out_score * pos_score``.
    """
    # Tonal pattern: the first digit of each TONE2 reading is mapped to -1
    # when it is <= 2 and to 1 otherwise; a position scores when the two
    # values cancel out.
    # NOTE(review): a reading without a digit makes re.search return None,
    # so .group would raise -- verify such readings cannot occur here.
    x_py = pypinyin.pinyin(cx, style=pypinyin.TONE2)
    y_py = pypinyin.pinyin(cy, style=pypinyin.TONE2)
    x_pz = map(lambda i: -1 if int(re.search('\d', i[0]).group(0)) <= 2 else 1, x_py)
    y_pz = map(lambda i: -1 if int(re.search('\d', i[0]).group(0)) <= 2 else 1, y_py)
    pingze_score = sum(map(lambda i, j: i + j == 0, x_pz, y_pz)) / float(len(cx)) + 0.001

    def sigmoid(x):
        return 1 / (1 + math.e ** (-x))

    def pos_eq(x_pos, y_pos):
        # Tags match when equal or when one contains the other.
        return x_pos == y_pos or x_pos in y_pos or y_pos in x_pos

    import operator
    smooth_value = 0.001
    freq_amp = 10 ** math.sqrt(len(cx))

    # Part of speech: each matching position contributes 1/len(cx).
    cx_pos = map(lambda x: zip(*pseg.lcut(x)[0])[0][1], cx)
    cy_pos = map(lambda y: zip(*pseg.lcut(y)[0])[0][1], cy)
    pos_score = reduce(operator.add,
                       map(lambda x, y: float(1)/len(cx) if pos_eq(x, y) else 0, cx_pos, cy_pos))
    pos_score += smooth_value

    # Output probability: product of the amplified per-position values,
    # squashed with the sigmoid.
    out_score = reduce(operator.mul,
                       map(lambda x, y: output_prob_tree[x][y] * freq_amp, cx, cy))
    out_score = sigmoid(out_score)
    out_score += smooth_value

    # Combine
    score = pingze_score * out_score * pos_score
    # score = pingze_score * pos_score
    # print 'ranking', cy
    # print 'pingze', pingze_score
    # print 'pos', pos_score
    # print 'freq', out_score
    return score
def get_college_by_university(self, university, need_pinyin=True):
    """Return the colleges recorded for UNIVERSITY, sorted by TONE2 pinyin.

    :param university: name, converted with ``utf8`` before the lookup in
        ``self._universities``.
    :param need_pinyin: when true, return ``{"college", "pinyin"}`` dicts;
        otherwise return the stored names themselves.
    :return: sorted list as described above, or ``[]`` for an unknown
        university.
    """
    university = utf8(university)
    if university not in self._universities:
        return []

    stored = self._college_of_university[university]
    if not need_pinyin:
        return sorted(list(stored), key=lambda name: pinyin(to_unicode(name), style=TONE2))

    colleges = []
    for raw_name in stored:
        name = to_unicode(raw_name)
        colleges.append({"college": name, "pinyin": self.to_pinyin(name)})
    return sorted(colleges, key=lambda entry: pinyin(entry["college"], style=TONE2))
def test_pinyin_initials():
    """A word whose syllables have both an initial and a final."""
    hans = '中心'

    # default style: with tone marks
    assert pinyin(hans) == [['zh\u014dng'], ['x\u012bn']]

    positional_styles = (
        # plain style, no tones
        (NORMAL, [['zhong'], ['xin']]),
        # tone style: tone mark on the first letter of the final
        (TONE, [['zh\u014dng'], ['x\u012bn']]),
        # tone style 2: tone written as a digit [0-4] after the vowel
        (TONE2, [['zho1ng'], ['xi1n']]),
        # initials style: only the initial of each syllable
        (INITIALS, [['zh'], ['x']]),
        # first-letter style: only the first letter of each syllable
        (FIRST_LETTER, [['z'], ['x']]),
    )
    for style, expected in positional_styles:
        assert pinyin(hans, style) == expected

    # heteronym mode enabled
    assert pinyin(hans, heteronym=True) == [['zh\u014dng', 'zh\xf2ng'],
                                            ['x\u012bn']]

    keyword_styles = (
        # finals style 1: only the final of each syllable, no tones
        (FINALS, [['ong'], ['in']]),
        # finals style with tone marks on the first letter of the final
        (FINALS_TONE, [['\u014dng'], ['\u012bn']]),
        # finals style with tones written as digits [0-4]
        (FINALS_TONE2, [['o1ng'], ['i1n']]),
    )
    for style, expected in keyword_styles:
        assert pinyin(hans, style=style) == expected
def get_word_pinyin_py(word):
    '''
    Convert the Chinese input into its pinyin and its pinyin first letters.

    ``word_pinyin`` accumulates the ``lazy_pinyin`` items (characters without
    pinyin are ignored) and ``word_py`` accumulates the first reading of each
    ``FIRST_LETTER`` item. Any exception is logged through ``logger``.

    NOTE(review): no ``return`` statement is visible here, so as shown the
    two strings are discarded and the function returns None -- confirm
    whether the end of the function is missing from this view.
    '''
    word_pinyin = ''
    word_py = ''
    try:
        word_pinyin_list = lazy_pinyin(word, errors='ignore')
        for w in word_pinyin_list:
            word_pinyin += str(w)
        #print 'word_pinyin: ', word_pinyin
        word_py_list_out = pinyin(word, style=pypinyin.FIRST_LETTER)
        for i in word_py_list_out:
            word_py += str(i[0])
        #print 'word_py: ', word_py
    except Exception, ex:
        logger.exception(ex)
prov = 20 cityname = '广州' meta = { "brand": rows["brandid"], "series": rows["familyid"], "model": rows["salesdescid"], "registerDate": registerDate, "city": city, "prov": prov, "mile": mile } s = f"brand={meta['brand']}&city={meta['city']}&mileAge={mile}&model={meta['model']}&prov={meta['prov']}®isterDate={registerDate}&series={meta['series']}njB6TTeQvTnGN4To" md = get_md5_value(s) # cityname = cityname_dic[city] a = pypinyin.pinyin(cityname, style=pypinyin.FIRST_LETTER) c_pinyin = ''.join([str(a[i][0]) for i in range(len(a))]) # print(c_pinyin) url = start_url.format(meta["prov"], meta["city"], rows["brandid"], rows["familyid"], rows["salesdescid"], registerDate, mile, partnerId[0], md, c_pinyin) url_list.append(url) data = {"url": url} data_list.append(data) print(url) else: city_dic = dict(zip(city_list, prov_list)) cityname_dic = dict(zip(city_list, cityname_list)) for city, prov in city_dic.items(): meta = { "brand": rows["brandid"], "series": rows["familyid"],
import pypinyin as py

# Show two conversions of the same word side by side.
word = '朝阳'
A = py.pinyin(
    word,
    heteronym=True,
)  # with tone marks
B = py.lazy_pinyin(
    word,
)  # without tone marks

print(A)
print(B)
# !/usr/bin/env python # -*- coding: UTF-8 -*- """ 为什么没有 y, w, yu 几个声母? 声母风格(INITIALS)下,“雨”、“我”、“圆”等汉字返回空字符串,因为根据 《汉语拼音方案》 , y,w,ü (yu) 都不是声母,在某些特定韵母无声母时,才加上 y 或 w,而 ü 也有其特定规则。 —— @hotoo 如果你觉得这个给你带来了麻烦,那么也请小心一些无声母的汉字(如“啊”、“饿”、“按”、“昂”等)。 这时候你也许需要的是首字母风格(FIRST_LETTER)。 —— @hotoo """ from pypinyin import pinyin, lazy_pinyin, Style pinyinlist = pinyin("西藏", style=Style.NORMAL) print(pinyinlist) pinyinlist = pinyin("西藏") print(pinyinlist) pinyinlist = pinyin('中心', heteronym=True) # 启用多音字模式 print(pinyinlist) pinyinlist = pinyin('银行', heteronym=True) # 启用多音字模式,无效!! print(pinyinlist) pinyinlist = pinyin('武汉', style=Style.FIRST_LETTER) # 设置拼音风格 print(pinyinlist) pinyinlist = pinyin("差错") print(pinyinlist) pinyinlist = lazy_pinyin('差错') # 不考虑多音字的情况 print(pinyinlist) pinyinlist = lazy_pinyin('你好☆☆', errors='ignore') # 当遇到不包含拼音的字符(串)时,会根据 errors 参数的值做相应的处理: print(pinyinlist) pinyinlist = lazy_pinyin('你好☆☆') # 不做任何处理,原样返回 print(pinyinlist)
target_texts = [] target_processed = [] pairs = [] with open(data_path, 'r', encoding='utf-8') as f: for line in f.readlines(): input_text, target_text = line.split('\t') input_text = re.sub(r'\([^)]*\)', '', input_text.strip()) input_text = re.sub(r'subsp\. (\w)+', '', input_text) target_text = html.unescape(target_text) target_text = re.sub(r'\([^)]*\)', '', target_text.strip()) if len(target_text) > 1 and not re.findall(r'[A-Za-z]', target_text): target_text_py = ' '.join( [item for sublist in pinyin(target_text) for item in sublist]) target_text_processed = ' '.join(jieba.cut(target_text, HMM=False)) if [input_text, target_text_py] not in pairs: pairs.append([input_text, target_text_py]) input_texts.append(input_text) target_texts.append(target_text) if ' ' in input_text: names = input_text.split(' ') input_reverse.append(' '.join(names[::-1]).strip()) else: input_reverse.append(input_text) target_processed.append(target_text_processed) with open('input.txt', 'w') as i: i.write('\n'.join(input_texts))
def handle(c):
    """Return the first item of the heteronym TONE3 pinyin of C.

    The conversion is requested with ``strict=False``.

    :param c: text to convert (only the first item of the result is kept).
    :return: the first element of the list returned by ``pinyin``.
    """
    return pinyin(c, heteronym=True, style=TONE3, strict=False)[0]
print Assio authorFirst = author[0] pinyin_list = authorFirst.split(' ') pinyinAll = "" for ph in pinyin_list: pinyinAll += ph V = viterbi(pinyin_list) author_prob = {} for phrase, prob in V.iteritems(): pinyinFound = "" namePinyin = pinyin(phrase, style=NORMAL) for n in namePinyin: for t in n: pinyinFound += t.encode("utf8") if cmp(pinyinFound[0:len(pinyinAll)], pinyinAll) != 0: continue author_prob[phrase] = prob nameSet = {} author_prob = sorted(author_prob.items(), key=lambda item: item[1], reverse=True) for phrase, prob in author_prob: result = wordDis.comparePerson(phrase.encode("utf8"), keyWord) if result > (0.8 / len(keyWord)): print "找到了标签相同的名字: "
def init_data():
    """
    @describe: Prepare the .npy data files used for training.

    Reads ``../train_data/toutiao_cat_data.txt``, extracts the runs of
    characters in U+4E00..U+9FA5, and saves under ``../data/``: per-character
    counts, a character-to-index dict, a character bigram count matrix, a
    pinyin-to-characters dict and a character-to-pinyin frequency dict.
    """
    # toutiao_data
    # NOTE(review): the file is only closed at the very end, so it stays open
    # if anything in between raises.
    phrase = open('../train_data/toutiao_cat_data.txt', 'r',encoding="utf-8")
    ans = []
    for line in phrase.readlines():
        ls = line.split("_!_")
        # Clean the data: only the fields from index 3 onwards are used.
        ls = ls[3:]
        for item in ls:
            string = ""
            i = 0
            # Collect maximal runs of characters matching the regex; any other
            # character ends the current run.
            while(i<len(item)):
                res = re.match(r'[\u4E00-\u9FA5]', item[i])
                if(res == None):
                    if(string != ""):
                        ans.append(string)
                        string = ""
                else:
                    string += item[i]
                i+=1
            if(string != ""):
                ans.append(string)
    print(ans[0:100])
    # Add the pinyin of the Chinese strings.
    ans_pinyin = []
    for item in ans:
        pinyin_ans = pinyin(u'{0}'.format(item), style=pypinyin.NORMAL)
        if(pinyin_ans == None):
            continue
        string = ""
        # NOTE(review): this inner loop reuses the outer loop's name ``item``.
        for item in pinyin_ans:
            string += item[0]+" "
        string = string[0:-1]  # drop the trailing space
        if(string != ""):
            ans_pinyin.append(string)
    # ['bao li ji tuan', 'ma wei du', 'zhong guo ke xue ji zhu guan',...
    print(ans_pinyin[1:100])
    # Character statistics: [character] [occurrence count], unigram corpus.
    hanzi_ls = []
    hanzi_count_ls = []
    for item in ans:
        for chr in item:
            if chr not in hanzi_ls:
                hanzi_ls.append(chr)
                # NOTE(review): the first occurrence is recorded as 0, so each
                # stored count is one less than the number of occurrences.
                hanzi_count_ls.append(0)
            else:
                hanzi_count_ls[hanzi_ls.index(chr)] += 1
    np.save("../data/my_hanzi_num", hanzi_count_ls, allow_pickle=True, fix_imports=True)
    total_hanzi_num = len(hanzi_ls)
    print(total_hanzi_num)
    # dict mapping character -> character code (its index in hanzi_ls)
    hanzi_dict = {}
    encode_num = 0
    for item in hanzi_ls:
        hanzi_dict[item] = encode_num
        encode_num += 1
    np.save("../data/my_hanzi_dict", hanzi_dict, allow_pickle=True, fix_imports=True)
    # Bigram corpus training:
    # counts of transitions from one character code to the next.
    hanzi_matrix = np.zeros([total_hanzi_num, total_hanzi_num])
    for item in ans:
        for i in range(1,len(item)):
            chr1 = item[i-1]
            chr2 = item[i]
            code1 = hanzi_dict[chr1]
            code2 = hanzi_dict[chr2]
            hanzi_matrix[code1][code2] += 1
    np.save("../data/my_moving_array", hanzi_matrix, allow_pickle=True, fix_imports=True)
    # Prepare the pinyin -> characters dictionary.
    py2hanzi = {}
    for i in range(len(ans)):
        pinyin_ls = ans_pinyin[i].split() # pinyin sequence
        for pinyin_item in pinyin_ls: # initialise
            py2hanzi[pinyin_item] = ""
    for i in range(len(ans)):
        str = ans[i]# string of Chinese characters
        pinyin_ls = ans_pinyin[i].split() # pinyin sequence
        for j in range(len(str)):
            chr = str[j]
            pinyin_item = pinyin_ls[j]
            if( chr not in py2hanzi[pinyin_item]):
                py2hanzi[pinyin_item] += chr
    np.save("../data/py2hanzi.npy", py2hanzi, allow_pickle=True, fix_imports=True)
    print(py2hanzi.keys())
    # Character -> pinyin frequency, eg:{'了':{'le':5, 'liao':10},'屈':{'qu':5}}
    hanzi2pin_dict = {} # {'了':{'le':5, 'liao':10},'屈':{'qu':5}}
    # NOTE(review): this binds the same list object as hanzi_ls, so the
    # ``+=`` below also extends hanzi_ls in place.
    hanzi_str = hanzi_ls # sequence of characters that already have pinyin
    py_data_ls = [] # list of pinyin already seen
    # Build the two-level dictionary structure.
    for k,v in py2hanzi.items():
        py_data_ls.append(k)
        if "ü" in k:
            print(k)
        hanzi_str += v
        for chr in v:
            hanzi2pin_dict[chr] = {} # one pinyin-frequency dict per character
    for k,v in py2hanzi.items():
        for chr in v:
            hanzi2pin_dict[chr][k] = 0 # initialise the frequency of every pinyin
    # phrase = open('../train_data/emission_train.txt', 'r')
    for i in range(len(ans)):
        str = ans[i]# string of Chinese characters
        pinyin_ls = ans_pinyin[i].split() # pinyin sequence
        # NOTE(review): described as removing the tone, but the pinyin above
        # was produced with style NORMAL, so this slice drops the last letter
        # of every syllable -- verify the intent. The loops below also reuse
        # the outer index name ``i``.
        for i in range(len(pinyin_ls)): # remove the tone
            pinyin_ls[i] = pinyin_ls[i][0:-1]
        if(len(str) != len(pinyin_ls)): # pinyin count does not match character count
            continue
        for i in range(len(str)):
            if(str[i] not in hanzi_str): # character not among those with pinyin: add it to the dict
                hanzi2pin_dict[ str[i] ] = {}
                hanzi2pin_dict[ str[i] ][ pinyin_ls[i] ] = 0
            py_data_ls = hanzi2pin_dict[ str[i] ].keys() # all pinyin known for this character
            if(pinyin_ls[i] not in py_data_ls):
                hanzi2pin_dict[ str[i] ][ pinyin_ls[i] ] = 0
            hanzi2pin_dict[str[i]][pinyin_ls[i]] += 1
    phrase.close()
    np.save("../data/my_emission_dic", hanzi2pin_dict, allow_pickle=True, fix_imports=True)
conn.commit() except Exception: conn.rollback() print('已存在', j['title']) conn.close() def run(self): radi_idlist = self.get_title() for i in radi_idlist: data = self.get_radio(i) self.saveFmysql(data=data, radio_id=i) if __name__ == '__main__': # a=Radio().run() # r={"c_user_id":209747,"session_key":"a65d158284540e74814fe63c101b32ad","device":"hradio","id":10525} # b=Radio().req_post('https://api.fm.subat.cn/v2.2/radio/programs',r) # a=Radio().saveFmysql(b) # print(a) # pwd = os.path.dirname(os.path.realpath(__file__)) # print(pwd) # radiot=RadioT.objects.all() # for i in radiot: # print(i.mp3) from pypinyin import pinyin, TONE pinyinlist = pinyin("四是四十是十", style=TONE) text = '' for i in pinyinlist: text += " " + i[0] print(text)
def __to_pinyin__(str=''):
    """Return ``slug`` of the TONE3 pinyin of STR, joined without separator.

    The conversion is requested with ``neutral_tone_with_five=True``.

    :param str: text to convert (defaults to the empty string).
    :return: result of ``slug(..., separator='')``.
    """
    syllables = pinyin(str, style=Style.TONE3, neutral_tone_with_five=True)
    return slug(syllables, separator='')
def test_mmseg_for_pinyin(input, default_ret, mmseg_ret):
    """Raw and mmseg-segmented input must both produce ``mmseg_ret``.

    ``default_ret`` is accepted but not used by this test.
    """
    from_raw = pinyin(input)
    assert from_raw == mmseg_ret

    segments = mmseg.seg.cut(input)
    assert pinyin(segments) == mmseg_ret
import torch.nn as nn
import torch
import torch.nn.functional as F
import heapq
import numpy
from pypinyin import pinyin, Style

# Build a masked-LM model from a fresh config and load a local checkpoint.
# NOTE(review): BertConfig, AutoModelForMaskedLM and BertTokenizer are not
# imported in the visible part of this script -- confirm they are imported
# above.
config = BertConfig()
config.vocab_size = 41460  # sentence vocabulary
model = AutoModelForMaskedLM.from_config(config)
# Replace the input embedding table (1839 entries of width 768, index 0 used
# for padding) before the checkpoint weights are loaded.
model.bert.embeddings.word_embeddings = nn.Embedding(1839, 768, padding_idx=0)
state_dict = torch.load('./results/checkpoint-00000/pytorch_model.bin',
                        map_location="cpu")
model.load_state_dict(state_dict)
model.eval()

# Flatten the TONE3 pinyin (neutral tone written as 5) of the sample sentence.
pinyin_list = [
    i for tmp in pinyin('手机没电了', style=Style.TONE3, neutral_tone_with_five=True) for i in tmp
]
con_tokenizer = BertTokenizer.from_pretrained('y2d1')
lab_tokenizer = BertTokenizer.from_pretrained('z2d')
con = torch.tensor(
    con_tokenizer.convert_tokens_to_ids(pinyin_list)).unsqueeze(0)

# NOTE(review): despite the name, k=10 entries are kept per position.
out_top5 = torch.topk(F.softmax(model(con)[0].squeeze(0), dim=-1), k=10)
values = out_top5[0].detach().numpy().tolist()
indices = out_top5[1].detach().numpy().tolist()
# Print the candidate tokens and their scores for every position.
for i, item in enumerate(indices):
    print(lab_tokenizer.convert_ids_to_tokens(item))
    print(values[i])
# -*- encoding: UTF-8 -*-
from pypinyin import pinyin, lazy_pinyin
import MyEsTools as es


class Main:
    """Placeholder class; its constructor does nothing."""

    def __init__(self):
        pass


if __name__ == '__main__':
    tool = es.MyEsTools('10.116.27.131', 'test', 'cn')

    # Build a single bulk action; the bulk call itself is left disabled.
    source = {"data": "test"}
    action = dict([
        ("_index", tool.index),
        ("_type", tool.type),
        ("_source", source),
        ("_id", "车质网_12345"),
    ])
    ACTIONS = [action]
    # tool.bulk_data(ACTIONS)

    # print pinyin(unicode("车质网","UTF-8"))
    print(pinyin("车质网".decode("utf-8"), errors='ignore'))
    print(''.join(lazy_pinyin("太平洋汽车网".decode("utf-8"), errors='ignore')))
    print(''.join(lazy_pinyin("车质网%".decode("utf-8"))))
def pypinyin_g2p(text) -> List[str]:
    """Convert TEXT to a flat list of TONE3 pinyin syllables.

    pypinyin is imported inside the function. Only the first reading of
    every item returned by ``pinyin`` is kept.

    :param text: text to convert.
    :return: list with one string per item returned by ``pinyin``.
    """
    from pypinyin import Style
    from pypinyin import pinyin

    phones = []
    for readings in pinyin(text, style=Style.TONE3):
        phones.append(readings[0])
    return phones
def SPY(ch_str_1, ch_str_2):
    """Compare two strings by their NORMAL-style pinyin.

    :param ch_str_1: first string.
    :param ch_str_2: second string.
    :return: result of ``ch_similarity_sub`` on the two pinyin lists.
    """
    first, second = (pinyin(text, style=Style.NORMAL) for text in (ch_str_1, ch_str_2))
    return ch_similarity_sub(first, second)
def pro(text):
    """Return the first BOPOMOFO reading of the first item of TEXT.

    :param text: text to convert; an empty result raises on indexing.
    :return: first reading of the first item returned by ``pinyin``.
    """
    return pinyin(text, style=Style.BOPOMOFO)[0][0]
def char2bpmf(char):
    """Return the first BOPOMOFO reading of the first item of CHAR.

    pypinyin is imported inside the function.

    :param char: text to convert; an empty result raises on indexing.
    :return: first reading of the first item returned by ``pinyin``.
    """
    from pypinyin import pinyin, Style

    readings = pinyin(char, style=Style.BOPOMOFO)
    first_item = readings[0]
    return first_item[0]
def extract(self, char_seq):
    """Return the first reading of every pinyin item of CHAR_SEQ.

    ``self.params`` is forwarded to ``pinyin`` as keyword arguments.

    :param char_seq: text to convert.
    :return: list with one entry per item returned by ``pinyin``.
    """
    return [readings[0] for readings in pinyin(char_seq, **self.params)]
def pinyin(text):
    """Return the TONE3 pinyin of TEXT concatenated without separators.

    Only the first reading of every item from ``pypinyin.pinyin`` is used.

    :param text: text to convert.
    :return: the joined readings as one string.
    """
    syllables = pypinyin.pinyin(text, style=pypinyin.Style.TONE3)
    return ''.join(readings[0] for readings in syllables)
def p(input):
    """Return the TONE3 pinyin of INPUT with a space after every syllable.

    Only the first reading of every item is used; a non-empty result
    therefore ends with a trailing space.

    :param input: text to convert.
    :return: the readings, each followed by one space.
    """
    syllables = pinyin(input, style=Style.TONE3)
    return "".join(readings[0] + " " for readings in syllables)
def get_pinyin_first_litter(hanzi):
    """Return the concatenated FIRST_LETTER pinyin of HANZI.

    Only the first reading of every item returned by ``pinyin`` is used.

    :param hanzi: text to convert.
    :return: string made of one entry per item, without separators.
    """
    letters = pinyin(hanzi, style=pypinyin.FIRST_LETTER)
    return ''.join(readings[0] for readings in letters)
def renew_pinyin(self, request, queryset):
    """Recompute and save the ``pinyin`` attribute of every object in QUERYSET.

    The value is the first character of the first NORMAL reading of each
    item of ``Sname``, concatenated.

    :param request: request forwarded to ``self.message_user``.
    :param queryset: iterable of objects with ``Sname``, ``pinyin``, ``save``.
    :return: result of ``self.message_user``.
    """
    for student in queryset:
        syllables = pypinyin.pinyin(student.Sname, style=pypinyin.NORMAL)
        initials = [readings[0][0] for readings in syllables]
        student.pinyin = ''.join(initials)
        student.save()
    return self.message_user(request=request, message='修改学生拼音成功!')
def STM(ch_str_1, ch_str_2):
    """Compare two strings by ``get_shengdiao`` of their TONE3 pinyin.

    :param ch_str_1: first string.
    :param ch_str_2: second string.
    :return: result of ``ch_similarity_sub`` on the two derived values.
    """
    first, second = (
        get_shengdiao(pinyin(text, style=Style.TONE3))
        for text in (ch_str_1, ch_str_2)
    )
    return ch_similarity_sub(first, second)
def getStrFirstAplha(str):
    """Return the upper-cased FIRST_LETTER reading of the first item of STR.

    :param str: text to convert; an empty result raises on indexing.
    :return: upper-cased first reading of the first item.
    """
    first_letters = pinyin(str, style=Style.FIRST_LETTER)
    leading = first_letters[0][0]
    return leading.upper()
default=None, help="path for the English transcription text file", ) args = parser.parse_args() # clean every line in transcription file first transcription_dict = {} with codecs.open(args.transcription_path, "r", "utf-8") as fid: for line in fid.readlines(): segments = line.split(" ") lang_char = args.transcription_path.split("/")[-1][0] id = args.spk + "_" + lang_char + segments[0] # ex. TMF1_M10001 content = segments[1].replace("\n", "") # Some special rules to match CSMSC pinyin text = pinyin(content, style=Style.TONE3) text = [c[0] for c in text] clean_content = [] for c in text: c_init = get_initials(c, strict=True) c_final = get_finals(c, strict=True) for c in [c_init, c_final]: if len(c) == 0: continue c = c.replace("ü", "v") c = c.replace("ui", "uei") c = c.replace("un", "uen") c = c.replace("iu", "iou") # Special rule: "e5n" -> "en5" if "5" in c:
def SSM(ch_str_1, ch_str_2):
    """Compare two strings by their INITIALS-style pinyin.

    :param ch_str_1: first string.
    :param ch_str_2: second string.
    :return: result of ``ch_similarity_sub`` on the two pinyin lists.
    """
    first, second = (pinyin(text, style=Style.INITIALS) for text in (ch_str_1, ch_str_2))
    return ch_similarity_sub(first, second)
def all_company():
    """
    Store the information of all listed companies.

    Every row of the frame returned by ``gsd.get_all_company()`` is inserted
    into ``stock_basics`` and committed individually; a failing row prints
    ``Error`` and the loop continues.

    NOTE(review): the INSERT statement is built with ``%`` string formatting,
    so a value containing a quote breaks the statement; bound parameters
    would avoid that, but the driver's parameter style is not visible here.

    :return:
    """
    # TODO: create the table via an SQL script or from code
    cursor = conn.cursor()
    df = gsd.get_all_company()
    stockCode = list(df.index)  # stock code
    stockName = list(df['name'])  # stock name
    stockIndustry = list(df['industry'])  # industry
    stockArea = list(df['area'])  # region
    stockPe = list(df['pe'])  # price/earnings ratio
    stockOutstanding = list(df['outstanding'])  # outstanding shares (100 million)
    stockTotals = list(df['totals'])  # total shares (100 million)
    stockTotalAssets = list(df['totalAssets'])  # total assets (10 thousand)
    stockLiquidAssets = list(df['liquidAssets'])  # liquid assets
    stockFixedAssets = list(df['fixedAssets'])  # fixed assets
    stockReserved = list(df['reserved'])  # reserves
    stockReservedPerShare = list(df['reservedPerShare'])  # reserves per share
    stockEsp = list(df['esp'])  # earnings per share
    stockBvps = list(df['bvps'])  # net assets per share
    stockPb = list(df['pb'])  # price/book ratio
    stockTimeToMarket = list(df['timeToMarket'])  # listing date
    stockUndp = list(df['undp'])  # undistributed profit
    stockPerundp = list(df['perundp'])  # undistributed profit per share
    stockRev = list(df['rev'])  # revenue, year on year (%)
    stockProfit = list(df['profit'])  # profit, year on year (%)
    stockGpr = list(df['gpr'])  # gross margin (%)
    stockNpr = list(df['npr'])  # net profit margin (%)
    stockHolders = list(df['holders'])  # number of shareholders
    dfLen = len(df)
    # print(time.strptime(stockTimeToMarket[1], "%Y%m%d"))
    for i in range(0, dfLen):
        # Convert the row's values: text columns to str, numeric columns to
        # floats rounded to 4 decimals.
        stockCodeDB = str(stockCode[i])
        stockNameDB = str(stockName[i])
        stockIndustryDB = str(stockIndustry[i])
        stockAreaDB = str(stockArea[i])
        stockPeDB = round(float(stockPe[i]), 4)
        stockOutstandingDB = round(float(stockOutstanding[i]), 4)
        stockTotalsDB = round(float(stockTotals[i]), 4)
        stockTotalAssetsDB = round(float(stockTotalAssets[i]), 4)
        stockLiquidAssetsDB = round(float(stockLiquidAssets[i]), 4)
        stockFixedAssetsDB = round(float(stockFixedAssets[i]), 4)
        stockReservedDB = round(float(stockReserved[i]), 4)
        stockReservedPerShareDB = round(float(stockReservedPerShare[i]), 4)
        stockEspDB = round(float(stockEsp[i]), 4)
        stockBvpsDB = round(float(stockBvps[i]), 4)
        stockPbDB = round(float(stockPb[i]), 4)
        # Re-slice the first eight characters of the date as yyyy-mm-dd.
        timeToMarketDB = str(stockTimeToMarket[i])[0:4] + '-' + str(
            stockTimeToMarket[i])[4:6] + '-' + str(stockTimeToMarket[i])[6:8]
        stockUndpDB = round(float(stockUndp[i]), 4)
        stockPerundpDB = round(float(stockPerundp[i]), 4)
        stockRevDB = round(float(stockRev[i]), 4)
        stockProfitDB = round(float(stockProfit[i]), 4)
        stockGprDB = round(float(stockGpr[i]), 4)
        stockNprDB = round(float(stockNpr[i]), 4)
        stockHoldersDB = round(float(stockHolders[i]), 4)
        # Table name: the printed form of the first-letter pinyin list with
        # brackets, quotes, commas, spaces and '*' removed, upper-cased, then
        # the stock code appended.
        a = str(pinyin(stockNameDB, style=pypinyin.FIRST_LETTER))
        stockTableNameDB = "".join(a).replace('[', '').replace(']', '').replace("'", '').replace(',', ''). \
            replace(' ', '').replace('*', '').upper() + stockCodeDB
        # print(stockTableNameDB)
        # print(stockTimeToMarket[i])
        # print(timeToMarketDB)
        #
        try:
            cursor.execute(
                "insert into stock_basics(code, name, industry, area, pe, outstanding, "
                "totals, totalAssets, liquidAssets, fixedAssets, reserved, "
                "reservedPerShare, esp, bvps, pb, timeToMarket, undp, "
                "perundp, rev, profit, gpr, npr, holders, tablename)"
                "values('%s', '%s', '%s', '%s', '%f', '%f', "
                "'%f', '%f', '%f', '%f', '%f', "
                "'%f', '%f', '%f', '%f', to_date('%s', 'yyyy-MM-dd'), '%f', "
                "'%f', '%f', '%f', '%f', '%f', '%f', '%s')" %
                (stockCodeDB, stockNameDB, stockIndustryDB, stockAreaDB, stockPeDB, stockOutstandingDB,
                 stockTotalsDB, stockTotalAssetsDB, stockLiquidAssetsDB, stockFixedAssetsDB, stockReservedDB,
                 stockReservedPerShareDB, stockEspDB, stockBvpsDB, stockPbDB, timeToMarketDB, stockUndpDB,
                 stockPerundpDB, stockRevDB, stockProfitDB, stockGprDB, stockNprDB, stockHoldersDB,
                 stockTableNameDB))
            cursor.execute("commit")
            print("已存入 ", i)
        except Exception:
            # NOTE(review): the failure reason is discarded; only "Error" is printed.
            print("Error")
def get_foot(line):
    '''Return the rhyme foot of a verse line.

    The last item of the ``style=9`` conversion is taken (characters without
    pinyin are ignored) and its final character is dropped. ``9`` is
    presumably pypinyin's FINALS_TONE3 style, which would make the dropped
    character the tone digit -- confirm.
    '''
    finals = pinyin(line, style=9, errors='ignore')
    last_final = finals[-1][0]
    return last_final[:-1]
def SYM(ch_str_1, ch_str_2):
    """Compare two strings by their FINALS-style pinyin.

    :param ch_str_1: first string.
    :param ch_str_2: second string.
    :return: result of ``ch_similarity_sub`` on the two pinyin lists.
    """
    first, second = (pinyin(text, style=Style.FINALS) for text in (ch_str_1, ch_str_2))
    return ch_similarity_sub(first, second)
def test_mmseg_and_jieba_for_pinyin(input, jieba_ret, mmseg_ret):
    """Raw and mmseg-segmented input must both produce ``mmseg_ret``.

    ``jieba_ret`` is accepted but not used by this test.
    """
    from_raw = pinyin(input)
    assert from_raw == mmseg_ret

    segments = mmseg.seg.cut(input)
    assert pinyin(segments) == mmseg_ret
def text_to_vocab_func(txt):
    """Return the first default-style reading of every pinyin item of TXT.

    :param txt: text to convert.
    :return: list with one entry per item returned by ``pypinyin.pinyin``.
    """
    return [readings[0] for readings in pypinyin.pinyin(txt)]