def py2hz(self, pinyin):
    """Return one page of Hanzi candidates for a single pinyin syllable.

    ``self.page * self.limit`` paths are requested from ``dag`` and only the
    slice belonging to page ``self.page`` is kept (each page holds
    ``self.limit`` entries, and page 1 starts at offset 0).

    :param pinyin: one pinyin syllable; it is wrapped in a one-element tuple
    :return: list holding the first path element of each candidate on the page
    """
    page_end = self.page * self.limit
    page_start = page_end - self.limit
    candidates = dag(self.dagparams, (pinyin, ), path_num=page_end)
    return [candidate.path[0] for candidate in candidates[page_start:page_end]]
def genarate_word_error(sents):
    """Build (sentence, corrupted sentence) pairs using homophone substitution.

    For every sentence one multi-character word (as segmented by ``jieba``) is
    picked at random, converted to pinyin and replaced by a different
    conversion proposed by ``dag`` for the same pinyin.

    Sentences are skipped when they contain no multi-character word, when
    ``dag`` raises ``KeyError``, or when no different candidate is available.

    :param sents: iterable of sentences (strings)
    :return: list of ``(original_sentence, corrupted_sentence)`` tuples
    """
    ans = []
    dagparams = DefaultDagParams()
    for sent in sents:
        # Only multi-character words are eligible.  Filtering first avoids the
        # endless retry loop on sentences made of single-character tokens.
        multi_char_words = [word for word in jieba.cut(sent) if len(word) > 1]
        if not multi_char_words:
            continue
        select_word = random.choice(multi_char_words)

        # Pinyin of the selected word.
        pinyin_list = lazy_pinyin(select_word)
        try:
            result2 = dag(dagparams, pinyin_list, path_num=5, log=True)
        except KeyError:
            continue

        # As before, a replacement is only attempted when dag offers more than
        # one candidate.
        if len(result2) <= 1:
            continue
        # Different paths may join to the same text as the original word;
        # drop those up front instead of re-drawing until one differs.
        replacements = [
            text
            for text in (''.join(item.path) for item in result2)
            if text != select_word
        ]
        if not replacements:
            continue
        error_word = random.choice(replacements)

        # Replace the first occurrence of the selected word in the sentence.
        word_index = sent.find(select_word)
        tail_start = word_index + len(select_word)
        err_sent = sent[:word_index] + error_word + sent[tail_start:]
        ans.append((sent, err_sent))
    return ans
def pinyin_2_hanzi(pinyinList):
    """Convert a pinyin sequence to Hanzi and return a single result.

    Up to 10 candidates are requested (log scores); only the first path
    element of the first candidate is returned.

    :param pinyinList: sequence of pinyin syllables
    :return: ``path[0]`` of the first candidate returned by ``dag``
    """
    from Pinyin2Hanzi import DefaultDagParams, dag

    params = DefaultDagParams()
    candidates = dag(params, pinyinList, path_num=10, log=True)
    # Take the first value only.
    first_candidate = candidates[0]
    return first_candidate.path[0]
def change_word_b(word, path_num=6):
    """Replace ``word`` with a randomly chosen same-pinyin conversion.

    The word is converted to pinyin and handed to ``dag`` (using the
    module-level ``dagParams``); one of the returned candidates is picked at
    random and the first element of its path is returned.

    :param word: text to replace
    :param path_num: number of candidates requested from ``dag``
    :return: first path element of a random candidate, or ``word`` unchanged
        when ``dag`` yields no candidate (e.g. for punctuation such as ',')
    """
    pinyin_list = lazy_pinyin(word)
    result = dag(dagParams, pinyin_list, path_num, log=True)  # a list of candidates
    # An empty result used to be detected through a bare ``except`` around
    # randint(0, -1); test for it explicitly so unrelated errors are not hidden.
    if not result:
        return word
    index = randint(0, len(result) - 1)
    new_word = result[index].path  # e.g. ['疫情']
    return new_word[0]
def pinyin_2_hanzi(pinyinList):
    """Print up to 10 Hanzi conversions of a pinyin sequence with their scores.

    :param pinyinList: sequence of pinyin syllables
    :return: None; each candidate is printed as ``<log score> <joined path>``
    """
    from Pinyin2Hanzi import DefaultDagParams, dag

    params = DefaultDagParams()
    # 10 is the number of candidates requested.
    for candidate in dag(params, pinyinList, path_num=10, log=True):
        print(candidate.score, ''.join(candidate.path))
def pinyin_to_chinese(self, data):
    """Print the Chinese candidates for the given pinyin data.

    :param data: pinyin data passed straight to ``dag``
    :return: None; one ``<score>: <path>`` line is printed per candidate
    """
    params = DefaultDagParams()
    for candidate in dag(params, data, path_num=10, log=True):
        label = str(candidate.score) + ":"
        print(label, candidate.path)
def pinyin_2_hanzi(pinyin_str):
    """Convert a whitespace-separated pinyin string to its top Hanzi candidate.

    :param pinyin_str: pinyin syllables separated by whitespace
    :return: the joined path of the single requested candidate; ``None``
        (after logging the input) when ``dag`` returns nothing
    """
    syllables = pinyin_str.split()
    params = DefaultDagParams()
    # Ask for one candidate only.
    candidates = dag(params, syllables, path_num=1, log=True)
    if not candidates:
        logger.info("转化有误:" + pinyin_str)
        return None
    # Conversion result.
    return ''.join(candidates[0].path)
def pinyin_to_hanzi(pinyin, Topk=5):
    """Print the ``Topk`` best Hanzi conversions of a pinyin sequence.

    Pinyin is ambiguous, so there is no one-to-one mapping to Hanzi; only the
    ``Topk`` highest-probability candidates are requested and printed.

    :param pinyin: sequence of pinyin syllables
    :param Topk: number of candidates to request
    :return: None; each candidate is printed as ``<log score> <text>``
    """
    translator = DefaultDagParams()
    result = dag(translator, pinyin, path_num=Topk, log=True)
    for item in result:
        # Path elements may be UTF-8 byte strings (Python 2) or already text;
        # decode only the former so the code runs on both interpreters.
        pieces = [
            one.decode('utf-8') if isinstance(one, bytes) else one
            for one in item.path
        ]
        # A single parenthesised argument prints the same line whether
        # ``print`` is a statement (Python 2) or a function (Python 3).
        print('%s %s' % (item.score, ''.join(pieces)))
def pinyin_2_hanzi(word):
    """Return up to three same-pinyin conversions of a Chinese word.

    :param word: text to look up
    :return: list with the first path element of each candidate, or the
        string ``"Null"`` when ``Pinyin2Hanzi.is_chinese(word)`` is false
    """
    if not Pinyin2Hanzi.is_chinese(word):
        return "Null"

    syllables = lazy_pinyin(word)
    params = DefaultDagParams()
    candidates = dag(params, syllables, path_num=3, log=True)
    return [candidate.path[0] for candidate in candidates]
def pinyin_2_hanzi(sentences):
    """Round-trip text through pinyin and print the top conversions.

    The input is converted to pinyin with ``lazy_pinyin``, the pinyin list is
    printed, and up to 3 candidates from ``dag`` are printed with their scores.

    :param sentences: text to convert
    :return: None
    """
    from Pinyin2Hanzi import DefaultDagParams, dag

    params = DefaultDagParams()
    syllables = lazy_pinyin(sentences)
    print(syllables)
    # path_num is the number of candidates requested.
    for candidate in dag(params, syllables, path_num=3):
        print(candidate.score, candidate.path)
def get(self):
    """Handle a GET request: convert space-separated pinyin to Hanzi.

    Request arguments:
        word: pinyin syllables separated by single spaces.
        num:  number of candidates to request; parsed as an integer.

    Writes a JSON array containing one ``{"score": ..., "path": [...]}``
    object per candidate returned by ``dag``.

    :raises ValueError: if ``num`` is not an integer literal
    """
    # NOTE(review): a literal "end" token is appended to the pinyin sequence;
    # kept as in the original -- confirm that dag is meant to receive it.
    word = self.get_argument('word') + " end"
    # Request arguments arrive as text, while path_num is a count.
    num = int(self.get_argument('num'))
    pinyin_seq = tuple(word.split(" "))
    print(pinyin_seq)

    candidates = []
    for item in dag(dagparams, pinyin_seq, path_num=num):
        # item.path is a list, so the former set literal
        # ``{item.score, item.path}`` raised TypeError (unhashable), and a set
        # cannot be JSON-encoded anyway; emit a plain mapping instead.
        candidates.append({'score': item.score, 'path': item.path})
        print(item.score, item.path)
    self.write(json.dumps(candidates))
def pinyin_2_hanzi(self, pinyinList):
    """Return the path of the highest-scoring Hanzi conversion.

    Up to 10 candidates are requested from ``dag`` (log scores) and ordered
    by score, highest first.

    :param pinyinList: sequence of pinyin syllables
    :return: path of the top candidate; ``None`` when ranking fails, in which
        case the error and a hint are printed
    """
    params = DefaultDagParams()
    # 10 is the number of candidates requested.
    candidates = dag(params, pinyinList, path_num=10, log=True)
    try:
        scored = [[candidate.score, candidate.path] for candidate in candidates]
        scored.sort(key=itemgetter(0), reverse=True)
        return scored[0][1]
    except Exception as e:
        print(e)
        print("输入异常,请重新输入拼音")
def pinyin2hanzi(pinyin_list):
    '''
    Collect the single-segment Hanzi conversions of several pinyin sequences.

    :param pinyin_list: iterable of pinyin sequences; each is converted on its own
    :return: two-dimensional list of text in which every inner list has at
             most one element (candidates with longer paths are dropped)
    '''
    params = DefaultDagParams()
    entities = []
    for syllables in pinyin_list:
        candidates = dag(params, syllables, path_num=5, log=True)
        # Keep only the conversion results whose path is not longer than 1.
        entities.extend(
            candidate.path for candidate in candidates
            if not len(candidate.path) > 1
        )
    return entities
def _py2hz_dag(self, pinyin_list):
    """Run ``dag`` with a candidate count chosen from the input length.

    Lengths 1, 2 and 3 request 1000, 20 and 10 candidates respectively; any
    other length up to 5 requests 5, up to 7 requests 3, and longer inputs
    request a single candidate.  Log scores are always enabled.

    :param pinyin_list: sequence of pinyin syllables
    :return: whatever ``dag`` returns for the chosen candidate count
    """
    length = len(pinyin_list)
    exact_counts = {1: 1000, 2: 20, 3: 10}
    if length in exact_counts:
        num = exact_counts[length]
    elif length <= 5:
        num = 5
    elif length <= 7:
        num = 3
    else:
        num = 1
    return dag(self._dagparams, pinyin_list, num, True)
def pinyin_to_hanzi(pinyin, Topk=5, Log=True):
    """Convert pinyin to Hanzi and return the ``Topk`` candidates.

    Pinyin is ambiguous, so there is no one-to-one mapping to Hanzi; only the
    ``Topk`` highest-probability candidates are requested.  The input is
    echoed to stdout before conversion.

    :param pinyin: sequence of pinyin syllables
    :param Topk: number of candidates to request
    :param Log: forwarded to ``dag`` as its ``log`` argument
    :return: the candidates returned by ``dag``
    """
    print(pinyin)
    return dag(DefaultDagParams(), pinyin, path_num=Topk, log=Log)
def transfer_pinyin_to_hanzi_by_dag(sets):
    """
    Convert pinyin to Hanzi in DAG mode.

    :param sets: pinyin sequence handed to ``dag``
    :return: path of the last candidate returned (one candidate is requested),
        or ``''`` when there is none
    :raises Exception: wraps any error raised during the conversion
    """
    converted = ''
    try:
        for candidate in dag(dagparams, sets, path_num=1, log=True):
            converted = candidate.path
    except Exception as error:
        raise Exception('error:', error)
    return converted
def pinyin_2_hanzi(pinyin_str):
    '''
    Convert a whitespace-separated pinyin string to its top Hanzi candidate.

    Example:
        zhao qing shi ding hu qu fang di chan xie hui --- 肇庆市鼎湖区房地产协会

    :param pinyin_str: pinyin syllables separated by whitespace
    :return: the joined path of the single requested candidate, or ``''``
             when ``dag`` returns nothing (the input is logged in that case)
    '''
    pinyin_list = pinyin_str.split()
    dagParams = DefaultDagParams()
    # Ask for one candidate only.
    result = dag(dagParams, pinyin_list, path_num=1, log=True)
    if not result:
        # Log before returning: the call used to sit after ``return ''`` and
        # was therefore never executed.
        logger.info("转化有误:" + pinyin_str)
        return ''
    # Conversion result.
    return ''.join(result[0].path)
def get_chengyu(word):
    """Return a random idiom found for a same-pinyin variant of ``word``.

    The word is converted to pinyin, up to three conversions are requested
    from ``dag``, ``find_chengyu`` is called on the first path element of
    each, and one entry of the combined results is picked at random.

    :param word: text to look up
    :return: one randomly chosen result, or the string ``"Null"`` when the
        input is not Chinese or no result was found
    """
    if not Pinyin2Hanzi.is_chinese(word):
        return "Null"

    word_pinyin = lazy_pinyin(word)
    dagParams = DefaultDagParams()
    result = dag(dagParams, word_pinyin, path_num=3, log=True)
    word_list = [item.path[0] for item in result]

    # Flatten the per-word lookups into one pool of candidates.
    idioms = list(chain.from_iterable(
        find_chengyu(avg_word) for avg_word in word_list
    ))
    if not idioms:
        # Previously random.randint(0, -1) raised ValueError here; reuse the
        # sentinel already returned for the other "nothing found" case.
        return "Null"
    return random.choice(idioms)
def pinyin_2_hanzi(pinyinList):
    """Return the Hanzi conversions of a pinyin sequence as joined strings.

    :param pinyinList: sequence of pinyin syllables
    :return: list of strings, one per candidate, each the concatenation of
        that candidate's path
    """
    from Pinyin2Hanzi import DefaultDagParams, dag

    params = DefaultDagParams()
    # path_num is the number of candidates requested (500000 here).
    candidates = dag(params, pinyinList, path_num=500000, log=True)
    converted = []
    for candidate in candidates:
        converted.append(''.join(candidate.path))
    return converted
def get_word_by_pinyin(s0):
    """Return the words that share the pinyin of ``s0``.

    ``get_pinyin(s0)`` is passed to ``dag`` together with the module-level
    ``__param`` and ``PATH_NUM``.

    :param s0: input handed to ``get_pinyin``
    :return: list of strings, each the joined path of one candidate
    """
    candidates = dag(__param, get_pinyin(s0), path_num=PATH_NUM)
    return [''.join(candidate.path) for candidate in candidates]
# coding: utf-8 from __future__ import (print_function, unicode_literals) import sys sys.path.append('..') from Pinyin2Hanzi import DefaultDagParams from Pinyin2Hanzi import dag dagparams = DefaultDagParams() result = dag(dagparams, ['wo']) for item in result: print(item.score, '/'.join(item.path)) print(20 * '*') result = dag(dagparams, ['ni', 'hao']) for item in result: print(item.score, '/'.join(item.path)) print(20 * '*') result = dag(dagparams, ['ni', 'bu', 'zhi', 'dao', 'de', 'shi']) for item in result: print(item.score, '/'.join(item.path)) print(20 * '*') result = dag(dagparams, ['ni', 'bu', 'zhi', 'dao', 'de', 'shi'], path_num=2,
# coding: utf-8 from __future__ import (print_function, unicode_literals) import sys sys.path.append('..') from Pinyin2Hanzi import DefaultDagParams from Pinyin2Hanzi import dag dagparams = DefaultDagParams() print( dag(dagparams, [u'ti', u'chu', u'le', u'jie', u'jve', u'fang', u'an'], path_num=1)) print(dag(dagparams, [u'ti', u'chu', u'le'], path_num=1)) print(dag(dagparams, ['jie', 'jve', 'fang', 'an'], path_num=1)) print(dag(dagparams, ['jie', 'jve'], path_num=1)) print(dag(dagparams, ['fang', 'an'], path_num=1))
# coding: utf-8 from __future__ import (print_function, unicode_literals) import sys sys.path.append('..') from Pinyin2Hanzi import DefaultDagParams from Pinyin2Hanzi import dag dagparams = DefaultDagParams() result = dag(dagparams, ['wo']) for item in result: print(item.score, '/'.join(item.path)) print(20*'*') result = dag(dagparams, ['ni', 'hao']) for item in result: print(item.score, '/'.join(item.path)) print(20*'*') result = dag(dagparams, ['ni', 'bu', 'zhi', 'dao', 'de', 'shi']) for item in result: print(item.score, '/'.join(item.path)) print(20*'*') result = dag(dagparams, ['ni', 'bu', 'zhi', 'dao', 'de', 'shi'], path_num=2, log=True)
# coding: utf-8 from __future__ import (print_function, unicode_literals) import sys sys.path.append('..') from Pinyin2Hanzi import DefaultDagParams from Pinyin2Hanzi import dag dagparams = DefaultDagParams() print( dag(dagparams, [u'ti', u'chu', u'le', u'jie', u'jve', u'fang', u'an'], path_num=1) ) print( dag(dagparams, [u'ti', u'chu', u'le'], path_num=1) ) print( dag(dagparams, ['jie', 'jve', 'fang', 'an'], path_num=1) ) print( dag(dagparams, ['jie', 'jve'], path_num=1) ) print( dag(dagparams, ['fang', 'an'], path_num=1) )
try:
    # Python 2 only: make UTF-8 the process-wide default encoding.
    reload(sys)
    sys.setdefaultencoding('utf-8')
except (NameError, AttributeError):
    # Python 3 has neither a builtin ``reload`` nor ``sys.setdefaultencoding``.
    pass

if __name__ == '__main__':
    # Input: one "<hanzi>\t<pinyin joined by '#'>" record per line.
    data_dir = './original_corpus.txt'
    # Output: the input line followed by "\t<conversion>:<score>" per candidate.
    dst_fpath = './pinyin2hanzi_dag.txt'

    dagparams = DefaultDagParams()

    with open(dst_fpath, 'w') as f_write, open(data_dir, 'r') as f_read:
        for line in f_read:
            line = line.strip('\n')
            try:
                # Parsing happens inside the try block so that a malformed
                # line (no tab) is reported and copied through unchanged
                # instead of aborting the whole run with IndexError.
                pinyin = line.split('\t')[1]
                # Two candidates per record.
                result = dag(dagparams, pinyin.split('#'), path_num=2)
                for item in result:
                    line = line + '\t' + ''.join(item.path) + ':' + str(item.score)
            except Exception as e:
                print(e)
            finally:
                # Always emit the (possibly only partially extended) line.
                f_write.write(line + '\n')
from Pinyin2Hanzi import DefaultDagParams
from Pinyin2Hanzi import dag

dagparams = DefaultDagParams()

## Up to 3 candidates for the single syllable 'pin'.
# The trailing comma matters: ('pin') is just the string 'pin', whereas dag
# expects a sequence of syllables, so a one-element tuple is passed.
result = dag(dagparams, ('pin',), path_num=3)
for item in result:
    print(item.score, item.path)
# NOTE: the sample outputs kept in the string blocks below appear to belong to
# the commented-out ('ni', 'bu', 'zhi', 'dao', 'de', 'shi') queries, not to
# the 'pin' query above.
''' 输出
0.08117536840088911 ['你不知道', '的是']
0.04149191639287887 ['你不知道', '的诗']
'''

## 2 candidates, scored with logarithms
# result = dag(dagparams, ('ni', 'bu', 'zhi', 'dao', 'de', 'shi'), path_num=2, log=True)
# for item in result:
#     print(item.score, item.path)
''' 输出
-2.5111434226494866 ['你不知道', '的是']
-3.1822566564324477 ['你不知道', '的诗']
'''

## 1 candidate
# print( dag(dagparams, ['ti', 'chu', 'le', 'bu', 'cuo', 'de', 'jie', 'jve', 'fang', 'an'], path_num=1) )
'''输出
[< score=0.0017174549839096384, path=['提出了', '不错', '的', '解决方案'] >]
'''
change_word = raw_word alpha = 1 beta = 1 for i in range(len(raw_word)): ch = raw_word[i] chacha = model.predict(ch, k=2) if (chacha[0][0] == '__label__辱骂'): P1 = chacha[1][0] else: P1 = chacha[1][1] pinyin = lazy_pinyin(ch) max_point = 0 change_ch = ch result = dag(dagParams, pinyin, path_num=5, log=True) for j in range(5): if len(result) > j: ss = "" for c in result[j].path: ss = c chacha = model.predict(ss, k=2) if (chacha[0][0] == '__label__辱骂'): P2 = chacha[1][0] else: P2 = chacha[1][1] if ss == ch: continue four_dis = distance([ch], [ss]) all_dis = 3.0 / 14 * ( 1 - four_dis['normalized_levenshtein'][0]) + 1.0 / 7 * ( 1 - four_dis['jaccard_word'][0]) + 3.0 / 14 * (