def process_hanzipinyin(emission):  ## ./hanzipinyin.txt
    print('read from hanzipinyin.txt')
    with codecs.open(HANZI2PINYIN_FILE, 'r', 'utf-8') as fin:
        with codecs.open("data/dictionary.txt", 'w', 'utf-8') as fout:
            while True:
                line = fin.readline()
                if not line:
                    break
                line = line.strip()
                if '=' not in line:
                    continue
                hanzi, pinyins = line.split('=')
                pinyins = pinyins.split(',')
                pinyins = [util.simplify_pinyin(py) for py in pinyins]
                pnyn = ",".join(pinyins)
                fout.write(u"{}\t{}\n".format(hanzi, pnyn))
                for pinyin in pinyins:
                    emission.setdefault(hanzi, {})
                    emission[hanzi].setdefault(pinyin, 0)
                    emission[hanzi][pinyin] += 1
def gen_emission():
    """
    base_emission = {}  #> {'泥': {'ni': 1.0}, '了': {'liao': 0.5, 'le': 0.5}}
    """
    data = {'default': 1.e-200, 'data': None}
    emission = readdatafromfile(BASE_EMISSION_FILE)

    # Count hanzi -> pinyin occurrences from the hanzi=pinyin1,pinyin2,... file.
    for line in open(HZ2PY_FILE):
        line = util.as_text(line.strip())
        hanzi, pinyin_list = line.split('=')
        pinyin_list = [
            util.simplify_pinyin(item.strip())
            for item in pinyin_list.split(',')
        ]
        char_list = [hanzi] * len(pinyin_list)
        for hanzi, pinyin in zip(char_list, pinyin_list):
            emission.setdefault(hanzi, {})
            emission[hanzi].setdefault(pinyin, 0.)
            emission[hanzi][pinyin] += 1.

    # Normalize counts per hanzi and store them as log probabilities.
    for hanzi in emission:
        num_sum = 0.
        for pinyin in emission[hanzi]:
            num_sum += emission[hanzi][pinyin]
        for pinyin in emission[hanzi]:
            emission[hanzi][pinyin] = round(
                math.log(emission[hanzi][pinyin] / num_sum), 6)

    data['default'] = round(math.log(1.e-200), 6)
    data['data'] = emission

    writejson2file(data, FIN_EMISSION_FILE)
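# Illustration only (not part of the pipeline): the normalization step in
# gen_emission turns raw pinyin counts per hanzi into log probabilities.
# A minimal, self-contained sketch of that transform, using made-up counts:
import math

def normalize_to_log(emission):
    """Convert {hanzi: {pinyin: count}} into {hanzi: {pinyin: log(count / total)}}."""
    for hanzi, counts in emission.items():
        total = sum(counts.values())
        for pinyin in counts:
            counts[pinyin] = round(math.log(counts[pinyin] / total), 6)
    return emission

# Example: '了' seen once as 'le' and once as 'liao' -> log(0.5) for each.
print(normalize_to_log({'了': {'le': 1., 'liao': 1.}}))
# {'了': {'le': -0.693147, 'liao': -0.693147}}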
def process_hanzipinyin(emission):  ## ./hanzipinyin.txt
    print('read from hanzipinyin.txt')
    for line in open(HANZI2PINYIN_FILE):
        line = util.as_text(line.strip())
        if '=' not in line:
            continue
        hanzi, pinyins = line.split('=')
        pinyins = pinyins.split(',')
        pinyins = [util.simplify_pinyin(py) for py in pinyins]
        for pinyin in pinyins:
            emission.setdefault(hanzi, {})
            emission[hanzi].setdefault(pinyin, 0)
            emission[hanzi][pinyin] += 1
def topinyin(s):
    """
    s consists entirely of hanzi (Chinese characters)
    """
    s = util.as_text(s)
    py_list = pypinyin.lazy_pinyin(s)
    result = []
    for py in py_list:
        py = util.as_text(py)
        if py == '〇':
            result.append('ling')
        else:
            result.append(util.simplify_pinyin(py))
    return result
def topinyin(s):
    """
    s consists entirely of hanzi (Chinese characters)
    """
    s = util.as_text(s)
    py_list = PinyinHelper.convertToPinyinFromSentence(s)
    result = []
    for py in py_list:
        py = util.as_text(py)
        if py == '〇':
            result.append('ling')
        else:
            result.append(util.simplify_pinyin(py))
    # A ',' in the joined result means some character yielded more than one
    # candidate reading; report it and abort.
    if ',' in ''.join(result):
        print(s)
        print(''.join(result))
        sys.exit()
    return result
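# Usage sketch for the pypinyin-based topinyin above (assumes util.as_text and
# util.simplify_pinyin behave as in the rest of this repo; output shown for
# pypinyin's default tone-less style):
import pypinyin

print(pypinyin.lazy_pinyin(u'中国'))   # ['zhong', 'guo']
# topinyin(u'中国') then passes each syllable through util.simplify_pinyin,
# and maps the numeral character '〇' to 'ling' explicitly.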
ALL_STATES_FILE = 'data/all_states.txt'              # hanzi (hidden states)
ALL_OBSERVATIONS_FILE = 'data/all_observations.txt'  # pinyin (observations)
PINYIN2HANZI_FILE = 'data/pinyin2hanzi.txt'

states = set()
observations = set()
py2hz = {}

with codecs.open(SOURCE_FILE, 'r', 'utf-8') as fin:
    while True:
        line = fin.readline().strip()
        if not line:
            break
        hanzi, pinyin_list = line.split('=')
        pinyin_list = [
            util.simplify_pinyin(item.strip())
            for item in pinyin_list.split(',')
        ]
        states.add(hanzi)
        for pinyin in pinyin_list:
            observations.add(pinyin)
            py2hz.setdefault(pinyin, set())
            py2hz[pinyin].add(hanzi)
            # shengmu (initial)
            shengmu = util.get_shengmu(pinyin)
            if shengmu is not None:
                py2hz.setdefault(shengmu, set())
                py2hz[shengmu].add(hanzi)
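# Illustration only: the py2hz index above keys candidate hanzi by both the
# full pinyin and its initial (shengmu), so a bare initial like 'zh' can also
# retrieve candidates. A minimal sketch; get_shengmu here is a hypothetical
# stand-in for the repo's util.get_shengmu:
SHENGMU = ('zh', 'ch', 'sh', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l',
           'g', 'k', 'h', 'j', 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w')

def get_shengmu(pinyin):
    """Return the initial of a pinyin syllable, or None if it has none."""
    for sm in SHENGMU:
        if pinyin.startswith(sm):
            return sm
    return None

py2hz_demo = {}
for hanzi, pinyin in [(u'中', 'zhong'), (u'重', 'zhong'), (u'国', 'guo')]:
    for key in (pinyin, get_shengmu(pinyin)):
        if key:
            py2hz_demo.setdefault(key, set()).add(hanzi)

print(sorted(py2hz_demo['zh']))  # ['中', '重'] -- lookup by initial alone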
SOURCE_FILE = '../data/train/original/hanzipinyin.txt'
ALL_STATES_FILE = '../data/train/result/all_states.txt'              # hanzi (hidden states)
ALL_OBSERVATIONS_FILE = '../data/train/result/all_observations.txt'  # pinyin (observations)
PINYIN2HANZI_FILE = '../data/train/result/pinyin2hanzi.txt'

states = set()
observations = set()
py2hz = {}

for line in open(SOURCE_FILE):
    line = util.as_text(line.strip())
    hanzi, pinyin_list = line.split('=')
    pinyin_list = [
        util.simplify_pinyin(item.strip())
        for item in pinyin_list.split(',')
    ]
    states.add(hanzi)
    for pinyin in pinyin_list:
        observations.add(pinyin)
        py2hz.setdefault(pinyin, set())
        py2hz[pinyin].add(hanzi)
        # shengmu (initial)
        shengmu = util.get_shengmu(pinyin)
        if shengmu is not None:
            py2hz.setdefault(shengmu, set())
            py2hz[shengmu].add(hanzi)

with open(ALL_STATES_FILE, 'w') as out: