def topinyin(s):
    """Convert a string of Chinese characters to a list of pinyin syllables.

    '〇' has no entry in the lookup table, so it is mapped to 'ling'
    explicitly; every other syllable is normalized via util.simplify_pinyin.
    """
    text = util.as_text(s)
    result = []
    for py in pypinyin.lazy_pinyin(text):
        py = util.as_text(py)
        result.append('ling' if py == '〇' else util.simplify_pinyin(py))
    return result
def gen_emission():
    """Build the final emission matrix and write it to FIN_EMISSION_FILE.

    Starts from the base emission counts (BASE_EMISSION_FILE), adds one
    count per (hanzi, pinyin) pair listed in ./hanzipinyin.txt, then
    normalizes each hanzi's counts into a probability distribution, e.g.
    {'泥': {'ni': 1.0}, '了': {'liao': 0.5, 'le': 0.5}}.
    """
    data = {'default': 1.e-200, 'data': None}
    emission = readdatafromfile(BASE_EMISSION_FILE)

    # Accumulate raw counts from hanzipinyin.txt (lines: "hanzi=py1,py2,...").
    # `with` + explicit utf8 encoding: the original leaked the file handle and
    # relied on the platform default encoding (other readers here use utf8).
    with open('./hanzipinyin.txt', encoding='utf8') as fin:
        for line in fin:
            line = util.as_text(line.strip())
            hanzi, pinyin_list = line.split('=')
            pinyin_list = [util.simplify_pinyin(item.strip())
                           for item in pinyin_list.split(',')]
            # The original zipped [hanzi]*len(pinyin_list) with pinyin_list;
            # iterating the pinyins directly is equivalent.
            for pinyin in pinyin_list:
                emission.setdefault(hanzi, {})
                emission[hanzi].setdefault(pinyin, 0.)
                emission[hanzi][pinyin] += 1.

    # Normalize each hanzi's counts to a probability distribution.
    for hanzi in emission:
        num_sum = sum(emission[hanzi].values())
        for pinyin in emission[hanzi]:
            emission[hanzi][pinyin] = emission[hanzi][pinyin] / num_sum

    data['data'] = emission
    writejson2file(data, FIN_EMISSION_FILE)
def read_from_sentence_txt(start, emission, transition):
    """Accumulate HMM counts from SENTENCE_FILE (./result/sentence.txt).

    Mutates the three dicts in place:
      start:      sentence-initial character counts
      emission:   hanzi -> {pinyin: count}
      transition: hanzi -> {next-hanzi: count}
    Lines shorter than 2 chars, or containing any non-Chinese character,
    are skipped entirely.
    """
    print('read from sentence.txt')
    # `with` ensures the handle is closed; the original leaked it.
    with open(SENTENCE_FILE, encoding='utf8') as fin:
        for line in fin:
            line = util.as_text(line.strip())
            if len(line) < 2:
                continue
            if not util.is_chinese(line):
                continue

            ## for start
            start.setdefault(line[0], 0)
            start[line[0]] += 1

            ## for emission — pair each character with its pinyin
            pinyin_list = topinyin(line)
            for hanzi, pinyin in zip(line, pinyin_list):
                emission.setdefault(hanzi, {})
                emission[hanzi].setdefault(pinyin, 0)
                emission[hanzi][pinyin] += 1

            ## for transition — adjacent-character bigrams
            for f, t in zip(line[:-1], line[1:]):
                transition.setdefault(f, {})
                transition[f].setdefault(t, 0)
                transition[f][t] += 1
def gen_emission():
    """Build the final emission matrix and write it to FIN_EMISSION_FILE.

    Starts from the base emission counts (BASE_EMISSION_FILE), adds one
    count per (hanzi, pinyin) pair listed in ./hanzipinyin.txt, then
    normalizes each hanzi's counts into a probability distribution, e.g.
    {'泥': {'ni': 1.0}, '了': {'liao': 0.5, 'le': 0.5}}.
    """
    data = {'default': 1.e-200, 'data': None}
    emission = readdatafromfile(BASE_EMISSION_FILE)

    # Accumulate raw counts from hanzipinyin.txt (lines: "hanzi=py1,py2,...").
    # `with` + explicit utf8 encoding: the original leaked the file handle and
    # relied on the platform default encoding (other readers here use utf8).
    with open('./hanzipinyin.txt', encoding='utf8') as fin:
        for line in fin:
            line = util.as_text(line.strip())
            hanzi, pinyin_list = line.split('=')
            pinyin_list = [util.simplify_pinyin(item.strip())
                           for item in pinyin_list.split(',')]
            # The original zipped [hanzi]*len(pinyin_list) with pinyin_list;
            # iterating the pinyins directly is equivalent.
            for pinyin in pinyin_list:
                emission.setdefault(hanzi, {})
                emission[hanzi].setdefault(pinyin, 0.)
                emission[hanzi][pinyin] += 1.

    # Normalize each hanzi's counts to a probability distribution.
    for hanzi in emission:
        num_sum = sum(emission[hanzi].values())
        for pinyin in emission[hanzi]:
            emission[hanzi][pinyin] = emission[hanzi][pinyin] / num_sum

    data['data'] = emission
    writejson2file(data, FIN_EMISSION_FILE)
def read_from_sentence_txt(start, emission, transition):
    """Accumulate HMM counts from SENTENCE_FILE (./result/sentence.txt).

    Mutates the three dicts in place:
      start:      sentence-initial character counts
      emission:   hanzi -> {pinyin: count}
      transition: hanzi -> {next-hanzi: count}
    Lines shorter than 2 chars, or containing any non-Chinese character,
    are skipped entirely.
    """
    print('read from sentence.txt')
    # Fixes: close the handle via `with`, and read as utf8 explicitly —
    # the sibling revision of this function already passes encoding='utf8';
    # this one relied on the platform default.
    with open(SENTENCE_FILE, encoding='utf8') as fin:
        for line in fin:
            line = util.as_text(line.strip())
            if len(line) < 2:
                continue
            if not util.is_chinese(line):
                continue

            ## for start
            start.setdefault(line[0], 0)
            start[line[0]] += 1

            ## for emission — pair each character with its pinyin
            pinyin_list = topinyin(line)
            for hanzi, pinyin in zip(line, pinyin_list):
                emission.setdefault(hanzi, {})
                emission[hanzi].setdefault(pinyin, 0)
                emission[hanzi][pinyin] += 1

            ## for transition — adjacent-character bigrams
            for f, t in zip(line[:-1], line[1:]):
                transition.setdefault(f, {})
                transition[f].setdefault(t, 0)
                transition[f][t] += 1
def topinyin(s):
    """Convert a sentence of Chinese characters to a list of pinyin syllables.

    '〇' is mapped to 'ling' explicitly; every other syllable is normalized
    with util.simplify_pinyin. If any resulting syllable contains a comma,
    the converter returned multiple candidate readings for one character;
    diagnostics are printed and the process exits so the data can be fixed.
    """
    text = util.as_text(s)
    result = []
    for py in PinyinHelper.convertToPinyinFromSentence(text):
        py = util.as_text(py)
        result.append('ling' if py == '〇' else util.simplify_pinyin(py))
    joined = ''.join(result)
    if ',' in joined:
        print(s)
        print(joined)
        sys.exit()
    return result
def process_hanzipinyin(emission):
    """Add one emission count per (hanzi, pinyin) pair from HANZI2PINYIN_FILE.

    Lines have the form "hanzi=py1,py2,..."; lines without '=' are skipped.
    Mutates `emission` (hanzi -> {pinyin: count}) in place.
    """
    print('read from hanzipinyin.txt')
    # `with` ensures the handle is closed; the original leaked it.
    with open(HANZI2PINYIN_FILE, 'r', encoding='utf8') as fin:
        for line in fin:
            line = util.as_text(line.strip())
            if '=' not in line:
                continue
            hanzi, pinyins = line.split('=')
            for py in pinyins.split(','):
                pinyin = util.simplify_pinyin(py)
                emission.setdefault(hanzi, {})
                emission[hanzi].setdefault(pinyin, 0)
                emission[hanzi][pinyin] += 1
def gen_py2hz():
    """Build the pinyin -> candidate-hanzi table and write FIN_PY2HZ_FILE.

    PY2HZ_FILE lines have the form "pinyin=hanzis"; any other shape raises.
    Entries with an empty key or value are silently skipped.

    Raises:
        Exception: if a line does not split into exactly two '='-parts.
    """
    data = {}
    # `with` + explicit utf8: the original leaked the handle and used the
    # platform default encoding (other readers in this file use utf8).
    with open(PY2HZ_FILE, encoding='utf8') as fin:
        for line in fin:
            line = util.as_text(line.strip())
            ls = line.split('=')
            if len(ls) != 2:
                raise Exception('invalid format')
            py = ls[0].strip()
            chars = ls[1].strip()
            if py and chars:
                data[py] = chars
    writejson2file(data, FIN_PY2HZ_FILE)
def process_hanzipinyin(emission):
    """Add one emission count per (hanzi, pinyin) pair from HANZI2PINYIN_FILE.

    Lines have the form "hanzi=py1,py2,..."; lines without '=' are skipped.
    Mutates `emission` (hanzi -> {pinyin: count}) in place.
    """
    print('read from hanzipinyin.txt')
    # Fixes: close the handle via `with`, and read as utf8 explicitly —
    # the sibling revision of this function already passes encoding='utf8'.
    with open(HANZI2PINYIN_FILE, encoding='utf8') as fin:
        for line in fin:
            line = util.as_text(line.strip())
            if '=' not in line:
                continue
            hanzi, pinyins = line.split('=')
            for py in pinyins.split(','):
                pinyin = util.simplify_pinyin(py)
                emission.setdefault(hanzi, {})
                emission[hanzi].setdefault(pinyin, 0)
                emission[hanzi][pinyin] += 1
def gen_py2hz():
    """Build the pinyin -> candidate-hanzi table and write FIN_PY2HZ_FILE.

    PY2HZ_FILE lines have the form "pinyin=hanzis"; any other shape raises.
    Entries with an empty key or value are silently skipped.

    Raises:
        Exception: if a line does not split into exactly two '='-parts.
    """
    data = {}
    # `with` + explicit utf8: the original leaked the handle and used the
    # platform default encoding (other readers in this file use utf8).
    with open(PY2HZ_FILE, encoding='utf8') as fin:
        for line in fin:
            line = util.as_text(line.strip())
            ls = line.split('=')
            if len(ls) != 2:
                raise Exception('invalid format')
            py = ls[0].strip()
            chars = ls[1].strip()
            if py and chars:
                data[py] = chars
    writejson2file(data, FIN_PY2HZ_FILE)
def extract_chinese_sentences(content):
    """Split text into maximal runs of consecutive Chinese characters.

    Spaces and tabs are removed up front; every remaining non-Chinese
    character acts as a sentence boundary. Runs whose stripped length
    is <= 1 are dropped from the result.
    """
    text = util.as_text(content).replace(' ', '').replace('\t', '')
    sentences = []
    current = ''
    for ch in text:
        if util.is_chinese(ch):
            current += ch
        else:
            sentences.append(current)
            current = ''
    sentences.append(current)  # flush the trailing run
    return [s.strip() for s in sentences if len(s.strip()) > 1]
def read_from_word_txt(start, emission, transition):
    """Accumulate frequency-weighted HMM counts from WORD_FILE.

    Lines have the form "word=frequency". Each word's frequency is scaled
    down by _base and floored at _min_value so rare words still contribute.
    Mutates start/emission/transition in place (same shapes as
    read_from_sentence_txt). Malformed lines and non-Chinese words are
    skipped.
    """
    print('read from word.txt')
    _base = 1000.
    _min_value = 2.
    # `with` ensures the handle is closed; the original leaked it.
    # NOTE(review): opened in binary on purpose? util.as_text decodes each
    # line, so behavior is preserved — confirm before switching to text mode.
    with open(WORD_FILE, 'rb') as fin:
        for line in fin:
            line = util.as_text(line.strip())
            if '=' not in line:
                continue
            if len(line) < 3:
                continue
            ls = line.split('=')
            if len(ls) != 2:
                continue
            word, num = ls
            word = word.strip()
            num = num.strip()
            if len(num) == 0:
                continue
            # Scale down, but never below the floor.
            num = max(_min_value, float(num) / _base)
            if not util.is_chinese(word):
                continue

            ## for start
            start.setdefault(word[0], 0)
            start[word[0]] += num

            ## for emission — pair each character with its pinyin
            pinyin_list = topinyin(word)
            for hanzi, pinyin in zip(word, pinyin_list):
                emission.setdefault(hanzi, {})
                emission[hanzi].setdefault(pinyin, 0)
                emission[hanzi][pinyin] += num

            ## for transition — adjacent-character bigrams
            for f, t in zip(word[:-1], word[1:]):
                transition.setdefault(f, {})
                transition[f].setdefault(t, 0)
                transition[f][t] += num
def read_from_word_txt(start, emission, transition):
    """Accumulate frequency-weighted HMM counts from WORD_FILE.

    Lines have the form "word=frequency". Each word's frequency is scaled
    down by _base and floored at _min_value so rare words still contribute.
    Mutates start/emission/transition in place (same shapes as
    read_from_sentence_txt). Malformed lines and non-Chinese words are
    skipped.
    """
    print('read from word.txt')
    _base = 1000.
    _min_value = 2.
    # Fixes: close the handle via `with`, and read as utf8 explicitly
    # instead of relying on the platform default encoding.
    with open(WORD_FILE, encoding='utf8') as fin:
        for line in fin:
            line = util.as_text(line.strip())
            if '=' not in line:
                continue
            if len(line) < 3:
                continue
            ls = line.split('=')
            if len(ls) != 2:
                continue
            word, num = ls
            word = word.strip()
            num = num.strip()
            if len(num) == 0:
                continue
            # Scale down, but never below the floor.
            num = max(_min_value, float(num) / _base)
            if not util.is_chinese(word):
                continue

            ## for start
            start.setdefault(word[0], 0)
            start[word[0]] += num

            ## for emission — pair each character with its pinyin
            pinyin_list = topinyin(word)
            for hanzi, pinyin in zip(word, pinyin_list):
                emission.setdefault(hanzi, {})
                emission[hanzi].setdefault(pinyin, 0)
                emission[hanzi][pinyin] += num

            ## for transition — adjacent-character bigrams
            for f, t in zip(word[:-1], word[1:]):
                transition.setdefault(f, {})
                transition[f].setdefault(t, 0)
                transition[f][t] += num
# Fragment from inside a larger routine: `data`, `result`, and
# `pinyin2hanzi_file` are defined in enclosing code not visible here.
# Merges two sources into `result` (pinyin -> {hanzi: count}) while
# tracking the global max/min counts seen.
max_num = 0.
min_num = 100000000000000.  # sentinel larger than any expected count

# Pass 1: fold `data` (hanzi -> {pinyin: num}) into `result`, inverting
# the mapping so pinyin becomes the outer key.
for hanzi in data:
    for pinyin in data[hanzi]:
        # NOTE(review): `pinyin` is simplified BEFORE the lookup below;
        # if data's keys are not already simplified, data[hanzi][pinyin]
        # can KeyError (or read a different entry). Presumably the keys
        # are pre-simplified — confirm against the producer of `data`.
        pinyin = util.simplify_pinyin(pinyin)
        num = data[hanzi][pinyin]
        key = pinyin
        result.setdefault(key, {})
        result[key].setdefault(hanzi, 0)
        result[key][hanzi] += num
        max_num = max(max_num, result[key][hanzi])
        min_num = min(min_num, result[key][hanzi])

# Pass 2: add one count per character from pinyin2hanzi_file
# (lines: "pinyin=hanzis"); malformed or empty lines are skipped.
for line in open(pinyin2hanzi_file):
    line = util.as_text(line.strip())
    if '=' not in line:
        continue
    pinyin, chars = line.split('=')
    if len(pinyin) == 0 or len(chars) == 0:
        continue
    pinyin = util.simplify_pinyin(pinyin)
    for hanzi in chars:
        key = pinyin
        result.setdefault(key, {})
        result[key].setdefault(hanzi, 0)
        result[key][hanzi] += 1.
        max_num = max(max_num, result[key][hanzi])
        min_num = min(min_num, result[key][hanzi])
def read_from_all_pinyin(all_pinyin):
    """Append every stripped line of PINYIN_FILE to the all_pinyin list.

    Mutates `all_pinyin` in place; returns None.
    """
    print("begin read from all_pinyin.txt")
    # `with` ensures the handle is closed; the original leaked it.
    with open(PINYIN_FILE, encoding='utf8') as fin:
        for line in fin:
            all_pinyin.append(util.as_text(line.strip()))