import codecs
import json
import math
import re
from optparse import OptionParser

# HMM, Trie, SpecialProcess, load_data, DICFILE and BIDICFILE are assumed to be
# defined or imported elsewhere in the project.


def main():
    parser = OptionParser()
    parser.add_option("-t", dest="test", help="test data directory")
    parser.add_option("-m", dest="model", help="model data filename to save")
    (options, args) = parser.parse_args()
    if not options.model:
        parser.error("need model data filename (-m)")
    hmm = HMM()
    hmm.load(options.model)
    if options.test:
        tests = load_data(options.test)
        for x in tests:
            print(list(zip(x, hmm.Viterbi(hmm.words2id(x)))))
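
# Invocation sketch for main() above (illustrative; the script and file names
# are assumptions, not taken from the original code):
#
#     python hmm_main.py -m files/model.json -t files/test/
#
# -m restores a previously saved HMM model via hmm.load(); -t points at the
# test data, and each sentence is printed zipped with its Viterbi state tags.
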
class Bigram:
    def __init__(self):
        # TODO: check the choice of smoothing on the test set
        self.minfreq = -3.14e+100
        # Build the trie used to scan the full-segmentation DAG
        self.trie = Trie()
        self.construct_trie()
        # Build the bigram dictionary
        # self.construct_bigram_dic()
        # Load the pre-built bigram dictionary instead
        with open('files/bigram_dic.json', 'r') as f:
            self.bigram_dic = json.load(f)
        # Special-token post-processing
        self.SP = SpecialProcess()
        # HMM segmentation model
        self.hmm = HMM()
        # Load common given-name characters and surnames
        self.get_second_names()
        self.get_first_name()

    # Build the trie
    def construct_trie(self, dic_file=DICFILE):
        with codecs.open(dic_file, 'r', 'gbk') as f:
            d = f.read()
        text = d.split('\r\n')
        # unigram
        # self.unigram = {}
        # unigram_time = 0
        self.words_num = len(text)
        for line in text:
            if line != "":
                words = line.split(" ")
                # self.unigram[words[0]] = int(words[1])
                # unigram_time += int(words[1])
                self.trie.add(words[0])
        # Word-frequency dictionary
        # for key in self.unigram.keys():
        #     self.unigram[key] = math.log(self.unigram.get(key) / unigram_time)

    # Build the bigram dictionary
    def construct_bigram_dic(self, seg_file=BIDICFILE):
        with codecs.open(seg_file, 'r', 'gbk') as f:
            text = f.read()
        lines = text.split('\r\n')
        seg_lists = []
        # Extract the segmented words line by line
        for line in lines:
            # Skip blank lines (and anything else that is not a corpus line)
            pattern = re.compile(r'^199801')
            # No match: go to the next line
            if not re.match(pattern, line):
                continue
            # Match every word/tag token in the line, including tokens of the form [word/tag
            regex = r'\s[^\s^/]+/\w+'
            segs = re.findall(regex, line)
            # Clean up the matched tokens
            seg_list = []
            for seg in segs:
                # Strip a possible '[' and the leading space of the match
                s = seg.replace('[', '')[1:]
                word = s.split('/')[0]
                # Collect all words of this line
                seg_list.append(word)
            # Insert BOS at the front
            seg_list.insert(0, "^")
            # Append EOS at the end
            seg_list.append("$")
            # Keep the segmentation of this line
            seg_lists.append(seg_list)
        # Build the bigram dictionary
        self.bigram_dic = {}
        # For each line
        for seg_list in seg_lists:
            # Walk the words of the line, starting from the second one (the first is BOS)
            for i in range(1, len(seg_list)):
                # First occurrence of this word
                if seg_list[i] not in self.bigram_dic:
                    self.bigram_dic[seg_list[i]] = {}
                    # Record the preceding word
                    self.bigram_dic[seg_list[i]][seg_list[i - 1]] = 1
                else:
                    self.bigram_dic[seg_list[i]][seg_list[i - 1]] = \
                        self.bigram_dic[seg_list[i]].get(seg_list[i - 1], 0) + 1
        # Convert counts to probabilities and take logarithms
        for key1 in self.bigram_dic.keys():
            sigma = 1e-7
            sum_freq = 0
            for key2 in self.bigram_dic.get(key1).keys():
                sum_freq += self.bigram_dic[key1].get(key2)
            # Estimate c(w_{i-1} w_i) / c(w_i) with add-sigma smoothing
            for key2 in self.bigram_dic.get(key1).keys():
                self.bigram_dic[key1][key2] = math.log(
                    (self.bigram_dic[key1].get(key2) + sigma) /
                    (sum_freq + sigma * self.words_num))
            # add(sigma)
            temp = math.log(sigma / (sum_freq + self.words_num))
            if self.minfreq > temp:
                self.minfreq = temp
        # with open('bigram_dic.json', 'w') as f:
        #     json.dump(self.bigram_dic, f)
        # print(self.minfreq)

    # Build the full-segmentation DAG
    def construct_DAG(self, sentence):
        # {key: list}
        self.DAG = {}
        # ^ - $
        for i in range(1, len(sentence) - 1):
            # Words starting at position i
            self.DAG[i] = self.trie.scan(sentence[i:-1], i)
        # Add EOS and BOS
        self.DAG[len(sentence) - 1] = [len(sentence) - 1]
        self.DAG[0] = [0]

    def dp_search(self, sentence):
        # prob max
        viterbi = {}
        for i in range(len(sentence)):
            viterbi[i] = {}
        # { i: { end1: (prob, next), end2: (prob, next) } }
        viterbi[len(sentence) - 1][len(sentence) - 1] = (0., len(sentence))
        # Backward DP
        for i in range(len(sentence) - 2, -1, -1):
            # For each word starting at position i, find the best continuation
            for x in self.DAG[i]:
                # log P(w_{x+1}..w_y | w_i..w_x) + viterbi[x+1][y][0]
                prob_index = max(
                    (self.bigram_dic.get(sentence[x + 1:y + 1], {}).get(
                        sentence[i:x + 1], self.minfreq) +
                     viterbi.get(x + 1)[y][0], y)
                    for y in self.DAG[x + 1])
                viterbi[i][x] = prob_index
        # BOS
        end = max((self.bigram_dic.get(sentence[1:y + 1], {}).get(
            sentence[0], self.minfreq) + viterbi.get(1)[y][0], y)
                  for y in self.DAG[1])[1]
        # Backtrack
        start = 1
        segs = []
        while start < len(sentence) - 1:
            segs.append(sentence[start:end + 1])
            temp = start
            start = end + 1
            # print(viterbi[temp][end][0])
            end = viterbi[temp][end][1]
        return segs
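
    # Note on dp_search above (explanatory comment, not from the original source):
    # viterbi[i][x] stores, for the candidate word sentence[i:x+1], the pair
    # (best log score, end index of the next word), filled in backwards as
    #     viterbi[i][x] = max over y in DAG[x+1] of
    #         (bigram_dic[sentence[x+1:y+1]][sentence[i:x+1]], or minfreq if unseen,
    #          + viterbi[x+1][y][0], y)
    # i.e. the transition score from the current word to the next word plus the
    # best score for the rest of the sentence. The final max over DAG[1] scores
    # the first real word against the BOS marker '^', and the backtracking loop
    # follows the stored end indices to emit the segments.
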
    # Run bigram segmentation and post-process the result
    def cut(self, sentence):
        sentence = '^' + sentence + '$'
        # Build the full-segmentation DAG of the sentence
        self.construct_DAG(sentence)
        # Bigram segmentation result
        bigram_segs = self.dp_search(sentence)
        # Handle Chinese numerals, Arabic numerals, English words, dates and similar markers
        deal_segs = self.SP.special_process(bigram_segs)
        # Recognise person names with the HMM
        hmm_segs = self.hmm_for_single_words(deal_segs)
        return hmm_segs

    # Pass runs of consecutive single-character words from the bigram result to the HMM
    def hmm_for_single_words(self, bigram_segs):
        # Segmentation after the HMM pass
        segs = []
        temp = []
        # Walk the bigram segmentation
        for seg in bigram_segs:
            # Collect single-character words
            if len(seg) == 1:
                temp.append(seg)
            # On a multi-character word, process the preceding run of single characters
            else:
                # Only runs of three or more single characters are re-segmented
                if len(temp) >= 3:
                    # Re-segment the run as a whole with the HMM
                    hmm_segs = self.hmm.Viterbi(''.join(w for w in temp))
                    # Person-name recognition
                    oov_segs = self.OOV_name(hmm_segs)
                    # OOV recognition  TODO
                    if len(oov_segs) == len(temp):
                        oov_segs = self.OOV(temp)
                    for word in oov_segs:
                        segs.append(word)
                else:
                    # Otherwise keep the single characters as they are
                    for word in temp:
                        segs.append(word)
                segs.append(seg)
                temp = []
        # A run of single characters at the end of the sentence is handled separately
        if len(temp):
            # Re-segment the run as a whole with the HMM
            hmm_segs = self.hmm.Viterbi(''.join(w for w in temp))
            # OOV person-name recognition
            oov_segs = self.OOV_name(hmm_segs)
            # OOV recognition  TODO
            if len(oov_segs) == len(temp):
                oov_segs = self.OOV(temp)
            for word in oov_segs:
                segs.append(word)
        # Return the final segmentation after the HMM pass
        return segs

    # Heuristics for OOV recognition:
    # handles unknown words among runs of single-character words, takes the HMM
    # segmentation result and scores each merged word as a candidate person name.
    # Name decision: rl * l * sum(ci) > 2 * (3 + 3 + 1)
    # TODO: scoring and decision rules for other kinds of unknown words
    def OOV_name(self, hmm_segs):
        oov_segs = []
        # Walk the HMM segmentation
        for i, seg in enumerate(hmm_segs):
            # Keep single characters as they are
            if len(seg) == 1:
                oov_segs.append(seg)
            # Otherwise decide whether the word is a person name
            # TODO: threshold for the name decision
            else:
                # Score the candidate
                score = 0
                # Is each character a common given-name character?
                for w in seg:
                    if w in self.second_names and self.second_names.get(w, 0) > 500:
                        score += 3
                    else:
                        score += 1
                # Is the preceding word a common surname?
                if i > 0 and hmm_segs[i - 1] in self.first_names and \
                        self.first_names.get(hmm_segs[i - 1], 0) > 100:
                    score += 3
                # If the score reaches 14, accept the word
                # TODO: tune the threshold
                if score * len(seg) >= 14:
                    oov_segs.append(seg)
                # Not an unknown word (name)
                else:
                    # Fall back to single characters
                    for w in seg:
                        oov_segs.append(w)
        return oov_segs

    # Load common given-name characters
    def get_second_names(self):
        with codecs.open('files/second_name.txt', 'r', 'gbk') as f:
            file = f.read()
        # Drop the trailing empty line
        lines = file.split('\r\n')[:-1]
        self.second_names = {}
        for line in lines:
            # Character and its frequency
            words = line.split(' ')
            self.second_names[words[0]] = int(words[1])

    # Load common surnames
    def get_first_name(self):
        with codecs.open('files/first_name.txt', 'r', 'gbk') as f:
            file = f.read()
        # Drop the trailing empty line
        lines = file.split('\r\n')[:-1]
        self.first_names = {}
        for line in lines:
            # Surname and its frequency
            words = line.split(' ')
            self.first_names[words[0]] = int(words[1])

    # Decide whether a run of single-character words should be re-segmented by the HMM
    def OOV(self, temp):
        single_nums = 0
        for w in temp:
            if self.hmm.emission_prob['S'].get(w, 0) < -3.5:
                single_nums += 1
        if single_nums >= 2:
            hmm_segs = self.hmm.Viterbi(''.join(temp))
            return hmm_segs
        else:
            return temp
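
# Usage sketch for Bigram (illustrative, not part of the original module; it
# assumes the data files under files/ exist exactly as loaded in __init__):
#
#     bi = Bigram()
#     words = bi.cut(u'<raw Chinese sentence>')   # cut() adds the '^'/'$' markers itself
#     print('/'.join(words))
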
class Unigram:
    def __init__(self):
        # Trie over the dictionary words
        self.root = Trie()
        # Log word probabilities
        self.word_freq = {}
        self.hmm = HMM()
        self.read_dic()

    # Read the dictionary and store its words in the trie
    def read_dic(self, dic='files/dic.txt'):
        with open(dic, 'r') as f:
            d = f.read()
        text = d.split('\n')
        total_times = 0
        for line in text:
            if line != "":
                words = line.split(" ")
                self.root.add(words[0])
                self.word_freq[words[0]] = int(words[1])
                total_times += int(words[1])
        for key in self.word_freq.keys():
            self.word_freq[key] = math.log(
                self.word_freq.get(key) / total_times)
        self.min_freq = self.word_freq.get(
            min(self.word_freq, key=self.word_freq.get))

    def cut(self, sentence, useHMM=False):
        # Maximum-probability segmentation
        ans = self.dp_search(sentence)
        # Pass runs of consecutive single characters to the HMM
        if useHMM:
            segs = []
            temp = []
            for seg in ans:
                if len(seg) == 1:
                    temp.append(seg)
                else:
                    if len(temp) >= 5:
                        for word in self.hmm.Viterbi(''.join(w for w in temp)):
                            segs.append(word)
                    else:
                        for word in temp:
                            segs.append(word)
                    segs.append(seg)
                    temp = []
            if len(temp):
                for word in self.hmm.Viterbi(''.join(w for w in temp)):
                    segs.append(word)
            ans = segs
        return ans

    # Dynamic programming over the DAG
    def dp_search(self, sentence):
        DAG = {}
        # Build the full-segmentation DAG
        for i in range(0, len(sentence)):
            DAG[i] = self.root.scan(sentence[i:], i)
        # Best route from each position to the end of the sentence
        route = {}
        # Boundary case
        route[len(sentence)] = (0.0, '')
        # Walk from the last character back to the first
        for i in range(len(sentence) - 1, -1, -1):
            # All candidate splits starting at position i
            temp = [(self.word_freq.get(sentence[i:x + 1], self.min_freq) +
                     route[x + 1][0], x) for x in DAG[i]]
            # Keep the best one
            route[i] = max(temp)
        index = 0
        segs = []
        # Recover the segmentation from the route
        while index < len(sentence):
            segs.append(sentence[index:route[index][1] + 1])
            index = route[index][1] + 1
        return segs
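
# Usage sketch for Unigram (illustrative, not part of the original module; it
# assumes files/dic.txt exists as expected by read_dic):
#
#     uni = Unigram()
#     print('/'.join(uni.cut(u'<raw Chinese sentence>')))               # unigram only
#     print('/'.join(uni.cut(u'<raw Chinese sentence>', useHMM=True)))  # HMM fallback for runs
#                                                                       # of 5+ single characters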