def word_match(self):
    """Find all dictionary words that match the atom sequence."""
    atoms = self.words_graph.get_atoms()
    # Handle the sentence-begin marker
    match = self.d_store.core_dct[definitions.SENTENCE_BEGIN]
    self.words_graph.generate_word(0, 1,
                                   Feature(tag_code=match[0][1]),
                                   weight=match[0][0])
    # Handle the sentence atoms
    len_atom = len(atoms)
    for i in range(1, len_atom - 1):
        # Find all matching words.
        # Format: matches = [(word, [(freq, pos), ...]), ...]
        matches = self.d_store.core_dct.matches(
            [atom.content for atom in atoms[i:]])
        if matches:
            for match in matches:
                pos = 0 if len(match[1]) > 1 else match[1][0][1]
                # Skip internal system markers such as 始##始
                if 0 < pos < 256:
                    continue
                # The weight of a word built from atoms is its frequency
                weight = sum([v[0] for v in match[1]])
                self.words_graph.generate_word(i, i + len(match[0]),
                                               feature=Feature(tag_code=pos),
                                               weight=weight)
        else:
            # No match found: handle out-of-vocabulary atoms
            feature = None
            if atoms[i].feature == definitions.CT_NUM:
                feature = Feature('m')
                alias = definitions.OOV_WORD_M
            elif atoms[i].feature == definitions.CT_LETTER:
                feature = Feature('nx')
                alias = definitions.OOV_WORD_NX
            else:
                alias = None
            self.words_graph.generate_word(i, i + 1, feature, 0, alias)
    # Handle the sentence-end marker
    match = self.d_store.core_dct[definitions.SENTENCE_END]
    self.words_graph.generate_word(len_atom - 1, len_atom,
                                   Feature(tag_code=match[0][1]),
                                   weight=match[0][0])
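# For reference, a minimal sketch of what core_dct.matches() is assumed to
# return: every dictionary word that starts at the current atom, in the
# shape [(word, [(freq, pos), ...]), ...]. The toy dictionary and the
# prefix_matches() helper below are hypothetical, not part of this project.
def prefix_matches(dct, atoms):
    """Collect all dictionary entries whose characters prefix the atom list."""
    results = []
    for end in range(1, len(atoms) + 1):
        candidate = ''.join(atoms[:end])
        if candidate in dct:
            results.append((candidate, dct[candidate]))
    return results

# Toy dictionary: word -> [(freq, pos_code), ...]
toy_dct = {
    '中': [(100, 0)],
    '中国': [(500, 0)],
    '中国人': [(80, 0)],
}
print(prefix_matches(toy_dct, ['中', '国', '人', '民']))
# [('中', [(100, 0)]), ('中国', [(500, 0)]), ('中国人', [(80, 0)])]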
def print_hmm_model(hmm_model):
    print('States - list of POS tags:')
    print(hmm_model.states)
    print('Observations - words:')
    print(' '.join(hmm_model.observations))
    print('Emission probabilities - count of word under a POS tag / total count of that tag')
    for k, v in hmm_model.emission_prob.items():
        print('{}-{}:{}'.format(
            Feature(tag_code=k).tag, k,
            ','.join(['{}={:e}'.format(word, prob)
                      for word, prob in v.items()])))
def atom_segment(self):
    """Atomic segmentation: split the sentence into atoms."""
    prev_type, cur_type = definitions.CT_SENTENCE_BEGIN, 0
    atom = definitions.SENTENCE_BEGIN
    for c in self.sentence:
        cur_type = self.char_type(c)
        # Consecutive digits or letters are merged into a single atom
        if (cur_type == definitions.CT_NUM or
                cur_type == definitions.CT_LETTER) and (
                prev_type == definitions.CT_NUM or
                prev_type == definitions.CT_LETTER):
            atom = ''.join([atom, c])
        else:
            self.words_graph.append_atom(atom, Feature(tag_code=prev_type))
            atom = c
        prev_type = cur_type
    # The last atom
    self.words_graph.append_atom(atom, Feature(tag_code=prev_type))
    # SENTENCE_END
    self.words_graph.append_atom(definitions.SENTENCE_END,
                                 Feature(tag_code=definitions.CT_SENTENCE_END))
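# A standalone sketch of the merging rule above: consecutive digits and
# ASCII letters collapse into one atom, everything else splits per
# character. The _char_type() stand-in is hypothetical; the real code uses
# the definitions.CT_* codes.
def _char_type(c):
    if c.isdigit():
        return 'NUM'
    if c.isascii() and c.isalpha():
        return 'LETTER'
    return 'OTHER'

def _atoms_of(sentence):
    atoms, atom, prev = [], '', None
    for c in sentence:
        cur = _char_type(c)
        if atom and cur in ('NUM', 'LETTER') and prev in ('NUM', 'LETTER'):
            atom += c  # digits/letters glue onto the running atom
        else:
            if atom:
                atoms.append(atom)
            atom = c
        prev = cur
    atoms.append(atom)
    return atoms

print(_atoms_of('GB2312是标准'))  # ['GB2312', '是', '标', '准']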
def test_pos_decode(self):
    self.assertEqual(Feature(tag_code=24832).tag, 'a')
    self.assertEqual(Feature(tag_code=24932).tag, 'ad')
    print('{} = {}'.format(30058, Feature(tag_code=30058).tag))
def test_pos_encode(self):
    self.assertEqual(Feature('a').tag_code, 24832)
    self.assertEqual(Feature('ad').tag_code, 24932)
    print('{} = {}'.format('ul', Feature('ul').tag_code))
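# The constants in the two tests follow ICTCLAS's two-byte POS encoding:
# the first tag character sits in the high byte, the optional second
# character in the low byte. A sketch consistent with the values above
# (encode_tag/decode_tag are illustrative, not the project's Feature API):
def encode_tag(tag):
    code = ord(tag[0]) << 8
    if len(tag) > 1:
        code += ord(tag[1])
    return code

def decode_tag(code):
    high, low = code >> 8, code & 0xFF
    return chr(high) + (chr(low) if low else '')

assert encode_tag('a') == 24832    # 0x6100
assert encode_tag('ad') == 24932   # 0x6164
assert decode_tag(30058) == 'uj'   # 0x756A
assert encode_tag('ul') == 30060   # 0x756C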
def format_result(result):
    return ' '.join(['{}/{}'.format(w, Feature(tag_code=p).tag)
                     for w, p in zip(result['words'], result['tags'])])
def generate_oov_words(self, oov_type, oov_tag, seg_index,
                       oov_dct, oov_ctx, oov_alias):
    """
    Merge out-of-vocabulary (OOV) words based on the Viterbi tag result.
    @:param oov_type  OOV word type: 'nr', 'tr' or 'ns'
    @:param oov_tag   tag sequence over the OOV candidates
    @:param seg_index index into word_graph of the word behind each tag
    @:param oov_dct   OOV word dictionary
    @:param oov_ctx   OOV context dictionary
    @:param oov_alias alias for the OOV word, e.g. "北京" is replaced by "未##地"
    """
    i, len_tag = 0, len(oov_tag)
    while i < len_tag:
        pattern_match = None
        weight = 0
        # Walk the tag sequence oov_tag and look for a matching pattern
        if oov_type == 'nr':
            for pattern in self.nr_patterns:
                if oov_tag.startswith(pattern, i):
                    pattern_match = pattern
                    poss = self.compute_possibility(
                        i, seg_index, pattern_match, oov_dct, oov_ctx)
                    weight = -math.log(
                        self.nr_factor[pattern_match]) + poss
                    break
        elif oov_type == 'tr' or oov_type == 'ns':
            match = re.match(r'BC*D', oov_tag[i:])
            if match:
                pattern_match = match.group()
                poss = self.compute_possibility(
                    i, seg_index, pattern_match, oov_dct, oov_ctx)
                # NOTE: the smoothing of tr/ns weights is simplified here,
                # which may hurt accuracy
                weight = math.log(1.0) + poss
        if not pattern_match:
            i += 1
            continue
        # An OOV pattern was found; merge the covered words into one.
        # Left boundary  = left boundary of the word behind the first tag
        # Right boundary = right boundary of the word behind the last tag
        oov_left = seg_index[i][0]
        oov_right = seg_index[i + len(pattern_match) - 1][1]
        # If this span is already an OOV word, keep the lower weight
        seg_word = self.words_graph.get_word(oov_left, oov_right)
        if seg_word is None or weight < seg_word.weight:
            # Merge into a single OOV word
            feature = Feature('nr') if oov_type == 'tr' else Feature(oov_type)
            self.words_graph.generate_word(oov_left, oov_right,
                                           feature, weight, oov_alias)
        i += len(pattern_match)
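# A standalone illustration of the role-pattern merge above for the
# 'tr'/'ns' branch: re.match(r'BC*D', ...) finds a begin/middle/end role
# span in the tag sequence, and the covered words are merged into one OOV
# word. The tag string and words below are invented for the example.
import re

oov_tag = 'ABCCCD'
words = ['去', '阿', '尔', '巴', '尼', '亚']        # one word per tag
seg_index = [(i, i + 1) for i in range(len(words))]

i = 0
while i < len(oov_tag):
    m = re.match(r'BC*D', oov_tag[i:])
    if not m:
        i += 1
        continue
    left = seg_index[i][0]
    right = seg_index[i + len(m.group()) - 1][1]
    print('merge:', ''.join(words[left:right]))    # merge: 阿尔巴尼亚
    i += len(m.group())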