예제 #1
0
    def word_match(self):
        """
        Find all dictionary words that match the atom sequence.

        Adds word nodes to ``self.words_graph`` for: the sentence-begin
        marker, every dictionary match starting at each atom inside the
        sentence, and the sentence-end marker.  Atoms with no dictionary
        match become single-atom OOV words (numbers tagged 'm', letters
        tagged 'nx', anything else untagged).
        """
        atoms = self.words_graph.get_atoms()
        # Handle the sentence-begin marker.
        match = self.d_store.core_dct[definitions.SENTENCE_BEGIN]
        self.words_graph.generate_word(0, 1, Feature(tag_code=match[0][1]),
                                       weight=match[0][0])
        # Process the sentence atoms (skip the begin/end markers).
        len_atom = len(atoms)
        for i in range(1, len_atom - 1):
            # Find every dictionary word starting at atom i.
            # matches format: matches = [(word, [(freq, pos)...]), ...]
            matches = self.d_store.core_dct.matches(
                                    [atom.content for atom in atoms[i:]])
            if matches:
                for match in matches:
                    # pos == 0 means the word has several candidate POS tags.
                    pos = 0 if len(match[1]) > 1 else match[1][0][1]
                    # Skip system-internal markers such as 始##始, whose
                    # tag codes fall in the open interval (0, 256).
                    if 0 < pos < 256:
                        continue
                    # The word weight is its total frequency across all
                    # POS entries.
                    weight = sum(v[0] for v in match[1])
                    self.words_graph.generate_word(i, i + len(match[0]),
                                                   feature=Feature(tag_code=pos),
                                                   weight=weight
                                                   )
            else:
                # No dictionary match: emit a single-atom OOV word.
                # (The original code assigned an unused `pos` here; only
                # `feature` and `alias` matter.)
                feature, alias = None, None
                if atoms[i].feature == definitions.CT_NUM:
                    feature = Feature('m')
                    alias = definitions.OOV_WORD_M
                elif atoms[i].feature == definitions.CT_LETTER:
                    feature = Feature('nx')
                    alias = definitions.OOV_WORD_NX
                self.words_graph.generate_word(i, i + 1, feature, 0, alias)

        # Handle the sentence-end marker.
        match = self.d_store.core_dct[definitions.SENTENCE_END]
        self.words_graph.generate_word(len_atom - 1, len_atom,
                                       Feature(tag_code=match[0][1]),
                                       weight=match[0][0])
예제 #2
0
 def print_hmm_model(hmm_model):
     """Dump an HMM model to stdout: states, observations and, for each
     POS tag code, the per-word emission probabilities."""
     print('状态序列 - 词性列表:')
     print(hmm_model.states)
     print('观察序列 - 词:')
     print(' '.join(hmm_model.observations))
     print('发射概率 - 词在特定词性下的词频/此词性总数')
     for tag_code, word_probs in hmm_model.emission_prob.items():
         # word=prob pairs in scientific notation, comma-separated.
         pairs = ','.join(['{}={:e}'.format(word, prob)
                           for word, prob in word_probs.items()])
         print('{}-{}:{}'.format(Feature(tag_code=tag_code).tag,
                                 tag_code, pairs))
예제 #3
0
 def atom_segment(self):
     """Atomic segmentation: split the sentence into indivisible atoms.

     Consecutive digit/letter characters are merged into one atom; every
     other character becomes its own atom.  The atom stream is written to
     ``self.words_graph`` framed by sentence-begin/end markers.
     """
     mergeable = (definitions.CT_NUM, definitions.CT_LETTER)
     prev_type = definitions.CT_SENTENCE_BEGIN
     cur_type = 0
     atom = definitions.SENTENCE_BEGIN
     for ch in self.sentence:
         cur_type = self.char_type(ch)
         if cur_type in mergeable and prev_type in mergeable:
             # Extend the current run of digits/letters.
             atom += ch
         else:
             # Flush the finished atom, tagged with its character type.
             self.words_graph.append_atom(atom, Feature(tag_code=prev_type))
             atom = ch
         prev_type = cur_type
     # Flush the last pending atom.
     self.words_graph.append_atom(atom, Feature(tag_code=prev_type))
     # Append the sentence-end marker.
     self.words_graph.append_atom(definitions.SENTENCE_END,
                            Feature(tag_code=definitions.CT_SENTENCE_END))
예제 #4
0
 def test_pos_decode(self):
     """Decoding a numeric tag code must yield the expected POS tag."""
     for code, expected in ((24832, 'a'), (24932, 'ad')):
         self.assertEqual(Feature(tag_code=code).tag, expected)
     print('{} = {}'.format(30058, Feature(tag_code=30058).tag))
예제 #5
0
 def test_pos_encode(self):
     """Encoding a POS tag string must yield the expected numeric code."""
     for tag, expected in (('a', 24832), ('ad', 24932)):
         self.assertEqual(Feature(tag).tag_code, expected)
     print('{} = {}'.format('ul', Feature('ul').tag_code))
예제 #6
0
 def format_result(result):
     """Render a segmentation result as space-separated 'word/tag' pairs."""
     pieces = []
     for word, tag_code in zip(result['words'], result['tags']):
         pieces.append('{}/{}'.format(word, Feature(tag_code=tag_code).tag))
     return ' '.join(pieces)
예제 #7
0
    def generate_oov_words(self, oov_type, oov_tag, seg_index, oov_dct,
                           oov_ctx, oov_alias):
        """
        Merge out-of-vocabulary (OOV) words based on Viterbi tag results.

        Scans the role-tag sequence for patterns that identify an OOV
        word and merges the covered words in the word graph into a
        single OOV word node, keeping the lowest-weight candidate when
        a word already spans the same range.

        @:param oov_type    OOV word type: 'nr' (person name), 'tr'
                            (transliterated name) or 'ns' (place name)
        @:param oov_tag     role-tag sequence produced by Viterbi decoding
        @:param seg_index   word-graph (left, right) index of the word
                            behind each tag position
        @:param oov_dct     OOV word dictionary
        @:param oov_ctx     OOV context dictionary, passed to
                            compute_possibility
        @:param oov_alias   alias for the OOV word, e.g. "北京" is
                            replaced by "未##地"
        """
        i, len_tag = 0, len(oov_tag)
        while i < len_tag:
            pattern_match = None
            weight = 0
            # Walk the tag sequence oov_tag looking for a matching pattern.
            if oov_type == 'nr':
                # Person names: try each known role pattern at position i.
                for pattern in self.nr_patterns:
                    if oov_tag.startswith(pattern, i):
                        pattern_match = pattern
                        poss = self.compute_possibility(
                            i, seg_index, pattern_match, oov_dct, oov_ctx)
                        weight = -math.log(
                            self.nr_factor[pattern_match]) + poss
                        break
            elif oov_type == 'tr' or oov_type == 'ns':
                # Transliterated/place names use a fixed role pattern:
                # begin, zero or more continuations, end (B C* D).
                match = re.match(r'BC*D', oov_tag[i:])
                if match:
                    pattern_match = match.group()
                    poss = self.compute_possibility(i, seg_index,
                                                    pattern_match, oov_dct,
                                                    oov_ctx)
                    # NOTE: the smoothing of tr/ns weights is simplified
                    # (log(1.0) == 0), which may affect accuracy.
                    weight = math.log(1.0) + poss
            if not pattern_match:
                i += 1
                continue

            # A pattern was found: merge the covered words into one OOV word.

            # Left/right boundaries of the OOV word:
            # the left boundary is the left edge of the word behind the
            # pattern's first tag; the right boundary is the right edge
            # of the word behind its last tag.
            oov_left, oov_right = seg_index[i][0], seg_index[i +
                                                             len(pattern_match)
                                                             - 1][1]

            # If a word already spans this range, keep the candidate
            # with the smaller weight.
            seg_word = self.words_graph.get_word(oov_left, oov_right)
            if seg_word is None or weight < seg_word.weight:
                # Merge the OOV word; 'tr' words are tagged as 'nr'.
                feature = Feature('nr') if oov_type == 'tr' else Feature(
                    oov_type)
                self.words_graph.generate_word(oov_left, oov_right, feature,
                                               weight, oov_alias)
            i += len(pattern_match)