def combine_comm(self, words):
     """Merge adjacent ordinary entities based on their part-of-speech tags.

     A word is joined onto the unit being built when both it and the word
     right before it are entities according to ``self.is_entity`` and at
     least one of the two is tagged ``nz`` or ``j``. A merged unit carries
     the POS tag of its last word, and the resulting units are numbered
     consecutively starting from 1.

     Args:
         words: list of WordUnit, the words after named-entity merging.
     Returns:
         list of WordUnit: the words after ordinary-entity merging; an
         empty list when ``words`` is empty.
     """
     if not words:
         # Nothing to merge; indexing words[0] below would raise IndexError.
         return []

     # NOTE(review): the earlier inline comment also listed "both words have
     # the same POS tag" as a merge condition, but the code never checked
     # it -- confirm which of the two is intended.
     linking_tags = {'nz', 'j'}

     words_combine = []  # merged result
     pieces = [words[0].lemma]  # lemmas of the unit currently being built
     previous = words[0]
     for word in words[1:]:
         if (self.is_entity(word.postag)
                 and self.is_entity(previous.postag)
                 and (word.postag in linking_tags
                      or previous.postag in linking_tags)):
             pieces.append(word.lemma)
         else:
             # The current unit ends at the previous word: flush it and
             # start a new unit from the current word.
             words_combine.append(WordUnit(len(words_combine) + 1,
                                           ''.join(pieces),
                                           previous.postag))
             pieces = [word.lemma]
         previous = word
     # Flush the unit that was still open when the input ended.
     words_combine.append(WordUnit(len(words_combine) + 1,
                                   ''.join(pieces),
                                   previous.postag))
     return words_combine
 def combine(self, words, netags):
     """Merge the words of each named entity using the B-/I-/E- tags.

     A word whose tag contains ``B-`` opens an entity. Following words
     whose tags contain ``I-`` are appended to it, and a word whose tag
     contains ``E-`` is appended and closes it. Any other tag ends the
     entity without being absorbed, and that word is then handled by the
     normal scan, so no word is lost when a tag sequence is malformed.
     Words outside entities are kept as they are, with their ``ID``
     rewritten in place to the running counter. The merged list is finally
     passed through ``self.combine_comm``.

     Args:
         words: list of WordUnit, the segmented and POS-tagged words.
         netags: list, the named-entity tag of each word.
     Returns:
         list of WordUnit: whatever ``self.combine_comm`` returns for the
         entity-merged words.
     """
     words_combine = []  # merged result
     length = len(netags)
     n = 1  # ID of the next output unit, starting from 1
     i = 0
     while i < length:
         if 'B-' in netags[i]:
             newword = words[i].lemma
             j = i + 1
             closed = False  # True once the word at j was absorbed as E-
             while j < length:
                 if 'I-' in netags[j]:
                     newword += words[j].lemma
                 elif 'E-' in netags[j]:
                     newword += words[j].lemma
                     closed = True
                     break
                 else:
                     # Unexpected tag inside an entity: stop here and leave
                     # this word to the outer loop.
                     break
                 j += 1
             # The entity's POS tag comes from the tag just before the
             # position where the scan stopped, as it always has.
             words_combine.append(
                 WordUnit(n, newword, self.judge_postag(netags[j - 1])))
             n += 1
             # Resume after the E- word if there was one; otherwise resume
             # at the stop position so that word is not skipped.
             i = j + 1 if closed else j
         else:
             words[i].ID = n
             n += 1
             words_combine.append(words[i])
             i += 1
     return self.combine_comm(words_combine)
Пример #3
0
 def postag(self, lemmas):
     """Attach a part-of-speech tag to every segmented word.

     Args:
         lemmas: list, the segmentation result.
     Returns:
         list of WordUnit: one unit per lemma, holding its 1-based ID, the
         lemma itself and the tag produced by ``self.postagger.postag``.
     """
     tags = self.postagger.postag(lemmas)
     # The postagger is left unreleased here (the release call that used to
     # follow the loop was commented out).
     # Unit IDs start from 1, hence position + 1.
     return [WordUnit(position + 1, lemma, tags[position])
             for position, lemma in enumerate(lemmas)]
Пример #4
0
        return words_str.rstrip('\n')

    def get_lemmas(self):
        """Return the sentence's segmentation as one tab-separated string.

        Returns:
            str: the lemma of every word in ``self.words`` joined by tab
            characters, with all trailing tab characters stripped.
        """
        # join avoids rebuilding the string for every word; rstrip is kept
        # so trailing tabs coming from the lemmas themselves are removed
        # exactly as before.
        return '\t'.join(word.lemma for word in self.words).rstrip('\t')


if __name__ == '__main__':
    # Demo on the sample sentence "中国首都北京" ("Beijing, capital of China").
    # WordUnit arguments are presumably (ID, lemma, POS tag, head ID,
    # head word, dependency label) -- confirm against WordUnit.
    beijing = WordUnit(3, '北京', 'ns', 0, None, 'HED')
    capital = WordUnit(2, '首都', 'ns', 3, None, 'ATT')
    china = WordUnit(1, '中国', 'ns', 2, None, 'ATT')

    # Word units of the sentence, in sentence order.
    words = [china, capital, beijing]

    sentence = SentenceUnit(words)
    print(sentence.to_string())

    # Segmentation result of the sentence.
    lemmas_text = sentence.get_lemmas()
    print('句子分词结果: ' + lemmas_text)

    # Lemma of the head word of the second word.
    second_word_head = sentence.words[1].head_word
    print('"首都"的中心词lemma: ' + second_word_head.lemma)

    # Head word of the whole sentence.
    sentence_head = sentence.get_head_word()
    print('句子的中心词: ' + sentence_head.to_string())