def combine_comm(self, words):
    """Merge adjacent ordinary entities based on their POS tags.

    Two neighbouring words are glued into one when both carry an entity
    POS tag and at least one of them is tagged 'nz' (other proper noun)
    or 'j' (abbreviation).

    NOTE(review): the original comment also listed "both words share the
    same POS tag" as a merge trigger, but the code never checked it;
    behavior is intentionally kept as-is — confirm against callers.

    Args:
        words: WordUnit list, words after named-entity merging
    Returns:
        words_combine: WordUnit list, words after ordinary-entity merging
    """
    if not words:
        # Guard: an empty input previously raised IndexError on words[0].
        return []
    newword = words[0].lemma  # lemma of the word currently being built
    words_combine = []        # merged result
    n = 1                     # merged words are renumbered from 1
    i = 1                     # index of the current word
    while i < len(words):
        word = words[i]
        prev = words[i - 1]
        # Merge when both neighbours are entities and either is 'nz'/'j'.
        if (self.is_entity(word.postag) and self.is_entity(prev.postag)
                and (word.postag in {'nz', 'j'} or prev.postag in {'nz', 'j'})):
            newword += word.lemma
        else:
            # Close out the pending word, tagged with the previous word's POS.
            words_combine.append(WordUnit(n, newword, prev.postag))
            n += 1
            newword = word.lemma  # current word starts a new pending word
        i += 1
    # Flush the final pending word.
    words_combine.append(WordUnit(n, newword, words[-1].postag))
    return words_combine
def combine(self, words, netags):
    """Merge words according to B-I-E named-entity labels.

    Words spanned by a B-…(I-…)*E-… label sequence are concatenated into a
    single WordUnit whose POS tag is derived from the entity label via
    self.judge_postag; all other words are kept as-is (renumbered).

    Bug fixes vs. the original:
      * after merging up to an 'E-' tag the scan now consumes that token,
        so the entity's last word is no longer emitted a second time by
        the outer loop, and judge_postag sees the closing 'E-' tag;
      * the inner scan now stops on ANY tag that is not 'I-'/'E-'
        (e.g. a following 'B-' or 'S-'), instead of silently skipping it.

    Args:
        words: WordUnit list, result of segmentation + POS tagging
        netags: list, named-entity recognition labels (one per word)
    Returns:
        words_combine: WordUnit list, result after entity merging
    """
    words_combine = []   # merged result
    length = len(netags)
    n = 1                # entity/word counter, starting from 1
    i = 0
    while i < length:
        if 'B-' in netags[i]:
            newword = words[i].lemma  # start of a multi-word entity
            j = i + 1
            while j < length:
                if 'I-' in netags[j]:
                    newword += words[j].lemma
                elif 'E-' in netags[j]:
                    newword += words[j].lemma
                    j += 1  # consume the closing token so it isn't re-processed
                    break
                else:
                    # 'O', a new 'B-', or 'S-': the entity ended before j.
                    break
                j += 1
            words_combine.append(
                WordUnit(n, newword, self.judge_postag(netags[j - 1])))
            n += 1
            i = j
        else:
            # Not part of a B-I-E span: keep the word, renumber it.
            words[i].ID = n
            n += 1
            words_combine.append(words[i])
            i += 1
    return self.combine_comm(words_combine)
def postag(self, lemmas):
    """POS-tag a segmented sentence.

    Args:
        lemmas: list of str, the segmentation result
    Returns:
        words: WordUnit list holding each lemma with its POS tag,
               IDs numbered from 1
    """
    # Run the POS tagger over the whole sentence at once.
    postags = self.postagger.postag(lemmas)
    # Pair each lemma with its tag; WordUnit IDs start at 1.
    # (Original used `for i in range(len(lemmas))`; enumerate+zip is the
    # idiomatic equivalent. The stale doc reference to an `entity_dict`
    # parameter that never existed has been removed.)
    return [WordUnit(i, lemma, tag)
            for i, (lemma, tag) in enumerate(zip(lemmas, postags), start=1)]
        # Tail of a method whose start is outside this chunk — kept untouched.
        return words_str.rstrip('\n')

    def get_lemmas(self):
        """Return the segmentation result of this sentence.

        Returns:
            lemmas: str, tab-separated lemmas of the sentence's words
        """
        lemmas = ''
        for word in self.words:
            lemmas += word.lemma + '\t'
        return lemmas.rstrip('\t')


if __name__ == '__main__':
    # Demo sentence: 中国首都北京 ("Beijing, the capital of China").
    # WordUnit args appear to be (ID, lemma, postag, head, head_word, dependency).
    word3 = WordUnit(3, '北京', 'ns', 0, None, 'HED')
    word2 = WordUnit(2, '首都', 'ns', 3, None, 'ATT')
    word1 = WordUnit(1, '中国', 'ns', 2, None, 'ATT')
    words = []  # word units of the sentence
    words.append(word1)
    words.append(word2)
    words.append(word3)
    sentence = SentenceUnit(words)
    print(sentence.to_string())
    print('句子分词结果: ' + sentence.get_lemmas())
    print('"首都"的中心词lemma: ' + sentence.words[1].head_word.lemma)
    print('句子的中心词: ' + sentence.get_head_word().to_string())