def named_entity_recognition(self, sent, standard_name=False):
    """Find named entities (person, place, organization, other proper noun)
    in a sentence using pyhanlp. harvesttext links known entities beforehand.

    :param sent: string, the text to analyze
    :param standard_name: bool, whether linked known entities are replaced by
        their standard names before recognition
    :return: dict {entity name: entity type} of discovered named entities
    """
    from pyhanlp import HanLP, JClass
    if not self.hanlp_prepared:
        self.hanlp_prepare()
    self.standard_name = standard_name
    entities_info = self.entity_linking(sent)
    # Replace linked entity mentions before running HanLP's recognizer.
    sent2 = self.decoref(sent, entities_info)
    StandardTokenizer = JClass("com.hankcs.hanlp.tokenizer.StandardTokenizer")
    StandardTokenizer.SEGMENT.enableAllNamedEntityRecognize(True)
    entity_type_dict = {}
    try:
        for x in StandardTokenizer.segment(sent2):
            # Tag prefixes: person (nr), place (ns), organization (nt),
            # other proper noun (nz).
            tag0 = str(x.nature)
            if tag0.startswith("nr"):
                entity_type_dict[x.word] = "人名"
            elif tag0.startswith("ns"):
                entity_type_dict[x.word] = "地名"
            elif tag0.startswith("nt"):
                entity_type_dict[x.word] = "机构名"
            elif tag0.startswith("nz"):
                entity_type_dict[x.word] = "其他专名"
    except Exception:
        # Best-effort: HanLP/JVM failures should not crash the caller;
        # return whatever was collected so far. (Was a bare `except:`,
        # which also swallowed KeyboardInterrupt/SystemExit.)
        pass
    return entity_type_dict
def named_entity_recognition(self, sent, standard_name=False, return_posseg=False):
    """Find named entities (person, place, organization, other proper noun)
    in a sentence using pyhanlp. harvesttext links known entities beforehand.

    :param sent: string, the text to analyze
    :param standard_name: bool, whether linked known entities are replaced by
        their standard names before recognition
    :param return_posseg: bool, whether to also return the POS-tagged
        segmentation produced during recognition
    :return: entity_type_dict: dict {entity name: entity type}
        (when return_posseg=True) possegs: list of (word, POS tag) tuples
    """
    from pyhanlp import HanLP, JClass
    if not self.hanlp_prepared:
        self.hanlp_prepare()
    self.standard_name = standard_name
    entities_info = self.entity_linking(sent)
    # Replace linked entity mentions before running HanLP's recognizer.
    sent2 = self.decoref(sent, entities_info)
    StandardTokenizer = JClass("com.hankcs.hanlp.tokenizer.StandardTokenizer")
    StandardTokenizer.SEGMENT.enableAllNamedEntityRecognize(True)
    entity_type_dict = {}
    # Initialized outside the try so both return paths are safe even if
    # segmentation fails immediately.
    possegs = []
    try:
        for x in StandardTokenizer.segment(sent2):
            # Tag prefixes: person (nr), place (ns), organization (nt),
            # other proper noun (nz).
            tag0 = str(x.nature)
            if tag0.startswith("nr"):
                entity_type_dict[x.word] = "人名"
            elif tag0.startswith("ns"):
                entity_type_dict[x.word] = "地名"
            elif tag0.startswith("nt"):
                entity_type_dict[x.word] = "机构名"
            elif tag0.startswith("nz"):
                entity_type_dict[x.word] = "其他专名"
            possegs.append((x.word, tag0))
    except Exception:
        # Best-effort: HanLP/JVM failures should not crash the caller;
        # return whatever was collected so far. (Was a bare `except:`,
        # which also swallowed KeyboardInterrupt/SystemExit.)
        pass
    if return_posseg:
        return entity_type_dict, possegs
    else:
        return entity_type_dict
def hanlp_cut(text):
    """Segment *text* with HanLP's NLPTokenizer and join the tokens with spaces.

    :param text: string, the text to segment
    :return: string, the space-joined segmentation of *text*
    """
    # Import JClass locally, matching the lazy-import style of the sibling
    # functions; the original referenced JClass without any visible import,
    # risking a NameError if no module-level import exists.
    from pyhanlp import JClass
    tokenizer = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
    return " ".join(term.word for term in tokenizer.segment(text))