def add_node(self, word, typing):
    """Insert one vocabulary item into the Trie.

    :param word: the vocabulary word from the dictionary
    :param typing: the category (type label) of the word
    :return: None
    """
    word = word.strip()
    if word in ('', '\t', ' ', '\r'):
        return  # whitespace-only entries carry no information

    node = self.dict_trie
    depth = len(word)
    word = word.lower()  # normalise every letter to lower case
    for char in word:
        # descend, creating missing children on the way
        node = node.setdefault(char, dict())

    # track the longest word ever inserted
    if depth > self.depth:
        self.depth = depth

    if 'type' in node and node['type'] != typing:
        # the same word already exists with a different label — keep the old one
        logging.warning('`{}` belongs to both `{}` and `{}`.'.format(
            word, node['type'], typing))
    else:
        node['type'] = typing
def wrapper(self, *args, **kargs):
    """Call the wrapped API function, rotating through the configured
    appkeys on failure until one call succeeds or every key was tried.

    NOTE(review): `func` comes from the enclosing decorator's scope
    (not visible in this block).
    """
    # appkey selected by the current rotation index
    if self.appkey_obj_list is not None:
        count = 0
        # <= allows one extra attempt beyond the number of keys —
        # presumably intentional so the first key gets a second chance;
        # TODO confirm
        while count <= self.appkey_num:
            self.appkey_obj = self.appkey_obj_list[self.appkey_index]
            count += 1
            try:
                f = func(self, *args, **kargs)
                break
            except Exception as err:
                # advance the appkey index, wrapping around at the end
                if self.appkey_index == self.appkey_num - 1:
                    self.appkey_index = 0
                else:
                    self.appkey_index += 1
                # bookkeeping: if the attempt count reaches the number of
                # keys, every key has been tried — give up; otherwise warn
                # and retry with the next key
                if count < self.appkey_num:
                    logging.warning(
                        'The appkey {} of `{}` is invalid.'.format(
                            json.dumps(self.appkey_obj, ensure_ascii=False),
                            self.__class__.__name__))
                else:
                    logging.error(err)
                    raise Exception(err)
                    break  # unreachable: follows the raise
    else:
        # no key rotation configured — plain pass-through call
        f = func(self, *args, **kargs)

    return f
def _wrong_message(_idx, ts):
    """Log a diagnostic window of tags around position `_idx` in `ts`.

    NOTE(review): `verbose`, `token_list`, `tags` and `start` are closed
    over from the enclosing function's scope; this helper only emits logs.
    """
    if verbose:
        logging.info(token_list)
        logging.info(tags)
        # show the slice from the current entity start (or two tokens back
        # when no entity is open) up to just past the offending position
        logging.warning('wrong tag: {}'.format(
            ts[start if start is not None else max(0, _idx - 2): _idx + 2]))
def entity2tag(token_list: List[str], entities: List[Dict[str, Any]],
               formater='BIOES'):
    """Convert an entity list into a BIOES tag sequence.

    Overlapping annotations are resolved by keeping the earlier entity and
    dropping (with a warning) every later entity that overlaps it. Handles
    a single sample only — no batch support.

    Args:
        token_list(List[str]): the tokenized text as a list.
        entities(List[Dict[str, Any]]): the entities of that text, each with
            `text`, `offset` ([start, end) in token positions) and `type`.
        formater(str): the tagging scheme (currently only BIOES is produced).

    Returns:
        List[str]: the tag sequence, one tag per token.

    Examples:
        >>> token_list = '胡静静在水利局工作。'  # 字级别
        >>> token_list = ['胡', '静', '静', '在', '水', '利', '局', '工', '作', '。']  # 字或词级别
        >>> ner_entities = [{'text': '胡静静', 'offset': [0, 3], 'type': 'Person'},
                            {'text': '水利局', 'offset': [4, 7], 'type': 'Orgnization'}]
        >>> print(jio.ner.entity2tag(token_list, ner_entities))
        ['B-Person', 'I-Person', 'E-Person', 'O',
         'B-Orgnization', 'I-Orgnization', 'E-Orgnization', 'O', 'O', 'O']

    """
    tags = ['O' for _ in range(len(token_list))]
    flag = 0  # end offset of the last entity that was kept

    entities = sorted(entities, key=lambda i: i['offset'][0])
    for idx, entity in enumerate(entities):
        start, end = entity['offset']
        # FIX: the original test `end < flag` only caught entities fully
        # contained in the previous one; a partially overlapping entity
        # (start < flag <= end) slipped through and corrupted the tags.
        # Since entities are sorted by start, `start < flag` is the correct
        # (and strictly more inclusive) overlap test.
        if start < flag:
            # overlapping annotation — drop the later entity and warn
            # (the original also silently skipped the warning for the last
            # entity; now every dropped entity is reported)
            logging.warning('The entity {} is overlapped with {}.'.format(
                json.dumps(entity, ensure_ascii=False),
                json.dumps(entities[idx - 1], ensure_ascii=False)))
            continue

        if end - start == 1:
            tags[start] = 'S-' + entity['type']
        else:
            tags[start] = 'B-' + entity['type']
            # range is empty for 2-token entities, so no guard needed
            for j in range(start + 1, end - 1):
                tags[j] = 'I-' + entity['type']
            tags[end - 1] = 'E-' + entity['type']
        flag = end

    return tags
def entity2tag(token_list: List[str], entities: List[Dict[str, Any]],
               formater='BIOES'):
    '''Convert an entity list into a BIOES tag sequence; a later entity whose
    annotation overlaps an earlier one is ignored (dropped).

    NOTE(review): entities use the `offsets` key in this variant (a sibling
    implementation uses `offset`) — presumably an older schema; confirm.
    Entities are assumed to be pre-sorted by start offset.

    Args:
        ner_entities(List[str, Dict[str, Any]]): the text and its entities.
        formater(str): the tagging scheme.

    return:
        List[List[str], List[str]]: the data in tag format.

    Examples:
        >>> ner_entities = [
                '胡静静在水利局工作。',
                {'text': '胡静静', 'offset': [0, 3], 'type': 'Person'},
                {'text': '水利局', 'offset': [4, 7], 'type': 'Orgnization'}]]
        >>> print(entity2tag(ner_entities))
        [['胡', '静', '静', '在', '水', '利', '局', '工', '作', '。'],
         ['B-Person', 'I-Person', 'E-Person', 'O',
          'B-Orgnization', 'I-Orgnization', 'E-Orgnization', 'O', 'O', 'O']]

    '''
    tags = ['O'] * len(token_list)
    last_end = 0  # end offset of the previously kept entity

    for idx, entity in enumerate(entities):
        begin, end = entity['offsets'][0], entity['offsets'][1]
        if end < last_end:
            # overlapping annotation — drop this entity
            if 1 < idx + 1 < len(entities):
                logging.warning(
                    'The entity {} is overlapped with {}.'.format(
                        json.dumps(entity, ensure_ascii=False),
                        json.dumps(entities[idx - 1], ensure_ascii=False)))
            continue

        label = entity['type']
        if end - begin == 1:
            tags[begin] = 'S-' + label
        else:
            tags[begin] = 'B-' + label
            # range is empty for 2-token entities
            for j in range(begin + 1, end - 1):
                tags[j] = 'I-' + label
            tags[end - 1] = 'E-' + label
        last_end = end

    return tags
def __call__(self, cur_idiom, same_pinyin=True, check_idiom=False,
             same_tone=True, with_prob=True, restart=False):
    """Play one round of the idiom-chain game (成语接龙).

    Given the player's idiom `cur_idiom`, return a follow-up idiom whose
    first character (or first pinyin syllable) matches the last of the
    input, excluding idioms already used in this game.

    :param cur_idiom: the idiom provided by the player
    :param same_pinyin: match on pinyin of the last syllable rather than
        on the literal last character
    :param check_idiom: when True, reject inputs that are not known idioms
        by returning the string 'wrong input idiom'
    :param same_tone: (only with same_pinyin) require identical tone too
    :param with_prob: select the reply via self._random_select (weighted)
        rather than uniformly with random.choice
    :param restart: clear the game history before playing this round
    :return: the chosen idiom string, or 'wrong input idiom' /
        'can not find next'
    """
    if self.idiom_list is None:
        self._prepare()

    if restart:
        # start a new game: forget every idiom used so far
        self.already_used_idioms = set()

    if cur_idiom not in self.pure_idiom_list:
        logging.warning('{} may not be a Chinese idiom.'.format(cur_idiom))
        if check_idiom:
            return 'wrong input idiom'
    else:
        # record the player's idiom as used
        self.already_used_idioms.add(cur_idiom)

    if same_pinyin:
        cur_last_pinyin = self.pinyin_obj(cur_idiom, formater='simple')[-1]
        backup_idioms = list()
        if same_tone:
            for idiom_obj in self.idiom_list:
                if idiom_obj['idiom'] in self.already_used_idioms:
                    continue
                if cur_last_pinyin == idiom_obj['pinyin'][0]:
                    backup_idioms.append(idiom_obj)
        else:
            # tone-insensitive: strip the trailing tone digit before comparing
            for idiom_obj in self.idiom_list:
                if idiom_obj['idiom'] in self.already_used_idioms:
                    continue
                if cur_last_pinyin[:-1] == idiom_obj['pinyin'][0][:-1]:
                    backup_idioms.append(idiom_obj)
    else:
        cur_last_char = cur_idiom[-1]
        backup_idioms = list()
        for idiom_obj in self.idiom_list:
            # FIX: the original tested `idiom_obj in self.already_used_idioms`,
            # comparing a dict against a set of strings — always False, so
            # used idioms were never filtered in this branch.
            if idiom_obj['idiom'] in self.already_used_idioms:
                continue
            if cur_last_char == idiom_obj['idiom'][0]:
                backup_idioms.append(idiom_obj)

    if len(backup_idioms) == 0:
        return 'can not find next'

    if not with_prob:
        result = random.choice(backup_idioms)
    else:
        result = self._random_select(backup_idioms)
    self.already_used_idioms.add(result['idiom'])
    return result['idiom']
def char2word(char_entity_list, word_token_list, verbose=False):
    '''Re-map char-level NER entities onto word-level token offsets; the data
    structure is unchanged. Handles a single sample only — no batch support.

    Empirically, entities dropped because of segmentation errors amount to
    about 4.62% with jieba and about 3.44% with pkuseg.

    Args:
        char_entity_list: entity list whose offsets count char tokens
        word_token_list: the token list produced by a word segmenter
        verbose(bool): whether to log the entities lost to segmentation
            mismatches during the char-to-word conversion

    Returns:
        list: entity list whose offsets count word tokens

    Examples:
        >>> char_token_list = '胡静静喜欢江西红叶建筑公司'  # 字级别
        >>> char_token_list = [
                '胡', '静', '静', '喜', '欢', '江', '西',
                '红', '叶', '建', '筑', '公', '司']  # 字或词级别
        >>> char_entity_list = [
                {'text': '胡静静', 'offset': [0, 3], 'type': 'Person'},
                {'text': '江西红叶建筑公司', 'offset': [5, 13], 'type': 'Company'}]
        >>> word_token_list = ['胡静静', '喜欢', '江西', '红叶', '建筑', '公司']
        >>> print(jio.ner.char2word(char_entity_list, word_token_list))
        [{'text': '胡静静', 'offset': [0, 1], 'type': 'Person'},
         {'text': '江西红叶建筑公司', 'offset': [2, 6], 'type': 'Company'}]

    '''
    # cumulative char offsets of every word boundary: boundaries[i] is the
    # char position where word i starts; the last entry is the text length
    boundaries = [0]
    running = 0
    for token in word_token_list:
        running += len(token)
        boundaries.append(running)

    word_entity_list = list()
    for char_entity in char_entity_list:
        char_start, char_end = char_entity['offset']
        try:
            # both entity edges must coincide with word boundaries, otherwise
            # the segmenter split through the entity and it is discarded
            word_entity_list.append(
                {'type': char_entity['type'],
                 'offset': [boundaries.index(char_start),
                            boundaries.index(char_end)],
                 'text': char_entity['text']})
        except ValueError:
            if verbose:
                # locate the surrounding word boundaries so the log shows
                # exactly which words the entity straddles
                if char_start not in boundaries:
                    start = boundaries.index(
                        max([idx for idx in boundaries if idx < char_start]))
                else:
                    start = boundaries.index(char_start)
                if char_end not in boundaries:
                    end = boundaries.index(
                        min([idx for idx in boundaries if idx > char_end]))
                else:
                    end = boundaries.index(char_end)
                logging.warning(
                    'the entity {} =/=> {}'.format(
                        char_entity, word_token_list[start: end]))

    return word_entity_list