Exemplo n.º 1
0
 def add_node(self, word, typing):
     """Insert a dictionary word into the trie and label it with its type.

     The word is stripped of surrounding whitespace and lower-cased, then
     stored character by character as nested dicts under ``self.dict_trie``.
     The node reached by the last character carries the label under the
     key ``'type'``. ``self.depth`` is raised to the length of the word when
     that length exceeds the current value.

     If the word is already stored with a different label, the existing
     label is kept and a warning is logged.

     :param word: vocabulary entry from the dictionary
     :param typing: type label of the entry
     :return: None
     """
     word = word.strip()
     if word in ('', '\t', ' ', '\r'):
         # Nothing to insert.
         return

     # The length is measured on the stripped word, before lower-casing.
     word_length = len(word)
     word = word.lower()

     # Walk down the trie, creating missing child nodes on the way.
     node = self.dict_trie
     for char in word:
         node = node.setdefault(char, dict())

     if word_length > self.depth:
         self.depth = word_length

     if 'type' not in node or node['type'] == typing:
         node['type'] = typing
     else:
         # Conflicting label: keep the first one and report the clash.
         logging.warning('`{}` belongs to both `{}` and `{}`.'.format(
             word, node['type'], typing))
Exemplo n.º 2
0
    def wrapper(self, *args, **kargs):
        """Call ``func`` and rotate through the appkey pool when it fails.

        ``func`` is a free variable taken from the enclosing scope
        (presumably the function being decorated -- confirm against the
        enclosing decorator).

        When ``self.appkey_obj_list`` is ``None`` the call is made once and
        any exception propagates unchanged. Otherwise the key at
        ``self.appkey_index`` is installed as ``self.appkey_obj`` before
        each attempt. After a failed attempt the index advances (wrapping
        to 0 after ``self.appkey_num - 1``) and the next key is tried. Once
        ``self.appkey_num`` attempts have failed, the last error is logged
        and re-raised wrapped in ``Exception``.

        Returns:
            Whatever ``func`` returns.

        Raises:
            Exception: wrapping the last error once every key was tried.
        """
        if self.appkey_obj_list is None:
            # No key pool configured: plain single call.
            f = func(self, *args, **kargs)
            return f

        attempts = 0
        while attempts <= self.appkey_num:
            # Install the key selected by the current index.
            self.appkey_obj = self.appkey_obj_list[self.appkey_index]
            attempts += 1

            try:
                f = func(self, *args, **kargs)
            except Exception as err:
                # Advance to the next key, wrapping around at the end.
                if self.appkey_index != self.appkey_num - 1:
                    self.appkey_index += 1
                else:
                    self.appkey_index = 0

                # Give up once as many attempts as there are keys have failed.
                if attempts >= self.appkey_num:
                    logging.error(err)
                    raise Exception(err)

                logging.warning(
                    'The appkey {} of `{}` is invalid.'.format(
                        json.dumps(self.appkey_obj, ensure_ascii=False),
                        self.__class__.__name__))
            else:
                break

        return f
Exemplo n.º 3
0
 def _wrong_message(_idx, ts):
     """Log the context of a wrong tag found around position ``_idx``.

     Does nothing unless ``verbose`` is truthy. ``verbose``, ``token_list``,
     ``tags`` and ``start`` are free variables read from the enclosing scope
     (not visible here).

     The logged slice of ``ts`` begins at ``start`` when it is not ``None``,
     otherwise two positions before ``_idx`` (clamped at 0), and ends at
     ``_idx + 2`` (exclusive).
     """
     if not verbose:
         return

     logging.info(token_list)
     logging.info(tags)

     window_start = max(0, _idx - 2) if start is None else start
     logging.warning('wrong tag: {}'.format(ts[window_start: _idx + 2]))
Exemplo n.º 4
0
def entity2tag(token_list: List[str],
               entities: List[Dict[str, Any]],
               formater='BIOES'):
    """Convert entity annotations of one sample into a BIOES tag sequence.

    Entities are processed in order of their start offset. An entity that
    overlaps an entity already tagged is ignored and a warning is logged.
    Entities with an empty or negative span are ignored with a warning as
    well. Handles a single sample only, not a batch.

    Args:
        token_list(List[str]): the tokenized text (characters or words).
        entities(List[Dict[str, Any]]): entities of the text; each dict
            provides ``'offset'`` (``[start, end)`` in token indices) and
            ``'type'``.
        formater(str): tagging scheme. Currently unused; the output is
            always BIOES.

    Returns:
        List[str]: one tag per token.

    Raises:
        IndexError: if an entity offset lies beyond ``token_list``.

    Examples:
        >>> token_list = ['胡', '静', '静', '在', '水',
        ...               '利', '局', '工', '作', '。']
        >>> ner_entities = [
        ...     {'text': '胡静静', 'offset': [0, 3], 'type': 'Person'},
        ...     {'text': '水利局', 'offset': [4, 7], 'type': 'Orgnization'}]
        >>> print(entity2tag(token_list, ner_entities))
        ['B-Person', 'I-Person', 'E-Person', 'O', 'B-Orgnization', 'I-Orgnization', 'E-Orgnization', 'O', 'O', 'O']

    """
    tags = ['O'] * len(token_list)

    covered_until = 0  # exclusive end offset of the last entity tagged
    last_kept = None  # the entity that produced `covered_until`

    for entity in sorted(entities, key=lambda item: item['offset'][0]):
        start = entity['offset'][0]
        end = entity['offset'][1]
        entity_type = entity['type']

        if start < 0 or end <= start:
            # An empty or negative span would write tags through negative
            # indices and corrupt neighbouring positions.
            logging.warning(
                'The entity {} has an invalid offset and is ignored.'.format(
                    json.dumps(entity, ensure_ascii=False)))
            continue

        if start < covered_until:
            # Entities are sorted by start, so starting before the end of
            # the last tagged entity means the two overlap.
            logging.warning('The entity {} is overlapped with {}.'.format(
                json.dumps(entity, ensure_ascii=False),
                json.dumps(last_kept, ensure_ascii=False)))
            continue

        if end - start == 1:
            tags[start] = 'S-' + entity_type
        else:
            tags[start] = 'B-' + entity_type
            for j in range(start + 1, end - 1):
                tags[j] = 'I-' + entity_type
            tags[end - 1] = 'E-' + entity_type

        covered_until = end
        last_kept = entity

    return tags
Exemplo n.º 5
0
def entity2tag(token_list: List[str], entities: List[Dict[str, Any]],
               formater='BIOES'):
    """Convert entity annotations of one sample into a BIOES tag sequence.

    Entities are processed in order of their start offset. An entity that
    overlaps an entity already tagged is ignored and a warning is logged.
    Entities with an empty or negative span are ignored with a warning as
    well.

    Args:
        token_list(List[str]): the tokenized text.
        entities(List[Dict[str, Any]]): entities of the text; each dict
            provides ``'offsets'`` (``[start, end)`` in token indices) and
            ``'type'``.
        formater(str): tagging scheme. Currently unused; the output is
            always BIOES.

    Returns:
        List[str]: one tag per token.

    Raises:
        IndexError: if an entity offset lies beyond ``token_list``.

    Examples:
        >>> token_list = ['胡', '静', '静', '在', '水',
        ...               '利', '局', '工', '作', '。']
        >>> ner_entities = [
        ...     {'text': '胡静静', 'offsets': [0, 3], 'type': 'Person'},
        ...     {'text': '水利局', 'offsets': [4, 7], 'type': 'Orgnization'}]
        >>> print(entity2tag(token_list, ner_entities))
        ['B-Person', 'I-Person', 'E-Person', 'O', 'B-Orgnization', 'I-Orgnization', 'E-Orgnization', 'O', 'O', 'O']

    """
    tags = ['O'] * len(token_list)

    covered_until = 0  # exclusive end offset of the last entity tagged
    last_kept = None  # the entity that produced `covered_until`

    # The overlap test below relies on entities arriving in start order.
    for entity in sorted(entities, key=lambda item: item['offsets'][0]):
        start = entity['offsets'][0]
        end = entity['offsets'][1]
        entity_type = entity['type']

        if start < 0 or end <= start:
            # An empty or negative span would write tags through negative
            # indices and corrupt neighbouring positions.
            logging.warning(
                'The entity {} has an invalid offset and is ignored.'.format(
                    json.dumps(entity, ensure_ascii=False)))
            continue

        if start < covered_until:
            # Starting before the end of the last tagged entity means the
            # two overlap; the later one is dropped.
            logging.warning(
                'The entity {} is overlapped with {}.'.format(
                    json.dumps(entity, ensure_ascii=False),
                    json.dumps(last_kept, ensure_ascii=False)))
            continue

        if end - start == 1:
            tags[start] = 'S-' + entity_type
        else:
            tags[start] = 'B-' + entity_type
            for j in range(start + 1, end - 1):
                tags[j] = 'I-' + entity_type
            tags[end - 1] = 'E-' + entity_type

        covered_until = end
        last_kept = entity

    return tags
Exemplo n.º 6
0
    def __call__(self,
                 cur_idiom,
                 same_pinyin=True,
                 check_idiom=False,
                 same_tone=True,
                 with_prob=True,
                 restart=False):
        """Return an unused idiom that chains onto ``cur_idiom``.

        Args:
            cur_idiom(str): the idiom to continue from.
            same_pinyin(bool): if True, match the pinyin of the last
                character of ``cur_idiom`` against the first pinyin of the
                candidate; if False, match the characters themselves.
            check_idiom(bool): if True, refuse input that is not found in
                ``self.pure_idiom_list``.
            same_tone(bool): only used with ``same_pinyin``. If False, the
                last character of each pinyin string is dropped before
                comparing (presumably the tone marker -- depends on the
                output format of ``self.pinyin_obj``).
            with_prob(bool): if True, pick the result with
                ``self._random_select``; otherwise pick uniformly with
                ``random.choice``.
            restart(bool): if True, clear the record of used idioms first.

        Returns:
            str: the next idiom, ``'wrong input idiom'`` when the input is
            rejected, or ``'can not find next'`` when no candidate is left.
        """
        if self.idiom_list is None:
            self._prepare()

        if restart:
            # Start a new game: forget every idiom used so far.
            self.already_used_idioms = set()

        if cur_idiom in self.pure_idiom_list:
            self.already_used_idioms.add(cur_idiom)
        else:
            logging.warning('{} may not be a Chinese idiom.'.format(cur_idiom))
            if check_idiom:
                return 'wrong input idiom'

        # Build the predicate that decides whether a candidate chains on.
        if same_pinyin:
            cur_last_pinyin = self.pinyin_obj(cur_idiom, formater='simple')[-1]
            if same_tone:
                def _chains_on(idiom_obj):
                    return cur_last_pinyin == idiom_obj['pinyin'][0]
            else:
                cur_last_toneless = cur_last_pinyin[:-1]

                def _chains_on(idiom_obj):
                    return cur_last_toneless == idiom_obj['pinyin'][0][:-1]
        else:
            cur_last_char = cur_idiom[-1]

            def _chains_on(idiom_obj):
                return cur_last_char == idiom_obj['idiom'][0]

        # `already_used_idioms` holds idiom strings, so the membership test
        # must use the idiom text, never the (unhashable) entry itself.
        backup_idioms = [
            idiom_obj for idiom_obj in self.idiom_list
            if idiom_obj['idiom'] not in self.already_used_idioms
            and _chains_on(idiom_obj)]

        if len(backup_idioms) == 0:
            return 'can not find next'

        if with_prob:
            result = self._random_select(backup_idioms)
        else:
            result = random.choice(backup_idioms)

        self.already_used_idioms.add(result['idiom'])
        return result['idiom']
Exemplo n.º 7
0
def char2word(char_entity_list, word_token_list, verbose=False):
    """Re-index character-level NER entities onto word-level tokens.

    The entity structure is unchanged; only ``'offset'`` is rewritten from
    character indices to word indices. An entity whose start or end does
    not fall on a word boundary conflicts with the segmentation and is
    dropped. Handles a single sample only, not a batch.

    According to the original author's experience, segmentation errors
    cause entities to be dropped at a rate of about 4.62% with jieba and
    about 3.44% with pkuseg.

    Args:
        char_entity_list: entities whose offsets refer to characters.
        word_token_list: the list of words produced by a segmenter.
        verbose(bool): if True, log every dropped entity together with the
            words it straddles.

    Returns:
        list: entities whose offsets refer to words.

    Examples:
        >>> char_entity_list = [
        ...     {'text': '胡静静', 'offset': [0, 3], 'type': 'Person'},
        ...     {'text': '江西红叶建筑公司', 'offset': [5, 13], 'type': 'Company'}]
        >>> word_token_list = ['胡静静', '喜欢', '江西', '红叶', '建筑', '公司']
        >>> print(char2word(char_entity_list, word_token_list))
        [{'type': 'Person', 'offset': [0, 1], 'text': '胡静静'}, {'type': 'Company', 'offset': [2, 6], 'text': '江西红叶建筑公司'}]

    """
    # Character offset of every word boundary; boundaries[i] is where word
    # i starts and boundaries[-1] is the total text length.
    boundaries = [0]
    for word in word_token_list:
        boundaries.append(boundaries[-1] + len(word))

    # Character offset -> word index. The first index wins, so duplicated
    # boundaries (empty words) resolve the way list.index() would.
    boundary_to_word = dict()
    for word_idx, char_offset in enumerate(boundaries):
        boundary_to_word.setdefault(char_offset, word_idx)

    word_entity_list = list()
    for char_entity in char_entity_list:
        char_start = char_entity['offset'][0]
        char_end = char_entity['offset'][1]
        start = boundary_to_word.get(char_start)
        end = boundary_to_word.get(char_end)

        if start is not None and end is not None:
            word_entity_list.append(
                {'type': char_entity['type'], 'offset': [start, end],
                 'text': char_entity['text']})
            continue

        if not verbose:
            continue

        # Widen the span to the enclosing word boundaries for the log. The
        # defaults keep this from failing when an offset lies outside the
        # text, where no smaller/larger boundary exists.
        if start is None:
            start = boundary_to_word[max(
                (offset for offset in boundaries if offset < char_start),
                default=boundaries[0])]
        if end is None:
            end = boundary_to_word[min(
                (offset for offset in boundaries if offset > char_end),
                default=boundaries[-1])]
        logging.warning(
            'the entity {} =/=> {}'.format(
                char_entity, word_token_list[start: end]))

    return word_entity_list