Example #1
def _wrong_message(_idx, ts):
    if verbose:
        logging.info(token_list)
        logging.info(tags)
        logging.warning('wrong tag: {}'.format(
            ts[start if start is not None
               else max(0, _idx - 2): _idx + 2]))
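
This helper refers to `verbose`, `token_list`, `tags` and `start`, none of which are among its parameters, so it only works as a nested function. A minimal sketch of how such a closure might be wired into a tag-checking loop is shown below; the surrounding `check_tags` function, its arguments and the BIO-style check are hypothetical illustrations, not part of the library.

import logging

def check_tags(token_list, tags, verbose=True):
    # Hypothetical wrapper: walks a BIO tag sequence and logs suspicious spans.
    start = None  # start index of the entity currently being scanned, if any

    def _wrong_message(_idx, ts):
        # log the tokens and tags around the offending position for debugging
        if verbose:
            logging.info(token_list)
            logging.info(tags)
            logging.warning('wrong tag: {}'.format(
                ts[start if start is not None
                   else max(0, _idx - 2): _idx + 2]))

    for idx, tag in enumerate(tags):
        if tag.startswith('I-') and (idx == 0 or tags[idx - 1] == 'O'):
            # an I- tag that does not continue a previous entity is invalid
            _wrong_message(idx, tags)
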
Example #2
def analyse_dataset(dataset_x,
                    dataset_y,
                    ratio=[0.8, 0.05, 0.15],
                    shuffle=True,
                    multi_label=False):
    """ 将数据集按照训练、验证、测试进行划分,统计数据集中各个类别的数量和占比,计算训练、
    验证、测试集的相对熵,判断数据集分割是否合理。其中,dismatch 信息比例越低,证明数据集
    划分的各类别比例越贴近数据全集的分布。

    Args:
        dataset_x: input samples of the dataset
        dataset_y: output labels of the dataset
        ratio: proportions of the training, validation and test sets
        shuffle: whether to shuffle the dataset
        multi_label: whether the dataset is multi-label

    Return:
        train_x, train_y, valid_x, valid_y, test_x, test_y, stats(dict):
            stats holds the statistics of the dataset (counts, proportions, relative entropy)

    Examples:
        >>> import jionlp as jio
        >>> dataset_x = ['美股大涨...', '金融市场开放...', '小米无屏电视...', ...]
        >>> dataset_y = ['财经', '财经', '科技', ...]
        >>> train_x, train_y, valid_x, valid_y, test_x, test_y, stats = \
            ... jio.text_classification.analyse_dataset(dataset_x, dataset_y)
        >>> print(stats)

            whole dataset:
            财经                            32,268        84.52%
            科技                             5,910        15.48%
            total                           38,178       100.00%

            train dataset: 80.00%
            财经                            25,848        84.63%
            科技                             4,694        15.37%
            total                           30,542       100.00%

            valid dataset: 5.00%
            财经                            32,268        84.52%
            科技                             5,910        15.48%
            total                            1,908       100.00%

            test dataset: 15.00%
            财经                             4,840        84.53%
            科技                               886        15.47%
            total                            5,726       100.00%

            train KL divergence: 0.000007, info dismatch: 0.00%
            valid KL divergence: 0.001616, info dismatch: 0.26%
            test KL divergence: 0.000000, info dismatch: 0.00%

    """
    dataset = [[sample_x, sample_y]
               for sample_x, sample_y in zip(dataset_x, dataset_y)]

    if shuffle:
        random.shuffle(dataset)

    has_kl = False
    for i in range(3):
        # To obtain a good split, re-split whenever the result is poor (high
        # relative entropy or missing classes) so that each subset's class
        # distribution matches the full dataset. If all three attempts fail,
        # the last split is returned as-is.
        # Count the number and proportion of samples in each class
        stats = {'train': None, 'valid': None, 'test': None, 'total': None}
        dataset_stat = _stat_class(dataset_y, multi_label=multi_label)
        stats['total'] = dataset_stat

        tmp_ds = list()
        current = 0
        for s in ratio:
            num = int(len(dataset) * s)
            tmp_ds.append(dataset[current:current + num])
            current += num

        train_x = [item[0] for item in tmp_ds[0]]
        train_y = [item[1] for item in tmp_ds[0]]
        valid_x = [item[0] for item in tmp_ds[1]]
        valid_y = [item[1] for item in tmp_ds[1]]
        test_x = [item[0] for item in tmp_ds[2]]
        test_y = [item[1] for item in tmp_ds[2]]

        # Compute the statistics of each data subset
        train_stat = _stat_class(train_y, multi_label=multi_label)
        stats['train'] = train_stat
        valid_stat = _stat_class(valid_y, multi_label=multi_label)
        stats['valid'] = valid_stat
        test_stat = _stat_class(test_y, multi_label=multi_label)
        stats['test'] = test_stat

        if not (len(train_stat) == len(valid_stat) == len(test_stat)):
            # The subsets do not cover the same number of classes; re-split
            continue

        # Compute the KL divergence
        has_kl = True
        train_kl_value, train_ratio = _compute_kl_divergence(
            np.array([item[1][1] for item in sorted(dataset_stat.items())]),
            np.array([item[1][1] for item in sorted(train_stat.items())]))
        valid_kl_value, valid_ratio = _compute_kl_divergence(
            np.array([item[1][1] for item in sorted(dataset_stat.items())]),
            np.array([item[1][1] for item in sorted(valid_stat.items())]))
        test_kl_value, test_ratio = _compute_kl_divergence(
            np.array([item[1][1] for item in sorted(dataset_stat.items())]),
            np.array([item[1][1] for item in sorted(test_stat.items())]))

        if (train_ratio > 0.05) or (valid_ratio > 0.05) or (test_ratio > 0.05):
            # The divergence ratio exceeds the threshold, i.e. the class proportions
            # of the splits do not match the full dataset; re-split
            continue

        break

    # Log the statistics
    stats_fmt = '{0:<20s}\t{1:>8,d}\t{2:>2.2%}'
    total_fmt = stats_fmt + '\n'
    logging.info('whole dataset:')
    for _class, info in stats['total'].items():
        logging.info(stats_fmt.format(_class, info[0], info[1]))
    sum_res = sum([info[1] for info in stats['total'].values()])
    logging.info(total_fmt.format('total', len(dataset_y), sum_res))

    logging.info('train dataset: {:.2%}'.format(ratio[0]))
    for _class, info in stats['train'].items():
        logging.info(stats_fmt.format(_class, info[0], info[1]))
    sum_res = sum([info[1] for info in stats['train'].values()])

    logging.info(total_fmt.format('total', len(train_y), sum_res))

    logging.info('valid dataset: {:.2%}'.format(ratio[1]))
    for _class, info in stats['valid'].items():
        logging.info(stats_fmt.format(_class, info[0], info[1]))
    sum_res = sum([info[1] for info in stats['valid'].values()])
    logging.info(total_fmt.format('total', len(valid_y), sum_res))

    logging.info('test dataset: {:.2%}'.format(ratio[2]))
    for _class, info in stats['test'].items():
        logging.info(stats_fmt.format(_class, info[0], info[1]))
    sum_res = sum([info[1] for info in stats['test'].values()])
    logging.info(total_fmt.format('total', len(test_y), sum_res))

    if has_kl:
        kl_fmt = 'KL divergence: {0:.>2f}, info dismatch: {1:.2%}'
        logging.info('train ' + kl_fmt.format(train_kl_value, train_ratio))
        logging.info('valid ' + kl_fmt.format(valid_kl_value, valid_ratio))
        logging.info('test ' + kl_fmt.format(test_kl_value, test_ratio))

    return train_x, train_y, valid_x, valid_y, test_x, test_y, stats
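
The function above depends on two private helpers that are not shown, `_stat_class` and `_compute_kl_divergence`. The sketch below is only an inference from the call sites, not the library's actual code: `_stat_class` appears to return a mapping `{class: [count, ratio]}`, and `_compute_kl_divergence` appears to return the KL divergence between the whole-dataset class distribution and a split's distribution together with the "info dismatch" ratio; normalising that ratio by the entropy of the reference distribution is an assumption.

import collections

import numpy as np

def _stat_class_sketch(labels):
    # Assumed behaviour of _stat_class for single-label data: count each class
    # and its share of the total, returning {class: [count, ratio]} to match
    # the item[1][0] / item[1][1] indexing used above.
    counter = collections.Counter(labels)
    total = sum(counter.values())
    return {cls: [num, num / total] for cls, num in counter.most_common()}

def _compute_kl_divergence_sketch(p, q, eps=1e-10):
    # KL(p || q) between the full-dataset distribution p and a split's
    # distribution q; the 'info dismatch' ratio is assumed here to be the KL
    # value normalised by the entropy of p.
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    kl_value = float(np.sum(p * np.log(p / q)))
    entropy = float(-np.sum(p * np.log(p)))
    return kl_value, kl_value / entropy
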
Example #3
def analyse_dataset(dataset_x, dataset_y, ratio=[0.8, 0.05, 0.15], shuffle=True):
    ''' Split the NER dataset into training, validation and test sets, count the
    number and proportion of entities of each class, and compute the relative entropy
    of the training, validation and test sets to judge whether the split is
    reasonable. The lower the "info dismatch" ratio, the closer the class proportions
    of each split are to the distribution of the full dataset.
    
    Args:
        dataset_x: input samples of the dataset
        dataset_y: output labels of the dataset
        ratio: proportions of the training, validation and test sets
        shuffle: whether to shuffle the dataset

    Return:
        train_x, train_y, valid_x, valid_y, test_x, test_y, stats(dict):
            stats holds the statistics of the dataset (counts, proportions, relative entropy)
        
    Examples:
        >>> import jionlp as jio
        >>> dataset_x = ['马成宇在...', 
                         '金融国力教育公司...', 
                         '延平区人民法院曾经...',
                         ...]
        >>> dataset_y = [[{'type': 'Person', 'text': '马成宇', 'offset': (0, 3)}],
                         [{'type': 'Company', 'text': '国力教育公司', 'offset': (2, 8)}],
                         [{'type': 'Organization', 'text': '延平区人民法院', 'offset': (0, 7)}],
                         ...]
        >>> train_x, train_y, valid_x, valid_y, test_x, test_y, stats = \
            ... jio.ner.analyse_dataset(dataset_x, dataset_y)
        >>> print(stats)

            whole dataset:
            Company                    573        39.68%
            Person                     495        34.28%
            Organization               376        26.04%
            total                    3,000        100.00%

            train dataset: 80.00%
            Company                    464        40.38%
            Person                     379        32.99%
            Organization               306        26.63%
            total                    2,400        100.00%

            valid dataset: 5.00%
            Person                      32        47.06%
            Company                     22        32.35%
            Organization                14        20.59%
            total                      150        100.00%

            test dataset: 15.00%
            Company                     87        38.33%
            Person                      84        37.00%
            Organization                56        24.67%
            total                      450        100.00%

            train KL divergence: 0.000546, info dismatch: 0.03%
            valid KL divergence: 0.048423, info dismatch: 3.10%
            test KL divergence: 0.002364, info dismatch: 0.15%

    '''
    dataset = [[sample_x, sample_y] for sample_x, sample_y
               in zip(dataset_x, dataset_y)]
    
    if shuffle:
        random.shuffle(dataset)

    has_kl = False
    for i in range(3):
        # To obtain a good split, re-split whenever the result is poor (high
        # relative entropy or missing classes) so that each subset's class
        # distribution matches the full dataset. If all three attempts fail,
        # the last split is returned as-is.
        # Count the number and proportion of entities in each class
        stats = {'train': None, 'valid': None, 'test': None, 'total': None}
        dataset_stat = _stat_class(dataset_y)
        stats['total'] = dataset_stat

        tmp_ds = list()
        current = 0
        for s in ratio:
            num = int(len(dataset) * s)
            tmp_ds.append(dataset[current: current + num])
            current += num

        train_x = [item[0] for item in tmp_ds[0]]
        train_y = [item[1] for item in tmp_ds[0]]
        valid_x = [item[0] for item in tmp_ds[1]]
        valid_y = [item[1] for item in tmp_ds[1]]
        test_x = [item[0] for item in tmp_ds[2]]
        test_y = [item[1] for item in tmp_ds[2]]

        # Compute the statistics of each data subset
        train_stat = _stat_class(train_y)
        stats['train'] = train_stat
        valid_stat = _stat_class(valid_y)
        stats['valid'] = valid_stat
        test_stat = _stat_class(test_y)
        stats['test'] = test_stat
        
        if not (len(train_stat) == len(valid_stat) == len(test_stat)):
            # The subsets do not cover the same number of classes; re-split
            continue

        # Compute the KL divergence
        has_kl = True
        train_kl_value, train_ratio = _compute_kl_divergence(
            np.array([item[1][1] for item in sorted(dataset_stat.items())]),
            np.array([item[1][1] for item in sorted(train_stat.items())]))
        valid_kl_value, valid_ratio = _compute_kl_divergence(
            np.array([item[1][1] for item in sorted(dataset_stat.items())]),
            np.array([item[1][1] for item in sorted(valid_stat.items())]))
        test_kl_value, test_ratio = _compute_kl_divergence(
            np.array([item[1][1] for item in sorted(dataset_stat.items())]),
            np.array([item[1][1] for item in sorted(test_stat.items())]))

        if (train_ratio > 0.05) or (valid_ratio > 0.05) or (test_ratio > 0.05):
            # The divergence ratio exceeds the threshold, i.e. the class proportions
            # of the splits do not match the full dataset; re-split
            continue
            
        break

    # Log the statistics
    stats_fmt = '{0:<20s}\t{1:>8,d}\t{2:>2.2%}'
    total_fmt = stats_fmt + '\n'
    logging.info('whole dataset:')
    for _class, info in stats['total'].items():
        logging.info(stats_fmt.format(_class, info[0], info[1]))
    sum_res = sum([info[0] for info in stats['total'].values()])
    logging.info(total_fmt.format('total', sum_res, 1.))
    
    logging.info('train dataset: {:.2%}'.format(ratio[0]))
    for _class, info in stats['train'].items():
        logging.info(stats_fmt.format(_class, info[0], info[1]))
    sum_res = sum([info[0] for info in stats['train'].values()])
    logging.info(total_fmt.format('total', sum_res, 1.))
    
    logging.info('valid dataset: {:.2%}'.format(ratio[1]))
    for _class, info in stats['valid'].items():
        logging.info(stats_fmt.format(_class, info[0], info[1]))
    sum_res = sum([info[0] for info in stats['valid'].values()])
    logging.info(total_fmt.format('total', sum_res, 1.))
    
    logging.info('test dataset: {:.2%}'.format(ratio[2]))
    for _class, info in stats['test'].items():
        logging.info(stats_fmt.format(_class, info[0], info[1]))
    sum_res = sum([info[0] for info in stats['test'].values()])
    logging.info(total_fmt.format('total', sum_res, 1.))
    
    if has_kl:
        kl_fmt = 'KL divergence: {0:.>2f}, info dismatch: {1:.2%}'
        logging.info('train ' + kl_fmt.format(train_kl_value, train_ratio))
        logging.info('valid ' + kl_fmt.format(valid_kl_value, valid_ratio))
        logging.info('test ' + kl_fmt.format(test_kl_value, test_ratio))
    
    return train_x, train_y, valid_x, valid_y, test_x, test_y, stats
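
In this NER variant `dataset_y` holds one list of entity dicts per sample, so `_stat_class` presumably counts entity types rather than sample labels. The following is again a sketch inferred from the call sites and the docstring output, not the library's implementation:

import collections

def _stat_class_ner_sketch(dataset_y):
    # dataset_y: list of samples, each a list of entity dicts such as
    # {'type': 'Person', 'text': '马成宇', 'offset': (0, 3)}
    counter = collections.Counter(
        entity['type'] for entities in dataset_y for entity in entities)
    total = sum(counter.values())
    # assumed return shape {entity_type: [count, ratio]}, matching the
    # item[1][1] indexing used when computing the KL divergence
    return {etype: [num, num / total] for etype, num in counter.most_common()}
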
Example #4
def analyse_freq_words(dataset_x: List[List[str]],
                       dataset_y: List[Any],
                       min_word_freq=10,
                       min_word_threshold=0.8):
    """ 采用朴素贝叶斯的概率分布,分析文本分类语料中,各个类别的高频特征词汇,用于制作类型词典,
    分析完毕后,方便加入模型当中,形成有效的模型和规则词典相结合的模型,提高模型的稳定性以及
    F1 值。具体来讲,获取每个类别 y 对应的词汇 x 的条件分布,即 p(x|y),找出其中概率最高,即
    大于 min_word_threshold 的词汇表,即该类的特征词。对于一般的判别式神经网络模型而言,这些
    词汇的概率在数据量偏少的情况下,或模型参数量过大的情况下,是很难学习得到的。因此,将这些词
    信息以各种形式融入模型,可以有效提升模型的 F1 值。根据经验,往往能提升 2% ~ 8%。

    Args:
        dataset_x: word lists after tokenization and stopword removal
        dataset_y: class labels of the texts
        min_word_freq: minimum word frequency; words occurring fewer than
            min_word_freq times in the corpus are ignored
        min_word_threshold: minimum probability a word must reach to be returned
            as a high-frequency feature word of a class

    Return:
        Dict[Dict[str, List[int, float]]]: the high-frequency feature words of each
            class, together with their corpus frequency and probability.

    Examples:
        >>> import jieba
        >>> import jionlp as jio
        >>> dataset_x = ['房间比较差,挺糟糕的,尤其是洗手间。',
                         '真糟糕!连热水都没有。',
                         '价格比比较不错的酒店。']
        >>> dataset_y = ['负', '负', '正']
        >>> dataset_x = [jieba.lcut(text) for text in dataset_x]  # any tokenizer works here
        >>> dataset_x = [jio.remove_stopwords(text_segs) for text_segs in dataset_x]  # remove stopwords
        >>> result = jio.text_classification.analyse_freq_words(
            ... dataset_x, dataset_y, min_word_freq=1)

        {
            '负': {
                '糟糕': [2, 1.0],
                '没有': [1, 1.0],
                '差': [1, 1.0]
            },
            '正': {
                '不错': [1, 1.0]
            }
        }

    """
    # Collect the set of class labels
    class_list = list(set(dataset_y))
    logging.info('classes found in the dataset: {}'.format(class_list))

    # Count the vocabulary and word frequencies
    word_list = list()
    for item in dataset_x:
        word_list.extend(item)
    word_dict = dict([
        item for item in collections.Counter(word_list).most_common()
        if item[1] >= min_word_freq
    ])

    # Compute each word's share of occurrences within each class
    tmp_word_dict = dict([tuple([word, [0, 0]]) for word in word_dict])
    class_words_statistics = dict()
    for _class in class_list:
        class_words_statistics.update({_class: copy.deepcopy(tmp_word_dict)})

    for text_segs, label in zip(dataset_x, dataset_y):
        for word in text_segs:
            if word in word_dict:
                class_words_statistics[label][word][0] += 1

    result = dict()
    for label, words_statistics in class_words_statistics.items():
        for word, stats in words_statistics.items():
            stats[1] = stats[0] / word_dict[word]
        sorted_result = sorted([
            item for item in words_statistics.items()
            if item[1][1] > min_word_threshold
        ],
                               key=lambda i: i[1][1],
                               reverse=True)
        result.update({label: dict(sorted_result)})

    return result
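
The docstring suggests feeding the extracted feature words back into a model as a rule dictionary. One simple way to combine them with a classifier is a keyword fallback for low-confidence predictions; the sketch below is only an illustration, and the `model_predict` callable and the 0.9 confidence cut-off are assumptions, not part of jionlp.

def classify_with_rules(text_segs, model_predict, freq_words):
    # text_segs: tokenized, stopword-filtered input text
    # model_predict: hypothetical callable returning (label, confidence)
    # freq_words: output of analyse_freq_words, {label: {word: [freq, prob]}}
    label, confidence = model_predict(text_segs)
    if confidence >= 0.9:
        return label

    # low-confidence prediction: fall back to counting feature-word hits per class
    hits = {cls: sum(1 for word in text_segs if word in words)
            for cls, words in freq_words.items()}
    best_cls, best_hits = max(hits.items(), key=lambda item: item[1])
    return best_cls if best_hits > 0 else label
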