Example #1
def word_distribution_loader():
    """ 加载 jieba 分词后的词汇结果在中文文本中的词频分布,返回每个词在语料中的出现总次数、概率、
    概率的 -log10 值。

    Returns:
        dict(list): 例如
            {'国家': {'total_num': 101930,
                    'prob': 0.0014539722,
                    'log_prob': 3.2632870},
             ...}

    """
    word_info = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'word_distribution.json'))

    word_info_dict = dict()
    total_num = sum([item[1] for item in word_info])
    for item in word_info:
        word_info_dict.update({
            item[0]: {
                'total_num': item[1],
                'prob': item[1] / total_num,
                'log_prob': -math.log10(item[1] / total_num)
            }
        })

    return word_info_dict
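
The helpers read_file_by_line and GRAND_DIR_PATH are defined elsewhere in the package. For reference, a hypothetical minimal sketch of what the loaders above assume (each line of a .json dictionary file holds one [word, count] pair; the real implementation may differ):

import json
import os

# hypothetical stand-ins; the real definitions live elsewhere in the package
GRAND_DIR_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

def read_file_by_line(file_path, strip=True):
    """ Read a file line by line; lines of .json files are parsed as JSON
    (assumed to be [key, count] pairs), .txt lines are returned as strings """
    lines = list()
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if strip:
                line = line.strip()
                if line == '':
                    continue
            if file_path.endswith('.json'):
                line = json.loads(line)
            lines.append(line)
    return lines
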
Example #2
def char_radical_loader():
    """ 加载汉字字形词典 char_radical.txt """
    structure_dict = {
        0: '一体结构',
        1: '左右结构',
        2: '上下结构',
        3: '左中右结构',
        4: '上中下结构',
        5: '右上包围结构',
        6: '左上包围结构',
        7: '左下包围结构',
        8: '全包围结构',
        9: '半包围结构'
    }

    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'char_radical.txt'))

    map_dict = dict()
    for item in content:
        segs = item.split('\t')
        assert len(segs) == 5
        char, radical, structure, four_corner, components = segs
        map_dict.update(
            {char: [radical, int(structure), four_corner, components]})

    return map_dict, structure_dict
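
A hedged usage sketch; the looked-up character and the printed values are illustrative, not taken from the real char_radical.txt:

map_dict, structure_dict = char_radical_loader()

# each value is [radical, structure index, four-corner code, components]
radical, structure, four_corner, components = map_dict['好']  # hypothetical entry
print(radical)                    # e.g. '女'
print(structure_dict[structure])  # e.g. '左右结构' (left-right structure)
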
Example #3
def chinese_char_dictionary_loader():
    """ 加载新华字典,分别包括:
    汉字,释义,详细释义 3 部分

    考虑到新华字典无法与时俱进,其中有相当多的老旧内容,故增删说明如下:
        1、删除了所有的日本和字 -> 释义中包含 “日本和字” 内容,如 “桛 ā 1.日本和字。”;
        2、删除了释义未详的字 -> 释义中包含 “义未详” 内容,如 “穝zuō## ⒈义未详。”
        3、删除了低频汉字 -> 释义中字频低于亿分之一的,且不在 char_distribution.json 中的字。
            如 “葨	葨wēi 1.见"葨芝"。”

    """
    content = read_file_by_line(os.path.join(GRAND_DIR_PATH, 'dictionary',
                                             'chinese_char_dictionary.txt'),
                                strip=False)

    char_dict = dict()
    for idx, line in enumerate(content):
        segs = line.split('\t')

        assert len(segs) == 3
        char_dict.update({
            segs[0]: {
                'explanation': segs[1],
                'more_details': (segs[2].replace('\n', '')
                                 if segs[2] != '\n' else None)
            }
        })

    return char_dict
Example #4
def char_distribution_loader():
    """ 加载 utf-8 编码字符在中文文本中的分布,返回每个字在语料中的出现总次数、概率、
    概率的 -log10 值。

    Returns:
        dict(list): 例如
            {'中': {'total_num': 61980430,
                    'prob': 0.0054539722,
                    'log_prob': 2.2632870},
             ...}

    """
    char_info = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'char_distribution.json'))

    char_info_dict = dict()
    total_num = sum([item[1] for item in char_info])
    for item in char_info:
        char_info_dict.update({
            item[0]: {
                'total_num': item[1],
                'prob': item[1] / total_num,
                'log_prob': -math.log10(item[1] / total_num)
            }
        })

    return char_info_dict
Example #5
def world_location_loader():
    ''' Load the world place-name dictionary world_location.txt '''
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary/world_location.txt'))
    
    result = dict()
    cur_continent = None
    
    for line in content:
        if '洲:' in line:  # continent header line, e.g. '亚洲:'
            cur_continent = line.replace(':', '')
            result.update({cur_continent: dict()})
            continue
        
        item_tup = line.split('\t')
        item_length = len(item_tup)
        if item_length == 3:
            result[cur_continent].update(
                {item_tup[0]: {'full_name': item_tup[1],
                               'capital': item_tup[2]}})
        elif item_length == 4:
            result[cur_continent].update(
                {item_tup[0]: {'full_name': item_tup[1],
                               'capital': item_tup[2],
                               'main_city': item_tup[3].split('/')}})
        
    return result
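
The result nests countries under their continent. A hedged sketch of how it might be consumed (the continent key is illustrative):

result = world_location_loader()

asia = result.get('亚洲', dict())  # hypothetical continent key
for country, info in asia.items():
    # 'main_city' only exists for the four-column entries
    print(country, info['full_name'], info['capital'], info.get('main_city'))
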
Example #6
def stopwords_loader():
    """ 加载停用词典 stopwords.txt """
    res = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary/stopwords.txt'))
    # the file itself usually leaves out a few whitespace tokens, so add them
    res.extend(['', ' ', '\t'])
    return res
Example #7
def chinese_char_dictionary_loader():
    """ 加载百度汉语字典,字典与新华字典大同小异,分别包括:
    汉字,偏旁,字形结构,四角编码,笔画顺序,繁体字,五笔输入编码,拼音,释义

    本词典囊括了 utf-8 编码中,“一~龥”的所有汉字,但有所删减
    考虑到百度汉语字典无法与时俱进,其中有相当多的老旧内容,故增删说明如下:
        1、删除了所有的日本和字 -> 释义中包含 “日本汉字/日本地名用字” 内容,如 “桛 ā 1.日本和字。”;
        2、删除了释义未详的字 -> 释义中包含 “义未详” 内容,或某个字的某个读音义未详,如 “穝zuō## ⒈义未详。”
        3、删除了低频汉字 -> 释义中字频低于亿分之一的,且不在 char_distribution.json 中的字。
            如 “葨wēi 1.见"葨芝"。”
        4、删除了所有的韩国、朝鲜创字、用字、用意 -> 櫷guī槐木的一种(韩国汉字)
        5、删除了古代用字、用意 -> 释义中包含  “古同~/古代~/古通~/古书~/古地名/古人名” 内容,
            但如有多个释义,且其中有非古代释义,则保留该汉字;如 “鼃 wā 古同蛙”。但常见古字,如“巙kuí”

        共计删减 3402 字。

    """
    content = read_file_by_line(os.path.join(GRAND_DIR_PATH, 'dictionary',
                                             'chinese_char_dictionary.txt'),
                                strip=False)

    pinyin_ptn = re.compile(r'\[[a-zàáāǎòóōǒèéēěìíīǐùúūǔǜǘǖǚǹńňüḿ]{1,8}\]')
    explanation_ptn = re.compile(r'\d{1,2}\.')

    char_dict = dict()
    for idx, line in enumerate(content):
        segs = line.split('\t')

        assert len(segs) == 8

        # split out each reading and its senses
        pinyin_list = [item[1:-1] for item in pinyin_ptn.findall(segs[-1])]
        explanation_list = [
            item for item in pinyin_ptn.split(segs[-1].replace(
                '~', segs[0]).strip()) if item != ''
        ]
        assert len(pinyin_list) == len(explanation_list)

        pinyin_explanation_dict = dict()
        for pinyin, explanations in zip(pinyin_list, explanation_list):
            explanations = [
                ex for ex in explanation_ptn.split(explanations) if ex != ''
            ]
            pinyin_explanation_dict.update({pinyin: explanations})

        char_dict.update({
            segs[0]: {
                'radical': segs[1],
                'structure': STRUCTURE_DICT[int(segs[2])],
                'corner_coding': segs[3],
                'stroke_order': segs[4],
                'traditional_version': segs[5],
                'wubi_coding': segs[6],
                'pinyin': pinyin_explanation_dict
            }
        })

    return char_dict
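
To illustrate how the combined pinyin/explanation field is parsed, a standalone demo on a made-up sample (the format is inferred from the regexes above, not from the real dictionary file):

import re

pinyin_ptn = re.compile(r'\[[a-zàáāǎòóōǒèéēěìíīǐùúūǔǜǘǖǚǹńňüḿ]{1,8}\]')
explanation_ptn = re.compile(r'\d{1,2}\.')

sample = '[hǎo]1.good.2.fine.[hào]1.to like.'  # made-up field value
pinyin_list = [item[1:-1] for item in pinyin_ptn.findall(sample)]
explanation_list = [item for item in pinyin_ptn.split(sample) if item != '']
print(pinyin_list)       # ['hǎo', 'hào']
print(explanation_list)  # ['1.good.2.fine.', '1.to like.']
print([ex for ex in explanation_ptn.split(explanation_list[0]) if ex != ''])
# ['good.', 'fine.']
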
Example #8
def chinese_idiom_loader():
    ''' Load the idiom dictionary chinese_idiom.txt '''
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary/chinese_idiom.txt'))

    result = dict()
    for line in content:
        item_tup = line.split('\t')
        result.update({item_tup[0]: int(item_tup[1])})
    return result
Example #9
def traditional_simplified_loader(file_name):
    """ 加载繁简体转换词典 """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', file_name))

    map_dict = dict()
    for item in content:
        key, value = item.split('\t')
        map_dict.update({key: value})
    return map_dict
Example #10
def idf_loader():
    """ 加载 idf 文件,属于 tfidf 算法的一部分 """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'idf.txt'))

    idf_dict = dict()
    for item in content:
        word, idf_value = item.split('\t')
        idf_dict.update({word: float(idf_value)})

    return idf_dict
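
For context, idf values like these are typically precomputed as log(N / df) over a large corpus; a toy reminder of the arithmetic (not how idf.txt was actually built):

import math

documents = [['中国', '北京'], ['中国', '上海'], ['天气', '晴']]
total_docs = len(documents)
df = sum(1 for doc in documents if '中国' in doc)  # document frequency = 2
idf_value = math.log(total_docs / df)              # log(3 / 2) ≈ 0.405
print(idf_value)
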
Example #11
def pinyin_phrase_loader():
    """ Load the phrase pinyin dictionary pinyin_phrase.txt """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'pinyin_phrase.txt'))

    map_dict = dict()
    for item in content:
        key, value = item.split('\t')
        value = value.split('/')
        map_dict.update({key: value})

    return map_dict
Example #12
def pkuseg_postag_loader():
    ''' Load the part-of-speech tag mapping table for the pkuseg tokenizer '''
    content = read_file_by_line(os.path.join(DIR_PATH,
                                             'pkuseg_postag_map.txt'))

    pkuseg_postag_map = dict()
    for line in content:
        segs = line.split('\t')
        pkuseg_postag_map.update({segs[0]: segs[1]})

    return pkuseg_postag_map
Example #13
def telecom_operator_loader():
    """ 加载通信运营商手机号码的匹配词典
    """
    telecom_operator = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'telecom_operator.txt'))

    telecom_operator_dict = dict()
    for line in telecom_operator:
        num, operator = line.strip().split(' ')
        telecom_operator_dict.update({num: operator})

    return telecom_operator_dict
Example #14
def china_location_loader():
    ''' Load the Chinese place-name dictionary china_location.txt '''
    location_jio = read_file_by_line(os.path.join(
        GRAND_DIR_PATH, 'dictionary/china_location.txt'),
                                     strip=False)

    cur_province = None
    cur_city = None
    cur_county = None
    location_dict = dict()

    for item in location_jio:
        if not item.startswith('\t'):  # province
            if len(item.strip().split('\t')) != 3:
                continue
            province, admin_code, alias_name = item.strip().split('\t')
            cur_province = province
            location_dict.update({
                cur_province: {
                    '_full_name': province,
                    '_alias': alias_name,
                    '_admin_code': admin_code
                }
            })

        elif item.startswith('\t\t'):  # county
            if len(item.strip().split('\t')) != 3:
                continue
            county, admin_code, alias_name = item.strip().split('\t')
            cur_county = county
            location_dict[cur_province][cur_city].update({
                cur_county: {
                    '_full_name': county,
                    '_alias': alias_name,
                    '_admin_code': admin_code
                }
            })

        else:  # city
            if len(item.strip().split('\t')) != 3:
                continue
            city, admin_code, alias_name = item.strip().split('\t')
            cur_city = city
            location_dict[cur_province].update({
                cur_city: {
                    '_full_name': city,
                    '_alias': alias_name,
                    '_admin_code': admin_code
                }
            })

    return location_dict
Example #15
def xiehouyu_loader():
    """ 加载歇后语词典,共计 17000 余条,其中有相似的歇后语,如:
    一个模子出来的  一个样
    一个模子出来的  一模一样
    对于此类歇后语,均按不同的表达分为不同的歇后语,方便检索查询
    """
    xiehouyu = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'xiehouyu.txt'))

    xiehouyu = list(set(xiehouyu))
    xiehouyu = [item.split('\t') for item in xiehouyu]

    return xiehouyu
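
Each entry is a two-element list after the tab split (assuming one tab per line, which the file format implies); a brief usage sketch:

xiehouyu = xiehouyu_loader()
riddle, answer = xiehouyu[0]  # e.g. '一个模子出来的', '一模一样'
print(riddle, answer)
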
Example #16
def pinyin_char_loader():
    """ Load the character pinyin dictionary pinyin_char.txt """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'pinyin_char.txt'))

    map_dict = dict()
    for item in content:
        segs = item.split('\t')
        assert len(segs) == 2
        key, value = segs

        multi_pinyin = value.split('/')
        map_dict.update({key: multi_pinyin})

    return map_dict
Example #17
def chinese_word_dictionary_loader():
    """ 加载新华词典,词典中有 20 万余个多音字,分别包括:
    词语及其释义
    """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary',
                     'chinese_word_dictionary.txt'))

    word_dict = dict()
    for idx, line in enumerate(content):
        segs = line.split('\t')
        assert len(segs) == 2
        word_dict.update({segs[0]: segs[1]})

    return word_dict
Example #18
def sentiment_words_loader():
    """ 加载情感词典,并附带其对应的情感权重

    """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'sentiment_words.txt'))

    sentiment_words_dict = dict()
    for item in content:
        segs = item.split('\t')
        assert len(segs) == 2
        key, value = segs

        sentiment_words_dict.update({key: float(value)})

    return sentiment_words_dict
Example #19
def chinese_word_dictionary_loader():
    ''' Load the Xinhua word dictionary, more than 200,000 entries in total,
    each consisting of a word and its explanation.
    '''
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary',
                     'chinese_word_dictionary.txt'))

    word_list = list()
    for idx, line in enumerate(content):
        segs = line.split('\t')
        assert len(segs) == 2
        cur_item = {'word': segs[0], 'explanation': segs[1]}
        word_list.append(cur_item)

    return word_list
Example #20
def chinese_char_dictionary_loader():
    ''' Load the Xinhua character dictionary; each entry has 3 parts:
    character, explanation, and detailed explanation.
    '''
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary',
                     'chinese_char_dictionary.txt'), strip=False)
    
    char_dict = dict()
    for line in content:
        segs = line.split('\t')
        
        assert len(segs) == 3
        char_dict.update({
            segs[0]: {'explanation': segs[1],
                      'more_details': segs[2].replace('\n', '')
                      if segs[2] != '\n' else None}})
        
    return char_dict
Example #21
def chinese_word_dictionary_loader():
    """ 加载新华词典,词典中有 20 万余个多音字,分别包括:
    词语及其释义

    考虑到新华词典无法与时俱进,其中有相当多的老旧内容,故增删说明如下:
        1、删除了所有未出现在 word_distribution.json 中的词汇;
            可发现,词典由原先 26万条锐减至 3.3万条,即新华词典中大量的词条都已被淘汰,且有很多新词未加入词典。

    """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary',
                     'chinese_word_dictionary.txt'))

    word_dict = dict()
    for idx, line in enumerate(content):
        segs = line.split('\t')
        assert len(segs) == 2
        word_dict.update({segs[0]: segs[1]})

    return word_dict
Example #22
def chinese_idiom_loader():
    ''' Load the idiom dictionary chinese_idiom.txt '''
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary/chinese_idiom.txt'))
    
    result = dict()
    for line in content:
        item_tup = line.split('\t')
        
        assert len(item_tup) == 5
        example = item_tup[3] if item_tup[3] != '' else None
        cur_item = {'explanation': item_tup[1],
                    'derivation': item_tup[2],
                    'example': example,
                    'freq': int(item_tup[4])}
        result.update({item_tup[0]: cur_item})
    
    return result
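
A hedged lookup sketch; the idiom key is illustrative:

result = chinese_idiom_loader()
info = result.get('一模一样')  # hypothetical key
if info is not None:
    print(info['explanation'], info['derivation'], info['freq'])
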
Example #23
def phone_location_loader():
    """ 加载电话号码地址与运营商解析词典 """
    content = read_file_by_line(os.path.join(GRAND_DIR_PATH, 'dictionary',
                                             'phone_location.txt'),
                                strip=False)

    def return_all_num(line):
        """ 返回所有的手机号码中间四位字符串 """
        front, info = line.strip().split('\t')
        num_string_list = info.split(',')
        result_list = list()

        for num_string in num_string_list:
            if '-' in num_string:
                start_num, end_num = num_string.split('-')
                for i in range(int(start_num), int(end_num) + 1):
                    result_list.append('{:0>4d}'.format(i))
            else:
                result_list.append(num_string)

        result_list = [front + res for res in result_list]

        return result_list

    phone_location_dict = dict()
    cur_location = ''
    zip_code_location_dict = dict()
    area_code_location_dict = dict()
    for line in content:
        if line.startswith('\t'):
            res = return_all_num(line)
            for i in res:
                phone_location_dict.update({i: cur_location})

        else:
            cur_location, area_code, zip_code = line.strip().split('\t')
            zip_code_location_dict.update({zip_code: cur_location})
            area_code_location_dict.update({area_code: cur_location})

    return phone_location_dict, zip_code_location_dict, area_code_location_dict
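
The keys of phone_location_dict are a number's prefix plus its middle four digits, i.e. its first seven digits assuming three-digit prefixes (an assumption based on the parsing above). A hedged usage sketch:

phone_loc, zip_loc, area_code_loc = phone_location_loader()

number = '13900001234'  # made-up number
print(phone_loc.get(number[:7]))  # location for prefix '1390000', if present
print(area_code_loc.get('010'))   # hypothetical area code lookup
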
Example #24
def pinyin_char_loader():
    """ Load pinyin_char.txt and build a character-to-pinyin map """
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary', 'pinyin_char.txt'))

    map_dict = dict()
    for item in content:
        if len(item.split('\t')) != 2:  # no characters under this pinyin
            continue
        key, value = item.split('\t')
        for val in value:
            if val not in map_dict:
                map_dict.update({val: key})
            # else: a polyphonic character; keep the first pinyin seen
    return map_dict
Example #25
def chinese_char_dictionary_loader():
    ''' Load the Xinhua character dictionary, which includes over two thousand
    polyphonic characters. Each entry has 7 parts: character, old form,
    stroke count, pinyin, radical, explanation, and detailed explanation.
    '''
    content = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary',
                     'chinese_char_dictionary.txt'))

    char_list = list()
    for line in content:
        segs = line.split('\t')
        assert len(segs) == 7
        cur_item = {
            'word': segs[0],
            'old_word': segs[1],
            'strokes': segs[2],
            'pinyin': segs[3],
            'radicals': segs[4],
            'explanation': segs[5],
            'more_details': segs[6]
        }
        char_list.append(cur_item)

    return char_list
Example #26
def china_location_change_loader():
    """ 加载中国地名变更词典 china_location_change.txt
    整理了 2018 年至今国内政府批复修改的县级以上的地名变化。仅添加了地名的撤销变更,
    而对未撤销地名的新增地名,如深圳市光明区,不做记录,因为不影响工具的使用。

    Args:
        None

    Returns:
        dict: 返回 省、市、县区 三级的变更地址,以及变更日期和批准部门;
            '国批' 表示国务院批准,'民批' 表示国务院民政部批准,
            '省批'表示省级政府或民政部批准。

    """
    location_change_jio = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary/china_location_change.txt'))

    location_change_list = list()
    for line in location_change_jio:
        line_seg = line.split('=>')
        orig_line_seg = line_seg[0].split('\t')
        new_line_seg = line_seg[1].split('\t')
        location_change_dict = {
            'date': orig_line_seg[0],
            'department': orig_line_seg[1],
            # old address as [full name, alias] pairs: province, city, county
            'old_loc': [orig_line_seg[2:4], orig_line_seg[4:6],
                        orig_line_seg[6:8]],
            'new_loc': new_line_seg
        }
        location_change_list.append(location_change_dict)

    return location_change_list
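
One parsed record pairs the old three-level address, each level as a [full name, alias] pair, with its replacement. A brief consumption sketch:

changes = china_location_change_loader()
first = changes[0]
print(first['date'], first['department'])
print(first['old_loc'])  # [[province, alias], [city, alias], [county, alias]]
print(first['new_loc'])  # the replacement name(s)
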
Example #27
def stopwords_loader():
    ''' Load the stopword dictionary stopwords.txt '''
    return read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary/stopwords.txt'))
Example #28
def pornography_loader():
    """ 加载淫秽色情词典 pornography.txt """
    return read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary/pornography.txt'))
Example #29
def china_location_loader(detail=False):
    """ 加载中国地名词典 china_location.txt

    Args:
        detail(bool): 若为 True,则返回 省、市、县区、乡镇街道、村社区 五级信息;
            若为 False,则返回 省、市、县区 三级信息

    """
    location_jio = read_file_by_line(os.path.join(
        GRAND_DIR_PATH, 'dictionary/china_location.txt'),
                                     strip=False)

    cur_province = None
    cur_city = None
    cur_county = None
    cur_town = None
    cur_village = None
    location_dict = dict()

    for item in location_jio:
        if not item.startswith('\t'):  # province
            if len(item.strip().split('\t')) != 3:
                continue
            province, admin_code, alias_name = item.strip().split('\t')
            cur_province = province
            location_dict.update({
                cur_province: {
                    '_full_name': province,
                    '_alias': alias_name,
                    '_admin_code': admin_code
                }
            })

        elif item.startswith('\t\t\t\t'):  # village / community
            if not detail:
                continue
            cur_village = item.strip()
            location_dict[cur_province][cur_city][cur_county][cur_town].update(
                {cur_village: None})

        elif item.startswith('\t\t\t'):  # town / street
            if not detail:
                continue
            cur_town = item.strip()
            location_dict[cur_province][cur_city][cur_county].update(
                {cur_town: dict()})

        elif item.startswith('\t\t'):  # county / district
            if len(item.strip().split('\t')) != 3:
                continue
            county, admin_code, alias_name = item.strip().split('\t')
            cur_county = county
            location_dict[cur_province][cur_city].update({
                cur_county: {
                    '_full_name': county,
                    '_alias': alias_name,
                    '_admin_code': admin_code
                }
            })

        else:  # city
            if len(item.strip().split('\t')) != 3:
                continue
            city, admin_code, alias_name = item.strip().split('\t')
            cur_city = city
            location_dict[cur_province].update({
                cur_city: {
                    '_full_name': city,
                    '_alias': alias_name,
                    '_admin_code': admin_code
                }
            })

    return location_dict
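
Keys beginning with an underscore hold a level's metadata; every other key is a child location. A brief traversal sketch:

location_dict = china_location_loader()
for province, cities in location_dict.items():
    print(province, cities['_admin_code'], cities['_alias'])
    for city, counties in cities.items():
        if city.startswith('_'):
            continue  # skip the metadata fields
        print(' ', city, counties['_admin_code'])
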
Example #30
def negative_words_loader():
    """ 加载否定词典 negative_words.txt """
    res = read_file_by_line(
        os.path.join(GRAND_DIR_PATH, 'dictionary/negative_words.txt'))

    return res