Python TextProcessor 예제들, paper_parser.functions.TextProcessor Python 예제들

예제 #1

0

파일 보기

    def amount_unsure(self):
        """Return a preliminary total amount taken from the case overview.

        Only first-instance documents (``trial_level == 1``) are examined.
        The cleaned basic-information text is scanned for monetary amounts
        and the largest one is reported.

        Returns:
            The largest extracted amount (original note: a float expressed
            in units of 10,000 yuan), or ``None`` when the document is not
            first-instance or no amount was extracted.
        """
        if self.trial_level != 1:
            return None
        cleaned = functions.TextProcessor(self.first_basic_text).clean_text
        found = functions.TextProcessor(cleaned).extract_moneys()
        return max(found) if found else None

예제 #2

0

파일 보기

    def num_of_facts(self):
        """Estimate the number of criminal facts from the dates in the text.

        For first-instance documents, every match of
        ``settings.pattern_num_of_facts`` in the cleaned fact text is turned
        into a six-digit ``YYYYMM`` integer (``YYYY00`` when no month was
        captured). Season words in the month group are replaced by a
        representative month number before conversion.

        Returns:
            ``None`` when the document is not first-instance or no date is
            found; ``1`` for exactly one distinct date; otherwise the number
            of distinct dates minus one.
        """
        if self.trial_level != 1:
            return None
        # Built once instead of on every match: season word -> month digits.
        season_to_month = str.maketrans({
            '春': '3',
            '夏': '6',
            '秋': '9',
            '冬': '12',
        })
        text = functions.TextProcessor(self.first_fact_text).clean_text
        distinct_dates = set()  # a set de-duplicates; only the count is used
        for match in settings.pattern_num_of_facts.finditer(text):
            year = int(match.group(1))
            # group(2) is None when the month group did not take part in the
            # match; treat that exactly like an empty month instead of
            # failing on None.translate().
            month_str = (match.group(2) or '').translate(season_to_month)
            month = int(month_str) if month_str else 0
            distinct_dates.add(year * 100 + month)
        if not distinct_dates:
            return None
        if len(distinct_dates) == 1:
            return 1
        return len(distinct_dates) - 1

예제 #3

0

파일 보기

 def is_simple_procedure(self):
     """Tell whether the case was tried under summary procedure.

     Returns:
         ``1`` when the cleaned basic-information text contains '简易程序'
         and does not contain '转为普通程序'; ``0`` otherwise (the default).
     """
     basic_text = functions.TextProcessor(self.first_basic_text).clean_text
     if '简易程序' not in basic_text:
         return 0
     if '转为普通程序' in basic_text:
         return 0
     return 1

예제 #4

0

파일 보기

    def job_info(self):
        """Collect job information about the offender or the crime's target.

        Extends ``self.defendant_info['job']`` for corruption and bribery
        cases. Only first-instance documents are processed.

        Returns:
            A dict with the keys ``'job'`` (job title), ``'job_type'``
            (kind of employer, one of ``settings.JOB_TYPES``) and
            ``'job_grade'``. Entries that could not be determined stay
            ``None``; ``'job_grade'`` is never filled in by this method.
        """
        job_info = {'job': None, 'job_type': None, 'job_grade': None}
        if self.trial_level != 1:
            return job_info

        # Step 1: find the job title.
        known_job = self.defendant_info['job']
        if known_job is not None:
            # Reuse the title already extracted into defendant_info.
            job_info['job'] = known_job
        else:
            text = functions.TextProcessor(self.first_fact_text).clean_text
            # Keep only the part before '证据'. partition() leaves the text
            # intact when the marker is missing, whereas slicing with
            # find()'s -1 would silently drop the last character.
            text = text.partition('证据')[0]
            job_match = settings.pattern_job_info['job'].search(text)
            if job_match:
                job_info['job'] = job_match.group(1)

        job = job_info['job']
        if job is None:
            return job_info

        # Step 2: determine the kind of employer; the first type with a
        # matching keyword wins.
        job_tail = job[-6:]
        for job_type in settings.JOB_TYPES:
            # Original note: administrative organs, public institutions and
            # people's organisations ('X', 'S') are special-cased, only the
            # last 6 characters are checked. Other types search the whole
            # job title.
            searched = job_tail if job_type in ('X', 'S') else job
            keywords = settings.JOB_TYPE_DICT[job_type]
            if any(keyword in searched for keyword in keywords):
                job_info['job_type'] = job_type
                break

        return job_info

예제 #5

0

파일 보기

    def is_plus_investigated(self):
        """Tell whether a supplementary investigation took place.

        Returns:
            ``1`` when the document is first-instance and its cleaned
            basic-information text contains '补充侦查'; ``0`` otherwise
            (the default).
        """
        if self.trial_level != 1:
            return 0
        basic_text = functions.TextProcessor(self.first_basic_text).clean_text
        return 1 if '补充侦查' in basic_text else 0  # int

예제 #6

0

파일 보기

 def amount_sure(self):
     """Return a preliminary total amount as established by the court.

     For first-instance documents the court's opinion is searched first
     (only the part before '辩护'); if it yields no amount, the established
     facts are searched (only the part before '证据').

     Returns:
         The largest extracted amount (original note: a float expressed in
         units of 10,000 yuan), or ``None`` when the document is not
         first-instance or no amount was extracted.
     """
     if self.trial_level != 1:
         return None
     # partition() keeps the whole text when the marker is absent; slicing
     # with find()'s -1 would drop the final character, which can cut off a
     # trailing currency unit.
     opinion = functions.TextProcessor(self.first_opinion_text).clean_text
     opinion = opinion.partition('辩护')[0]  # up to '辩护'
     moneys = functions.TextProcessor(opinion).extract_moneys()
     if not moneys:
         # Nothing in the court's opinion: fall back to the established facts.
         facts = functions.TextProcessor(self.first_fact_text).clean_text
         facts = facts.partition('证据')[0]  # the facts part only
         moneys = functions.TextProcessor(facts).extract_moneys()
     return max(moneys) if moneys else None

예제 #7

0

파일 보기

    def is_delayed(self):
        """Tell whether the trial was postponed.

        Returns:
            ``1`` when the document is first-instance and
            ``settings.pattern_is_delayed`` matches the cleaned
            basic-information text; ``0`` otherwise (the default).
        """
        if self.trial_level != 1:
            return 0
        basic_text = functions.TextProcessor(self.first_basic_text).clean_text
        if settings.pattern_is_delayed.search(basic_text):
            return 1
        return 0

예제 #8

0

파일 보기

    def prosecute_number(self):
        """Return the indictment number.

        Returns:
            Group 1 of the first ``settings.pattern_prosecute_number`` match
            in the cleaned basic-information text (a str), or ``None`` when
            the document is not first-instance or nothing matches.
        """
        if self.trial_level != 1:
            return None
        basic_text = functions.TextProcessor(self.first_basic_text).clean_text
        found = settings.pattern_prosecute_number.search(basic_text)
        return found.group(1) if found else None  # str

예제 #9

0

파일 보기

    def is_bad_effect(self):
        """Tell whether the offence caused a bad social impact or losses.

        Covers a bad social impact or losses to the interests of the state
        and the people. Only the court's opinion before '辩护' is examined.

        Returns:
            ``1`` when the document is first-instance and
            ``settings.pattern_bad_effect`` matches; ``0`` otherwise
            (the default).
        """
        if self.trial_level != 1:
            return 0
        text = functions.TextProcessor(self.first_opinion_text).clean_text
        # partition() keeps the full text when '辩护' is absent; slicing with
        # find()'s -1 would drop the last character.
        text = text.partition('辩护')[0]
        return 1 if settings.pattern_bad_effect.search(text) else 0

예제 #10

0

파일 보기

    def is_zishou(self):
        """Tell whether voluntary surrender ('自首') is mentioned.

        The sentences carrying defence opinions are removed from the cleaned
        court opinion before searching.

        Returns:
            ``1`` when the document is first-instance and the remaining text
            contains '自首'; ``0`` otherwise (the default).
        """
        if self.trial_level != 1:
            return 0
        opinion = functions.TextProcessor(self.first_opinion_text).clean_text
        # Strip the defence's statements so only the court's own words remain.
        for defence_sentence in self.defensive_opinion_sentences:
            opinion = opinion.replace(defence_sentence, '')
        return int('自首' in opinion)

예제 #11

0

파일 보기

 def defensive_opinion_sentences(self):
     """Return the sentences of the court's opinion that mention the defence.

     Sentences are collected only for first-instance documents whose first
     opinion sentence contains '本院认为'.

     Returns:
         A tuple of the opinion sentences containing '辩护'; an empty tuple
         when the preconditions above are not met or nothing matches.
     """
     if self.trial_level != 1:
         return ()
     sentences = functions.TextProcessor(self.first_opinion_text).sentences
     if not sentences or '本院认为' not in sentences[0]:
         return ()
     return tuple(s for s in sentences if '辩护' in s)  # tuple(str, )

예제 #12

0

파일 보기

    def money_usage(self):
        """Return what the illicit money was used for.

        Applies only to first-instance documents whose cause is '贪污罪',
        '受贿罪' or '挪用公款罪'.

        Returns:
            Group 1 of the first ``settings.pattern_money_usage`` match in
            the cleaned court opinion, or ``None`` when the document is out
            of scope or nothing matches.
        """
        if self.trial_level != 1:
            return None
        if self.cause not in ('贪污罪', '受贿罪', '挪用公款罪'):
            return None
        opinion = functions.TextProcessor(self.first_opinion_text).clean_text
        found = settings.pattern_money_usage.search(opinion)
        return found.group(1) if found else None

예제 #13

0

파일 보기

    def is_suohui(self):
        """Tell whether bribe extortion ('索贿') is mentioned.

        Applies only to first-instance documents whose cause is '受贿罪';
        only the court's opinion before '辩护' is examined.

        Returns:
            ``None`` when the document is out of scope; otherwise ``1`` when
            the examined text contains '索贿', else ``0``.
        """
        if self.trial_level != 1 or self.cause != '受贿罪':
            return None
        text = functions.TextProcessor(self.first_opinion_text).clean_text
        # partition() keeps the full text when '辩护' is absent; slicing with
        # find()'s -1 would drop the last character.
        text = text.partition('辩护')[0]
        return 1 if '索贿' in text else 0

예제 #14

0

파일 보기

    def is_tuizang(self):
        """Tell whether illicit gains were returned.

        Applies only to first-instance documents whose cause is '贪污罪' or
        '受贿罪'; only the court's opinion before '辩护' is examined.

        Returns:
            ``None`` when the document is out of scope; otherwise ``1`` when
            the examined text contains the character '退', else ``0``.
        """
        if self.trial_level != 1 or self.cause not in ('贪污罪', '受贿罪'):
            return None
        text = functions.TextProcessor(self.first_opinion_text).clean_text
        # partition() keeps the full text when '辩护' is absent; slicing with
        # find()'s -1 would drop the last character.
        text = text.partition('辩护')[0]
        # A single '退' covers the variants listed in the original note:
        # 退回 退赃 退缴 退清 退出 退交 退还 退赔 退完
        return 1 if '退' in text else 0

예제 #15

0

파일 보기

    def is_seek_promote(self):
        """Tell whether the bribe sought a change of someone's post.

        Applies only to first-instance documents whose cause is '受贿罪';
        only the court's opinion before '辩护' is examined.

        Returns:
            ``None`` when the document is out of scope; otherwise ``1`` when
            the examined text contains '提拔', else ``0``.
        """
        if self.trial_level != 1 or self.cause != '受贿罪':
            return None
        text = functions.TextProcessor(self.first_opinion_text).clean_text
        # partition() keeps the full text when '辩护' is absent; slicing with
        # find()'s -1 would drop the last character.
        text = text.partition('辩护')[0]
        return 1 if '提拔' in text else 0

예제 #16

0

파일 보기

    def prosecutors(self):
        """Return the list of public prosecutors' names.

        Group 1 of the first ``settings.pattern_prosecutors`` match in the
        cleaned basic-information text is split on '、', and whatever
        ``settings.pattern_prosecutors_delete_strings`` matches is removed
        from each piece.

        Returns:
            A list of str; empty when the document is not first-instance or
            nothing matches.
        """
        if self.trial_level != 1:
            return []
        basic_text = functions.TextProcessor(self.first_basic_text).clean_text
        found = settings.pattern_prosecutors.search(basic_text)
        if not found:
            return []
        raw_names = found.group(1).split('、')
        return [
            settings.pattern_prosecutors_delete_strings.sub('', raw_name)
            for raw_name in raw_names
        ]  # list[str, ]

예제 #17

0

파일 보기

    def is_punished_by_party_admin(self):
        """Tell whether a prior Party or administrative sanction is mentioned.

        Per the original note this concerns earlier Party-discipline or
        administrative sanctions for embezzlement or bribery. Applies only to
        first-instance documents whose cause is '贪污罪' or '受贿罪'; only
        the court's opinion before '辩护' is examined.

        Returns:
            ``None`` when the document is out of scope; otherwise ``1`` when
            ``settings.pattern_punished_by_party_admin`` matches, else ``0``.
        """
        if self.trial_level != 1 or self.cause not in ('贪污罪', '受贿罪'):
            return None
        text = functions.TextProcessor(self.first_opinion_text).clean_text
        # partition() keeps the full text when '辩护' is absent; slicing with
        # find()'s -1 would drop the last character.
        text = text.partition('辩护')[0]
        if settings.pattern_punished_by_party_admin.search(text):
            return 1
        return 0

예제 #18

0

파일 보기

    def is_special_money(self):
        """Tell whether specially designated funds were embezzled.

        Applies only to first-instance documents whose cause is '贪污罪' or
        '挪用公款罪'; only the court's opinion before '辩护' is examined.

        Returns:
            ``None`` when the document is out of scope; otherwise ``1`` when
            ``settings.pattern_special_money`` matches, else ``0``.
        """
        if self.trial_level != 1 or self.cause not in ('贪污罪', '挪用公款罪'):
            return None
        text = functions.TextProcessor(self.first_opinion_text).clean_text
        # partition() keeps the full text when '辩护' is absent; slicing with
        # find()'s -1 would drop the last character.
        text = text.partition('辩护')[0]
        return 1 if settings.pattern_special_money.search(text) else 0

예제 #19

0

파일 보기

    def is_tanbai(self):
        """Tell whether a confession ('坦白') is mentioned.

        Original note on the wording covered: 坦白 (confession), 认罪
        (admission of guilt), 如实供述 (truthful statement), 交代 (account).
        A voluntary surrender always counts as a confession. Otherwise, for
        first-instance documents, the defence's sentences are removed from
        the cleaned court opinion and ``settings.pattern_tanbai`` is searched
        in what remains.

        Returns:
            ``1`` or ``0`` (the default).
        """
        if self.is_zishou:  # voluntary surrender implies confession
            return 1
        if self.trial_level != 1:
            return 0
        opinion = functions.TextProcessor(self.first_opinion_text).clean_text
        # Strip the defence's statements so only the court's own words remain.
        for defence_sentence in self.defensive_opinion_sentences:
            opinion = opinion.replace(defence_sentence, '')
        return 1 if settings.pattern_tanbai.search(opinion) else 0

예제 #20

0

파일 보기

    def is_punished_by_criminal_law(self):
        """Tell whether an earlier criminal prosecution is mentioned.

        Per the original note this concerns earlier criminal liability for an
        intentional crime. Applies only to first-instance documents whose
        cause is '贪污罪' or '受贿罪'. A recidivist (``self.is_leifan == 1``)
        is always reported as ``1``; otherwise only the court's opinion
        before '辩护' is examined.

        Returns:
            ``None`` when the document is out of scope; otherwise ``1`` for a
            recidivist or when
            ``settings.pattern_punished_by_criminal_law`` matches, else ``0``.
        """
        if self.trial_level != 1 or self.cause not in ('贪污罪', '受贿罪'):
            return None
        if self.is_leifan == 1:  # a recidivist automatically gets 1
            return 1
        text = functions.TextProcessor(self.first_opinion_text).clean_text
        # partition() keeps the full text when '辩护' is absent; slicing with
        # find()'s -1 would drop the last character.
        text = text.partition('辩护')[0]
        if settings.pattern_punished_by_criminal_law.search(text):
            return 1
        return 0

예제 #21

0

파일 보기

 def to_html(self, html_path):
     """Write the document's content to an HTML file.

     The output is produced paragraph by paragraph, each preceded by its
     paragraph markers. The file is opened for writing with UTF-8 encoding,
     and a completion message is printed afterwards.

     Args:
         html_path: Path of the file to write (original note: must be an
             absolute path).

     Returns:
         Always ``0``.
     """
     with open(html_path, 'w', encoding='utf-8') as out:
         page_head = settings.html_template_head.replace(
             '{title}', str(self.paper_id))
         out.write(page_head)
         # Format the header fields for output.
         dumper = functions.ItemDumper(
             self.jid, self.cause, self.title, self.case_number, self.court)
         jid, cause, title, case_number, court = dumper.format()
         header_template = (
             """<p>{0} {1}</p>\n<p>{2}</p>\n<p>{3}</p>\n<p>{4}</p>\n""")
         out.write(
             header_template.format(jid, cause, title, case_number, court))
         for para in self.all_paragraphs:
             first_marker, second_marker = para[0], para[1]
             body = functions.TextProcessor(para[3]).clean_text
             out.write("<p>{0}.{1}</p>\n<p>{2}</p>\n".format(
                 first_marker, second_marker, body))
         out.write(settings.html_template_tail)
     print('to html finished at paper_id: {}'.format(self.paper_id))
     return 0

예제 #22

0

파일 보기

 def gongfan(self):
     """Return the accomplice status of the defendant.

     Original note: currently only suitable for judgments concerning a
     single person. For first-instance documents the defence's sentences
     are removed from the cleaned court opinion before the patterns in
     ``settings.pattern_gongfan`` are applied.

     Returns:
         ``0`` when principal and accessory are not distinguished, ``1``
         for a principal offender ('主'), ``2`` for an accessory ('从'),
         and ``None`` (the default) when nothing can be determined.
     """
     if self.trial_level != 1:
         return None
     opinion = functions.TextProcessor(self.first_opinion_text).clean_text
     # Strip the defence's statements so only the court's own words remain.
     for defence_sentence in self.defensive_opinion_sentences:
         opinion = opinion.replace(defence_sentence, '')
     if settings.pattern_gongfan['no_zhucong'].search(opinion):
         return 0
     role_match = settings.pattern_gongfan['zhucong'].search(opinion)
     if not role_match:
         return None
     role_codes = {'主': 1, '从': 2}
     return role_codes.get(role_match.group(1))

예제 #23

0

파일 보기

    def penalty(self):
        """Extract the sentencing result from the judgment text.

        Only first-instance documents (``trial_level == 1``) are processed;
        the text examined is the part of the cleaned judgment text before
        the first run of four spaces.

        Returns:
            ``None`` when the document is not first-instance. Otherwise a
            dict with the keys below. All values are ``None`` when no offence
            count could be extracted; once a count is found the other values
            default to ``0`` / ``0.0``.

            * ``'many'``: number of matches of the 'many' pattern.
            * ``'freedom'``: ``period2num`` of the term, negated for 'juyi'
              and positive for 'youqitx'; the matched text itself for
              'wuqitx' / 'sixing'.
            * ``'property'``: ``'全部'`` when the text contains '全部'; a
              ``'±'``-prefixed two-decimal string when both a fine and a
              confiscation amount were found; the fine amount alone
              (positive) or the confiscation amount alone (negated).
            * ``'right'``: ``'终身'`` when the text contains '政治权利终身',
              else ``period2num`` of the matched term.
            * ``'delay'``: ``period2num`` of the matched term.

            When the 'free' pattern matches, everything except ``'many'`` is
            reset to ``0`` / ``0.0``.
            NOTE(review): the unit returned by ``period2num`` is not visible
            here; confirm against ``functions.TextProcessor``.
        """
        penalty = None
        if self.trial_level == 1:
            text = functions.TextProcessor(
                self.first_judge_text).clean_text.split('    ')[0]
            penalty = {
                'many': None,
                'freedom': None,
                'property': None,
                'right': None,
                'delay': None
            }
            # many: determine the number of offences
            many_strings = settings.pattern_penalty['many'].findall(text)
            if many_strings:  # without an offence count the sentence is considered defective and extraction stops; with one, the other entries change from None to 0
                penalty = {
                    'many': len(many_strings),
                    'freedom': 0,
                    'property': 0.0,
                    'right': 0,
                    'delay': 0
                }
                text = settings.pattern_penalty['split'].split(text)[
                    -1]  # locate the clause stating the sentence finally to be executed
                # freedom: principal punishment; the first matching pattern wins
                for k, v in settings.pattern_penalty['freedom'].items():
                    freedom_match = v.search(text)
                    if freedom_match:
                        if k == 'juyi':  # criminal detention is stored as a negative number
                            penalty[
                                'freedom'] = -functions.TextProcessor.period2num(
                                    freedom_match.group(1))
                        elif k == 'youqitx':  # fixed-term imprisonment is stored as a positive number
                            penalty[
                                'freedom'] = functions.TextProcessor.period2num(
                                    freedom_match.group(1))
                        elif k in ('wuqitx', 'sixing'):  # life imprisonment / death penalty: store the matched text as-is
                            penalty['freedom'] = freedom_match.group(0)
                        break
                # property: property punishment
                if '全部' in text:  # look for confiscation of all personal property first; if present, store the string directly
                    penalty['property'] = '全部'
                else:
                    fajin_match = settings.pattern_penalty['property'][
                        'fajin'].search(text)
                    moshou_match = settings.pattern_penalty['property'][
                        'moshou'].search(text)
                    fajin_money = functions.TextProcessor(fajin_match.group(
                        1)).extract_moneys() if fajin_match else None
                    moshou_money = functions.TextProcessor(
                        moshou_match.group(
                            1)).extract_moneys() if moshou_match else None
                    if fajin_money and moshou_money:  # both a fine and a confiscation: add the amounts and prefix the result with '±'
                        penalty['property'] = '±{0:.2f}'.format(
                            fajin_money[0] + moshou_money[0])
                    elif fajin_money:  # fine only: stored as a positive value
                        penalty['property'] = fajin_money[0]
                    elif moshou_money:  # confiscation only: stored as a negative value
                        penalty['property'] = -moshou_money[0]
                # right: qualification punishment (deprivation of political rights)
                if '政治权利终身' in text:  # look for lifelong deprivation of political rights first; if present, store the string directly
                    penalty['right'] = '终身'
                else:  # otherwise look for the concrete length of the deprivation
                    right_match = settings.pattern_penalty['right'].search(
                        text)
                    if right_match:
                        penalty['right'] = functions.TextProcessor.period2num(
                            right_match.group(1))
                # delay: suspended sentence
                delay_match = settings.pattern_penalty['delay'].search(text)
                if delay_match:
                    penalty['delay'] = functions.TextProcessor.period2num(
                        delay_match.group(1))
                # free: check for exemption from punishment / not guilty
                free_match = settings.pattern_penalty['free'].search(text)
                if free_match:  # reset everything but the offence count to 0
                    penalty = {
                        'many': penalty['many'],
                        'freedom': 0,
                        'property': 0.0,
                        'right': 0,
                        'delay': 0
                    }

        return penalty

예제 #24

0

파일 보기

 def is_designated(self):
     """Tell whether jurisdiction was designated.

     Returns:
         ``1`` when the cleaned basic-information text contains '管辖';
         ``0`` otherwise (the default).
     """
     basic_text = functions.TextProcessor(self.first_basic_text).clean_text
     return 1 if '管辖' in basic_text else 0

예제 #25

0

파일 보기

    def defendant_info(self):
        """Build a dictionary of information about the defendant.

        The cleaned litigant-information text is split on runs of four
        spaces; the second piece, cut at its first '。', is treated as the
        sentence describing the defendant. Extraction only happens for
        first-instance documents with non-empty text.

        Returns:
            A dict with these keys (defaults in parentheses):

            * ``'name'`` (None): names from ``self.litigants`` joined with
              '+', else the pattern match if shorter than 10 characters.
            * ``'is_name_covered'`` (None): 1 when the name contains '某' or
              ``check_exist(r'[^\\u4e00-\\u9fff]')`` is truthy, else 0.
            * ``'sex'`` (None): 1 for '，男', 0 for '，女'.
            * ``'birth'`` (None): first date extracted from the birth match.
            * ``'age'`` (None): judgment year minus birth year, or the age
              written in the text.
            * ``'tribe'`` ('汉族') and ``'is_minor'`` (0): ``'is_minor'``
              becomes 1 when the matched tribe differs from '汉族'.
            * ``'educated'`` (None): value from ``settings.EDUCATED_DICT``.
            * ``'job'`` (None): group 1 of the job pattern.

            The defaults are returned unchanged when the sentence cannot be
            located or no name is found.
        """
        text = functions.TextProcessor(self.litigant_info_text).clean_text
        defendant_info = {
            'name': None,
            'is_name_covered': None,
            'sex': None,
            'birth': None,
            'age': None,
            'tribe': '汉族',
            'is_minor': 0,
            'educated': None,
            'job': None
        }
        if not text or self.trial_level != 1:
            return defendant_info

        # Narrow the text down to the sentence carrying the defendant's data.
        text_split = text.split('    ')
        if len(text_split) < 2:
            return defendant_info  # the defendant sentence cannot be located
        # Keep everything up to the first '。'. partition() keeps the whole
        # piece when there is no '。'; slicing with find()'s -1 would drop
        # its last character before the '。' is appended.
        defendant_text = text_split[1].partition('。')[0] + '。'

        # name, is_name_covered
        if self.litigants:  # prefer the names already stored in litigants
            defendant_info['name'] = '+'.join(self.litigants)
        else:  # otherwise search the sentence itself
            name_match = settings.pattern_defendant['name'].search(
                defendant_text)
            if name_match:
                name = name_match.group(1)
                defendant_info['name'] = name if len(name) < 10 else None
        if not defendant_info['name']:
            # Without a name the sentence is considered defective: stop here
            # and return the defaults.
            return defendant_info
        if '某' in defendant_info['name'] or functions.TextProcessor(
                defendant_info['name']).check_exist(r'[^\u4e00-\u9fff]'):
            defendant_info['is_name_covered'] = 1
        else:
            defendant_info['is_name_covered'] = 0

        # sex
        if '，男' in defendant_text:
            defendant_info['sex'] = 1
        elif '，女' in defendant_text:
            defendant_info['sex'] = 0

        # birth, age
        birth_match = settings.pattern_defendant['birth'].search(
            defendant_text)
        if birth_match:
            dates = functions.TextProcessor(
                birth_match.group(0)).extract_dates()
            if dates:
                defendant_info['birth'] = dates[0]
                if self.judge_date:
                    defendant_info['age'] = (
                        self.judge_date.year - defendant_info['birth'].year)
                else:
                    defendant_info['age'] = None
        if not defendant_info['age']:  # some judgments state the age directly
            age_match = settings.pattern_defendant['age'].search(
                defendant_text)
            if age_match:
                defendant_info['age'] = int(age_match.group(1))

        # tribe, is_minor
        tribe_match = settings.pattern_defendant['tribe'].search(
            defendant_text)
        if tribe_match:
            defendant_info['tribe'] = tribe_match.group(1)
            if defendant_info['tribe'] != '汉族':
                defendant_info['is_minor'] = 1

        # educated. Original note on the codes: 1 primary school, 2 junior
        # middle school, 3 senior high / technical secondary school, 4 junior
        # college, 5 university / bachelor, 6 postgraduate.
        educated_name = None
        for pattern_educated in settings.pattern_defendant['educated']:
            educated_match = pattern_educated.search(defendant_text)
            if educated_match:
                educated_name = educated_match.group(1)
                break
        if educated_name:
            for e_key, e_level in settings.EDUCATED_DICT.items():
                if e_key in educated_name:
                    defendant_info['educated'] = e_level
                    break

        # job: searched in the defendant sentence
        job_match = settings.pattern_defendant['job'].search(defendant_text)
        if job_match:
            defendant_info['job'] = job_match.group(1)

        return defendant_info  # dict