Пример #1
0
    def parse_text(self, text, d, db, cursor):
        """Parse one hearing notice and insert it into ``ktgg_kt_wuhan``.

        The whitespace-stripped ``text`` becomes ``d['body']``. Courtroom,
        hearing date and judge are pulled from the body with regexes; the
        defendant and the cause of action are taken from ``d['title']``.

        Args:
            text: Raw notice text.
            d: Record dict, updated in place. Must already contain the
                ``'title'`` and ``'url'`` keys.
            db: Passed through unchanged to ``ktgg.ins_mysql``.
            cursor: Passed through unchanged to ``ktgg.ins_mysql``.

        Returns:
            None. Nothing is inserted when ``ktgg.search_anyou`` reports a
            start offset of 0 for the defendant text.
        """
        d['body'] = re.sub(r'\s', '', text)

        # Courtroom: the text between "法院" and the first following "庭".
        court_rooms = re.findall('法院(.*?庭)', d['body'])
        if court_rooms:
            d['courtNum'] = court_rooms[0]

        # Hearing date.
        hearing_dates = re.findall(r'\d{1,4}[年月].*?[日号]', d['body'])
        if hearing_dates:
            d['sorttime'] = hearing_dates[0]

        # Judge.
        judges = re.findall('审判员(.*?)[书代]', d['body'])
        if judges:
            d['judge'] = judges[0].replace(':', '')

        # Defendant and cause of action come from the title.
        defendants = re.findall('被告(.*)', d['title'])
        if defendants:
            party = defendants[0]
            anyou = ktgg.set_anyou()
            start, end = ktgg.search_anyou(anyou, party)
            # presumably start == 0 means "no known cause found" -- TODO confirm
            if start == 0:
                return
            d['anyou'] = party[start:end]
            # The cause text is data, not a pattern: escape it so characters
            # such as parentheses cannot corrupt or break the regex.
            prefix_pattern = '(.*?)%s' % re.escape(d['anyou'])
            d['pname'] = re.findall(prefix_pattern, party)[0].replace('人', '')

        d['md5'] = ktgg.get_md5(d['body'], d['url'])
        ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)
Пример #2
0
 def parse_text(self, text, d, db, cursor):
     """Parse a table of hearings and insert one record per table row.

     Args:
         text: Object exposing ``.xpath`` (presumably a parsed lxml
             document -- confirm against the caller).
         d: Base record dict; each row works on its own ``d.copy()``.
             Must already contain the ``'url'`` key.
         db: Passed through unchanged to ``ktgg.ins_mysql``.
         cursor: Passed through unchanged to ``ktgg.ins_mysql``.

     Returns:
         None.

     Raises:
         IndexError: If a row lacks one of the expected ``td/span`` cells.
     """
     # The first <tr> is skipped (presumably the header row).
     rows = text.xpath('//span[@class="detail_content"]//tr')[1:]
     for row in rows:
         d_info = d.copy()
         # Body: the row's full text without line breaks.
         d_info['body'] = row.xpath('string(.)').replace('\r', '').replace(
             '\n', '')
         # Case number.
         d_info['caseNo'] = row.xpath('./td[2]/span/text()')[0]
         # Judge.
         d_info['judge'] = row.xpath('./td[5]/span/text()')[0]
         # Courtroom.
         d_info['courtNum'] = row.xpath('./td[6]/span/text()')[0]
         # Hearing date: keep only the part before the first space.
         d_info['sorttime'] = row.xpath('./td[7]/span/text()')[0].split(
             ' ')[0]

         # Plaintiff, defendant and cause of action: the first pattern in
         # self.party that matches wins.
         party = row.xpath('./td[3]/span/text()')[0]
         for pattern in self.party:
             found = re.findall(pattern, party)
             if not found:
                 continue
             party = found[0]
             anyou = ktgg.set_anyou()
             if isinstance(party, str):
                 # Single-group pattern: defendant + cause only.
                 start, end = ktgg.search_anyou(anyou, party)
                 if start == 0:
                     # NOTE(review): this aborts the remaining rows too,
                     # not just the current one -- confirm that is intended.
                     return
                 d_info['anyou'] = party[start:end]
                 # Escape the cause text so regex metacharacters in it
                 # are matched literally.
                 prefix_pattern = '(.*?)%s' % re.escape(d_info['anyou'])
                 d_info['pname'] = re.findall(
                     prefix_pattern,
                     party)[0].replace('被告人', '').replace('被告', '')
             elif isinstance(party, tuple):
                 # Two-group pattern: (plaintiff, defendant + cause).
                 start, end = ktgg.search_anyou(anyou, party[1])
                 if start == 0:
                     # NOTE(review): same early exit as above.
                     return
                 d_info['anyou'] = party[1][start:end]
                 d_info['plaintiff'] = party[0].replace(
                     '原告人', '').replace('原告', '')
                 prefix_pattern = '(.*?)%s' % re.escape(d_info['anyou'])
                 d_info['pname'] = re.findall(
                     prefix_pattern,
                     party[1])[0].replace('被告人', '').replace('被告', '')
             break

         d_info['md5'] = ktgg.get_md5(d_info['body'], d_info['url'])
         ktgg.ins_mysql(d_info, 'ktgg_kt_wuhan', db, cursor)
Пример #3
0
    def parse_text(self, text, d, db, cursor):
        """Parse a free-text hearing notice and insert it into ``ktgg_kt_wuhan``.

        Every substring listed in ``self.tihuan`` is removed from ``text``
        first; the cleaned text becomes ``d['body']`` and is the source for
        all extracted fields.

        Args:
            text: Raw notice text.
            d: Record dict, updated in place. Must already contain ``'url'``.
            db: Passed through unchanged to ``ktgg.ins_mysql``.
            cursor: Passed through unchanged to ``ktgg.ins_mysql``.

        Returns:
            None. Nothing is inserted when a two-group party pattern matched
            but ``ktgg.search_anyou`` reported a start offset of 0.
        """
        for junk in self.tihuan:
            text = text.replace(junk, '')
        d['body'] = text

        # Hearing date.
        hearing_dates = re.findall(r'\d{1,4}[年月].*?[日号]', text)
        if hearing_dates:
            d['sorttime'] = hearing_dates[0]

        # Courtroom: prefer "在...庭", fall back to "第...庭".
        court_rooms = (re.findall('在(.{2,7}庭)', text)
                       or re.findall('第.{1,4}庭', text))
        if court_rooms:
            d['courtNum'] = court_rooms[0]

        # Plaintiff, defendant and cause of action: the first pattern in
        # self.party that matches wins.
        for pattern in self.party:
            found = re.findall(pattern, d['body'])
            if not found:
                continue
            party = found[0]
            anyou = ktgg.set_anyou()
            if isinstance(party, str):
                start, end = ktgg.search_anyou(anyou, party)
                if start == 0:
                    # Unlike the tuple branch below, this only records the
                    # text via write_txt and then carries on.
                    ktgg.write_txt('anyou', text)
                d['anyou'] = party[start:end]
                # Escape the cause text so regex metacharacters in it are
                # matched literally.
                prefix_pattern = '(.*?)%s' % re.escape(d['anyou'])
                d['pname'] = re.findall(prefix_pattern,
                                        party)[0].replace('人', '')
            elif isinstance(party, tuple):
                start, end = ktgg.search_anyou(anyou, party[1])
                if start == 0:
                    return
                d['anyou'] = party[1][start:end]
                d['plaintiff'] = party[0].replace('人', '')
                prefix_pattern = '(.*?)%s' % re.escape(d['anyou'])
                d['pname'] = re.findall(prefix_pattern,
                                        party[1])[0].replace('人', '')
            break

        d['md5'] = ktgg.get_md5(d['body'], d['url'])
        ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)
Пример #4
0
 def parse_text(self, d, db, cursor):
     """Parse a notice whose whole content is its title, then insert it.

     ``d['title']`` is copied to ``d['body']`` and is the source for every
     extracted field.

     Args:
         d: Record dict, updated in place. Must already contain the
             ``'title'`` and ``'url'`` keys.
         db: Passed through unchanged to ``ktgg.ins_mysql``.
         cursor: Passed through unchanged to ``ktgg.ins_mysql``.

     Returns:
         None. Nothing is inserted (and no sleep happens) when
         ``ktgg.search_anyou`` reports a start offset of 0.
     """
     title = d['title']
     d['body'] = title

     # Hearing date.
     hearing_dates = re.findall(r'\d{1,4}[年月].*?[日号]', title)
     if hearing_dates:
         d['sorttime'] = hearing_dates[0]

     # Courtroom.
     court_rooms = re.findall('第.{1,4}庭', title)
     if court_rooms:
         d['courtNum'] = court_rooms[0]

     # Cause of action, defendant and plaintiff: the first pattern in
     # self.party that matches wins.
     for pattern in self.party:
         found = re.findall(pattern, title)
         if not found:
             continue
         party = found[0]
         anyou = ktgg.set_anyou()
         if isinstance(party, str):
             # Single-group pattern: defendant + cause only.
             start, end = ktgg.search_anyou(anyou, party)
             if start == 0:
                 return
             d['anyou'] = party[start:end]
             # Escape the cause text so regex metacharacters in it are
             # matched literally.
             prefix_pattern = '(.*?)%s' % re.escape(d['anyou'])
             d['pname'] = re.findall(prefix_pattern, party)[0].replace(
                 '被告人', '').replace('被告', '')
         elif isinstance(party, tuple):
             # Two-group pattern: (plaintiff, defendant + cause).
             start, end = ktgg.search_anyou(anyou, party[1])
             if start == 0:
                 return
             d['anyou'] = party[1][start:end]
             d['plaintiff'] = party[0].replace('原告人',
                                               '').replace('原告', '')
             prefix_pattern = '(.*?)%s' % re.escape(d['anyou'])
             d['pname'] = re.findall(prefix_pattern, party[1])[0].replace(
                 '被告人', '').replace('被告', '')
         break

     d['md5'] = ktgg.get_md5(d['body'], d['url'])
     ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)
     # Fixed half-second pause after each insert.
     time.sleep(0.5)
Пример #5
0
    def parse_text(self,d,db,cursor):
        """Parse a notice body and insert one or more hearing records.

        Two layouts are handled:

        * Title contains "排期开庭": the body is a numbered schedule. It is
          split into (date, detail) entries and one record is inserted per
          entry. Each entry works on its own copy of ``d`` so that
          ``plaintiff``/``pname`` from one entry cannot leak into the next
          when the next entry has no party match; ``d`` itself is left
          unmodified in this branch.
        * Otherwise: the body is a single hearing; ``d`` is filled in place
          and inserted once.

        Args:
            d: Record dict. Must already contain ``'title'``, ``'body'`` and
                ``'url'``.
            db: Passed through unchanged to ``ktgg.ins_mysql``.
            cursor: Passed through unchanged to ``ktgg.ins_mysql``.

        Returns:
            None.

        Raises:
            IndexError: In the single-hearing layout, when no date is found
                in the body.
        """
        def _store(record, segment):
            # Shared tail of both layouts: case number, courtroom, parties,
            # md5, insert. ``record['anyou']`` must already be set.
            case_no = re.findall(r'[\[【((].*?号', segment)
            record['caseNo'] = case_no[0] if case_no else ''

            court = re.findall('我院(.*?)开庭审理', segment)
            record['courtNum'] = court[0].replace('公开', '') if court else ''

            # Each self.pname_p entry is a %-template completed with the
            # cause of action; the first one that matches wins.
            for template in self.pname_p:
                sides = re.findall(template % record['anyou'], segment)
                if sides:
                    # NOTE(review): the two trailing replace(',') calls are
                    # identical as written; presumably one targeted the
                    # full-width comma -- TODO confirm against the original.
                    record['plaintiff'] = sides[0][0].replace('原告人','').replace('原告','').replace(',','').replace(',','')
                    record['pname'] = sides[0][1].replace('被告人','').replace('被告','')
                    break

            record['md5'] = ktgg.get_md5(record['body'], record['url'])
            ktgg.ins_mysql(record, 'ktgg_kt_wuhan', db, cursor)

        if '排期开庭' in d['title']:
            # Each hit is (date, detail); the detail ends at the next
            # "<number>、" list marker.
            entries = re.findall(r'(\d{1,4}[年].*?[日上下号])(.*?)\d{1,2}、{1,2}', d['body'])
            for when, detail in entries:
                record = d.copy()
                record['body'] = when + detail
                record['sorttime'] = when
                record['anyou'] = ktgg.set_anyou(detail)
                _store(record, detail)
        else:
            anyou = ktgg.set_anyou()
            d['anyou'] = ktgg.search_anyou(anyou, d['body'])
            d['sorttime'] = re.findall(r'\d{1,4}[年月].*?[日上下号]', d['body'])[0]
            _store(d, d['body'])
Пример #6
0
 def parse_text(self, text, d, db, cursor):
     """Parse a multi-line notice, inserting one record per non-empty line.

     Args:
         text: Notice text; each non-empty line is treated as one hearing.
         d: Base record dict; each line works on its own ``d.copy()``.
             Must already contain the ``'url'`` key.
         db: Passed through unchanged to ``ktgg.ins_mysql``.
         cursor: Passed through unchanged to ``ktgg.ins_mysql``.

     Returns:
         None.
     """
     for line in text.split('\n'):
         if not line:
             continue
         d_info = d.copy()
         # Body: the line without non-breaking spaces and carriage returns.
         # The regexes below still run on the raw line.
         d_info['body'] = line.replace('\xa0', '').replace('\r', '')

         # Hearing date.
         hearing_dates = re.findall(r'\d{1,4}[年月].*?[日号]', line)
         if hearing_dates:
             d_info['sorttime'] = hearing_dates[0]

         # Courtroom.
         court_rooms = re.findall('在(.*?庭)', line)
         if court_rooms:
             d_info['courtNum'] = court_rooms[0]

         # Cause of action, plaintiff and defendant. Patterns are tried in
         # order until one also yields a defendant name.
         for pattern in self.party:
             found = re.findall(pattern, line)
             if not found:
                 continue
             # Indexed as (plaintiff, defendant + cause); assumes every
             # pattern in self.party has two groups -- TODO confirm.
             sides = found[0]
             d_info['plaintiff'] = sides[0].replace('原告',
                                                    '').replace('人', '')
             anyou = ktgg.set_anyou()
             start, end = ktgg.search_anyou(anyou, sides[1])
             d_info['anyou'] = sides[1][start:end]
             # Escape the cause text so regex metacharacters in it are
             # matched literally.
             prefix_pattern = '(.*?)%s' % re.escape(d_info['anyou'])
             pname = re.findall(prefix_pattern, sides[1])
             if pname:
                 d_info['pname'] = pname[0].replace('被告',
                                                    '').replace('人', '')
                 break

         # The hash is computed once, right before the insert; body and url
         # do not change after the body assignment above.
         d_info['md5'] = ktgg.get_md5(d_info['body'], d_info['url'])
         ktgg.ins_mysql(d_info, 'ktgg_kt_wuhan', db, cursor)
Пример #7
0
    def parse_text(self, text, d, db, cursor):
        """Parse a whitespace-delimited hearing schedule and insert records.

        ``text`` is processed line by line; each line is split on whitespace
        into tokens. Two layouts are handled:

        * A line with at least six tokens is a complete record.
        * Shorter non-empty lines are buffered; once two have been collected
          their tokens are concatenated and parsed as one record.

        A record is inserted only when ``ktgg.search_anyou`` returned a
        non-zero start offset for it.

        Args:
            text: Schedule text.
            d: Base record dict; each line works on its own ``d.copy()``.
                Must already contain the ``'url'`` key.
            db: Passed through unchanged to ``ktgg.ins_mysql``.
            cursor: Passed through unchanged to ``ktgg.ins_mysql``.

        Returns:
            None.

        Raises:
            IndexError: In the first layout, when the case-number token is
                the last token on the line (no party token follows it).
        """
        pending = []  # short lines waiting to be paired (second layout)
        for line in re.split('\n', text):
            d_info = d.copy()
            info = [tok for tok in re.split(r'\s', line) if tok]
            # Stays 0 unless a cause of action is located for this line.
            start = 0

            # First layout: one complete record per line.
            if len(info) >= 6:
                d_info['sorttime'] = ''
                d_info['caseNo'] = ''
                d_info['body'] = ''.join(info)
                for idx, tok in enumerate(info):
                    # Case number: first token containing "号". The party
                    # text is the token right after it.
                    if '号' in tok and d_info['caseNo'] == '':
                        d_info['caseNo'] = tok
                        party = info[idx + 1]
                        anyou = ktgg.set_anyou()
                        start, end = ktgg.search_anyou(anyou, party)
                        if start == 0:
                            break
                        d_info['anyou'] = party[start:end]
                        # Escape the cause text so regex metacharacters in
                        # it are matched literally.
                        prefix_pattern = '(.*?)%s' % re.escape(d_info['anyou'])
                        if '诉' in party:
                            sides = re.split('诉', party)
                            d_info['plaintiff'] = sides[0]
                            # Store the name itself, as the other branch
                            # does, rather than the findall() list.
                            names = re.findall(prefix_pattern, sides[1])
                            d_info['pname'] = names[0] if names else ''
                        else:
                            d_info['pname'] = re.findall(
                                prefix_pattern, party)[0]
                    # Hearing date: first token holding YYYY-MM-DD. The
                    # courtroom is the token just before it.
                    if d_info['sorttime'] == '':
                        dates = re.findall(r'\d{4}-\d{2}-\d{2}', tok)
                        if dates:
                            d_info['sorttime'] = dates[0]
                            d_info['courtNum'] = info[idx - 1]
                if start == 0:
                    continue
                d_info['md5'] = ktgg.get_md5(d_info['body'], d_info['url'])
                ktgg.ins_mysql(d_info, 'ktgg_kt_wuhan', db, cursor)

            # Second layout: a record spread over two short lines.
            elif info:
                pending.append(info)
                if len(pending) < 2:
                    continue
                info = pending[0] + pending[1]
                pending = []
                d_info['body'] = ''.join(info)

                # Hearing date.
                dates = re.findall(r'\d{4}年.*?日', d_info['body'])
                if dates:
                    d_info['sorttime'] = dates[0]
                # Courtroom.
                court_rooms = re.findall('第.{2,6}庭|回龙法庭', d_info['body'])
                if court_rooms:
                    d_info['courtNum'] = court_rooms[0]
                # Case number.
                case_no = re.findall('[((民].*?号', d_info['body'])
                if case_no:
                    d_info['caseNo'] = case_no[0]

                for tok in info:
                    if '诉' not in tok:
                        continue
                    anyou = ktgg.set_anyou()
                    start, end = ktgg.search_anyou(anyou, tok)
                    if start == 0:
                        break
                    d_info['anyou'] = tok[start:end]
                    sides = re.split('诉', tok)
                    # When the case number is glued in front of the
                    # plaintiff, keep only the text after the last "号"
                    # (presumably the plaintiff name -- TODO confirm).
                    if '号' in sides[0]:
                        d_info['plaintiff'] = sides[0].split('号')[-1]
                    else:
                        d_info['plaintiff'] = sides[0]
                    prefix_pattern = '(.*?)%s' % re.escape(d_info['anyou'])
                    names = re.findall(prefix_pattern, sides[1])
                    d_info['pname'] = names[0] if names else ''

                if start == 0:
                    continue
                d_info['md5'] = ktgg.get_md5(d_info['body'], d_info['url'])
                ktgg.ins_mysql(d_info, 'ktgg_kt_wuhan', db, cursor)
Пример #8
0
    def parse_text(self,text,html,d,db,cursor):
        """Parse either a tabular or a free-text hearing notice and insert it.

        * Title contains "开庭公告": every ``//tbody/tr`` row of ``html``
          after the first becomes one record (built on a copy of ``d``).
        * Otherwise: ``text`` is a single notice; ``d`` is filled in place
          and inserted once.

        Args:
            text: Plain notice text; only used in the free-text layout.
            html: Object exposing ``.xpath`` (presumably a parsed lxml
                document -- confirm against the caller); only used in the
                tabular layout.
            d: Record dict. Must already contain ``'title'`` and ``'url'``.
                ``'posttime'`` is also read in the free-text layout when the
                title holds a month/day.
            db: Passed through unchanged to ``ktgg.ins_mysql``.
            cursor: Passed through unchanged to ``ktgg.ins_mysql``.

        Returns:
            None. Processing stops without inserting the current record when
            no ``self.party`` pattern matches or when ``ktgg.search_anyou``
            reports a start offset of 0.
        """
        if '开庭公告' in d['title']:
            # The first <tr> is skipped (presumably the header row).
            rows = html.xpath('//tbody/tr')[1:]
            for row in rows:
                d_info = d.copy()
                d_info['body'] = row.xpath('string(.)').replace('\r','').replace('\n','')
                # With exactly four cells the columns are read one position
                # further right than in any other row shape.
                if len(row.xpath('./td')) == 4:
                    date_col, court_col, party_col = 4, 3, 2
                else:
                    date_col, court_col, party_col = 3, 2, 1
                # Hearing date: keep only the part before the first space.
                d_info['sorttime'] = row.xpath('./td[%d]' % date_col)[0].xpath('string(.)').split(' ')[0]
                # Courtroom.
                d_info['courtNum'] = row.xpath('./td[%d]' % court_col)[0].xpath('string(.)')
                party = row.xpath('./td[%d]' % party_col)[0].xpath('string(.)')

                # Defendant, plaintiff and cause of action: the first
                # pattern in self.party that matches wins.
                for pattern in self.party:
                    found = re.findall(pattern, party)
                    if not found:
                        continue
                    party = found[0]
                    anyou = ktgg.set_anyou()
                    if isinstance(party, str):
                        start, end = ktgg.search_anyou(anyou, party)
                        if start == 0:
                            # NOTE(review): this aborts the remaining rows
                            # too -- confirm that is intended.
                            return
                        d_info['anyou'] = party[start:end]
                        # Escape the cause text so regex metacharacters in
                        # it are matched literally.
                        prefix_pattern = '(.*?)%s' % re.escape(d_info['anyou'])
                        d_info['pname'] = re.findall(prefix_pattern, party)[0].replace('被告人','').replace('被告','')
                    elif isinstance(party, tuple):
                        start, end = ktgg.search_anyou(anyou, party[1])
                        if start == 0:
                            return
                        d_info['anyou'] = party[1][start:end]
                        d_info['plaintiff'] = party[0].replace('原告人','').replace('原告','')
                        prefix_pattern = '(.*?)%s' % re.escape(d_info['anyou'])
                        d_info['pname'] = re.findall(prefix_pattern, party[1])[0].replace('被告人','').replace('被告','')
                    break
                else:
                    # No pattern matched this row.
                    return
                d_info['md5'] = ktgg.get_md5(d_info['body'], d_info['url'])
                ktgg.ins_mysql(d_info, 'ktgg_kt_wuhan', db, cursor)

        else:
            d['body'] = text
            # Hearing date: month/day from the title, year taken from the
            # part of d['posttime'] before the first '-'.
            month_day = re.findall(r'\d{1,2}月.*?日', d['title'])
            if month_day:
                d['sorttime'] = d['posttime'].split('-')[0] + '年' + month_day[0]
            # Courtroom.
            court_rooms = re.findall('在(.{2,5}庭)', d['body'])
            if court_rooms:
                d['courtNum'] = court_rooms[0]

            for pattern in self.party:
                found = re.findall(pattern, d['body'])
                if not found:
                    continue
                party = found[0]
                anyou = ktgg.set_anyou()
                if isinstance(party, str):
                    start, end = ktgg.search_anyou(anyou, party)
                    if start == 0:
                        return
                    d['anyou'] = party[start:end]
                    prefix_pattern = '(.*?)%s' % re.escape(d['anyou'])
                    d['pname'] = re.findall(prefix_pattern, party)[0].replace('被告人','').replace('被告','').replace('审','').replace('理','')
                elif isinstance(party, tuple):
                    start, end = ktgg.search_anyou(anyou, party[1])
                    if start == 0:
                        return
                    d['anyou'] = party[1][start:end]
                    d['plaintiff'] = party[0].replace('原告人','').replace('原告','').replace('审','').replace('理','')
                    prefix_pattern = '(.*?)%s' % re.escape(d['anyou'])
                    d['pname'] = re.findall(prefix_pattern, party[1])[0].replace('被告人','').replace('被告','')
                break
            else:
                # No pattern matched the body.
                return
            d['md5'] = ktgg.get_md5(d['body'], d['url'])
            ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)