def parse_text(self, text, d, db, cursor):
    """Parse a single hearing notice and insert it into ``ktgg_kt_wuhan``.

    ``d`` is filled in place: ``body`` is ``text`` with all whitespace
    removed; ``courtNum``, ``sorttime`` and ``judge`` are set when their
    pattern matches the body; ``anyou`` and ``pname`` are taken from
    ``d['title']`` when it contains "被告". Finally ``md5`` is computed and
    the record is handed to ``ktgg.ins_mysql``.

    Nothing is inserted when the title matched but ``ktgg.search_anyou``
    returned a start index of 0.

    Args:
        text: Raw notice text.
        d: Record dict; must already contain ``title`` and ``url``.
        db: Passed through to ``ktgg.ins_mysql``.
        cursor: Passed through to ``ktgg.ins_mysql``.
    """
    d['body'] = re.sub(r'\s', '', text)

    # Courtroom: the text between "法院" and the first following "庭".
    court_rooms = re.findall(r'法院(.*?庭)', d['body'])
    if court_rooms:
        d['courtNum'] = court_rooms[0]

    # Hearing date.
    dates = re.findall(r'\d{1,4}[年月].*?[日号]', d['body'])
    if dates:
        d['sorttime'] = dates[0]

    # Judge.
    judges = re.findall(r'审判员(.*?)[书代]', d['body'])
    if judges:
        d['judge'] = judges[0].replace(':', '')

    # Defendant and cause of action are taken from the title.
    parties = re.findall(r'被告(.*)', d['title'])
    if parties:
        party = parties[0]
        anyou = ktgg.set_anyou()
        start, end = ktgg.search_anyou(anyou, party)
        if start == 0:
            return
        d['anyou'] = party[start:end]
        # The cause is literal text cut from the title; escape it so that
        # characters such as "(" or "." are not interpreted as regex syntax.
        before_cause = re.findall(
            '(.*?)%s' % re.escape(d['anyou']), party)[0]
        d['pname'] = before_cause.replace('人', '')

    d['md5'] = ktgg.get_md5(d['body'], d['url'])
    ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)
def parse_text(self, text, d, db, cursor):
    """Parse a table of hearing notices and insert one record per row.

    ``text`` is queried with ``xpath`` for the rows under
    ``span.detail_content``; the first row is discarded. For each remaining
    row a copy of ``d`` is filled with ``body``, ``caseNo``, ``judge``,
    ``courtNum``, ``sorttime`` and, when one of the patterns in
    ``self.party`` matches the third cell, ``anyou``, ``pname`` and
    (for two-group patterns) ``plaintiff``. The copy is then inserted into
    ``ktgg_kt_wuhan``.

    A row whose party text matched but for which ``ktgg.search_anyou``
    returned a start index of 0 is skipped. Previously this case returned
    from the whole method, silently discarding every later row as well.

    Args:
        text: Object exposing ``xpath`` (presumably a parsed HTML tree).
        d: Base record dict; must contain ``url``. Not modified.
        db: Passed through to ``ktgg.ins_mysql``.
        cursor: Passed through to ``ktgg.ins_mysql``.

    Raises:
        IndexError: If a row lacks one of the expected ``td/span`` cells.
    """
    rows = text.xpath('//span[@class="detail_content"]//tr')[1:]
    for row in rows:
        d_info = d.copy()
        d_info['body'] = (
            row.xpath('string(.)').replace('\r', '').replace('\n', ''))
        # Case number.
        d_info['caseNo'] = row.xpath('./td[2]/span/text()')[0]
        # Judge.
        d_info['judge'] = row.xpath('./td[5]/span/text()')[0]
        # Courtroom.
        d_info['courtNum'] = row.xpath('./td[6]/span/text()')[0]
        # Hearing date: keep only the part before the first space.
        d_info['sorttime'] = (
            row.xpath('./td[7]/span/text()')[0].split(' ')[0])

        # Plaintiff, defendant and cause of action.
        party = row.xpath('./td[3]/span/text()')[0]
        cause_missing = False
        for pattern in self.party:
            try:
                party = re.findall(pattern, party)[0]
            except IndexError:
                continue
            anyou = ktgg.set_anyou()
            if isinstance(party, str):
                # Single-group pattern: only the defendant side matched.
                start, end = ktgg.search_anyou(anyou, party)
                if start == 0:
                    cause_missing = True
                    break
                d_info['anyou'] = party[start:end]
                # Escape the cause: it is literal text, not a pattern.
                d_info['pname'] = re.findall(
                    '(.*?)%s' % re.escape(d_info['anyou']),
                    party)[0].replace('被告人', '').replace('被告', '')
            elif isinstance(party, tuple):
                # Two-group pattern: (plaintiff side, defendant side).
                start, end = ktgg.search_anyou(anyou, party[1])
                if start == 0:
                    cause_missing = True
                    break
                d_info['anyou'] = party[1][start:end]
                d_info['plaintiff'] = party[0].replace(
                    '原告人', '').replace('原告', '')
                d_info['pname'] = re.findall(
                    '(.*?)%s' % re.escape(d_info['anyou']),
                    party[1])[0].replace('被告人', '').replace('被告', '')
            break
        if cause_missing:
            # Skip only this row; later rows are still processed.
            continue

        d_info['md5'] = ktgg.get_md5(d_info['body'], d_info['url'])
        ktgg.ins_mysql(d_info, 'ktgg_kt_wuhan', db, cursor)
def parse_text(self, text, d, db, cursor):
    """Parse a free-text hearing notice and insert it into ``ktgg_kt_wuhan``.

    Every substring listed in ``self.tihuan`` is removed from ``text`` and
    the result is stored as ``d['body']``. ``sorttime`` and ``courtNum`` are
    set when their pattern matches. The first pattern in ``self.party`` that
    matches the body provides ``anyou``, ``pname`` and (for two-group
    patterns) ``plaintiff``. Finally ``md5`` is computed and the record is
    inserted.

    Nothing is inserted when a two-group pattern matched but
    ``ktgg.search_anyou`` returned a start index of 0.

    Args:
        text: Raw notice text.
        d: Record dict, filled in place; must already contain ``url``.
        db: Passed through to ``ktgg.ins_mysql``.
        cursor: Passed through to ``ktgg.ins_mysql``.
    """
    for fragment in self.tihuan:
        text = text.replace(fragment, '')
    d['body'] = text

    # Hearing date.
    dates = re.findall(r'\d{1,4}[年月].*?[日号]', text)
    if dates:
        d['sorttime'] = dates[0]

    # Courtroom: prefer the "在…庭" form, fall back to "第…庭".
    rooms = re.findall(r'在(.{2,7}庭)', text) or re.findall(r'第.{1,4}庭', text)
    if rooms:
        d['courtNum'] = rooms[0]

    # Plaintiff, defendant and cause of action.
    for pattern in self.party:
        try:
            party = re.findall(pattern, d['body'])[0]
        except IndexError:
            continue
        anyou = ktgg.set_anyou()
        if isinstance(party, str):
            start, end = ktgg.search_anyou(anyou, party)
            if start == 0:
                # The text is handed to ktgg.write_txt and processing
                # continues with party[0:end] as the cause.
                # NOTE(review): the tuple branch below returns in this
                # situation; confirm that not returning here is intended.
                ktgg.write_txt('anyou', text)
            d['anyou'] = party[start:end]
            # Escape the cause: it is literal text, not a pattern.
            d['pname'] = re.findall(
                '(.*?)%s' % re.escape(d['anyou']),
                party)[0].replace('人', '')
        elif isinstance(party, tuple):
            # Two-group pattern: (plaintiff side, defendant side).
            start, end = ktgg.search_anyou(anyou, party[1])
            if start == 0:
                return
            d['anyou'] = party[1][start:end]
            d['plaintiff'] = party[0].replace('人', '')
            d['pname'] = re.findall(
                '(.*?)%s' % re.escape(d['anyou']),
                party[1])[0].replace('人', '')
        break

    d['md5'] = ktgg.get_md5(d['body'], d['url'])
    ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)
def parse_text(self, d, db, cursor):
    """Parse a notice whose whole content is its title, then insert it.

    ``d['title']`` is copied to ``d['body']`` and searched for the hearing
    date (``sorttime``) and courtroom (``courtNum``). The first pattern in
    ``self.party`` that matches the title provides ``anyou``, ``pname`` and
    (for two-group patterns) ``plaintiff``. The record is inserted into
    ``ktgg_kt_wuhan`` and the method then sleeps for half a second.

    Nothing is inserted (and no sleep happens) when a party pattern matched
    but ``ktgg.search_anyou`` returned a start index of 0.

    Args:
        d: Record dict, filled in place; must contain ``title`` and ``url``.
        db: Passed through to ``ktgg.ins_mysql``.
        cursor: Passed through to ``ktgg.ins_mysql``.
    """
    title = d['title']
    d['body'] = title

    # Hearing date.
    dates = re.findall(r'\d{1,4}[年月].*?[日号]', title)
    if dates:
        d['sorttime'] = dates[0]

    # Courtroom.
    rooms = re.findall(r'第.{1,4}庭', title)
    if rooms:
        d['courtNum'] = rooms[0]

    # Cause of action, defendant and plaintiff.
    for pattern in self.party:
        try:
            party = re.findall(pattern, title)[0]
        except IndexError:
            continue
        anyou = ktgg.set_anyou()
        if isinstance(party, str):
            # Single-group pattern: only the defendant side matched.
            start, end = ktgg.search_anyou(anyou, party)
            if start == 0:
                return
            d['anyou'] = party[start:end]
            # Escape the cause: it is literal text, not a pattern.
            d['pname'] = re.findall(
                '(.*?)%s' % re.escape(d['anyou']),
                party)[0].replace('被告人', '').replace('被告', '')
        elif isinstance(party, tuple):
            # Two-group pattern: (plaintiff side, defendant side).
            start, end = ktgg.search_anyou(anyou, party[1])
            if start == 0:
                return
            d['anyou'] = party[1][start:end]
            d['plaintiff'] = party[0].replace('原告人', '').replace('原告', '')
            d['pname'] = re.findall(
                '(.*?)%s' % re.escape(d['anyou']),
                party[1])[0].replace('被告人', '').replace('被告', '')
        break

    d['md5'] = ktgg.get_md5(d['body'], d['url'])
    ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)
    time.sleep(0.5)
def parse_text(self,d,db,cursor):
    """Parse a notice that comes in one of two layouts and insert the result.

    * Title contains "排期开庭": ``d['body']`` holds several numbered
      entries. Each ``(date, text)`` pair found is stored as its own record,
      built from a fresh copy of ``d`` so that ``plaintiff``/``pname`` found
      for one entry cannot leak into the next one. ``d`` itself is left
      untouched in this branch.
    * Otherwise: ``d['body']`` is a single notice and ``d`` is filled in
      place. ``sorttime`` is only set when a date is found (an absent date
      used to raise ``IndexError``).

    In both layouts ``caseNo`` and ``courtNum`` default to ``''`` and the
    first template in ``self.pname_p`` that matches supplies ``plaintiff``
    and ``pname``.

    NOTE(review): the return values of ``ktgg.set_anyou(text)`` and
    ``ktgg.search_anyou(...)`` are stored directly as ``anyou`` and
    %-formatted into the ``self.pname_p`` templates; their type is not
    visible here -- confirm against ``ktgg``.

    Args:
        d: Record dict; must contain ``title``, ``body`` and ``url``.
        db: Passed through to ``ktgg.ins_mysql``.
        cursor: Passed through to ``ktgg.ins_mysql``.
    """
    def store(record, source):
        """Fill case number, courtroom and parties from ``source``; insert."""
        # Case number.
        case_numbers = re.findall(r'[\[【((].*?号', source)
        record['caseNo'] = case_numbers[0] if case_numbers else ''
        # Courtroom.
        rooms = re.findall(r'我院(.*?)开庭审理', source)
        record['courtNum'] = rooms[0].replace('公开', '') if rooms else ''
        # Plaintiff and defendant.
        for template in self.pname_p:
            found = re.findall(template % record['anyou'], source)
            if found:
                # NOTE(review): the two comma replacements are identical as
                # written; one was presumably meant to be a different comma
                # character -- confirm.
                record['plaintiff'] = found[0][0].replace('原告人','').replace('原告','').replace(',','').replace(',','')
                record['pname'] = found[0][1].replace('被告人','').replace('被告','')
                break
        record['md5'] = ktgg.get_md5(record['body'], record['url'])
        ktgg.ins_mysql(record, 'ktgg_kt_wuhan', db, cursor)

    if '排期开庭' in d['title']:
        entries = re.findall(
            r'(\d{1,4}[年].*?[日上下号])(.*?)\d{1,2}、{1,2}', d['body'])
        for when, details in entries:
            record = d.copy()
            record['body'] = when + details
            record['sorttime'] = when
            record['anyou'] = ktgg.set_anyou(details)
            store(record, details)
    else:
        anyou = ktgg.set_anyou()
        d['anyou'] = ktgg.search_anyou(anyou, d['body'])
        dates = re.findall(r'\d{1,4}[年月].*?[日上下号]', d['body'])
        if dates:
            d['sorttime'] = dates[0]
        store(d, d['body'])
def parse_text(self, text, d, db, cursor):
    """Parse a text with one hearing notice per line; insert one record each.

    ``text`` is split on newlines and empty lines are ignored. For every
    remaining line a copy of ``d`` receives ``body`` (the line without
    ``\\xa0`` and ``\\r``) and, when matched, ``sorttime`` and ``courtNum``.
    The first pattern in ``self.party`` that matches the line supplies
    ``plaintiff``, ``anyou`` and ``pname``; a line that matches none of the
    patterns is skipped without being inserted.

    Args:
        text: Raw text containing newline-separated notices.
        d: Base record dict; must contain ``url``. Not modified.
        db: Passed through to ``ktgg.ins_mysql``.
        cursor: Passed through to ``ktgg.ins_mysql``.
    """
    for line in text.split('\n'):
        if not line:
            continue
        d_info = d.copy()
        d_info['body'] = line.replace('\xa0', '').replace('\r', '')

        # Hearing date.
        dates = re.findall(r'\d{1,4}[年月].*?[日号]', line)
        if dates:
            d_info['sorttime'] = dates[0]

        # Courtroom.
        rooms = re.findall(r'在(.*?庭)', line)
        if rooms:
            d_info['courtNum'] = rooms[0]

        # Cause of action, plaintiff and defendant.
        for pattern in self.party:
            matches = re.findall(pattern, line)
            if not matches:
                continue
            # match[0] is the plaintiff side, match[1] the defendant side.
            match = matches[0]
            d_info['plaintiff'] = match[0].replace('原告', '').replace('人', '')
            anyou = ktgg.set_anyou()
            start, end = ktgg.search_anyou(anyou, match[1])
            d_info['anyou'] = match[1][start:end]
            # Escape the cause: it is literal text, not a pattern.
            names = re.findall(
                '(.*?)%s' % re.escape(d_info['anyou']), match[1])
            if names:
                d_info['pname'] = names[0].replace('被告', '').replace('人', '')
            break
        else:
            # No party pattern matched: this line is not stored.
            continue

        d_info['md5'] = ktgg.get_md5(d_info['body'], d_info['url'])
        ktgg.ins_mysql(d_info, 'ktgg_kt_wuhan', db, cursor)
def parse_text(self, text, d, db, cursor):
    """Parse whitespace-separated hearing rows and insert one record each.

    ``text`` is split into lines and each line into non-empty tokens.

    * A line with six or more tokens is a complete row. The first token
      containing "号" is the case number and the token after it holds the
      parties and cause; the first token containing a ``YYYY-MM-DD`` date
      gives ``sorttime`` and the token before it ``courtNum``.
    * Shorter non-empty lines are collected in pairs; the two token lists
      are concatenated and date, courtroom, case number and parties are
      extracted from the joined text.

    A row is skipped (not inserted) when no cause of action was located,
    i.e. the ``start`` index from ``ktgg.search_anyou`` is 0 or was never
    set for that row.

    Fixes over the previous version: ``pname`` and ``plaintiff`` are always
    stored as strings (a regex result list / ``split`` list used to be
    stored), a case number in last position no longer raises ``IndexError``,
    a date in first position no longer wraps around to the last token for
    ``courtNum``, and the cause is escaped before being used in a pattern.

    Args:
        text: Raw text of the notice table.
        d: Base record dict; must contain ``url``. Not modified.
        db: Passed through to ``ktgg.ins_mysql``.
        cursor: Passed through to ``ktgg.ins_mysql``.
    """
    pending = []  # token lists of short lines waiting for their second half
    for line in text.split('\n'):
        d_info = d.copy()
        tokens = [tok for tok in re.split(r'\s', line) if tok]
        start = 0

        # Case 1: the whole record is on one line.
        if len(tokens) >= 6:
            d_info['sorttime'] = ''
            d_info['caseNo'] = ''
            d_info['body'] = ''.join(tokens)
            for idx, token in enumerate(tokens):
                # Case number, cause of action, defendant and plaintiff.
                if '号' in token and d_info['caseNo'] == '':
                    d_info['caseNo'] = token
                    if idx + 1 >= len(tokens):
                        # No party token follows; start stays 0 so the row
                        # is skipped below.
                        break
                    party = tokens[idx + 1]
                    anyou = ktgg.set_anyou()
                    start, end = ktgg.search_anyou(anyou, party)
                    if start == 0:
                        break
                    d_info['anyou'] = party[start:end]
                    before_cause = '(.*?)%s' % re.escape(d_info['anyou'])
                    if '诉' in party:
                        sides = party.split('诉')
                        d_info['plaintiff'] = sides[0]
                        names = re.findall(before_cause, sides[1])
                        if names:
                            d_info['pname'] = names[0]
                    else:
                        d_info['pname'] = re.findall(before_cause, party)[0]
                # Hearing date and courtroom.
                if d_info['sorttime'] == '':
                    dates = re.findall(r'\d{4}-\d{2}-\d{2}', token)
                    if dates:
                        d_info['sorttime'] = dates[0]
                        if idx > 0:
                            d_info['courtNum'] = tokens[idx - 1]
            if start == 0:
                continue
            d_info['md5'] = ktgg.get_md5(d_info['body'], d_info['url'])
            ktgg.ins_mysql(d_info, 'ktgg_kt_wuhan', db, cursor)

        # Case 2: the record is spread over two consecutive short lines.
        elif tokens:
            pending.append(tokens)
            if len(pending) == 2:
                tokens = pending[0] + pending[1]
                d_info['body'] = ''.join(tokens)
                # Hearing date.
                dates = re.findall(r'\d{4}年.*?日', d_info['body'])
                if dates:
                    d_info['sorttime'] = dates[0]
                # Courtroom.
                rooms = re.findall(r'第.{2,6}庭|回龙法庭', d_info['body'])
                if rooms:
                    d_info['courtNum'] = rooms[0]
                # Case number.
                case_numbers = re.findall(r'[((民].*?号', d_info['body'])
                if case_numbers:
                    d_info['caseNo'] = case_numbers[0]
                for token in tokens:
                    if '诉' not in token:
                        continue
                    anyou = ktgg.set_anyou()
                    start, end = ktgg.search_anyou(anyou, token)
                    if start == 0:
                        break
                    d_info['anyou'] = token[start:end]
                    sides = token.split('诉')
                    # The plaintiff may be glued to the case number; keep
                    # only what follows the last "号".
                    d_info['plaintiff'] = sides[0].split('号')[-1]
                    names = re.findall(
                        '(.*?)%s' % re.escape(d_info['anyou']), sides[1])
                    if names:
                        d_info['pname'] = names[0]
                pending = []
                if start == 0:
                    continue
                d_info['md5'] = ktgg.get_md5(d_info['body'], d_info['url'])
                ktgg.ins_mysql(d_info, 'ktgg_kt_wuhan', db, cursor)
def parse_text(self,text,html,d,db,cursor):
    """Parse a hearing notice given as an HTML table or as free text.

    * Title contains "开庭公告": every ``//tbody/tr`` row of ``html`` except
      the first becomes its own record built from a copy of ``d``. Rows with
      exactly four cells carry parties / courtroom / time in cells 2-4,
      all other rows in cells 1-3. A row is skipped when no pattern in
      ``self.party`` matches or when ``ktgg.search_anyou`` returns a start
      index of 0. Previously either case returned from the whole method and
      silently discarded every later row.
    * Otherwise: ``text`` is a single notice and ``d`` is filled in place;
      the month/day found in the title is prefixed with the year taken from
      ``d['posttime']``. Nothing is inserted when no party pattern matches
      or the start index is 0.

    Args:
        text: Plain text of the notice (used by the free-text layout).
        html: Object exposing ``xpath`` (used by the table layout).
        d: Record dict; must contain ``title`` and ``url`` (and
            ``posttime`` when the title contains a month/day date).
        db: Passed through to ``ktgg.ins_mysql``.
        cursor: Passed through to ``ktgg.ins_mysql``.

    Raises:
        IndexError: If a table row lacks one of the expected cells.
    """
    if '开庭公告' in d['title']:
        rows = html.xpath('//tbody/tr')[1:]
        for row in rows:
            d_info = d.copy()
            d_info['body'] = row.xpath('string(.)').replace('\r','').replace('\n','')
            # Four-cell rows have one extra leading column.
            shift = 1 if len(row.xpath('./td')) == 4 else 0
            # Hearing date: keep only the part before the first space.
            d_info['sorttime'] = row.xpath(
                './td[%d]' % (3 + shift))[0].xpath('string(.)').split(' ')[0]
            # Courtroom.
            d_info['courtNum'] = row.xpath(
                './td[%d]' % (2 + shift))[0].xpath('string(.)')
            party = row.xpath('./td[%d]' % (1 + shift))[0].xpath('string(.)')

            # Defendant, plaintiff and cause of action.
            parsed = False
            for pattern in self.party:
                try:
                    party = re.findall(pattern, party)[0]
                except IndexError:
                    continue
                anyou = ktgg.set_anyou()
                if isinstance(party, str):
                    # Single-group pattern: only the defendant side matched.
                    start, end = ktgg.search_anyou(anyou, party)
                    if start == 0:
                        break
                    d_info['anyou'] = party[start:end]
                    # Escape the cause: it is literal text, not a pattern.
                    d_info['pname'] = re.findall(
                        '(.*?)%s' % re.escape(d_info['anyou']),
                        party)[0].replace('被告人','').replace('被告','')
                elif isinstance(party, tuple):
                    # Two-group pattern: (plaintiff side, defendant side).
                    start, end = ktgg.search_anyou(anyou, party[1])
                    if start == 0:
                        break
                    d_info['anyou'] = party[1][start:end]
                    d_info['plaintiff'] = party[0].replace('原告人','').replace('原告','')
                    d_info['pname'] = re.findall(
                        '(.*?)%s' % re.escape(d_info['anyou']),
                        party[1])[0].replace('被告人','').replace('被告','')
                parsed = True
                break
            if not parsed:
                # Skip only this row; later rows are still processed.
                continue

            d_info['md5'] = ktgg.get_md5(d_info['body'], d_info['url'])
            ktgg.ins_mysql(d_info, 'ktgg_kt_wuhan', db, cursor)
    else:
        d['body'] = text
        # Hearing date: month/day from the title, year from the post date.
        dates = re.findall(r'\d{1,2}月.*?日', d['title'])
        if dates:
            d['sorttime'] = d['posttime'].split('-')[0] + '年' + dates[0]
        # Courtroom.
        rooms = re.findall(r'在(.{2,5}庭)', d['body'])
        if rooms:
            d['courtNum'] = rooms[0]

        for pattern in self.party:
            try:
                party = re.findall(pattern, d['body'])[0]
            except IndexError:
                continue
            anyou = ktgg.set_anyou()
            if isinstance(party, str):
                start, end = ktgg.search_anyou(anyou, party)
                if start == 0:
                    return
                d['anyou'] = party[start:end]
                d['pname'] = re.findall(
                    '(.*?)%s' % re.escape(d['anyou']),
                    party)[0].replace('被告人','').replace('被告','').replace('审','').replace('理','')
            elif isinstance(party, tuple):
                start, end = ktgg.search_anyou(anyou, party[1])
                if start == 0:
                    return
                d['anyou'] = party[1][start:end]
                d['plaintiff'] = party[0].replace('原告人','').replace('原告','').replace('审','').replace('理','')
                d['pname'] = re.findall(
                    '(.*?)%s' % re.escape(d['anyou']),
                    party[1])[0].replace('被告人','').replace('被告','')
            break
        else:
            # No party pattern matched: the notice is not stored.
            return

        d['md5'] = ktgg.get_md5(d['body'], d['url'])
        ktgg.ins_mysql(d, 'ktgg_kt_wuhan', db, cursor)