class Spider(MainSpider): def __init__(self): self.task_id = "sichuan" self.site_name = "四川法院司法公开网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): form = {"ah": "", "page": "1", "fydm": "51", "limit": "9", "nd": ""} url = "http://111.230.134.78:8081/sdgl/app/sdggsd_list" log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, str(form['page']))) self.http.set_charset("unicode") self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: json_data = self.http.parse_json() object_list = self.parse_list(json_data, form) log.info("开始存储=============={},第{}页".format( self.site_name, str(form['page']))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() total_page = self.get_total_page(json_data) for i in range(2, total_page + 1): try: form["page"] = i log.info("开始抓取=============={},第{}页".format( self.site_name, i)) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: json_data = self.http.parse_json() object_list = self.parse_list(json_data, form) log.info("开始存储=============={},第{}页".format( self.site_name, str(form['page']))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() else: SpiderException( "抓取json{},第{}页异常".format(self.site_name, str(form['page'])), self.task_id, url, self.site_name) except Exception: m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 # break self.mysql_client.session_close() else: SpiderException( "抓取json{},第{}页异常".format(self.site_name, str(form['page'])), self.task_id, url, self.site_name) log.info("抓取{}结束".format(self.site_name)) def added_parse(self): pass def parse_list(self, json_data, form): # 解析获取到的json log.info("开始解析{}第{}页".format(self.site_name, (form['page']))) object_list = list() case_list = json_data["data"] for case in case_list: if "开庭公告" in html.unescape(case["ggbt"]): item = dict() item["release_date"] = case["clsj"] formdata = { "ggsdid": "{}".format(str(case['ggsdid'])), "ssfy": "{}".format(str(case['fydm'])) } ur = "http://111.230.134.78:8081/sdgl/app/getGgsdInfo.do" self.http.http_session(ur, "post", data=formdata, headers=self.headers) json = self.http.parse_json()["data"] item["taskid"] = self.task_id item["release_date"] = html.unescape(json.get("CLSJ")) item["title"] = html.unescape(json.get("GGBT")) item["court_y"] = get_content(json.get("SSFYMC")) # 法院 content = html.unescape(json.get("GGNR")) t_way = self.task_id + str(time.time()) + '.txt' file_out(t_way, str(content)) item["court_t"] = "".join(re.findall("法院.{1,10}庭", content)).replace( "法院", "") item["court_num"] = html.unescape(json.get("AH")) # 案号 item["trial_cause"] = html.unescape( json.get("CBRXM").strip()) # 审判人员 item['bulletin_way'] = t_way item["site_name"] = self.site_name b = BulletinCourt(**item) object_list.append(b) return object_list
class Spider(MainSpider): def __init__(self): self.task_id = "qinghai" self.site_name = "青海法院网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers self.http.set_charset("gbk") def parse(self): form = { "p": "1", "LocationID": "0700000000", "sub": "" } url = "http://qhfy.chinacourt.org/fygg/index.php" log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, (form['p']))) self.http.http_requst(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format(self.site_name, (form['p']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() for i in range(2, int(total_page)+1): try: form["p"] = i log.info("开始抓取=============={},第{}页".format(self.site_name, (form['p']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format(self.site_name, (form['p']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException("抓取{},第{}页异常".format(self.site_name, ( form['p'])), self.task_id, url, self.site_name) # except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException("抓取{},第{}页异常".format(self.site_name, (form['p'])), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取{}结束".format(self.site_name)) def added_parse(self): pass def parse_html(self, html): # 解析html doc = pq(html) # print(doc("td.td_pagebar").text()) total_page = "".join(re.findall("共\s.*\s页", doc("td.td_pagebar").text())).replace( "共", "").replace("页", "").strip() lis = doc('td.td_line').items() object_list = list() for x in lis: if "开庭" in x.text(): self.http.http_session("http://qhfy.chinacourt.org" + x('a').attr.href, "get", headers=self.headers) htm = self.http.parse_html() doc = pq(htm) content = doc item = dict() item["taskid"] = self.task_id item["release_date"] = "".join(re.findall("\d{4}-\d{2}-\d{2}", content("p").text())) item["title"] = x.text() t_way = self.task_id + str(time.time()) + '.txt' item["bulletin_way"] = t_way item["court_y"] = "".join(re.findall(".{2,10}人民法院", content('span.detail_content').text())) item["court_t"] = "".join(re.findall("(在.{2,10}公开)", content('span.detail_content').text()) ).replace("在", "").replace("公开", "").replace("依法", "") # item["start_court_t"] = "".join(re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}", x('a').attr.title)) item["court_part"] = "".join(re.findall("(在.{2,10}公开)", content('span.detail_content').text()) ).replace("在", "").replace("公开", "").replace("依法", "") item["site_name"] = self.site_name # print(item) if eval(item["release_date"].replace("-", "")) > eval("20180101"): file_out(t_way, str(htm)) # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # 返回对象列表和总页数 return object_list, total_page
class Spider(MainSpider): def __init__(self): self.task_id = "hunan" self.site_name = "湖南法院网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): page = 1 url = "http://hunanfy.chinacourt.org/article/index/id/M0jONTAwNzAwNCACAAA/page/{}.shtml".format( page) log.info("开始抓取==============湖南法院网") log.info("开始抓取==============湖南法院网,第{}页".format(page)) self.http.http_session(url, "get", headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储==============山西法院诉讼服务网,第{}页".format(page)) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() for i in range(2, int(total_page) + 1): page = i try: log.info("开始抓取==============湖南法院网,第{}页".format(page)) url = "http://hunanfy.chinacourt.org/article/index/id/M0jONTAwNzAwNCACAAA/page/{}.shtml".format( page) self.http.http_session(url, "get", headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info( "开始存储==============山西法院诉讼服务网,第{}页".format(page)) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException("抓取湖南法院网,第{}页异常".format(page), self.task_id, url, self.site_name) except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException("抓取湖南法院网,第{}页异常".format(page), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取湖南法院网结束") def parse_html(self, html): # 解析html doc = pq(html) page_lis = doc('a').items() for pag in page_lis: if pag.text() == "尾页": total_page = "".join(re.findall("(\d*.shtml)", pag.attr.href)).replace( ".shtml", "") lis = doc('div.font14 li').items() # 创建对象列表 object_list = list() for x in lis: # 创建item字典 item = dict() item["release_date"] = x('span.right').text() self.http.http_session("http://hunanfy.chinacourt.org" + x('a').attr.href, "get", headers=self.headers) htm = self.http.parse_html() # 生成文件路径 t_way = self.task_id + str(time.time()) + '.txt' # 将获取的html写入文件 file_out(t_way, str(htm)) doc = pq(htm) content = doc('div.detail') item["taskid"] = self.task_id item["title"] = content('div.detail_bigtitle').text() item["court_y"] = "".join( re.findall("在.*法院", content('div.detail_txt').text())).replace("在", "") item["court_t"] = "".join( re.findall("刑.*庭", content('div.detail_txt').text())) item["start_court_t"] = "".join( re.findall("本院定于\d{4}年.{1,5}日", content('div.detail_txt').text())).replace( "年", "-").replace("月", "-").replace("日", "").replace( "本院定于", "") item["court_num"] = "".join( re.findall("审理.*号", content('div.detail_txt').text())).replace( "审理", "") item["trial_cause"] = "".join( re.findall("合议庭成员.*\s", content('div.detail_txt').text())).replace( "合议庭成员:", "").replace("\n", "") item["court_part"] = "".join( re.findall("在.*法院", content('div.detail_txt').text())).replace("在", "") item['site_name'] = self.site_name # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # 返回对象列表和总页数 return object_list, total_page
class Spider(MainSpider): def __init__(self): self.task_id = "guangdong" self.site_name = "广东法院网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): form = { "action": "gotoggxxcx", "gglx": "ktgg", "flag": "first" } url = "http://www.gdcourts.gov.cn/web/search" log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, (form['flag']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format(self.site_name, (form['flag']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException("抓取{},第{}页异常".format(self.site_name, (form['pagecur'])), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取{}结束".format(self.site_name)) def added_parse(self): pass def parse_html(self, html): # 解析html # 生成文件路径 t_way = self.task_id + str(time.time()) + '.txt' # 生成文件路径 file_out(t_way, str(html.encode("utf8"))) doc = pq(html) lis = doc('div.doclist tr').items() object_list = list() x_lis = list() for x in lis: x_lis.append(x) text_lis = list() for i in x_lis[1:]: text_lis = list() for text in i('td').items(): text_lis.append(text.text()) item = dict() item["taskid"] = self.task_id item["bulletin_way"] = t_way item["court_num"] = text_lis[0] item["court_pur"] = text_lis[1] item["court_part"] = text_lis[2] item["start_court_t"] = text_lis[3] item["court_end_t"] = text_lis[4] item["court_status"] = text_lis[5] item["site_name"] = self.site_name # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # # 返回对象列表和总页数 return object_list
class Spider(MainSpider): def __init__(self): self.task_id = "shanxi" self.site_name = "山西法院诉讼服务网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): form = {"channelId": "307", "listsize": "238", "pagego": "1"} url = "http://www.shanxify.gov.cn/ktggPage.jspx" log.info("开始抓取==============山西法院诉讼服务网") log.info("开始抓取==============山西法院诉讼服务网,第{}页".format(str( form['pagego']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储==============山西法院诉讼服务网,第{}页".format( str(form['pagego']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() for i in range(2, int(total_page) + 1): try: form["pagego"] = i log.info("开始抓取==============山西法院诉讼服务网,第{}页".format(i)) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info( "开始存储==============抓取山西法院诉讼服务网,第{}页".format(i)) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException("抓取山西法院诉讼服务网,第{}页异常".format(i), self.task_id, url, self.site_name) except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException("抓取山西法院诉讼服务网,第{}页异常".format(str(form['pagego'])), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取山西法院诉讼服务网结束") def parse_html(self, html): # 解析html doc = pq(html) total_page = int(doc('a.zt_02').text()[-3:]) lis = doc('div.text ul li a').items() # 创建对象列表 object_list = list() for x in lis: # 创建item字典 item = dict() self.http.http_session(x.attr.href, "post", headers=self.headers) htm = self.http.parse_html() doc = pq(htm) # 生成文件路径 t_way = self.task_id + str(time.time()) + '.txt' # 将获取的html写入文件 file_out(t_way, str(htm)) content = doc('div.text') item["taskid"] = self.task_id item["release_date"] = content('h2').text()[3:13] item["title"] = content('h1').text() item["bulletin_way"] = t_way item["court_y"] = "".join( re.findall("(在.*院)", content('h1').text())).replace("在", "") item["court_t"] = "".join( re.findall("(院.*庭)", content('h1').text())).replace("院", "").replace( "开庭", "") item["start_court_t"] = x.text()[:16] if u"刑事" in item["title"]: item["defendant"] = "".join( re.findall("(审理.*)", content('p').text().replace("\xa0\xa0", ""))).replace( "审理", "") else: item["plaintiff"] = "".join( re.findall("(审理.*诉)", content('p').text().replace( "\xa0\xa0", ""))).replace("审理", "").replace("诉", "") item["defendant"] = "".join( re.findall("(诉.*等)", content('p').text().replace( "\xa0\xa0", ""))).replace("诉", "").replace("等", "") item['site_name'] = self.site_name # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # 返回对象列表和总页数 return object_list, total_page
class Spider(MainSpider): def __init__(self): self.task_id = "fujian" self.site_name = "福建省高级人民法院法院公告" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): url = "https://www.fjcourt.gov.cn/page/public/courtreport.html" log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, 1)) self.http.http_requst(url, "get", headers=self.headers, verify=False) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page, VIEWSTATE = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format(self.site_name, 1)) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() for i in range(2, int(total_page) + 1): form = { "__VIEWSTATE": VIEWSTATE, "__VIEWSTATEGENERATOR": "54969BDC", "__EVENTTARGET": "ctl00$cplContent$AspNetPager1", } try: form["__EVENTARGUMENT"] = i log.info("开始抓取=============={},第{}页".format( self.site_name, (form['__EVENTARGUMENT']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page, VIEWSTATE = self.parse_html( html_data) log.info("开始存储=============={},第{}页".format( self.site_name, (form['__EVENTARGUMENT']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException( "抓取{},第{}页异常".format(self.site_name, (form['__EVENTARGUMENT'])), self.task_id, url, self.site_name) # except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException("抓取{},第{}页异常".format(self.site_name, 1), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取{}结束".format(self.site_name)) def added_parse(self): pass def parse_html(self, html): doc = pq(html) total_page = 10 for page in doc('a.pagination').items(): if page.text() == ">>": total_page = int("".join(re.findall("\d{2,3}", page.attr.href))) VIEWSTATE = doc("div.aspNetHidden input").attr.value lis = doc('ul.module-case-items li').items() object_list = list() for x in lis: self.http.http_session("https://www.fjcourt.gov.cn" + x('a').attr.href, "get", headers=self.headers, verify=False) htm = self.http.parse_html() doc = pq(htm) # 生成文件路径 t_way = self.task_id + str(time.time()) + '.txt' # 生成文件路径 file_out(t_way, str(htm)) content = doc('div.article-wrap') item = dict() item["taskid"] = self.task_id item["title"] = content('p.article-hd-title').text() item["bulletin_way"] = t_way item["court_y"] = content('span.article-author').text() item["court_t"] = "".join( re.findall("(在.*公开)", content('div.article-content').text())).replace( "在", "").replace("公开", "") item["start_court_t"] = x('span.cir-time').text().replace( "[", "").replace("]", "") item["court_part"] = "".join( re.findall("(在.*公开)", content('div.article-content').text())).replace( "在", "").replace("公开", "") item["site_name"] = self.site_name pub_time = (item["start_court_t"].replace("-", "")) date = get_today_date() if eval(pub_time) > eval(date): # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # 返回对象列表和总页数 return object_list, total_page, VIEWSTATE
class Spider(MainSpider): site_name = '新疆法院诉讼服务网' def __init__(self, taskid): MainSpider.__init__(self, task_id=taskid) self.http = HttpRequest(taskid, self.site_name) self.url = 'http://220.171.35.30/ktggSearchResult.jspx?fyid=&ktdd=&page={page}' self.taskid = taskid def parse(self): log.info('开始抓取新疆法院诉讼服务网第{page}页信息'.format(page='1')) self.http.http_session(self.url.format(page='1'), 'get', headers=self.http.headers) r = self.http.parse_html() log.info('解析新疆法院诉讼服务网第{page}页信息'.format(page='1')) p_list = self.parse_list(r) b_list = list() for p in p_list: d_url = p['det_url'] log.info('开始抓取新疆法院诉讼服务网第{page},第{strip}条信息'.format(page='1', strip=str(p_list.index(p)+1))) self.http.http_session(d_url, 'get', headers=self.http.headers) det_mess = self.http.parse_html() log.info('解析新疆法院诉讼服务网第{page},第{strip}条信息'.format(page='1', strip=str(p_list.index(p)+1))) self.parse_info(det_mess, p) t_way = self.taskid + str(time.time()) + '.txt' file_out(t_way, p['html']) p['bulletin_way'] = t_way p.pop('det_url') p.pop('html') p['taskid'] = self.taskid b = BulletinCourt(**p) b_list.append(b) log.info('存储新疆法院诉讼服务网第{page}页数据'.format(page='1')) self.mysql_client.session_insert_list(b_list) self.mysql_client.session_commit() p_total = self.page_total(r) for total in range(2, p_total): try: log.info('开始抓取新疆法院诉讼服务网第{page}页信息'.format(page=str(total))) self.http.http_session(self.url.format(page=str(total)), 'get', headers=self.http.headers) r = self.http.parse_html() log.info('解析重新疆法院诉讼服务网第{page}页信息'.format(page=str(total))) p_list = self.parse_list(r) b_list = list() for p in p_list: d_url = p['det_url'] log.info('开始重新疆法院诉讼服务网第{page},第{strip}条信息'.format(page=str(total), strip=str(p_list.index(p) + 1))) self.http.http_session(d_url, 'get', headers=self.http.headers) det_mess = self.http.parse_html() log.info('解析新疆法院诉讼服务网第{page},第{strip}条信息'.format(page=str(total), strip=str(p_list.index(p) + 1))) self.parse_info(det_mess, p) t_way = self.taskid + str(time.time()) + '.txt' file_out(t_way, p['html']) p['bulletin_way'] = t_way p.pop('det_url') p.pop('html') p['taskid'] = self.taskid b = BulletinCourt(**p) b_list.append(b) log.info('存储新疆法院诉讼服务网第{page}页数据'.format(page=str(total))) self.mysql_client.session_insert_list(b_list) self.mysql_client.session_commit() except Exception: m = traceback.format_exc() SpiderException(m, self.taskid, self.site_name, self.url) self.mysql_client.session_close() log.info('抓取新疆法院诉讼服务网结束') def added_parse(self): pass def parse_list(self, r): doc = pq(r) trs = doc('table tr') p_list = list() for i in range(1, trs.size()): item = dict() tr = trs.eq(i) td1 = tr('td').eq(1) item['det_url'] = td1('a').attr('href') item['title'] = td1('a').attr('title') item['court_y'] = tr('td').eq(2).text() item['start_court_t'] = tr('td').eq(3).text() p_list.append(item) return p_list def parse_info(self, rs, item): doc = pq(rs) title = doc('title').text() con = doc('div.con') c_title = con('div.title').text() court = con('origin').text() p = con('div.content').children("p") c_html = '' for var in p.items(): c_html += var.text() + '\r\n' html = title + '\r\n' + c_title + '\r\n' + court + '\r\n' + c_html item['html'] = html def page_total(self, res): try: doc = pq(res) jump = doc('div.jump div.skip').children('a') len = jump.eq(jump.length - 1) k = int(len.attr('onclick').replace('turnPage(', '').replace(')', '')) return k except Exception: m = traceback.format_exc() SpiderException(m, self.taskid, self.site_name, '解析总页数异常') return 0
class Spider(MainSpider): site_name = '天津法院网' def __init__(self, taskid): MainSpider.__init__(self, task_id=taskid) self.http = HttpRequest(taskid, self.site_name) self.url = 'http://tjfy.chinacourt.org/article/index/id/MzDIMTCwMDAwNCACAAA%3D/page/{page}.shtml' self.taskid = taskid def parse(self): log.info('开始抓取天津法院网') ct = 1 while ct < 30: log.info('开始抓取天津法院网第{page}页信息'.format(page=str(ct))) self.http.http_session(self.url.format(page=str(ct)), 'get', headers=self.http.headers) try: r = self.http.parse_html() log.info('解析天津法院网第{page}页信息'.format(page=str(ct))) p_list = self.parse_list(r) ic = self.is_c(r) object_list = list() for i in p_list: try: log.info('开始抓取天津法院网第{page},第{strip}条信息'.format(page=str(ct), strip=str(p_list.index(i)+1))) d_url = 'http://tjfy.chinacourt.org' + i['det_url'] self.http.http_session(d_url, 'get', headers=self.http.headers) rl = self.http.parse_html() log.info('解析天津法院网第{page},第{strip}条信息'.format(page=str(ct), strip=str(p_list.index(i)))) self.parse_info(rl, i) log.info('写出天津法院网第{page},第{strip}条信息'.format(page=str(ct), strip=str(p_list.index(i)))) t_way = self.taskid + str(time.time()) + '.txt' file_out(t_way, i['html']) i['bulletin_way'] = t_way i.pop('det_url') i.pop('html') b = BulletinCourt(**i) object_list.append(b) except Exception: m = traceback.format_exc() SpiderException(m, self.taskid, self.site_name, self.url) log.info('存储天津法院网第{page}页数据'.format(page=str(ct), strip=str(p_list.index(i)))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() if ic == 0: break except Exception: m = traceback.format_exc() SpiderException(m, self.taskid, self.site_name, self.url) ct += 1 self.mysql_client.session_close() log.info('开始抓取天津法院网结束') def added_parse(self): pass def parse_list(self, r): doc = pq(r) main = doc('div#main') ul = main('ul li').items() p_list = list() for l in ul: item = dict() hr = l('a').attr('href') title = l('a').attr('title') time = l('span.right').text() item["taskid"] = '111111111' item['det_url'] = hr item['start_court_t'] = time p_list.append(item) return p_list def parse_info(self, rs, item): rr = pq(rs) det = rr('div.detail') tit = det('div.title') title = tit('div.b_title').text() txt = tit('div.sth_a span').eq(0).text() time = txt.split(':')[2].strip() cont = det('div.text').text() html = title + '\r\n' + txt + '\r\n' + cont item['release_date'] = time item['html'] = html item['title'] = title def is_c(self, res): try: doc = pq(res) d = doc('#category .paginationControl').eq(0) c = int(d('.current').text()) a = d('a') count = 0 for var in a.items(): count = count + 1 s = var.text() if s == '下一页': break t = a.eq(count - 2) ts = int(t.text()) if ts <= c: return 0 else: return 1 except Exception: return 1
class Spider(MainSpider): site_name = '内蒙古自治区高级人民法院司法公开网' def __init__(self, taskid): MainSpider.__init__(self, task_id=taskid) self.http = HttpRequest(taskid, self.site_name) self.url = 'http://www.nmgfy.gov.cn/fygg/index.jhtml' self.taskid = taskid def parse(self): log.info('开始抓取内蒙古自治区高级人民法院司法公开网第{page}页信息'.format(page='1')) self.http.http_session(self.url.format(page='1'), 'get', headers=self.http.headers) r = self.http.parse_html() log.info('解析内蒙古自治区高级人民法院司法公开网第{page}页信息'.format(page='1')) doc = pq(r) skip = doc('div.turn_page').children('p').children('a') nurl = 'http://www.nmgfy.gov.cn' + skip.eq(skip.length - 1).attr('href').replace('&', '&')\ .replace('pagecur=1', 'pagecur={pageno}') p_list = self.parse_list(r) b_list = list() for p in p_list: try: d_url = p['det_url'] log.info('开始抓取内蒙古自治区高级人民法院司法公开网第{page},第{strip}条信息'.format( page='1', strip=str(p_list.index(p) + 1))) self.http.http_session(d_url, 'get', headers=self.http.headers) det_mess = self.http.parse_html() log.info('解析内蒙古自治区高级人民法院司法公开网第{page},第{strip}条信息'.format( page='1', strip=str(p_list.index(p) + 1))) self.parse_info(det_mess, p) t_way = self.taskid + str(time.time()) + '.txt' file_out(t_way, p['html']) p['bulletin_way'] = t_way p.pop('det_url') p.pop('html') p['taskid'] = self.taskid b = BulletinCourt(**p) b_list.append(b) except Exception: m = traceback.format_exc() SpiderException(m, self.taskid, self.site_name, d_url) log.info('存储内蒙古自治区高级人民法院司法公开网第{page}页数据'.format(page='1')) self.mysql_client.session_insert_list(b_list) self.mysql_client.session_commit() p_total = self.page_total(r) for total in range(2, p_total): try: log.info( '开始抓取内蒙古自治区高级人民法院司法公开网第{page}页信息'.format(page=str(total))) self.http.http_session(nurl.format(pageno=str(total)), 'get', headers=self.http.headers) r = self.http.parse_html() log.info( '解析内蒙古自治区高级人民法院司法公开网第{page}页信息'.format(page=str(total))) p_list = self.parse_list(r) b_list = list() for p in p_list: try: d_url = p['det_url'] log.info( '开始抓取内蒙古自治区高级人民法院司法公开网第{page},第{strip}条信息'.format( page=str(total), strip=str(p_list.index(p) + 1))) self.http.http_session(d_url, 'get', headers=self.http.headers) det_mess = self.http.parse_html() log.info( '解析内蒙古自治区高级人民法院司法公开网第{page},第{strip}条信息'.format( page=str(total), strip=str(p_list.index(p) + 1))) self.parse_info(det_mess, p) t_way = self.taskid + str(time.time()) + '.txt' file_out(t_way, p['html']) p['bulletin_way'] = t_way p.pop('det_url') p.pop('html') p['taskid'] = self.taskid b = BulletinCourt(**p) b_list.append(b) except Exception: m = traceback.format_exc() SpiderException(m, self.taskid, self.site_name, d_url) log.info( '存储宁内蒙古自治区高级人民法院司法公开网第{page}页数据'.format(page=str(total))) self.mysql_client.session_insert_list(b_list) self.mysql_client.session_commit() except Exception: m = traceback.format_exc() SpiderException(m, self.taskid, self.site_name, self.url) self.mysql_client.session_close() log.info('抓取内蒙古自治区高级人民法院司法公开网结束') def added_parse(self): pass def parse_list(self, r): p_list = list() doc = pq(r) sec = doc('ul.sswy_news').children('li') for var in sec.items(): item = dict() det_url = var('a').attr('href') title = var('a').attr('title') start_court_t = re.search('\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}', title).group() item['det_url'] = det_url item['title'] = title item['start_court_t'] = start_court_t p_list.append(item) return p_list def parse_info(self, rs, item): doc = pq(rs) con = doc('div.ywzw_con_inner') p_source = con('p.p_source').text() title = con('h3.h3_title').text() release_date = p_source.split(' 来源:')[0].strip() p_notice = 
con('p.p_notice').text()
        p_text = con('p.p_text').text()
        start_court_t = re.search('\d{4}年\d{2}月\d{2}', p_text).group().replace(
            '年', '-').replace('月', '-')
        p_tcgg = con('p.tcgg').text()
        p_date = con('p.p_date').text()
        court_y = title
        html = p_source.replace('\u3000', ' ') + '\r\n' + title + '\r\n' + p_notice + \
            '\r\n' + p_text + '\r\n' + p_tcgg + '\r\n' + p_date
        item['release_date'] = release_date
        item['html'] = html
        item['court_y'] = court_y
        item['start_court_t'] = start_court_t

    def page_total(self, res):
        try:
            doc = pq(res)
            skip = doc('div.turn_page').children('p').children('a')
            tpage = int(skip.eq(skip.length - 2).text())
            return tpage
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, '解析总页数异常')
            return 0
class Spider(MainSpider): site_name = '重庆法院公共服务网' def __init__(self, taskid): MainSpider.__init__(self, task_id=taskid) self.http = HttpRequest(taskid, self.site_name) self.url = 'http://www.cqfygzfw.com/court/gg_listgg.shtml?gg.endDate={end}&gg.startDate={start}' \ '&gg.fydm=&gg.ggnr=&page={page}' self.taskid = taskid def parse(self): log.info('开始抓取重庆法院公共服务网第{page}页信息'.format(page='1')) ts = datetime.date.today() tm = datetime.date.today() + datetime.timedelta(days=365) self.http.http_session(self.url.format(end=str(tm), start=str(ts), page='1'), 'get', headers=self.http.headers) r = self.http.parse_html().replace('►', '') log.info('解析重庆法院公共服务网第{page}页信息'.format(page='1')) p_list = self.parse_list(r) b_list = list() for p in p_list: try: d_url = 'http://www.cqfygzfw.com/court/gg_ggxx.shtml?gg.id=' + p[ 'det_url'] log.info('开始抓取重庆法院公共服务网第{page},第{strip}条信息'.format( page='1', strip=str(p_list.index(p) + 1))) self.http.http_session(d_url, 'get', headers=self.http.headers) det_mess = self.http.parse_html() log.info('解析重庆法院公共服务网第{page},第{strip}条信息'.format( page='1', strip=str(p_list.index(p) + 1))) self.parse_info(det_mess, p) t_way = self.taskid + str(time.time()) + '.txt' file_out(t_way, p['html']) p['bulletin_way'] = t_way p.pop('det_url') p.pop('html') p['taskid'] = self.taskid b = BulletinCourt(**p) b_list.append(b) except Exception: m = traceback.format_exc() SpiderException(m, self.taskid, self.site_name, d_url) log.info('存储天津法院网第{page}页数据'.format(page='1')) self.mysql_client.session_insert_list(b_list) self.mysql_client.session_commit() p_total = self.page_total(r) print(p_total) for total in range(2, p_total): try: log.info('开始抓取重庆法院公共服务网第{page}页信息'.format(page=str(total))) self.http.http_session(self.url.format(end=str(tm), start=str(ts), page=str(total)), 'get', headers=self.http.headers) r = self.http.parse_html().replace('►', '') log.info('解析重庆法院公共服务网第{page}页信息'.format(page=str(total))) p_list = self.parse_list(r) b_list = list() for p in p_list: try: d_url = 'http://www.cqfygzfw.com/court/gg_ggxx.shtml?gg.id=' + p[ 'det_url'] log.info('开始抓取重庆法院公共服务网第{page},第{strip}条信息'.format( page=str(total), strip=str(p_list.index(p) + 1))) self.http.http_session(d_url, 'get', headers=self.http.headers) det_mess = self.http.parse_html() log.info('解析重庆法院公共服务网第{page},第{strip}条信息'.format( page=str(total), strip=str(p_list.index(p) + 1))) self.parse_info(det_mess, p) t_way = self.taskid + str(time.time()) + '.txt' file_out(t_way, p['html']) p['bulletin_way'] = t_way p.pop('det_url') p.pop('html') p['taskid'] = self.taskid b = BulletinCourt(**p) b_list.append(b) except Exception: m = traceback.format_exc() SpiderException(m, self.taskid, self.site_name, d_url) log.info('存储重庆法院公共服务网第{page}页数据'.format(page=str(total))) self.mysql_client.session_insert_list(b_list) self.mysql_client.session_commit() except Exception: m = traceback.format_exc() SpiderException( m, self.taskid, self.site_name, self.url.format(end=str(tm), start=str(ts), page=str(total))) self.mysql_client.session_close() log.info('抓取重庆法院公共服务网结束') def added_parse(self): pass def parse_list(self, r): doc = pq(r) tb = doc('div.r_wenben table.table_ys tbody') trs = tb('tr') info_list = list() for tr in trs.items(): item = dict() tds = tr('td') cy = tds.eq(0).text() cn = tds.eq(1).text().strip() st = tds.eq(2).text() du = tds.eq(1).children('a').attr('onclick').replace( 'openKtgg(\'', '').replace('\')', '').strip() item['court_y'] = cy item['court_num'] = cn item['start_court_t'] = st item['det_url'] = du info_list.append(item) return 
info_list

    def parse_info(self, rs, item):
        doc = pq(rs)
        title = doc('div.tc_window_bt').text()
        case_num = doc('td.tc_td01').text()
        content = doc('table.table_ys2 tr').eq(1).children('td').text()
        html = title + '\r\n' + case_num + '\r\n' + content
        item['html'] = html
        item['title'] = title

    def page_total(self, res):
        try:
            str0 = int(
                re.search('共\d*条', res).group().replace('共', '').replace('条', ''))
            connt = math.ceil(str0 / 15)
            return connt
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, '解析总页数异常')
            return 0
class Spider(MainSpider): site_name = '河北法院网' def __init__(self, taskid): MainSpider.__init__(self, task_id=taskid) self.http = HttpRequest(taskid, self.site_name) self.url = 'http://hbgy.hbsfgk.org/ktgg/index.jhtml' self.taskid = taskid def parse(self): log.info('开始抓取河北法院网第{page}页信息'.format(page='1')) self.http.http_session(self.url.format(page='1'), 'get', headers=self.http.headers) r = self.http.parse_html() log.info('解析河北法院网第{page}页信息'.format(page='1')) doc = pq(r) skip = doc('div.turn_page').children('p').children('a.sp_next') nurl = 'http://hbgy.hbsfgk.org' + skip.attr('href').replace( 'pagecur=1', 'pagecur={pageno}') p_list = self.parse_list(r) b_list = list() for p in p_list: try: d_url = p['det_url'] log.info('开始抓取河北法院网第{page},第{strip}条信息'.format( page='1', strip=str(p_list.index(p) + 1))) self.http.http_session(d_url, 'get', headers=self.http.headers) det_mess = self.http.parse_html() log.info('解析河北法院网第{page},第{strip}条信息'.format( page='1', strip=str(p_list.index(p) + 1))) self.parse_info(det_mess, p) t_way = self.taskid + str(time.time()) + '.txt' file_out(t_way, p['html']) p['bulletin_way'] = t_way p.pop('det_url') p.pop('html') p['taskid'] = self.taskid b = BulletinCourt(**p) b_list.append(b) except Exception: m = traceback.format_exc() SpiderException(m, self.taskid, self.site_name, d_url) log.info('存储河北法院网第{page}页数据'.format(page='1')) self.mysql_client.session_insert_list(b_list) self.mysql_client.session_commit() p_total = self.page_total(r) for total in range(2, p_total): try: log.info('开始抓取河北法院网第{page}页信息'.format(page=str(total))) self.http.http_session(nurl.format(pageno=str(total)), 'get', headers=self.http.headers) r = self.http.parse_html() log.info('解析河北法院网第{page}页信息'.format(page=str(total))) p_list = self.parse_list(r) b_list = list() for p in p_list: try: d_url = p['det_url'] log.info('开始河北法院网第{page},第{strip}条信息'.format( page=str(total), strip=str(p_list.index(p) + 1))) self.http.http_session(d_url, 'get', headers=self.http.headers) det_mess = self.http.parse_html() log.info('解析河北法院网第{page},第{strip}条信息'.format( page=str(total), strip=str(p_list.index(p) + 1))) self.parse_info(det_mess, p) t_way = self.taskid + str(time.time()) + '.txt' file_out(t_way, p['html']) p['bulletin_way'] = t_way p.pop('det_url') p.pop('html') p['taskid'] = self.taskid b = BulletinCourt(**p) b_list.append(b) except Exception: m = traceback.format_exc() SpiderException(m, self.taskid, self.site_name, d_url) log.info('存储河北法院网第{page}页数据'.format(page=str(total))) self.mysql_client.session_insert_list(b_list) self.mysql_client.session_commit() except Exception: m = traceback.format_exc() SpiderException(m, self.taskid, self.site_name, self.url) time0 = get_today_date() time1 = self.get_n_t(r) strftime0 = datetime.datetime.strptime(time1, "%Y-%m-%d") strftime1 = datetime.datetime.strptime(time0, "%Y-%m-%d") fg = strftime1 > strftime0 if fg == True: break self.mysql_client.session_close() log.info('抓取河北法院网结束') def added_parse(self): pass def parse_list(self, r): doc = pq(r) p_list = list() sec = doc('ul.sswy_news').children('li') for var in sec.items(): item = dict() det_url = var('a').attr('href') title = var('a').attr('title') start_court_t = re.search('\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}', title).group() item['det_url'] = det_url item['title'] = title item['start_court_t'] = start_court_t p_list.append(item) return p_list def parse_info(self, rs, item): doc = pq(rs) doc = pq(rs) con = doc('div.ywzw_con_inner') p1 = con.children().eq(0).text() h3 = con.children().eq(1).text() p2 = 
con.children().eq(2).text()
        p3 = con.children().eq(3).text()
        p4 = con.children().eq(4).text()
        p5 = con.children().eq(5).text()
        html = p1 + '\r\n' + h3 + '\r\n' + p2 + '\r\n' + p3 + '\r\n' + p4 + '\r\n' + p5
        item['html'] = html
        item['court_y'] = h3
        item['release_date'] = re.search('\d{4}-\d{2}-\d{2}', p1).group()

    def page_total(self, res):
        try:
            doc = pq(res)
            skip = doc('div.turn_page').children('p').children('a')
            tpage = int(skip.eq(skip.length - 2).text())
            if tpage > 500:
                return 500
            else:
                return tpage
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, '解析总页数异常')
            return 0

    def get_n_t(self, r):
        doc = pq(r)
        lst = doc('ul.sswy_news').children('li')
        li = lst.eq(lst.length - 1).children('a').attr('title')
        tm = re.search('\d{4}-\d{2}-\d{2}', li).group()
        return tm
class Spider(MainSpider): def __init__(self): self.task_id = "gansu" self.site_name = "甘肃省高级人民法院司法公开网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): form = { "channelId": "307", "listsize": "100", "pagecur": "0", "pagego": "add" } url = "http://gsgf.gssfgk.com/ktggPage.jspx" log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, (form['pagecur']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format( self.site_name, (form['pagecur']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() form["listsize"] = total_page for i in range(1, int(total_page) + 1): try: form["pagecur"] = i log.info("开始抓取=============={},第{}页".format( self.site_name, (form['pagecur']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format( self.site_name, (form['pagecur']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException( "抓取{},第{}页异常".format(self.site_name, (form['pagecur'])), self.task_id, url, self.site_name) # except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException( "抓取{},第{}页异常".format(self.site_name, (form['pagecur'])), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取{}结束".format(self.site_name)) def added_parse(self): pass def parse_html(self, html): # 解析html doc = pq(html) page_list = doc('a.zt_02').items() total_page = 10 for page in page_list: if int(page.text()) > total_page: total_page = int(page.text()) lis = doc('div.text ul li a').items() object_list = list() for x in lis: item = dict() self.http.http_session(x.attr.href, "get", headers=self.headers) htm = self.http.parse_html() # # 生成文件路径 t_way = self.task_id + str(time.time()) + '.txt' # 生成文件路径 file_out(t_way, str(htm)) doc = pq(htm) content = doc('div.text') item["taskid"] = self.task_id item["release_date"] = content('h2').text()[3:13] item["title"] = content('h1').text() item["bulletin_way"] = t_way item["court_y"] = "".join( re.findall("(在.*法院)", content('h1').text())).replace("在", "") item["court_t"] = "".join( re.findall("(院.*庭)", content('h1').text())).replace("院", "").replace( "开庭", "") item["start_court_t"] = "".join( re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}", x.text())) item["plaintiff"] = "".join( re.findall("(审理.*诉)", content("p").text())).replace("审理", "").replace("诉", "") item["site_name"] = self.site_name date = get_today_date() if eval("".join(re.findall("\d{4}-\d{2}-\d{2}", x.text())).replace( "-", "")) > eval(date): # 生成文件路径 file_out(t_way, str(htm)) # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # 返回对象列表和总页数 return object_list, total_page
class Spider(MainSpider): def __init__(self): self.task_id = "zhejiang" self.site_name = "浙江法院公开网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): form = { "pageno": "1", "pagesize": "10", "cbfy": "全部", "dsr": "", "spz": "", "jarq1": "", "jarq2": "" } url = "http://www.zjsfgkw.cn/Notice/NoticeKTSearch" log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, str(form['pageno']))) self.http.http_session(url, "post", data=form, headers=self.headers) # if self.http.res_code() == 200: json_data = self.http.parse_json() object_list = self.parse_list(json_data, form) log.info("开始存储=============={},第{}页".format( self.site_name, str(form['pageno']))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() total_page = self.get_total_page(json_data) for i in range(2, total_page + 1): try: form["pageno"] = i log.info("开始抓取=============={},第{}页".format( self.site_name, i)) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: json_data = self.http.parse_json() object_list = self.parse_list(json_data, form) log.info("开始存储=============={},第{}页".format( self.site_name, str(form['pageno']))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() else: SpiderException( "抓取json{},第{}页异常".format(self.site_name, str(form['pageno'])), self.task_id, url, self.site_name) except Exception: m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break self.mysql_client.session_close() else: SpiderException( "抓取json{},第{}页异常".format(self.site_name, str(form['pageno'])), self.task_id, url, self.site_name) log.info("抓取{}结束".format(self.site_name)) def added_parse(self): pass def parse_list(self, json_data, form): # 解析获取到的json log.info("开始解析{}第{}页".format(self.site_name, (form['pageno']))) t_way = self.task_id + str(time.time()) + '.txt' file_out(t_way, str(json_data)) object_list = list() case_list = json_data["list"] for case in case_list: item = dict() item["taskid"] = self.task_id item["court_y"] = get_content(case.get("FY")) # 法院 item["court_t"] = get_content(case.get("FT")) # 法庭 item["start_court_t"] = get_content(case.get("KTRQSTRING")) # 开庭日期 item["court_num"] = get_content(case.get("AH")) # 案号 item["court_case"] = get_content(case.get("AY")) # 案由 item["trial_cause"] = get_content(case.get("SPZ")).strip() # 审判人员 item["site_name"] = self.site_name # 网站名称 item['bulletin_way'] = t_way item["undertake_dep"] = get_content(case.get("CBBM")) item["plaintiff"] = get_content(case.get("YG")).replace("原告:", "") item["defendant"] = get_content(case.get("BG")).replace("被告:", "") item["schedule_time"] = get_content(case.get("PQRQ")) b = BulletinCourt(**item) object_list.append(b) return object_list def get_total_page(self, json_data): # 获取总页数 try: total_page = json_data["total"] return int(total_page) // 10 except Exception: m = traceback.format_exc() SpiderException(m, self.task_id, self.site_name, json_data) return 0
class Spider(MainSpider): def __init__(self): self.task_id = "sanxi" self.site_name = "陕西法院网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): page = 1 url = "http://sxfy.chinacourt.org/article/index/id/M8i2NDBINDAwNCACAAA/page/{}.shtml".format( page) log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, page)) self.http.http_requst(url, "get", headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format(self.site_name, page)) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() # for i in range(2, int(total_page) + 1): try: page = i url = "http://sxfy.chinacourt.org/article/index/id/M8i2NDBINDAwNCACAAA/page/{}.shtml".format( page) log.info("开始抓取=============={},第{}页".format( self.site_name, page)) self.http.http_session(url, "get", headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format( self.site_name, page)) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException( "抓取{},第{}页异常".format(self.site_name, page), self.task_id, url, self.site_name) # except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException("抓取{},第{}页异常".format(self.site_name, page), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取{}结束".format(self.site_name)) def added_parse(self): pass def parse_html(self, html): # 解析html doc = pq(html) page = doc('div.paginationControl a').eq(5).attr.href total_page = "".join(re.findall("\d{1,3}", page)) lis = doc('span.left').items() object_list = list() for x in lis: self.http.http_session("http://sxfy.chinacourt.org" + x('a').attr.href, "get", headers=self.headers) htm = self.http.parse_html() doc = pq(htm) content = doc('div.detail') # 生成文件路径 t_way = self.task_id + str(time.time()) + '.txt' # 生成文件路径 file_out(t_way, str(content)) item = dict() item["taskid"] = self.task_id item["release_date"] = "".join( re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}", content('div.sth_a').text())) item["title"] = content('div.b_title').text() item["bulletin_way"] = t_way item["court_y"] = "陕西省高级人民法院" item["court_t"] = "".join( re.findall("(在.{1,10}公开)", content('div').text())).replace("在", "").replace( "公开", "") item["court_part"] = "".join( re.findall("(在.{1,10}公开)", content('div').text())).replace("在", "").replace( "公开", "") item["site_name"] = self.site_name # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # 返回对象列表和总页数 return object_list, total_page
class Spider(MainSpider): def __init__(self): self.task_id = "hubei" self.site_name = "湖北省高级人民法院" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): form = {"folderNo": "0401", "pageIndex": "1"} url = "http://www.ezfy.hbfy.gov.cn/DocManage/getDocsByFolder" log.info("开始抓取==============湖北省高级人民法院") log.info("开始抓取==============湖北省高级人民法院,第{}页".format( str(form['pageIndex']))) self.http.http_requst(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储==============湖北省高级人民法院,第{}页".format( str(form['pageIndex']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() for i in range(2, int(total_page) + 1): try: form["pageIndex"] = i log.info("开始抓取==============湖北省高级人民法院,第{}页".format(i)) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储==============湖北省高级人民法院,第{}页".format( str(form['pageIndex']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException("抓取湖北省高级人民法院,第{}页异常".format(i), self.task_id, url, self.site_name) except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException( "抓取湖北省高级人民法院,第{}页异常".format(str(form['pageIndex'])), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取湖北省高级人民法院结束") def added_parse(self): pass def parse_html(self, html): # 解析html # 生成文件路径 t_way = self.task_id + str(time.time()) + '.txt' # 生成文件路径 file_out(t_way, str(html)) doc = pq(html) total_page = "".join( re.findall("共.*页\s上", doc('span').text().replace("\n", "")))[1:3] lis = doc('table.newlisttable tr').items() object_list = list() for content in lis: item = dict() item["taskid"] = self.task_id item["release_date"] = "".join( re.findall("(\(.*\))", content('td').text()))[1:-1] item["title"] = content('a').text() item["bulletin_way"] = t_way item["court_y"] = "湖北省高级人民法院" if content( 'p').text()[:4] == "本院定于" else content('p').text()[:4] item["court_t"] = "".join( re.findall("(在.*判庭)", content('p').text())).replace("在", "") item["start_court_t"] = "".join( re.findall("(\d{4}年\d{2}月\d{2}日\s\d{2}:\d{2})", content('p').text())).replace("年", "-").replace( "月", "-").replace("日", "") item["plaintiff"] = "".join( re.findall("(原告:.*;)", content('p').text())).replace("原告:", "") item["defendant"] = "".join( re.findall("(被告:.*的)", content('p').text())).replace("被告:", "").replace("的", "") item["site_name"] = self.site_name # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # 返回对象列表和总页数 return object_list, total_page
class Spider(MainSpider): site_name = '黑龙江法院网' def __init__(self, taskid): MainSpider.__init__(self, task_id=taskid) self.http = HttpRequest(taskid, self.site_name) self.url = 'http://www.hljcourt.gov.cn/ktgg/index.php?p={page}&st={start}&et={end}' self.taskid = taskid def parse(self): log.info('开始抓取黑龙江法院网第{page}页信息'.format(page='1')) ts = datetime.date.today() tm = datetime.date.today() + datetime.timedelta(days=365) self.http.http_session(self.url.format(page='1', start=str(ts), end=str(tm)), 'get', headers=self.http.headers) self.http.set_charset('gb2312') r = self.http.parse_html() print(r) log.info('解析抓取黑龙江法院网第{page}页信息'.format(page='1')) p_list = self.parse_list(r) b_list = list() for p in p_list: try: d_url = 'http://www.hljcourt.gov.cn/ktgg/' + p['det_url'] log.info('开始抓取黑龙江法院网第{page},第{strip}条信息'.format( page='1', strip=str(p_list.index(p) + 1))) self.http.http_session(d_url, 'get', headers=self.http.headers) det_mess = self.http.parse_html() log.info('解析黑龙江法院网第{page},第{strip}条信息'.format( page='1', strip=str(p_list.index(p) + 1))) self.parse_info(det_mess, p) t_way = self.taskid + str(time.time()) + '.txt' file_out(t_way, p['html']) p['bulletin_way'] = t_way p.pop('det_url') p.pop('html') p['taskid'] = self.taskid b = BulletinCourt(**p) b_list.append(b) except Exception: m = traceback.format_exc() SpiderException(m, self.taskid, self.site_name, d_url) break log.info('存储黑龙江法院网第{page}页数据'.format(page='1')) self.mysql_client.session_insert_list(b_list) self.mysql_client.session_commit() p_total = self.page_total(r) for total in range(2, p_total): try: log.info('开始抓取黑龙江法院网第{page}页信息'.format(page=str(total))) self.http.http_session(self.url.format(page=str(total), start=str(ts), end=str(tm)), 'get', headers=self.http.headers) r = self.http.parse_html() log.info('解析黑龙江法院网第{page}页信息'.format(page=str(total))) p_list = self.parse_list(r) b_list = list() for p in p_list: try: d_url = 'http://www.hljcourt.gov.cn/ktgg/' + p[ 'det_url'] log.info('开始抓取黑龙江法院网第{page},第{strip}条信息'.format( page=str(total), strip=str(p_list.index(p) + 1))) self.http.http_session(d_url, 'get', headers=self.http.headers) det_mess = self.http.parse_html() log.info('解析黑龙江法院网第{page},第{strip}条信息'.format( page=str(total), strip=str(p_list.index(p) + 1))) self.parse_info(det_mess, p) t_way = self.taskid + str(time.time()) + '.txt' file_out(t_way, p['html']) p['bulletin_way'] = t_way p.pop('det_url') p.pop('html') p['taskid'] = self.taskid b = BulletinCourt(**p) b_list.append(b) except Exception: m = traceback.format_exc() SpiderException(m, self.taskid, self.site_name, d_url) log.info('存储黑龙江法院网第{page}页数据'.format(page=str(total))) self.mysql_client.session_insert_list(b_list) self.mysql_client.session_commit() except Exception: m = traceback.format_exc() SpiderException( m, self.taskid, self.site_name, self.url.format(end=str(tm), start=str(ts), page=str(total))) self.mysql_client.session_close() log.info('抓取黑龙江法院网结束') def added_parse(self): pass def parse_list(self, r): info_list = list() doc = pq(r) tb = doc('table tbody').children('tr').children('td') k = int(tb.size() / 5) for i in range(0, k): item = dict() title = tb.eq(i * 5 + 1).text() court_num = tb.eq(i * 5 + 2).text() court_part = tb.eq(i * 5 + 3).text() start_court_t = tb.eq(i * 5 + 4).text() det_url = tb.eq(i * 5 + 1).children('div').children('a').attr('href') item['title'] = title item['court_num'] = court_num item['court_part'] = court_part item['start_court_t'] = start_court_t item['det_url'] = det_url info_list.append(item) return info_list def 
parse_info(self, rs, item):
        doc = pq(rs)
        ct = doc('div.ggnr')
        h2 = ct('h2').text()
        h3 = ct('h3').text()
        p = ct('p').text()
        t1 = ct('div.text-01').text()
        t2 = ct('div.text-02').text()
        html = h2 + '\r\n' + h3 + '\r\n' + p + '\r\n' + t1 + '\r\n' + t2
        item['html'] = html
        item['court_y'] = h2

    def page_total(self, res):
        try:
            k = int(
                re.search('共\d*页', res).group().replace('共', '').replace('页', ''))
            return k
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, '解析总页数异常')
            return 0
class Spider(MainSpider): def __init__(self): self.task_id = "jiangxi" self.site_name = "江西庭审公开网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): date = get_today_date() form = { 'isGeneral': 'Y', 'belongOrgId': '', 'liveStatus': '001', 'page.pageSize': '20', 'page.pageNo': '1', 'gopenCourtDate': date + ' 00:00:00', 'page.orderBy': 'openCourtDate', 'page.order': 'asc', 'caseType': '', 'searchWord': '' } url = "http://www.jxfy.gov.cn/api.do?method=ttrialliveliveinfo!listAjaxp.action" log.info("开始抓取==============江西庭审公开网") log.info("开始抓取==============江西庭审公开网,第{}页".format( str(form['page.pageNo']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: json_data = self.http.parse_json() object_list = self.parse_list(json_data, form) log.info("开始存储==============江西庭审公开网,第{}页".format( str(form['page.pageNo']))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() total_page = self.get_total_page(json_data) for i in range(2, total_page + 1): try: form["page.pageNo"] = i log.info("开始抓取==============江西庭审公开网,第{}页".format(i)) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: json_data = self.http.parse_json() object_list = self.parse_list(json_data, form) log.info("开始存储==============江西庭审公开网,第{}页".format( str(form['page.pageNo']))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() else: SpiderException( "抓取json江西庭审公开网,第{}页异常".format( str(form['page.pageNo'])), self.task_id, url, self.site_name) except Exception: m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break self.mysql_client.session_close() else: SpiderException( "抓取json江西庭审公开网,第{}页异常".format(str(form['page.pageNo'])), self.task_id, url, self.site_name) def added_parse(self): pass def parse_list(self, json_data, form): # 解析获取到的json log.info("开始解析江西庭审公开网第{}页".format(str(form['page.pageNo']))) t_way = self.task_id + str(time.time()) + '.txt' file_out(t_way, str(json_data)) object_list = list() case_list = json_data["message"]["result"] for case in case_list: item = dict() item["taskid"] = self.task_id item["release_date"] = case.get("lastBroadcastTimeString") # 发布日期 item["title"] = get_content(case.get("caseName")) # 标题 item["court_y"] = get_content(case.get("belongOrgName")) # 法院 item["court_t"] = get_content(case.get("openCourtAddr")) # 法庭 item["start_court_t"] = get_content( case.get("openCourtDateString")) # 开庭日期 item["court_num"] = get_content(case.get("caseNo")) # 案号 item["case_type"] = get_content(case.get("caseTypeString")) # 案件类型 item["court_case"] = get_content( case.get("causePlacedOnFile")) # 案由 item["trial_cause"] = get_content( case.get("underJustice")).strip() # 审判人员 try: dex = case["litigants"].index("被告:") item["plaintiff"] = case["litigants"][:dex].replace( "原告:", "")[:-1] # 原告 item["defendant"] = case["litigants"][dex:].replace("被告:", "") # 被告 except: item["plaintiff"] = "" item["defendant"] = case.get("litigants") item["site_name"] = self.site_name # 网站名称 item['bulletin_way'] = t_way b = BulletinCourt(**item) object_list.append(b) return object_list def get_total_page(self, json_data): # 获取总页数 try: total_page = json_data["message"]["totalPages"] return int(total_page) except Exception: m = traceback.format_exc() SpiderException(m, self.task_id, self.site_name, json_data) return 0
class Spider(MainSpider): def __init__(self): self.task_id = "guizhou" self.site_name = "贵州法院公众服务平台" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): form = {"fyid": "", "page": "1", "kssj": "", "jssj": ""} url = "http://www.guizhoucourt.cn/ktggSearchResult.jspx" log.info("开始抓取==============贵州法院公众服务平台") log.info("开始抓取==============贵州法院公众服务平台,第{}页".format(str(form['page']))) self.http.http_requst(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储==============贵州法院公众服务平台,第{}页".format( str(form['page']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() for i in range(2, int(total_page) + 1): try: form["page"] = i log.info("开始抓取==============贵州法院公众服务平台,第{}页".format( str(form['page']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储==============贵州法院公众服务平台,第{}页".format( str(form['page']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException( "抓取贵州法院公众服务平台,第{}页异常".format(str(form['page'])), self.task_id, url, self.site_name) # except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException( "抓取贵州法院公众服务平台,第{}页异常".format(str(form['pageIndex'])), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取贵州法院公众服务平台结束") def added_parse(self): pass def parse_html(self, html): # 解析html doc = pq(html) for page in doc('a').items(): if page.text() == "末页": total_page = "".join(re.findall("\d{1,3}", page.attr.onclick)) lis = doc('table.tabData a').items() object_list = list() for x in lis: self.http.http_session(x.attr.href, "get", headers=self.headers) htm = self.http.parse_html() # # 生成文件路径 t_way = self.task_id + str(time.time()) + '.txt' # 生成文件路径 file_out(t_way, str(htm)) doc = pq(htm) content = doc('div.print-box') item = dict() item["taskid"] = self.task_id item["release_date"] = "".join( re.findall("发表日期:20\d{2}-\d{1,2}-\d{1,2}", content.text())).replace("发表日期:", "") item["title"] = x.attr.title item["bulletin_way"] = t_way item["court_y"] = content('h3').text() item["court_t"] = "".join( re.findall("(在.*依法)", content('p').text())).replace("在", "").replace("依法", "") item["start_court_t"] = "".join( re.findall("(\d{4}年\d{2}月\d{2}日\s\d{2}时\d{2})", content('p').text())).replace("年", "-").replace( "月", "-").replace("日", "").replace("时", ":") item["court_num"] = "".join( re.findall("(审理.*案件)", content('p').text())).replace("审理", "").replace("案件", "") item["court_part"] = "".join( re.findall("(在.*依法)", content('p').text())).replace("在", "").replace("依法", "") item["site_name"] = self.site_name # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # 返回对象列表和总页数 # break return object_list, total_page
class Spider(MainSpider): def __init__(self): self.task_id = "yunan" self.site_name = "云南法院司法信息网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): form = {"channelId": "858", "listsize": "673", "pagego": "1"} url = "http://www.ynfy.gov.cn/ktggPage.jspx" log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, (form['pagego']))) self.http.http_requst(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format( self.site_name, (form['pagego']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() form["listsize"] = total_page for i in range(2, int(total_page) + 1): try: form["pagego"] = i log.info("开始抓取=============={},第{}页".format( self.site_name, (form['pagego']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format( self.site_name, (form['pagego']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException( "抓取{},第{}页异常".format(self.site_name, (form['pagego'])), self.task_id, url, self.site_name) # except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException( "抓取{},第{}页异常".format(self.site_name, (form['pagego'])), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取{}结束".format(self.site_name)) def added_parse(self): pass def parse_html(self, html): # 解析html doc = pq(html) total_page = 10 for page in doc('div.turn_page a.zt_02').items(): if int(page.text()) > total_page: total_page = int(page.text()) lis = doc('ul.sswy_news li').items() object_list = list() for x in lis: self.http.http_session(x('a').attr.href, "get", headers=self.headers) htm = self.http.parse_html() # # 生成文件路径 t_way = self.task_id + str(time.time()) + '.txt' # 生成文件路径 file_out(t_way, str(htm)) doc = pq(htm) content = doc('div.ywzw_con_inner') item = dict() item["taskid"] = self.task_id item["release_date"] = "".join( re.findall("\d{4}-\d{2}-\d{2}", content('p.p_source ').text())) item["title"] = x('a').attr.title item["bulletin_way"] = t_way item["court_y"] = content('h3.h3_title').text() item["court_t"] = "".join( re.findall("(在.*依法)", content('p').text())).replace("在", "").replace("依法", "") item["start_court_t"] = "".join( re.findall("\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}", x('a').attr.title)) item["court_part"] = "".join( re.findall("(在.*依法)", content('p').text())).replace("在", "").replace("依法", "") item["site_name"] = self.site_name # 将item字典映射成对象 b = BulletinCourt(**item) object_list.append(b) # 返回对象列表和总页数 # break return object_list, total_page
class Spider(MainSpider):

    site_name = '吉林省高级人民法院司法公开网'

    def __init__(self, taskid):
        MainSpider.__init__(self, task_id=taskid)
        self.http = HttpRequest(taskid, self.site_name)
        self.url = 'http://www.jlsfy.gov.cn/ktgg/index.jhtml'
        self.taskid = taskid

    def parse(self):
        log.info('开始抓取吉林省高级人民法院司法公开网第{page}页信息'.format(page='1'))
        self.http.http_session(self.url.format(page='1'), 'get', headers=self.http.headers)
        r = self.http.parse_html()
        log.info('解析吉林省高级人民法院司法公开网第{page}页信息'.format(page='1'))
        doc = pq(r)
        skip = doc('div.turn_page').children('p').children('a.sp_next')
        # 从"下一页"链接推导出翻页url模板
        nurl = 'http://www.jlsfy.gov.cn' + skip.attr('href').replace('&amp;', '&') \
            .replace('pagecur=1', 'pagecur={pageno}')
        p_list = self.parse_list(r)
        b_list = list()
        for p in p_list:
            try:
                d_url = p['det_url']
                log.info('开始抓取吉林省高级人民法院司法公开网第{page}页,第{strip}条信息'.format(
                    page='1', strip=str(p_list.index(p) + 1)))
                self.http.http_session(d_url, 'get', headers=self.http.headers)
                det_mess = self.http.parse_html()
                log.info('解析吉林省高级人民法院司法公开网第{page}页,第{strip}条信息'.format(
                    page='1', strip=str(p_list.index(p) + 1)))
                self.parse_info(det_mess, p)
                t_way = self.taskid + str(time.time()) + '.txt'
                file_out(t_way, p['html'])
                p['bulletin_way'] = t_way
                p.pop('det_url')
                p.pop('html')
                p['taskid'] = self.taskid
                b = BulletinCourt(**p)
                b_list.append(b)
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, d_url)
        log.info('存储吉林省高级人民法院司法公开网第{page}页数据'.format(page='1'))
        self.mysql_client.session_insert_list(b_list)
        self.mysql_client.session_commit()
        p_total = self.page_total(r)
        for total in range(2, p_total):
            try:
                log.info('开始抓取吉林省高级人民法院司法公开网第{page}页信息'.format(page=str(total)))
                self.http.http_session(nurl.format(pageno=str(total)), 'get', headers=self.http.headers)
                r = self.http.parse_html()
                log.info('解析吉林省高级人民法院司法公开网第{page}页信息'.format(page=str(total)))
                p_list = self.parse_list(r)
                b_list = list()
                for p in p_list:
                    try:
                        d_url = p['det_url']
                        log.info('开始抓取吉林省高级人民法院司法公开网第{page}页,第{strip}条信息'.format(
                            page=str(total), strip=str(p_list.index(p) + 1)))
                        self.http.http_session(d_url, 'get', headers=self.http.headers)
                        det_mess = self.http.parse_html()
                        log.info('解析吉林省高级人民法院司法公开网第{page}页,第{strip}条信息'.format(
                            page=str(total), strip=str(p_list.index(p) + 1)))
                        self.parse_info(det_mess, p)
                        t_way = self.taskid + str(time.time()) + '.txt'
                        file_out(t_way, p['html'])
                        p['bulletin_way'] = t_way
                        p.pop('det_url')
                        p.pop('html')
                        p['taskid'] = self.taskid
                        b = BulletinCourt(**p)
                        b_list.append(b)
                    except Exception:
                        m = traceback.format_exc()
                        SpiderException(m, self.taskid, self.site_name, d_url)
                log.info('存储吉林省高级人民法院司法公开网第{page}页数据'.format(page=str(total)))
                self.mysql_client.session_insert_list(b_list)
                self.mysql_client.session_commit()
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.taskid, self.site_name, self.url)
        self.mysql_client.session_close()
        log.info('抓取吉林省高级人民法院司法公开网结束')

    def added_parse(self):
        pass

    def parse_list(self, r):
        p_list = list()
        doc = pq(r)
        sec = doc('ul.organList').children('li')
        for var in sec.items():
            item = dict()
            det_url = var('a').attr('href')
            title = var('a').attr('title')
            start_court_t = re.search('\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}', title).group()
            release_date = var('span').text()
            item['det_url'] = det_url
            item['title'] = title
            item['start_court_t'] = start_court_t
            item['release_date'] = release_date
            p_list.append(item)
        return p_list

    def parse_info(self, rs, item):
        doc = pq(rs)
        con = doc('div.ggnr')
        h2 = con('h2').text()
        h3 = con('h3').text()
        p = con('p').text()
        t1 = con('div.text-01').text()
        t2 = con('div.text-02').text()
        html = h2 + '\r\n' + h3 + '\r\n' + p + '\r\n' + t1 + '\r\n' + t2
        item['html'] = html
        item['court_y'] = h2

    def page_total(self, res):
        try:
            doc = pq(res)
            skip = doc('div.turn_page').children('p').children('a')
            tpage = int(skip.eq(skip.length - 2).text())
            return tpage
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, '解析总页数异常')
            return 0
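
# 说明性示例(非项目代码):演示上面 parse 中翻页url模板的拼接方式。
# href 为假设的"下一页"链接,仅用于说明 pagecur=1 -> pagecur={pageno} 的替换。
def _next_page_url_demo(href):
    nurl = 'http://www.jlsfy.gov.cn' + href.replace('&amp;', '&').replace('pagecur=1', 'pagecur={pageno}')
    return nurl

# 用法示意:
# _next_page_url_demo('/ktgg/index.jhtml?pagecur=1&amp;size=20').format(pageno='3')
# -> 'http://www.jlsfy.gov.cn/ktgg/index.jhtml?pagecur=3&size=20'
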
class Spider(MainSpider): def __init__(self): self.task_id = "hainan" self.site_name = "天涯法律网" MainSpider.__init__(self, task_id=self.task_id) self.http = HttpRequest(self.task_id, self.site_name) self.headers = headers def parse(self): today_date = get_today_date() next_year_today_date = str(int(today_date[0:4]) + 1) + today_date[4:] form = { "currentPageNo": "1", "pageSize": "10", "startDate": today_date, "endDate": next_year_today_date, "caseNo": "", "litigant": "", "judge": "", "caseDesc": "", "siteId": "f7afc746-8577-4cd4-a410-884027df5bab" } url = "http://www.hicourt.gov.cn/frontDesk/getNoticeList" log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, str(form['currentPageNo']))) self.http.http_session(url, "post", data=form, headers=self.headers) # if self.http.res_code() == 200: json_data = self.http.parse_json() object_list = self.parse_list(json_data, form) log.info("开始存储=============={},第{}页".format(self.site_name, str(form['currentPageNo']))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() total_page = self.get_total_page(json_data) for i in range(2, total_page + 1): try: form["currentPageNo"] = i log.info("开始抓取=============={},第{}页".format(self.site_name, i)) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: json_data = self.http.parse_json() object_list = self.parse_list(json_data, form) log.info("开始存储=============={},第{}页".format(self.site_name, str(form['currentPageNo']))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() else: SpiderException("抓取json{},第{}页异常".format(self.site_name, str(form['currentPageNo']) ), self.task_id, url, self.site_name) except Exception: m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break self.mysql_client.session_close() else: SpiderException("抓取json{},第{}页异常".format(self.site_name, str(form['page.pageNo']) ), self.task_id, url, self.site_name) log.info("抓取{}结束".format(self.site_name)) def added_parse(self): pass def parse_list(self, json_data, form): # 解析获取到的json log.info("开始解析{}第{}页".format(self.site_name, (form['currentPageNo']))) t_way = self.task_id + str(time.time()) + '.txt' file_out(t_way, str(json_data)) object_list = list() case_list = json_data["data"] for case in case_list: item = dict() item["taskid"] = self.task_id item["release_date"] = get_content(case.get("createDate")) item["court_y"] = get_content(case.get("belongOrgName")) # 法院 item["court_t"] = get_content(case.get("trialCourt")) # 法庭 item["start_court_t"] = get_content(case.get("courtTime")) # 开庭日期 item["court_num"] = get_content(case.get("caseNo")) # 案号 item["court_case"] = get_content(case.get("caseDesc")) # 案由 item["trial_cause"] = get_content(case.get("judge")).strip() # 审判人员 item["site_name"] = self.site_name # 网站名称 item['bulletin_way'] = t_way b = BulletinCourt(**item) object_list.append(b) return object_list def get_total_page(self, json_data): # 获取总页数 try: total_page = json_data["pages"] return int(total_page) except Exception: m = traceback.format_exc() SpiderException(m, self.task_id, self.site_name, json_data) return 0