class Spider(MainSpider):
    """Spider for 贵州法院公众服务平台 (Guizhou court public-service platform).

    Crawls the paginated hearing-announcement search results, fetches each
    detail page, and persists parsed BulletinCourt rows via mysql_client.
    """

    def __init__(self):
        self.task_id = "guizhou"
        self.site_name = "贵州法院公众服务平台"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl page 1, then pages 2..total_page, storing results per page."""
        form = {"fyid": "", "page": "1", "kssj": "", "jssj": ""}
        url = "http://www.guizhoucourt.cn/ktggSearchResult.jspx"
        log.info("开始抓取==============贵州法院公众服务平台")
        log.info("开始抓取==============贵州法院公众服务平台,第{}页".format(str(form['page'])))
        self.http.http_requst(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储==============贵州法院公众服务平台,第{}页".format(
                str(form['page'])))
            # Insert the parsed objects and commit.
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            for i in range(2, int(total_page) + 1):
                try:
                    form["page"] = i
                    log.info("开始抓取==============贵州法院公众服务平台,第{}页".format(
                        str(form['page'])))
                    self.http.http_session(url, "post", data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储==============贵州法院公众服务平台,第{}页".format(
                            str(form['page'])))
                        # Insert the parsed objects and commit.
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取贵州法院公众服务平台,第{}页异常".format(str(form['page'])),
                            self.task_id, url, self.site_name)
                except Exception:
                    # Record the traceback instead of aborting the whole crawl.
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first two pages are crawled.
                # Remove this break before going to production.
                break
        else:
            # BUG FIX: this form dict has no 'pageIndex' key (that belongs to
            # the hubei spider); form['pageIndex'] raised KeyError here.
            SpiderException(
                "抓取贵州法院公众服务平台,第{}页异常".format(str(form['page'])),
                self.task_id, url, self.site_name)
        # Close the database session.
        self.mysql_client.session_close()
        log.info("抓取贵州法院公众服务平台结束")

    def added_parse(self):
        pass

    def parse_html(self, html):
        """Parse a list page, fetch each detail page, build BulletinCourt rows.

        Returns (object_list, total_page) where total_page is a digit string.
        """
        doc = pq(html)
        # Total page count comes from the onclick of the "末页" (last page) link.
        total_page = "1"  # fallback so the return below cannot raise NameError
        for page in doc('a').items():
            if page.text() == "末页":
                total_page = "".join(re.findall(r"\d{1,3}", page.attr.onclick))
        lis = doc('table.tabData a').items()
        object_list = list()
        for x in lis:
            self.http.http_session(x.attr.href, "get", headers=self.headers)
            htm = self.http.parse_html()
            # Dump the raw detail page to a text file and keep its path.
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(htm))
            doc = pq(htm)
            content = doc('div.print-box')
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall(r"发表日期:20\d{2}-\d{1,2}-\d{1,2}",
                           content.text())).replace("发表日期:", "")
            item["title"] = x.attr.title
            item["bulletin_way"] = t_way
            item["court_y"] = content('h3').text()
            item["court_t"] = "".join(
                re.findall("(在.*依法)",
                           content('p').text())).replace("在", "").replace("依法", "")
            item["start_court_t"] = "".join(
                re.findall(r"(\d{4}年\d{2}月\d{2}日\s\d{2}时\d{2})",
                           content('p').text())).replace("年", "-").replace(
                "月", "-").replace("日", "").replace("时", ":")
            item["court_num"] = "".join(
                re.findall("(审理.*案件)",
                           content('p').text())).replace("审理", "").replace("案件", "")
            item["court_part"] = "".join(
                re.findall("(在.*依法)",
                           content('p').text())).replace("在", "").replace("依法", "")
            item["site_name"] = self.site_name
            # Map the item dict onto the ORM object.
            object_list.append(BulletinCourt(**item))
        # Return the parsed objects and the total page count.
        return object_list, total_page
class Spider(MainSpider):
    """Spider for 陕西法院网 (Shaanxi court network) hearing announcements.

    Crawls the paginated article index, fetches each detail page, and
    persists parsed BulletinCourt rows via mysql_client.
    """

    def __init__(self):
        self.task_id = "sanxi"
        self.site_name = "陕西法院网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl page 1, then pages 2..total_page, storing results per page."""
        page = 1
        url = "http://sxfy.chinacourt.org/article/index/id/M8i2NDBINDAwNCACAAA/page/{}.shtml".format(
            page)
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name, page))
        self.http.http_requst(url, "get", headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(self.site_name, page))
            # Insert the parsed objects and commit.
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            for i in range(2, int(total_page) + 1):
                try:
                    page = i
                    url = "http://sxfy.chinacourt.org/article/index/id/M8i2NDBINDAwNCACAAA/page/{}.shtml".format(
                        page)
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, page))
                    self.http.http_session(url, "get", headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, page))
                        # Insert the parsed objects and commit.
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取{},第{}页异常".format(self.site_name, page),
                            self.task_id, url, self.site_name)
                except Exception:
                    # Record the traceback instead of aborting the whole crawl.
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first two pages are crawled.
                # Remove this break before going to production.
                break
        else:
            SpiderException("抓取{},第{}页异常".format(self.site_name, page),
                            self.task_id, url, self.site_name)
        # Close the database session.
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        pass

    def parse_html(self, html):
        """Parse a list page, fetch each detail page, build BulletinCourt rows.

        Returns (object_list, total_page) where total_page is a digit string.
        """
        doc = pq(html)
        # The 6th pager link holds the last-page URL; its digits give the
        # total page count.  Regexes use raw strings: "\d" in a plain string
        # is a deprecated escape sequence in modern Python.
        page = doc('div.paginationControl a').eq(5).attr.href
        total_page = "".join(re.findall(r"\d{1,3}", page))
        lis = doc('span.left').items()
        object_list = list()
        for x in lis:
            self.http.http_session(
                "http://sxfy.chinacourt.org" + x('a').attr.href,
                "get", headers=self.headers)
            htm = self.http.parse_html()
            doc = pq(htm)
            content = doc('div.detail')
            # Dump the detail content to a text file and keep its path.
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(content))
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall(r"\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}",
                           content('div.sth_a').text()))
            item["title"] = content('div.b_title').text()
            item["bulletin_way"] = t_way
            item["court_y"] = "陕西省高级人民法院"
            item["court_t"] = "".join(
                re.findall("(在.{1,10}公开)",
                           content('div').text())).replace("在", "").replace(
                "公开", "")
            item["court_part"] = "".join(
                re.findall("(在.{1,10}公开)",
                           content('div').text())).replace("在", "").replace(
                "公开", "")
            item["site_name"] = self.site_name
            # Map the item dict onto the ORM object.
            object_list.append(BulletinCourt(**item))
        # Return the parsed objects and the total page count.
        return object_list, total_page
class Spider(MainSpider):
    """Spider for 青海法院网 (Qinghai court network) announcements.

    The site serves GBK-encoded pages; only list rows containing "开庭"
    (hearing) are followed, and only items dated after 2018-01-01 are stored.
    """

    def __init__(self):
        self.task_id = "qinghai"
        self.site_name = "青海法院网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers
        # The site responds in GBK, not UTF-8.
        self.http.set_charset("gbk")

    def parse(self):
        """Crawl page 1, then pages 2..total_page, storing results per page."""
        form = {
            "p": "1",
            "LocationID": "0700000000",
            "sub": ""
        }
        url = "http://qhfy.chinacourt.org/fygg/index.php"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name, (form['p'])))
        self.http.http_requst(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(self.site_name,
                                                       (form['p'])))
            # Insert the parsed objects and commit.
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            for i in range(2, int(total_page) + 1):
                try:
                    form["p"] = i
                    log.info("开始抓取=============={},第{}页".format(self.site_name,
                                                               (form['p'])))
                    self.http.http_session(url, "post", data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, (form['p'])))
                        # Insert the parsed objects and commit.
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException("抓取{},第{}页异常".format(self.site_name, (
                            form['p'])), self.task_id, url, self.site_name)
                except Exception:
                    # Record the traceback instead of aborting the whole crawl.
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first two pages are crawled.
                # Remove this break before going to production.
                break
        else:
            SpiderException("抓取{},第{}页异常".format(self.site_name, (form['p'])),
                            self.task_id, url, self.site_name)
        # Close the database session.
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        pass

    def parse_html(self, html):
        """Parse a list page, fetch hearing detail pages, build BulletinCourt rows.

        Returns (object_list, total_page) where total_page is a digit string.
        """
        doc = pq(html)
        # "共 N 页" in the pager bar gives the total page count.
        total_page = "".join(re.findall(r"共\s.*\s页",
                                        doc("td.td_pagebar").text())).replace(
            "共", "").replace("页", "").strip()
        lis = doc('td.td_line').items()
        object_list = list()
        for x in lis:
            # Only follow rows that are hearing ("开庭") announcements.
            if "开庭" in x.text():
                self.http.http_session(
                    "http://qhfy.chinacourt.org" + x('a').attr.href,
                    "get", headers=self.headers)
                htm = self.http.parse_html()
                doc = pq(htm)
                content = doc
                item = dict()
                item["taskid"] = self.task_id
                item["release_date"] = "".join(
                    re.findall(r"\d{4}-\d{2}-\d{2}", content("p").text()))
                item["title"] = x.text()
                t_way = self.task_id + str(time.time()) + '.txt'
                item["bulletin_way"] = t_way
                item["court_y"] = "".join(
                    re.findall(".{2,10}人民法院",
                               content('span.detail_content').text()))
                item["court_t"] = "".join(
                    re.findall("(在.{2,10}公开)",
                               content('span.detail_content').text())
                ).replace("在", "").replace("公开", "").replace("依法", "")
                item["court_part"] = "".join(
                    re.findall("(在.{2,10}公开)",
                               content('span.detail_content').text())
                ).replace("在", "").replace("公开", "").replace("依法", "")
                item["site_name"] = self.site_name
                # BUG FIX: this filter used eval() on scraped text, which is
                # unsafe and raises SyntaxError when no date was matched.
                # Compare as integers and skip items without a parsable date.
                release = item["release_date"].replace("-", "")
                if release.isdigit() and int(release) > 20180101:
                    # Dump the raw detail page only for items we keep.
                    file_out(t_way, str(htm))
                    # Map the item dict onto the ORM object.
                    object_list.append(BulletinCourt(**item))
        # Return the parsed objects and the total page count.
        return object_list, total_page
class Spider(MainSpider):
    """Spider for 云南法院司法信息网 (Yunnan court judicial information site).

    Crawls the paginated hearing-announcement list, fetches each detail
    page, and persists parsed BulletinCourt rows via mysql_client.
    """

    def __init__(self):
        self.task_id = "yunan"
        self.site_name = "云南法院司法信息网"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl page 1, then pages 2..total_page, storing results per page."""
        form = {"channelId": "858", "listsize": "673", "pagego": "1"}
        url = "http://www.ynfy.gov.cn/ktggPage.jspx"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name,
                                                   (form['pagego'])))
        self.http.http_requst(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(
                self.site_name, (form['pagego'])))
            # Insert the parsed objects and commit.
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            # Keep the hidden "listsize" field in sync with the real count.
            form["listsize"] = total_page
            for i in range(2, int(total_page) + 1):
                try:
                    form["pagego"] = i
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, (form['pagego'])))
                    self.http.http_session(url, "post", data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, (form['pagego'])))
                        # Insert the parsed objects and commit.
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取{},第{}页异常".format(self.site_name,
                                                 (form['pagego'])),
                            self.task_id, url, self.site_name)
                except Exception:
                    # Record the traceback instead of aborting the whole crawl.
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first two pages are crawled.
                # Remove this break before going to production.
                break
        else:
            SpiderException(
                "抓取{},第{}页异常".format(self.site_name, (form['pagego'])),
                self.task_id, url, self.site_name)
        # Close the database session.
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        pass

    def parse_html(self, html):
        """Parse a list page, fetch each detail page, build BulletinCourt rows.

        Returns (object_list, total_page) where total_page is an int.
        """
        doc = pq(html)
        # Pager links carry page numbers; take the largest one.
        total_page = 10  # fallback when no numeric pager link is found
        for page in doc('div.turn_page a.zt_02').items():
            if int(page.text()) > total_page:
                total_page = int(page.text())
        lis = doc('ul.sswy_news li').items()
        object_list = list()
        for x in lis:
            self.http.http_session(x('a').attr.href, "get",
                                   headers=self.headers)
            htm = self.http.parse_html()
            # Dump the raw detail page to a text file and keep its path.
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(htm))
            doc = pq(htm)
            content = doc('div.ywzw_con_inner')
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall(r"\d{4}-\d{2}-\d{2}",
                           content('p.p_source ').text()))
            item["title"] = x('a').attr.title
            item["bulletin_way"] = t_way
            item["court_y"] = content('h3.h3_title').text()
            item["court_t"] = "".join(
                re.findall("(在.*依法)",
                           content('p').text())).replace("在", "").replace("依法", "")
            item["start_court_t"] = "".join(
                re.findall(r"\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}",
                           x('a').attr.title))
            item["court_part"] = "".join(
                re.findall("(在.*依法)",
                           content('p').text())).replace("在", "").replace("依法", "")
            item["site_name"] = self.site_name
            # Map the item dict onto the ORM object.
            object_list.append(BulletinCourt(**item))
        # Return the parsed objects and the total page count.
        return object_list, total_page
class Spider(MainSpider):
    """Spider for 湖北省高级人民法院 (Hubei High People's Court) hearing notices.

    The list endpoint returns all needed fields inline, so no per-item
    detail requests are made; the whole list page is dumped to one file.
    """

    def __init__(self):
        self.task_id = "hubei"
        self.site_name = "湖北省高级人民法院"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl page 1, then pages 2..total_page, storing results per page."""
        form = {"folderNo": "0401", "pageIndex": "1"}
        url = "http://www.ezfy.hbfy.gov.cn/DocManage/getDocsByFolder"
        log.info("开始抓取==============湖北省高级人民法院")
        log.info("开始抓取==============湖北省高级人民法院,第{}页".format(
            str(form['pageIndex'])))
        self.http.http_requst(url, "post", data=form, headers=self.headers)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page = self.parse_html(html_data)
            log.info("开始存储==============湖北省高级人民法院,第{}页".format(
                str(form['pageIndex'])))
            # Insert the parsed objects and commit.
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            for i in range(2, int(total_page) + 1):
                try:
                    form["pageIndex"] = i
                    log.info("开始抓取==============湖北省高级人民法院,第{}页".format(i))
                    self.http.http_session(url, "post", data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page = self.parse_html(html_data)
                        log.info("开始存储==============湖北省高级人民法院,第{}页".format(
                            str(form['pageIndex'])))
                        # Insert the parsed objects and commit.
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException("抓取湖北省高级人民法院,第{}页异常".format(i),
                                        self.task_id, url, self.site_name)
                except Exception:
                    # Record the traceback instead of aborting the whole crawl.
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first two pages are crawled.
                # Remove this break before going to production.
                break
        else:
            SpiderException(
                "抓取湖北省高级人民法院,第{}页异常".format(str(form['pageIndex'])),
                self.task_id, url, self.site_name)
        # Close the database session.
        self.mysql_client.session_close()
        log.info("抓取湖北省高级人民法院结束")

    def added_parse(self):
        pass

    def parse_html(self, html):
        """Parse one list page into BulletinCourt rows.

        Returns (object_list, total_page) where total_page is a digit string.
        """
        # Dump the whole list page once; all rows share this file path.
        t_way = self.task_id + str(time.time()) + '.txt'
        file_out(t_way, str(html))
        doc = pq(html)
        # BUG FIX: the page count was extracted with "共.*页\s上" plus a [1:3]
        # slice, which breaks for 1-digit counts (keeps "页") and truncates
        # 3+-digit counts.  Capture the digits directly, with a safe fallback.
        pages = re.findall(r"共\s*(\d{1,3})\s*页\s*上",
                           doc('span').text().replace("\n", ""))
        total_page = pages[0] if pages else "1"
        lis = doc('table.newlisttable tr').items()
        object_list = list()
        for content in lis:
            item = dict()
            item["taskid"] = self.task_id
            item["release_date"] = "".join(
                re.findall(r"(\(.*\))", content('td').text()))[1:-1]
            item["title"] = content('a').text()
            item["bulletin_way"] = t_way
            # Rows starting with "本院定于" belong to the high court itself;
            # otherwise the first 4 chars name the issuing court.
            item["court_y"] = "湖北省高级人民法院" if content(
                'p').text()[:4] == "本院定于" else content('p').text()[:4]
            item["court_t"] = "".join(
                re.findall("(在.*判庭)", content('p').text())).replace("在", "")
            item["start_court_t"] = "".join(
                re.findall(r"(\d{4}年\d{2}月\d{2}日\s\d{2}:\d{2})",
                           content('p').text())).replace("年", "-").replace(
                "月", "-").replace("日", "")
            item["plaintiff"] = "".join(
                re.findall("(原告:.*;)", content('p').text())).replace("原告:", "")
            item["defendant"] = "".join(
                re.findall("(被告:.*的)", content('p').text())).replace(
                "被告:", "").replace("的", "")
            item["site_name"] = self.site_name
            # Map the item dict onto the ORM object.
            object_list.append(BulletinCourt(**item))
        # Return the parsed objects and the total page count.
        return object_list, total_page
class Spider(MainSpider):
    """Spider for 福建省高级人民法院法院公告 (Fujian High People's Court notices).

    The site is an ASP.NET WebForms app: paging works by POSTing back the
    __VIEWSTATE token extracted from the previous response.
    """

    def __init__(self):
        self.task_id = "fujian"
        self.site_name = "福建省高级人民法院法院公告"
        MainSpider.__init__(self, task_id=self.task_id)
        self.http = HttpRequest(self.task_id, self.site_name)
        self.headers = headers

    def parse(self):
        """Crawl page 1, then pages 2..total_page via WebForms postbacks."""
        url = "https://www.fjcourt.gov.cn/page/public/courtreport.html"
        log.info("开始抓取=============={}".format(self.site_name))
        log.info("开始抓取=============={},第{}页".format(self.site_name, 1))
        # verify=False: the site's TLS certificate does not validate.
        self.http.http_requst(url, "get", headers=self.headers, verify=False)
        if self.http.res_code() == 200:
            html_data = self.http.parse_html()
            object_list, total_page, VIEWSTATE = self.parse_html(html_data)
            log.info("开始存储=============={},第{}页".format(self.site_name, 1))
            # Insert the parsed objects and commit.
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            for i in range(2, int(total_page) + 1):
                # Rebuild the postback form each page with the latest VIEWSTATE.
                form = {
                    "__VIEWSTATE": VIEWSTATE,
                    "__VIEWSTATEGENERATOR": "54969BDC",
                    "__EVENTTARGET": "ctl00$cplContent$AspNetPager1",
                }
                try:
                    form["__EVENTARGUMENT"] = i
                    log.info("开始抓取=============={},第{}页".format(
                        self.site_name, (form['__EVENTARGUMENT'])))
                    self.http.http_session(url, "post", data=form,
                                           headers=self.headers)
                    if self.http.res_code() == 200:
                        html_data = self.http.parse_html()
                        object_list, total_page, VIEWSTATE = self.parse_html(
                            html_data)
                        log.info("开始存储=============={},第{}页".format(
                            self.site_name, (form['__EVENTARGUMENT'])))
                        # Insert the parsed objects and commit.
                        self.mysql_client.session_insert_list(object_list)
                        self.mysql_client.session_commit()
                    else:
                        SpiderException(
                            "抓取{},第{}页异常".format(self.site_name,
                                                 (form['__EVENTARGUMENT'])),
                            self.task_id, url, self.site_name)
                except Exception:
                    # Record the traceback instead of aborting the whole crawl.
                    m = traceback.format_exc()
                    SpiderException(m, self.task_id, url, self.site_name)
                # Test mode: only the first two pages are crawled.
                # Remove this break before going to production.
                break
        else:
            SpiderException("抓取{},第{}页异常".format(self.site_name, 1),
                            self.task_id, url, self.site_name)
        # Close the database session.
        self.mysql_client.session_close()
        log.info("抓取{}结束".format(self.site_name))

    def added_parse(self):
        pass

    def parse_html(self, html):
        """Parse a list page, fetch each detail page, build BulletinCourt rows.

        Returns (object_list, total_page, VIEWSTATE); VIEWSTATE is the hidden
        ASP.NET state token needed for the next page's postback.
        """
        doc = pq(html)
        total_page = 10  # fallback when the ">>" pager link is missing
        for page in doc('a.pagination').items():
            if page.text() == ">>":
                total_page = int("".join(re.findall(r"\d{2,3}",
                                                    page.attr.href)))
        VIEWSTATE = doc("div.aspNetHidden input").attr.value
        lis = doc('ul.module-case-items li').items()
        object_list = list()
        for x in lis:
            self.http.http_session(
                "https://www.fjcourt.gov.cn" + x('a').attr.href,
                "get", headers=self.headers, verify=False)
            htm = self.http.parse_html()
            doc = pq(htm)
            # Dump the raw detail page to a text file and keep its path.
            t_way = self.task_id + str(time.time()) + '.txt'
            file_out(t_way, str(htm))
            content = doc('div.article-wrap')
            item = dict()
            item["taskid"] = self.task_id
            item["title"] = content('p.article-hd-title').text()
            item["bulletin_way"] = t_way
            item["court_y"] = content('span.article-author').text()
            item["court_t"] = "".join(
                re.findall("(在.*公开)",
                           content('div.article-content').text())).replace(
                "在", "").replace("公开", "")
            item["start_court_t"] = x('span.cir-time').text().replace(
                "[", "").replace("]", "")
            item["court_part"] = "".join(
                re.findall("(在.*公开)",
                           content('div.article-content').text())).replace(
                "在", "").replace("公开", "")
            item["site_name"] = self.site_name
            # BUG FIX: this filter used eval() on scraped text and on
            # get_today_date(), which is unsafe and crashes (SyntaxError)
            # when the scraped date is empty.  Compare integers instead and
            # skip items without a parsable date.
            pub_time = (item["start_court_t"].replace("-", ""))
            date = get_today_date()
            if pub_time.isdigit() and int(pub_time) > int(date):
                # Map the item dict onto the ORM object.
                object_list.append(BulletinCourt(**item))
        # Return the parsed objects, total page count, and VIEWSTATE token.
        return object_list, total_page, VIEWSTATE