def parse(self):
    date = get_today_date()
    form = {
        'isGeneral': 'Y',
        'belongOrgId': '',
        'liveStatus': '001',
        'page.pageSize': '20',
        'page.pageNo': '1',
        'gopenCourtDate': date + ' 00:00:00',
        'page.orderBy': 'openCourtDate',
        'page.order': 'asc',
        'caseType': '',
        'searchWord': ''
    }
    url = "http://www.jxfy.gov.cn/api.do?method=ttrialliveliveinfo!listAjaxp.action"
    log.info("Start crawling ============== 江西庭审公开网")
    log.info("Start crawling ============== 江西庭审公开网, page {}".format(form['page.pageNo']))
    self.http.http_session(url, "post", data=form, headers=self.headers)
    if self.http.res_code() == 200:
        json_data = self.http.parse_json()
        object_list = self.parse_list(json_data, form)
        log.info("Start saving ============== 江西庭审公开网, page {}".format(form['page.pageNo']))
        self.mysql_client.session_insert_list(object_list)
        self.mysql_client.session_commit()
        total_page = self.get_total_page(json_data)
        for i in range(2, total_page + 1):
            try:
                form["page.pageNo"] = i
                log.info("Start crawling ============== 江西庭审公开网, page {}".format(i))
                self.http.http_session(url, "post", data=form, headers=self.headers)
                if self.http.res_code() == 200:
                    json_data = self.http.parse_json()
                    object_list = self.parse_list(json_data, form)
                    log.info("Start saving ============== 江西庭审公开网, page {}".format(form['page.pageNo']))
                    self.mysql_client.session_insert_list(object_list)
                    self.mysql_client.session_commit()
                else:
                    SpiderException(
                        "Error fetching JSON from 江西庭审公开网, page {}".format(form['page.pageNo']),
                        self.task_id, url, self.site_name)
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.task_id, url, self.site_name)
            # Test mode: only the first two pages are crawled; remove this
            # break before going live.
            break
        self.mysql_client.session_close()
    else:
        SpiderException(
            "Error fetching JSON from 江西庭审公开网, page {}".format(form['page.pageNo']),
            self.task_id, url, self.site_name)

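# A minimal sketch (not part of the original code) of how the fetch-parse-store
# loop repeated across the JSON-backed spiders in this module could be factored
# into one helper method on the spider class. `_crawl_json_pages` and `page_key`
# are hypothetical names; the http, mysql_client, parse_list and get_total_page
# members are assumed to behave exactly as in the parse() methods around it.
def _crawl_json_pages(self, url, form, page_key):
    """Fetch page 1, then walk the remaining pages reported by the API."""
    self.http.http_session(url, "post", data=form, headers=self.headers)
    if self.http.res_code() != 200:
        SpiderException("Error fetching JSON, page {}".format(form[page_key]),
                        self.task_id, url, self.site_name)
        return
    json_data = self.http.parse_json()
    self.mysql_client.session_insert_list(self.parse_list(json_data, form))
    self.mysql_client.session_commit()
    for page in range(2, self.get_total_page(json_data) + 1):
        try:
            form[page_key] = page
            self.http.http_session(url, "post", data=form, headers=self.headers)
            if self.http.res_code() == 200:
                json_data = self.http.parse_json()
                self.mysql_client.session_insert_list(self.parse_list(json_data, form))
                self.mysql_client.session_commit()
            else:
                SpiderException("Error fetching JSON, page {}".format(page),
                                self.task_id, url, self.site_name)
        except Exception:
            SpiderException(traceback.format_exc(), self.task_id, url, self.site_name)
    self.mysql_client.session_close()
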
def parse(self):
    url = "https://www.fjcourt.gov.cn/page/public/courtreport.html"
    log.info("Start crawling ============== {}".format(self.site_name))
    log.info("Start crawling ============== {}, page {}".format(self.site_name, 1))
    self.http.http_requst(url, "get", headers=self.headers, verify=False)
    if self.http.res_code() == 200:
        html_data = self.http.parse_html()
        object_list, total_page, VIEWSTATE = self.parse_html(html_data)
        log.info("Start saving ============== {}, page {}".format(self.site_name, 1))
        # Insert the parsed objects into the database and commit.
        self.mysql_client.session_insert_list(object_list)
        self.mysql_client.session_commit()
        for i in range(2, int(total_page) + 1):
            # Rebuild the ASP.NET postback form with the freshest __VIEWSTATE.
            form = {
                "__VIEWSTATE": VIEWSTATE,
                "__VIEWSTATEGENERATOR": "54969BDC",
                "__EVENTTARGET": "ctl00$cplContent$AspNetPager1",
            }
            try:
                form["__EVENTARGUMENT"] = i
                log.info("Start crawling ============== {}, page {}".format(
                    self.site_name, form['__EVENTARGUMENT']))
                self.http.http_session(url, "post", data=form, headers=self.headers)
                if self.http.res_code() == 200:
                    html_data = self.http.parse_html()
                    object_list, total_page, VIEWSTATE = self.parse_html(html_data)
                    log.info("Start saving ============== {}, page {}".format(
                        self.site_name, form['__EVENTARGUMENT']))
                    self.mysql_client.session_insert_list(object_list)
                    self.mysql_client.session_commit()
                else:
                    SpiderException(
                        "Error crawling {}, page {}".format(self.site_name, form['__EVENTARGUMENT']),
                        self.task_id, url, self.site_name)
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.task_id, url, self.site_name)
            # Test mode: only the first two pages are crawled; remove this
            # break before going live.
            break
    else:
        SpiderException("Error crawling {}, page {}".format(self.site_name, 1),
                        self.task_id, url, self.site_name)
    # Close the database connection.
    self.mysql_client.session_close()
    log.info("Finished crawling {}".format(self.site_name))

def parse(self): form = {"channelId": "858", "listsize": "673", "pagego": "1"} url = "http://www.ynfy.gov.cn/ktggPage.jspx" log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, (form['pagego']))) self.http.http_requst(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format( self.site_name, (form['pagego']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() form["listsize"] = total_page for i in range(2, int(total_page) + 1): try: form["pagego"] = i log.info("开始抓取=============={},第{}页".format( self.site_name, (form['pagego']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储=============={},第{}页".format( self.site_name, (form['pagego']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException( "抓取{},第{}页异常".format(self.site_name, (form['pagego'])), self.task_id, url, self.site_name) # except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException( "抓取{},第{}页异常".format(self.site_name, (form['pagego'])), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取{}结束".format(self.site_name))
def parse(self): form = {"ah": "", "page": "1", "fydm": "51", "limit": "9", "nd": ""} url = "http://111.230.134.78:8081/sdgl/app/sdggsd_list" log.info("开始抓取=============={}".format(self.site_name)) log.info("开始抓取=============={},第{}页".format(self.site_name, str(form['page']))) self.http.set_charset("unicode") self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: json_data = self.http.parse_json() object_list = self.parse_list(json_data, form) log.info("开始存储=============={},第{}页".format( self.site_name, str(form['page']))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() total_page = self.get_total_page(json_data) for i in range(2, total_page + 1): try: form["page"] = i log.info("开始抓取=============={},第{}页".format( self.site_name, i)) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: json_data = self.http.parse_json() object_list = self.parse_list(json_data, form) log.info("开始存储=============={},第{}页".format( self.site_name, str(form['page']))) self.mysql_client.session_insert_list(object_list) self.mysql_client.session_commit() else: SpiderException( "抓取json{},第{}页异常".format(self.site_name, str(form['page'])), self.task_id, url, self.site_name) except Exception: m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 # break self.mysql_client.session_close() else: SpiderException( "抓取json{},第{}页异常".format(self.site_name, str(form['page'])), self.task_id, url, self.site_name) log.info("抓取{}结束".format(self.site_name))
def parse(self):
    today_date = get_today_date()
    # Same calendar day one year from today (string arithmetic on 'YYYY-MM-DD').
    next_year_today_date = str(int(today_date[0:4]) + 1) + today_date[4:]
    form = {
        "currentPageNo": "1",
        "pageSize": "10",
        "startDate": today_date,
        "endDate": next_year_today_date,
        "caseNo": "",
        "litigant": "",
        "judge": "",
        "caseDesc": "",
        "siteId": "f7afc746-8577-4cd4-a410-884027df5bab"
    }
    url = "http://www.hicourt.gov.cn/frontDesk/getNoticeList"
    log.info("Start crawling ============== {}".format(self.site_name))
    log.info("Start crawling ============== {}, page {}".format(self.site_name, form['currentPageNo']))
    self.http.http_session(url, "post", data=form, headers=self.headers)
    if self.http.res_code() == 200:
        json_data = self.http.parse_json()
        object_list = self.parse_list(json_data, form)
        log.info("Start saving ============== {}, page {}".format(self.site_name, form['currentPageNo']))
        self.mysql_client.session_insert_list(object_list)
        self.mysql_client.session_commit()
        total_page = self.get_total_page(json_data)
        for i in range(2, total_page + 1):
            try:
                form["currentPageNo"] = i
                log.info("Start crawling ============== {}, page {}".format(self.site_name, i))
                self.http.http_session(url, "post", data=form, headers=self.headers)
                if self.http.res_code() == 200:
                    json_data = self.http.parse_json()
                    object_list = self.parse_list(json_data, form)
                    log.info("Start saving ============== {}, page {}".format(self.site_name, form['currentPageNo']))
                    self.mysql_client.session_insert_list(object_list)
                    self.mysql_client.session_commit()
                else:
                    SpiderException(
                        "Error fetching JSON from {}, page {}".format(self.site_name, form['currentPageNo']),
                        self.task_id, url, self.site_name)
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.task_id, url, self.site_name)
            # Test mode: only the first two pages are crawled; remove this
            # break before going live.
            break
        self.mysql_client.session_close()
    else:
        SpiderException(
            "Error fetching JSON from {}, page {}".format(self.site_name, form['currentPageNo']),
            self.task_id, url, self.site_name)
    log.info("Finished crawling {}".format(self.site_name))

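# The string arithmetic above turns e.g. '2024-02-29' into the invalid date
# '2025-02-29'. A minimal sketch (not from the original code) of a safer
# "same day next year" computation, assuming get_today_date() returns
# 'YYYY-MM-DD'; clamping Feb 29 to Feb 28 is a choice made here for
# illustration, not taken from the original.
import datetime

def next_year_same_day(today_str):
    today = datetime.datetime.strptime(today_str, "%Y-%m-%d").date()
    try:
        return str(today.replace(year=today.year + 1))
    except ValueError:  # Feb 29 has no counterpart in the following year
        return str(today.replace(year=today.year + 1, day=28))

# e.g. next_year_same_day('2024-02-29') -> '2025-02-28'
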
def parse(self): form = {"fyid": "", "page": "1", "kssj": "", "jssj": ""} url = "http://www.guizhoucourt.cn/ktggSearchResult.jspx" log.info("开始抓取==============贵州法院公众服务平台") log.info("开始抓取==============贵州法院公众服务平台,第{}页".format(str(form['page']))) self.http.http_requst(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储==============贵州法院公众服务平台,第{}页".format( str(form['page']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() for i in range(2, int(total_page) + 1): try: form["page"] = i log.info("开始抓取==============贵州法院公众服务平台,第{}页".format( str(form['page']))) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储==============贵州法院公众服务平台,第{}页".format( str(form['page']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException( "抓取贵州法院公众服务平台,第{}页异常".format(str(form['page'])), self.task_id, url, self.site_name) # except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException( "抓取贵州法院公众服务平台,第{}页异常".format(str(form['pageIndex'])), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取贵州法院公众服务平台结束")
def parse(self): form = {"folderNo": "0401", "pageIndex": "1"} url = "http://www.ezfy.hbfy.gov.cn/DocManage/getDocsByFolder" log.info("开始抓取==============湖北省高级人民法院") log.info("开始抓取==============湖北省高级人民法院,第{}页".format( str(form['pageIndex']))) self.http.http_requst(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储==============湖北省高级人民法院,第{}页".format( str(form['pageIndex']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() for i in range(2, int(total_page) + 1): try: form["pageIndex"] = i log.info("开始抓取==============湖北省高级人民法院,第{}页".format(i)) self.http.http_session(url, "post", data=form, headers=self.headers) if self.http.res_code() == 200: html_data = self.http.parse_html() object_list, total_page = self.parse_html(html_data) log.info("开始存储==============湖北省高级人民法院,第{}页".format( str(form['pageIndex']))) # 将对象列表插入数据库 self.mysql_client.session_insert_list(object_list) # 提交 self.mysql_client.session_commit() else: SpiderException("抓取湖北省高级人民法院,第{}页异常".format(i), self.task_id, url, self.site_name) except Exception: # 捕获异常 m = traceback.format_exc() SpiderException(m, self.task_id, url, self.site_name) # 目前为测试状态,只抓取前两页内容,正式上线前将break删掉 break else: SpiderException( "抓取湖北省高级人民法院,第{}页异常".format(str(form['pageIndex'])), self.task_id, url, self.site_name) # 关闭数据库链接 self.mysql_client.session_close() log.info("抓取湖北省高级人民法院结束")
def parse(self):
    page = 1
    url = "http://sxfy.chinacourt.org/article/index/id/M8i2NDBINDAwNCACAAA/page/{}.shtml".format(page)
    log.info("Start crawling ============== {}".format(self.site_name))
    log.info("Start crawling ============== {}, page {}".format(self.site_name, page))
    self.http.http_requst(url, "get", headers=self.headers)
    if self.http.res_code() == 200:
        html_data = self.http.parse_html()
        object_list, total_page = self.parse_html(html_data)
        log.info("Start saving ============== {}, page {}".format(self.site_name, page))
        # Insert the parsed objects into the database and commit.
        self.mysql_client.session_insert_list(object_list)
        self.mysql_client.session_commit()
        for i in range(2, int(total_page) + 1):
            try:
                page = i
                url = "http://sxfy.chinacourt.org/article/index/id/M8i2NDBINDAwNCACAAA/page/{}.shtml".format(page)
                log.info("Start crawling ============== {}, page {}".format(self.site_name, page))
                self.http.http_session(url, "get", headers=self.headers)
                if self.http.res_code() == 200:
                    html_data = self.http.parse_html()
                    object_list, total_page = self.parse_html(html_data)
                    log.info("Start saving ============== {}, page {}".format(self.site_name, page))
                    self.mysql_client.session_insert_list(object_list)
                    self.mysql_client.session_commit()
                else:
                    SpiderException(
                        "Error crawling {}, page {}".format(self.site_name, page),
                        self.task_id, url, self.site_name)
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.task_id, url, self.site_name)
            # Test mode: only the first two pages are crawled; remove this
            # break before going live.
            break
    else:
        SpiderException("Error crawling {}, page {}".format(self.site_name, page),
                        self.task_id, url, self.site_name)
    # Close the database connection.
    self.mysql_client.session_close()
    log.info("Finished crawling {}".format(self.site_name))

def parse(self):
    form = {
        "p": "1",
        "LocationID": "0700000000",
        "sub": ""
    }
    url = "http://qhfy.chinacourt.org/fygg/index.php"
    log.info("Start crawling ============== {}".format(self.site_name))
    log.info("Start crawling ============== {}, page {}".format(self.site_name, form['p']))
    self.http.http_requst(url, "post", data=form, headers=self.headers)
    if self.http.res_code() == 200:
        html_data = self.http.parse_html()
        object_list, total_page = self.parse_html(html_data)
        log.info("Start saving ============== {}, page {}".format(self.site_name, form['p']))
        # Insert the parsed objects into the database and commit.
        self.mysql_client.session_insert_list(object_list)
        self.mysql_client.session_commit()
        for i in range(2, int(total_page) + 1):
            try:
                form["p"] = i
                log.info("Start crawling ============== {}, page {}".format(self.site_name, form['p']))
                self.http.http_session(url, "post", data=form, headers=self.headers)
                if self.http.res_code() == 200:
                    html_data = self.http.parse_html()
                    object_list, total_page = self.parse_html(html_data)
                    log.info("Start saving ============== {}, page {}".format(self.site_name, form['p']))
                    self.mysql_client.session_insert_list(object_list)
                    self.mysql_client.session_commit()
                else:
                    SpiderException(
                        "Error crawling {}, page {}".format(self.site_name, form['p']),
                        self.task_id, url, self.site_name)
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.task_id, url, self.site_name)
            # Test mode: only the first two pages are crawled; remove this
            # break before going live.
            break
    else:
        SpiderException("Error crawling {}, page {}".format(self.site_name, form['p']),
                        self.task_id, url, self.site_name)
    # Close the database connection.
    self.mysql_client.session_close()
    log.info("Finished crawling {}".format(self.site_name))

def parse(self):
    form = {
        "action": "gotoggxxcx",
        "gglx": "ktgg",
        "flag": "first"
    }
    url = "http://www.gdcourts.gov.cn/web/search"
    log.info("Start crawling ============== {}".format(self.site_name))
    log.info("Start crawling ============== {}, page {}".format(self.site_name, form['flag']))
    self.http.http_session(url, "post", data=form, headers=self.headers)
    if self.http.res_code() == 200:
        html_data = self.http.parse_html()
        object_list = self.parse_html(html_data)
        log.info("Start saving ============== {}, page {}".format(self.site_name, form['flag']))
        # Insert the parsed objects into the database and commit.
        self.mysql_client.session_insert_list(object_list)
        self.mysql_client.session_commit()
    else:
        SpiderException("Error crawling {}, page {}".format(self.site_name, form['flag']),
                        self.task_id, url, self.site_name)
    # Close the database connection.
    self.mysql_client.session_close()
    log.info("Finished crawling {}".format(self.site_name))

def parse(self):
    log.info('Start crawling 新疆法院诉讼服务网 page {page}'.format(page='1'))
    self.http.http_session(self.url.format(page='1'), 'get', headers=self.http.headers)
    r = self.http.parse_html()
    log.info('Parsing 新疆法院诉讼服务网 page {page}'.format(page='1'))
    p_list = self.parse_list(r)
    b_list = list()
    for p in p_list:
        d_url = p['det_url']
        log.info('Start crawling 新疆法院诉讼服务网 page {page}, item {strip}'.format(
            page='1', strip=str(p_list.index(p) + 1)))
        self.http.http_session(d_url, 'get', headers=self.http.headers)
        det_mess = self.http.parse_html()
        log.info('Parsing 新疆法院诉讼服务网 page {page}, item {strip}'.format(
            page='1', strip=str(p_list.index(p) + 1)))
        self.parse_info(det_mess, p)
        t_way = self.taskid + str(time.time()) + '.txt'
        file_out(t_way, p['html'])
        p['bulletin_way'] = t_way
        p.pop('det_url')
        p.pop('html')
        p['taskid'] = self.taskid
        b = BulletinCourt(**p)
        b_list.append(b)
    log.info('Saving 新疆法院诉讼服务网 page {page} data'.format(page='1'))
    self.mysql_client.session_insert_list(b_list)
    self.mysql_client.session_commit()
    p_total = self.page_total(r)
    for total in range(2, p_total + 1):
        try:
            log.info('Start crawling 新疆法院诉讼服务网 page {page}'.format(page=str(total)))
            self.http.http_session(self.url.format(page=str(total)), 'get',
                                   headers=self.http.headers)
            r = self.http.parse_html()
            log.info('Parsing 新疆法院诉讼服务网 page {page}'.format(page=str(total)))
            p_list = self.parse_list(r)
            b_list = list()
            for p in p_list:
                d_url = p['det_url']
                log.info('Start crawling 新疆法院诉讼服务网 page {page}, item {strip}'.format(
                    page=str(total), strip=str(p_list.index(p) + 1)))
                self.http.http_session(d_url, 'get', headers=self.http.headers)
                det_mess = self.http.parse_html()
                log.info('Parsing 新疆法院诉讼服务网 page {page}, item {strip}'.format(
                    page=str(total), strip=str(p_list.index(p) + 1)))
                self.parse_info(det_mess, p)
                t_way = self.taskid + str(time.time()) + '.txt'
                file_out(t_way, p['html'])
                p['bulletin_way'] = t_way
                p.pop('det_url')
                p.pop('html')
                p['taskid'] = self.taskid
                b = BulletinCourt(**p)
                b_list.append(b)
            log.info('Saving 新疆法院诉讼服务网 page {page} data'.format(page=str(total)))
            self.mysql_client.session_insert_list(b_list)
            self.mysql_client.session_commit()
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, self.url)
    self.mysql_client.session_close()
    log.info('Finished crawling 新疆法院诉讼服务网')

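# Several spiders in this module repeat the same per-item block: fetch the
# detail page, run parse_info, write the raw html to a .txt file, then build a
# BulletinCourt row. A minimal sketch (not part of the original code) of that
# block as one method; `_build_bulletin` is a hypothetical name, and the http,
# taskid, and parse_info members are assumed to behave as in the parse()
# methods around it.
def _build_bulletin(self, p):
    """Fetch p['det_url'], enrich p in place, and return a BulletinCourt."""
    self.http.http_session(p['det_url'], 'get', headers=self.http.headers)
    self.parse_info(self.http.parse_html(), p)
    t_way = self.taskid + str(time.time()) + '.txt'
    file_out(t_way, p['html'])  # persist the raw detail page
    p['bulletin_way'] = t_way
    p.pop('det_url')
    p.pop('html')
    p['taskid'] = self.taskid
    return BulletinCourt(**p)
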
def parse(self):
    log.info('Start crawling 天津法院网')
    ct = 1
    while ct < 30:
        log.info('Start crawling 天津法院网 page {page}'.format(page=str(ct)))
        self.http.http_session(self.url.format(page=str(ct)), 'get',
                               headers=self.http.headers)
        try:
            r = self.http.parse_html()
            log.info('Parsing 天津法院网 page {page}'.format(page=str(ct)))
            p_list = self.parse_list(r)
            ic = self.is_c(r)
            object_list = list()
            for i in p_list:
                try:
                    log.info('Start crawling 天津法院网 page {page}, item {strip}'.format(
                        page=str(ct), strip=str(p_list.index(i) + 1)))
                    d_url = 'http://tjfy.chinacourt.org' + i['det_url']
                    self.http.http_session(d_url, 'get', headers=self.http.headers)
                    rl = self.http.parse_html()
                    log.info('Parsing 天津法院网 page {page}, item {strip}'.format(
                        page=str(ct), strip=str(p_list.index(i) + 1)))
                    self.parse_info(rl, i)
                    log.info('Writing out 天津法院网 page {page}, item {strip}'.format(
                        page=str(ct), strip=str(p_list.index(i) + 1)))
                    t_way = self.taskid + str(time.time()) + '.txt'
                    file_out(t_way, i['html'])
                    i['bulletin_way'] = t_way
                    i.pop('det_url')
                    i.pop('html')
                    b = BulletinCourt(**i)
                    object_list.append(b)
                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.taskid, self.site_name, self.url)
            log.info('Saving 天津法院网 page {page} data'.format(page=str(ct)))
            self.mysql_client.session_insert_list(object_list)
            self.mysql_client.session_commit()
            # is_c() appears to flag whether another page exists; stop paging
            # when it returns 0.
            if ic == 0:
                break
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, self.url)
        ct += 1
    self.mysql_client.session_close()
    log.info('Finished crawling 天津法院网')

def get_total_page(self, json_data):
    # Return the total number of pages reported by the API response.
    try:
        total_page = json_data["totalPage"]
        return total_page
    except Exception:
        m = traceback.format_exc()
        SpiderException(m, self.task_id, self.site_name, json_data)
        return 0

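# Hypothetical usage of get_total_page, assuming the API response carries a
# top-level "totalPage" field as the method expects (`spider` is a placeholder
# instance name):
#
#   spider.get_total_page({"totalPage": 7, "data": []})   # -> 7
#   spider.get_total_page({})   # reports via SpiderException and returns 0
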
def page_total(self, res):
    # Pull the total page count out of text shaped like '共N页'.
    try:
        k = int(re.search(r'共\d*页', res).group().replace('共', '').replace('页', ''))
        return k
    except Exception:
        m = traceback.format_exc()
        SpiderException(m, self.taskid, self.site_name, 'Error parsing total page count')
        return 0

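# A quick standalone check of the pattern used above (the sample markup is
# made up for illustration):
import re

sample = '<div class="page">第1页/共12页</div>'
m = re.search(r'共\d*页', sample)
assert m and int(m.group().replace('共', '').replace('页', '')) == 12
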
def session_commit(self):
    suc = False
    try:
        self.db_session.commit()
        suc = True
    except Exception:
        # Roll back the failed transaction and report it.
        self.db_session.rollback()
        SpiderException(traceback.format_exc(), self.task_id, self.data_list)
    return suc

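# Hypothetical call site for session_commit: insert first, then branch on the
# returned boolean so a rolled-back batch can be logged or retried later.
# `client` and `rows` are placeholder names.
#
#   client.session_insert_list(rows)
#   if not client.session_commit():
#       log.info("commit rolled back; {} rows not saved".format(len(rows)))
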
def page_total(self, res):
    try:
        doc = pq(res)
        # The pager's second-to-last link appears to hold the last page number.
        skip = doc('div.turn_page').children('p').children('a')
        tpage = int(skip.eq(skip.length - 2).text())
        return tpage
    except Exception:
        m = traceback.format_exc()
        SpiderException(m, self.taskid, self.site_name, 'Error parsing total page count')
        return 0

def page_total(self, res):
    # Derive the page count from '共N条' (N records total, 15 records per page).
    try:
        record_count = int(re.search(r'共\d*条', res).group().replace('共', '').replace('条', ''))
        page_count = math.ceil(record_count / 15)
        return page_count
    except Exception:
        m = traceback.format_exc()
        SpiderException(m, self.taskid, self.site_name, 'Error parsing total page count')
        return 0

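# Standalone check of the ceil division above: at 15 records per page,
# 673 records should span 45 pages (the sample text is made up):
import math
import re

sample = '共673条记录'
total = int(re.search(r'共\d*条', sample).group().replace('共', '').replace('条', ''))
assert math.ceil(total / 15) == 45
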
def page_total(self, res):
    try:
        doc = pq(res)
        jump = doc('div.jump div.skip').children('a')
        # The last pager link carries onclick="turnPage(<last page>)".
        last_link = jump.eq(jump.length - 1)
        k = int(last_link.attr('onclick').replace('turnPage(', '').replace(')', ''))
        return k
    except Exception:
        m = traceback.format_exc()
        SpiderException(m, self.taskid, self.site_name, 'Error parsing total page count')
        return 0

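# Standalone check of the onclick parsing above, with made-up pager markup;
# the last <a> in the skip block is assumed to carry turnPage(<last page>):
from pyquery import PyQuery as pq

sample = '<div><div class="jump"><div class="skip">' \
         '<a onclick="turnPage(2)">2</a>' \
         '<a onclick="turnPage(9)">末页</a></div></div></div>'
jump = pq(sample)('div.jump div.skip').children('a')
last = jump.eq(jump.length - 1)
assert int(last.attr('onclick').replace('turnPage(', '').replace(')', '')) == 9
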
def parse(self):
    log.info('Start crawling 黑龙江法院网 page {page}'.format(page='1'))
    ts = datetime.date.today()
    tm = datetime.date.today() + datetime.timedelta(days=365)
    self.http.http_session(self.url.format(page='1', start=str(ts), end=str(tm)),
                           'get', headers=self.http.headers)
    self.http.set_charset('gb2312')
    r = self.http.parse_html()
    log.info('Parsing 黑龙江法院网 page {page}'.format(page='1'))
    p_list = self.parse_list(r)
    b_list = list()
    for p in p_list:
        try:
            d_url = 'http://www.hljcourt.gov.cn/ktgg/' + p['det_url']
            log.info('Start crawling 黑龙江法院网 page {page}, item {strip}'.format(
                page='1', strip=str(p_list.index(p) + 1)))
            self.http.http_session(d_url, 'get', headers=self.http.headers)
            det_mess = self.http.parse_html()
            log.info('Parsing 黑龙江法院网 page {page}, item {strip}'.format(
                page='1', strip=str(p_list.index(p) + 1)))
            self.parse_info(det_mess, p)
            t_way = self.taskid + str(time.time()) + '.txt'
            file_out(t_way, p['html'])
            p['bulletin_way'] = t_way
            p.pop('det_url')
            p.pop('html')
            p['taskid'] = self.taskid
            b = BulletinCourt(**p)
            b_list.append(b)
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, d_url)
    log.info('Saving 黑龙江法院网 page {page} data'.format(page='1'))
    self.mysql_client.session_insert_list(b_list)
    self.mysql_client.session_commit()
    p_total = self.page_total(r)
    for total in range(2, p_total + 1):
        try:
            log.info('Start crawling 黑龙江法院网 page {page}'.format(page=str(total)))
            self.http.http_session(self.url.format(page=str(total), start=str(ts), end=str(tm)),
                                   'get', headers=self.http.headers)
            r = self.http.parse_html()
            log.info('Parsing 黑龙江法院网 page {page}'.format(page=str(total)))
            p_list = self.parse_list(r)
            b_list = list()
            for p in p_list:
                try:
                    d_url = 'http://www.hljcourt.gov.cn/ktgg/' + p['det_url']
                    log.info('Start crawling 黑龙江法院网 page {page}, item {strip}'.format(
                        page=str(total), strip=str(p_list.index(p) + 1)))
                    self.http.http_session(d_url, 'get', headers=self.http.headers)
                    det_mess = self.http.parse_html()
                    log.info('Parsing 黑龙江法院网 page {page}, item {strip}'.format(
                        page=str(total), strip=str(p_list.index(p) + 1)))
                    self.parse_info(det_mess, p)
                    t_way = self.taskid + str(time.time()) + '.txt'
                    file_out(t_way, p['html'])
                    p['bulletin_way'] = t_way
                    p.pop('det_url')
                    p.pop('html')
                    p['taskid'] = self.taskid
                    b = BulletinCourt(**p)
                    b_list.append(b)
                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.taskid, self.site_name, d_url)
            log.info('Saving 黑龙江法院网 page {page} data'.format(page=str(total)))
            self.mysql_client.session_insert_list(b_list)
            self.mysql_client.session_commit()
        except Exception:
            m = traceback.format_exc()
            SpiderException(
                m, self.taskid, self.site_name,
                self.url.format(end=str(tm), start=str(ts), page=str(total)))
    self.mysql_client.session_close()
    log.info('Finished crawling 黑龙江法院网')

def parse(self):
    log.info('Start crawling 河北法院网 page {page}'.format(page='1'))
    self.http.http_session(self.url.format(page='1'), 'get', headers=self.http.headers)
    r = self.http.parse_html()
    log.info('Parsing 河北法院网 page {page}'.format(page='1'))
    # Build a page-number template from the pager's "next" link.
    doc = pq(r)
    skip = doc('div.turn_page').children('p').children('a.sp_next')
    nurl = 'http://hbgy.hbsfgk.org' + skip.attr('href').replace('pagecur=1', 'pagecur={pageno}')
    p_list = self.parse_list(r)
    b_list = list()
    for p in p_list:
        try:
            d_url = p['det_url']
            log.info('Start crawling 河北法院网 page {page}, item {strip}'.format(
                page='1', strip=str(p_list.index(p) + 1)))
            self.http.http_session(d_url, 'get', headers=self.http.headers)
            det_mess = self.http.parse_html()
            log.info('Parsing 河北法院网 page {page}, item {strip}'.format(
                page='1', strip=str(p_list.index(p) + 1)))
            self.parse_info(det_mess, p)
            t_way = self.taskid + str(time.time()) + '.txt'
            file_out(t_way, p['html'])
            p['bulletin_way'] = t_way
            p.pop('det_url')
            p.pop('html')
            p['taskid'] = self.taskid
            b = BulletinCourt(**p)
            b_list.append(b)
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, d_url)
    log.info('Saving 河北法院网 page {page} data'.format(page='1'))
    self.mysql_client.session_insert_list(b_list)
    self.mysql_client.session_commit()
    p_total = self.page_total(r)
    for total in range(2, p_total + 1):
        try:
            log.info('Start crawling 河北法院网 page {page}'.format(page=str(total)))
            self.http.http_session(nurl.format(pageno=str(total)), 'get',
                                   headers=self.http.headers)
            r = self.http.parse_html()
            log.info('Parsing 河北法院网 page {page}'.format(page=str(total)))
            p_list = self.parse_list(r)
            b_list = list()
            for p in p_list:
                try:
                    d_url = p['det_url']
                    log.info('Start crawling 河北法院网 page {page}, item {strip}'.format(
                        page=str(total), strip=str(p_list.index(p) + 1)))
                    self.http.http_session(d_url, 'get', headers=self.http.headers)
                    det_mess = self.http.parse_html()
                    log.info('Parsing 河北法院网 page {page}, item {strip}'.format(
                        page=str(total), strip=str(p_list.index(p) + 1)))
                    self.parse_info(det_mess, p)
                    t_way = self.taskid + str(time.time()) + '.txt'
                    file_out(t_way, p['html'])
                    p['bulletin_way'] = t_way
                    p.pop('det_url')
                    p.pop('html')
                    p['taskid'] = self.taskid
                    b = BulletinCourt(**p)
                    b_list.append(b)
                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.taskid, self.site_name, d_url)
            log.info('Saving 河北法院网 page {page} data'.format(page=str(total)))
            self.mysql_client.session_insert_list(b_list)
            self.mysql_client.session_commit()
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, self.url)
        # Stop paging once today's date is later than the newest notice date
        # parsed from the current page.
        time0 = get_today_date()
        time1 = self.get_n_t(r)
        strftime0 = datetime.datetime.strptime(time1, "%Y-%m-%d")
        strftime1 = datetime.datetime.strptime(time0, "%Y-%m-%d")
        if strftime1 > strftime0:
            break
    self.mysql_client.session_close()
    log.info('Finished crawling 河北法院网')

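# Standalone check of the stop condition above, with made-up dates: paging
# stops once today's date is later than the newest notice date on the page.
import datetime

def should_stop(today_str, newest_on_page_str):
    today = datetime.datetime.strptime(today_str, '%Y-%m-%d')
    newest = datetime.datetime.strptime(newest_on_page_str, '%Y-%m-%d')
    return today > newest

assert should_stop('2019-05-20', '2019-05-18') is True
assert should_stop('2019-05-20', '2019-06-01') is False
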
def parse(self):
    form = {
        "pageno": "1",
        "pagesize": "10",
        "cbfy": "全部",
        "dsr": "",
        "spz": "",
        "jarq1": "",
        "jarq2": ""
    }
    url = "http://www.zjsfgkw.cn/Notice/NoticeKTSearch"
    log.info("Start crawling ============== {}".format(self.site_name))
    log.info("Start crawling ============== {}, page {}".format(self.site_name, form['pageno']))
    self.http.http_session(url, "post", data=form, headers=self.headers)
    if self.http.res_code() == 200:
        json_data = self.http.parse_json()
        object_list = self.parse_list(json_data, form)
        log.info("Start saving ============== {}, page {}".format(self.site_name, form['pageno']))
        self.mysql_client.session_insert_list(object_list)
        self.mysql_client.session_commit()
        total_page = self.get_total_page(json_data)
        for i in range(2, total_page + 1):
            try:
                form["pageno"] = i
                log.info("Start crawling ============== {}, page {}".format(self.site_name, i))
                self.http.http_session(url, "post", data=form, headers=self.headers)
                if self.http.res_code() == 200:
                    json_data = self.http.parse_json()
                    object_list = self.parse_list(json_data, form)
                    log.info("Start saving ============== {}, page {}".format(self.site_name, form['pageno']))
                    self.mysql_client.session_insert_list(object_list)
                    self.mysql_client.session_commit()
                else:
                    SpiderException(
                        "Error fetching JSON from {}, page {}".format(self.site_name, form['pageno']),
                        self.task_id, url, self.site_name)
            except Exception:
                m = traceback.format_exc()
                SpiderException(m, self.task_id, url, self.site_name)
            # Test mode: only the first two pages are crawled; remove this
            # break before going live.
            break
        self.mysql_client.session_close()
    else:
        SpiderException(
            "Error fetching JSON from {}, page {}".format(self.site_name, form['pageno']),
            self.task_id, url, self.site_name)
    log.info("Finished crawling {}".format(self.site_name))

def parse(self):
    log.info('Start crawling 吉林省高级人民法院司法公开网 page {page}'.format(page='1'))
    self.http.http_session(self.url.format(page='1'), 'get', headers=self.http.headers)
    r = self.http.parse_html()
    log.info('Parsing 吉林省高级人民法院司法公开网 page {page}'.format(page='1'))
    # Build a page-number template from the pager's "next" link; the scraped
    # href is HTML-escaped, so fold '&amp;' back to '&' first.
    doc = pq(r)
    skip = doc('div.turn_page').children('p').children('a.sp_next')
    nurl = 'http://www.jlsfy.gov.cn' + skip.attr('href').replace('&amp;', '&')\
        .replace('pagecur=1', 'pagecur={pageno}')
    p_list = self.parse_list(r)
    b_list = list()
    for p in p_list:
        try:
            d_url = p['det_url']
            log.info('Start crawling 吉林省高级人民法院司法公开网 page {page}, item {strip}'.format(
                page='1', strip=str(p_list.index(p) + 1)))
            self.http.http_session(d_url, 'get', headers=self.http.headers)
            det_mess = self.http.parse_html()
            log.info('Parsing 吉林省高级人民法院司法公开网 page {page}, item {strip}'.format(
                page='1', strip=str(p_list.index(p) + 1)))
            self.parse_info(det_mess, p)
            t_way = self.taskid + str(time.time()) + '.txt'
            file_out(t_way, p['html'])
            p['bulletin_way'] = t_way
            p.pop('det_url')
            p.pop('html')
            p['taskid'] = self.taskid
            b = BulletinCourt(**p)
            b_list.append(b)
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, d_url)
    log.info('Saving 吉林省高级人民法院司法公开网 page {page} data'.format(page='1'))
    self.mysql_client.session_insert_list(b_list)
    self.mysql_client.session_commit()
    p_total = self.page_total(r)
    for total in range(2, p_total + 1):
        try:
            log.info('Start crawling 吉林省高级人民法院司法公开网 page {page}'.format(page=str(total)))
            self.http.http_session(nurl.format(pageno=str(total)), 'get',
                                   headers=self.http.headers)
            r = self.http.parse_html()
            log.info('Parsing 吉林省高级人民法院司法公开网 page {page}'.format(page=str(total)))
            p_list = self.parse_list(r)
            b_list = list()
            for p in p_list:
                try:
                    d_url = p['det_url']
                    log.info('Start crawling 吉林省高级人民法院司法公开网 page {page}, item {strip}'.format(
                        page=str(total), strip=str(p_list.index(p) + 1)))
                    self.http.http_session(d_url, 'get', headers=self.http.headers)
                    det_mess = self.http.parse_html()
                    log.info('Parsing 吉林省高级人民法院司法公开网 page {page}, item {strip}'.format(
                        page=str(total), strip=str(p_list.index(p) + 1)))
                    self.parse_info(det_mess, p)
                    t_way = self.taskid + str(time.time()) + '.txt'
                    file_out(t_way, p['html'])
                    p['bulletin_way'] = t_way
                    p.pop('det_url')
                    p.pop('html')
                    p['taskid'] = self.taskid
                    b = BulletinCourt(**p)
                    b_list.append(b)
                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.taskid, self.site_name, d_url)
            log.info('Saving 吉林省高级人民法院司法公开网 page {page} data'.format(page=str(total)))
            self.mysql_client.session_insert_list(b_list)
            self.mysql_client.session_commit()
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, self.url)
    self.mysql_client.session_close()
    log.info('Finished crawling 吉林省高级人民法院司法公开网')

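# Standalone check of the href normalization above: hrefs scraped from the
# pager arrive HTML-escaped, so '&amp;' must be folded back to '&' before the
# pagecur placeholder is substituted. The sample href is made up.
href = '/ktgg/index.jhtml?pagecur=1&amp;channel=ktgg'
nurl = 'http://www.jlsfy.gov.cn' + href.replace('&amp;', '&').replace('pagecur=1', 'pagecur={pageno}')
assert nurl.format(pageno='3') == 'http://www.jlsfy.gov.cn/ktgg/index.jhtml?pagecur=3&channel=ktgg'
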
def parse(self):
    log.info('Start crawling 重庆法院公共服务网 page {page}'.format(page='1'))
    ts = datetime.date.today()
    tm = datetime.date.today() + datetime.timedelta(days=365)
    self.http.http_session(self.url.format(end=str(tm), start=str(ts), page='1'),
                           'get', headers=self.http.headers)
    r = self.http.parse_html().replace('►', '')
    log.info('Parsing 重庆法院公共服务网 page {page}'.format(page='1'))
    p_list = self.parse_list(r)
    b_list = list()
    for p in p_list:
        try:
            d_url = 'http://www.cqfygzfw.com/court/gg_ggxx.shtml?gg.id=' + p['det_url']
            log.info('Start crawling 重庆法院公共服务网 page {page}, item {strip}'.format(
                page='1', strip=str(p_list.index(p) + 1)))
            self.http.http_session(d_url, 'get', headers=self.http.headers)
            det_mess = self.http.parse_html()
            log.info('Parsing 重庆法院公共服务网 page {page}, item {strip}'.format(
                page='1', strip=str(p_list.index(p) + 1)))
            self.parse_info(det_mess, p)
            t_way = self.taskid + str(time.time()) + '.txt'
            file_out(t_way, p['html'])
            p['bulletin_way'] = t_way
            p.pop('det_url')
            p.pop('html')
            p['taskid'] = self.taskid
            b = BulletinCourt(**p)
            b_list.append(b)
        except Exception:
            m = traceback.format_exc()
            SpiderException(m, self.taskid, self.site_name, d_url)
    log.info('Saving 重庆法院公共服务网 page {page} data'.format(page='1'))
    self.mysql_client.session_insert_list(b_list)
    self.mysql_client.session_commit()
    p_total = self.page_total(r)
    for total in range(2, p_total + 1):
        try:
            log.info('Start crawling 重庆法院公共服务网 page {page}'.format(page=str(total)))
            self.http.http_session(self.url.format(end=str(tm), start=str(ts), page=str(total)),
                                   'get', headers=self.http.headers)
            r = self.http.parse_html().replace('►', '')
            log.info('Parsing 重庆法院公共服务网 page {page}'.format(page=str(total)))
            p_list = self.parse_list(r)
            b_list = list()
            for p in p_list:
                try:
                    d_url = 'http://www.cqfygzfw.com/court/gg_ggxx.shtml?gg.id=' + p['det_url']
                    log.info('Start crawling 重庆法院公共服务网 page {page}, item {strip}'.format(
                        page=str(total), strip=str(p_list.index(p) + 1)))
                    self.http.http_session(d_url, 'get', headers=self.http.headers)
                    det_mess = self.http.parse_html()
                    log.info('Parsing 重庆法院公共服务网 page {page}, item {strip}'.format(
                        page=str(total), strip=str(p_list.index(p) + 1)))
                    self.parse_info(det_mess, p)
                    t_way = self.taskid + str(time.time()) + '.txt'
                    file_out(t_way, p['html'])
                    p['bulletin_way'] = t_way
                    p.pop('det_url')
                    p.pop('html')
                    p['taskid'] = self.taskid
                    b = BulletinCourt(**p)
                    b_list.append(b)
                except Exception:
                    m = traceback.format_exc()
                    SpiderException(m, self.taskid, self.site_name, d_url)
            log.info('Saving 重庆法院公共服务网 page {page} data'.format(page=str(total)))
            self.mysql_client.session_insert_list(b_list)
            self.mysql_client.session_commit()
        except Exception:
            m = traceback.format_exc()
            SpiderException(
                m, self.taskid, self.site_name,
                self.url.format(end=str(tm), start=str(ts), page=str(total)))
    self.mysql_client.session_close()
    log.info('Finished crawling 重庆法院公共服务网')
