# -*- coding: utf-8 -*-
# Shared imports assumed by the spider modules collected below; the StatuteData import
# path is a guess, since the helper module itself is not part of this excerpt.
import json
import os
import re
import uuid
from datetime import datetime

import scrapy

from statute_data import StatuteData  # assumed module name for the DB helper


class GaoDunSpider(scrapy.spiders.Spider):
    name = "gaodun_fagui"
    start_urls = [
        "https://fagui.gaodun.com/",
    ]
    statuteData = StatuteData()

    # spider entry point
    def parse(self, response):
        return self.parse_article_count(response)

    # enumerate the list pages
    def parse_article_count(self, response):
        list_url = 'https://fagui.gaodun.com/index.php/Search/index/t/1/p/{0}.html'
        for page in range(1, 12926):
            yield scrapy.Request(list_url.format(str(page)), callback=self.parse_article_list)

    # parse the article list on one page
    def parse_article_list(self, response):
        lis = response.xpath('//div[@class="mesgebox"]/ul/li')
        for li in lis:
            article_url = 'https://fagui.gaodun.com' + li.xpath(
                "div[@class='cb randwen randwen_2']/p/a/@href").extract_first()
            src = ''.join(li.css('.yxdbox img::attr(src)').extract())
            # the yxN.png badge encodes the validity status; default is "currently in force"
            level = u'现行有效'
            if src != '':
                match = re.match(r'.*/yx(?P<flag>\d+)[.]png', src)
                flag = int(match.group('flag')) if match else 1
                if flag == 2:
                    level = u'已失效'
                elif flag == 3:
                    level = u'尚未生效'
                elif flag == 4:
                    level = u'已被修正'
            yield scrapy.Request(article_url, callback=self.parse_article, meta={'level': level})

    # parse the article body
    def parse_article(self, response):
        item = {}
        item["url"] = response.url
        top = response.xpath('//div[@class="topfond tac"]')
        item['title'] = ''.join(top.xpath('h1/text()').extract())
        item['pubish_org'] = ''.join(top.xpath('h3/text()').extract())
        item['anNo'] = ''.join(top.xpath('p/text()').extract())
        item['pubish_time'] = ''.join(response.xpath(
            u'//div[@class="towaltext"]/span[contains(text(),"发文时间:")]/text()'
        ).extract()).replace(u'发文时间:', '').replace(' ', '')
        item['effect_time'] = ''.join(response.xpath(
            u'//div[@class="towaltext"]/span[contains(text(),"生效时间:")]/text()'
        ).extract()).replace(u'生效时间:', '').replace(' ', '')
        item['level'] = u'行业团体规定'
        item['time_liness'] = response.meta['level']
        # whether to import into the production database
        item['export'] = '0'
        item['source'] = u"高顿"
        content = "".join(response.xpath('//div[@id="para"]').extract()).replace('\r\n', '')
        # strip links, styling attributes and remaining tags from the body
        content = re.sub(
            r'(<a.*?>.*?</a>)|((class|style|color|href|target|align)="[^"]*?")|(<.*?>)|(<[/].*?>)',
            '', content)
        item['content'] = content.replace("\r", '').replace("\n", '').replace(
            "\t", '').replace(' ', '').replace(u'附件下载:', '')
        uid = str(uuid.uuid1()).replace('-', '')
        if item['content'] != '' and item['title'] != '':
            self.statuteData.insert_statute(
                (uid, item['time_liness'], item['effect_time'], item['level'],
                 item['pubish_time'], item['title'], item['anNo'], item['source'],
                 item['pubish_org'], item["content"], 0))
        del item['content']
        print item

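# The spiders in this file all funnel records through a StatuteData helper whose source
# is not included here. The sketch below only illustrates the assumed shape of its
# insert_statute() call, using sqlite3 for concreteness; the real class, table name and
# connection details may differ. The 11-field record order matches the tuples built by
# the spiders: Id, Time_liness, Effect_time, Level, Pubish_time, Title, AnNo, Source,
# Pubish_org, Content, IsBuild (the FaGuiWriterPipeline below passes a wider 17-field
# tuple that also carries the province/city columns).
import sqlite3


class StatuteDataSketch(object):
    """Illustrative stand-in for the real StatuteData helper (assumption, not repo code)."""

    def __init__(self, db_path='statute.db'):
        self.conn = sqlite3.connect(db_path)

    def insert_statute(self, record):
        # record: 11-tuple in the field order documented above
        self.conn.execute(
            'INSERT INTO statute (id, time_liness, effect_time, level, pubish_time, '
            'title, an_no, source, pubish_org, content, is_build) '
            'VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', record)
        self.conn.commit()
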
class FaGuiWriterPipeline(object):
    """Item pipeline that writes the local-government items (with province/city fields)
    into the statute store."""

    def __init__(self):
        self.statuteData = StatuteData()

    def process_item(self, item, spider):
        line = dict(item)
        if 'title' in line:
            self.statuteData.insert_statute(
                (item['Id'], item['time_liness'], item['effect_time'], item['level'],
                 item['pubish_time'], item['title'], item['anNo'], item['source'],
                 item['pubish_org'], item["content"], item['provinceName'],
                 item['cityName'], item['provinceCode'], item['cityCode'],
                 item['sIndex'], item['sTypeName'], 0))
        return item

    def close_spider(self, spider):
        pass

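# Sketch of how the pipeline above would be registered in the Scrapy project settings
# (settings.py). ITEM_PIPELINES is standard Scrapy; the dotted path "fagui.pipelines"
# is an assumption about this project's package layout, adjust to the real package name.
ITEM_PIPELINES = {
    'fagui.pipelines.FaGuiWriterPipeline': 300,
}
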
class BYNRSpider(scrapy.Spider):
    # Bayannur Municipal People's Government (巴彦淖尔市人民政府)
    name = "bynr"
    statuteData = StatuteData()
    provinceName = u"内蒙古"
    cityName = u"巴彦淖尔市"
    provinceCode = '15'
    cityCode = '1502'
    level = u"地方法规"
    pubish_time = '2019-04-24'
    allowed_domains = ["www.bynr.gov.cn"]
    start_urls = ['http://www.bynr.gov.cn/xxgk/zwgkml/']
    page_domain = "http://www.bynr.gov.cn/xxgk/zwgkml/%s"

    def parse(self, response):
        send_requests = []
        pageSize = 19
        for item in response.css('.cont_left_zwgk_cont_foot ul li'):
            url = ''.join(item.css("a::attr(href)").extract())
            url = url.replace('../', '').replace('./', '')
            type = ''.join(item.css("a::text").extract()).replace(' ', '')
            dcount = ''.join(item.css("::text").extract()).replace(' ', '')
            dcount = re.findall(r"\d+", dcount)[0]
            # ceiling division: number of list pages in this category
            page = (int(dcount) + pageSize - 1) / pageSize
            for index in range(1, page + 1):
                if index == 1:
                    send_requests.append(scrapy.Request(
                        self.page_domain % url, callback=self.parse_list, method='get',
                        errback=self.handle_error, meta={'sublevel': url}))
                else:
                    p = index - 1
                    newurl = ''.join((url, 'index_%d.html' % p))
                    send_requests.append(scrapy.Request(
                        self.page_domain % newurl, callback=self.parse_list, method='get',
                        errback=self.handle_error, meta={'sublevel': url}))
        return send_requests

    def parse_list(self, response):
        sublevel = response.meta['sublevel']
        for item in response.css(".cont_right_cont a"):
            detail_url = ''.join((sublevel,
                                  ''.join(item.css("::attr(href)").extract()).replace('./', '')))
            yield scrapy.Request(self.page_domain % detail_url, callback=self.parse_detail,
                                 method='get', errback=self.handle_error)

    def parse_detail(self, response):
        item = {}
        title = ''.join(response.css('.zwgk_hei18::text').extract_first() or '')
        if title != '':
            item['title'] = title
            item['anNo'] = ''.join(response.css('.cont_right_cont_xilan table:nth-child(1) tr:nth-child(4) td:nth-child(2)::text').re(r'[^\s+]'))
            item['pubish_time'] = ''.join(response.css('.cont_right_cont_xilan table:nth-child(1) tr:nth-child(2) td:nth-child(4)::text').re(r'[^\s+]'))
            item['effect_time'] = None
            item['pubish_org'] = ''.join(response.css('.cont_right_cont_xilan table:nth-child(1) tr:nth-child(2) td:nth-child(2)::text').extract())
            item['level'] = self.level
            item['time_liness'] = u"现行有效"
            content = ''.join(response.css('.cont_right_cont_xilan table:nth-child(2) table:nth-child(2)').extract())
            # strip styling attributes, images and inline scripts/styles from the body
            item["content"] = re.sub(
                '((class|style|color|href|target|align|title)="[^"]*?")|(<img .*?>)'
                '|(?i)(<SCRIPT)[\\s\\S]*?((</SCRIPT>)|(/>))|(?i)(<style)[\\s\\S]*?((</style>)|(/>))',
                '', content)
            item['url'] = response.url
            item["provinceName"] = self.provinceName
            item["cityName"] = self.cityName
            item["provinceCode"] = self.provinceCode
            item["cityCode"] = self.cityCode
            item['sIndex'] = ''.join(response.css('.cont_right_cont_xilan table:nth-child(1) tr:nth-child(1) td:nth-child(2)::text').extract_first() or '')
            # infer the document type from the title (see the shared helper sketch below)
            if u'通知' in title:
                stype = u'通知'
            elif u'通告' in title:
                stype = u'通告'
            elif u'批复' in title:
                stype = u'批复'
            elif u'命令' in title:
                stype = u'命令'
            elif u'通报' in title:
                stype = u'通报'
            elif u'意见' in title:
                stype = u'意见'
            elif u'决定' in title:
                stype = u'决定'
            elif u'公告' in title:
                stype = u'公告'
            else:
                stype = u'其他'
            item["sTypeName"] = stype
            item['source'] = u"巴彦淖尔市人民政府"
            # whether to import into the production database
            item['export'] = '0'
            item['collection'] = 'fagui'
            item["Id"] = str(uuid.uuid1()).replace('-', '')
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)

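# The keyword-based document-type classification in parse_detail above is repeated
# verbatim in XAMSpider further down. A small shared helper (a sketch, not present in
# the original code) could replace both if/elif chains:
def classify_doc_type(title):
    """Return the document-type keyword found in the title, or u'其他' (other)."""
    for keyword in (u'通知', u'通告', u'批复', u'命令', u'通报', u'意见', u'决定', u'公告'):
        if keyword in title:
            return keyword
    return u'其他'

# usage inside parse_detail: item["sTypeName"] = classify_doc_type(title)
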
class OrdosSpider(scrapy.Spider):
    # Ordos Municipal People's Government (鄂尔多斯市人民政府)
    name = "ordos"
    statuteData = StatuteData()
    provinceName = u"内蒙古"
    cityName = u"鄂尔多斯"
    provinceCode = '15'
    cityCode = '1505'
    level = u"地方法规"
    pubish_time = '2019-04-23'
    allowed_domains = ["ordos.gov.cn"]
    start_urls = ['http://xxgk.ordos.gov.cn/xxgk/channel/ordos_xxw/col10204f.html']
    page_domain = "http://xxgk.ordos.gov.cn%s"

    def parse(self, response):
        send_requests = []
        pageSize = 15
        for item in response.css('#tree4 div'):
            url = ''.join(item.css("a::attr(href)").extract()).replace('../..', '')
            type = ''.join(item.css("a::text").extract()).replace(' ', '')
            dcount = ''.join(item.css("font::text").extract()).replace(' ', '')
            dcount = re.findall(r"\d+", dcount)[0]
            # ceiling division: number of list pages in this category
            page = (int(dcount) + pageSize - 1) / pageSize
            for index in range(1, page + 1):
                if index == 1:
                    send_requests.append(scrapy.Request(
                        self.page_domain % url, callback=self.parse_list, method='get',
                        errback=self.handle_error, meta={'type': type}))
                else:
                    newurl = ''.join((url, '&pos=%d' % index))
                    send_requests.append(scrapy.Request(
                        self.page_domain % newurl, callback=self.parse_list, method='get',
                        errback=self.handle_error, meta={'type': type}))
        return send_requests

    def parse_list(self, response):
        for item in response.css(".recordlist a"):
            detail_url = self.page_domain % ''.join(
                item.css("::attr(href)").extract()).replace('../..', '')
            yield scrapy.Request(detail_url, callback=self.parse_detail, method='get',
                                 errback=self.handle_error,
                                 meta={'type': response.meta['type']})

    def parse_detail(self, response):
        item = {}
        title = ''.join(response.css('#title::text').extract_first() or '')
        if title != '':
            item['title'] = title
            item['anNo'] = ''.join(response.css('.detail table tr:nth-child(4) td:nth-child(2)::text').extract())
            item['pubish_time'] = ''.join(response.css('.detail table tr:nth-child(4) td:nth-child(4)::text').extract())
            item['effect_time'] = None
            item['pubish_org'] = ''.join(response.css('.detail table tr:nth-child(2) td:nth-child(2)::text').extract())
            item['level'] = self.level
            item['time_liness'] = u"现行有效"
            content = ''.join(response.css('#content').extract())
            # strip styling attributes and images from the body
            item["content"] = re.sub(
                '((class|style|color|href|target|align|title)="[^"]*?")|(<img .*?>)', '', content)
            item['url'] = response.url
            item["provinceName"] = self.provinceName
            item["cityName"] = self.cityName
            item["provinceCode"] = self.provinceCode
            item["cityCode"] = self.cityCode
            item['sIndex'] = ''.join(response.css('.detail table tr:nth-child(1) td:nth-child(2)::text').extract())
            item["sTypeName"] = response.meta["type"]
            item['source'] = u"鄂尔多斯市人民政府"
            # whether to import into the production database
            item['export'] = '0'
            item['collection'] = 'fagui'
            item["Id"] = str(uuid.uuid1()).replace('-', '')
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)

class WFFGSpider(scrapy.Spider):
    """51wf.com (问法) laws and regulations."""
    name = "wffagui_NoFilter"
    allowed_domains = ["www.51wf.com"]
    # latest publish time already crawled per source
    last_faguis = [
        {'source': u'部门规章', 'pubish_time': '2018-04-27'},  # departmental rules
        # {'source': u'行政法规', 'pubish_time': '2012-11-12'},
    ]
    statuteData = StatuteData()
    start_urls = [
        "http://www.51wf.com/law/search--authority-1--page-1",  # Constitution
        "http://www.51wf.com/law/search--authority-2--page-1",  # administrative regulations
        "http://www.51wf.com/law/search--authority-3--page-1",  # judicial interpretations
        "http://www.51wf.com/law/search--authority-4--page-1",  # departmental rules
        "http://www.51wf.com/law/search--authority-5--page-1",  # military laws
        "http://www.51wf.com/law/search--authority-6--page-1",  # industry/association rules
        "http://www.51wf.com/law/search--authority-7--page-1",  # local regulations and rules
    ]

    def parse(self, response):
        # emit page 1's detail requests as well (the original bare call discarded the generator)
        for request in self.parse_list(response):
            yield request
        page_count = ''.join(response.xpath("//a[@name='last']/text()").extract())
        if page_count == "":
            return
        data_total = int(page_count)
        pageurl = str(response.url)[0:len(response.url) - 1]
        for page in range(2, data_total):
            yield scrapy.Request(pageurl + str(page), callback=self.parse_list,
                                 method='get', errback=self.handle_error)

    def parse_list(self, response):
        domain = "http://www.51wf.com%s"
        for item in response.css(".lie_biao li"):
            detail_url = domain % item.css(".xin_wen a::attr(href)").extract_first()
            # if datetime.strptime(self.last_faguis[0]['pubish_time'], "%Y-%m-%d") < datetime.strptime(item.css(".shi_jian span::text").extract_first(), "%Y-%m-%d"):
            yield scrapy.Request(detail_url, callback=self.parse_detail, method='get',
                                 errback=self.handle_error)

    def parse_detail(self, response):
        item = {}
        item['title'] = "".join(response.css('.LL_bt_a::text').re(r'[^\s]'))
        # document number
        item['anNo'] = "".join(response.xpath(
            u'//div[@class="LL_sx"]/p[contains(text(),"【发文字号】")]/text()').re(r'[^\s]')).replace(u'【发文字号】', '')
        # publish date
        item['pubish_time'] = "".join(response.xpath(
            u'//div[@class="LL_sx"]/p[contains(text(),"【颁布日期】")]/text()').re(r'[^\s]')).replace(u'【颁布日期】', '')
        # validity status
        if len(response.xpath(u'//div[@class="LL_sx"]/p[contains(text(),"【时效性】")]')) > 1:
            item['time_liness'] = "".join(response.xpath(
                u'//div[@class="LL_sx"]/p[contains(text(),"【时效性】")][2]/text()').re(r'[^\s]')).replace(u'【时效性】', '')
        else:
            item['time_liness'] = "".join(response.xpath(
                u'//div[@class="LL_sx"]/p[contains(text(),"【时效性】")][1]/text()').re(r'[^\s]')).replace(u'【时效性】', '')
        # effective date
        item['effect_time'] = "".join(response.xpath(
            u'//div[@class="LL_sx"]/p[contains(text(),"【生效日期】")]/text()').re(r'[^\s]')).replace(u'【生效日期】', '')
        # level of authority
        item['level'] = "".join(response.xpath(
            u'//div[@class="LL_sx"]/p[contains(text(),"【效力级别】")]/span/a/text()').re(r'[^\s]'))
        # issuing body
        item['pubish_org'] = "".join(response.xpath(
            u'//div[@class="LL_sx"]/p[contains(text(),"【颁布机构】")]/text()').re(r'[^\s]')).replace(u'【颁布机构】', '')
        content = ''.join(response.css('.law-content').extract())
        # drop the "related material"/"related entries" blocks
        for cxt in response.css('.law_realate').extract():
            content = content.replace(cxt, "")
        item["content"] = re.sub('(class|style|color|href|name)="[^"]*?"', '', content)
        item['url'] = response.url
        item['source'] = u"问法法规"
        uid = str(uuid.uuid1()).replace('-', '')
        self.statuteData.insert_statute(
            (uid, item['time_liness'], item['effect_time'], item['level'],
             item['pubish_time'], item['title'], item['anNo'], item['source'],
             item['pubish_org'], item["content"], 0))
        del item["content"]
        print item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)

class HuhhotSpider(scrapy.Spider):
    # Hohhot Municipal People's Government (呼和浩特市人民政府)
    name = "huhhot"
    statuteData = StatuteData()
    provinceName = u"内蒙古"
    cityName = u"呼和浩特"
    provinceCode = '15'
    cityCode = '1506'
    level = u"地方法规"
    pubish_time = '2019-04-23'
    allowed_domains = ["www.huhhot.gov.cn"]
    start_urls = ['http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/']
    # per-category list URL templates and page counts
    fgparamlist = [
        {'type': u'决定', 'page': '3', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/219/index_74%s.html'},
        {'type': u'命令', 'page': '1', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/220/index_74%s.html'},
        {'type': u'通告', 'page': '4', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/221/index_74%s.html'},
        {'type': u'公告', 'page': '6', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/1888/index_74%s.html'},
        {'type': u'意见', 'page': '6', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/222/index_74%s.html'},
        {'type': u'通知', 'page': '67', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/223/index_74%s.html'},
        {'type': u'通报', 'page': '2', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/224/index_74%s.html'},
        {'type': u'批复', 'page': '30', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/225/index_74%s.html'},
        {'type': u'报告', 'page': '10', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/1020/index_74%s.html'},
        {'type': u'会议纪要', 'page': '3', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/1860/index_74%s.html'},
        {'type': u'其他', 'page': '67', 'url': 'http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx/218/226/index_74%s.html'},
    ]
    page_domain = "http://www.huhhot.gov.cn/zwgk/zfxxgkzl/zfxxgkmlx%s"

    def parse(self, response):
        send_requests = []
        for item in self.fgparamlist:
            page = int(item['page'])
            listurl = item['url']
            for index in range(1, page + 1):
                if index == 1:
                    newurl = listurl % ''
                    send_requests.append(scrapy.Request(
                        newurl, callback=self.parse_list, method='get',
                        errback=self.handle_error, meta={'type': item['type']}))
                else:
                    p = index - 1
                    newurl = listurl % '_%s' % p
                    send_requests.append(scrapy.Request(
                        newurl, callback=self.parse_list, method='get',
                        errback=self.handle_error, meta={'type': item['type']}))
        return send_requests

    def parse_list(self, response):
        for item in response.css("#tbStu td a"):
            detail_url = self.page_domain % ''.join(
                item.css("::attr(href)").extract()).replace('../..', '')
            yield scrapy.Request(detail_url, callback=self.parse_detail, method='get',
                                 errback=self.handle_error,
                                 meta={'type': response.meta['type']})

    def parse_detail(self, response):
        item = {}
        title = ''.join(response.css('.zwgkxl_content h3::text').extract())
        if title != '':
            item['title'] = title
            item['anNo'] = ''.join(response.css('.xxgk_tlb tr:nth-child(2) td:nth-child(4)::text').extract())
            item['pubish_time'] = ''.join(response.css('.xxgk_tlb tr:nth-child(2) td:nth-child(6)::text').extract())
            item['effect_time'] = ''.join(response.css('.xxgk_tlb tr:nth-child(3) td:nth-child(6)::text').extract())
            item['pubish_org'] = ''.join(response.css('.xxgk_tlb tr:nth-child(3) td:nth-child(2)::text').extract())
            item['level'] = self.level
            item['time_liness'] = u"现行有效"
            content = ''.join(response.css('.trs_word').extract())
            # strip styling attributes and images from the body
            item["content"] = re.sub(
                '((class|style|color|href|target|align|title)="[^"]*?")|(<img .*?>)', '', content)
            item['url'] = response.url
            item["provinceName"] = self.provinceName
            item["cityName"] = self.cityName
            item["provinceCode"] = self.provinceCode
            item["cityCode"] = self.cityCode
            item['sIndex'] = ''.join(response.css('.xxgk_tlb tr:nth-child(2) td:nth-child(2)::text').extract())
            item["sTypeName"] = response.meta["type"]
            item['source'] = u"呼和浩特市人民政府"
            # whether to import into the production database
            item['export'] = '0'
            item['collection'] = 'fagui'
            item["Id"] = str(uuid.uuid1()).replace('-', '')
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)

class TongLiaoSpider(scrapy.Spider):
    # Tongliao Municipal People's Government (通辽市人民政府)
    name = "tongliao"
    statuteData = StatuteData()
    provinceName = u"内蒙古"
    cityName = u"通辽市"
    provinceCode = '15'
    cityCode = '1508'
    level = u"地方法规"
    pubish_time = '2019-04-23'
    allowed_domains = ["www.tongliao.gov.cn"]
    start_urls = ['http://www.tongliao.gov.cn/tl/ztfl/gkml.shtml']
    page_domain = "http://www.tongliao.gov.cn%s"
    # per-category list URL templates and page counts
    fgparamlist = [
        {'type': u'决定', 'page': '41', 'url': 'http://www.tongliao.gov.cn/xxgk/xxgktree/list.jsp?wz=4f023e3455c2427d9a779bf9d5609b58&parentChannelId=402881a06053e512016053e51cfd0087&page=%d'},
        {'type': u'报告', 'page': '4', 'url': 'http://www.tongliao.gov.cn/xxgk/xxgktree/list.jsp?wz=6c32e4a797504bf794249ca2282780bb&parentChannelId=402881a06053e512016053e51cfd0087&page=%d'},
        {'type': u'公告', 'page': '1', 'url': 'http://www.tongliao.gov.cn/xxgk/xxgktree/list.jsp?wz=a2f5809614104fae871ffbe92629f036&parentChannelId=402881a06053e512016053e51cfd0087&page=%d'},
        {'type': u'通告', 'page': '2', 'url': 'http://www.tongliao.gov.cn/xxgk/xxgktree/list.jsp?wz=6d92ebf491f445578629f29ae2d00acc&parentChannelId=402881a06053e512016053e51cfd0087&page=%d'},
        {'type': u'意见', 'page': '9', 'url': 'http://www.tongliao.gov.cn/xxgk/xxgktree/list.jsp?wz=4d89a2fd76de4fb1a81eecad0ff3db1a&parentChannelId=402881a06053e512016053e51cfd0087&page=%d'},
        {'type': u'通知', 'page': '59', 'url': 'http://www.tongliao.gov.cn/xxgk/xxgktree/list.jsp?wz=f50f51364fc9455181011222b2a67809&parentChannelId=402881a06053e512016053e51cfd0087&page=%d'},
        {'type': u'通报', 'page': '5', 'url': 'http://www.tongliao.gov.cn/xxgk/xxgktree/list.jsp?wz=183816bd4c9843ab8ff177ccbd715321&parentChannelId=402881a06053e512016053e51cfd0087&page=%d'},
        {'type': u'批复', 'page': '1', 'url': 'http://www.tongliao.gov.cn/xxgk/xxgktree/list.jsp?wz=93b71c1c6c624b059abfa9fb59f43990&parentChannelId=402881a06053e512016053e51cfd0087&page=%d'},
        {'type': u'其他', 'page': '32', 'url': 'http://www.tongliao.gov.cn/xxgk/xxgktree/list.jsp?wz=8424888c9d614124ac179e6e0fd8569a&parentChannelId=402881a06053e512016053e51cfd0087&page=%d'},
    ]

    def parse(self, response):
        send_requests = []
        for item in self.fgparamlist:
            page = int(item['page'])
            listurl = item['url']
            for index in range(1, page + 1):
                newurl = listurl % index
                send_requests.append(scrapy.Request(
                    newurl, callback=self.parse_list, method='get',
                    errback=self.handle_error, meta={'type': item['type']}))
        return send_requests

    def parse_list(self, response):
        for item in response.css(".dataList td a"):
            detail_url = self.page_domain % ''.join(
                item.css("::attr(href)").extract()).replace('../..', '')
            yield scrapy.Request(detail_url, callback=self.parse_detail, method='get',
                                 errback=self.handle_error,
                                 meta={'type': response.meta['type']})

    def parse_detail(self, response):
        item = {}
        title = ''.join(response.css('.textc::text').re(r'[^\s+]'))
        if title != '':
            item['title'] = title
            anNo = ''.join(response.css('.detail_ysj:nth-child(5) em::text').extract())
            if anNo == '':
                anNo = None
            item['anNo'] = anNo
            item['pubish_time'] = ''.join(response.css('.detail_ysj:nth-child(7) em::text').extract())
            item['effect_time'] = None
            item['pubish_org'] = ''.join(response.css('.detail_ysj:nth-child(2) em::text').extract())
            item['level'] = self.level
            item['time_liness'] = u"现行有效"
            content = ''.join(response.css('#text01').extract())
            # strip styling attributes and images from the body
            item["content"] = re.sub(
                '((class|style|color|href|target|align|title)="[^"]*?")|(<img .*?>)', '', content)
            item['url'] = response.url
            item["provinceName"] = self.provinceName
            item["cityName"] = self.cityName
            item["provinceCode"] = self.provinceCode
            item["cityCode"] = self.cityCode
            item['sIndex'] = ''.join(response.css('.detail_ysj:nth-child(1) em::text').extract())
            item["sTypeName"] = response.meta["type"]
            item['source'] = u"通辽市人民政府"
            # whether to import into the production database
            item['export'] = '0'
            item['collection'] = 'fagui'
            item["Id"] = str(uuid.uuid1()).replace('-', '')
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)

class NMGShengSpider(scrapy.Spider):
    # Inner Mongolia Autonomous Region People's Government (内蒙古自治区人民政府)
    name = "nmgsheng"
    statuteData = StatuteData()
    provinceName = u"内蒙古"
    provinceCode = '15'
    level = u"地方法规"
    pubish_time = '2019-04-23'
    allowed_domains = ["www.nmg.gov.cn"]
    start_urls = ['http://www.nmg.gov.cn/col/col4191/index.html']
    # per-category POST parameters and page counts for the xxgk service endpoint
    fgparamlist = [
        {'type': u'决定', 'page': '2', 'params': {'infotypeId': '0', 'jdid': '2', 'nServiceid': '13', 'vc_bm': 'NC1', 'area': '1115000001151201XD'}},
        {'type': u'命令', 'page': '3', 'params': {'infotypeId': '0', 'jdid': '2', 'nServiceid': '14', 'vc_bm': 'NC2', 'area': '1115000001151201XD'}},
        {'type': u'通报', 'page': '3', 'params': {'infotypeId': '0', 'jdid': '2', 'nServiceid': '17', 'vc_bm': 'NC5', 'area': '1115000001151201XD'}},
        {'type': u'意见', 'page': '20', 'params': {'infotypeId': '0', 'jdid': '2', 'nServiceid': '18', 'vc_bm': 'NC6', 'area': '1115000001151201XD'}},
        {'type': u'批复', 'page': '1', 'params': {'infotypeId': '0', 'jdid': '2', 'nServiceid': '19', 'vc_bm': 'NC7', 'area': '1115000001151201XD'}},
        {'type': u'通知', 'page': '89', 'params': {'infotypeId': '0', 'jdid': '2', 'nServiceid': '20', 'vc_bm': 'NC8', 'area': '1115000001151201XD'}},
        {'type': u'公告', 'page': '1', 'params': {'infotypeId': '0', 'jdid': '2', 'nServiceid': '23', 'vc_bm': 'NC11', 'area': '1115000001151201XD'}},
        {'type': u'通告', 'page': '2', 'params': {'infotypeId': '0', 'jdid': '2', 'nServiceid': '24', 'vc_bm': 'NC12', 'area': '1115000001151201XD'}},
        {'type': u'其他', 'page': '1', 'params': {'infotypeId': '0', 'jdid': '2', 'nServiceid': '26', 'vc_bm': 'NC14', 'area': '1115000001151201XD'}},
    ]
    headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
    }

    def parse(self, response):
        send_requests = []
        for item in self.fgparamlist:
            page = int(item['page'])
            for index in range(1, page + 1):
                url = "http://www.nmg.gov.cn/module/xxgk/serviceinfo.jsp?currpage=%d" % index
                send_requests.append(scrapy.FormRequest(
                    url=url, method="POST", formdata=item['params'], headers=self.headers,
                    callback=self.parse_list, meta={'type': item['type']},
                    errback=self.handle_error))
        return send_requests

    def parse_list(self, response):
        for item in response.css('table a[href*="http://www.nmg.gov.cn"]'):
            url = ''.join(item.css("::attr(href)").extract()).replace('../..', '')
            yield scrapy.Request(url, callback=self.parse_detail, method='get',
                                 errback=self.handle_error,
                                 meta={'type': response.meta['type']})

    def parse_detail(self, response):
        item = {}
        title = ''.join(response.css('.main-fl-tit::text').re(r'[^\s+]'))
        if title != '':
            item['title'] = title
            item['anNo'] = ''.join(response.css('.xxgk_table tr:nth-child(3) td:nth-child(2)::text').re(r'[^\s+]'))
            item['pubish_time'] = ''.join(response.css('.xxgk_table tr:nth-child(2) td:nth-child(4)::text').re(r'[^\s+]'))
            item['effect_time'] = None
            item['pubish_org'] = ''.join(response.css('.xxgk_table tr:nth-child(2) td:nth-child(2)::text').re(r'[^\s+]'))
            item['level'] = self.level
            item['time_liness'] = u"现行有效"
            content = ''.join(response.css('#zoom').extract())
            # strip styling attributes and images from the body
            item["content"] = re.sub(
                '((class|style|color|href|target|align|title)="[^"]*?")|(<img .*?>)', '', content)
            item['url'] = response.url
            item["provinceName"] = self.provinceName
            item["provinceCode"] = self.provinceCode
            item["cityName"] = None
            item["cityCode"] = None
            item['sIndex'] = ''.join(response.css('.xxgk_table tr:nth-child(1) td:nth-child(2)::text').extract())
            item["sTypeName"] = response.meta["type"]
            item['source'] = u"内蒙古自治区人民政府"
            # whether to import into the production database
            item['export'] = '0'
            item['collection'] = 'fagui'
            item["Id"] = str(uuid.uuid1()).replace('-', '')
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)

class qfayuan_fagui_spider(scrapy.Spider):
    """China Court Network (chinacourt.org) regulations, full crawl."""
    name = "qfyfagui"
    statuteData = StatuteData()
    allowed_domains = ["www.chinacourt.org"]
    start_urls = [
        'http://www.chinacourt.org/law/more/law_type_id/MzAwNEAFAA%3D%3D/page/1.shtml',  # national statute database
        'https://www.chinacourt.org/law/more/law_type_id/MzAwM0AFAA%3D%3D/page/1.shtml',  # judicial interpretations
        'https://www.chinacourt.org/law/more/law_type_id/MzAwMkAFAA%3D%3D/page/1.shtml',  # local regulations
        'https://www.chinacourt.org/law/more/law_type_id/MzAwMUAFAA%3D%3D/page/1.shtml',  # policy references
    ]
    page_domain = "http://www.chinacourt.org%s"

    def parse(self, response):
        # emit page 1's detail requests as well (the original bare call discarded the generator)
        for request in self.parse_list(response):
            yield request
        pageurlstr = ''.join(response.xpath(u'//a[text()="尾页"]/@href').extract())
        pagecount = int(re.match(r'.*/page/(?P<page>\d+)\.shtml', pageurlstr).group('page'))
        for page in range(2, pagecount):
            pageurl = re.sub(r'(?P<page>\d+)\.shtml', '{0}.shtml'.format(str(page)), response.url)
            yield scrapy.Request(pageurl, callback=self.parse_list, method='get',
                                 errback=self.handle_error)

    def parse_list(self, response):
        for item in response.xpath('//div[@class="law_list"]')[0].css("ul li"):
            detailurl = item.css('.left a::attr(href)').extract_first()
            title = item.css('.left a::text').extract_first()
            detail_url = self.page_domain % detailurl
            yield scrapy.Request(detail_url, callback=self.parse_detail, method='get',
                                 errback=self.handle_error, meta={"title": title})

    def parse_detail(self, response):
        item = {}
        item['title'] = response.meta['title']
        if response.css(" .STitle"):
            STitle = "".join(response.css(".law_content .STitle").re(r'[^\s]')).split("<br>")
            item['anNo'] = ''
            item['pubish_time'] = None
            item['effect_time'] = None
            item['pubish_org'] = ''
            item['level'] = ''
            for st in STitle:
                sindex = st.find(u"】") + 1
                if st.find(u"发布文号") != -1:   # document number
                    item['anNo'] = st[sindex:len(st)]
                if st.find(u"发布日期") != -1:   # publish date
                    item['pubish_time'] = st[sindex:len(st)]
                if st.find(u"生效日期") != -1:   # effective date
                    item['effect_time'] = st[sindex:len(st)]
                if st.find(u"发布单位") != -1:   # issuing body
                    item['pubish_org'] = st[sindex:len(st)]
                if st.find(u"所属类别") != -1:   # category / level
                    item['level'] = st[sindex:len(st)]
            # validity status is not published on the page
            item['time_liness'] = ""
            content = ''.join(response.css('.content_text').extract())
            item["content"] = re.sub('(class|style|color|href)="[^"]*?"', '', content)
            item['url'] = response.url
            item['source'] = u"中国法院网"
            # whether to import into the production database
            item['export'] = '0'
            item['collection'] = 'fagui'
            uid = str(uuid.uuid1()).replace('-', '')
            # Id,Time_liness,Effect_time,Level,Pubish_time,Title,AnNo,Source,Pubish_org,Content,IsBuild
            self.statuteData.insert_statute(
                (uid, u'', item['effect_time'], item['level'], item['pubish_time'],
                 item['title'], item['anNo'], item['source'], item['pubish_org'],
                 item["content"], 0))
            del item["content"]
            print item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)

class WYFGSpider(scrapy.Spider):
    """chinalaw.gov.cn regulations (中国政府法制信息网)."""
    name = "chinalawfagui"
    statuteData = StatuteData()
    allowed_domains = ["www.chinalaw.gov.cn"]
    start_urls = [
        'http://www.chinalaw.gov.cn',
    ]
    page_domain = "http://www.chinalaw.gov.cn%s"

    def parse(self, response):
        # the detail URLs are read from a pre-harvested JSON dump instead of the site index
        rqs = []
        with open(os.path.abspath('./chinalawData.json'), 'r') as f:
            line = json.load(f)
            rqs.extend(self.parse_list(u"国家法律法规", line["fvfg"]))
            rqs.extend(self.parse_list(u"地方法规", line["df"]))
            rqs.extend(self.parse_list(u"行业团体规定", line["xztt"]))
            rqs.extend(self.parse_list(u"部门规章", line["bumen"]))
        return rqs

    def parse_list(self, level, source):
        for item in source:
            yield scrapy.Request(self.page_domain % item["infostaticurl"],
                                 callback=self.parse_detail, method='get',
                                 errback=self.handle_error,
                                 meta={'level': level, 'title': item["listtitle"],
                                       'pub_time': item["releasedate"]})

    def parse_detail(self, response):
        item = {}
        title = response.meta['title']
        if title != '':
            item['title'] = title
            item['anNo'] = None
            item['pubish_time'] = response.meta['pub_time']
            item['effect_time'] = response.meta['pub_time']
            item['pubish_org'] = None
            item['level'] = response.meta['level']
            item['time_liness'] = u"现行有效"
            content = ''.join(response.xpath('//div[@id="content"]/span').extract())
            # strip styling attributes and images from the body
            item["content"] = re.sub(
                '((class|style|color|href|target|align|title)="[^"]*?")|(<img .*?>)', '', content)
            item['url'] = response.url
            item['source'] = u"中国政府法制信息网"
            # whether to import into the production database
            item['export'] = '0'
            item['collection'] = 'fagui'
            uid = str(uuid.uuid1()).replace('-', '')
            # Id,Time_liness,Effect_time,Level,Pubish_time,Title,AnNo,Source,Pubish_org,Content,IsBuild
            self.statuteData.insert_statute(
                (uid, u'现行有效', item['effect_time'], item['level'], item['pubish_time'],
                 item['title'], item['anNo'], item['source'], item['pubish_org'],
                 item["content"], 0))
            del item["content"]
            print item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)

class XAMSpider(scrapy.Spider):
    # Xing'an League Administrative Office (兴安盟行政公署)
    name = "xinganmeng"
    statuteData = StatuteData()
    provinceName = u"内蒙古"
    cityName = u"兴安盟"
    provinceCode = '15'
    cityCode = '1512'
    level = u"地方法规"
    pubish_time = '2019-04-23'
    allowed_domains = ["xam.gov.cn"]
    start_urls = ['http://www.xam.gov.cn/xam/_300473/_300600/index.html']
    page_domain = "http://www.xam.gov.cn%s"

    def parse(self, response):
        send_requests = []
        for item in response.css('#tsxa_r a[href^="/xam"]'):
            url = ''.join(item.css("::attr(href)").extract()).replace('../..', '')
            send_requests.append(scrapy.Request(
                self.page_domain % url, callback=self.parse_list, method='get',
                errback=self.handle_error))
        return send_requests

    def parse_list(self, response):
        for item in response.css(".gkml_list a"):
            detail_url = ''.join(item.css("::attr(href)").extract()).replace('../..', '')
            yield scrapy.Request(self.page_domain % detail_url, callback=self.parse_detail,
                                 method='get', errback=self.handle_error)
        # handle pagination
        nexturl = ''.join(response.css(
            '.gkml_con .page a:nth-child(5)::attr(tagname)').extract()).replace('../..', '')
        if nexturl != '[NEXTPAGE]':
            yield scrapy.Request(self.page_domain % nexturl, callback=self.parse_list,
                                 method='get', errback=self.handle_error)

    def parse_detail(self, response):
        item = {}
        title = ''.join(response.css('.qzqd_tit::text').re(r'[^\s+]'))
        if title != '':
            item['title'] = title
            item['anNo'] = ''.join(response.css('.xxgk_top li:nth-child(3)::text').re(r'[^\s+]'))
            item['pubish_time'] = ''.join(
                response.css('.xxgk_top li:nth-child(4)::text').re(r'[^\s+]')
            ).replace(u"年", "-").replace(u"月", "-").replace(u"日", "")
            item['effect_time'] = None
            item['pubish_org'] = ''.join(response.css('.xxgk_top li:nth-child(2)::text').re(r'[^\s+]'))
            item['level'] = self.level
            item['time_liness'] = u"现行有效"
            content = ''.join(response.css('.content_xilan').extract())
            # strip styling attributes, images and inline scripts/styles from the body
            item["content"] = re.sub(
                '((class|style|color|href|target|align|title)="[^"]*?")|(<img .*?>)'
                '|(?i)(<SCRIPT)[\\s\\S]*?((</SCRIPT>)|(/>))|(?i)(<style)[\\s\\S]*?((</style>)|(/>))',
                '', content)
            item['url'] = response.url
            item["provinceName"] = self.provinceName
            item["cityName"] = self.cityName
            item["provinceCode"] = self.provinceCode
            item["cityCode"] = self.cityCode
            item['sIndex'] = ''.join(response.css('.xxgk_top li:nth-child(1)::text').re(r'[^\s+]'))
            # infer the document type from the title
            if u'通知' in title:
                stype = u'通知'
            elif u'通告' in title:
                stype = u'通告'
            elif u'批复' in title:
                stype = u'批复'
            elif u'命令' in title:
                stype = u'命令'
            elif u'通报' in title:
                stype = u'通报'
            elif u'意见' in title:
                stype = u'意见'
            elif u'决定' in title:
                stype = u'决定'
            elif u'公告' in title:
                stype = u'公告'
            else:
                stype = u'其他'
            item["sTypeName"] = stype
            item['source'] = u"兴安盟行政公署"
            # whether to import into the production database
            item['export'] = '0'
            item['collection'] = 'fagui'
            item["Id"] = str(uuid.uuid1()).replace('-', '')
            return item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)

def __init__(self):
    self.statuteData = StatuteData()

class WYFGSpider(scrapy.Spider):
    """China Court Network (chinacourt.org) regulations, incremental crawl."""
    name = "fyfagui"
    statuteData = StatuteData()
    allowed_domains = ["www.chinacourt.org"]
    # latest publish time already crawled per source
    last_faguis = [
        {'source': u'地方法规', 'pubish_time': '1950-03-29'},
        {'source': u'国家法律法规', 'pubish_time': '1949-09-29'},
        {'source': u'司法解释', 'pubish_time': '2011-03-01'},
        {'source': u'政策参考', 'pubish_time': '1987-05-31'},
    ]
    start_urls = [
        'http://www.chinacourt.org/law/more/law_type_id/MzAwNEAFAA%3D%3D/page/1.shtml',  # national statute database
        'https://www.chinacourt.org/law/more/law_type_id/MzAwM0AFAA%3D%3D/page/1.shtml',  # judicial interpretations
        'https://www.chinacourt.org/law/more/law_type_id/MzAwMkAFAA%3D%3D/page/1.shtml',  # local regulations
        'https://www.chinacourt.org/law/more/law_type_id/MzAwMUAFAA%3D%3D/page/1.shtml',  # policy references
    ]
    page_domain = "http://www.chinacourt.org%s"

    def find_last_faguis(self, source, pub_time):
        # return 0 when pub_time is newer than the last crawled time for this source
        hax_next = 1
        for item in self.last_faguis:
            if item['source'] == source and datetime.strptime(
                    item['pubish_time'], "%Y-%m-%d") < datetime.strptime(pub_time, "%Y-%m-%d"):
                hax_next = 0
                break
        return hax_next

    def find_spider_max_fagui(self, response):
        hax_next = 1
        source = response.xpath('//div[@id="title"]/text()').extract_first()
        for item in response.xpath('//div[@class="law_list"]')[0].css("ul li").css('.right::text').extract():
            hax_next = self.find_last_faguis(source, item)
            if hax_next == 1:
                break
        return hax_next

    def parse(self, response):
        page_list = []
        page_list.extend(self.parse_list(response))
        if self.find_spider_max_fagui(response) == 0:
            pageIndex = int(re.match(r'.*/page/(?P<page>\d+)\.shtml', response.url).group('page'))
            if response.xpath(u'//a[text()="下一页"]'):
                pageurl = re.sub(r'(?P<page>\d+)\.shtml',
                                 '{0}.shtml'.format(str(pageIndex + 1)), response.url)
                page_list.append(scrapy.Request(pageurl, callback=self.parse, method='get',
                                                errback=self.handle_error))
        return page_list

    def parse_list(self, response):
        page_list = []
        source = response.xpath('//div[@id="title"]/text()').extract_first()
        for item in response.xpath('//div[@class="law_list"]')[0].css("ul li"):
            detailurl = item.css('.left a::attr(href)').extract_first()
            if self.find_last_faguis(source, ''.join(item.xpath('span[2]/text()').extract())) == 0:
                page_list.append(scrapy.Request(self.page_domain % detailurl,
                                                callback=self.parse_detail, method='get',
                                                errback=self.handle_error))
        return page_list

    def parse_detail(self, response):
        item = {}
        title = "".join(response.xpath("//strong[1]/text()").re(r'[^\s]'))
        item['title'] = title
        if response.css(".law_content .STitle") and title != '':
            STitle = "".join(response.css(".law_content .STitle").re(r'[^\s]')).split("<br>")
            item['anNo'] = None
            item['pubish_time'] = None
            item['effect_time'] = None
            item['pubish_org'] = None
            item['level'] = None
            for st in STitle:
                sindex = st.find(u"】") + 1
                if st.find(u"发布文号") != -1:   # document number
                    item['anNo'] = st[sindex:len(st)]
                if st.find(u"发布日期") != -1:   # publish date
                    item['pubish_time'] = st[sindex:len(st)]
                if st.find(u"生效日期") != -1:   # effective date
                    item['effect_time'] = st[sindex:len(st)]
                if st.find(u"发布单位") != -1:   # issuing body
                    item['pubish_org'] = st[sindex:len(st)]
                if st.find(u"所属类别") != -1:   # category / level
                    item['level'] = st[sindex:len(st)]
            # validity status is not published on the page
            item['time_liness'] = ""
            content = ''.join(response.css('.content_text').extract())
            item["content"] = re.sub('(class|style|color|href)="[^"]*?"', '', content)
            item['url'] = response.url
            item['source'] = u"中国法院网"
            # whether to import into the production database
            item['export'] = '0'
            item['collection'] = 'fagui'
            uid = str(uuid.uuid1()).replace('-', '')
            # Id,Time_liness,Effect_time,Level,Pubish_time,Title,AnNo,Source,Pubish_org,Content,IsBuild
            # self.statuteData.insert_statute((uid, u'', item['effect_time'], item['level'], item['pubish_time'], item['title'], item['anNo'], item['source'], item['pubish_org'], item["content"], 0))
            del item["content"]
            print item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)

class LawtimeSpider(scrapy.spiders.Spider):
    name = "lawtime"
    start_urls = [
        "http://law.lawtime.cn/gjfg_2_100.html",
        "http://law.lawtime.cn/gjfg_3_101.html",
        "http://law.lawtime.cn/gjfg_4_102.html",
        "http://law.lawtime.cn/gjfg_5_117.html",
        "http://law.lawtime.cn/sfjs.html",
        "http://law.lawtime.cn/dffg_2_107.html",
        "http://law.lawtime.cn/dffg_3_114.html",
        "http://law.lawtime.cn/dffg_4_116.html",
        "http://law.lawtime.cn/dffg_5_115.html",
        "http://law.lawtime.cn/gjty.html",
        "http://law.lawtime.cn/hygf.html",
        "http://law.lawtime.cn/lfca.html",
        "http://law.lawtime.cn/lifadongtai.html",
    ]
    statuteData = StatuteData()

    # spider entry point
    def parse(self, response):
        return self.parse_article_count(response)

    # read the page count and enumerate the list pages
    def parse_article_count(self, response):
        page_count = response.xpath(
            '//div[@class="paging"]/a[@class="dot"]/following-sibling::*[1]/text()').extract_first()
        if page_count is None:
            return
        for index in range(1, int(page_count) + 1):
            extension_index = response.url.index(".html")
            url = response.url[0:extension_index] + "_" + str(index) + ".html"
            yield scrapy.Request(url, callback=self.parse_article_list)

    # parse the article list on one page
    def parse_article_list(self, response):
        spans = response.xpath('//ul[@class="kc_complex_ul"]/li/span')
        for span in spans:
            article_url = span.xpath("a/@href").extract_first()
            if article_url is not None:
                yield scrapy.Request("http:" + article_url, callback=self.parse_article)

    # parse the article body
    def parse_article(self, response):
        item = {}
        item["url"] = response.url
        top = response.xpath('//div[@class="a_cont_top"]')
        item['title'] = top.xpath('h1/text()').extract()[0]
        organization = top.xpath(u'p/span[contains(text(),"颁布单位")]/following-sibling::span[1]/text()').extract()
        publish_time = top.xpath(u'p/span[contains(text(),"颁布时间")]/following-sibling::span[1]/text()').extract()
        release_time = top.xpath(u'p/span[contains(text(),"实施日期")]/following-sibling::span[1]/text()').extract()
        level = top.xpath(u'p/span[contains(text(),"效力级别")]/following-sibling::span[1]/text()').extract()
        effect = top.xpath(u'p/span[contains(text(),"时效性")]/following-sibling::span[1]/text()').extract()
        anNo = top.xpath(u'p/span[contains(text(),"发文字号")]/following-sibling::span[1]/text()').extract()
        item['anNo'] = ''
        item['pubish_time'] = ''
        item['effect_time'] = ''
        item['pubish_org'] = ''
        item['level'] = ''
        item['time_liness'] = u'已失效'
        if len(organization) != 0:
            item['pubish_org'] = organization[0].replace("\r\n", "").split(u" ")[0]
        if len(publish_time) != 0:
            item['pubish_time'] = publish_time[0]
        if len(release_time) != 0:
            item['effect_time'] = release_time[0]
        if len(level) != 0:
            item['level'] = level[0].replace("\r\n", "")
        if len(effect) != 0:
            df = effect[0].replace(" ", "").replace("\n", "")
            if df == u'有效':
                item['time_liness'] = u'现行有效'
            else:
                item['time_liness'] = u'已失效'
        if len(anNo) != 0:
            item['anNo'] = anNo[0].replace("\r\n", "").replace('-', '')
        # whether to import into the production database
        item['export'] = '0'
        item['source'] = u"法律快车"
        content = "".join(response.xpath('//div[@class="a_cont_main"]').extract()).replace('\r\n', '')
        # strip styling attributes from the body
        content = re.sub('(class|style|color|href|target|align)="[^"]*?"', '', content)
        item['content'] = content
        uid = str(uuid.uuid1()).replace('-', '')
        self.statuteData.insert_statute(
            (uid, item['time_liness'], item['effect_time'], item['level'],
             item['pubish_time'], item['title'], item['anNo'], item['source'],
             item['pubish_org'], item["content"], 0))
        del item['content']
        print item

class cjfayuan_fagui_spider(scrapy.Spider):
    """Finance and accounting regulations from chinaacc.com (财经法规)."""
    name = "cjfyfagui"
    statuteData = StatuteData()
    allowed_domains = ["www.chinaacc.com"]
    start_urls = [
        # 'http://www.chinacourt.org/law/more/law_type_id/MzAwNEAFAA%3D%3D/page/1.shtml',  # national statute database
        # 'https://www.chinacourt.org/law/more/law_type_id/MzAwM0AFAA%3D%3D.shtml',  # judicial interpretations
        'http://www.chinaacc.com/fagui/search.shtm',  # local regulations
        # 'https://www.chinacourt.org/law/more/law_type_id/MzAwMUAFAA%3D%3D/page/1.shtml',  # policy references
    ]
    page_domain = "http://www.chinaacc.com%s"

    def parse(self, response):
        pageurl = 'http://www.chinaacc.com/dffg/page{0}.shtm'
        for item in range(1, 4120):
            yield scrapy.Request(pageurl.format(str(item)), callback=self.parse_list,
                                 method='get', errback=self.handle_error)

    def parse_list(self, response):
        for item in response.xpath('//div[@class="lqnr clearfix"]/dl/dd'):
            detailurl = item.css('a::attr(href)').extract_first()
            detail_url = self.page_domain % detailurl
            yield scrapy.Request(detail_url, callback=self.parse_detail, method='get',
                                 errback=self.handle_error)

    def parse_detail(self, response):
        item = {}
        item['title'] = response.xpath("//div[@class='top clearfix']/h1/text()").extract_first()
        # document number
        item['anNo'] = ''.join(response.xpath("//div[@class='top clearfix']").css(".c::text").extract())
        # publish date
        item['pubish_time'] = ''.join(
            response.xpath("//div[@class='top clearfix']").css(".b::text").re(r'[^\s]')
        ).replace(u'颁布时间:', '').replace('\n', '').replace('\r', '').replace('\t', '').replace(' ', '')
        # effective date (the original reuses the publish-date expression here)
        item['effect_time'] = ''.join(
            response.xpath("//div[@class='top clearfix']").css(".b::text").re(r'[^\s]')
        ).replace(u'颁布时间:', '').replace('\n', '').replace('\r', '').replace('\t', '').replace(' ', '')
        # issuing body
        item['pubish_org'] = ''.join(
            response.xpath("//div[@class='top clearfix']").css(".b span::text").re(r'[^\s]')
        ).replace(u'发文单位:', '').replace('\n', '').replace('\r', '').replace('\t', '').replace(' ', '')
        # level of authority
        item['level'] = u'地方法规'
        # validity status is not published on the page
        item['time_liness'] = ""
        content = ''.join(response.xpath('//div[@class="cen clearfix"]').extract())
        # strip styling attributes and the site's editorial boilerplate from the body
        item["content"] = re.sub(
            '(class|style|color|href|target|align)="[^"]*?"', '', content
        ).replace(u'【', '').replace(u'我要纠错', '').replace(u'】 责任编辑:', '').replace(
            u'大白兔', '').replace(u'小海鸥', '').replace('qzz', '')
        item['url'] = response.url
        item['source'] = u"中华会计网校"
        # whether to import into the production database
        item['export'] = '0'
        item['collection'] = 'fagui'
        uid = str(uuid.uuid1()).replace('-', '')
        # Id,Time_liness,Effect_time,Level,Pubish_time,Title,AnNo,Source,Pubish_org,Content,IsBuild
        self.statuteData.insert_statute(
            (uid, u'', item['effect_time'], item['level'], item['pubish_time'],
             item['title'], item['anNo'], item['source'], item['pubish_org'],
             item["content"], 0))
        del item["content"]
        print item

    def handle_error(self, result, *args, **kw):
        print "error url is :%s" % result.request.url
        self.logger.error("error url is :%s" % result.request.url)

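# Every parse_detail above strips presentation attributes, images and inline
# <script>/<style> blocks from the captured HTML with a long inline re.sub pattern.
# A shared helper (a sketch, not part of the original code) would keep that pattern,
# taken from the spiders above, in one place:
import re

NOISE_PATTERN = re.compile(
    r'((class|style|color|href|target|align|title)="[^"]*?")'
    r'|(<img .*?>)'
    r'|(<script[\s\S]*?(</script>|/>))'
    r'|(<style[\s\S]*?(</style>|/>))',
    re.IGNORECASE)


def strip_html_noise(html):
    """Remove styling attributes, <img> tags and inline <script>/<style> blocks."""
    return NOISE_PATTERN.sub('', html)

# usage inside parse_detail: item["content"] = strip_html_noise(content)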