def parse_item(self, response):
    # Status 209 signals that the anti-bot check failed: fetch a fresh
    # cookie from the cookie service and retry via parseCookie.
    # Assumes module-level imports of scrapy and logging, plus
    # HaiguanDataItem, get_times and get_attachments from the project's
    # shared modules.
    if response.status == 209:
        urls = 'http://39.96.199.128:8888/getCookie?url=' + str(response.url)
        yield scrapy.Request(urls, callback=self.parseCookie,
                             meta={'url': str(response.url), 'type': 'parse_item'},
                             dont_filter=True, priority=10)
    else:
        # Create the item before the try block so the yield below cannot
        # raise a NameError if field extraction fails part-way through.
        item = HaiguanDataItem()
        try:
            item['title'] = response.css('title::text').extract_first()
            item['time'] = get_times(
                response.css('.easysite-news-describe::text').extract_first())
            item['content'] = response.css('#easysiteText').extract_first()
            appendix, appendix_name = get_attachments(response)
            item['appendix'] = appendix
            item['appendix_name'] = appendix_name
            item['name'] = '中华人民共和国沈阳海关'
            item['website'] = '中华人民共和国沈阳海关-统计数据'
            item['link'] = response.url
            item['txt'] = ''.join(
                response.css('#easysiteText *::text').extract())
            item['module_name'] = '中华人民共和国沈阳海关-统计数据'
            item['spider_name'] = 'SYHG_TJSJ'
            print("===========================>crawled one item " + response.request.url)
        except Exception as e:
            logging.error(self.name + " in parse_item: url=" +
                          response.request.url + ", exception=" + str(e))
            logging.exception(e)
        yield item
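
# NOTE: parseCookie is referenced above but not defined in this section.
# A minimal sketch, assuming the getCookie service returns the cookie
# string in the response body and that the original page should be
# re-fetched with it; the header name and body handling here are
# assumptions, not the project's confirmed implementation.
def parseCookie(self, response):
    # The original URL and the originating callback were passed via meta.
    url = response.meta['url']
    cookie = response.text  # assumption: raw cookie string in the body
    callback = (self.parse_item if response.meta['type'] == 'parse_item'
                else self.parse_list)
    yield scrapy.Request(url, callback=callback,
                         headers={'Cookie': cookie},
                         dont_filter=True)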
def parse_list(self, response):
    # Same anti-bot handling as parse_item: on status 209, fetch a fresh
    # cookie and retry via parseCookie.
    if response.status == 209:
        urls = 'http://39.96.199.128:8888/getCookie?url=' + str(response.url)
        yield scrapy.Request(urls, callback=self.parseCookie,
                             meta={'url': str(response.url), 'type': 'parse_list'},
                             dont_filter=True, priority=10)
    else:
        for href in response.css('.mtfsljb a::attr(href)').extract():
            url = response.urljoin(href).strip()
            if (url.endswith('.html') or url.endswith('.htm')) \
                    and url.startswith('http://') and url != response.url:
                # Detail pages are handed off to parse_item.
                yield scrapy.Request(url, callback=self.parse_item,
                                     dont_filter=True)
            else:
                # Non-page links (e.g. direct attachment URLs) are stored
                # as items immediately, with the URL itself as the payload.
                item = HaiguanDataItem()
                try:
                    item['title'] = response.css('title::text').extract_first()
                    item['time'] = '2020-09-22'
                    item['content'] = url
                    item['appendix'] = url
                    item['appendix_name'] = ''
                    item['name'] = '中华人民共和国上海海关'
                    item['website'] = '中华人民共和国上海海关-统计数据'
                    item['link'] = url
                    item['txt'] = url
                    item['module_name'] = '中华人民共和国上海海关-统计数据'
                    item['spider_name'] = 'SHHG_TJSJ'
                    print("===========================>crawled one item " + str(item))
                except Exception as e:
                    logging.error(self.name + " in parse_list: url=" +
                                  response.request.url + ", exception=" + str(e))
                    logging.exception(e)
                yield item
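
# NOTE: HaiguanDataItem is defined elsewhere in the project. A minimal
# sketch consistent with the fields assigned above (not necessarily the
# project's actual definition): one scrapy.Field per key used by the two
# callbacks.
import scrapy

class HaiguanDataItem(scrapy.Item):
    title = scrapy.Field()
    time = scrapy.Field()
    content = scrapy.Field()
    appendix = scrapy.Field()
    appendix_name = scrapy.Field()
    name = scrapy.Field()
    website = scrapy.Field()
    link = scrapy.Field()
    txt = scrapy.Field()
    module_name = scrapy.Field()
    spider_name = scrapy.Field()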