class moNews(scrapy.Spider): name = "henanSpider" start_url = "http://www.henan.gov.cn/ywdt/hnyw/" header = spiderUtil.header_util() def start_requests(self): yield scrapy.Request(url="http://www.henan.gov.cn/ywdt/hnyw/", callback=self.parse_item_page_list, headers=self.header) def parse_item_page_list(self, response): detail_urls = response.xpath( "//div[@class='main']//li/a/@href").extract() for detail_url in detail_urls: time.sleep(random.uniform(1, 3)) print(detail_url) yield scrapy.Request(url=detail_url, callback=self.parse, headers=self.header) def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: titles = response.xpath( """//*[@id='title']//text()""").extract() title = "".join(titles).strip() except: spiderUtil.log_level(6, response.url) try: authors = response.xpath( """//*[@id='source']//text()""").extract() author = "".join(authors).strip() except: spiderUtil.log_level(9, response.url) try: public_time = re.search( r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})", response.text).group(0) + ":00" except: # spiderUtil.log_level(8, response.url) pass source = "http://www.henan.gov.cn/" try: content_detail = response.xpath( """//*[@class='content']//text()""").extract() content = "" for i in range(0, len(content_detail)): content = content + content_detail[i] except: spiderUtil.log_level(7, response.url) try: if len(content) > 50 and ( public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(spiderUtil.get_first_twohour()) or public_time.startswith( spiderUtil.get_first_threehour())): item = GovAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # print(item) yield item except: pass else: spiderUtil.log_level(response.status_code, response.url)
class xizangNews(scrapy.Spider):
    """Tibet autonomous region government news spider."""
    name = "xizangSpider"
    start_url = [
        "http://www.xizang.gov.cn/xwzx/zwyw/",
        "http://www.xizang.gov.cn/xwzx/jjjs/",
        "http://www.xizang.gov.cn/xwzx/qnyw/",
        "http://www.xizang.gov.cn/xwzx/dsyw/",
        "http://www.xizang.gov.cn/xwzx/xqxw/",
        "http://www.xizang.gov.cn/xwzx/xwrp/",
        "http://www.xizang.gov.cn/xwzx/dwjl/",
        "http://www.xizang.gov.cn/xwzx/shfz/"
    ]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            yield scrapy.Request(url=url, callback=self.parse_item_page_home)

    def parse_item_page_home(self, response):
        # Full pagination (disabled): read the page count out of
        # createPageHTML(...) and request every index page.
        # max_page = response.text.split("createPageHTML(")[2].split(",")[0]
        # for page in range(1, int(max_page) + 1):
        url = response.url + "index.html"
        yield scrapy.Request(url=url, callback=self.parse_item_page_list)

    def parse_item_page_list(self, response):
        news_list = response.xpath(
            "//div[@class='zx-wdsyw-con']/ul/li/a/@href").extract()
        for news_url in news_list:
            if news_url.startswith("./"):
                news_url = response.url.split("/index")[0] + news_url[1:]
            yield scrapy.Request(url=news_url, callback=self.parse)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath(
                    "//div[@class='xz-xl-tit']/h3/text()").extract()[0].strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                author = response.xpath(
                    "//div[@class='xz-xl-info']/p/span/text()").extract()
                if len(author) == 2:
                    author = author[1]
                else:
                    author = "西藏自治区人民政府"
            except Exception:
                spiderUtil.log_level(9, response.url)
            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@class='xz-xl-article']")[0].xpath("string(.)").extract()
                # Cut the attachment-download script and the fixed-length page footer.
                content = "".join(content_arr).split("//显示下载附件")[0][:-106].strip()
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://www.xizang.gov.cn/"
            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour())
                        or public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Hand the item off to the pipelines.
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class moNews(scrapy.Spider): name = "shanxiSpider" start_url = "http://www.shanxi.gov.cn/yw/sxyw/index.shtml" header = spiderUtil.header_util() def start_requests(self): yield scrapy.Request( url="http://www.shanxi.gov.cn/yw/sxyw/index.shtml", callback=self.parse_item_page_list, headers=self.header) # # 全量数据 # for i in range(1,1292): # url = "http://www.shanxi.gov.cn/yw/sxyw/index_"+str(i)+".shtml" # yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header) def parse_item_page_list(self, response): detail_urls = response.xpath( "/html/body/div/div/div/div/div/ul/li/a/@href").extract() for detail_url in detail_urls: detail_url = "http://www.shanxi.gov.cn/yw/sxyw/" + detail_url.replace( "./", "") time.sleep(random.uniform(1, 3)) yield scrapy.Request(url=detail_url, callback=self.parse, headers=self.header) def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_title = response.xpath( "//div[starts-with(@class,'detail-article-title')]//text()" ).extract() title = "".join(content_title).strip() except: spiderUtil.log_level(6, response.url) try: content_author = response.xpath( """/html/body/div/div/div/div/ul/li/span[2]/text()""" ).extract() author = "".join(content_author).strip() except: spiderUtil.log_level(9, response.url) try: content_time = response.xpath( """/html/body/div/div/div/div/ul/li/span[1]/text()""" ).extract() public_time = str(content_time[0].strip()) + ":00" except: spiderUtil.log_level(8, response.url) source = "http://www.shanxi.gov.cn/" try: content_detail = response.xpath( """//div[@class='TRS_Editor']//text()""").extract() content = "".join(content_detail).strip() except: spiderUtil.log_level(7, response.url) try: if len(content) > 50 and ( public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(spiderUtil.get_first_twohour()) or public_time.startswith( spiderUtil.get_first_threehour())): item = GovAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size yield item except: pass else: spiderUtil.log_level(response.status_code, response.url)
class heilongjiangNews(scrapy.Spider): name = "jiangsuSpider" start_url = "http://www.js.gov.cn/" header = spiderUtil.header_util() def start_requests(self): yield scrapy.Request( url="http://www.js.gov.cn/col/col60096/index.html", callback=self.parse_item_page_list, headers=self.header) # 全量爬取历史数据 # for i in range(2, 69): # url = "http://www.js.gov.cn/col/col60096/index.html?uid=212860&pageNum=" + str(i) # yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header) def parse_item_page_list(self, response): s = response.xpath("//script[@type='text/xml']/text()").extract()[0] url_list = etree.HTML(s).xpath("""//a/@href""") for url in url_list: url = "http://www.js.gov.cn" + url yield scrapy.Request(url=url, callback=self.parse, headers=self.header) def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: title = response.xpath( """//div[@class='sp_title']/text()""").extract()[0] except: spiderUtil.log_level(6, response.url) try: authors = response.xpath( """//div[@class='sp_time']/font[2]/text()""").extract() author = "".join(authors).replace("来源:", "").strip() if author == "": author = "江苏人民政府网" except: spiderUtil.log_level(9, response.url) try: public_times = response.xpath( """//div[@class='sp_time']/font[1]/text()""").extract()[0] public_time = str(str(public_times).replace("发布日期:", "")) + ":00" except: spiderUtil.log_level(8, response.url) try: content_arr = response.xpath( "//div[@id='zoom']//text()").extract() content = "".join(content_arr) except: spiderUtil.log_level(7, response.url) source = "http://www.js.gov.cn/" try: if len(content) > 50 and ( public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(spiderUtil.get_first_twohour()) or public_time.startswith( spiderUtil.get_first_threehour())): item = GovAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size yield item except: pass else: spiderUtil.log_level(response.status, response.url)
class moNews(scrapy.Spider): name = "hubeiSpider" start_url = "http://www.hubei.gov.cn/zwgk/hbyw/hbywqb/" header = spiderUtil.header_util() def start_requests(self): yield scrapy.Request(url="http://www.hubei.gov.cn/zwgk/hbyw/hbywqb/", callback=self.parse_item_page_list, headers=self.header) def parse_item_page_list(self, response): detail_urls = response.xpath( "//div[@class='container']//li/a/@href").extract() for detail_url in detail_urls: if "./" in detail_url: time.sleep(random.uniform(1, 3)) deurl = "http://www.hubei.gov.cn/zwgk/hbyw/hbywqb/" + str( detail_url).replace("./", "") yield scrapy.Request(url=deurl, callback=self.parse, headers=self.header) def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: title = response.xpath( """//*[@class='text-center']/text()""").extract()[0] except: spiderUtil.log_level(6, response.url) try: author = str( response.xpath( """//*[@class='list-unstyled list-inline']/li[2]/span/text()""" ).extract()[0]).replace("来源:", "") except: spiderUtil.log_level(9, response.url) try: public_time = str( response.xpath( """//*[@class='list-unstyled list-inline']/li[1]/span/text()""" ).extract()[0]).replace("发布时间:", "").strip() + ":00" except: spiderUtil.log_level(8, response.url) source = "http://www.hubei.gov.cn/" try: content_detail = response.xpath( """//*[@class='TRS_Editor']//text()""").extract() content = "" for i in range(0, len(content_detail)): content = content + content_detail[i] except: spiderUtil.log_level(7, response.url) try: if len(content) > 50 and ( public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(spiderUtil.get_first_twohour()) or public_time.startswith( spiderUtil.get_first_threehour())): item = GovAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size yield item except: pass else: spiderUtil.log_level(response.status_code, response.url)
class qhNews(scrapy.Spider):
    """Qinghai provincial government news spider."""
    name = "qhSpider"
    start_url = [
        "http://www.qh.gov.cn/zwgk/xwdt/qhyw/",
        "http://www.qh.gov.cn/zwgk/xwdt/bmdt/",
        "http://www.qh.gov.cn/zwgk/xwdt/dqdt/",
        "http://www.qh.gov.cn/zwgk/xwdt/jqgz/",
        "http://www.qh.gov.cn/zwgk/xwdt/tzgg/"
    ]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            yield scrapy.Request(url=url, callback=self.parse_item_page_list,
                                 headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath(
            "//div[@class='box11 tabs topline']/div/ul/p[@class='item']/a/@href"
        ).extract()
        for news_url in news_list:
            time.sleep(random.uniform(1, 3))
            yield scrapy.Request(url=news_url, callback=self.parse,
                                 headers=self.header)
        # Follow the "下一页" (next page) link (disabled):
        # if "下一页" in response.xpath("//div[@class='pages']/a/text()").extract():
        #     next_list = response.xpath("//div[@class='pages']/a/@href").extract()
        #     next_list = next_list[len(next_list) - 1]
        #     yield scrapy.Request(url=next_list, callback=self.parse_item_page_list, headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title_arr = response.xpath(
                    "//h1[@class='blue tc']/text()").extract()
                title = "".join(title_arr).strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                author_arr = response.xpath(
                    "//div[@class='abstract tc']/text()").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "青海省人民政府"
                else:
                    author = author.split("来源:")[1].split("发布时间")[0]
            except Exception:
                spiderUtil.log_level(9, response.url)
            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                    response.text).group(0) + ":00"
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@class='details_content']/p//text()").extract()
                content = "".join(content_arr)
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://www.qh.gov.cn/"
            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour())
                        or public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Hand the item off to the pipelines.
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class moNews(scrapy.Spider): name = "liaoningSpider" start_url = "http://www.ln.gov.cn/" header = spiderUtil.header_util() def start_requests(self): yield scrapy.Request( url="http://www.ln.gov.cn/zfxx/jrln/wzxx2018/index.html", callback=self.parse_item_page_list, headers=self.header) # for i in range(1,24): # url = "http://www.ln.gov.cn/zfxx/jrln/wzxx2018/index_"+str(i) # yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header) def parse_item_page_list(self, response): detail_urls = response.xpath( "//ul[@class='list_rul']/li/a/@href").extract() for detail_url in detail_urls: detail_url = "http://www.ln.gov.cn/zfxx/jrln/wzxx2018/" + detail_url.replace( "./", "") time.sleep(random.uniform(1, 3)) yield scrapy.Request(url=detail_url, callback=self.parse, headers=self.header) def parse(self, response): if response.status == 200: text = response.text # print(text) html_size = sys.getsizeof(text) try: titles = response.xpath( """//td[@align="center"]/text()""").extract() title = "".join(titles).replace("来源:", "").strip() except: spiderUtil.log_level(6, response.url) try: content_author = response.xpath( """//table[@class="time"]//td[@align="left"]/text()""" ).extract() authors = content_author[0].split(" 信息来源:") author = str(authors[1]) except: spiderUtil.log_level(9, response.url) try: content_time = response.xpath( """//table[@class="time"]//td[@align="left"]/text()""" ).extract() public_times = str(content_time[0]).split(" 信息来源:") public_time = str( str(public_times[0]).replace("发布时间:", "").replace( "年", "-").replace("月", "-").replace("日", "") + " 00:00:00").strip() except: spiderUtil.log_level(8, response.url) source = "http://www.ln.gov.cn/" try: content_detail = response.xpath( """//div[@class='TRS_Editor']/div/p/text()""").extract() content = "" for i in range(0, len(content_detail)): content = content + content_detail[i] except: spiderUtil.log_level(7, response.url) try: if len(content) > 50 and ( public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(spiderUtil.get_first_twohour()) or public_time.startswith( spiderUtil.get_first_threehour())): item = GovAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # print(item) yield item except: pass else: spiderUtil.log_level(response.status_code, response.url)
class moNews(scrapy.Spider): name = "shandongSpider" start_url = "http://www.shandong.gov.cn/" header = spiderUtil.header_util() def start_requests(self): yield scrapy.Request(url="http://www.shandong.gov.cn/col/col3199/index.html", callback=self.parse_item_page_list, headers=self.header) # 全量爬取历史数据 # for i in range(1,460): # url = "http://www.shandong.gov.cn/col/col3199/index.html?uid=5836&pageNum="+str(i) # yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header) def parse_item_page_list(self, response): s = response.xpath("//script[@type='text/xml']/text()").extract()[0] detail_urls = etree.HTML(s).xpath("//a/@href") for key in detail_urls: yield scrapy.Request(url=key, callback=self.parse, headers=self.header, dont_filter=True) def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: title = response.xpath("""//div[@class='xq-tit']/span/text()""").extract()[0] except: spiderUtil.log_level(6, response.url) try: author = str(response.xpath("""//div[@class='R-tit']/span[2]/text()""").extract()[0]).replace("来源:","") except: spiderUtil.log_level(9, response.url) try: public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})", response.text).group(0) except: spiderUtil.log_level(8, response.url) source = "http://www.shangdong.gov.cn/" try: content_detail = response.xpath("""//div[@class='article']//text()""").extract() content = "".join(content_detail) except: spiderUtil.log_level(7, response.url) try: if len(content) > 50 and (public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith( spiderUtil.get_first_twohour()) or public_time.startswith(spiderUtil.get_first_threehour())): item = GovAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # print(item) yield item except: pass else: spiderUtil.log_level(response.status_code, response.url)
class shanghaiNews(scrapy.Spider):
    """Shanghai municipal government news spider."""
    name = "shanghaiSpider"
    start_url = ["http://www.shanghai.gov.cn/nw2/nw2314/nw2315/nw4411/index.html",
                 "http://www.shanghai.gov.cn/nw2/nw2314/nw2315/nw18454/index.html",
                 "http://www.shanghai.gov.cn/nw2/nw2314/nw2315/nw15343/index.html",
                 "http://www.shanghai.gov.cn/nw2/nw2314/nw2315/nw31406/index.html"]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            yield scrapy.Request(url=url, callback=self.parse_item_page_list,
                                 headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath("//ul[@class='uli14 pageList']/li/a/@href").extract()
        for news_url in news_list:
            news_url = "http://www.shanghai.gov.cn" + news_url
            yield scrapy.Request(url=news_url, callback=self.parse,
                                 headers=self.header)
        # Follow the next-page link (disabled):
        # next_page = response.xpath("//li/a[@class='action']/@href").extract()
        # if next_page != []:
        #     next_url = "http://www.shanghai.gov.cn" + next_page[0]
        #     yield scrapy.Request(url=next_url, callback=self.parse_item_page_list, headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath(
                    "//div[@id='ivs_title']/text()").extract()[0].strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                author = response.xpath(
                    "//small[@class='PBtime']/text()").extract()[0].split("来源:")[1]
                if author == "":
                    author = "上海市人民政府"
            except Exception:
                spiderUtil.log_level(9, response.url)
            try:
                # Normalize "YYYY年M月D日   H : MM" into "YYYY-M-D H:MM:00".
                public_time = re.search(
                    r"(\d{4}年\d{1,2}月\d{1,2}日\s\s\s\d{1,2}\s:\s\d{1,2})",
                    response.text).group(0).replace("年", "-").replace("月", "-").replace(
                    "日", "").replace("   ", " ").replace(" : ", ":") + ":00"
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@id='ivs_content']")[0].xpath('string(.)').extract()
                content = "".join(content_arr)
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://www.shanghai.gov.cn/"
            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour())
                        or public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Hand the item off to the pipelines.
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class bjNews(scrapy.Spider):
    """Beijing municipal government news spider."""
    name = "bjSpider"
    start_url = [
        "http://www.beijing.gov.cn/ywdt/zybwdt/",
        "http://www.beijing.gov.cn/ywdt/yaowen/",
        "http://www.beijing.gov.cn/ywdt/gqrd/",
        "http://www.beijing.gov.cn/ywdt/gzdt/"
    ]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            yield scrapy.Request(url=url, callback=self.parse_item_page_home,
                                 headers=self.header)

    def parse_item_page_home(self, response):
        yield scrapy.Request(url=response.url,
                             callback=self.parse_item_page_list,
                             headers=self.header, dont_filter=True)
        # Full pagination (disabled):
        # max_page = response.text.split("pageCount = ")[1][:4].split(";")[0]
        # for page in range(1, int(max_page)):
        #     news_list_url = response.url + "default_" + str(page) + ".htm"
        #     yield scrapy.Request(url=news_list_url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath("//li[@class='col-md']/a/@href").extract()
        for news_url in news_list:
            # Resolve the various relative-link forms against the list URL.
            if news_url.startswith("http"):
                yield scrapy.Request(url=news_url, callback=self.parse,
                                     headers=self.header)
            elif news_url.startswith("../../"):
                news_url = "http://www.beijing.gov.cn/" + news_url.replace("../../", "")
                yield scrapy.Request(url=news_url, callback=self.parse,
                                     headers=self.header)
            elif news_url.startswith("../"):
                news_url = "/".join(response.url.split("/")[:-2]) + news_url.replace("../", "/")
                yield scrapy.Request(url=news_url, callback=self.parse,
                                     headers=self.header)
            elif news_url.startswith("./"):
                news_url = "/".join(response.url.split("/")[:-1]) + news_url.replace("./", "/")
                yield scrapy.Request(url=news_url, callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                # title = response.xpath("//head/title/text()").extract()[0].split("-")[0]
                title_arr = response.xpath(
                    "//div[@class='header']/p/text()").extract()
                title = "".join(title_arr).strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                author_arr = response.xpath(
                    "//p[@class='fl']/span/text()").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "北京市人民政府"
                else:
                    author = author.split("来源:")[1]
            except Exception:
                spiderUtil.log_level(9, response.url)
            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                    response.text).group(0) + ":00"
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@class='TRS_Editor']/p/text()").extract()
                content = "".join(content_arr)
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://www.beijing.gov.cn/"
            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour())
                        or public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Hand the item off to the pipelines.
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class moNews(scrapy.Spider): name = "moSpider" start_url = "https://www.gov.mo/zh-hant/%s/%s/page/%s/?post_type=news_post" header = spiderUtil.header_util() def start_requests(self): # date_set = set() # begin_date = datetime.datetime.strptime("2017-09-01", "%Y-%m-%d") # end_date = datetime.datetime.strptime(time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d") # while begin_date <= end_date: # date_str = begin_date.strftime("%Y-%m") # date_set.add(date_str) # begin_date += datetime.timedelta(days=1) # date_list = list(date_set) # date_list.sort() # # for date in date_list: # today = date.split("-") today = spiderUtil.get_time().split("-") year = today[0] month = today[1] for page in range(1, 3): url = self.start_url % (year, month, page) yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header) def parse_item_page_list(self, response): # news_url_list = response.xpath("//h2/a/@href").extract() news_url_list = response.xpath( "//div[@class='card-head news--item-head style-primary']/a/@href" ).extract() for news_url in news_url_list: yield scrapy.Request(url=news_url, callback=self.parse, headers=self.header) def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: title = response.xpath( "//head/title/text()").extract()[0].split("–")[0].strip() except: spiderUtil.log_level(6, response.url) try: author = response.xpath("//dl/dd")[0].xpath( 'string(.)').extract()[0] except: spiderUtil.log_level(9, response.url) try: public_time = re.search( r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})", response.text).group(0) except: spiderUtil.log_level(8, response.url) source = "https://www.gov.mo/" try: content_arr = response.xpath("//article/p/text()").extract() content = "".join(content_arr) except: spiderUtil.log_level(7, response.url) try: if len(content) > 50 and ( public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(spiderUtil.get_first_twohour()) or public_time.startswith( spiderUtil.get_first_threehour())): item = GovAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # 数据打入piplines处理 # print(item) yield item except: pass else: spiderUtil.log_level(response.status_code, response.url)
class heilongjiangNews(scrapy.Spider): name = "hebeiSpider" start_url = ["http://www.hebei.gov.cn/"] header = spiderUtil.header_util() def start_requests(self): yield scrapy.Request(url="http://www.hebei.gov.cn/hebei/11937442/10761139/index.html", callback=self.parse_item_page_list,headers=self.header) def parse_item_page_list(self, response): detail_urls = response.xpath("//div[2]/div[2]/div/ul/li/a/@href").extract() for detail_url in detail_urls: if not detail_url.startswith("http"): url = "http://www.hebei.gov.cn" + detail_url yield scrapy.Request(url=url, callback=self.parse, headers=self.header) def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: title = response.xpath("""//h2[@class="cont_title"]/text()""").extract()[0] except: spiderUtil.log_level(6, response.url) try: author="河北省人民政府网" except: spiderUtil.log_level(9, response.url) try: public_timess = response.xpath("""//li[@class="xl_shijian"]//text()""").extract()[0] public_times = str(public_timess).replace("年", "-").replace("月", "-").replace("日", "").strip() public_time = str(public_times)+" 00:00:00" except: spiderUtil.log_level(8, response.url) try: content_arr = response.xpath("//div[@id='zoom']/p/text()").extract() content = "".join(content_arr) except: spiderUtil.log_level(7, response.url) source = "http://www.hebei.gov.cn/" try: if len(content) > 50 and (public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith( spiderUtil.get_first_twohour()) or public_time.startswith(spiderUtil.get_first_threehour())): item = GovAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # print(item) yield item except: pass else: spiderUtil.log_level(response.status, response.url)
class moNews(scrapy.Spider): name = "anhuiSpider" start_url = "http://www.ah.gov.cn/" header = spiderUtil.header_util() def start_requests(self): yield scrapy.Request( url="http://www.ah.gov.cn/UserData/SortHtml/1/549213957.html", callback=self.parse_item_page_list, headers=self.header) # for i in range(2,524): # url = "http://www.ah.gov.cn/tmp/Nav_nav.shtml?SS_ID=7&tm=29357.31&Page="+str(i) # yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header) def parse_item_page_list(self, response): url_list = response.xpath("//div[@class='navjz']//a/@href").extract() for url in url_list: if "http://www.ah.gov.cn" in url: yield scrapy.Request(url=url, callback=self.parse, headers=self.header, dont_filter=True) def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: titles = response.xpath( """//div[@class='wztit']//text()""").extract() title = "".join(titles).strip() except: spiderUtil.log_level(6, response.url) try: author_arr = response.xpath( """//div[@class='wzbjxx']/p/text()[3]""").extract() author = "".join(author_arr).replace("来源:", "").strip() except: spiderUtil.log_level(9, response.url) try: content_date = response.xpath( """//div[@class='wzbjxx']/p/text()[1]""").extract()[0] content_time = response.xpath( """//div[@class='wzbjxx']/p/text()[2]""").extract()[0] public_time = str(content_date) + " " + str( content_time) + ":00" except: spiderUtil.log_level(8, response.url) source = "http://www.ah.gov.cn/" try: content_detail = response.xpath( """//div[@class='wzcon']//text()""").extract() content = "".join(content_detail).strip() except: spiderUtil.log_level(7, response.url) try: if len(content) > 50 and ( public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(spiderUtil.get_first_twohour()) or public_time.startswith( spiderUtil.get_first_threehour())): item = GovAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # print(item) yield item except: pass else: spiderUtil.log_level(response.status_code, response.url)
class nxNews(scrapy.Spider): name = "tjSpider" start_url = [ "http://www.tj.gov.cn/xw/xwfbh/", "http://www.tj.gov.cn/xw/tztg/", "http://www.tj.gov.cn/xw/bum/", "http://www.tj.gov.cn/xw/qx1/", "http://www.tj.gov.cn/xw/bdyw/" ] header = spiderUtil.header_util() def start_requests(self): for url in self.start_url: # time.sleep(random.uniform(2, 3)) yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header) # def parse_item_page_home(self, response): # yield scrapy.Request(url=response.url, callback=self.parse_item_page_list, headers=self.header, dont_filter=True) # max_page = response.text.split("countPage = ")[1][:3].split("/")[0] # for page in range(2, int(max_page)): # news_list_url = response.url+"index_" + str(page)+".html" # time.sleep(1) # yield scrapy.Request(url=news_list_url, callback=self.parse_item_page_list, headers=self.header) def parse_item_page_list(self, response): news_list = response.xpath("//div/div/ul/li/a/@href").extract() for news_url in news_list: if news_url.startswith("./"): news_url = response.url.split("/index")[0] + news_url[2:] time.sleep(1) yield scrapy.Request(url=news_url, callback=self.parse, headers=self.header) elif news_url.startswith("http"): # time.sleep(random.uniform(1, 3)) yield scrapy.Request(url=news_url, callback=self.parse, headers=self.header) def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: title = response.xpath("//div[@class='title']/text()").extract( )[0].strip() + response.xpath( "//div[@class='t_title']/text()").extract()[0].strip() except: spiderUtil.log_level(6, response.url) try: author = response.xpath( "//span[@class='ly']/text()").extract()[0].split("来源:")[1] if author == "": author = "天津市人民政府" except: spiderUtil.log_level(9, response.url) try: public_time = re.search( r"(\d{4}-\d{1,2}-\d{1,2}\s\s\d{1,2}:\d{1,2})", response.text).group(0).replace(" ", " ") + ":00" except: spiderUtil.log_level(8, response.url) try: content_arr = response.xpath( "//div[@class='TRS_Editor']/p/text()").extract() content = "".join(content_arr) except: spiderUtil.log_level(7, response.url) source = "http://www.tj.gov.cn/" try: if len(content) > 50 and ( public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(spiderUtil.get_first_twohour()) or public_time.startswith( spiderUtil.get_first_threehour())): item = GovAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # 数据打入piplines处理 yield item # print(item) except: pass else: spiderUtil.log_level(response.status, response.url)
class moNews(scrapy.Spider): name = "fujianSpider" start_url = "http://www.fujian.gov.cn/" header = spiderUtil.header_util() def start_requests(self): yield scrapy.Request(url="http://www.fujian.gov.cn/xw/fjyw/", callback=self.parse_item_page_list, headers=self.header) def parse_item_page_list(self, response): url_list = response.xpath("//ul[@class='list-gl']//a/@href").extract() for urls in url_list: url = "http://www.fujian.gov.cn/xw/fjyw/" + str(urls).replace( "./", "") yield scrapy.Request(url=url, callback=self.parse, headers=self.header) def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: title_arr = response.xpath( """//div[@class='xl-nr clearflx']//h3/text()""").extract() title = "".join(title_arr).strip() except: spiderUtil.log_level(6, response.url) try: author_arr = response.xpath( """//div[@class='xl-nr clearflx']//h5/span/text()""" ).extract() author = "".join(author_arr).replace("[", "").replace("]", "").strip() except: spiderUtil.log_level(9, response.url) try: content_time = response.xpath( """//div[@class='xl-nr clearflx']//h5/text()""").extract() public_time = "".join(content_time).replace("字号:", "").replace( "|", "").strip() public_time = str(public_time) + ":00" except: spiderUtil.log_level(8, response.url) source = "http://www.fujian.gov.cn/" try: content_detail = response.xpath( """//div[@class='TRS_Editor']//text()""").extract() content = "".join(content_detail).strip() except: spiderUtil.log_level(7, response.url) try: if len(content) > 50 and ( public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(spiderUtil.get_first_twohour()) or public_time.startswith( spiderUtil.get_first_threehour())): item = GovAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # print(item) yield item except: pass else: spiderUtil.log_level(response.status, response.url)
class gdNews(scrapy.Spider):
    """Guangdong provincial government news spider."""
    name = "gdSpider"
    start_url = ["http://www.gd.gov.cn/gdywdt/gdyw/index.html",
                 "http://www.gd.gov.cn/gdywdt/dczl/jcbs/index.html",
                 "http://www.gd.gov.cn/gdywdt/dczl/gcls/index.html",
                 "http://www.gd.gov.cn/gdywdt/dczl/dcxd/index.html",
                 "http://www.gd.gov.cn/gdywdt/bmdt/index.html",
                 "http://www.gd.gov.cn/gdywdt/dsdt/index.html",
                 "http://www.gd.gov.cn/gdywdt/zfjg/index.html",
                 "http://www.gd.gov.cn/gdywdt/tzdt/index.html",
                 "http://www.gd.gov.cn/gdywdt/ydylygd/index.html"]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            yield scrapy.Request(url=url, callback=self.parse_item_page_home,
                                 headers=self.header)

    def parse_item_page_home(self, response):
        yield scrapy.Request(url=response.url,
                             callback=self.parse_item_page_list,
                             headers=self.header, dont_filter=True)
        # Full pagination (disabled):
        # max_page = response.xpath("//a[@class='last']/@href").extract()[0].split("index_")[1].split(".")[0]
        # for page in range(2, int(max_page) + 1):
        #     news_list_url = response.url.replace("index", "index_" + str(page))
        #     yield scrapy.Request(url=news_list_url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath("//span[@class='til']/a/@href").extract()
        for news_url in news_list:
            if news_url.startswith("http://www.gd.gov.cn"):
                yield scrapy.Request(url=news_url, callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath(
                    "//h3[@class='zw-title']/text()").extract()[0].strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                author = response.xpath(
                    "//span[@class='ly']/text()").extract()[0].split("来源 :")[1].strip()
                if author == "":
                    author = "广东省人民政府"
            except Exception:
                spiderUtil.log_level(9, response.url)
            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@class='zw']/p")[0].xpath('string(.)').extract()
                content = "".join(content_arr)
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://www.gd.gov.cn/"
            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour())
                        or public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Hand the item off to the pipelines.
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class moNews(scrapy.Spider): name = "jilinSpider" start_url = "http://www.jl.gov.cn/" header = spiderUtil.header_util() def start_requests(self): yield scrapy.Request(url="http://www.jl.gov.cn/zw/yw/jlyw/index.html", callback=self.parse_item_page_list, headers=self.header) def parse_item_page_list(self, response): detail_urls = response.xpath( """//li[@class="item"]//a/@href""").extract() for detail_url in detail_urls: if "./" in detail_url: detail_url = "http://www.jl.gov.cn/zw/yw/jlyw/" + detail_url.replace( "./", "") time.sleep(random.uniform(1, 3)) yield scrapy.Request(url=detail_url, callback=self.parse, headers=self.header) def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_title = response.xpath( """//div[@id="dbt"]//text()""").extract() title = "".join(content_title).strip() except: spiderUtil.log_level(6, response.url) try: author = "吉林省人民政府" except: spiderUtil.log_level(9, response.url) try: content_time = response.xpath( """//div[@class="c_xx"]//text()""").extract() public_times = str(content_time[0]).split(" ") public_time = str(public_times[1]) except: spiderUtil.log_level(8, response.url) source = "http://www.jl.gov.cn/" try: content_detail = response.xpath( """//div[@class='TRS_Editor']//text()""").extract() content = "".join(content_detail).strip() except: spiderUtil.log_level(7, response.url) try: if len(content) > 50 and ( public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(spiderUtil.get_first_twohour()) or public_time.startswith( spiderUtil.get_first_threehour())): item = GovAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size yield item except: pass else: spiderUtil.log_level(response.status_code, response.url)
class cqNews(scrapy.Spider):
    """Chongqing municipal government news spider."""
    name = "cqSpider"
    start_url = [
        "http://www.cq.gov.cn/zqfz/whly",
        "http://www.cq.gov.cn/zqfz/sthj",
        "http://www.cq.gov.cn/zqfz/shfz",
        "http://www.cq.gov.cn/zqfz/gmjj",
        "http://www.cq.gov.cn/zwxx/jrcq",
        "http://www.cq.gov.cn/zwxx/zwdt"
    ]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # Full crawl (disabled): for page in range(1, 473):
            for page in range(1, 2):
                yield scrapy.Request(url=url + "_" + str(page),
                                     callback=self.parse_item_page_list,
                                     headers=self.header, dont_filter=True)

    def parse_item_page_home(self, response):
        # Full pagination (disabled):
        # max_page = response.xpath("//span[@class='total']/text()").extract()[0].split("/")[1][1:-1]
        # for page in range(1, int(max_page) + 1):
        #     url = response.url + "_" + str(page)
        #     time.sleep(random.uniform(2, 3))
        #     yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header, dont_filter=True)
        pass

    def parse_item_page_list(self, response):
        news_list = response.xpath("//ul[@class='list']/li/a/@href").extract()
        for news_url in news_list:
            news_url = "http://www.cq.gov.cn" + news_url
            time.sleep(random.uniform(1, 3))
            yield scrapy.Request(url=news_url, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title_arr = response.xpath(
                    "//h2[@class='title']/text()").extract()
                title = "".join(title_arr).strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                author_arr = response.xpath(
                    "//span[@class='fl']/span/text()").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "重庆市人民政府"
                else:
                    author = author.split("来源:")[1]
            except Exception:
                spiderUtil.log_level(9, response.url)
            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                    response.text).group(0) + ":00"
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@class='conTxt']")[0].xpath('string(.)').extract()
                # Cut everything from the final-review byline onward.
                content = "".join(content_arr).split("终审 :")[0].strip()
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://www.cq.gov.cn/"
            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour())
                        or public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Hand the item off to the pipelines.
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class hnNews(scrapy.Spider):
    """Hainan provincial government news spider."""
    name = "hnSpider"
    start_url = [
        "http://www.hainan.gov.cn/hainan/tingju/list3.shtml",
        "http://www.hainan.gov.cn/hainan/sxian/list3.shtml",
        "http://www.hainan.gov.cn/hainan/5309/list3.shtml",
        "http://www.hainan.gov.cn/hainan/mtkhn/list3.shtml",
        "http://www.hainan.gov.cn/hainan/ldhd/sj_list3.shtml"
    ]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            yield scrapy.Request(url=url, callback=self.parse_item_page_home,
                                 headers=self.header)

    def parse_item_page_home(self, response):
        yield scrapy.Request(url=response.url,
                             callback=self.parse_item_page_list,
                             headers=self.header, dont_filter=True)

    def parse_item_page_list(self, response):
        news_list = response.xpath(
            "//div[@class='list-right_title fon_1']/a/@href").extract()
        for news_url in news_list:
            if not news_url.startswith("http"):
                news_url = "http://www.hainan.gov.cn" + news_url
            yield scrapy.Request(url=news_url, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath(
                    "//div[@class='title_cen mar-t2 text']/ucaptitle/text()"
                ).extract()[0].strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                author = response.xpath("//span[@id='ly']/text()").extract()
                if author == []:
                    author = "海南省人民政府"
                else:
                    author = author[0]
            except Exception:
                spiderUtil.log_level(9, response.url)
            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                    response.text).group(0) + ":00"
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@id='zoom']/div[@id='font']/ucapcontent/p/text()"
                ).extract()
                content = "".join(content_arr)
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://www.hainan.gov.cn/"
            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour())
                        or public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Hand the item off to the pipelines.
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class moNews(scrapy.Spider): name = "zhejiangSpider" start_url = "http://www.zj.gov.cn/" header = spiderUtil.header_util() def start_requests(self): yield scrapy.Request( url="http://www.zj.gov.cn/col/col1554467/index.html", callback=self.parse_item_page_list, headers=self.header) def parse_item_page_list(self, response): s = response.xpath("//script[@type='text/xml']/text()").extract()[0] url_list = etree.HTML(s).xpath("""//a/@href""") for url in url_list: url = "http://www.zj.gov.cn" + url yield scrapy.Request(url=url, callback=self.parse, headers=self.header) def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_title = response.xpath( """//td[@align="center"]/text()""").extract() title = "".join(content_title) except: spiderUtil.log_level(6, response.url) try: authors = response.xpath( """//ul[@class="list"]/li[2]/text()""").extract() author = "".join(authors).replace("来源:", "").strip() except: spiderUtil.log_level(9, response.url) try: content_time = response.xpath( """//ul[@class="list"]/li[1]/text()""").extract() public_time = str(content_time[0]).replace("发布日期:", "").strip() except: spiderUtil.log_level(8, response.url) source = "http://www.jl.gov.cn/" try: content_detail = response.xpath( """//div[@id="zoom"]//text()""").extract() content = "" for i in range(0, len(content_detail)): content = content + content_detail[i] except: spiderUtil.log_level(7, response.url) try: if len(content) > 50 and ( public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(spiderUtil.get_first_twohour()) or public_time.startswith( spiderUtil.get_first_threehour())): item = GovAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # print(item) yield item except: pass else: spiderUtil.log_level(response.status_code, response.url)
class sxNews(scrapy.Spider):
    """Shaanxi provincial government news spider."""
    name = "sxSpider"
    start_url = ["http://www.shaanxi.gov.cn/info/iList.jsp?cat_id=10001",
                 "http://www.shaanxi.gov.cn/info/iList.jsp?cat_id=10003",
                 "http://www.shaanxi.gov.cn/info/iList.jsp?cat_id=10002",
                 "http://www.shaanxi.gov.cn/info/iList.jsp?cat_id=17469"]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            yield scrapy.Request(url=url, callback=self.parse_item_page_list,
                                 headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath("//ul[@class='xwlist-ul']/li/a/@href").extract()
        for news_url in news_list:
            news_url = "http://www.shaanxi.gov.cn" + news_url
            yield scrapy.Request(url=news_url, callback=self.parse,
                                 headers=self.header)
        # Follow the next-page link (disabled):
        # next_list = response.xpath("//form/span/a/@href").extract()
        # next_list = response.url.split("?")[0] + next_list[-2]
        # yield scrapy.Request(url=next_list, callback=self.parse_item_page_list, headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath(
                    "//h1[@class='news_h1']/text()").extract()[0].strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                author = response.xpath(
                    "//span[@id='info_source']/text()").extract()[0].strip()
                if author == "":
                    author = "陕西省人民政府"
            except Exception:
                spiderUtil.log_level(9, response.url)
            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@id='info-cont']/p/text()").extract()
                content = "".join(content_arr)
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://www.shaanxi.gov.cn/"
            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour())
                        or public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Hand the item off to the pipelines.
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class nxNews(scrapy.Spider): name = "nxSpider" start_url = [ "http://www.nx.gov.cn/zwxx_11337/wztt/", "http://www.nx.gov.cn/zwxx_11337/zwdt/", "http://www.nx.gov.cn/zwxx_11337/sxdt/", "http://www.nx.gov.cn/zwxx_11337/hygq/", "http://www.nx.gov.cn/ztsj/zt/tpgj_1542/", "http://www.nx.gov.cn/ztsj/zt/hjbhdc/", "http://www.nx.gov.cn/zwxx_11337/zcjd/", "http://www.nx.gov.cn/zwgk/tzgg/" ] header = spiderUtil.header_util() def start_requests(self): for url in self.start_url: # time.sleep(random.uniform(3, 5)) yield scrapy.Request(url=url, callback=self.parse_item_page_home, headers=self.header) def parse_item_page_home(self, response): # text = response.text # max_page = text.split("createPageHTML(")[2].split(",")[0] # for page in range(1, int(max_page)+1): # for page in range(1, 2): news_list_url = response.url + "index" + ".html" # time.sleep(random.uniform(3, 5)) yield scrapy.Request(url=news_list_url, callback=self.parse_item_page_list, headers=self.header) def parse_item_page_list(self, response): news_list = response.xpath( "//ul[@class='commonList_dot']/li/a/@href").extract() for news_url in news_list: if news_url.startswith("./"): news_url = response.url.split("index")[0] + news_url[2:] elif news_url.startswith("../"): news_url = "http://www.nx.gov.cn/zwxx_11337" + news_url[2:] # time.sleep(random.uniform(3, 5)) yield scrapy.Request(url=news_url, callback=self.parse, headers=self.header) def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: title_arr = response.xpath( "//div[@id='info_title']/text()").extract() title = "".join(title_arr).strip() except: spiderUtil.log_level(6, response.url) try: author_arr = response.xpath( "//span[@id='info_source']/text()").extract() author = "".join(author_arr).strip() if author == "": author = "宁夏回族自治区人民政府" except: spiderUtil.log_level(9, response.url) try: public_time = re.search( r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})", response.text).group(0) except: # spiderUtil.log_level(8, response.url) pass try: content_arr = response.xpath( "//div[@class='view TRS_UEDITOR trs_paper_default trs_word trs_key4format']/p//text()" ).extract() content = "".join(content_arr).strip() except: spiderUtil.log_level(7, response.url) source = "http://www.nx.gov.cn/" try: if len(content) > 50 and ( public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(spiderUtil.get_first_twohour()) or public_time.startswith( spiderUtil.get_first_threehour())): item = GovAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # 数据打入piplines处理 yield item # print(item) except: pass else: spiderUtil.log_level(response.status, response.url)
class nmgNews(scrapy.Spider):
    """Inner Mongolia autonomous region government news spider."""
    name = "nmgSpider"
    start_url = [
        "http://www.nmg.gov.cn/col/col442/index.html",
        "http://www.nmg.gov.cn/col/col443/index.html",
        "http://www.nmg.gov.cn/col/col1141/index.html",
        "http://www.nmg.gov.cn/col/col1972/index.html",
        "http://www.nmg.gov.cn/col/col1973/index.html",
        "http://www.nmg.gov.cn/col/col365/index.html",
        "http://www.nmg.gov.cn/col/col151/index.html",
        "http://www.nmg.gov.cn/col/col152/index.html",
        "http://www.nmg.gov.cn/col/col360/index.html",
        "http://www.nmg.gov.cn/col/col1253/index.html",
        "http://www.nmg.gov.cn/col/col359/index.html",
        "http://www.nmg.gov.cn/col/col389/index.html"
    ]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # Full crawl (disabled): for page in range(1, 329):
            for page in range(1, 2):
                news_list = url + "?uid=777&pageNum=" + str(page)
                yield scrapy.Request(url=news_list,
                                     callback=self.parse_item_page_list,
                                     headers=self.header)

    def parse_item_page_list(self, response):
        # Article links live in a <script type="text/xml"> payload; re-parse with lxml.
        s = response.xpath("//script[@type='text/xml']/text()").extract()[0]
        url_list = etree.HTML(s).xpath("//a/@href")
        for url in url_list:
            url = "http://www.nmg.gov.cn/" + url
            time.sleep(1)
            yield scrapy.Request(url=url, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath(
                    "//div[@class='main-fl-tit']/text()").extract()
                title = "".join("".join(title).split())
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                author_arr = response.xpath(
                    "//div[@class='main-fl-bjxx']/div/text()").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "内蒙古自治区人民政府网"
            except Exception:
                spiderUtil.log_level(9, response.url)
            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@id='zoom']")[0].xpath('string(.)').extract()
                content = "".join(content_arr)
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://www.nmg.gov.cn/"
            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour())
                        or public_time.startswith(spiderUtil.get_first_twohour())
                        or public_time.startswith(spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # Hand the item off to the pipelines.
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class moNews(scrapy.Spider): name = "jiangxiSpider" start_url = "http://www.jiangxi.gov.cn/" header = spiderUtil.header_util() def start_requests(self): yield scrapy.Request( url="http://www.jiangxi.gov.cn/col/col393/index.html", callback=self.parse_item_page_list, headers=self.header) # 全量爬取历史数据 # for i in range(2,917): # url = "http://www.jiangxi.gov.cn/col/col393/index.html?uid=45663&pageNum="+str(i) # yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header) def parse_item_page_list(self, response): s = response.xpath("//script[@type='text/xml']/text()").extract()[0] detail_urls = etree.HTML(s).xpath("//a/@href") for detail_url in detail_urls: detail_url = "http://www.jiangxi.gov.cn" + detail_url yield scrapy.Request(url=detail_url, callback=self.parse, headers=self.header) def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_title = response.xpath( """//div[@class='artile_zw']/div/p/text()""").extract() title = "".join(content_title).strip() print(title) except: spiderUtil.log_level(6, response.url) try: author = response.xpath( """//div[@id='zoom']//font/text()[2]""").extract()[0] except: spiderUtil.log_level(9, response.url) try: content_time = response.xpath( """//div[@class='sp_time screen']/font[1]/text()""" ).extract()[0] public_time = str(content_time).replace("发布时间:", "") except: spiderUtil.log_level(8, response.url) source = "http://www.jiangxi.gov.cn/" try: content_detail = response.xpath( """//div[@id='zoom']/p/text()""").extract() content = "".join(content_detail) except: spiderUtil.log_level(7, response.url) try: if len(content) > 50 and ( public_time.startswith(spiderUtil.get_first_hour()) or public_time.startswith(spiderUtil.get_first_twohour()) or public_time.startswith( spiderUtil.get_first_threehour())): item = GovAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size yield item except: pass else: spiderUtil.log_level(response.status_code, response.url)
class gxNews(scrapy.Spider):
    name = "gxSpider"
    start_url = [
        "http://www.gxzf.gov.cn/zwhd/index.shtml",
        "http://www.gxzf.gov.cn/sytt/index.shtml",
        "http://www.gxzf.gov.cn/zcjd/index.shtml",
        "http://www.gxzf.gov.cn/gggs/index.shtml",
        "http://www.gxzf.gov.cn/zwdc/index.shtml",
        "http://www.gxzf.gov.cn/dflz/yw/index.shtml"
    ]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_page_home,
                                 headers=self.header)

    def parse_item_page_home(self, response):
        yield scrapy.Request(url=response.url,
                             callback=self.parse_item_page_list,
                             headers=self.header,
                             dont_filter=True)
        # max_page = response.xpath("//div[@class='more-page']/a/@href").extract()[-1].split("-")[1][:-6]
        # for page in range(2, int(max_page)+1):
        #     news_list_url = response.url.replace("index", "index-" + str(page))
        #     # time.sleep(random.uniform(3, 5))
        #     yield scrapy.Request(url=news_list_url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath(
            "//ul[@class='more-list']/li/a/@href").extract()
        for news_url in news_list:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=news_url, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title_arr = response.xpath(
                    "//div[@class='article']/h1/text()").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)
            try:
                author_arr = response.xpath(
                    "//div[@class='article-inf-left']/text()").extract()
                author = "".join(author_arr)
                if author == "":
                    author = "广西壮族自治区人民政府"
            except:
                spiderUtil.log_level(9, response.url)
            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)
            except:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@class='article-con']/p/text()").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)
            source = "http://www.gxzf.gov.cn/"
            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour()) or
                        public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
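# The time.sleep calls commented out above would throttle by blocking the
# whole process. Scrapy can throttle per spider without blocking; a hedged
# sketch with illustrative values, using stock settings:
#
#     custom_settings = {
#         "DOWNLOAD_DELAY": 3,               # base delay between requests
#         "RANDOMIZE_DOWNLOAD_DELAY": True,  # jitter of 0.5x-1.5x that delay
#     }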
class ynNews(scrapy.Spider):
    name = "ynSpider"
    start_url = [
        "http://www.yn.gov.cn/ywdt/bmdt/",
        "http://www.yn.gov.cn/ywdt/ynyw/",
        "http://www.yn.gov.cn/ywdt/zsdt/"
    ]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_page_list,
                                 headers=self.header)
            # for i in range(1, 100):
            # for i in range(1, 2):
            #     url = url.split("index")[0] + ("index_" + str(i)) + ".html"
            #     yield scrapy.Request(url=url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath(
            "//dl[@class='thlist']/dt/a/@href").extract()
        for news_url in news_list:
            if news_url.startswith("./"):
                news_url = response.url.split("index")[0] + news_url[2:]
            yield scrapy.Request(url=news_url, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath(
                    "//div[@class='articl']/h3//text()").extract()[0].strip()
            except:
                spiderUtil.log_level(6, response.url)
            try:
                # cut at the leading "2" of the year to drop the date that
                # follows the source name
                author = response.xpath(
                    "//div[@class='datetime']/text()").extract()[0].split(
                    "来源:")[1].split("2")[0].strip()
                if author == "":
                    author = "云南省人民政府"
            except:
                spiderUtil.log_level(9, response.url)
            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)
            except:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@class='TRS_Editor']/p/text()").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)
            source = "http://www.yn.gov.cn/"
            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour()) or
                        public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
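# Every parse() above gates items on public_time.startswith(...) against
# spiderUtil.get_first_hour() / get_first_twohour() / get_first_threehour(),
# whose implementations are not shown in this file. A hedged sketch of what
# such a helper could look like, assuming it returns the "YYYY-MM-DD HH"
# prefix of the hour n hours ago:
def hour_prefix(hours_ago):
    # hypothetical stand-in for the spiderUtil helpers, for illustration only;
    # e.g. at 2020-05-01 14:20, hour_prefix(1) -> "2020-05-01 13"
    from datetime import datetime, timedelta
    return (datetime.now() - timedelta(hours=hours_ago)).strftime("%Y-%m-%d %H")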
class scNews(scrapy.Spider):
    name = "scSpider"
    start_url = [
        "http://www.sc.gov.cn/10462/10705/10709/xwfbt_list.shtml",
        "http://www.sc.gov.cn/10462/10705/10708/xwfbt_list.shtml",
        "http://www.sc.gov.cn/10462/10705/10707/xwfbt_list.shtml",
        "http://www.sc.gov.cn/10462/10705/10706/xwfbt_one.shtml"
    ]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_page_list,
                                 headers=self.header)
            # if "list" in url:
            #     for page in range(1, 6):
            #         list_url = url.split(".shtml")[0] + "_" + str(page) + ".shtml"
            #         yield scrapy.Request(url=list_url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath("//td/a/@href").extract()
        for url in news_list:
            if not url.startswith("http"):
                news_url = "http://www.sc.gov.cn" + url
                yield scrapy.Request(url=news_url, callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title_arr = response.xpath("//h2/ucaptitle/text()").extract()
                title = "".join(title_arr).strip()
            except:
                spiderUtil.log_level(6, response.url)
            try:
                author_arr = response.xpath(
                    "//ul[@id='articleattribute']/li/text()").extract()
                author = "".join(author_arr).strip()
                if author == "":
                    author = "四川省人民政府"
            except:
                spiderUtil.log_level(9, response.url)
            try:
                # convert "2020年5月1日 14时30分" into "2020-5-1 14:30:00"
                public_time = re.search(
                    r"(\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}时\d{1,2}分)",
                    response.text).group(0).replace("年", "-").replace(
                    "月", "-").replace("日", "").replace(
                    "时", ":").replace("分", "") + ":00"
            except:
                # spiderUtil.log_level(8, response.url)
                pass
            try:
                content_arr = response.xpath(
                    "//div[@id='cmsArticleContent']")[0].xpath(
                    'string(.)').extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)
            source = "http://www.sc.gov.cn/"
            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour()) or
                        public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
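# scNews normalizes dates like "2020年5月1日 14时30分" with a chain of
# str.replace calls. A hedged equivalent via strptime/strftime (note that
# strftime zero-pads month/day/hour, which the replace chain does not):
def normalize_cn_datetime(raw):
    # hypothetical helper for illustration; "2020年5月1日 14时30分"
    # -> "2020-05-01 14:30:00"
    from datetime import datetime
    return datetime.strptime(raw, "%Y年%m月%d日 %H时%M分").strftime(
        "%Y-%m-%d %H:%M:%S")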
class gzNews(scrapy.Spider):
    name = "gzSpider"
    start_url = [
        "http://www.guizhou.gov.cn/xwdt/rmyd/",
        "http://www.guizhou.gov.cn/xwdt/jrgz/",
        "http://www.guizhou.gov.cn/xwdt/gzyw/",
        "http://www.guizhou.gov.cn/xwdt/qgyw/",
        "http://www.guizhou.gov.cn/xwdt/mtkgz/",
        "http://www.guizhou.gov.cn/xwdt/djfb/",
        "http://www.guizhou.gov.cn/xwdt/tzgg/",
        "http://www.guizhou.gov.cn/xwdt/szf/ldjh/",
        "http://www.guizhou.gov.cn/xwdt/szf/ldhd/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/bm/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/gy/index.html",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/zy/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/lps/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/as/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/bj/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/tr/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/qdn/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/qn/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/qxn/",
        "http://www.guizhou.gov.cn/xwdt/dt_22/df/gaxq/"
    ]
    header = spiderUtil.header_util()

    def start_requests(self):
        for url in self.start_url:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=url,
                                 callback=self.parse_item_page_home,
                                 headers=self.header)

    def parse_item_page_home(self, response):
        yield scrapy.Request(url=response.url,
                             callback=self.parse_item_page_list,
                             headers=self.header,
                             dont_filter=True)
        # max_page = response.xpath("//div[@class='page']/script/text()").extract()[0].split("HTML(")[1].split(",")[0]
        # for page in range(1, int(max_page)):
        #     news_list_url = response.url + "index_" + str(page) + ".html"
        #     # time.sleep(random.uniform(3, 5))
        #     yield scrapy.Request(url=news_list_url, callback=self.parse_item_page_list, headers=self.header)

    def parse_item_page_list(self, response):
        news_list = response.xpath(
            "//div[@class='right-list-box']/ul/li/a/@href").extract()
        for news_url in news_list:
            # time.sleep(random.uniform(3, 5))
            yield scrapy.Request(url=news_url, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath("//h1/text()").extract()[0]
            except:
                spiderUtil.log_level(6, response.url)
            try:
                author = response.xpath(
                    "//head/meta[@name='ContentSource']/@content").extract()
                if author == []:
                    author = "贵州省人民政府"
                else:
                    author = author[0]
            except:
                spiderUtil.log_level(9, response.url)
            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0)
            except:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@class='view TRS_UEDITOR trs_paper_default trs_web']/p/text()"
                ).extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)
            source = "http://www.guizhou.gov.cn/"
            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour()) or
                        public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # hand the item to the pipelines for processing
                    yield item
                    # print(item)
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
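# gzNews falls back to a fixed author when the ContentSource meta tag is
# absent. Scrapy selectors can express that fallback directly; a hedged
# one-line sketch of the same logic inside parse():
#
#     author = response.xpath(
#         "//head/meta[@name='ContentSource']/@content").get(
#             default="贵州省人民政府")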
class heilongjiangNews(scrapy.Spider):
    name = "heilongjiangSpider"
    start_url = ["http://www.hlj.gov.cn/"]
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url="http://www.hlj.gov.cn/zwfb/zxfb/index.shtml",
                             callback=self.parse_item_page_list,
                             headers=self.header)
        yield scrapy.Request(url="http://www.hlj.gov.cn/szf/lddt/cxhd/",
                             callback=self.parse_item_page_list,
                             headers=self.header)

    def parse_item_page_list(self, response):
        detail_urls = response.xpath(
            "//div[@class='li-left hei']//span/a/@href").extract()
        for detail_url in detail_urls:
            yield scrapy.Request(url=detail_url, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                title = response.xpath(
                    """//div[@class="tm2"]/text()""").extract()[0]
            except:
                spiderUtil.log_level(6, response.url)
            try:
                authors = response.xpath(
                    """//div[@class="tm3"]/span[2]/text()""").extract()
                author = "".join(authors).replace("来源:", "").strip()
                if author == "":
                    author = "黑龙江人民政府网"
            except:
                spiderUtil.log_level(9, response.url)
            try:
                public_times = response.xpath(
                    """//div[@class="tm3"]/span[1]/text()""").extract()
                public_time = "".join(public_times).replace("时间:", "").strip()
            except:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@class='nr5']/p/text()").extract()
                content = "".join(content_arr)
            except:
                spiderUtil.log_level(7, response.url)
            source = "http://www.hlj.gov.cn/"
            try:
                if len(content) > 50 and (
                        public_time.startswith(spiderUtil.get_first_hour()) or
                        public_time.startswith(spiderUtil.get_first_twohour()) or
                        public_time.startswith(
                            spiderUtil.get_first_threehour())):
                    item = GovAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
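# GovAllItem is imported from the project's items module, whose definition is
# not shown in this file. Inferred from the fields assigned above, it is
# presumably a scrapy.Item along these lines (an assumption, not the actual
# source):
#
#     class GovAllItem(scrapy.Item):
#         source = scrapy.Field()
#         content = scrapy.Field()
#         public_time = scrapy.Field()
#         url = scrapy.Field()
#         title = scrapy.Field()
#         author = scrapy.Field()
#         crawl_time = scrapy.Field()
#         html_size = scrapy.Field()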