# E-paper spiders for a set of Chinese regional newspapers. Each spider walks
# the paper's date-indexed page tree and yields PaperAllItem records.
import datetime
import re
import sys
import time

import scrapy

import spiderUtil  # project-local helper module (import path assumed)
from items import PaperAllItem  # project-local Item class (import path assumed)


class eNews(scrapy.Spider):
    # 石狮日报 (Shishi Daily) e-paper spider.
    name = "ePaperSpider"
    start_url = "http://epaper.ssrb.com.cn/html/%s-%s/%s/node_1.htm"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2000-01-01", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(
            time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_list.append(begin_date.strftime("%Y-%m-%d"))
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            year, month, day = date.split("-")
            url = self.start_url % (year, month, day)
            # blocking throttle; Scrapy's DOWNLOAD_DELAY setting is the
            # non-blocking alternative
            time.sleep(1)
            yield scrapy.Request(url=url, callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        paper_list = response.xpath("//a[@id='pageLink']/@href").extract()
        for paper in paper_list:
            paper_url = response.url.split("node")[0] + paper
            yield scrapy.Request(url=paper_url, callback=self.parse_item_list,
                                 headers=self.header, dont_filter=True)

    def parse_item_list(self, response):
        news_list = response.xpath("//div/a/@href").extract()
        for news in news_list:
            news_url = response.url.split("node")[0] + news
            yield scrapy.Request(url=news_url, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            # size of the Python string object, not the raw HTML byte count
            html_size = sys.getsizeof(text)
            try:
                public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                        response.text).group(0) + " 00:00:00"
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@id='ozoom']/founder-content/text()").extract()
                content = "".join(content_arr).strip()
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://epaper.ssrb.com.cn/"
            author = "石狮日报"  # fixed byline for this paper
            try:
                title_arr = response.xpath(
                    "//td[@class='font01']/founder-title/text()").extract()
                title = "".join(title_arr).strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                # and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except Exception:
                # a field above may be unbound if its extraction failed; the
                # resulting NameError is swallowed here and the page skipped
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
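
# Every date-driven spider in this file rebuilds the same day-by-day loop in
# start_requests. A minimal shared helper could replace those loops; the name
# `iter_dates` is hypothetical, not part of the project:
def iter_dates(start, end=None):
    """Yield (year, month, day) zero-padded string triples, one per day."""
    begin_date = datetime.datetime.strptime(start, "%Y-%m-%d")
    if end is None:
        end = time.strftime("%Y-%m-%d", time.localtime(time.time()))
    end_date = datetime.datetime.strptime(end, "%Y-%m-%d")
    while begin_date <= end_date:
        yield (begin_date.strftime("%Y"), begin_date.strftime("%m"),
               begin_date.strftime("%d"))
        begin_date += datetime.timedelta(days=1)
# With it, a start_requests body reduces to:
#     for year, month, day in iter_dates("2000-01-01"):
#         yield scrapy.Request(self.start_url % (year, month, day), ...)
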
class ddNews(scrapy.Spider):
    # 当代商报 e-paper spider.
    name = "ddPaperSpider"
    start_url = "http://szb.ddswcm.com/Html/%s-%s-%s/Qpaper.html"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2016-10-10", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(
            time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_list.append(begin_date.strftime("%Y-%m-%d"))
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            year, month, day = date.split("-")
            url = self.start_url % (year, month, day)
            time.sleep(1)
            yield scrapy.Request(url=url, callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        paper_list = response.xpath(
            "//div[@class='verpaper']/ul/li/a/@href").extract()
        for paper in paper_list:
            if paper.startswith("Qpaper"):
                paper_url = response.url.split("Qpaper")[0] + paper
                yield scrapy.Request(url=paper_url,
                                     callback=self.parse_item_list,
                                     headers=self.header, dont_filter=True)

    def parse_item_list(self, response):
        news_list = response.xpath(
            "//div[@class='vertitle']/ul/li/a/@href").extract()
        for news in news_list:
            # if news.startswith("content"):
            news_url = response.url.split("Qpaper")[0] + news
            yield scrapy.Request(url=news_url, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                public_time = re.search(
                    r"(\d{4}年\d{1,2}月\d{1,2})", response.text
                ).group(0).replace("年", "-").replace("月", "-") + " 00:00:00"
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@id='content']/p//text()").extract()
                content = "".join(content_arr).strip()
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://szb.ddswcm.com/"
            try:
                author = response.xpath(
                    "//div[@class='property']//text()").extract()[0].split(
                        "作者:")[1].split("浏览次数")[0].strip()
                if author == "":
                    author = "当代商报"
            except Exception:
                spiderUtil.log_level(9, response.url)
            try:
                title_arr = response.xpath(
                    "//div[@class='papertitle']//text()").extract()
                title = "".join(title_arr).strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                # and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class zwNews(scrapy.Spider):
    # 中卫日报 (Zhongwei Daily) e-paper spider.
    name = "zwPaperSpider"
    start_url = "http://124.224.204.62:8081/szb/pc/%s%s/%s/l1.html"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2009-02-03", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(
            time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_list.append(begin_date.strftime("%Y-%m-%d"))
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            year, month, day = date.split("-")
            url = self.start_url % (year, month, day)
            time.sleep(1)
            yield scrapy.Request(url=url, callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        paper_list = response.xpath(
            "//li[@class='posRelative']/a/@href").extract()
        for paper in paper_list:
            paper_url = response.url.split("l1")[0] + paper
            yield scrapy.Request(url=paper_url, callback=self.parse_item_list,
                                 headers=self.header, dont_filter=True)

    def parse_item_list(self, response):
        # article links on this site are absolute, so they are used directly
        news_list = response.xpath("//li[@class='clearfix']/a/@href").extract()
        for news in news_list:
            yield scrapy.Request(url=news, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                        response.text).group(0) + " 00:00:00"
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//founder-content//text()").extract()
                content = "".join(content_arr).strip()
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://124.224.204.62:8081/szb/pc/"
            try:
                author_arr = response.xpath(
                    "//h3[@class='title-author']/text()").extract()
                if not author_arr:
                    author = "中卫日报"
                else:
                    author = author_arr[0]
            except Exception:
                spiderUtil.log_level(9, response.url)
            try:
                title = response.xpath("//h2[@id='Title']/text()").extract()[0]
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                # and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class yaNews(scrapy.Spider):
    # 延安日报 (Yan'an Daily) e-paper spider.
    name = "yaPaperSpider"
    start_url = "http://paper.yanews.cn/yarb/%s%s%s/html/index.htm"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2016-12-13", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(
            time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_list.append(begin_date.strftime("%Y-%m-%d"))
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            year, month, day = date.split("-")
            url = self.start_url % (year, month, day)
            time.sleep(1)
            yield scrapy.Request(url=url, callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        paper_list = response.xpath(
            "//a[@class='bmml_con_div_name']/@href").extract()
        for paper in paper_list:
            paper_url = response.url.split("index")[0] + paper
            yield scrapy.Request(url=paper_url, callback=self.parse_item_list,
                                 headers=self.header, dont_filter=True)

    def parse_item_list(self, response):
        news_list = response.xpath("//a[@class='bmdh_con_a']/@href").extract()
        for news in news_list:
            if news.startswith("index"):
                news_url = response.url.split("index")[0] + news
                yield scrapy.Request(url=news_url, callback=self.parse,
                                     headers=self.header)
            if news.startswith("page"):
                news_url = response.url.split("page")[0] + news
                yield scrapy.Request(url=news_url, callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                public_time = re.search(
                    r"(\d{4}年\d{1,2}月\d{1,2})", response.text
                ).group(0).replace("年", "-").replace("月", "-") + " 00:00:00"
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@id='zoom']//text()").extract()
                content = "".join(content_arr).strip()
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://paper.yanews.cn/"
            author = "延安日报"  # fixed byline for this paper
            try:
                title = response.xpath(
                    "//div[@class='bmnr_con_biaoti']/text()"
                ).extract()[0].strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                # and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class qhNews(scrapy.Spider):
    # 期货日报 (Futures Daily) e-paper spider; issues are addressed by a
    # numeric id rather than by date.
    name = "qhPaperSpider"
    start_url = "http://www.qhdb.com.cn/Newspaper/PageNavigate.aspx?nid=%s"
    header = spiderUtil.header_util()

    def start_requests(self):
        for nid in range(15, 2553):
            url = self.start_url % nid
            time.sleep(1)
            # full backfill over every known issue id
            yield scrapy.Request(url=url, callback=self.parse_item_home,
                                 headers=self.header)
            # incremental mode: jump to today's issue page instead
            # yield scrapy.Request(url=url, callback=self.parse_item_today, headers=self.header)

    # incremental mode: follow the newest link to today's issue page
    def parse_item_today(self, response):
        today_href = response.xpath("//tr/td/a/@href").extract()[-1]
        today_url = response.url.split("PageNavigate")[0] + today_href
        yield scrapy.Request(url=today_url, callback=self.parse_item_home,
                             headers=self.header, dont_filter=True)

    def parse_item_home(self, response):
        paper_list = response.xpath(
            "//div/span[@class='float']/a/@href").extract()
        for paper in paper_list:
            paper_url = response.url.split("PageNavigate")[0] + paper
            yield scrapy.Request(url=paper_url, callback=self.parse_item_list,
                                 headers=self.header, dont_filter=True)

    def parse_item_list(self, response):
        news_list = response.xpath(
            "//div[@class='p_l_bottom']/div/a/@href").extract()
        for news in news_list:
            news_url = response.url.split("PageNavigate")[0] + news
            yield scrapy.Request(url=news_url, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                public_time = re.search(
                    r"(\d{4}/\d{1,2}/\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                    response.text).group(0).replace("/", "-")
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@class='article_content']//text()").extract()
                content = "".join(content_arr).strip()
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://www.qhdb.com.cn/"
            try:
                author_arr = response.xpath(
                    "//span[@id='sZuoZhe']/text()").extract()
                author = "".join(author_arr)
                if author == "":
                    author = "期货日报"
            except Exception:
                spiderUtil.log_level(9, response.url)
            try:
                title = response.xpath(
                    "//div[@class='article_title']//text()"
                ).extract()[0].strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                # and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
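
# A hedged sketch of how the full/incremental toggle hinted at above could be
# selected per run instead of by editing the source. The subclass name, the
# `mode` argument, and the issue id 2552 are all illustrative assumptions,
# not existing project conventions:
#     scrapy crawl qhPaperSpiderInc -a mode=incremental
class QhIncrementalSpider(qhNews):
    name = "qhPaperSpiderInc"

    def start_requests(self):
        if getattr(self, "mode", "full") == "incremental":
            # one recent issue page is enough; parse_item_today then follows
            # the newest link on it (issue id 2552 is assumed, not verified)
            yield scrapy.Request(url=self.start_url % 2552,
                                 callback=self.parse_item_today,
                                 headers=self.header)
        else:
            # fall back to the full backfill defined on qhNews
            yield from super().start_requests()
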
class tsNews(scrapy.Spider):
    # 天水晚报 (Tianshui Evening News) e-paper spider.
    name = "tsPaperSpider"
    start_url = "http://dzb.tsrb.com.cn/tswb/content/%s%s%s/Page%sHO.htm"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2010-06-21", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(
            time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_list.append(begin_date.strftime("%Y-%m-%d"))
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            year, month, day = date.split("-")
            # each issue has up to 16 pages, zero-padded in the URL
            for page in range(1, 17):
                page_str = "%02d" % page
                url = self.start_url % (year, month, day, page_str)
                time.sleep(1)
                yield scrapy.Request(url=url, callback=self.parse_item_list,
                                     headers=self.header)

    def parse_item_list(self, response):
        paper_list = response.xpath("//tr/td/a/@href").extract()
        for paper in paper_list:
            # "Artice" (sic) matches the article-link prefix used by the site
            if paper.startswith("Artice"):
                paper_url = response.url.split("Page")[0] + paper
                yield scrapy.Request(url=paper_url, callback=self.parse,
                                     headers=self.header, dont_filter=True)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                public_time = re.search(
                    r"(\d{4}年\d{1,2}月\d{1,2})", response.text
                ).group(0).replace("年", "-").replace("月", "-") + " 00:00:00"
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//span[@id='contenttext']//text()").extract()
                content = "".join(content_arr).strip()
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://www.tsrb.com.cn/"
            author = "天水晚报"  # fixed byline for this paper
            try:
                title_arr = response.xpath(
                    "//div[@class='detailtitle']//text()").extract()
                title = "".join(title_arr).strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                # and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class qlNews(scrapy.Spider):
    # 齐鲁晚报 (Qilu Evening News) e-paper spider.
    name = "qlPaperSpider"
    start_url = "http://epaper.qlwb.com.cn/qlwb/content/%s%s%s/PageArticleIndexLB.htm"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2019-04-12", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(
            time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_list.append(begin_date.strftime("%Y-%m-%d"))
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            year, month, day = date.split("-")
            url = self.start_url % (year, month, day)
            time.sleep(1)
            yield scrapy.Request(url=url, callback=self.parse_item_list,
                                 headers=self.header)

    def parse_item_list(self, response):
        # the relative article link is read from the @title attribute here
        news_list = response.xpath(
            "//div[@class='linkto']/ul/li/a/@title").extract()
        for news in news_list:
            news_url = response.url.split("PageArticleIndex")[0] + news
            yield scrapy.Request(url=news_url, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                public_time = re.search(
                    r"(\d{4}年\d{1,2}月\d{1,2})", response.text
                ).group(0).replace("年", "-").replace("月", "-") + " 00:00:00"
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@id='contenttext']//text()").extract()
                content = "".join(content_arr).strip()
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://epaper.qlwb.com.cn/"
            author = "齐鲁晚报"  # fixed byline for this paper
            try:
                title = response.xpath("//h2//text()").extract()[0].strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                # and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class jjNews(scrapy.Spider):
    # 21世纪经济报道 (21st Century Business Herald) e-paper spider; only the
    # current front page is crawled, so no date loop is needed.
    name = "jjPaperSpider"
    start_url = "http://epaper.21jingji.com/"
    header = spiderUtil.header_util()

    def start_requests(self):
        yield scrapy.Request(url=self.start_url,
                             callback=self.parse_item_list,
                             headers=self.header)

    def parse_item_list(self, response):
        news_list = response.xpath(
            "//div[@class='main']/ul/li/a/@href").extract()
        for news in news_list:
            yield scrapy.Request(url=news, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                public_time = re.search(
                    r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
                    response.text).group(0) + ":00"
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@class='txtContent']//text()").extract()
                content = "".join(content_arr).strip()
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://epaper.21jingji.com/"
            try:
                author_arr = response.xpath(
                    "//div[@class='newsInfo']//text()").extract()
                author = "".join(author_arr).split(" ")[1].strip()
                if author == "":
                    author = "21世纪经济报道数字报"
            except Exception:
                spiderUtil.log_level(9, response.url)
            try:
                title_arr1 = response.xpath(
                    "//div[@class='titleHead']/h1//text()").extract()
                title = "".join(title_arr1).strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                # and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class ckNews(scrapy.Spider):
    # 经济参考报 (Economic Information Daily) e-paper spider.
    name = "ckPaperSpider"
    start_url = "http://dz.jjckb.cn/www/pages/webpage2009/html/%s-%s/%s/node_2.htm"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2018-01-02", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(
            time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_list.append(begin_date.strftime("%Y-%m-%d"))
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            year, month, day = date.split("-")
            url = self.start_url % (year, month, day)
            time.sleep(1)
            yield scrapy.Request(url=url, callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        # page URLs are generated directly: node_2.htm through node_9.htm
        for page in range(2, 10):
            paper_url = response.url.split("node")[0] + "node_%d.htm" % page
            yield scrapy.Request(url=paper_url, callback=self.parse_item_list,
                                 headers=self.header, dont_filter=True)

    def parse_item_list(self, response):
        news_list = response.xpath("//a[@class='hei12']/@href").extract()
        for news in news_list:
            if news.startswith("content"):
                news_url = response.url.split("node")[0] + news
                yield scrapy.Request(url=news_url, callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                        response.text).group(0) + " 00:00:00"
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//founder-content/p//text()").extract()
                content = "".join(content_arr).strip()
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://dz.jjckb.cn/www/pages/webpage2009/"
            try:
                # the page's 来源 ("origin") field is stored as the author
                author_arr = response.xpath(
                    "//td[@class='black12']//text()").extract()
                author = "".join(author_arr).split("来源:")[1].strip()
                if author == "":
                    author = "经济参考报"
            except Exception:
                spiderUtil.log_level(9, response.url)
            try:
                title_arr = response.xpath(
                    "//tr/td[@class='hei16b']//text()").extract()
                title_arr1 = response.xpath(
                    "//tr/td[@class='hui12']//text()").extract()
                title = "".join(title_arr).strip() + "".join(title_arr1).strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                # and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class hzNews(scrapy.Spider):
    # 杭州日报 (Hangzhou Daily) e-paper spider.
    name = "hzPaperSpider"
    start_url = "https://hzdaily.hangzhou.com.cn/hzrb/%s/%s/%s/page_list_%s%s%s.html"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2017-01-01", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(
            time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_list.append(begin_date.strftime("%Y-%m-%d"))
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            year, month, day = date.split("-")
            url = self.start_url % (year, month, day, year, month, day)
            time.sleep(1)
            yield scrapy.Request(url=url, callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        paper_list = response.xpath(
            "//li/a[@target='_parent']/@href").extract()
        for paper in paper_list:
            if paper.startswith("page"):
                # rewrite the page-detail link into its article-list variant
                paper = paper.replace("page_detail", "article_list")
                paper_url = response.url.split("page")[0] + paper
                yield scrapy.Request(url=paper_url,
                                     callback=self.parse_item_list,
                                     headers=self.header, dont_filter=True)

    def parse_item_list(self, response):
        news_list = response.xpath(
            "//ul[@class='page-list']/li/a/@href").extract()
        for news in news_list:
            if news.startswith("article_detail"):
                news_url = response.url.split("article_list")[0] + news
                yield scrapy.Request(url=news_url, callback=self.parse,
                                     headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})",
                                        response.text).group(0) + " 00:00:00"
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@class='content']//text()").extract()
                content = "".join(content_arr).strip()
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "https://hzdaily.hangzhou.com.cn/hzrb/"
            author = "杭州日报"  # fixed byline for this paper
            try:
                # kicker, headline, and subhead are concatenated into one title
                title_arr1 = response.xpath(
                    "//div[@class='head']/h1//text()").extract()
                title_arr2 = response.xpath(
                    "//div[@class='head']/h2//text()").extract()
                title_arr3 = response.xpath(
                    "//div[@class='head']/h3//text()").extract()
                title = "".join(title_arr1).strip() + "".join(
                    title_arr2).strip() + "".join(title_arr3).strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                # and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class rdNews(scrapy.Spider):
    # 劳动新闻 e-paper spider (hosted under media.workercn.cn).
    name = "rdPaperSpider"
    start_url = "http://media.workercn.cn/sites/media/jlgrb/%s_%s/%s/GR0100.htm"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2016-01-05", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(
            time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_list.append(begin_date.strftime("%Y-%m-%d"))
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            year, month, day = date.split("-")
            url = self.start_url % (year, month, day)
            time.sleep(1)
            yield scrapy.Request(url=url, callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        paper_list = response.xpath("//table/tr/td/a/@href").extract()
        for paper in paper_list:
            if paper.startswith("GR"):
                paper_url = response.url.split("GR")[0] + paper
                yield scrapy.Request(url=paper_url,
                                     callback=self.parse_item_list,
                                     headers=self.header, dont_filter=True)

    def parse_item_list(self, response):
        news_list = response.xpath("//ul/li/a/@href").extract()
        for news in news_list:
            news_url = response.url.split("GR")[0] + news
            yield scrapy.Request(url=news_url, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                public_time = re.search(
                    r"(\d{4}年\d{1,2}月\d{1,2})", response.text
                ).group(0).replace("年", "-").replace("月", "-") + " 00:00:00"
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//div[@id='ozoom']/div/span/p//text()").extract()
                content = "".join(content_arr).strip()
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://media.workercn.cn/"
            try:
                author_arr = response.xpath(
                    "//div[@class='lai']/span//text()").extract()
                author = "".join(author_arr).strip().split("(")[0]
                if author == "":
                    author = "劳动新闻"
            except Exception:
                spiderUtil.log_level(9, response.url)
            try:
                title = response.xpath("//h1//text()").extract()[0].strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                # and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
class bdNews(scrapy.Spider):
    # 半岛都市报 (Bandao Metropolis Daily) e-paper spider.
    name = "bdPaperSpider"
    start_url = "http://bddsb.bandao.cn/pc/bddsb/%s%s%s/PageA01BC.htm"
    header = spiderUtil.header_util()

    def start_requests(self):
        date_list = []
        begin_date = datetime.datetime.strptime("2018-01-01", "%Y-%m-%d")
        end_date = datetime.datetime.strptime(
            time.strftime('%Y-%m-%d', time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_list.append(begin_date.strftime("%Y-%m-%d"))
            begin_date += datetime.timedelta(days=1)
        for date in date_list:
            year, month, day = date.split("-")
            url = self.start_url % (year, month, day)
            time.sleep(1)
            yield scrapy.Request(url=url, callback=self.parse_item_home,
                                 headers=self.header)

    def parse_item_home(self, response):
        paper_list = response.xpath(
            "//div[@class='banmianlist_box']/a/@href").extract()
        for paper in paper_list:
            if paper.startswith("Page"):
                paper_url = response.url.split("Page")[0] + paper
                yield scrapy.Request(url=paper_url,
                                     callback=self.parse_item_list,
                                     headers=self.header, dont_filter=True)

    def parse_item_list(self, response):
        news_list = response.xpath(
            "//div[@class='bd_newslist']/a/@href").extract()
        for news in news_list:
            # if news.startswith("content"):
            news_url = response.url.split("Page")[0] + news
            yield scrapy.Request(url=news_url, callback=self.parse,
                                 headers=self.header)

    def parse(self, response):
        if response.status == 200:
            text = response.text
            html_size = sys.getsizeof(text)
            try:
                public_time = re.search(
                    r"(\d{4}年\d{1,2}月\d{1,2})", response.text
                ).group(0).replace("年", "-").replace("月", "-") + " 00:00:00"
            except Exception:
                spiderUtil.log_level(8, response.url)
            try:
                content_arr = response.xpath(
                    "//span[@id='contenttext']//text()").extract()
                content = "".join(content_arr).strip()
            except Exception:
                spiderUtil.log_level(7, response.url)
            source = "http://bddsb.bandao.cn/pc/bddsb/"
            author = "半岛都市报"  # fixed byline for this paper
            try:
                title_arr = response.xpath(
                    "//div[@class='neirong']/h3//text()").extract()
                title_arr1 = response.xpath(
                    "//div[@class='neirong']/h2//text()").extract()
                title = "".join(title_arr).strip() + "".join(title_arr1).strip()
            except Exception:
                spiderUtil.log_level(6, response.url)
            try:
                # and public_time.startswith(spiderUtil.get_first_hour()):
                if len(content) > 50:
                    item = PaperAllItem()
                    item["source"] = source
                    item["content"] = content
                    item["public_time"] = public_time
                    item["url"] = response.url
                    item["title"] = title
                    item["author"] = author
                    item["crawl_time"] = spiderUtil.get_time()
                    item["html_size"] = html_size
                    # print(item)
                    yield item
            except Exception:
                pass
        else:
            spiderUtil.log_level(response.status, response.url)
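
# A minimal way to run one of these spiders in-process (outside `scrapy
# crawl`), assuming this module is importable as-is; CrawlerProcess is
# Scrapy's standard runner:
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        # non-blocking politeness; an alternative to the time.sleep(1)
        # calls inside start_requests above
        "DOWNLOAD_DELAY": 1,
    })
    process.crawl(eNews)  # any spider class above works here
    process.start()       # blocks until the crawl finishes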