def parse(self, response):
    """Parse an article page of the Hainan provincial government site.

    Extracts title, author, publish time and body text and yields a
    GovAllItem only when the body exceeds 50 characters and the publish
    time falls inside the recent-hours windows reported by spiderUtil.
    """
    if response.status != 200:
        # Non-200 responses are only logged, never retried here.
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item as a metric.
    html_size = sys.getsizeof(text)
    try:
        title = response.xpath(
            "//div[@class='title_cen mar-t2 text']/ucaptitle/text()"
        ).extract()[0].strip()
    except Exception:  # was a bare except; code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    try:
        author = response.xpath("//span[@id='ly']/text()").extract()
        if author == []:
            author = "海南省人民政府"  # fall back to the site's own name
        else:
            author = author[0]
    except Exception:  # code 9 = author failure
        spiderUtil.log_level(9, response.url)
    try:
        public_time = re.search(
            r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
            response.text).group(0) + ":00"
    except Exception:  # code 8 = publish-time failure
        spiderUtil.log_level(8, response.url)
    try:
        content_arr = response.xpath(
            "//div[@id='zoom']/div[@id='font']/ucapcontent/p/text()"
        ).extract()
        content = "".join(content_arr)
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    source = "http://www.hainan.gov.cn/"
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            # Hand the item to the pipelines.
            yield item
    except Exception:
        # title/author/public_time/content may be unbound when extraction
        # failed above; such pages are deliberately skipped.
        pass
def parse(self, response):
    """Parse an article page of the Fujian provincial government site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        title_arr = response.xpath(
            "//div[@class='xl-nr clearflx']//h3/text()").extract()
        title = "".join(title_arr).strip()
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    try:
        author_arr = response.xpath(
            "//div[@class='xl-nr clearflx']//h5/span/text()").extract()
        # Author appears bracketed on the page; strip the brackets.
        author = "".join(author_arr).replace("[", "").replace("]", "").strip()
    except Exception:  # code 9 = author failure
        spiderUtil.log_level(9, response.url)
    try:
        content_time = response.xpath(
            "//div[@class='xl-nr clearflx']//h5/text()").extract()
        public_time = "".join(content_time).replace("字号:", "").replace(
            "|", "").strip()
        public_time = str(public_time) + ":00"
    except Exception:  # code 8 = publish-time failure
        spiderUtil.log_level(8, response.url)
    source = "http://www.fujian.gov.cn/"
    try:
        content_detail = response.xpath(
            "//div[@class='TRS_Editor']//text()").extract()
        content = "".join(content_detail).strip()
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass
def parse(self, response):
    """Parse an article page of the Guangxi regional government site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        title_arr = response.xpath(
            "//div[@class='article']/h1/text()").extract()
        title = "".join(title_arr).strip()
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    try:
        author_arr = response.xpath(
            "//div[@class='article-inf-left']/text()").extract()
        author = "".join(author_arr)
        if author == "":
            author = "广西壮族自治区人民政府"  # default to the site's own name
    except Exception:  # code 9 = author failure
        spiderUtil.log_level(9, response.url)
    try:
        public_time = re.search(
            r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
            response.text).group(0)
    except Exception:  # code 8 = publish-time failure
        spiderUtil.log_level(8, response.url)
    try:
        content_arr = response.xpath(
            "//div[@class='article-con']/p/text()").extract()
        content = "".join(content_arr)
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    source = "http://www.gxzf.gov.cn/"
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            # Hand the item to the pipelines.
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass
def parse(self, response):
    """Parse an article page of the Chongqing municipal government site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        title_arr = response.xpath(
            "//h2[@class='title']/text()").extract()
        title = "".join(title_arr).strip()
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    try:
        author_arr = response.xpath(
            "//span[@class='fl']/span/text()").extract()
        author = "".join(author_arr).strip()
        if author == "":
            author = "重庆市人民政府"  # default to the site's own name
        else:
            # The span reads "来源:<name>"; keep only the name part.
            author = author.split("来源:")[1]
    except Exception:  # code 9 = author failure
        spiderUtil.log_level(9, response.url)
    try:
        public_time = re.search(
            r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
            response.text).group(0) + ":00"
    except Exception:  # code 8 = publish-time failure
        spiderUtil.log_level(8, response.url)
    try:
        # string(.) flattens the whole container to text; drop the
        # reviewer footer that follows "终审 :".
        content_arr = response.xpath(
            "//div[@class='conTxt']")[0].xpath('string(.)').extract()
        content = "".join(content_arr).split("终审 :")[0].strip()
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    source = "http://www.cq.gov.cn/"
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            # Hand the item to the pipelines.
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass
def parse(self, response):
    """Parse an article page of the Guizhou provincial government site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        title = response.xpath("//h1/text()").extract()[0]
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    try:
        # The page exposes the originating outlet in a meta tag.
        author = response.xpath(
            "//head/meta[@name='ContentSource']/@content").extract()
        if author == []:
            author = "贵州省人民政府"  # default to the site's own name
        else:
            author = author[0]
    except Exception:  # code 9 = author failure
        spiderUtil.log_level(9, response.url)
    try:
        public_time = re.search(
            r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
            response.text).group(0)
    except Exception:  # code 8 = publish-time failure
        spiderUtil.log_level(8, response.url)
    try:
        content_arr = response.xpath(
            "//div[@class='view TRS_UEDITOR trs_paper_default trs_web']/p/text()"
        ).extract()
        content = "".join(content_arr)
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    source = "http://www.guizhou.gov.cn/"
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            # Hand the item to the pipelines.
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass
def parse(self, response):
    """Parse an article page of the Tibet regional government site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        title = response.xpath("//div[@class='xz-xl-tit']/h3/text()"
                               ).extract()[0].strip()
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    try:
        author = response.xpath(
            "//div[@class='xz-xl-info']/p/span/text()").extract()
        if len(author) == 2:
            author = author[1]  # second span carries the source outlet
        else:
            author = "西藏自治区人民政府"  # default to the site's own name
    except Exception:  # code 9 = author failure
        spiderUtil.log_level(9, response.url)
    try:
        public_time = re.search(
            r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
            response.text).group(0)
    except Exception:  # code 8 = publish-time failure
        spiderUtil.log_level(8, response.url)
    try:
        content_arr = response.xpath("//div[@class='xz-xl-article']"
                                     )[0].xpath("string(.)").extract()
        # Cut at the attachment-download script marker, then drop the last
        # 106 chars — presumably a fixed page footer; confirm if the
        # template changes.
        content = "".join(content_arr).split(
            "//显示下载附件")[0][:-106].strip()
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    source = "http://www.xizang.gov.cn/"
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            # Hand the item to the pipelines.
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass
def parse(self, response):
    """Parse an article page of the Anhui provincial government site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        # BUGFIX: was response.status_code — Scrapy responses expose
        # `status`; the old attribute raised AttributeError on non-200.
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        titles = response.xpath(
            "//div[@class='wztit']//text()").extract()
        title = "".join(titles).strip()
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    try:
        author_arr = response.xpath(
            "//div[@class='wzbjxx']/p/text()[3]").extract()
        author = "".join(author_arr).replace("来源:", "").strip()
    except Exception:  # code 9 = author failure
        spiderUtil.log_level(9, response.url)
    try:
        # Date and time live in two separate text nodes of the same <p>.
        content_date = response.xpath(
            "//div[@class='wzbjxx']/p/text()[1]").extract()[0]
        content_time = response.xpath(
            "//div[@class='wzbjxx']/p/text()[2]").extract()[0]
        public_time = str(content_date) + " " + str(content_time) + ":00"
    except Exception:  # code 8 = publish-time failure
        spiderUtil.log_level(8, response.url)
    source = "http://www.ah.gov.cn/"
    try:
        content_detail = response.xpath(
            "//div[@class='wzcon']//text()").extract()
        content = "".join(content_detail).strip()
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass
def parse(self, response):
    """Parse an article page of the Shanxi provincial government site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        # BUGFIX: was response.status_code — Scrapy responses expose
        # `status`; the old attribute raised AttributeError on non-200.
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        content_title = response.xpath(
            "//div[starts-with(@class,'detail-article-title')]//text()"
        ).extract()
        title = "".join(content_title).strip()
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    try:
        content_author = response.xpath(
            "/html/body/div/div/div/div/ul/li/span[2]/text()").extract()
        author = "".join(content_author).strip()
    except Exception:  # code 9 = author failure
        spiderUtil.log_level(9, response.url)
    try:
        content_time = response.xpath(
            "/html/body/div/div/div/div/ul/li/span[1]/text()").extract()
        public_time = str(content_time[0].strip()) + ":00"
    except Exception:  # code 8 = publish-time failure
        spiderUtil.log_level(8, response.url)
    source = "http://www.shanxi.gov.cn/"
    try:
        content_detail = response.xpath(
            "//div[@class='TRS_Editor']//text()").extract()
        content = "".join(content_detail).strip()
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass
def parse(self, response):
    """Parse an article page of the Jilin provincial government site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        # BUGFIX: was response.status_code — Scrapy responses expose
        # `status`; the old attribute raised AttributeError on non-200.
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        content_title = response.xpath(
            '//td[@align="center"]/text()').extract()
        title = "".join(content_title)
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    try:
        authors = response.xpath(
            '//ul[@class="list"]/li[2]/text()').extract()
        author = "".join(authors).replace("来源:", "").strip()
    except Exception:  # code 9 = author failure
        spiderUtil.log_level(9, response.url)
    try:
        content_time = response.xpath(
            '//ul[@class="list"]/li[1]/text()').extract()
        public_time = str(content_time[0]).replace("发布日期:", "").strip()
    except Exception:  # code 8 = publish-time failure
        spiderUtil.log_level(8, response.url)
    source = "http://www.jl.gov.cn/"
    try:
        content_detail = response.xpath(
            '//div[@id="zoom"]//text()').extract()
        # join() replaces the old quadratic += concatenation loop.
        content = "".join(content_detail)
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass
def parse(self, response):
    """Parse an article page of the Hubei provincial government site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        # BUGFIX: was response.status_code — Scrapy responses expose
        # `status`; the old attribute raised AttributeError on non-200.
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        title = response.xpath(
            "//*[@class='text-center']/text()").extract()[0]
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    try:
        author = str(
            response.xpath(
                "//*[@class='list-unstyled list-inline']/li[2]/span/text()"
            ).extract()[0]).replace("来源:", "")
    except Exception:  # code 9 = author failure
        spiderUtil.log_level(9, response.url)
    try:
        public_time = str(
            response.xpath(
                "//*[@class='list-unstyled list-inline']/li[1]/span/text()"
            ).extract()[0]).replace("发布时间:", "").strip() + ":00"
    except Exception:  # code 8 = publish-time failure
        spiderUtil.log_level(8, response.url)
    source = "http://www.hubei.gov.cn/"
    try:
        content_detail = response.xpath(
            "//*[@class='TRS_Editor']//text()").extract()
        # join() replaces the old quadratic += concatenation loop.
        content = "".join(content_detail)
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass
def parse(self, response):
    """Parse an article page of the Jiangxi provincial government site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        # BUGFIX: was response.status_code — Scrapy responses expose
        # `status`; the old attribute raised AttributeError on non-200.
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        content_title = response.xpath(
            "//div[@class='artile_zw']/div/p/text()").extract()
        title = "".join(content_title).strip()
        # Removed stray debug print(title) left over from development.
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    try:
        author = response.xpath(
            "//div[@id='zoom']//font/text()[2]").extract()[0]
    except Exception:  # code 9 = author failure
        spiderUtil.log_level(9, response.url)
    try:
        content_time = response.xpath(
            "//div[@class='sp_time screen']/font[1]/text()").extract()[0]
        public_time = str(content_time).replace("发布时间:", "")
    except Exception:  # code 8 = publish-time failure
        spiderUtil.log_level(8, response.url)
    source = "http://www.jiangxi.gov.cn/"
    try:
        content_detail = response.xpath(
            "//div[@id='zoom']/p/text()").extract()
        content = "".join(content_detail)
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass
def parse(self, response):
    """Parse an article page of the Jiangsu provincial government site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        title = response.xpath(
            "//div[@class='sp_title']/text()").extract()[0]
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    try:
        authors = response.xpath(
            "//div[@class='sp_time']/font[2]/text()").extract()
        author = "".join(authors).replace("来源:", "").strip()
        if author == "":
            author = "江苏人民政府网"  # default to the site's own name
    except Exception:  # code 9 = author failure
        spiderUtil.log_level(9, response.url)
    try:
        public_times = response.xpath(
            "//div[@class='sp_time']/font[1]/text()").extract()[0]
        public_time = str(str(public_times).replace("发布日期:", "")) + ":00"
    except Exception:  # code 8 = publish-time failure
        spiderUtil.log_level(8, response.url)
    try:
        content_arr = response.xpath(
            "//div[@id='zoom']//text()").extract()
        content = "".join(content_arr)
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    source = "http://www.js.gov.cn/"
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass
def parse(self, response):
    """Parse an article page of the Jilin provincial government site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        # BUGFIX: was response.status_code — Scrapy responses expose
        # `status`; the old attribute raised AttributeError on non-200.
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        content_title = response.xpath(
            '//div[@id="dbt"]//text()').extract()
        title = "".join(content_title).strip()
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    # Constant assignment cannot raise; the old try/except around it was dead code.
    author = "吉林省人民政府"
    try:
        content_time = response.xpath(
            '//div[@class="c_xx"]//text()').extract()
        # The header line splits into fields; the second one is the timestamp.
        public_times = str(content_time[0]).split(" ")
        public_time = str(public_times[1])
    except Exception:  # code 8 = publish-time failure
        spiderUtil.log_level(8, response.url)
    source = "http://www.jl.gov.cn/"
    try:
        content_detail = response.xpath(
            "//div[@class='TRS_Editor']//text()").extract()
        content = "".join(content_detail).strip()
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass
def parse(self, response):
    """Parse an article page of the Macau SAR government news site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        # BUGFIX: was response.status_code — Scrapy responses expose
        # `status`; the old attribute raised AttributeError on non-200.
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        # <title> is "article – site"; keep the part before the en dash.
        title = response.xpath(
            "//head/title/text()").extract()[0].split("–")[0].strip()
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    try:
        author = response.xpath("//dl/dd")[0].xpath(
            'string(.)').extract()[0]
    except Exception:  # code 9 = author failure
        spiderUtil.log_level(9, response.url)
    try:
        public_time = re.search(
            r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
            response.text).group(0)
    except Exception:  # code 8 = publish-time failure
        spiderUtil.log_level(8, response.url)
    source = "https://www.gov.mo/"
    try:
        content_arr = response.xpath("//article/p/text()").extract()
        content = "".join(content_arr)
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            # Hand the item to the pipelines.
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass
def parse(self, response):
    """Parse an article page of the Henan provincial government site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        # BUGFIX: was response.status_code — Scrapy responses expose
        # `status`; the old attribute raised AttributeError on non-200.
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        titles = response.xpath(
            "//*[@id='title']//text()").extract()
        title = "".join(titles).strip()
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    try:
        authors = response.xpath(
            "//*[@id='source']//text()").extract()
        author = "".join(authors).strip()
    except Exception:  # code 9 = author failure
        spiderUtil.log_level(9, response.url)
    try:
        public_time = re.search(
            r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})",
            response.text).group(0) + ":00"
    except Exception:
        # Deliberately silent: missing timestamps are common on this site
        # and the page is dropped by the freshness check below anyway.
        pass
    source = "http://www.henan.gov.cn/"
    try:
        content_detail = response.xpath(
            "//*[@class='content']//text()").extract()
        # join() replaces the old quadratic += concatenation loop.
        content = "".join(content_detail)
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass
def parse(self, response):
    """Parse an article page of the Shandong provincial government site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        # BUGFIX: was response.status_code — Scrapy responses expose
        # `status`; the old attribute raised AttributeError on non-200.
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        title = response.xpath(
            "//div[@class='xq-tit']/span/text()").extract()[0]
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    try:
        author = str(response.xpath(
            "//div[@class='R-tit']/span[2]/text()").extract()[0]).replace("来源:", "")
    except Exception:  # code 9 = author failure
        spiderUtil.log_level(9, response.url)
    try:
        public_time = re.search(
            r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
            response.text).group(0)
    except Exception:  # code 8 = publish-time failure
        spiderUtil.log_level(8, response.url)
    # NOTE(review): "shangdong" looks like a typo for "shandong", but this
    # string is stored data consumed downstream — confirm before changing.
    source = "http://www.shangdong.gov.cn/"
    try:
        content_detail = response.xpath(
            "//div[@class='article']//text()").extract()
        content = "".join(content_detail)
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass
def parse(self, response):
    """Parse an article page of the Shanghai municipal government site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        title = response.xpath(
            "//div[@id='ivs_title']/text()").extract()[0].strip()
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    try:
        author = response.xpath(
            "//small[@class='PBtime']/text()").extract()[0].split("来源:")[1]
        if author == "":
            author = "上海市人民政府"  # default to the site's own name
    except Exception:  # code 9 = author failure
        spiderUtil.log_level(9, response.url)
    try:
        # Normalize "YYYY年M月D日   H : M" into "YYYY-M-D H:M:00".
        public_time = re.search(
            r"(\d{4}年\d{1,2}月\d{1,2}日\s\s\s\d{1,2}\s:\s\d{1,2})",
            response.text).group(0).replace("年", "-").replace("月", "-").replace("日", "").replace(" ", " ").replace(" : ", ":") + ":00"
    except Exception:  # code 8 = publish-time failure
        spiderUtil.log_level(8, response.url)
    try:
        content_arr = response.xpath(
            "//div[@id='ivs_content']")[0].xpath('string(.)').extract()
        content = "".join(content_arr)
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    source = "http://www.shanghai.gov.cn/"
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            # Hand the item to the pipelines.
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass
def parse(self, response):
    """Parse an article page of the Hebei provincial government site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        title = response.xpath(
            '//h2[@class="cont_title"]/text()').extract()[0]
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    # Constant assignment cannot raise; the old try/except around it was dead code.
    author = "河北省人民政府网"
    try:
        public_timess = response.xpath(
            '//li[@class="xl_shijian"]//text()').extract()[0]
        # The page gives only a date; pin the time component to midnight.
        public_times = str(public_timess).replace("年", "-").replace(
            "月", "-").replace("日", "").strip()
        public_time = str(public_times) + " 00:00:00"
    except Exception:  # code 8 = publish-time failure
        spiderUtil.log_level(8, response.url)
    try:
        content_arr = response.xpath(
            "//div[@id='zoom']/p/text()").extract()
        content = "".join(content_arr)
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    source = "http://www.hebei.gov.cn/"
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass
def parse(self, response):
    """Parse an article page of the Sichuan provincial government site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        title_arr = response.xpath("//h2/ucaptitle/text()").extract()
        title = "".join(title_arr).strip()
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    try:
        author_arr = response.xpath(
            "//ul[@id='articleattribute']/li/text()").extract()
        author = "".join(author_arr).strip()
        if author == "":
            author = "四川省人民政府"  # default to the site's own name
    except Exception:  # code 9 = author failure
        spiderUtil.log_level(9, response.url)
    try:
        # Normalize "YYYY年M月D日 H时M分" into "YYYY-M-D H:M:00".
        public_time = re.search(
            r"(\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}时\d{1,2}分)",
            response.text).group(0).replace("年", "-").replace("月", "-").replace("日", "").replace("时", ":").replace("分", "") + ":00"
    except Exception:
        # Deliberately silent: pages without a timestamp are dropped by
        # the freshness check below.
        pass
    try:
        content_arr = response.xpath(
            "//div[@id='cmsArticleContent']")[0].xpath('string(.)').extract()
        content = "".join(content_arr)
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    source = "http://www.sc.gov.cn/"
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            # Hand the item to the pipelines.
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass
def parse(self, response):
    """Parse an article page of the Liaoning provincial government site.

    Yields a GovAllItem when the body exceeds 50 characters and the
    publish time falls inside the recent-hours windows from spiderUtil.
    """
    if response.status != 200:
        # BUGFIX: was response.status_code — Scrapy responses expose
        # `status`; the old attribute raised AttributeError on non-200.
        spiderUtil.log_level(response.status, response.url)
        return
    text = response.text
    # Shallow in-memory size of the page text; stored on the item.
    html_size = sys.getsizeof(text)
    try:
        titles = response.xpath(
            '//td[@align="center"]/text()').extract()
        title = "".join(titles).replace("来源:", "").strip()
    except Exception:  # code 6 = title failure (file convention)
        spiderUtil.log_level(6, response.url)
    try:
        # The same table cell holds "发布时间:... 信息来源:<name>".
        content_author = response.xpath(
            '//table[@class="time"]//td[@align="left"]/text()').extract()
        authors = content_author[0].split(" 信息来源:")
        author = str(authors[1])
    except Exception:  # code 9 = author failure
        spiderUtil.log_level(9, response.url)
    try:
        content_time = response.xpath(
            '//table[@class="time"]//td[@align="left"]/text()').extract()
        public_times = str(content_time[0]).split(" 信息来源:")
        # Only a date is published; pin the time component to midnight.
        public_time = str(
            str(public_times[0]).replace("发布时间:", "").replace(
                "年", "-").replace("月", "-").replace("日", "")
            + " 00:00:00").strip()
    except Exception:  # code 8 = publish-time failure
        spiderUtil.log_level(8, response.url)
    source = "http://www.ln.gov.cn/"
    try:
        content_detail = response.xpath(
            "//div[@class='TRS_Editor']/div/p/text()").extract()
        # join() replaces the old quadratic += concatenation loop.
        content = "".join(content_detail)
    except Exception:  # code 7 = body failure
        spiderUtil.log_level(7, response.url)
    try:
        if len(content) > 50 and (
                public_time.startswith(spiderUtil.get_first_hour())
                or public_time.startswith(spiderUtil.get_first_twohour())
                or public_time.startswith(spiderUtil.get_first_threehour())):
            item = GovAllItem()
            item["source"] = source
            item["content"] = content
            item["public_time"] = public_time
            item["url"] = response.url
            item["title"] = title
            item["author"] = author
            item["crawl_time"] = spiderUtil.get_time()
            item["html_size"] = html_size
            yield item
    except Exception:
        # Extraction variables may be unbound when a step failed above.
        pass