def parsebody(self, response):
    if response.status == 200:
        text = response.text
        html_size = sys.getsizeof(text)
        try:
            content_arr = response.xpath(
                "//div[@class='content-article']/p//text()").extract()
            content = "".join(content_arr).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)
        source = "https://news.qq.com/"
        try:
            if content != "" and str(response.meta["public_time"]).startswith(
                    spiderUtil.get_first_hour()):
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = response.meta["public_time"]
                item["url"] = response.url
                item["title"] = response.meta["title"]
                item["author"] = response.meta["author"]
                item["html_size"] = html_size
                item["crawl_time"] = spiderUtil.get_time()
                # print(item)
                yield item
        except Exception:
            pass
    else:
        spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_arr = response.xpath("""//div[contains(@id,'detail')]//p/text()""").extract() content = "".join(content_arr) except: spiderUtil.log_level(7, response.url) source = "http://www.xinhuanet.com/" try: if content != "" and str(response.meta["public_time"]).startswith(spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = response.meta["public_time"] item["url"] = response.url item["title"] = response.meta["title"] item["author"] = response.meta["author"] item["html_size"] = html_size item["crawl_time"] = spiderUtil.get_time() # print(item) yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_arr = response.xpath( """//article//p//text()""").extract() content = "".join(content_arr) except: spiderUtil.log_level(7, response.url) source = "http://news.sohu.com/" try: if content != "" and str( response.mete['public_time']).startswith( spiderUtil.get_first_hour()): item = NewsAllItem() item["public_time"] = response.mete['public_time'] item["url"] = response.mete['url'] item["title"] = response.mete['title'] item["author"] = response.mete['author'] item["source"] = source item["content"] = content item["html_size"] = html_size item["crawl_time"] = spiderUtil.get_time() # print(self.item) yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_arr = response.xpath( "//div[@class='section-main']/p/text()").extract() content = "".join(content_arr) # content = "".join(content_tmp.split()) except: spiderUtil.log_level(7, response.url) try: public_time = re.search(r"(\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})", response.text).group(0) + ":00" public_time = response.url.split( "/")[-3][:-2] + "-" + public_time except: # spiderUtil.log_level(8, response.url) pass try: title_arr = response.xpath("//head/title//text()").extract() title = "".join(title_arr).strip() except: spiderUtil.log_level(6, response.url) try: author_arr = response.xpath( "//span[@id='copyfrom']//text()").extract() author = "".join(author_arr).strip() if author == "": author = "中青在线" except: spiderUtil.log_level(9, response.url) source = "http://www.cyol.com/" try: if content != "" and len( content) >= 100 and public_time.startswith( spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # 数据打入piplines处理 yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response):
    if response.status == 200:
        text = response.text
        html_size = sys.getsizeof(text)
        try:
            # content_time = response.xpath('//*[@id="pubtime_baidu"]/text()').extract()
            public_time = re.search(
                r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})",
                response.text).group(0)
        except Exception:
            spiderUtil.log_level(8, response.url)
        try:
            contents = response.xpath(
                '//*[@id="articleBody"]/p/text()').extract()
            content = "".join(contents)
        except Exception:
            spiderUtil.log_level(7, response.url)
        source = "http://www.china.com.cn/"
        try:
            author_arr = response.xpath(
                '//*[@id="source_baidu"]//text()').extract()
            author = "".join(author_arr)
            if author == '':
                author = "中国网"
            else:
                author = author.split("来源:")[1].strip()
        except Exception:
            spiderUtil.log_level(9, response.url)
        try:
            title_arr = response.xpath("/html/body/div/h1/text()").extract()
            title = "".join(title_arr).strip()
        except Exception:
            spiderUtil.log_level(6, response.url)
        try:
            if content != "" and public_time.startswith(
                    spiderUtil.get_first_hour()):
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                # print(item)
                yield item
        except Exception:
            pass
    else:
        spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: public_time = re.search( r"(\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}:\d{1,2})", response.text).group(0).replace("年", "-").replace( "月", "-").replace("日", "") + ":00" except: spiderUtil.log_level(8, response.url) try: content_arr = response.xpath( """/html/body/div/div/div/p/text()""").extract() content = "".join(content_arr).strip() except: spiderUtil.log_level(7, response.url) source = "http://www.southcn.com/" try: author_arr = response.xpath( """/html/body/div/div/div/div/span/i/a/text()""").extract( ) author = "".join(author_arr).strip() if author == "": author = "央视网" else: author = author.replace("来源:", "") except: spiderUtil.log_level(9, response.url) try: title_arr = response.xpath( """/html/body/div/div/div/h1/text()""").extract() title = "".join(title_arr).strip() except: spiderUtil.log_level(6, response.url) try: if content != "" and public_time.startswith( spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # print(item) yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: public_time = re.search( r"(\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}:\d{1,2})", response.text).group(0).replace("年", "-").replace( "月", "-").replace("日", "") + ":00" except: # spiderUtil.log_level(8, response.url) pass try: content_arr = response.xpath( "//div[@class='left_zw']/p//text()").extract() content = "".join(content_arr).strip() # content = "".join(content_tmp.split()) except: spiderUtil.log_level(7, response.url) try: title_arr = response.xpath( "//div/div/div/div/h1//text()").extract() title = "".join(title_arr) except: spiderUtil.log_level(6, response.url) try: author_arr = response.xpath( "//div[@class='left-t']//text()").extract() author = "".join(author_arr) if author == "": author = "中国新闻网" except: spiderUtil.log_level(9, response.url) source = "http://www.chinanews.com/" try: if content != "" and public_time.startswith( spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # 数据打入piplines处理 # print(item) yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: public_time = re.search( r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})", response.text).group(0) except: # spiderUtil.log_level(8, response.url) pass try: content_arr = response.xpath( """//div/div/div/div/div/p/text()""").extract() content = "".join(content_arr).strip() # content = "".join(content_tmp.split()) except: spiderUtil.log_level(7, response.url) try: title_arr = response.xpath("//div/div/h2/text()").extract() title = "".join(title_arr) except: spiderUtil.log_level(6, response.url) try: author_arr = response.xpath( """//*[@id="xl-headline"]/div/div/text()""").extract() author = "".join(author_arr).strip() if author == "": author = "大众网" else: author = author.split("来源: ")[1].split("作者:")[0].strip() except: spiderUtil.log_level(9, response.url) source = "http://www.dzwww.com/" try: if content != "" and public_time.startswith( spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # 数据打入piplines处理 # print(item) yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_time = response.xpath( "//main/section/section/div/span[4]/text()").extract() public_time = str(str(content_time[0]) + ":00") except: spiderUtil.log_level(8, response.url) try: content_arr = response.xpath( "/html/body/main/section/section/article/section/p/text()" ).extract() content = "".join(content_arr) except: spiderUtil.log_level(7, response.url) source = "http://www.rednet.cn/" try: author_arr = response.xpath( "//main/section/section/div/span[1]/text()").extract() author = "".join(author_arr).strip() if author == "": author = "红网" else: author = author.split("来源:")[1] except: spiderUtil.log_level(9, response.url) try: title_arr = response.xpath( "//main/section/section/h1/text()").extract() title = "".join(title_arr).strip() except: spiderUtil.log_level(6, response.url) try: # if content != "" and public_time.startswith(spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size print(item) # yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_time = response.xpath( """//*[@id="main"]/div/div/div/div/div/span[1]/text()""" ).extract() public_time = str(str(content_time[0]) + ":00") except: spiderUtil.log_level(8, response.url) try: content_arr = response.xpath( """//*[@id="article-content"]/p/text()""").extract() content = "".join(content_arr) except: spiderUtil.log_level(7, response.url) source = "http://www.71.cn/" try: author_arr = response.xpath( """//*[@id="main"]/div/div/div/div/div/span[2]/text()""" ).extract() author = "".join(author_arr) except: spiderUtil.log_level(9, response.url) try: title = response.xpath( """//*[@id="main"]/div/div/div/div/h1/text()""").extract( )[0] except: spiderUtil.log_level(6, response.url) try: if content != "" and public_time.startswith( spiderUtil.get_first_hour()): # if content != "" : item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size print(content, public_time, title, author) yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_arr = response.xpath("//div[@class='TRS_Editor']" )[0].xpath('string(.)').extract() content = "".join(content_arr) # content = "".join(content_tmp.split()) except: spiderUtil.log_level(7, response.url) try: author_arr = response.xpath( "//meta[@name='author']/@content").extract() if author_arr == []: author = "中国青年网" else: author = author_arr[0] except: spiderUtil.log_level(9, response.url) source = "http://www.youth.cn/" try: title = "".join( response.xpath("//head/title/text()").extract()[0].split( "_")[:-2]).strip() except: spiderUtil.log_level(6, response.url) try: public_time = re.search( r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})", response.text).group(0) except: spiderUtil.log_level(8, response.url) try: if content != "" and public_time.startswith( spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # 数据打入piplines处理 yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_time = response.xpath( """//meta[@name="PubDate"]//@content""").extract() # print(content_time) # content_times = re.search(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})", content_time[0]).group(0) # print(content_times) public_time = str(content_time[0]) except: spiderUtil.log_level(8, response.url) try: content_arr = response.xpath( """//div[@class="content"]//p//text()""").extract() content = "".join(content_arr) except: spiderUtil.log_level(7, response.url) source = "http://www.comnews.cn/" try: author = response.xpath( """//meta[@name="ContentSource"]//@content""").extract( )[0].strip() except: spiderUtil.log_level(9, response.url) try: titles = response.xpath( """//meta[@name="ArticleTitle"]//@content""").extract() title = "".join(titles) except: spiderUtil.log_level(6, response.url) try: # if content != "" and str(public_time).startswith(spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size print(item) # yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", response.text).group(0) + "00:00:00" except: spiderUtil.log_level(8, response.url) try: content_arr = response.xpath( "//div[@class='TRS_Editor']/div/p//text()").extract() content = "".join(content_arr).strip() except: spiderUtil.log_level(7, response.url) source = "http://www.wenming.cn/" try: author_arr = response.xpath( "//div[@class='box01']/div[@class='fl']/a//text()" ).extract() author = "".join(author_arr).strip() if author == "": author = "文明网" except: spiderUtil.log_level(9, response.url) try: title_arr = response.xpath( "//div[@id='title_tex']//text()").extract() title = "".join(title_arr).strip() except: spiderUtil.log_level(6, response.url) try: if content != "" and public_time.startswith( spiderUtil.get_yesterday_date()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # print(item) yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_arr = response.xpath( "//div[@id='Content']//text()").extract() content = "".join(content_arr).strip() # content = "".join(content_tmp.split()) except: spiderUtil.log_level(7, response.url) try: title_arr = response.xpath("//head/title//text()").extract() title = "".join(title_arr).strip().strip()[:-8] except: spiderUtil.log_level(6, response.url) try: author = response.xpath( "//head/meta[@name='author']/@content").extract() if author == []: author = "中国日报网" else: author = author[0] except: spiderUtil.log_level(9, response.url) source = "http://www.chinadaily.com.cn/" try: public_time = re.search( r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})", response.text).group(0) + ":00" except: # spiderUtil.log_level(8, response.url) pass try: if content != "" and public_time.startswith( spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # 数据打入piplines处理 # print(item) yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: public_time = re.search( r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})", response.text).group(0) except: spiderUtil.log_level(8, response.url) try: content_arr = response.xpath("//div[@class='left newstext']")[ 0].xpath('string(.)').extract()[0] content = "".join(content_arr) # content = "".join(content_tmp.split()) except: spiderUtil.log_level(7, response.url) try: author_list = [ "三湘风纪", "搜狐", "英国报姐", "我们爱历史", "最爱历史", "慢青年", "伟人勘察", "大象公会", "中国历史文化网", "凤凰", "腾讯", "新浪", "解放日报", "参考消息", "新华网", "红瞰天下", "海外网", "人民网", "中华读书报", "今日头条", "中国新闻网" ] author = author_list[random.randint(0, len(author_list) - 1)] except: spiderUtil.log_level(9, response.url) try: title = response.xpath( "//head/meta[@name='description']/@content").extract()[0] except: spiderUtil.log_level(6, response.url) source = "http://www.xilu.com/" try: if content != "" and public_time.startswith( spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # 数据打入piplines处理 yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: public_time = re.search( r"(\d{4}-\d{1,2}-\d{1,2}T\d{1,2}:\d{1,2})", response.text).group(0).replace("T", " ") + ":00" except: spiderUtil.log_level(8, response.url) try: content_arr = response.xpath("//div[@id='post_description']" )[0].xpath('string(.)').extract() content = "".join(content_arr) # content = "".join(content_tmp.split()) except: spiderUtil.log_level(7, response.url) try: title = response.xpath("//head/title/text()").extract()[0][:-3] except: spiderUtil.log_level(6, response.url) try: author_arr = response.xpath( "//div[@id='post_author']/text()").extract() author = "".join(author_arr) if author == "": author = "亿欧网" except: spiderUtil.log_level(9, response.url) source = "https://www.iyiou.com/" try: if content != "" and public_time.startswith( spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # 数据打入piplines处理 yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: public_time = response.xpath( "//head/meta[@name='publishdate']/@content" ).extract()[0].replace("年", "-").replace("月", "-").replace( "日", " ") + ":00" except: spiderUtil.log_level(8, response.url) try: content_arr = response.xpath("//div[@class='cl']")[0].xpath( 'string(.)').extract() content = "".join(content_arr) # content = "".join(content_tmp.split()) except: spiderUtil.log_level(7, response.url) try: author = response.xpath("//head/meta[@name='source']/@content" ).extract()[0].split("-")[1] except: spiderUtil.log_level(9, response.url) try: title = response.xpath( "//head/meta[@itemprop='name']/@content").extract()[0] except: spiderUtil.log_level(6, response.url) source = "https://www.dahe.cn/" try: if content != "" and public_time.startswith( spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # 数据打入piplines处理 yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_time = response.xpath( """//*[@id="pubtime_baidu"]/text()""").extract() public_time = str(content_time[0]) except: spiderUtil.log_level(8, response.url) try: contents = response.xpath( """//*[@id="allList"]/div/div/div/p/text()""").extract() content = "".join(contents) except: spiderUtil.log_level(7, response.url) source = "http://www.cankaoxiaoxi.com/" try: author = str( response.xpath("""//*[@id="source_baidu"]/text()"""). extract()[0].strip()).replace("来源:", "") except: spiderUtil.log_level(9, response.url) try: title = response.xpath("//div/div/h1/text()").extract()[0] except: spiderUtil.log_level(6, response.url) try: if content != "" and public_time.startswith( spiderUtil.get_first_hour()): # if content != "" : item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response):
    if response.status == 200:
        text = response.text
        html_size = sys.getsizeof(text)
        # parse the title
        try:
            title = response.xpath(
                "//h1[@class='article-title']/text()").extract()[0].replace(
                    '\t', '').replace('\n', '').replace('\r', '')
        except Exception:
            spiderUtil.log_level(6, response.url)
        try:
            public_time_tmp = response.xpath(
                "//div[@class='article-infos']/span[@class='date']/text()"
            ).extract()[0]
            # a 16-character value ("YYYY-MM-DD HH:MM") lacks the seconds
            if len(public_time_tmp) == 16:
                public_time = public_time_tmp + ":00"
            else:
                public_time = public_time_tmp
        except Exception:
            spiderUtil.log_level(8, response.url)
        source = "http://www.stnn.cc/"
        try:
            content_arr = response.xpath(
                "//div[@class='article-content fontSizeSmall BSHARE_POP']/p/text()"
            ).extract()
            content = "".join(content_arr)
            # content = "".join(content_tmp.split())
        except Exception:
            spiderUtil.log_level(7, response.url)
        try:
            if content != "" and public_time.startswith(
                    spiderUtil.get_first_hour()):
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.meta["url"]
                item["title"] = title
                item["author"] = "星岛环球网"
                item["crawl_time"] = spiderUtil.get_time()
                item["html_size"] = html_size
                yield item
        except Exception:
            pass
    else:
        spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: public_time = str(response.xpath("""//*[@id="pubtime_baidu"]/text()""").extract()[0].strip())+":00" except: spiderUtil.log_level(8, response.url) try: content_arr = response.xpath("""//*[@id="content"]/p/text()""").extract() content = "".join(content_arr) except: spiderUtil.log_level(7, response.url) source = "http://www.southcn.com/" try: author_arr = response.xpath("""//*[@id="source_baidu"]/text()""").extract() author = "".join(author_arr).strip() if author == "": author = "澎湃新闻" else: author = author.replace("来源:","") except: spiderUtil.log_level(9, response.url) try: title_arr = response.xpath("""//*[@id="article_title"]/text()""").extract() title = "".join(title_arr).strip() except: spiderUtil.log_level(6, response.url) try: if content != "" and public_time.startswith(spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # print(item) yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parsebody(self, response):
    if response.status == 200:
        text = response.text
        html_size = sys.getsizeof(text)
        # pull the embedded "allData" JS object out of the page scripts
        all_arr = response.xpath("//script//text()").extract()
        data = "".join(all_arr).split("allData = ")[1].split(
            "var adData")[0].strip()[:-1]
        data = json.loads(data)
        doc = data['docData']
        try:
            public_time = doc['newsTime']
        except Exception:
            spiderUtil.log_level(8, response.url)
        try:
            content = doc['contentData']['contentList'][0]['data']
            # the payload is an HTML fragment; keep only the paragraph text
            content = "".join(
                etree.HTML(content).xpath("//p//text()")).strip()
        except Exception:
            spiderUtil.log_level(7, response.url)
        source = "http://news.ifeng.com/"
        try:
            author = doc['source']
        except Exception:
            spiderUtil.log_level(9, response.url)
        try:
            title = doc['title']
        except Exception:
            spiderUtil.log_level(6, response.url)
        try:
            if content != "" and public_time.startswith(
                    spiderUtil.get_first_hour()):
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = public_time
                item["url"] = response.url
                item["title"] = title
                item["author"] = author
                item["html_size"] = html_size
                item["crawl_time"] = spiderUtil.get_time()
                # print(item)
                yield item
        except Exception:
            pass
    else:
        spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_arr = response.xpath("//div[@id='main-content']")[ 0].xpath('string(.)').extract()[0] content = "".join(content_arr) # content = "".join(content_tmp.split()) except: spiderUtil.log_level(7, response.url) try: public_time = response.meta["public_time"] except: spiderUtil.log_level(8, response.url) try: author = response.xpath( "//p[@class='fromInfo']/text()").extract()[0].split(":")[1] except: spiderUtil.log_level(9, response.url) try: title = response.xpath( "//head/meta[@name='title']/@content").extract()[0] except: spiderUtil.log_level(6, response.url) source = "http://www.wenweipo.com/" try: if content != "" and public_time.startswith( spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # 数据打入piplines处理 yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_time = response.xpath("""//*[@id="article"]/div/div/div/span/text()""").extract() public_time = str(time.strftime('%Y', time.localtime(time.time()))) +"-"+ str(content_time[0]) + " " + str(content_time[1]) + ":00" except: # spiderUtil.log_level(8, response.url) pass try: content_arr = response.xpath("""//*[@id="article"]/div/p/span/text()""").extract() content = "".join(content_arr) except: spiderUtil.log_level(7, response.url) source = "http://news.baidu.com/" try: author_arr = response.xpath("""//*[@id="article"]/div/div/p/text()""").extract() author = "".join(author_arr) except: spiderUtil.log_level(9, response.url) try: title_arr = response.xpath("""//*[@id="article"]/div/h2/text()""").extract() title = "".join(title_arr) except: spiderUtil.log_level(6, response.url) try: if content != "" and public_time.startswith(spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # print(item) yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parsebody(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: public_time = re.search( r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})", response.text).group(0) except: spiderUtil.log_level(8, response.url) try: content_arr = response.xpath( "//div[@id='endText']/p/text()").extract() content = "".join(content_arr).strip() except: spiderUtil.log_level(7, response.url) try: author_arr = response.xpath( "//a[@id='ne_article_source']//text()").extract() author = "".join(author_arr).strip() if author == "": author = "网易新闻" except: spiderUtil.log_level(9, response.url) source = "https://news.163.com/" try: if content != "" and public_time.startswith( spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = response.meta["title"] item["author"] = author item["html_size"] = html_size item["crawl_time"] = spiderUtil.get_time() # print(item) yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_arr = response.xpath("//div[@id='articleText']//text()").extract() content = "".join(content_arr).strip() # content = "".join(content_tmp.split()) except: spiderUtil.log_level(7, response.url) try: public_time = (re.search(r"(\d{4}年\d{1,2}月\d{1,2}日\s\d{1,2}:\d{1,2})", response.text).group( 0) + ":00").replace( "年", "-").replace("月", "-").replace("日", "") except: spiderUtil.log_level(8, response.url) source = "http://www.ce.cn/" try: title = response.xpath("//head/title/text()").extract()[0].split("_")[0] except: spiderUtil.log_level(6, response.url) try: author = response.xpath("//head/meta[@name='author']/@content").extract()[0] except: spiderUtil.log_level(9, response.url) try: if public_time.startswith(spiderUtil.get_first_hour()) and content != "": item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # 数据打入piplines处理 # print(item) yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})", response.text).group(0) except: spiderUtil.log_level(8, response.url) try: content_arr = response.xpath("//div[@class='content']/p/text()").extract() content = "".join(content_arr) # content = "".join(content_tmp.split()) except: spiderUtil.log_level(7, response.url) source = "http://www.bjnews.com.cn/" try: author = response.xpath("//span[@class='author']/text()").extract()[0].strip() except: spiderUtil.log_level(9, response.url) try: title = response.xpath("//div[@class='title']/h1/text()").extract()[0] except: spiderUtil.log_level(6, response.url) try: if content != "" and public_time.startswith(spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # 数据打入piplines处理 yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_time = response.xpath("""//div[@class="info"]//text()""").extract() public_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})", content_time[0]).group(0) except: spiderUtil.log_level(8, response.url) try: content_arrs = response.xpath("""//td[@class="content"]//p//text()""").extract() content_arr = content_arrs.split('推荐信息')[0] content = "".join(content_arr) except: spiderUtil.log_level(7, response.url) source = "http://www.188cf.net/" try: author = "188财富网" except: spiderUtil.log_level(9, response.url) try: title = response.xpath("""//h1//text()""").extract()[0] except: spiderUtil.log_level(6, response.url) try: if content != "" and str(public_time).startswith(spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # print(item) yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_arr = response.xpath("//div[@class='post_text']/p/text()").extract() content = "".join(content_arr) # content = "".join(content_tmp.split()) except: spiderUtil.log_level(7, response.url) try: author = response.xpath("//head/meta[@name='author']/@content").extract()[0] except: spiderUtil.log_level(9, response.url) source = "http://www.e23.cn/" try: title = "".join(response.xpath("//head/title/text()").extract()[0].split("-")[:-2]).strip() except: spiderUtil.log_level(6, response.url) try: public_time = response.xpath("//div[@class='post_time']/p/text()").extract()[0] except: spiderUtil.log_level(8, response.url) try: if content != "" and public_time.startswith(spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # 数据打入piplines处理 yield item except: pass else: spiderUtil.log_level(response.status, response.url)
def parse(self, response):
    if response.status == 200:
        text = response.text
        html_size = sys.getsizeof(text)
        data_str = response.text
        # strip the fixed-width JSONP-style callback wrapper
        data_str = data_str[9:-1]
        # evaluate the JS object literal; the dict subclass is intended to
        # resolve bare JS identifiers to their own names instead of raising
        data_str = eval(
            data_str,
            type('Dummy', (dict, ), dict(__getitem__=lambda s, n: n))())
        data_str = json.dumps(data_str)
        data_str = json.loads(data_str)
        try:
            content = str(data_str['normalized_content'])
        except Exception:
            spiderUtil.log_level(7, response.url)
        source = "http://www.xuexi.cn/"
        try:
            if content != "" and str(response.meta["public_time"]).startswith(
                    spiderUtil.get_first_hour()):
                # if content != "" :
                item = NewsAllItem()
                item["source"] = source
                item["content"] = content
                item["public_time"] = response.meta["public_time"]
                item["url"] = response.url
                item["title"] = response.meta["title"]
                item["author"] = response.meta["author"]
                item["html_size"] = html_size
                item["crawl_time"] = spiderUtil.get_time()
                # print(item)
                yield item
        except Exception:
            pass
    else:
        spiderUtil.log_level(response.status, response.url)
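# A minimal sketch of a stricter alternative to the eval() trick above,
# assuming the payload really is JSONP-shaped, i.e. callback({...}) with a
# JSON-compatible body. parse_jsonp is a hypothetical helper, not part of the
# original spiders, and unlike the eval() approach it rejects bare JS
# identifiers in the payload.
import json


def parse_jsonp(payload):
    # keep only the text between the outermost parentheses, then parse it
    body = payload[payload.index("(") + 1:payload.rindex(")")]
    return json.loads(body)


# usage sketch:
#   data = parse_jsonp(response.text)
#   content = str(data['normalized_content'])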
def parse(self, response): if response.status == 200: text = response.text html_size = sys.getsizeof(text) try: content_arr = response.xpath( "//dl/dd[@id='CONTENT']/p//text()").extract() content = "".join(content_arr).replace("\xa0", "") except: spiderUtil.log_level(7, response.url) try: title_arr = response.xpath( "//dd[@class='f18 b black02 yh center']//text()").extract( ) title = "".join(title_arr).strip() if title == "": title_arr = response.xpath( "//td[@class='f22 b black02']//text()").extract() title = "".join(title_arr).strip() except: spiderUtil.log_level(6, response.url) try: public_time = re.search( r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2})", response.text).group(0) except: spiderUtil.log_level(8, response.url) try: author_arr = response.xpath( "//dd[@class='f12 black02']//text()").extract() author_tmp = "".join(author_arr).strip() if author_tmp == "": author = "法制网" else: author = author_tmp except: spiderUtil.log_level(9, response.url) source = "http://www.legaldaily.com.cn/" try: if len(content) > 80 and public_time.startswith( spiderUtil.get_first_hour()): item = NewsAllItem() item["source"] = source item["content"] = content item["public_time"] = public_time item["url"] = response.url item["title"] = title item["author"] = author item["crawl_time"] = spiderUtil.get_time() item["html_size"] = html_size # print(item) yield item except: pass else: spiderUtil.log_level(response.status, response.url)