class XueqianSpider(scrapy.Spider):
    """Spider for the eastday.com society news channel (class_id 63).

    Walks the two society listing pages, optionally de-duplicates article
    URLs through a BloomFilter (when the module-level flag ``BL`` is
    truthy), and extracts each article into an ``EduInformationItem``.
    """

    name = "news_eastday_com_gd2008_society_63"
    # allowed_domains = ["news.eastday.com"]
    start_urls = [
        "http://news.eastday.com/gd2008/society/index.html",
        "http://news.eastday.com/eastday/13news/auto/news/society/index_K33.html",
    ]
    custom_settings = {"DOWNLOAD_DELAY": 0.2}
    class_id = 63
    num = 1
    # NOTE(review): a single item instance is shared by all concurrent
    # responses, so field values can bleed between interleaved
    # parse_detail() calls -- consider building the item per response.
    items = EduInformationItem()
    flags = True
    bf = BloomFilter()
    next_index = ""
    # Fixed: header names previously contained spaces around the hyphens
    # ("Accept - Encoding", "User - Agent", ...), which are not valid HTTP
    # header names, so the server never saw them.  The
    # Upgrade-Insecure-Requests value is now a string as scrapy expects.
    header = {
        "Accept": ("text/html,application/xhtml+xml,application/xml;q=0.9,"
                   "image/webp,image/apng,*/*;q=0.8,"
                   "application/signed-exchange;v=b3"),
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        # "Connection": "keep-alive",
        "Host": "news.eastday.com",
        "Pragma": "no-cache",
        "Referer": "http://news.eastday.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": ("Mozilla/5.0 (Windows NT 6.1; WOW64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/73.0.3683.103 Safari/537.36"),
    }

    def parse(self, response):
        """Listing page: queue one parse_detail request per article link."""
        node_obj = response.xpath(
            '//div[@id="left"]/ul/li|//div[@class="leftsection"]/ul/li')
        if not node_obj:
            print("error_spider", self.name)
        for detail in node_obj:
            url = detail.xpath('a/@href').extract_first()
            # The publication time only appears on the listing page, so it
            # is forwarded to parse_detail through the request meta.
            time_node = detail.xpath(
                'span[@class="hui12"]/text()'
                '|span[@class="black12 fr text4"]/text()'
            ).extract_first(default="").strip()
            url = urljoin(response.url, url)
            if not url:  # was: url == None or url == ""
                continue
            if BL:
                if self.bf.isContains(url):  # URL already crawled
                    print('url exists!')
                    continue
                self.bf.insert(url)
                print("请求详情页:", url)
            yield scrapy.Request(url, callback=self.parse_detail,
                                 headers=self.header,
                                 meta={"time_node": time_node})

    def parse_detail(self, response):
        """Article page: extract title/summary/time/author/source/content
        into the shared item and yield it.  Pages without a title or
        without a body div are skipped."""
        # 标题 title
        title = response.xpath(
            '//div[@id="biaoti"]/text()').extract_first(default="").strip()
        title = title_slice(title)
        # 关键字 keyman
        keyman = response.xpath(
            '//meta[@name="keywords"]/@content').extract_first(default="")
        keyman = keyman_slice(keyman) if keyman else ""
        if not title:
            return
        # 简介 summary
        try:
            summary = response.xpath(
                '//meta[@name="description"]/@content'
            ).extract_first(default="").strip()
            summary = summary.replace("东方网-东方新闻-", "")
        except Exception:
            summary = ""
        summary = summay_slice(summary)
        # index_node looks like: '来源:新华社 作者:... 选稿:...'
        index_node = response.xpath(
            'string(//div[@class="time grey12a fc lh22"]/p[last()])'
        ).extract_first()
        # news_time: listing-page time string -> unix timestamp
        try:
            time_node = response.meta.get("time_node", "")
            time_node = time_node.replace("/", "-")
            news_time = datetime.datetime.strptime(
                str(time_node).strip(), "%Y-%m-%d %H:%M:%S")
            news_time = int(time.mktime(news_time.timetuple()))
        except Exception as e:
            print(e, "time")
            news_time = None
        # writer 作者
        try:
            writer = re.search(r".*?作者:(.*?)选稿:.*?", index_node,
                               re.S).group(1).strip()
        except Exception as e:
            print(e, "writer")
            writer = writer_defined
        writer = writer_slice(writer)
        # 新闻来源 news_source: try progressively looser patterns
        try:
            source = re.search(r".*?来源:(.*?)作者:.*?", index_node,
                               re.S).group(1).strip()
        except Exception:
            try:
                source = re.search(r".*?来源:(.*?)选稿:.*?", index_node,
                                   re.S).group(1).strip()
            except Exception:
                try:
                    source = re.search(r".*?来源:(.*)", index_node,
                                       re.S).group(1).strip()
                except Exception as e:
                    print(e, "source")
                    source = news_source_defined
        news_source = news_source_slice(source)
        # 新闻内容 content
        content = response.xpath('//div[@id="zw"]').extract_first()
        if content is None:
            # Fixed: .replace() used to raise AttributeError on a missing body
            return
        content = content.replace(" ", "")
        # NOTE(review): the second space-removal in the original rendered as a
        # wide space; reconstructed as the non-breaking space -- confirm.
        content = content.replace("\xa0", "")
        content = content.replace("    ", "")
        content = content.replace("&", "")
        content = content.replace("nbsp", "")
        content = content.replace("&nbsp", "")
        content = contentfilter(content)
        self.items["news_keyman"] = keyman
        self.items["title"] = title
        self.items["content"] = content
        self.items['content_summary'] = summary
        self.items['click_num'] = click_num
        self.items['news_time'] = news_time
        self.items['news_source'] = news_source
        self.items['writer'] = writer
        self.items["class_id"] = self.class_id
        # The remaining fields are module-level defaults shared by all spiders.
        self.items["user_id"] = user_id
        self.items["istop"] = istop
        self.items["ismember"] = ismember
        self.items["userfen"] = userfen
        self.items["isgood"] = isgood
        self.items["user_name"] = "admin"
        self.items["group_id"] = group_id
        self.items["plnum"] = plnum
        self.items["first_title"] = first_title
        self.items["is_qf"] = is_qf
        self.items["totaldown"] = totaldown
        self.items["have_html"] = have_html
        self.items["last_dotime"] = int(time.time())
        self.items["diggtop"] = diggtop
        self.items["stb"] = stb
        self.items["ttid"] = ttid
        self.items["ispic"] = ispic
        self.items["isurl"] = isurl
        self.items["fstb"] = fstb
        self.items["restb"] = restb
        self.items["news_tem_pid"] = news_tem_pid
        self.items["dokey"] = dokey
        self.items["closepl"] = closepl
        self.items["haveaddfen"] = haveaddfen
        self.items["infotags"] = keyman
        self.items["checked"] = checked
        self.items["keyid"] = keyid
        self.items["news_path"] = news_path
        self.items["titlepic"] = titlepic
        self.items["ftitle"] = ftitle
        self.items['filename'] = filename
        self.items['titlefont'] = titlefont
        self.items['title_url_z'] = title_url_z
        self.items['originalurl'] = response.url
        yield self.items
class XueqianSpider(scrapy.Spider):
    """Spider for the zol.com.cn mobile news listing (class_id 85).

    Crawls up to ~10 pages of the listing, carries the thumbnail URL to
    the detail page through request meta, and yields one
    ``EduInformationItem`` per article.
    """

    name = "mobile_zol_com_cn_more_3_506_85"
    # allowed_domains = ["eol.cn"]
    start_urls = ["http://mobile.zol.com.cn/more/3_506.shtml"]
    # custom_settings = {"DOWNLOAD_DELAY": 0.3}
    class_id = 85
    num = 0
    # NOTE(review): single item instance shared by concurrent responses.
    items = EduInformationItem()
    flags = True
    page_count = 1
    bf = BloomFilter()
    next_index = ""

    def parse(self, response):
        """Listing page: queue detail requests, then follow the next page."""
        node_obj = response.xpath('//ul[contains(@class,"content-list")]/li')
        if not node_obj:
            print("error_spider", self.name)
        for detail in node_obj:
            url = detail.xpath('div/a/@href').extract_first()
            titlepic_image = detail.xpath('div/a/img/@src').extract_first()
            if not titlepic_image:
                # Lazy-loaded thumbnails keep the real URL inside the
                # onerror attribute; pull it out of the raw <img> markup.
                titlepic_images = detail.xpath('div/a/img').extract_first()
                # Fixed: re.search() may return None, and .group(1) then
                # raised AttributeError and aborted the whole listing page.
                match = re.search(
                    r'<img.*?onerror="javascript:this\.src=.*?\.src='
                    r'(.*?\.([Jj][pP][gG]|[Pp][Nn][gG])).*?>',
                    titlepic_images or "")
                titlepic_image = match.group(1) if match else ""
            url = urljoin(response.url, url)
            if not url:  # was: url == None or url == ""
                continue
            if BL:
                if self.bf.isContains(url):  # URL already crawled
                    print('url exists!', url)
                    continue
                self.bf.insert(url)
                print("请求详情页:", url)
            yield scrapy.Request(url, callback=self.parse_detail,
                                 meta={"titlepic_image": titlepic_image})
        # 多页: follow the "下一页" link, at most 10 extra pages overall
        next_node = response.xpath(
            '//div[@class="page"]/a[contains(text(),"下一页")]/@href'
        ).extract_first()
        next_page = urljoin(response.url, next_node)
        if next_node and self.num <= 10:
            print("请求下页链接:", next_page)
            self.num += 1
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_detail(self, response):
        """Article page: extract the fields and yield the item.
        Pages without a title or without a body div are skipped."""
        # 标题 title
        title = response.xpath('//h1/text()').extract_first(default="")
        # 关键字 keyman
        keyman = response.xpath(
            '//meta[@name="keywords"]/@content').extract_first(default="")
        keyman = keyman_slice(keyman) if keyman else ""
        if not title:
            return
        title = title_slice(title)
        # 简介 summary
        try:
            summary = response.xpath(
                '//meta[@name="description"]/@content'
            ).extract_first(default="")
        except Exception:
            summary = ""
        summary = summay_slice(summary)
        titlepic_image = response.meta.get("titlepic_image", "")
        # news_time: publication time span -> unix timestamp
        try:
            time_node = response.xpath(
                '//div[@class="article-aboute"]/span[@id="pubtime_baidu"]/text()'
            ).extract_first()
            news_time = datetime.datetime.strptime(
                str(time_node).strip(), "%Y-%m-%d %H:%M:%S")
            news_time = int(time.mktime(news_time.timetuple()))
        except Exception as e:
            print("time", e)
            news_time = None
        # writer 作者
        writer = writer_defined
        try:
            writer = response.xpath(
                'string(//div[@class="article-aboute"]/span[@id="author_baidu"])'
            ).extract_first()
            writer = writer.replace("作者:", "").strip()
        except Exception as e:
            print(e, "writer")
            writer = writer_defined
        writer = writer_slice(writer)
        # 新闻来源 news_source
        news_source = news_source_defined
        try:
            source = response.xpath(
                '//div[@class="article-aboute"]/span[@id="source_baidu"]/text()'
            ).extract_first()
            source = source.replace("[", "").replace("]", "").strip()
        except Exception as e:
            print(e, "source")
            source = news_source_defined
        news_source = news_source_slice(source)
        # 新闻内容 content
        content = response.xpath(
            '//div[@id="article-content"]').extract_first()
        if content is None:
            # Fixed: .replace() used to raise AttributeError on a missing body
            return
        content = content.replace(" ", "")
        # NOTE(review): the second space-removal in the original rendered as a
        # wide space; reconstructed as the non-breaking space -- confirm.
        content = content.replace("\xa0", "")
        content = content.replace("    ", "")
        content = content.replace("&", "")
        content = content.replace("nbsp", "")
        content = content.replace("&nbsp", "")
        content = contentfilter(content)
        self.items["news_keyman"] = keyman
        self.items["title"] = title
        self.items["content"] = content
        self.items['content_summary'] = summary
        self.items['click_num'] = click_num
        self.items['news_time'] = news_time
        self.items['news_source'] = news_source
        self.items['writer'] = writer
        self.items["class_id"] = self.class_id
        # The remaining fields are module-level defaults shared by all spiders.
        self.items["user_id"] = user_id
        self.items["istop"] = istop
        self.items["ismember"] = ismember
        self.items["userfen"] = userfen
        self.items["isgood"] = isgood
        self.items["user_name"] = "admin"
        self.items["group_id"] = group_id
        self.items["plnum"] = plnum
        self.items["first_title"] = first_title
        self.items["is_qf"] = is_qf
        self.items["totaldown"] = totaldown
        self.items["have_html"] = have_html
        self.items["last_dotime"] = int(time.time())
        self.items["diggtop"] = diggtop
        self.items["stb"] = stb
        self.items["ttid"] = ttid
        self.items["ispic"] = ispic
        self.items["isurl"] = isurl
        self.items["fstb"] = fstb
        self.items["restb"] = restb
        self.items["news_tem_pid"] = news_tem_pid
        self.items["dokey"] = dokey
        self.items["closepl"] = closepl
        self.items["haveaddfen"] = haveaddfen
        self.items["infotags"] = keyman
        self.items["checked"] = checked
        self.items["keyid"] = keyid
        self.items["news_path"] = news_path
        self.items["titlepic"] = titlepic_image
        self.items["ftitle"] = ftitle
        self.items['filename'] = filename
        self.items['titlefont'] = titlefont
        self.items['title_url_z'] = title_url_z
        self.items['originalurl'] = response.url
        yield self.items
class XueqianSpider(scrapy.Spider):
    """Spider for several huanqiu.com sports channels (class_id 67).

    Crawls six sports listing pages, carries the thumbnail URL to the
    detail page through request meta, and yields one
    ``EduInformationItem`` per article.
    """

    name = "sports_huanqiu_com_others_zh_67"
    # allowed_domains = ["eol.cn"]
    start_urls = [
        "http://sports.huanqiu.com/others/zh/",
        "http://sports.huanqiu.com/basketball/nba/",
        "http://sports.huanqiu.com/basketball/cba/",
        "http://sports.huanqiu.com/soccer/gn/",
        "http://sports.huanqiu.com/soccer/xj/",
        "http://sports.huanqiu.com/soccer/yc/",
    ]
    # custom_settings = {"DOWNLOAD_DELAY": 0.3}
    class_id = 67
    num = 0
    # NOTE(review): single item instance shared by concurrent responses.
    items = EduInformationItem()
    flags = True
    page_count = 1
    bf = BloomFilter()
    next_index = ""

    def parse(self, response):
        """Listing page: queue one parse_detail request per article."""
        node_obj = response.xpath('//div[@class="fallsFlow"]/ul/li')
        if not node_obj:
            print("error_spider", self.name)
        for detail in node_obj:
            url = detail.xpath('h3/a/@href').extract_first()
            titlepic_image = detail.xpath(
                'a/img/@src').extract_first(default="")
            url = urljoin(response.url, url)
            if not url:  # was: url == None or url == ""
                continue
            if BL:
                if self.bf.isContains(url):  # URL already crawled
                    print('url exists!', url)
                    continue
                self.bf.insert(url)
                print("请求详情页:", url)
            yield scrapy.Request(url, callback=self.parse_detail,
                                 meta={"titlepic_image": titlepic_image})

    def parse_detail(self, response):
        """Article page: extract the fields and yield the item.
        Pages without a title or without a body div are skipped."""
        # 标题 title
        title = response.xpath('//h1/text()').extract_first(default="")
        # 关键字 keyman (both capitalizations appear on this site)
        keyman = response.xpath(
            '//meta[@name="keywords"]/@content'
            '|//meta[@name="Keywords"]/@content'
        ).extract_first(default="")
        keyman = keyman_slice(keyman) if keyman else ""
        if not title:
            return
        title = title_slice(title)
        # 简介 summary
        try:
            summary = response.xpath(
                '//meta[@name="description"]/@content'
                '|//meta[@name="Description"]/@content'
            ).extract_first(default="")
        except Exception:
            summary = ""
        summary = summay_slice(summary)
        titlepic_image = response.meta.get("titlepic_image", "")
        # index_node e.g. '2016年04月13日 09:42 来源:深圳中原地产网 作者: 中原地产'
        index_node = response.xpath(
            '//span[@class="la_t_a"]/text()').extract_first()
        # news_time: date in the byline -> unix timestamp
        try:
            time_node = re.search(
                r".*?(\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}).*?",
                index_node, re.S).group(1).strip()
            time_node = time_node.replace("年", "-").replace(
                "月", "-").replace("日", "")
            time_node = time_node + ":00"  # pad the missing seconds
            news_time = datetime.datetime.strptime(
                str(time_node).strip(), "%Y-%m-%d %H:%M:%S")
            news_time = int(time.mktime(news_time.timetuple()))
        except Exception as e:
            print(e, "time")
            news_time = None
        # writer 作者: this channel exposes no author, use the default
        writer = writer_defined
        # 新闻来源 news_source
        source = response.xpath(
            'string(//span[@class="la_t_b"])'
        ).extract_first(default=news_source_defined)
        source = source.strip()
        news_source = news_source_slice(source)
        # 新闻内容 content
        content = response.xpath('//div[@class="la_con"]').extract_first()
        if content is None:
            # Fixed: .replace() used to raise AttributeError on a missing body
            return
        content = content.replace("【环球时报综合报道】", "")
        content = content.replace("【环球网体育频道】", "")
        content = content.replace(" ", "")
        content = content.replace("    ", "")
        content = content.replace("&", "")
        content = content.replace("nbsp", "")
        content = content.replace("&nbsp", "")
        content = contentfilter(content)
        self.items["news_keyman"] = keyman
        self.items["title"] = title
        self.items["content"] = content
        self.items['content_summary'] = summary
        self.items['click_num'] = click_num
        self.items['news_time'] = news_time
        self.items['news_source'] = news_source
        self.items['writer'] = writer
        self.items["class_id"] = self.class_id
        # The remaining fields are module-level defaults shared by all spiders.
        self.items["user_id"] = user_id
        self.items["istop"] = istop
        self.items["ismember"] = ismember
        self.items["userfen"] = userfen
        self.items["isgood"] = isgood
        self.items["user_name"] = "admin"
        self.items["group_id"] = group_id
        self.items["plnum"] = plnum
        self.items["first_title"] = first_title
        self.items["is_qf"] = is_qf
        self.items["totaldown"] = totaldown
        self.items["have_html"] = have_html
        self.items["last_dotime"] = int(time.time())
        self.items["diggtop"] = diggtop
        self.items["stb"] = stb
        self.items["ttid"] = ttid
        self.items["ispic"] = ispic
        self.items["isurl"] = isurl
        self.items["fstb"] = fstb
        self.items["restb"] = restb
        self.items["news_tem_pid"] = news_tem_pid
        self.items["dokey"] = dokey
        self.items["closepl"] = closepl
        self.items["haveaddfen"] = haveaddfen
        self.items["infotags"] = keyman
        self.items["checked"] = checked
        self.items["keyid"] = keyid
        self.items["news_path"] = news_path
        self.items["titlepic"] = titlepic_image
        self.items["ftitle"] = ftitle
        self.items['filename'] = filename
        self.items['titlefont'] = titlefont
        self.items['title_url_z'] = title_url_z
        self.items['originalurl'] = response.url
        yield self.items
class XueqianSpider(scrapy.Spider):
    """Spider for the eastday.com finance news channel (class_id 66).

    Walks a single finance listing page, optionally de-duplicates article
    URLs through a BloomFilter (when the module-level flag ``BL`` is
    truthy), and extracts each article into an ``EduInformationItem``.
    """

    name = "news_eastday_com_gd2008_finance_66"
    # allowed_domains = ["news.eastday.com"]
    start_urls = [
        "http://news.eastday.com/eastday/13news/auto/news/finance/index_K47.html"
    ]
    custom_settings = {"DOWNLOAD_DELAY": 0.2}
    class_id = 66
    num = 1
    # NOTE(review): single item instance shared by concurrent responses.
    items = EduInformationItem()
    flags = True
    bf = BloomFilter()
    next_index = ""

    def parse(self, response):
        """Listing page: queue one parse_detail request per article link."""
        node_obj = response.xpath(
            '//div[@id="left"]/ul/li|//div[@class="leftsection"]/ul/li')
        if not node_obj:
            print("error_spider", self.name)
        for detail in node_obj:
            url = detail.xpath('a/@href').extract_first()
            url = urljoin(response.url, url)
            if not url:  # was: url == None or url == ""
                continue
            if BL:
                if self.bf.isContains(url):  # URL already crawled
                    print('url exists!')
                    continue
                self.bf.insert(url)
                print("请求详情页:", url)
            yield scrapy.Request(url, callback=self.parse_detail)

    def parse_detail(self, response):
        """Article page: extract title/summary/time/author/source/content
        into the shared item and yield it.  Pages without a title or
        without a body div are skipped."""
        # 标题 title
        title = response.xpath(
            '//h1/text()').extract_first(default="").strip()
        title = title_slice(title)
        # 关键字 keyman
        keyman = response.xpath(
            '//meta[@name="keywords"]/@content').extract_first(default="")
        keyman = keyman_slice(keyman) if keyman else ""
        if not title:
            return
        # 简介 summary
        try:
            summary = response.xpath(
                '//meta[@name="description"]/@content'
            ).extract_first(default="").strip()
            summary = summary.replace("东方网-东方财经-", "")
        except Exception:
            summary = ""
        summary = summay_slice(summary)
        # index_node looks like: '来源:... 作者:... 选稿:...'
        index_node = response.xpath(
            'string(//div[@class="time grey12a fc lh22"])').extract_first()
        # news_time: timestamp embedded in the byline -> unix timestamp
        try:
            time_node = re.search(
                r".*?(\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}).*?",
                index_node, re.S).group(1).strip()
            time_node = time_node.replace("/", "-")
            news_time = datetime.datetime.strptime(
                str(time_node).strip(), "%Y-%m-%d %H:%M:%S")
            news_time = int(time.mktime(news_time.timetuple()))
        except Exception as e:
            print(e, "time")
            news_time = None
        # writer 作者
        try:
            writer = re.search(r".*?作者:(.*?)选稿:.*?", index_node,
                               re.S).group(1).strip()
        except Exception as e:
            print(e, "writer")
            writer = writer_defined
        writer = writer_slice(writer)
        # 新闻来源 news_source: try progressively looser patterns
        try:
            source = re.search(r".*?来源:(.*?)作者:.*?", index_node,
                               re.S).group(1).strip()
        except Exception:
            try:
                source = re.search(r".*?来源:(.*?)选稿:.*?", index_node,
                                   re.S).group(1).strip()
            except Exception:
                try:
                    source = re.search(r".*?来源:(.*)", index_node,
                                       re.S).group(1).strip()
                except Exception as e:
                    print(e, "source")
                    source = news_source_defined
        news_source = news_source_slice(source)
        # 新闻内容 content
        content = response.xpath('//div[@id="zw"]').extract_first()
        if content is None:
            # Fixed: .replace() used to raise AttributeError on a missing body
            return
        content = content.replace(" ", "")
        content = content.replace("    ", "")
        content = content.replace("&", "")
        content = content.replace("nbsp", "")
        content = content.replace("&nbsp", "")
        content = contentfilter(content)
        self.items["news_keyman"] = keyman
        self.items["title"] = title
        self.items["content"] = content
        self.items['content_summary'] = summary
        self.items['click_num'] = click_num
        self.items['news_time'] = news_time
        self.items['news_source'] = news_source
        self.items['writer'] = writer
        self.items["class_id"] = self.class_id
        # The remaining fields are module-level defaults shared by all spiders.
        self.items["user_id"] = user_id
        self.items["istop"] = istop
        self.items["ismember"] = ismember
        self.items["userfen"] = userfen
        self.items["isgood"] = isgood
        self.items["user_name"] = "admin"
        self.items["group_id"] = group_id
        self.items["plnum"] = plnum
        self.items["first_title"] = first_title
        self.items["is_qf"] = is_qf
        self.items["totaldown"] = totaldown
        self.items["have_html"] = have_html
        self.items["last_dotime"] = int(time.time())
        self.items["diggtop"] = diggtop
        self.items["stb"] = stb
        self.items["ttid"] = ttid
        self.items["ispic"] = ispic
        self.items["isurl"] = isurl
        self.items["fstb"] = fstb
        self.items["restb"] = restb
        self.items["news_tem_pid"] = news_tem_pid
        self.items["dokey"] = dokey
        self.items["closepl"] = closepl
        self.items["haveaddfen"] = haveaddfen
        self.items["infotags"] = keyman
        self.items["checked"] = checked
        self.items["keyid"] = keyid
        self.items["news_path"] = news_path
        self.items["titlepic"] = titlepic
        self.items["ftitle"] = ftitle
        self.items['filename'] = filename
        self.items['titlefont'] = titlefont
        self.items['title_url_z'] = title_url_z
        self.items['originalurl'] = response.url
        yield self.items