def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body, "lxml")
    topic, referer_web, author, news_date = None, None, None, None
    # the article-type block carries: topic, source, author, publish date
    article_type = soup.find("div", class_="article-type")
    if article_type:
        topic = article_type.a.string  # topic
        span_list = article_type("span")
        if span_list and len(span_list) >= 3:
            referer_web = span_list[0].text  # source
            author = span_list[1].text       # author
            news_date = span_list[2].text    # publish date
    # article body
    content = soup.find("div", class_="article-content").text if soup.find(
        "div", class_="article-content") else None
    # comment count
    comment_num = soup.find(
        "div", class_="jl-comment-title").span.string if soup.find(
            "div", class_="jl-comment-title") else None
    # news id: last path segment with ".html" stripped
    news_no = response.url.split("/")[-1][:-5]
    item['content'] = content
    item['referer_web'] = referer_web
    item['author'] = author
    item['news_date'] = news_date
    item['comment_num'] = comment_num
    item['crawl_date'] = NOW
    item['topic'] = topic
    item['news_no'] = news_no
    yield item

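# Nearly every field in these spiders follows the same guard pattern,
# "soup.find(x).text if soup.find(x) else None", which runs each lookup
# twice. A minimal helper sketch that performs the lookup once; `safe_text`
# is a hypothetical name, not a helper that exists in this project:
def safe_text(soup, name, **kwargs):
    """Return the stripped text of the first matching tag, or None."""
    tag = soup.find(name, **kwargs)
    return tag.get_text(strip=True) if tag else None

# usage sketch: content = safe_text(soup, "div", class_="article-content")
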
def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body.decode("gbk"), "lxml")
    source = soup.find("span", bosszone="jgname")
    referer_web = source.text if source else None
    referer_url = source.get("href") if source else None
    abstract = soup.find("p", class_="Introduction").text.strip() if soup.find(
        "p", class_="Introduction") else None
    temp = soup.find("p", align="center")
    pic = temp.find("img").get("src") if temp and temp.find("img") else None
    author = soup.find("span", class_="auth").text if soup.find(
        "span", class_="auth") else None
    comment_num = soup.find("em", id="top_count").text.strip() if soup.find(
        "em", id="top_count") else None
    temp = soup.find_all("p", style="TEXT-INDENT: 2em")
    content = "\n\n".join([t.text.strip() for t in temp]) if temp else None
    item["referer_web"] = referer_web
    item["referer_url"] = referer_url
    item["abstract"] = abstract
    item["pic"] = pic
    item["author"] = author
    item["catalogue"] = u"热点推荐"
    item["comment_num"] = comment_num
    item["content"] = content
    item["crawl_date"] = NOW
    # keep only items that match at least one configured keyword
    item_keywords = judge_key_words(item)
    if item_keywords:
        item["keywords"] = item_keywords
        yield item

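# `judge_key_words` is imported from the project's helpers and its source is
# not shown in this section. A minimal sketch of the contract the caller
# above relies on; KEY_WORDS is a placeholder for the project's real list:
KEY_WORDS = []  # placeholder; the real list lives in the project's config

def judge_key_words(item):
    """Return the configured keywords that occur in the item's title or content."""
    text = u"".join([item.get("title") or u"", item.get("content") or u""])
    return [kw for kw in KEY_WORDS if kw in text]
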
def parse(self, response):
    origin_url = response.url
    if 'index' not in origin_url:
        pageindex = 0
    else:
        pageindex = int(origin_url.rsplit('index_', 1)[-1].replace('.html', ''))
    soup = BeautifulSoup(response.body.decode('utf8'), "lxml")
    news_list = soup.find_all('li', style='overflow:hidden;')
    for news in news_list:
        news_date = news.find('span').text if news.find('span') else None
        if news_date:
            news_url = news.find('a').get('href')
            # e.g. http://www.caac.gov.cn/XWZX/MHYW/201607/t20160726_39146.html
            news_no = news_url.rsplit('/', 1)[-1].replace('.html', '')
            link = news.find('a', href=re.compile('http://www.caac.gov.cn/XWZX/MHYW/'))
            title = link.text.strip() if link else None
            item = NewsItem(
                news_date=news_date + ' 00:00:00',
                title=title,
                news_url=news_url,
                news_no=news_no
            )
            item = judge_news_crawl(item)
            if item:
                yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item})
            else:
                self.flag = pageindex
        else:
            logger.warning("mhyw can't find news_date")
    if not self.flag:
        next_url = self.next_url % str(pageindex + 1)
        yield scrapy.Request(next_url)

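# `judge_news_crawl` is likewise imported from the shared utility module and
# its source is not included here. A sketch of the behaviour the spiders
# above depend on (the 3-day window is a placeholder): return the item while
# its news_date is fresh enough, else a falsy value so the caller records
# the stale page in self.flag and stops requesting further pages.
import datetime

def judge_news_crawl(item, max_age_days=3):
    news_date = item.get("news_date")
    if not news_date:
        return item
    struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d %H:%M:%S")
    if (datetime.datetime.now() - struct_date).days > max_age_days:
        return None
    return item
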
def parse_topic(self, response):
    origin_url = response.url
    if "_" not in origin_url:
        pageindex = 0
        topic_url = origin_url.rsplit(".", 1)[0]
    else:
        temp = origin_url.rsplit("_", 1)
        pageindex = temp[-1].split(".", 1)[0]
        topic_url = temp[0]
    soup = BeautifulSoup(response.body, "lxml")
    catalogue = soup.find("a", class_="blue CurrChnlCls").get("title").strip()
    news_list = soup.find("div", class_="lie_main_m").find_all("li")
    for news in news_list:
        news_date = news.find("span").text.strip() + " 00:00:00"
        title = news.find("a").text.strip()[10:]  # drop the leading 10-char date prefix
        news_url = topic_url.rsplit("/", 1)[0] + news.find("a").get("href")[1:]
        news_no = news_url.rsplit("/", 1)[-1].split(".")[0]
        item = NewsItem(
            news_date=news_date,
            news_url=news_url,
            title=title,
            news_no=news_no,
            catalogue=catalogue,
        )
        item = judge_news_crawl(item)
        if item:
            yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={'item': item})
        else:
            self.flag[topic_url] = pageindex
    if not self.flag[topic_url]:
        next_url = topic_url + "_" + str(int(pageindex) + 1) + ".shtml"
        yield scrapy.Request(next_url, callback=self.parse_topic)

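# Several spiders above read self.flag[topic_url] before ever assigning it,
# which only works if the mapping is pre-seeded. A sketch of spider-level
# state that makes the pattern safe; the class name and start_urls value are
# stand-ins, not the project's real ones:
import collections
import scrapy

class TopicSpiderSketch(scrapy.Spider):
    name = "topic_sketch"  # hypothetical
    start_urls = ["http://example.com/news.shtml"]  # placeholder

    def __init__(self, *args, **kwargs):
        super(TopicSpiderSketch, self).__init__(*args, **kwargs)
        # falsy value = keep paginating; a page number records where
        # judge_news_crawl first rejected an item for that topic
        self.flag = collections.defaultdict(int)
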
def parse(self, response):
    url = response.url
    pageindex = url.rsplit("/", 1)[-1]
    soup = BeautifulSoup(response.body, "lxml")
    wrap = soup.find("div", class_="lph-pageList index-pageList")
    news_list = wrap.find_all("li")
    for news in news_list:
        topic = news.find("div", class_="img").a.string.strip() if news.find("div", class_="img") else None
        pic = news.find("img").get("data-original", None) if news.find("img") else None
        title = news.find("h3").text.strip() if news.find("h3") else None
        abstract = news.find("div", class_="des").text.strip() if news.find("div", class_="des") else None
        author = news.find("a", class_="aut").text.strip() if news.find("a", class_="aut") else None
        news_url = news.find("h3").a.get("href") if news.find("h3") else None
        tag_list = news.find("div", class_="tags").find_all("a") if news.find("div", class_="tags") else None
        tags = [i.text for i in tag_list] if tag_list else None
        item = NewsItem(topic=topic, news_url=news_url, pic=pic, title=title,
                        abstract=abstract, author=author, tags=tags)
        if news_url:  # skip entries without a resolvable link
            yield scrapy.Request(news_url, meta={"item": item, "pageindex": pageindex},
                                 callback=self.parse_news)
    if not self.flag:
        pageindex = int(pageindex) + 1
        next_url = self.next_url % pageindex
        yield scrapy.Request(next_url)

def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body.decode("gbk"), "lxml")
    source = soup.find('span', id='source_baidu')
    referer_web = source.find('a').text.strip() if source and source.find('a') else None
    referer_url = source.find('a').get('href') if source and source.find('a') else None
    author_span = soup.find('span', id='author_baidu')
    author = author_span.find('a').text.strip() if author_span and author_span.find('a') else None
    crawl_date = NOW
    news_date = soup.find('span', id='pubtime_baidu').text.strip() if soup.find(
        'span', id='pubtime_baidu') else None
    comment_num = soup.find('span', class_='pltit').find('b').text.strip() if soup.find(
        'span', class_='pltit') else 0
    zan = soup.find('span', class_='zan-plus').text.strip() if soup.find(
        'span', class_='zan-plus') else 0
    read_num = int(comment_num) + int(zan)
    content = soup.find("div", id="newstext").get_text(strip=True) if soup.find(
        "div", id="newstext") else None
    item['referer_web'] = referer_web
    item['content'] = content
    item['referer_url'] = referer_url
    item['author'] = author
    item['crawl_date'] = crawl_date
    item['news_date'] = news_date
    item['comment_num'] = int(comment_num)
    item['read_num'] = read_num
    yield item

def parse(self, response):
    origin_url = response.url
    page_res = re.search(r"list_(\d+?)\.shtml", origin_url)
    pageindex = page_res.group(1) if page_res else "1"  # first page carries no index
    soup = BeautifulSoup(response.body, "lxml")
    news_list = soup.find_all("div", class_="con_one")
    for news in news_list:
        title = news.h2.get_text(strip=True)
        news_url = news.h2.a.get("href", None)
        no_res = re.search(r"/(\d+?)\.shtml", news_url)
        news_no = no_res.group(1) if no_res else None
        abstract = news.p.get_text(strip=True)
        pic = news.find("img").get("src", None) if news.find("img") else None
        tags_list = news.find("span", class_="tag")("a") if news.find("span", class_="tag") else None
        tags = [i.text for i in tags_list] if tags_list else None
        catalogue = u"原创" if "yuanchuang" in origin_url else u"咨询"
        item = NewsItem(
            news_url=news_url,
            news_no=news_no,
            title=title,
            pic=pic,
            abstract=abstract,
            tags=tags,
            catalogue=catalogue
        )
        yield scrapy.Request(news_url, callback=self.parse_news, meta={"pageindex": pageindex, "item": item})
    news_next_url = self.news_next_url % str(int(pageindex) + 1)
    if "yuanchuang" in origin_url:
        if not self.yuanchuang_flag:
            yield scrapy.Request(news_next_url)
    else:
        if not self.news_flag:
            yield scrapy.Request(news_next_url)

def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body, "lxml")
    read_num = soup.find("a", title=u"浏览").text.strip() if soup.find(
        "a", title=u"浏览") else None
    comment_num = soup.find("span", class_="comment_count").text.strip() if soup.find(
        "span", class_="comment_count") else None
    author = soup.find("span", class_="author").text.strip() if soup.find(
        "span", class_="author") else None
    news_date = soup.find("span", class_="date").text.strip().replace(
        "/", "-") + ":00" if soup.find("span", class_="date") else None
    img_div = soup.find("div", class_="article-img")
    pic = img_div.find("img").get("src").strip() if img_div and img_div.find("img") else None
    temp = soup.find("div", class_="article-content")
    content = "\n".join([t.text.strip() for t in temp.find_all("p")]) if temp and temp.find("p") else None
    item["read_num"] = read_num
    item["author"] = author
    item['comment_num'] = comment_num
    item["news_date"] = news_date
    item['pic'] = pic
    item["content"] = content
    item['crawl_date'] = NOW
    yield item

def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    pageindex = response.meta.get("pageindex", 1)
    topic_url = response.meta.get("topic_url", None)
    origin_url = response.url
    news_no_res = re.search(r"news/(\d+)\.html", origin_url)
    news_no = news_no_res.group(1) if news_no_res else None
    soup = BeautifulSoup(response.body, "lxml")
    ff3 = soup.find("h2", class_="f-ff3 f-fwn")
    referer_web = ff3.i.text if ff3 else None
    # publish date
    news_date = None
    if ff3:
        origin_date = ff3.contents[-1].text
        struct_date = datetime.datetime.strptime(origin_date, "%Y-%m-%d %H:%M")
        news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
    content = soup.find("div", class_="m-text").text if soup.find(
        "div", class_="m-text") else None
    author = soup.find("h3", class_="f-ff3 f-fwn").span.text if soup.find(
        "h3", class_="f-ff3 f-fwn") else None
    crawl_date = NOW
    item["referer_web"] = referer_web
    item["crawl_date"] = crawl_date
    item["author"] = author
    item["content"] = content
    item["news_no"] = news_no
    item["news_date"] = news_date
    item = judge_news_crawl(item)
    if item:
        yield item
    else:
        self.flag[topic_url] = pageindex

def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body, "lxml")
    # topic = soup.find("div","navItem").find_all("a")[2].text if len(soup.find("div","navItem").find_all("a")) >= 3 else None
    temp = soup.find("div", class_="actTitle")
    date_res = re.search(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', temp.text)
    news_date = date_res.group(1) if date_res else None
    referer_web = temp.find("a").text if temp.find("a") else None
    referer_url = temp.find("a").get("href") if temp.find("a") else None
    temp = soup.find("div", class_="content")
    content = "\n".join([t.text.strip() for t in temp.find_all("p")]) if temp else None
    pic = Cntour2Spider.start_urls[0][:-1] + temp.find("img").get(
        "src") if temp and temp.find("img") else None
    # item["topic"] = topic
    item["news_date"] = news_date
    item["referer_web"] = referer_web
    item["referer_url"] = referer_url
    item["pic"] = pic
    item["content"] = content
    item['crawl_date'] = NOW
    yield item

def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body.decode('gbk'), "lxml")
    pic = soup.find('p', class_='f_center').find('img').get(
        'src') if soup.find('p', class_='f_center') and soup.find(
            'p', class_='f_center').find('img') else None
    referer_web = soup.find('a', id='ne_article_source').text if soup.find(
        'a', id='ne_article_source') else None
    referer_url = soup.find('a', id='ne_article_source').get('href') if soup.find(
        'a', id='ne_article_source') else None
    author = soup.find('span', class_='ep-editor').text if soup.find(
        'span', class_='ep-editor') else None
    if author and u":" in author:  # strip the label before the fullwidth colon
        author = author.split(u":")[-1]
    crawl_date = NOW
    read_num = soup.find('div', class_='post_comment_joincount').find('a').text if soup.find(
        'div', class_='post_comment_joincount') else 0
    comment_num = soup.find('div', class_='post_comment_tiecount').find('a').text if soup.find(
        'div', class_='post_comment_tiecount') else 0
    content = soup.find('div', class_='post_text').get_text(
        strip=True) if soup.find('div', class_='post_text') else None
    item['referer_web'] = referer_web
    item['content'] = content
    item['referer_url'] = referer_url
    item['author'] = author
    item['crawl_date'] = crawl_date
    item['pic'] = pic
    item['comment_num'] = int(comment_num)
    item['read_num'] = int(read_num)
    yield item

def parse(self, response):
    soup = BeautifulSoup(response.body, "lxml")
    page_res = re.search(r"page=(\d+)", response.url)
    pageindex = page_res.group(1) if page_res else None  # current page number
    search_result = soup.find_all("ul", id="search-result")
    if search_result:
        news_list = search_result[0].find_all("li", class_="news")
        logger.debug("found %d news entries" % len(news_list))
        # each list entry carries the picture and the abstract
        for news in news_list:
            news_url = news.a.get("href", None) if news.a else None
            abstract = None
            if news.find("div", class_="summary hidden-xxs"):
                abstract = news.find("div", class_="summary hidden-xxs").string.strip()
            pic = news.find("img").get("data-original", None) if news.find("img") else None
            item = NewsItem(news_url=news_url, abstract=abstract, pic=pic)
            # hand the partially-filled item to the detail-page parser via meta
            if news_url:
                request = scrapy.Request(news_url, callback=self.parse_news)
                request.meta['item'] = item
                request.meta['pageindex'] = pageindex
                yield request
            else:
                logger.info("can't find news_url")
        # next page
        # if int(pageindex) < self.crawl_page:
        if not self.flag:
            next_url = self.page_url % str(int(pageindex) + 1)
            yield scrapy.Request(next_url, callback=self.parse)
    else:
        logger.info("can't find search_result")

def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body, "lxml")
    referer_web = soup.find("a", id="ne_article_source").text if soup.find(
        "a", id="ne_article_source") else None
    referer_url = soup.find("a", id="ne_article_source").get(
        "href", None) if soup.find("a", id="ne_article_source") else None
    comment_num = soup.find("a", class_="post_cnum_tie").text if soup.find(
        "a", class_="post_cnum_tie") else None
    content = soup.find("div", class_="post_text").text.strip() if soup.find(
        "div", class_="post_text") else None
    # format: "本文来源:证券日报-资本证券网 作者:矫 月" (source ... author ...)
    author_source = soup.find("span", class_="left").text if soup.find(
        "span", class_="left") else None
    # TODO: author extraction fails on an encoding error
    # author = re.search(u"作者(.*)", author_source).group(1)[1:] if author_source else None
    # item["author"] = author
    item["referer_web"] = referer_web
    item["referer_url"] = referer_url
    item["comment_num"] = comment_num
    item["content"] = content
    item["crawl_date"] = NOW
    yield item

def parse(self, response):
    origin_url = response.url
    # e.g. http://money.163.com/special/002526O5/transport_02.html
    search_result = re.search(r"_(\d+?)\.", origin_url)  # extract the page number
    pageindex = search_result.group(1) if search_result else 1
    soup = BeautifulSoup(response.body, "lxml")
    news_list = soup("div", class_="list_item clearfix")
    for news in news_list:
        news_date = news.find("span", class_="time").text if news.find(
            "span", class_="time") else None
        title = news.find("h2").text if news.find("h2") else None
        news_url = news.find("h2").a.get("href", None) if news.find("h2") else None
        abstract = news.find("p").contents[0] if news.find("p") else None
        item = NewsItem(title=title, news_url=news_url, abstract=abstract, news_date=news_date)
        item = judge_news_crawl(item)  # drop items outside the crawl window
        if item:
            request = scrapy.Request(news_url, callback=self.parse_news, meta={"item": item})
            yield request
        else:
            self.flag = int(pageindex)
    if not self.flag:
        next_url = self.next_url % (int(pageindex) + 1)
        yield scrapy.Request(next_url)

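# The original next-page line read `self.next_url % int(pageindex) + 1`.
# Because `%` binds tighter than `+`, the string is formatted first and
# Python then tries to add an int to it. A minimal reproduction (the
# template URL is illustrative):
next_url_template = "http://example.com/page_%s.html"
# next_url_template % 2 + 1   # TypeError: cannot concatenate 'str' and 'int' objects
next_url = next_url_template % (2 + 1)  # "http://example.com/page_3.html"
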
def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    pageindex = response.meta.get("pageindex", None)
    soup = BeautifulSoup(response.body, "lxml")
    # TODO: the news list can link to theme pages that carry no article
    # body; those are discarded for now.
    news_txt = soup.find("div", class_="news_txt")
    if news_txt:
        content = news_txt.text
        news_about = soup.find("div", class_="news_about")
        # referer_web, news_date
        if news_about:
            referer_web = news_about.p.string
            news_date = news_about.p.next_sibling.next_sibling.text[0:16]
            struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d %H:%M")
            news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
            item["referer_web"] = referer_web
            item["news_date"] = news_date
            item["content"] = content
            item["crawl_date"] = NOW
            item = judge_news_crawl(item)
            if item:
                yield item
            else:
                self.flag = pageindex
    else:
        logger.info("news page can't find news_txt; it may be a theme page")

def parse(self, response):
    origin_url = response.url
    pageindex = origin_url.rsplit("=", 1)[-1]
    soup = BeautifulSoup(response.body, "lxml")
    news_temp = soup.find("table", class_="list").find("table", border="0").find("tbody")
    if not news_temp:
        return
    news_list = news_temp.find_all("tr")[1:]  # skip the header row
    for news in news_list:
        temp = news.find("a")
        news_url = temp.get("href")
        title = temp.text.strip()
        temp = news.find_all("span")
        referer_web = temp[0].text.strip()
        news_date = temp[1].text.strip()
        news_no = news_url.rsplit("=", 1)[-1]
        item = NewsItem(news_date=news_date,
                        title=title,
                        referer_web=referer_web,
                        news_url=news_url,
                        news_no=news_no)
        item = judge_news_crawl(item)
        if item:
            yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={"item": item})
        else:
            self.flag = pageindex
    if not self.flag:
        next_url = self.next_url % (int(pageindex) + 1)
        yield scrapy.Request(next_url)

def parse(self, response):
    topic_url = response.url
    catalogue = u"商业" if "business" in topic_url else u"消费"
    self.driver.get(topic_url)  # load the page in the browser
    index_page_code = self.driver.page_source  # rendered page source
    code = index_page_code
    pageindex = 1
    interval = 10
    while True:
        soup = BeautifulSoup(code, "lxml")
        news_list = soup.find_all("dl", class_="f-cb")
        # each "load more" click appends to the same page, 10 entries at a
        # time, so slice out only the current page's entries
        for news in news_list[interval * (pageindex - 1):interval * pageindex]:
            pic = news.find("img").get("src") if news.find("img") else None
            title = news.find("h3").text if news.find("h3") else None
            news_url = news.find("h3").a.get("href") if news.find("h3") else None
            item = NewsItem(pic=pic, title=title, news_url=news_url, catalogue=catalogue)
            yield scrapy.Request(news_url, callback=self.parse_news,
                                 meta={"item": item,
                                       "pageindex": pageindex,
                                       "topic_url": topic_url})
        # stop once this topic is marked finished
        if self.flag[topic_url]:
            break
        # trigger "load more": content is appended in place rather than
        # opening a new page
        self.driver.find_element_by_id("clickMore").click()  # click the "more" button
        time.sleep(1)  # wait for the browser to render
        next_page_code = self.driver.page_source
        code = next_page_code  # refresh the page source
        pageindex += 1
    # Crawling both categories from start_urls fails, probably because the
    # virtual display cannot run in parallel, so the second category is
    # chained here instead.
    yield scrapy.Request("http://m.yicai.com/news/consumer/", callback=self.parse)

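# The spider above relies on self.driver and a virtual display created
# elsewhere in the class. A sketch of the setup that usage implies, using
# pyvirtualdisplay and Selenium; the screen size and browser choice are
# assumptions:
from pyvirtualdisplay import Display
from selenium import webdriver

display = Display(visible=0, size=(1024, 768))  # headless X display
display.start()
driver = webdriver.Firefox()  # becomes self.driver in the spider
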
def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body, "lxml")
    content = soup.find(
        "div", class_=re.compile(r"pageCont")).text if soup.find(
            "div", class_=re.compile(r"pageCont")) else None
    item["crawl_date"] = NOW
    item["content"] = content
    yield item

def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body, "lxml")
    content = soup.find("div", class_="article-content").text if soup.find(
        "div", class_="article-content") else None
    tag_list = soup.find_all("a", "tag-link")
    tags = [i.text for i in tag_list] if tag_list else None
    item["content"] = content
    item["tags"] = tags
    item["crawl_date"] = NOW
    yield item

def parse_news(self, response):
    PageKey = response.meta.get("topic_id")
    PageNumber = response.meta.get("PageNumber")
    flag_id = str(int(PageKey) - 40037910)
    soup = BeautifulSoup(response.body, "lxml")
    # e.g. 2016-07-13
    news_date = soup.find("time").text if soup.find("time") else None
    # Keep crawling while this category's flag (self.flag[flag_id]) is 0,
    # i.e. no stale article has been seen yet. Responses belonging to the
    # flagged page itself are still processed, because requests within one
    # page complete asynchronously.
    # self.flag[flag_id] = page number where stale articles began
    if news_date and (not self.flag[flag_id] or int(PageNumber) == self.flag[flag_id]):
        # still within the crawl window
        struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d")
        delta = self.end_now - struct_date
        if delta.days > self.end_day:
            self.flag[flag_id] = int(PageNumber)
        else:
            head = soup.find("div", class_="post-head")
            topic, title, abstract = None, None, None
            if head:
                topic = head.find("span", class_="category").text if head.find(
                    "span", class_="category") else None
                title = head.find("h1", class_="h1").text if head.find(
                    "h1", class_="h1") else None
                abstract = head.find("span", class_="kicker").text if head.find(
                    "span", class_="kicker") else None
            content = soup.find("div", class_="post-body clearfix").text if soup.find(
                "div", class_="post-body clearfix") else None
            news_no = response.url.split("/")[-1].split("?")[0]
            # TODO: comment count is rendered by JS; not extracted yet
            item = NewsItem(
                title=title,
                topic=topic,
                abstract=abstract,
                news_date=news_date,
                content=content,
                news_no=news_no,
                crawl_date=NOW,
                news_url=response.url,
            )
            yield item

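# `self.end_now` and `self.end_day` are initialized outside this section. A
# sketch of the spider __init__ that the date-window check above implies;
# the class name and the 3-day window are placeholders:
import datetime

class WindowedSpiderSketch(object):
    def __init__(self):
        self.end_now = datetime.datetime.now()  # reference "now" for the whole crawl
        self.end_day = 3  # a page goes stale once its articles exceed this age in days
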
def parse_quick_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body, "lxml")
    referer_web = soup.find("span", class_="name").text if soup.find(
        "span", class_="name") else None
    tag_list = soup.find_all("a", "tag-link")
    tags = [i.text for i in tag_list] if tag_list else None
    item["tags"] = tags
    item['referer_web'] = referer_web
    item['crawl_date'] = NOW
    yield item

def parse_quick(self, response):
    soup = BeautifulSoup(response.body, "lxml")
    news_list_inner = soup.find("div", class_="list-inner")
    next_timestamp = None
    news_list = news_list_inner.find_all(
        "div", class_=re.compile(r"bulletin-item.*")) if news_list_inner else None
    # JSON page variant: the items sit at the top level
    if not news_list:
        news_list = soup.find_all("div", class_=re.compile(r"bulletin-item.*"))
    for index, news in enumerate(news_list):
        origin_date = news.find("div", class_="news-time").get(
            "data-time", None) if news.find("div", class_="news-time") else None
        # the last entry's timestamp seeds the next page request
        next_timestamp = origin_date if index == len(news_list) - 1 else None
        struct_date = datetime.datetime.fromtimestamp(int(origin_date))
        news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
        title = news.find("a", class_="item-title").text if news.find(
            "a", class_="item-title") else None
        news_url = news.find("a", class_="item-title").get(
            "href", None) if news.find("a", class_="item-title") else None
        pic = news.find("img").get("src", None) if news.find("img") else None
        content = news.find("div", class_="item-desc").text if news.find(
            "div", class_="item-desc") else None
        id_result = re.search(r"/(\d+)\.html", news_url)
        news_no = id_result.group(1) if id_result else None
        item = NewsItem(content=content, news_url=news_url, pic=pic,
                        title=title, news_no=news_no, news_date=news_date,
                        catalogue=u"快报")
        item = judge_news_crawl(item)
        if item:
            request = scrapy.Request(news_url, meta={"item": item}, callback=self.parse_quick_news)
            yield request
        else:
            self.quick_flag = int(self.quick_page)
    if not self.quick_flag:
        if next_timestamp:
            next_quick_url = self.quick_json_url % next_timestamp
            yield scrapy.Request(next_quick_url, callback=self.parse_quick)
        else:
            logger.warning("can't find next_timestamp, url is %s" % response)

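# Pagination here is timestamp-driven: the data-time of the last rendered
# entry is substituted into self.quick_json_url to request everything older.
# A small worked example of the conversion used above (the timestamp value
# is illustrative; the rendered string depends on the local timezone):
import datetime

struct_date = datetime.datetime.fromtimestamp(1469500000)
print struct_date.strftime("%Y-%m-%d %H:%M:%S")  # e.g. "2016-07-26 10:26:40" in UTC+8
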
def parse(self, response):
    soup = BeautifulSoup(response.body, "lxml")
    origin_url = response.url
    res = re.search(r'ndex_(.*?)\.shtml', origin_url)
    index = None
    new_index = 1
    if res:
        index = res.group(1)
        new_index = int(index) + 1
    # news list
    viewlist = soup.find_all("div", "list list-640")
    if viewlist:
        for news in viewlist:
            title = news.select("h3 a")[0].string if news.select("h3 a") else None
            news_url = news.select("h3 a")[0].get("href", None) if news.select("h3 a") else None
            abstract = news.select('p[class="info"]')[0].string if news.select(
                'p[class="info"]') else None  # info text
            pic = news.find('img').get("src", None) if news.find('img') else None  # picture link
            # brand
            tags = []  # tag group
            fl = news.find(class_="clear date")
            if fl and fl.select("a"):
                topic = fl.select("a")[0].string  # topic
                for i in fl.select("a")[1:-1]:
                    tags.append(i.string)
                news_date = fl.find(class_="fr arial").string  # %Y-%m-%d
            else:
                news_date = None
                topic = None
            item = NewsItem(title=title, news_url=news_url, abstract=abstract,
                            pic=pic, topic=topic, news_date=news_date,
                            catalogue=u"咨询")
            request = scrapy.Request(news_url, callback=self.parse_news)
            request.meta['item'] = item
            request.meta['pageindex'] = index
            yield request
    else:
        logger.info("can't find news list")
    if not self.flag and new_index:
        new_url = re.sub(r'ndex_(.*?)\.shtml', 'ndex_%s.shtml' % str(new_index), origin_url)
        yield scrapy.Request(new_url)
    else:
        logger.info("can't find index")

def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    # the article body comes back as JSON-escaped HTML inside the response
    content_res = re.search(r'"content":"([\w\W]+?)"', response.body)
    comment_res = re.search(r'"comment":"([\w\W]+?)"', response.body)
    temp = content_res.group(1) if content_res else ""
    comment_num = comment_res.group(1) if comment_res else None
    soup = BeautifulSoup(temp, "lxml")
    content = "\n\n".join([t.text.strip() for t in soup.find_all("p")])
    item['content'] = content
    item['comment_num'] = comment_num
    item['crawl_date'] = NOW
    yield item

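# Since this endpoint returns JSON, parsing it with the json module is
# sturdier than running regexes over the raw body. A sketch that would slot
# into parse_news above, assuming the payload carries top-level "content"
# (escaped HTML) and "comment" fields, as the regexes suggest:
import json

body = json.loads(response.body)
temp = body.get("content", "")
comment_num = body.get("comment")
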
def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body, "lxml")
    temp = soup.find("div", class_="main-content")
    if temp:
        content = "\n\n".join([t.text.strip() for t in temp.find_all("p")])
    else:
        content = None
    item["content"] = content
    item['crawl_date'] = NOW
    yield item

def parse(self, response):
    soup = BeautifulSoup(response.body, "lxml")
    newslist = soup.find(name="div", attrs={"data-lastkey": True})
    if not newslist:
        logger.warning("can't find newslist")
        return
    lastkey = newslist.get("data-lastkey", None)
    logger.info(lastkey)
    if not lastkey:
        logger.warning("can't find next page")
    else:
        for i in newslist.children:
            # the list is interleaved with whitespace-only nodes
            if i != u' ':
                news_url = self.domain + i.a.get('href', None)
                pic = i.find("img").get('data-src') if i.find("img") else None
                title = i.find("h3").string if i.find("h3") else None
                comment_num = i.find(class_="iconfont icon-message").string if i.find(
                    class_="iconfont icon-message") else 0
                heart = i.find(class_="iconfont icon-heart").string if i.find(
                    class_="iconfont icon-heart") else 0
                topic = i.find(class_="category").span.string if i.find(class_="category") else 0
                news_date = None
                if i.find(name="span", attrs={"data-origindate": True}):
                    news_date = i.find(name="span", attrs={"data-origindate": True}).get(
                        "data-origindate", None)
                if news_date:
                    news_date = news_date[:-6]  # trim the 6-char timezone suffix
                # content is fetched later; `heart` is extracted but not yet
                # stored on the item
                item = NewsItem(title=title, news_url=news_url, pic=pic,
                                topic=topic, news_date=news_date,
                                comment_num=comment_num)
                # owning catalogue
                item['catalogue'] = "Top 15" if "tags" in response.url else u"商业"
                # check whether the crawl window is exhausted
                item = judge_news_crawl(item)
                if item:
                    request = scrapy.Request(news_url, callback=self.parse_article)
                    request.meta["item"] = item
                    yield request
                else:
                    if "tags" in response.url:
                        self.top_flag = lastkey
                    else:
                        self.com_flag = lastkey
        next_url = None
        # decide per category whether to fetch the next page
        if "tags" in response.url:
            if not self.top_flag:
                next_url = "http://www.qdaily.com/tags/tagmore/29/%s.json" % lastkey
        else:
            if not self.com_flag:
                next_url = "http://www.qdaily.com/categories/categorymore/18/%s.json" % lastkey
        if next_url:
            yield scrapy.Request(next_url, callback=self.parse_next_page)

def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body, "lxml")
    temp = soup.find("div", class_="main_t").find_all("span")
    news_date = temp[0].text
    referer_web = temp[1].text.split(u":")[1]
    temp = soup.find("div", class_="TRS_Editor")
    content = "\n\n".join([t.text.strip() for t in temp.find_all("p")])
    item["news_date"] = news_date
    item["referer_web"] = referer_web
    item["content"] = content
    item['crawl_date'] = NOW
    yield item

def parse_topic(self, response):
    origin_url = response.url
    topic_url = origin_url.split("_", 1)[1].rsplit("_", 1)[0]
    pageindex = int(origin_url.rsplit("_", 1)[1].replace('.html', ''))
    catalogue = re.search(r'</a> -> ([\w\W]+?) </i></h3>', response.body).group(1).decode("gb2312")
    soup = BeautifulSoup(response.body, "lxml")
    news_list = soup.find_all('li')
    for news in news_list:
        news_date = news.find('i').text.split(' ')[1].replace(']', '') if news.find('i') else None
        if news_date:
            link = news.find('a', href=re.compile('http://news.carnoc.com/list/*.?'))
            news_url = link.get('href')
            news_no = news_url.rsplit('/', 1)[1].replace('.html', '')
            title = link.text.strip()
            abstract = news.find('div').text.strip()
            pic_img = news.find('div').find('img', src=re.compile('http://pic.carnoc.com/file/*.?'))
            pic = pic_img.get('src') if pic_img else None
            tags = news.find('div', class_='keywordslist').text.strip() if news.find(
                'div', class_='keywordslist') else None
            item = NewsItem(
                news_url=news_url,
                news_date=news_date + ' 00:00:00',
                title=title,
                abstract=abstract,
                news_no=news_no,
                catalogue=catalogue,
                pic=pic,
                tags=tags,
            )
            item = judge_news_crawl(item)
            if item:
                yield scrapy.Request(item["news_url"], callback=self.parse_news, meta={'item': item})
            else:
                self.flag[topic_url] = pageindex
        else:
            logger.warning("carnoc:%s can't find news_date" % origin_url)
    if not self.flag[topic_url]:
        next_url = origin_url.rsplit("_", 1)[0] + '_' + str(pageindex + 1) + '.html'
        yield scrapy.Request(next_url, callback=self.parse_topic)

def parse_news(self, response):
    item = response.meta.get("item", NewsItem())
    soup = BeautifulSoup(response.body, "lxml")
    referer_web = soup.find("span", id="source_baidu").text if soup.find(
        "span", id="source_baidu") else None
    temp = soup.find("div", id="arttext")
    if item.get("pic") is None:
        item["pic"] = temp.find("img").get("src") if temp and temp.find("img") else None
    content = "\n\n".join([t.text.strip() for t in temp.find_all("p")]) if temp else None
    item['referer_web'] = referer_web
    item['content'] = content
    item['crawl_date'] = NOW
    yield item

def parse_topic(self, response):
    topic_url = response.url
    body = json.loads(response.body)
    news_list = body["data"]
    page = response.meta.get("page", "1")
    topic_name = response.meta.get("topic_name", None)
    # some categories are empty, e.g. http://m.iwshang.com/category/20
    if not news_list:
        self.flag[topic_url] = page
    for news in news_list:
        news_date_timestamp = news.get("published", None)
        struct_date = datetime.datetime.fromtimestamp(int(news_date_timestamp))
        news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
        title = news.get("title", None)
        news_no = news.get("contentid", None)
        abstract = news.get("description", None)
        pic = news.get("thumb", None)
        news_url = news.get("url", None)  # mobile news page link
        referenceid = news.get("referenceid", None)  # PC-site id, which differs from the mobile id
        pc_news_url = self.pc_news_url % referenceid  # PC news page link
        item = NewsItem(
            news_date=news_date,
            title=title,
            news_no=news_no,
            abstract=abstract,
            pic=pic,
            news_url=pc_news_url,
            topic=topic_name
        )
        item = judge_news_crawl(item)
        if item:
            yield scrapy.Request(pc_news_url, callback=self.parse_news, meta={"item": item})
        else:
            self.flag[topic_url] = page
    if not self.flag[topic_url]:
        page = str(int(page) + 1)
        post_data = {
            "inslider": "0",
            "page": page,
            "pagesize": "10"
        }
        # the category endpoint pages via a form-encoded POST
        yield scrapy.FormRequest(
            url=topic_url,
            formdata=post_data,
            callback=self.parse_topic,
            meta={"page": page}
        )

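# Nothing in this section shows how a category crawl is kicked off. A sketch
# of a start_requests that would give parse_topic the `page` and
# `topic_name` it reads from meta; the category URL and topic name are
# placeholders:
def start_requests(self):
    yield scrapy.FormRequest(
        url="http://m.iwshang.com/category/20",  # placeholder category endpoint
        formdata={"inslider": "0", "page": "1", "pagesize": "10"},
        callback=self.parse_topic,
        meta={"page": "1", "topic_name": u"placeholder"}
    )
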