class XinHuaNewsCrawler(object): ''' classdocs ''' def __init__(self, channel, logger=None): ''' Constructor :param channel:媒体 :param logger:日志 :return: ''' if logger is None: self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT) else: self.logger = logger self.baiduCrawler = BaiduCrawler(self.logger) self.session = self.baiduCrawler.session self.channel = channel def searchArticle(self, keywordList, endTime): ''' 根据关键字数组,开始时间和结束时间范围搜索文章 :param keywordList:关键字数组 :param endTime:搜索时间范围结束 :return:返回文章列表 ''' startTime = endTime - datetime.timedelta(days=1) startTimeIntSecond = time.mktime(startTime.timetuple()) endTimeIntSecond = time.mktime(endTime.timetuple()) urls = self.baiduCrawler.search(self.channel.url, keywordList, startTimeIntSecond, endTimeIntSecond) articleList = list() for url in urls: self.logger.debug(url) article = self.crawlArticle(url) if article is not None and article not in articleList: #同一文章可能会在搜索中出现多次结果,在baidu的结果url是不重复,但是实际可能指向同一文章,需要去重 articleList.append(article) return articleList def crawlStatistics(self, article): ''' 爬取统计信息 :param article:文章 :return:无需返回参数,统计信息写入article实例 ''' pass def crawlArticle(self, url): ''' 根据url爬取文章内容和统计信息 :param url: :return:返回一个Article实例 ''' cookies = None html = self.session.download(url=url, encoding='utf-8', data=None, timeout=10, retry=3, addr=True, cookies=cookies) if html: article_url = html['url'] article_url = re.findall( r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0] article_id = re.findall(r'c_\d+', article_url)[0] article_id = article_id[2:] soup = BeautifulSoup(html['html'], "html.parser") main1 = soup.find('div', attrs={'class': "widthMain main"}) main2 = soup.find('div', attrs={'class': "main pagewidth"}) main3 = soup.find('body', attrs={'class': "streamline-page"}) main4 = soup.find('div', attrs={'class': "h-title"}) main5 = soup.find('div', attrs={'id': "article"}) main6 = soup.find('div', attrs={'id': "Title"}) main7 = soup.find('div', attrs={'class': "article"}) if main1 is not None: self.logger.debug("main1") Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain1(main1) if (Ttitle != None and Ttime != None and Tcontent != None): article = Article(tid=article_id, channel_id=self.channel.channel_id, title=Ttitle, author_name=Tauthor, publish_datetime=Ttime, content=Tcontent, url=article_url) return article if main2 is not None: self.logger.debug("main2") Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain2(main2) if (Ttitle != None and Ttime != None and Tcontent != None): article = Article(tid=article_id, channel_id=self.channel.channel_id, title=Ttitle, author_name=Tauthor, publish_datetime=Ttime, content=Tcontent, url=article_url) return article if main3 is not None: self.logger.debug("main3") Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain3(main3) if (Ttitle != None and Ttime != None and Tcontent != None): article = Article(tid=article_id, channel_id=self.channel.channel_id, title=Ttitle, author_name=Tauthor, publish_datetime=Ttime, content=Tcontent, url=article_url) return article if main3 is None and main4 is not None: self.logger.debug("main4") Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain4(soup) if (Ttitle != None and Ttime != None and Tcontent != None): article = Article(tid=article_id, channel_id=self.channel.channel_id, title=Ttitle, author_name=Tauthor, publish_datetime=Ttime, content=Tcontent, url=article_url) return article if main5 is not None: self.logger.debug("main5") Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain5(main5) if (Ttitle != None and Ttime != None and Tcontent != None): 
article = Article(tid=article_id, channel_id=self.channel.channel_id, title=Ttitle, author_name=Tauthor, publish_datetime=Ttime, content=Tcontent, url=article_url) return article if main6 is not None: self.logger.debug("main6") Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain6(soup) if (Ttitle != None and Ttime != None and Tcontent != None): article = Article(tid=article_id, channel_id=self.channel.channel_id, title=Ttitle, author_name=Tauthor, publish_datetime=Ttime, content=Tcontent, url=article_url) return article if main7 is not None: self.logger.debug("main7") Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain7(soup) if (Ttitle != None and Ttime != None and Tcontent != None): article = Article(tid=article_id, channel_id=self.channel.channel_id, title=Ttitle, author_name=Tauthor, publish_datetime=Ttime, content=Tcontent, url=article_url) return article if (main1 is None and main2 is None and main3 is None and main4 is None and main5 is None and main6 is None and main7 is None): self.logger.error(u"存在另外一种html格式::%s", url) return def refreshSearch(self): ''' 重置搜索 :return: ''' pass def refreshCommentCrawler(self): ''' 重置评论爬取 :return: ''' pass def crawlComment(self, article): ''' 根据文章,爬取文章的评论,返回评论列表 :param article:文章 :return:(commentList, hasnext)二元组,commentList是指评论数组(每个元素是Comment实例),hasnext表示是否还有要爬取 ''' return ([], False) def crawlMain1(self, main1): # 获取标题 Ttitle = main1.find('h1') if Ttitle is None: self.logger.error(u'[XinhuaNews]' + u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取') return None, None, None, None else: Ttitle = Ttitle.text.strip() # 获取发布时间 Tinfo = main1.find('div', attrs={'class': "info"}) if Tinfo is not None: Ttime = Tinfo.find('span', attrs={'class': "h-time"}) Ttime = Ttime.text.strip() else: self.logger.error(u'[XinhuaNews]' + u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取') return None, None, None, None if len(Ttime) == 16: Ttime = Ttime + ':00' # 获取发布作者 Tauthor = Tinfo.find('em', attrs={'id': "source"}) if Tauthor is not None: Tauthor = Tauthor.text.strip() else: Tauthor = None #获取发布内容 Tcontent = main1.find('div', attrs={'id': "content"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n\t', '', Tcontent) else: self.logger.error(u'[XinhuaNews]' + u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取') return None, None, None, None return Ttitle, Ttime, Tauthor, Tcontent def crawlMain2(self, main2): # 获取标题 error = "" Ttitle = main2.find('h1') if Ttitle is None: self.logger.error(u'[XinhuaNews]' + u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取') return None, None, None, None else: Ttitle = Ttitle.text.strip() # 获取发布时间 Tinfo = main2.find('div', attrs={'class': "info"}) if Tinfo is not None: Ttime = Tinfo.find('span', attrs={'id': "pubtime"}) Ttime = Ttime.text.strip() if (Ttime == ""): Ttime = Tinfo.text.strip() Ttime = re.findall(u'\d{4}年\d{2}月\d{2}日.\d{2}:\d{2}:\d{2}', Ttime)[0] timeArray = time.strptime(Ttime, u"%Y年%m月%d日 %H:%M:%S") Ttime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray) else: self.logger.error(u'[XinhuaNews]' + u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取') return None, None, None, None if len(Ttime) == 16: Ttime = Ttime + ':00' # 获取发布作者 Tauthor = Tinfo.find('em', attrs={'id': "source"}) if Tauthor is not None: Tauthor = Tauthor.text.strip() else: Tauthor = None # 获取发布内容 Tcontent = main2.find('div', attrs={'id': "content"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n\t', '', Tcontent) else: self.logger.error(u'[XinhuaNews]' + u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取') return None, None, None, None return Ttitle, Ttime, Tauthor, Tcontent def crawlMain3(self, main3): # 
获取标题 Ttitle = main3.find('div', attrs={'class': "h-title"}) if Ttitle is None: self.logger.error(u'[XinhuaNews]' + u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取') return None, None, None, None else: Ttitle = Ttitle.text.strip() # 获取发布时间 Tinfo = main3.find('div', attrs={'class': "h-info"}) if Tinfo is not None: Ttime = Tinfo.find('span', attrs={'class': "h-time"}) Ttime = Ttime.text.strip() else: self.logger.error(u'[XinhuaNews]' + u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取') return None, None, None, None if len(Ttime) == 16: Ttime = Ttime + ':00' # 获取发布作者 Tauthor = Tinfo.find_all('span')[1] if Tauthor is not None: Tauthor = Tauthor.text.strip() else: Tauthor = None # 获取发布内容 Tcontent = main3.find('div', attrs={'class': "h-title"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n\t', '', Tcontent) else: self.logger.error(u'[XinhuaNews]' + u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取') return None, None, None, None return Ttitle, Ttime, Tauthor, Tcontent def crawlMain4(self, main4): # 获取标题 Ttitle = main4.find('div', attrs={'class': "h-title"}) if Ttitle is None: self.logger.error(u'[XinhuaNews]' + u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取') return None, None, None, None else: Ttitle = Ttitle.text.strip() # 获取发布时间 Tinfo = main4.find('div', attrs={'class': "h-info"}) if Tinfo is not None: Ttime = Tinfo.find('span', attrs={'class': "h-time"}) Ttime = Ttime.text.strip() else: self.logger.error(u'[XinhuaNews]' + u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取') return None, None, None, None if len(Ttime) == 16: Ttime = Ttime + ':00' # 获取发布作者 Tauthor = Tinfo.find_all('span')[1] if Tauthor is not None: Tauthor = Tauthor.text.strip() else: Tauthor = None # 获取发布内容 Tcontent = main4.find('div', attrs={'id': "p-detail"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n\t', '', Tcontent) else: self.logger.error(u'[XinhuaNews]' + u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取') return None, None, None, None return Ttitle, Ttime, Tauthor, Tcontent def crawlMain5(self, main5): # 获取标题 Ttitle = main5.find('h1') if Ttitle is None: self.logger.error(u'[XinhuaNews]' + u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取') return None, None, None, None else: Ttitle = Ttitle.text.strip() # 获取发布时间 Tinfo = main5.find('div', attrs={'class': "source"}) Ttime = Tinfo.find('span', attrs={'class': "time"}) if Ttime is not None: Ttime = Ttime.text.strip() else: self.logger.error(u'[XinhuaNews]' + u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取') return None, None, None, None if len(Ttime) == 16: Ttime = Ttime + ':00' # 获取发布作者 Tauthor = Tinfo.find('em', attrs={'id': "source"}) if Tauthor is not None: Tauthor = Tauthor.text.strip() else: Tauthor = None # 获取发布内容 Tcontent = main5.find('div', attrs={'class': "article"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n\t', '', Tcontent) else: self.logger.error(u'[XinhuaNews]' + u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取') return None, None, None, None return Ttitle, Ttime, Tauthor, Tcontent def crawlMain6(self, main6): # 获取标题 Ttitle_div = main6.find('div', attrs={'id': "Title"}) if Ttitle_div is None: self.logger.error(u'[XinhuaNews]' + u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取') return None, None, None, None else: Ttitle = Ttitle_div.text.strip() # 获取发布时间 title_parents = Ttitle_div.find_parents() Tinfo = title_parents[2].find_all('td')[1] Ttime = Tinfo.text.strip() try: Ttime = re.findall(u'\d{4}年\d{2}月\d{2}日.\d{2}:\d{2}:\d{2}', Ttime)[0] except: self.logger.error(traceback.format_exc()) Ttime = "" if Ttime == "": self.logger.error(u'[XinhuaNews]' + u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取') return None, None, None, None else: 
            timeArray = time.strptime(Ttime, u"%Y年%m月%d日 %H:%M:%S")
            Ttime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
        Tauthor = Tinfo.find('font')
        if Tauthor is not None:
            Tauthor = Tauthor.text.strip()
        else:
            Tauthor = None
        # Extract the article body
        Tcontent = main6.find('div', attrs={'id': "Content"})
        if Tcontent is not None:
            Tcontent = Tcontent.text.strip()
            Tcontent = re.sub(r'\n\t', '', Tcontent)
        else:
            self.logger.error(u'[XinhuaNews] Missing article content, cannot build the article; the page format may have changed, stop crawling this article')
            return None, None, None, None
        return Ttitle, Ttime, Tauthor, Tcontent

    def crawlMain7(self, main7):
        # Extract the title
        Ttitle = main7.find('h1')
        if Ttitle is None:
            self.logger.error(u'[XinhuaNews] Missing title, cannot build the article; the page format may have changed, stop crawling this article')
            return None, None, None, None
        else:
            Ttitle = Ttitle.text.strip()
        # Extract the publish time
        Ttime = main7.find('span', attrs={'class': "time"})
        if Ttime is not None:
            Ttime = Ttime.text.strip()
            timeArray = time.strptime(Ttime, u"%Y年%m月%d日 %H:%M:%S")
            Ttime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
        else:
            self.logger.error(u'[XinhuaNews] Missing publish time, cannot build the article; the page format may have changed, stop crawling this article')
            return None, None, None, None
        if len(Ttime) == 16:
            Ttime = Ttime + ':00'
        # Extract the author / source
        Tauthor = main7.find('em', attrs={'id': "source"})
        if Tauthor is not None:
            Tauthor = Tauthor.text.strip()
        else:
            Tauthor = None
        # Extract the article body
        Tcontent = main7.find('div', attrs={'class': "article"})
        if Tcontent is not None:
            Tcontent = Tcontent.text.strip()
            Tcontent = re.sub(r'\n\t', '', Tcontent)
        else:
            self.logger.error(u'[XinhuaNews] Missing article content, cannot build the article; the page format may have changed, stop crawling this article')
            return None, None, None, None
        return Ttitle, Ttime, Tauthor, Tcontent
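# The crawler classes in this module share the same duck-typed interface
# (searchArticle / crawlStatistics / refreshCommentCrawler / crawlComment).
# The sketch below shows one way a caller could drive any of them end to end.
# It is an illustrative assumption, not part of the original module:
# `run_crawler` is a hypothetical helper, and `crawler` is any instance built
# from the project-specific channel object used throughout this file.
def run_crawler(crawler, keyword_list, end_time=None):
    if end_time is None:
        end_time = datetime.datetime.now()
    articles = crawler.searchArticle(keyword_list, end_time)
    results = []
    for article in articles:
        crawler.crawlStatistics(article)      # fills counters on the article in place
        crawler.refreshCommentCrawler()       # resets per-article pagination state
        comments, hasnext = crawler.crawlComment(article)
        while hasnext:                        # keep paging while the crawler reports more
            more, hasnext = crawler.crawlComment(article)
            comments.extend(more)
        results.append((article, comments))
    return results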
class PeopleNewsCrawler(object): ''' classdocs ''' def __init__(self, channel, logger=None): ''' Constructor ''' self.site = 'people.com.cn' # 搜索站点 if logger is None: self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT) else: self.logger = logger self.baiduCrawler = BaiduCrawler(self.logger) self.session = self.baiduCrawler.session self.channel = channel def searchArticle(self, keywordList, endTime): ''' 根据关键字数组,开始时间和结束时间范围搜索文章 @param keywordList: 关键字数组 @param endTime: 搜索时间范围结束 ''' startTime = endTime - datetime.timedelta( days=self.channel.search_ranges) startTimeIntSecond = time.mktime(startTime.timetuple()) endTimeIntSecond = time.mktime(endTime.timetuple()) urls = self.baiduCrawler.search(self.channel.url, keywordList, startTimeIntSecond, endTimeIntSecond) articleList = list() count = 0 for url in urls: article = self.crawlArticle(url) #self.logger.debug(article) #同一文章可能会在搜索结果出现多次,在baidu的结果url是不重复,但是实际可能指向同一文章,需要去重 if article is not None and article not in articleList: #count = count +1 #self.logger.debug(u'文章数量%d',count) articleList.append(article) return articleList def crawlStatistics(self, article): ''' 爬取统计信息 @return: 无需返回参数,统计信息写入article实例 ''' pass def crawlArticle(self, url): ''' 根据url爬取文章内容和统计信息 @return: 返回一个Article实例 ''' cookies = '_HY_lvt_c5928ed57343443aa175434a09ea2804=1492582419784; _HY_CTK_c5928ed57343443aa175434a09ea2804=0acc9a79b115c4ca1931c41e303bec28; BAIDU_SSP_lcr=https://www.baidu.com/link?url=NszYD2w_HgkPWqrzDQ3WKApYldw_9MpUVun9r-R09M7r0dh09MUwTHzG087WaJrhBwMCY-7pDfds4xjtWArRf2xh01DHOWWWd9DpBnHwZ03&wd=&eqid=84d5edfe0003dbdd0000000658f6c280; ALLYESID4=0DB901C6E627D980; sso_c=0; wdcid=5838509dcecc0a53; _people_ip_new_code=510000; UM_distinctid=15b802cdbd364d-02e7f218c26ae6-4e45042e-100200-15b802cdbd49ce; wdses=62d3f0f698d07532; sfr=1; CNZZDATA1260954200=1457761991-1492499618-null%7C1492580619; CNZZDATA1260954203=33096124-1492503288-null%7C1492578888; CNZZDATA1256327855=1768205365-1492503342-null%7C1492578947; wdlast=1492582420' html = self.session.download(url, data=None, isJson=False, timeout=10, retry=3, addr=True, cookies=cookies) soup = BeautifulSoup(html['html'], 'html.parser') # 'html.parser' 解析器 try: meta = soup.find('meta').attrs['content'] except: self.logger.warn(u'找不到meta里的content') return # self.logger.error('%s',meta) if "GB2312" in meta: encoding1 = 'GB2312' elif "UTF-8" in meta: encoding1 = 'UTF-8' elif "utf-8" in meta: encoding1 = 'utf-8' else: encoding1 = 'gbk' html = self.session.download(url, encoding=encoding1, data=None, isJson=False, timeout=10, retry=3, addr=True, cookies=cookies) # 不同网页编码格式让其重新下载一遍 soup = BeautifulSoup(html['html'], 'html.parser') # 'html.parser' 解析器 main = soup.find('body') if html: article_url = html['url'] # self.logger.debug(article_url) if article_url.find(self.channel.url) < 0: self.logger.warn('Unrelated url found:%s', url) return None # if '.html' not in article_url: # self.logger.error(u'非文章类型网址:%s ',article_url) # return try: article_url = re.findall( r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0] except: self.logger.error(u'网址后缀不符合:%s ', article_url) return self.logger.debug('[peoplenews]' + article_url) articleid = article_url articleid = articleid[articleid.find('cn/') + 3:-5] # 由于数据库字段长度有限,所以截取部分作为ID self.logger.warn(u'地址 %s 以url地址部分字段作为tid', article_url) if 'bbs1' not in article_url: main1 = soup.find( 'div', attrs={'class': "i_cont"} ) #http://health.people.com.cn/n1/2017/1011/c14739-29579836.html main2 = soup.find( 'div', attrs={'class': "text_c"} ) 
#http://rencai.people.com.cn/n/2014/0721/c244800-25311391.html if (main1 is None) and (main2 is None): self.logger.debug(u'走main') try: Ttitle = main.find('h1').text.strip() # 标题 except: self.logger.error(u'Ttitle存在走了main部分却不满足其他格式的的url::%s', article_url) return if Ttitle is None: self.logger.error( u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', article_url) return else: Ttitle_crawl = soup.find('div', attrs={'class': "box01"}) #对应一种格式 if Ttitle_crawl is None: self.logger.error( u'Ttitle_crawl存在走了main部分却不满足其他格式的的url::%s', article_url) return try: Tpublishtime = Ttitle_crawl.find('div', attrs={ 'class': "fl" }).text.strip() except: self.logger.error(u'main中发布时间不匹配') return if Tpublishtime is None: self.logger.error( u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', article_url) return else: # self.logger.error(Tpublishtime) Tpublishtime = re.findall( r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)', Tpublishtime)[0] # Tpublishtime = Tpublishtime[:18] if len(Tpublishtime[4]) > 1: Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \ Tpublishtime[3] + Tpublishtime[4] else: Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \ Tpublishtime[3] + ':00' # Tpublishtime = Tpublishtime.replace(u'年', '-').replace(u'月', '-').replace(u'日', '') Tauthor = Ttitle_crawl.find( 'a', attrs={'target': "_blank"}) if Tauthor is not None: Tauthor = Tauthor.text.strip() else: Tauthor = 'None' Tcontent = soup.find('div', attrs={'class': "box_con"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n|\t', '', Tcontent) else: self.logger.error( u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', article_url) return Treply = None #这种格式下没有这些统计可以获取 meta_info = None article = Article(articleid, self.channel.channel_id, Ttitle, Tcontent, Tpublishtime, article_url, None, Tauthor, meta_info=meta_info) article.statistics.reply_count = Treply #self.logger.info(article) return article elif (main1 is not None): self.logger.debug(u'走main1') Ttitle = main1.find('h2') # 标题 if Ttitle is None: self.logger.error( u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', article_url) return else: Ttitle = Ttitle.text.strip() try: Tpublishtime = main1.find('div', attrs={ 'class': "artOri" }).text.strip() except: self.logger.error(u'main1中发布时间不匹配') return if Tpublishtime is None: self.logger.error( u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', article_url) return else: Tpublishtime = re.findall( r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)', Tpublishtime)[0] # self.logger.error(Tpublishtime) # Tpublishtime = Tpublishtime[:18] if len(Tpublishtime[4]) > 1: Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \ Tpublishtime[3] + Tpublishtime[4] else: Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \ Tpublishtime[3] + ':00' Tauthor = main1.find('div', attrs={ 'class': "artOri" }).find('a', attrs={'target': "_blank"}) # self.logger.debug(u"作者:%s",Tauthor) if Tauthor is not None: Tauthor = Tauthor.text.strip() else: Tauthor = 'None' Tcontent = main1.find('div', attrs={'class': "artDet"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n|\t', '', Tcontent) else: self.logger.error( u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', article_url) return Treply = None # 这种格式下没有这些统计可以获取 meta_info = None article = Article(articleid, self.channel.channel_id, Ttitle, Tcontent, Tpublishtime, article_url, None, Tauthor, meta_info=meta_info) article.statistics.reply_count = Treply 
self.logger.info(article) return article elif (main2 is not None): self.logger.debug(u'走main2') Ttitle = main2.find('h2', attrs={'class': "one"}) # 标题 if Ttitle is None: self.logger.error( u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', article_url) return else: Ttitle = Ttitle.text.strip() try: Tpublishtime = main2.find('span', attrs={ 'id': "p_publishtime" }).text.strip() except: self.logger.error(u'main2中发布时间不匹配') return if Tpublishtime is None: self.logger.error( u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', article_url) return else: Tpublishtime = re.findall( r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)', Tpublishtime)[0] # self.logger.error(Tpublishtime) # Tpublishtime = Tpublishtime[:18] if len(Tpublishtime[4]) > 1: Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \ Tpublishtime[3] + Tpublishtime[4] else: Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + \ Tpublishtime[3] + ':00' Tauthor = main2.find('span', attrs={ 'id': "p_origin" }).find('a', attrs={'target': "_blank"}) # self.logger.debug(u"作者:%s",Tauthor) if Tauthor is not None: Tauthor = Tauthor.text.strip() else: Tauthor = 'None' Tcontent = main2.find('div', attrs={'class': "show_text"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n|\t', '', Tcontent) else: self.logger.error( u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', article_url) return Treply = None # 这种格式下没有这些统计可以获取 meta_info = None article = Article(articleid, self.channel.channel_id, Ttitle, Tcontent, Tpublishtime, article_url, None, Tauthor, meta_info=meta_info) article.statistics.reply_count = Treply self.logger.info(article) return article else: self.logger.warn(u'存在另外一种html格式 %s', article_url) # elif 'bbs1' in article_url: #bbs1的格式 # self.logger.debug(u'走bbs1') # if main is not None: # main_crawl = main.find('div',attrs={'class':"navBar"}) # Ttitle = main_crawl.find('h2').text # # if Ttitle is None: # self.logger.error(u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s', article_url) # return # else: # statice_crawl = soup.find('p', attrs={'class': "replayInfo"}) # Tpublishtime = statice_crawl.find('span',attrs={'class':"float_l mT10"}).text.strip() # Tpublishtime = Tpublishtime[-19:] # if Tpublishtime is None: # self.logger.error(u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', article_url) # return # else: # Tpublishtime = Tpublishtime.replace(u'年', '-').replace(u'月', '-').replace(u'日', '') # # Tauthor = None # # Tcontent_crawl = soup.find('article') # Tcontent_crawl1 = Tcontent_crawl.find('div').attrs['content_path'] # Tcontent_html = self.session.download(Tcontent_crawl1,encoding='utf-8', data=None, isJson=False, timeout=10, retry=3) # soup1 = BeautifulSoup(Tcontent_html, 'html.parser') # Tcontent = soup1.text.strip() # if Tcontent is not None: # Tcontent = re.sub(r'\n|\t', '', Tcontent) # else: # self.logger.error(u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', article_url) # return # # Tread = statice_crawl.find('span',attrs={'class':"readNum"}).text.strip() # Treply = statice_crawl.find('span', attrs={'class': "replayNum"}).text.strip() # Tlike = statice_crawl.find('span',attrs={'class':"float_l supportBtn"}).attrs['overscore'] # # meta_info = None ##这里保存class= replyInfo # # article = Article(articleid, self.channel.channel_id, Ttitle, Tcontent, Tpublishtime, # article_url, None, Tauthor, meta_info=meta_info) # article.statistics.reply_count = Treply # article.statistics.read_count = Tread # article.statistics.like_count = Tlike # # # self.logger.info(article) # return 
article def refreshSearch(self): ''' 重置搜索 ''' pass def refreshCommentCrawler(self): ''' 重置评论爬取 ''' self.lastCommentId = None def crawlComment(self, article): ''' 根据文章,爬取文章的评论,返回评论列表 @return: (commentList, hasnext)二元组,commentList是指评论数组(每个元素是Comment实例),hasnext表示是否还有要爬取 ''' cookies = '_HY_lvt_c5928ed57343443aa175434a09ea2804=1492582419784; _HY_CTK_c5928ed57343443aa175434a09ea2804=0acc9a79b115c4ca1931c41e303bec28; BAIDU_SSP_lcr=https://www.baidu.com/link?url=NszYD2w_HgkPWqrzDQ3WKApYldw_9MpUVun9r-R09M7r0dh09MUwTHzG087WaJrhBwMCY-7pDfds4xjtWArRf2xh01DHOWWWd9DpBnHwZ03&wd=&eqid=84d5edfe0003dbdd0000000658f6c280; ALLYESID4=0DB901C6E627D980; sso_c=0; wdcid=5838509dcecc0a53; _people_ip_new_code=510000; UM_distinctid=15b802cdbd364d-02e7f218c26ae6-4e45042e-100200-15b802cdbd49ce; wdses=62d3f0f698d07532; sfr=1; CNZZDATA1260954200=1457761991-1492499618-null%7C1492580619; CNZZDATA1260954203=33096124-1492503288-null%7C1492578888; CNZZDATA1256327855=1768205365-1492503342-null%7C1492578947; wdlast=1492582420' html = self.session.download(article.url, encoding='gbk', data=None, isJson=False, timeout=10, retry=3, addr=True, cookies=cookies) article_url = article.url soup = BeautifulSoup(html['html'], 'html.parser') try: sid = soup.find('meta', attrs={ 'name': "contentid" }).attrs['content'] except: return (list(), False) sid = re.sub(r'\D', '', sid) bbs = 'http://bbs1.people.com.cn/postLink.do?nid=' + sid # bbs = soup.find('div', attrs={'class': "message"}) # if bbs: # bbs = bbs.find('a') # if bbs: # bbs = bbs.attrs['href'] # else: # bbs = 'http://bbs1.people.com.cn/postLink.do?nid=' # print bbs # else: # return None commentList = list() add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") html1 = self.session.download(bbs, encoding='gbk', data=None, isJson=False, timeout=10, retry=3) soup1 = BeautifulSoup(html1, 'html.parser') id = soup1.find('meta', attrs={'name': "contentid"}) if id: id = id.attrs['content'] id = re.sub(r'\D', '', id) re_url = 'http://bbs1.people.com.cn/api/postApi.do' page = 1 while page < 30: data1 = { 'action': 'postDetailByParentId', 'replayPostId': id, 'pageNo': page } html2 = self.session.download(re_url, encoding='utf-8', data=data1, isJson=False, timeout=10, retry=3) html2 = re.sub(r'\\\\\\', '', html2) html2 = re.sub(r'"\[\\"', '[', html2) html2 = re.sub(r'\\"\]"', ']', html2) html2 = re.sub(r'\\",\\"', ',', html2) html2 = json.loads(html2) totalCount = html2['totalCount'] if totalCount == 0: break replayPosts = html2['replayPosts'] if replayPosts: for i in replayPosts: cid = i['id'] user_id = i['userId'] user_name = i['userNick'] user_ip = i['userIP'] # ip_address = get_ip_address(str(user_ip)) # ip_address = '' user_head = '' publish_datetime = time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime(float(i['createTime']) / 1000)) reply_userid = i['parentId'] like_count = i['vote_yes'] unlike_count = i['vote_no'] read_count = i['readCount'] reply_count = i['replyCount'] source_url = article_url content = i['contentText'] heat = 0 location_coutry = 'CN' # print cid, user_id, user_name, user_ip, ip_address, user_head, publish_datetime, reply_userid # print like_count,unlike_count,read_count,reply_count,source_url commentList.append( Comment( article.tid, self.channel.channel_id, cid, add_datetime, publish_datetime, user_ip, location_coutry, None, None, ###这里的ip_address还未实现 user_id, user_name, content, reply_userid, None, like_count, reply_count, dislike_count=None)) pageCount = html2['pageCount'] # 评论总页数 if pageCount == page: break page = page + 1 # 评论页数+1 else: break 
return (commentList, False)
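# PeopleNewsCrawler.crawlComment above pages through the People's Daily forum
# comment API (bbs1.people.com.cn/api/postApi.do) and has to undo some
# backslash escaping before the payload parses as JSON. The standalone sketch
# below reproduces that request/cleanup cycle with plain `requests` for
# illustration only; the endpoint, parameter names and escape fixes are copied
# from the crawler above and may no longer match the live site.
import json
import re

import requests


def fetch_people_comment_page(post_id, page_no):
    resp = requests.post(
        'http://bbs1.people.com.cn/api/postApi.do',
        data={'action': 'postDetailByParentId',
              'replayPostId': post_id,
              'pageNo': page_no},
        timeout=10)
    text = resp.text
    # Same cleanup as crawlComment: the API wraps the reply list in an
    # escaped JSON string, so strip the stray backslashes/quotes first.
    text = re.sub(r'\\\\\\', '', text)
    text = re.sub(r'"\[\\"', '[', text)
    text = re.sub(r'\\"\]"', ']', text)
    text = re.sub(r'\\",\\"', ',', text)
    payload = json.loads(text)
    return payload.get('replayPosts', []), payload.get('pageCount', 0)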
class TencentNewsCrawler(object): ''' classdocs ''' def __init__(self, channel, logger=None): ''' Constructor ''' if logger is None: self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT) else: self.logger = logger self.baiduCrawler = BaiduCrawler(self.logger) self.session = self.baiduCrawler.session self.channel = channel self.lastCommentId = None def searchArticle(self, keywordList, endTime): ''' 根据关键字数组,开始时间和结束时间范围搜索文章 @param keywordList: 关键字数组 @param endTime: 搜索时间范围结束 ''' startTime = endTime - datetime.timedelta( days=self.channel.search_ranges) startTimeIntSecond = time.mktime(startTime.timetuple()) endTimeIntSecond = time.mktime(endTime.timetuple()) # 由于之前news.qq.com这个前缀的网站太少新闻,新添加另外两个 channel_prefix = ['news', 'new', 'gd'] channel_urls = self.channel.url.split('|') channel_dict = dict(zip(channel_prefix, channel_urls)) websites = dict() for prefix, channel_url in channel_dict.items(): websites[prefix] = self.baiduCrawler.search( channel_url, keywordList, startTimeIntSecond, endTimeIntSecond) articleList = self.crawlArticle(channel_prefix, websites) return articleList def crawlArticle(self, channel_prefix, websites): ''' :param channel_prefix: 网址前缀数组 :param websites: 对应网址前缀的文章url字典 :return: ''' articleList = list() for prefix, url_list in websites.items(): if prefix is channel_prefix[0]: for i in range(len(url_list)): article = self.crawlNewsArticle(url_list[i]) if article is not None and article not in articleList: # 同一文章可能会在搜索结果出现多次,在baidu的结果url是不重复,但是实际可能指向同一文章,需要去重 articleList.append(article) if prefix is channel_prefix[1]: for i in range(len(url_list)): article = self.crawlNewArticle(url_list[i]) if article is not None and article not in articleList: # 同一文章可能会在搜索结果出现多次,在baidu的结果url是不重复,但是实际可能指向同一文章,需要去重 articleList.append(article) if prefix is channel_prefix[2]: for i in range(len(url_list)): article = self.crawlNewsArticle(url_list[i]) if article is not None and article not in articleList: # 同一文章可能会在搜索结果出现多次,在baidu的结果url是不重复,但是实际可能指向同一文章,需要去重 articleList.append(article) return articleList def crawlNewsArticle(self, url): ''' 爬取url前缀为news.qq.com和gd.qq.com的文章 :param url: :return: ''' html = self.session.download(url, encoding='gbk', data=None, timeout=10, retry=3, addr=True) if html: article_url = html['url'] if article_url.find('news.qq.com') < 0 and article_url.find( 'gd.qq.com') < 0: self.logger.warn('Unrelated url found:%s', url) return None article_url = re.findall( r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0] self.logger.debug('[TencentNews]' + article_url) soup = BeautifulSoup(html['html'], 'html.parser') main = soup.find('div', attrs={'id': "Main-Article-QQ"}) main1 = soup.find('div', attrs={'id': "Main-P-QQ"}) if main is not None: Ttitle = main.find('h1').text.strip() #标题 Ttime = main.find('span', attrs={'class': "article-time"}) #发布时间 Ttime1 = main.find('span', attrs={'class': "a_time"}) Ttime2 = main.find('span', attrs={'class': "pubTime"}) if Ttime is not None: Ttime = Ttime.text.strip() elif Ttime1 is not None: Ttime1 = Ttime1.text.strip() Ttime = Ttime1 elif Ttime2 is not None: Ttime2 = Ttime2.text.strip() Ttime = Ttime2 else: Ttime = datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S') if len(Ttime) == 16: Ttime = Ttime + ':00' Tauthor = main.find('span', attrs={'class': "a_source"}) Tauthor1 = main.find('span', attrs={'class': "color-a-1"}) if Tauthor is not None: #Tauthor = Tauthor.find('a').text.strip() Tauthor = Tauthor.text.strip() elif Tauthor1 is not None: #Tauthor1 = Tauthor1.find('a').text.strip() Tauthor1 = Tauthor1.text.strip() 
Tauthor = Tauthor1 else: Tauthor = None Tcontent = main.find('div', attrs={'id': "Cnt-Main-Article-QQ"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n|\t', '', Tcontent) else: Tcontent = None articleid = re.findall(r'id:\'(\d+)\',', html['html'])[0] try: commentid = re.findall(r'cmt_id = (\d+);', html['html'])[0] meta_info = '{"commentid":"%s"}' % commentid except: commentid = None meta_info = None article = Article(articleid, self.channel.channel_id, Ttitle, Tcontent, Ttime, article_url, None, Tauthor, meta_info=meta_info) if commentid is not None: try: re_url = 'http://coral.qq.com/article/' + commentid + '/commentnum' html1 = json.loads( self.session.download(re_url, encoding='utf-8', data=None, timeout=10, retry=3)) Treply = int(html1['data']['commentnum']) except Exception: traceInfo = traceback.format_exc() self.logger.error( 'Faile to parse comment for %s (cid=%s):%s', articleid, commentid, traceInfo) Treply = None article.statistics.reply_count = Treply return article elif main1 is not None: Ttitle = soup.find('meta', attrs={ 'name': "Description" }).attrs['content'] # 标题 Ttime = re.findall( r"pubtime\D+(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2})\',", html['html']) if Ttime is not None: Ttime = Ttime[0] Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[ 2] + ' ' + Ttime[3] else: Ttime = datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S') if len(Ttime) == 16: Ttime = Ttime + ':00' Tauthor = re.findall(r'para = {\s+name: \"(.*)\",', html['html']) if Tauthor is not None: Tauthor = Tauthor[0] else: Tauthor = None con_url = re.sub(r'\.htm\?.*', '.hdBigPic.js', article_url) con_html = self.session.download(con_url, encoding='gbk', data=None, timeout=10, retry=3) con_list = re.findall(r'<p>(.*?)</p>', con_html) if con_list is not None: TT = [] for i in con_list: if i.strip() not in TT: TT.append(i) Tcontent = ''.join(TT) else: Tcontent = None articleid = re.findall(r'id:\'(\d+)\',', html['html'])[0] try: commentid = re.findall(r'aid\D+(\d+)\",', html['html'])[0] meta_info = '{"commentid":"%s"}' % commentid except: commentid = None meta_info = None article = Article(articleid, self.channel.channel_id, Ttitle, Tcontent, Ttime, article_url, None, Tauthor, meta_info=meta_info) try: if commentid is not None: re_url = 'http://coral.qq.com/article/batchcommentnum' data1 = {'targetid': articleid} html1 = json.loads( self.session.download(re_url, encoding='utf-8', data=data1, timeout=10, retry=3)) Treply = int(html1['data'][0]['commentnum']) else: Treply = None except: Treply = None article.statistics.reply_count = Treply return article return None def crawlNewArticle(self, url): ''' 爬取url前缀为new.qq.com的文章 :param url: :return: ''' html = self.session.download(url, encoding='gbk', data=None, timeout=10, retry=3, addr=True) if html: article_url = html['url'] if article_url.find('new.qq.com/omn') < 0: self.logger.warn('Unrelated url found:%s', url) return article_url = re.findall( r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0] self.logger.debug('[TencentNew]' + article_url) soup = BeautifulSoup(html['html'], 'html.parser') script_tags = soup.head.find_all('script') data = dict() for tag in script_tags: text = re.search(r'window.DATA = (.*)', tag.text, re.S) if text: data = json.loads(text.group(1)) tid = data['article_id'] title = data['title'] author_name = data['media'] author_id = data['media_id'] publish_datetime = data['pubtime'] comment_id = data['comment_id'] main = soup.find('div', attrs={'class': 'qq_conent clearfix'}) t_content = '' if main is 
not None: contents = main.find_all('p', {'class': 'one-p'}) for content in contents: if content.string is None: continue t_content += str(content.get_text().strip()) get_comment_count_url = 'https://coral.qq.com/article/%s/commentnum?callback=_article%scommentnum' % ( comment_id, comment_id) comment_data = self.session.download(get_comment_count_url) comment_data = re.search( r'_article%scommentnum\((.*)\)' % comment_id, comment_data) comment_dict = eval(comment_data.group(1)) reply_count = comment_dict['data']['commentnum'] meta_info = '{"commentid":"%s"}' % comment_id article = Article(tid=tid, channel_id=self.channel.channel_id, title=title, content=t_content, publish_datetime=publish_datetime, url=article_url, author_id=author_id, author_name=author_name, meta_info=meta_info) article.statistics.reply_count = reply_count return article return None def crawlStatistics(self, article): meta_info = article.meta_info if meta_info is None: return jo = json.loads(meta_info) if "commentid" not in jo: return commentid = jo["commentid"] re_url = 'http://coral.qq.com/article/batchcommentnum' data1 = {'targetid': commentid} html1 = json.loads( self.session.download(re_url, encoding='utf-8', data=data1, timeout=10, retry=3)) article.statistics.reply_count = int(html1['data'][0]['commentnum']) def refreshSearch(self): pass def refreshCommentCrawler(self): self.lastCommentId = None def crawlComment(self, article): # 获取文章评论 meta_info = article.meta_info if meta_info is None: return (list(), False) jo = json.loads(meta_info) if "commentid" not in jo: return (list(), False) commentid = jo["commentid"] cookies = 'pac_uid=0_58ec8106620c1; gj_mpvid=80515918; ad_play_index=97; dsp_cookiemapping0=1492586667155; pgv_info=ssid=s9259450720; ts_last=news.qq.com/a/20170415/002007.htm; ts_refer=www.baidu.com/link; pgv_pvid=1281052383; ts_uid=1143064466; ptag=www_baidu_com|' re_url = 'http://coral.qq.com/article/' + commentid + '/comment' commentList = list() add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") data1 = { 'commentid': self.lastCommentId if self.lastCommentId is not None else '0', 'reqnum': '50' } html = self.session.download(re_url, encoding='utf-8', cookies=cookies, data=data1, timeout=10, retry=3) jo = json.loads(html) if jo['errCode'] != 0: return ([], False) if jo['data']['retnum'] == 0: return ([], False) self.lastCommentId = jo['data']['last'] for i in jo['data']['commentid']: cid = i['id'] user_id = i['userinfo']['userid'] user_name = i['userinfo']['nick'] user_ip = '' location = i['userinfo']['region'].replace(u'市', '').replace( u'自治', '').replace(u'新区', '').replace(u'区', '').replace(u'洲', '') location_list = location.split(':') location_country = location_list[0] if len(location_list) > 1: location_region = location_list[1] else: location_region = '' if len(location_list) > 2: location_city = location_list[2] else: location_city = '' #user_head = i['userinfo']['head'] publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(i['time'])) reply_userid = str(i['replyuserid']) like_count = i['up'] reply_count = i['rep'] content = i['content'] # print cid, user_id, user_name, user_ip, ip_address, user_head, publish_datetime, reply_userid # print like_count,unlike_count,read_count,reply_count,source_url commentList.append( Comment(article.tid, self.channel.channel_id, cid, add_datetime, publish_datetime, user_ip, location_country, location_region, location_city, user_id, user_name, content, reply_userid, None, like_count, reply_count, None)) return (commentList, 
jo['data']['hasnext'])
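# TencentNewsCrawler relies on Tencent's coral.qq.com comment service twice:
# /article/<comment_id>/commentnum for the reply counter and
# /article/<comment_id>/comment for cursor-style paging (the `last` id of one
# response becomes the `commentid` parameter of the next request, and
# data.hasnext says whether to continue). The helper below is an illustrative
# reconstruction of that flow with `requests`; URLs and field names are taken
# from the crawler above and are not guaranteed to match the current API.
import json

import requests


def fetch_coral_comments(comment_id, page_size=50):
    base = 'http://coral.qq.com/article/%s' % comment_id
    count = json.loads(requests.get(base + '/commentnum', timeout=10).text)
    total = int(count['data']['commentnum'])

    comments, cursor, hasnext = [], '0', True
    while hasnext:
        jo = json.loads(requests.get(
            base + '/comment',
            params={'commentid': cursor, 'reqnum': str(page_size)},
            timeout=10).text)
        if jo.get('errCode') != 0 or jo['data']['retnum'] == 0:
            break
        comments.extend(jo['data']['commentid'])  # each item is one comment dict
        cursor = jo['data']['last']
        hasnext = jo['data']['hasnext']
    return total, comments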
class IFengNewsCrawler(object): ''' classdocs ''' def __init__(self, channel, logger=None): ''' Constructor ''' if logger is None: self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT) else: self.logger = logger self.baiduCrawler = BaiduCrawler(self.logger) self.session = self.baiduCrawler.session self.channel = channel def searchArticle(self, keywordList, endTime): ''' 根据关键字数组,开始时间和结束时间范围搜索文章 @param keywordList: 关键字数组 @param endTime: 搜索时间范围结束 ''' startTime = endTime - datetime.timedelta( days=self.channel.search_ranges) startTimeIntSecond = time.mktime(startTime.timetuple()) endTimeIntSecond = time.mktime(endTime.timetuple()) urls = self.baiduCrawler.search(self.channel.url, keywordList, startTimeIntSecond, endTimeIntSecond) articleList = list() for url in urls: article = self.crawlArticle(url) # self.crawlStatistics(article) if article is not None and article not in articleList: # 同一文章可能会在搜索结果出现多次,在baidu的结果url是不重复,但是实际可能指向同一文章,需要去重 articleList.append(article) return articleList def crawlStatistics(self, article): #下次直接获得要统计的变量而不用爬整个网页 ''' 爬取统计信息 @return: 无需返回参数,统计信息写入article实例 ''' meta_info = article.meta_info #凤凰网的meta_info我保存的是doc_url值,因为有几种格式,要么是subxxxx_0 要么是文章url # print len(meta_info) data1 = { 'callback': 'newCommentListCallBack', 'doc_url': meta_info, 'job': '1', 'callback': 'newCommentListCallBack' } re_url = 'http://comment.ifeng.com/get.php' html1 = self.session.download(re_url, encoding='gbk', data=data1, timeout=10, retry=3, addr=False, isJson=True) article.statistics.reply_count = html1['count'] #如果还需要其他统计数可以继续添加 def crawlArticle(self, url): ''' 根据url爬取文章内容和统计信息 @return: 返回一个Article实例 ''' html = self.session.download(url, encoding='utf-8', data=None, timeout=10, retry=3, addr=True) if html: article_url = html['url'] if article_url.find(self.channel.url) < 0: self.logger.warn('Unrelated url found:%s', url) return None article_url = re.findall( r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0] self.logger.debug('[iFengnews]' + article_url) soup = BeautifulSoup(html['html'], 'lxml') # 'html.parser' 解析器 main = soup.find('div', attrs={'class': "main"}) main1 = soup.find('div', attrs={'class': "yc_main"}) if main is not None: self.logger.debug(u'走第一种格式') Ttitle = main.find('h1').text.strip() # 标题 if Ttitle is None: self.logger.error(u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', url) return else: Tpublishtime = main.find('span', attrs={ 'class': "ss01" }).text.strip() if Tpublishtime is None: self.logger.error( u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', url) return else: Tpublishtime = Tpublishtime.replace(u'年', '-').replace( u'月', '-').replace(u'日', '') Tauthor = main.find('a', attrs={'target': "_blank"}) if Tauthor is not None: Tauthor = Tauthor.text.strip() else: Tauthor = 'None' Tcontent = main.find('div', attrs={'id': "main_content"}) # print Tcontent # Tcontent = Tcontent.find('p') if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n|\t', '', Tcontent) else: self.logger.error( u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', url) return doc_url = re.findall(r'"commentUrl":"(.*)",', html['html']) if doc_url: doc_url = doc_url[0] else: doc_url = url data1 = { 'callback': 'newCommentListCallBack', # 'doc_url': 'http://gd.ifeng.com/a/20171010/6053241_0.shtml', 'doc_url': doc_url, 'job': '1', 'callback': 'newCommentListCallBack' } re_url = 'http://comment.ifeng.com/get.php' html1 = self.session.download(re_url, encoding='gbk', data=data1, timeout=10, retry=3, addr=False, isJson=True) # html1 = json.loads( html1[html1.find('=') + 
1:html1.rfind(';c')] ) Treply = html1['count'] if len(html1['comments']) is not 0: articleid = html1['comments'][0]['article_id'] else: articleid = article_url articleid = articleid[articleid.find('a/') + 2: -6] # 由于数据库字段长度有限,所以截取部分作为ID self.logger.warn(u'地址 %s 没有评论因此以url地址部分字段作为tid', article_url) meta_info = doc_url article = Article(articleid, self.channel.channel_id, Ttitle, Tcontent, Tpublishtime, article_url, None, Tauthor, meta_info=meta_info) article.statistics.reply_count = Treply self.logger.info(article) return article ##对第二种格式的爬取 if main1 is not None: self.logger.debug(u'走第二种格式') Ttitle = main1.find('h1').text.strip() if Ttitle is None: self.logger.error(u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', url) return else: Tpublishtime = main1.find('span').text.strip() if Tpublishtime is None: self.logger.error( u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', url) return # return Tpublishtime Tauthor = main1.find('a', attrs={'target': "_blank"}) if Tauthor is not None: Tauthor = Tauthor.text.strip() else: Tauthor = 'None' Tcontent = main1.find('div', attrs={'class': "yc_con_txt"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n|\t', '', Tcontent) else: self.logger.warn( u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', url) return doc_url = re.findall(r'"commentUrl":"(.*)",', html['html']) if doc_url: doc_url = doc_url[0] else: doc_url = url data1 = { 'callback': 'newCommentListCallBack', # 'doc_url': 'http://gd.ifeng.com/a/20171010/6053241_0.shtml', 'doc_url': doc_url, 'job': '1', 'callback': 'newCommentListCallBack' } re_url = 'http://comment.ifeng.com/get.php' html1 = self.session.download(re_url, encoding='gbk', data=data1, timeout=10, retry=3, addr=False, isJson=True) # html1 = json.loads( html1[html1.find('=') + 1:html1.rfind(';c')] ) try: Treply = html1['count'] except: Treply = None if len(html1['comments']) is not 0: articleid = html1['comments'][0]['article_id'] else: articleid = url.strip() articleid = articleid[articleid.find('a/') + 2:-6] # 由于数据库字段长度有限,所以截取部分作为ID self.logger.warn(u'地址 %s 没有评论因此以url地址部分字段作为tid', article_url) meta_info = doc_url article = Article(articleid, self.channel.channel_id, Ttitle, Tcontent, Tpublishtime, article_url, None, Tauthor, meta_info=meta_info) article.statistics.reply_count = Treply self.logger.info(article) return article if (main is None) and (main1 is None): self.logger.warn(u"存在另外一种html格式::%s", url) return def refreshSearch(self): ''' 重置搜索 ''' pass def refreshCommentCrawler(self): ''' 重置评论爬取 ''' self.lastCommentId = None def crawlComment(self, article): ''' 根据文章,爬取文章的评论,返回评论列表 @return: (commentList, hasnext)二元组,commentList是指评论数组(每个元素是Comment实例),hasnext表示是否还有要爬取 ''' #self.logger.debug('Article:%s', article) html = self.session.download(article.url, encoding='utf-8', data=None, timeout=10, retry=3, addr=True, isJson=False) # meta_info = article.meta_info add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") # add_datetime = time.mktime(time.strptime('','%Y-%m-%d')) commentList = list() page = 1 while page < 30: doc_url = re.findall(r'"commentUrl":"(.*)",', html['html']) if doc_url: doc_url = doc_url[0] else: doc_url = article.url data1 = { 'callback': 'newCommentListCallBack', 'orderby': '', 'docUrl': doc_url, 'job': '1', 'p': page, 'callback': 'newCommentListCallBack' } re_url = 'http://comment.ifeng.com/get.php' html1 = self.session.download(re_url, encoding='gbk', data=data1, timeout=10, retry=3, isJson=True) totalcount = html1['count'] # 评论总数 if totalcount == 0: break comments = html1['comments'] 
            if comments:
                for comment in comments:
                    cid = comment['comment_id']
                    user_id = comment['user_id']
                    user_name = comment['uname']
                    user_ip = comment['client_ip']
                    # ip_address = get_ip_address(self, str(user_ip))  # never returned a value
                    # if ip_address is '':
                    try:
                        ip_address = comment['ip_from']
                    except:
                        ip_address = None
                    # ip_address = comment['ip_from']
                    user_head = comment['user_url']
                    publish_datetime = time.strftime(
                        '%Y-%m-%d %H:%M:%S',
                        time.localtime(float(comment['create_time'])))
                    reply_userid = comment['parent']  # the comment being replied to
                    if reply_userid:
                        reply_userid = comment['parent'][0]['user_id']
                    else:
                        reply_userid = ''
                    like_count = comment['uptimes']
                    unlike_count = None
                    read_count = None
                    reply_count = None
                    source_url = article.url
                    content = comment['comment_contents']
                    heat = 0
                    location_country = 'CN'
                    if ip_address is None:
                        commentList.append(
                            Comment(article.tid, self.channel.channel_id, cid,
                                    add_datetime, publish_datetime, user_ip,
                                    None, None, None,  # ip_address lookup not implemented yet
                                    user_id, user_name, content, reply_userid,
                                    None, like_count, reply_count,
                                    dislike_count=None))
                    else:
                        try:
                            location_region = ip_address[:ip_address.find(u'省') + 1]
                            location_city = ip_address[ip_address.find(u'省') + 1:]
                        except:
                            location_region = None
                            location_city = None
                        commentList.append(
                            Comment(article.tid, self.channel.channel_id, cid,
                                    add_datetime, publish_datetime, user_ip,
                                    location_country, location_region, location_city,
                                    user_id, user_name, content, reply_userid,
                                    None, like_count, reply_count,
                                    dislike_count=None))
                page = page + 1
                totalpage = math.ceil(totalcount / 20.0)  # total comment pages, rounded up
                if totalpage < page:
                    break
            else:
                break
        return (commentList, False)  # during testing, article[0][222].content returns the text of the 222nd comment
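# IFengNewsCrawler fetches comments from comment.ifeng.com/get.php, keyed by
# the article's doc_url, 20 comments per page, stopping once
# ceil(count / 20.0) pages have been read. The helper below sketches that
# paging loop with `requests`; the parameter names (docUrl/job/p) mirror the
# crawlComment call above and are assumptions about the current endpoint, as
# is the tolerant handling of a possible JSONP wrapper around the JSON body.
import json
import math

import requests


def fetch_ifeng_comments(doc_url, max_pages=30):
    comments = []
    page = 1
    while page < max_pages:
        resp = requests.get(
            'http://comment.ifeng.com/get.php',
            params={'callback': 'newCommentListCallBack',
                    'orderby': '',
                    'docUrl': doc_url,
                    'job': '1',
                    'p': page},
            timeout=10)
        text = resp.text
        start, end = text.find('{'), text.rfind('}')
        payload = json.loads(text[start:end + 1])  # tolerate a JS/JSONP wrapper around the JSON body
        total = payload.get('count', 0)
        if total == 0:
            break
        comments.extend(payload.get('comments', []))
        page += 1
        if math.ceil(total / 20.0) < page:  # same stop condition as crawlComment
            break
    return comments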
class SinaNewsCrawler(object): ''' classdocs ''' def __init__(self, channel, logger=None): ''' Constructor ''' if logger is None: self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT) else: self.logger = logger self.baiduCrawler = BaiduCrawler(self.logger) self.session = self.baiduCrawler.session self.channel = channel def searchArticle(self, keywordList, endTime): ''' 根据关键字数组,开始时间和结束时间范围搜索文章 @param keywordList: 关键字数组 @param endTime: 搜索时间范围结束 ''' startTime = endTime - datetime.timedelta(days=self.channel.search_ranges) startTimeIntSecond = time.mktime(startTime.timetuple()) endTimeIntSecond = time.mktime(endTime.timetuple()) urls = self.baiduCrawler.search(self.channel.url, keywordList, startTimeIntSecond, endTimeIntSecond) articleList = list() for url in urls: article = self.crawlArticle(url) if article is not None and article not in articleList: #同一文章可能会在搜索结果出现多次,在baidu的结果url是不重复,但是实际可能指向同一文章,需要去重 articleList.append(article) return articleList def crawlStatistics(self, article): ''' 爬取统计信息 @return: 无需返回参数,统计信息写入article实例 ''' cookies = 'U_TRS1=000000fa.9fe376b4.58573ebc.bde2f2c3; UOR=,vip.stock.finance.sina.com.cn,; vjuids=3923fcfb8.15914cd122a.0.e347599b65a6; SINAGLOBAL=183.63.92.250_1482112700.861930; SUB=_2AkMvC7H0f8NhqwJRmP4WzWzrb4xwzgnEieLBAH7sJRMyHRl-yD83qlNetRBAqqE4nv4pjjxQaUfLZo_Os-Bxsw..; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFZzJ6nbHTRfVEqOXp-S.5z; SGUID=1482721389362_efec0e8d; vjlast=1488765553.1489054965.10; bdshare_firstime=1492414283526; _ct_uid=58f46f61.537a7929; lxlrtst=1492423120_o; rotatecount=2; Apache=59.42.29.149_1492670298.869113; ULV=1492670299361:18:6:6:59.42.29.149_1492670298.869113:1492670298484; afpCT=1; CNZZDATA1252916811=1442218969-1492654141-http%253A%252F%252Fnews.sina.com.cn%252F%7C1492664941; UM_distinctid=15b8a154522e79-0a3f79bddc9d05-4e45042e-100200-15b8a154523a49; CNZZDATA5399792=cnzz_eid%3D349789736-1492650802-http%253A%252F%252Fnews.sina.com.cn%252F%26ntime%3D1492667002; U_TRS2=00000095.1c285e96.58f85761.e07aa962; lxlrttp=1492423120' html = self.session.download(article.url, encoding='utf-8', data=None, timeout=10, retry=3, addr=True, cookies=cookies) re_url = 'http://comment5.news.sina.com.cn/page/info' channel = re.findall(r"channel: '(.*)',", html['html'])[0] newsid = re.findall(r"newsid: '(.*)',", html['html'])[0] data = { 'format': 'js', 'channel': channel, 'newsid': newsid, 'group': '', 'compress': '1', 'ie': 'gbk', 'oe': 'gbk', 'page': '1', 'page_size': '20' } try: html1 = self.session.download(re_url, encoding='utf-8', data=data, isJson=False, timeout=10, retry=3, addr=False) html1 = re.sub(r'(.*=)\{', '{', html1) html1 = json.loads(html1) article.statistics.reply_count = html1['result']['count']['show'] except: self.logger.error('[SinaStatistics]url:' + article.url + ', tid:' + article.tid + '%s' + traceback.format_exc()) return def crawlArticle(self, url): ''' 根据url爬取文章内容和统计信息 @return: 返回一个Article实例 ''' cookies = 'U_TRS1=000000fa.9fe376b4.58573ebc.bde2f2c3; UOR=,vip.stock.finance.sina.com.cn,; vjuids=3923fcfb8.15914cd122a.0.e347599b65a6; SINAGLOBAL=183.63.92.250_1482112700.861930; SUB=_2AkMvC7H0f8NhqwJRmP4WzWzrb4xwzgnEieLBAH7sJRMyHRl-yD83qlNetRBAqqE4nv4pjjxQaUfLZo_Os-Bxsw..; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFZzJ6nbHTRfVEqOXp-S.5z; SGUID=1482721389362_efec0e8d; vjlast=1488765553.1489054965.10; bdshare_firstime=1492414283526; _ct_uid=58f46f61.537a7929; lxlrtst=1492423120_o; rotatecount=2; Apache=59.42.29.149_1492670298.869113; ULV=1492670299361:18:6:6:59.42.29.149_1492670298.869113:1492670298484; afpCT=1; 
CNZZDATA1252916811=1442218969-1492654141-http%253A%252F%252Fnews.sina.com.cn%252F%7C1492664941; UM_distinctid=15b8a154522e79-0a3f79bddc9d05-4e45042e-100200-15b8a154523a49; CNZZDATA5399792=cnzz_eid%3D349789736-1492650802-http%253A%252F%252Fnews.sina.com.cn%252F%26ntime%3D1492667002; U_TRS2=00000095.1c285e96.58f85761.e07aa962; lxlrttp=1492423120' html = self.session.download(url, encoding='utf-8', data=None, timeout=10, retry=3, addr=True, cookies=cookies) if html: article_url = html['url'] article_url = re.findall(r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0] self.logger.info('[SinaNews]'+article_url) #获取发布时间 date = re.findall(r'/(\d{4}-\d{2}-\d{2})/', article_url) if len(date) == 0: return None # if date[0] < '2015-07-01': # html = self.session.download(url, encoding='gbk', data=None, timeout=10, retry=3, addr=True) soup = BeautifulSoup(html['html'], 'lxml') main = soup.find('div', attrs={'class': "wrap-inner"}) main1 = soup.find('div', attrs={'class': "Main clearfix"}) main2 = soup.find('div', attrs ={'class': "main-content w1240"}) #第一种网页格式 if main is not None: self.logger.debug('走第一种格式') #获取标题 Ttitle = main.find('h1', attrs={'id': "artibodyTitle"}) if Ttitle is None: self.logger.error('[SinaNews]' + '缺少标题,无法构成文章,可能已被修改格式,本文停止爬取') return else: Ttitle = Ttitle.text.strip() #获取发布时间 Ttime = main.find('span', attrs={'class': 'time-source'}) if Ttime is not None: Ttime = Ttime.text.strip() Ttime = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}).*', Ttime)[0] Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[2] + ' ' + Ttime[3] else: self.logger.error('[SinaNews]' + '缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取') return if len(Ttime) == 16: Ttime = Ttime + ':00' #获取作者信息 Tauthor = soup.find('span', attrs={'class': "time-source"}) if Tauthor is not None: Tauthor = Tauthor.find('a') if Tauthor is not None: Tauthor = Tauthor.text.strip() else: Tauthor = None else: Tauthor = None #获取内容 Tcontent = main.find('div', attrs={'id': "artibody"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n|\t', '', Tcontent) else: self.logger.error('[SinaNews]' + '缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取') return #获取评论数 try: channel = re.findall(r"channel: '(.*)',", html['html'])[0] newsid = re.findall(r"newsid: '(.*)',", html['html'])[0] data = { 'format': 'js', 'channel': channel, 'newsid': newsid, 'group': '', 'compress': '1', 'ie': 'gbk', 'oe': 'gbk', 'page': '1', 'page_size': '20' } re_url = 'http://comment5.news.sina.com.cn/page/info' html1 = self.session.download(re_url, encoding='utf-8', data=data, isJson=False, timeout=10, retry=3, addr=False) html1 = re.sub(r'(.*=)\{', '{', html1) html1 = json.loads(html1) totalcount = html1['result']['count']['show'] Treply = totalcount except: Treply = None # 获取文章的id articleid = re.findall(r'([a-z]{8}\d{7})', article_url)[0] article = Article(tid=articleid, channel_id=self.channel.channel_id, title=Ttitle, publish_datetime=Ttime, content=Tcontent, url=article_url, author_name=Tauthor) article.statistics.reply_count = Treply return article #第二种网页格式 elif main1 is not None: self.logger.debug('走第二种格式') #获取标题 Ttitle = main1.find('h1', attrs={'id': "artibodyTitle"}) if Ttitle is None: self.logger.error('[SinaNews]' + '缺少标题,无法构成文章,可能已被修改格式,本文停止爬取') return else: Ttitle = Ttitle.text.strip() #获取时间 Ttime = main1.find('span', attrs={'id': "pub_date"}) if Ttime is not None: Ttime = Ttime.text.strip() Ttime = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2})', Ttime)[0] Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[2] + ' ' + Ttime[3] else: 
self.logger.error('[SinaNews]' + '缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取') return if len(Ttime) == 16: Ttime = Ttime + ':00' #获取发布者 Tauthor = main1.find('span', attrs={'id': "media_name"}) if Tauthor is not None: Tauthor = Tauthor.find('a').text.strip() else: Tauthor = None #获取内容 Tcontent = main1.find('div', attrs={'id': "artibody"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n|\t', '', Tcontent) else: self.logger.error('[SinaNews]' + '缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取') return try: channel = re.findall(r"channel: '(.*)',", html['html'])[0] newsid = re.findall(r"newsid: '(.*)',", html['html'])[0] data = { 'format': 'js', 'channel': channel, 'newsid': newsid, 'group': '', 'compress': '1', 'ie': 'gbk', 'oe': 'gbk', 'page': '1', 'page_size': '20' } re_url = 'http://comment5.news.sina.com.cn/page/info' html1 = self.session.download(re_url, encoding='utf-8', data=data, isJson=False, timeout=10, retry=3, addr=False) html1 = re.sub(r'(.*=)\{', '{', html1) html1 = json.loads(html1) totalcount = html1['result']['count']['show'] Treply = totalcount except: Treply = None # 获取文章的id articleid = re.findall(r'([a-z]{8}\d{7})', article_url)[0] article = Article(tid=articleid, channel_id=self.channel.channel_id,title=Ttitle, content=Tcontent, publish_datetime=Ttime, url=article_url, author_name=Tauthor) article.statistics.reply_count = Treply return article #第三种网页格式 elif main2 is not None: self.logger.debug(u'第三种格式') #获取标题 Ttitle = main2.find('div', attrs={'class': "second-title"}) if Ttitle is None: self.logger.error('[SinaNews]' + '缺少标题,无法构成文章,可能已被修改格式,本文停止爬取') return else: Ttitle = Ttitle.text.strip() # 获取时间 Ttime = main2.find('span', attrs={'class': "date"}) if Ttime is not None: Ttime = Ttime.text.strip() Ttime = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D ' '(\d{2}:\d{2}).*', Ttime)[0] Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[2] + ' ' + Ttime[3] else: self.logger.error('[SinaNews]' + '缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取') return if len(Ttime) == 16: Ttime = Ttime + ':00' # 获取发布者 Tauthor = main2.find('a', attrs={'class': "source"}) if Tauthor is not None: Tauthor = Tauthor.text.strip() else: Tauthor = None # 获取内容 Tcontent = main2.find('div', attrs={'id': "article"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n|\t', '', Tcontent) else: self.logger.error('[SinaNews]' + '缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取') return # 获取评论数 try: channel = re.findall(r"channel: '(.*)',", html['html'])[0] newsid = re.findall(r"newsid: '(.*)',", html['html'])[0] data = { 'format': 'js', 'channel': channel, 'newsid': newsid, 'group': '', 'compress': '1', 'ie': 'gbk', 'oe': 'gbk', 'page': '1', 'page_size': '20' } re_url = 'http://comment5.news.sina.com.cn/page/info' html1 = self.session.download(re_url, encoding='utf-8', data=data, isJson=False, timeout=10, retry=3, addr=False) html1 = re.sub(r'(.*=)\{', '{', html1) html1 = json.loads(html1) totalcount = html1['result']['count']['show'] Treply = totalcount except: Treply = None # 获取文章的id articleid = re.findall(r'([a-z]{8}\d{7})', article_url)[0] article = Article(tid=articleid, channel_id=self.channel.channel_id, title=Ttitle, publish_datetime=Ttime, content=Tcontent, url=article_url, author_name=Tauthor) article.statistics.reply_count = Treply return article def refreshSearch(self): ''' 重置搜索 ''' pass def refreshCommentCrawler(self): ''' 重置评论爬取 ''' pass def crawlComment(self, article): ''' 根据文章,爬取文章的评论,返回评论列表 @return: (commentList, hasnext)二元组,commentList是指评论数组(每个元素是Comment实例),hasnext表示是否还有要爬取 ''' html = 
self.session.download(article.url, encoding='utf-8', data=None, timeout=10, retry=3, addr=True)
        channel = re.findall(r"channel: '(.*)',", html['html'])[0]
        newsid = re.findall(r"newsid: '(.*)',", html['html'])[0]
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        commentList = list()
        page = 1
        self.logger.info(article.url)
        try:
            while page < 30:
                data = {
                    'channel': channel,
                    'newsid': newsid,
                    'group': '',
                    'compress': '1',
                    'ie': 'gbk',
                    'oe': 'gbk',
                    'page': page,
                    'page_size': '20'
                }
                re_url = 'http://comment5.news.sina.com.cn/page/info'
                html1 = self.session.download(url=re_url, encoding='utf-8', data=data,
                                              timeout=10, retry=3, addr=True)
                html1 = html1["html"]
                html1 = re.sub(r'(.*=)\{', '{', html1)  # strip the "var data=" prefix so the body parses as JSON
                html1 = json.loads(html1)
                totalcount = html1['result']['count']['show']
                if totalcount == 0:
                    break
                cmntlist = html1["result"]["cmntlist"]
                for i in cmntlist:
                    cid = i["mid"]
                    user_id = i["uid"]
                    user_name = i["nick"]
                    user_ip = i["ip"]
                    publish_datetime = i["time"]
                    like_count = i["agree"]
                    content = i["content"]
                    commentList.append(Comment(article.tid, self.channel.channel_id, cid,
                                               add_datetime, publish_datetime, user_ip,
                                               None, None, None, user_id, user_name,
                                               content, None, None, like_count, None, None))
                totalpage = math.ceil(totalcount / 20.0)
                if totalpage < page:
                    break
                page = page + 1
        except:
            self.logger.error('Fail to parse comment:%s', traceback.format_exc())
        return (commentList, False)
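# SinaNewsCrawler reads comment counts and comment pages from
# comment5.news.sina.com.cn/page/info, identified by the channel/newsid pair
# scraped out of the article HTML. The response is a JS assignment rather
# than bare JSON, so the crawler strips everything up to the first '{' before
# json.loads. The sketch below repeats that pattern with `requests`; the
# parameters are copied from the code above and are only an assumption about
# the current service.
import json
import re

import requests


def fetch_sina_comment_page(channel, newsid, page=1, page_size=20):
    resp = requests.get(
        'http://comment5.news.sina.com.cn/page/info',
        params={'format': 'js', 'channel': channel, 'newsid': newsid,
                'group': '', 'compress': '1', 'ie': 'gbk', 'oe': 'gbk',
                'page': page, 'page_size': page_size},
        timeout=10)
    resp.encoding = 'gbk'  # the service answers in gbk, as the crawler above assumes
    # Turn "var data={...}" style output into plain JSON, exactly as crawlComment does.
    body = re.sub(r'(.*=)\{', '{', resp.text)
    result = json.loads(body)['result']
    return result['count']['show'], result.get('cmntlist', [])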
class ZhihuCrawler(object):
    '''
    classdocs
    '''

    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.baiduCrawler = BaiduCrawler(self.logger)
        self.session = SessionCrawler()
        self.channel = channel
        self.nextCommentUrl = None

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles by keyword list within the time range ending at endTime
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        startTime = endTime - datetime.timedelta(days=self.channel.search_ranges)
        startTimeIntSecond = time.mktime(startTime.timetuple())
        endTimeIntSecond = time.mktime(endTime.timetuple())
        urls = self.baiduCrawler.search(self.channel.url, keywordList, startTimeIntSecond, endTimeIntSecond)
        articleList = list()
        for baiduUrl in urls:
            url = self.__fetchRealUrlFromBaiduUrl(baiduUrl)
            article = self.crawlArticle(url)
            if article is not None and article not in articleList:
                # The same article may appear several times in the search results: the baidu
                # result urls are unique but may point to the same article, so deduplicate here
                articleList.append(article)
        return articleList

    def __fetchRealUrlFromBaiduUrl(self, baiduUrl):
        '''
        Resolve the baidu redirect url to the real article url
        '''
        response = self.session.session.get(baiduUrl, allow_redirects=False)
        if response.status_code == 302:
            return response.headers['Location']

    def crawlArticle(self, url):
        '''
        Crawl article content and statistics from the url
        @return: an Article instance
        '''
        # Check the url format: baidu results are not necessarily questions, and only questions are crawled for now
        if url.find('question') < 0:
            self.logger.warn('Question supported only:%s', url)
            return None
        article_id = re.findall(r'question/(\d+)', url)[0]
        self.session.randomSleep()
        response = self.session.get(url, headers=CRAWL_ARTICLE_HEADERS)
        soup = BeautifulSoup(response, 'html.parser')
        main = soup.find('div', attrs={'id': "data"}).attrs['data-state']
        articleJson = json.loads(main)
        questionJson = articleJson['entities']['questions'][article_id]
        title = questionJson['title']
        contentSoup = BeautifulSoup(questionJson['editableDetail'], 'html.parser')
        content = contentSoup.text
        author_id = questionJson['author']['id']
        author_name = questionJson['author']['name']
        createTimeInFloat = questionJson['created']
        publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(createTimeInFloat))
        reply_count = questionJson['commentCount']
        read_count = questionJson['visitCount']
        collect_count = questionJson['followerCount']
        article = Article(article_id, self.channel.channel_id, title, content, publish_datetime,
                          url, author_id, author_name)
        article.statistics.reply_count = reply_count
        article.statistics.read_count = read_count
        article.statistics.collect_count = collect_count
        return article

    def crawlStatistics(self, article):
        '''
        Crawl statistics
        @return: no return value; statistics are written into the article instance
        '''
        articleCopy = self.crawlArticle(article.url)
        article.statistics.reply_count = articleCopy.statistics.reply_count
        article.statistics.read_count = articleCopy.statistics.read_count
        article.statistics.collect_count = articleCopy.statistics.collect_count

    def refreshSearch(self):
        '''
        Reset the search
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset comment crawling
        '''
        self.nextCommentUrl = None

    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return the comment list
        @return: a (commentList, hasnext) tuple; commentList is a list of Comment instances, hasnext indicates whether more comments remain to be crawled
        '''
        if self.nextCommentUrl is None:
            curl = COMMENT_URL_TEMPLATE % (article.tid, COMMENT_PAGE_SIZE, 0)
            curl = curl.replace('#', '%')
        else:
            curl = self.nextCommentUrl
        self.session.randomSleep()
        result = self.session.get(curl, headers=CRAWL_COMMENT_HEADERS)
        jo = json.loads(result)
        paging = jo['paging']
        hasnext = not paging['is_end']
        self.nextCommentUrl = paging['next']
        dataList = jo['data']
        add_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        commentList = list()
        for data in dataList:
            #self.logger.debug('[Zhihu]Comment data keys:%s', data.keys())
            #self.logger.debug('[ZHIHU]Comment url for %s:%s', article.title, data['url'])
            publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(data['updated_time']))
            comment = Comment(article.tid, article.channel_id, data['id'], add_datetime, publish_datetime,
                              ip_address=None, location_country=None, location_region=None,
                              location_city=None, author_id=data['author']['id'],
                              author_name=data['author']['name'], content=data['content'],
                              reply_author_id=None, read_count=None, like_count=data['voteup_count'],
                              reply_count=data['comment_count'], dislike_count=None)
            commentList.append(comment)
        return (commentList, hasnext)
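# --- Reference sketch (not used above) ------------------------------------------------
# Illustrates the intended driving loop for the (commentList, hasnext) contract shared by
# the crawlers in this module: reset the pagination cursor, then keep calling crawlComment
# until hasnext is False. The crawler and article arguments are placeholders supplied by
# the caller; the helper name is illustrative and not part of this module.
def _sketch_collect_all_comments(crawler, article):
    crawler.refreshCommentCrawler()          # reset the cursor (e.g. nextCommentUrl for Zhihu)
    allComments = []
    hasnext = True
    while hasnext:
        commentList, hasnext = crawler.crawlComment(article)
        allComments.extend(commentList)
    return allComments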
class NeteaseNewsCrawler(object):
    '''
    classdocs
    '''

    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.baiduCrawler = BaiduCrawler(self.logger)
        self.session = self.baiduCrawler.session
        self.channel = channel

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles by keyword list within the time range ending at endTime
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        startTime = endTime - datetime.timedelta(days=self.channel.search_ranges)
        startTimeIntSecond = time.mktime(startTime.timetuple())
        endTimeIntSecond = time.mktime(endTime.timetuple())
        urls = self.baiduCrawler.search(self.channel.url, keywordList, startTimeIntSecond, endTimeIntSecond)
        articleList = list()
        for url in urls:
            article = self.crawlArticle(url)
            if article is not None and article not in articleList:
                # The same article may appear several times in the search results: the baidu
                # result urls are unique but may point to the same article, so deduplicate here
                articleList.append(article)
        return articleList

    def crawlStatistics(self, article):
        '''
        Crawl statistics
        @return: no return value; statistics are written into the article instance
        '''
        cookies = 'Province=020; City=020; usertrack=c+5+hljsm+B+cg5MA7YDAg==; vjuids=-7517fab.15b5c40e631.0.a042d54907b81; _ntes_nnid=5e90ea8f4ef321150e3b5d43f68870c8,1491901408828; _ntes_nuid=5e90ea8f4ef321150e3b5d43f68870c8; UM_distinctid=15b5c41b7836eb-0fd2f7e510ef22-4e45042e-100200-15b5c41b78461b; __gads=ID=18c804c9f3ead780:T=1491901995:S=ALNI_MYWNxLkcHVgXyExP9eeFcD-mj7SiQ; afpCT=1; CNZZDATA1256734798=337963631-1491900970-http%253A%252F%252Fnews.163.com%252F%7C1492767097; CNZZDATA1256336326=1559830613-1491900088-http%253A%252F%252Fnews.163.com%252F%7C1492765460; vjlast=1491901409.1492754596.11; ne_analysis_trace_id=1492768109053; vinfo_n_f_l_n3=09c375e3d4394d15.1.13.1491901408836.1492766182939.1492768266676; s_n_f_l_n3=09c375e3d4394d151492768109056'
        try:
            self.logger.info("[crawlStatistics]" + article.tid)
            if len(article.tid) != 16:
                articleid = article.tid[3:len(article.tid) - 2]
            else:
                articleid = article.tid
            re_url = 'http://sdk.comment.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/' + articleid
            html1 = self.session.download(url=re_url, encoding='utf-8', data=None, isJson=True, timeout=10, retry=3)
            article.statistics.reply_count = html1["tcount"]
        except:
            self.logger.error('[NeteaseStatistics]url:' + article.url + ', tid:' + article.tid + ', ' + traceback.format_exc())
        return

    def crawlArticle(self, url):
        '''
        Crawl article content and statistics from the url
        @return: an Article instance
        '''
        cookies = 'Province=020; City=020; usertrack=c+5+hljsm+B+cg5MA7YDAg==; vjuids=-7517fab.15b5c40e631.0.a042d54907b81; _ntes_nnid=5e90ea8f4ef321150e3b5d43f68870c8,1491901408828; _ntes_nuid=5e90ea8f4ef321150e3b5d43f68870c8; UM_distinctid=15b5c41b7836eb-0fd2f7e510ef22-4e45042e-100200-15b5c41b78461b; __gads=ID=18c804c9f3ead780:T=1491901995:S=ALNI_MYWNxLkcHVgXyExP9eeFcD-mj7SiQ; afpCT=1; CNZZDATA1256734798=337963631-1491900970-http%253A%252F%252Fnews.163.com%252F%7C1492767097; CNZZDATA1256336326=1559830613-1491900088-http%253A%252F%252Fnews.163.com%252F%7C1492765460; vjlast=1491901409.1492754596.11; ne_analysis_trace_id=1492768109053; vinfo_n_f_l_n3=09c375e3d4394d15.1.13.1491901408836.1492766182939.1492768266676; s_n_f_l_n3=09c375e3d4394d151492768109056'
        html = self.session.download(url, encoding='gbk', data=None, timeout=10, retry=3, addr=True, cookies=cookies)
        if html:
            article_url = html['url']
            article_url = re.findall(r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
            self.logger.info(article_url)
            soup = BeautifulSoup(html['html'], 'html.parser')
            main = soup.find('div', attrs={'class': "post_content_main"})
            main1 = soup.find('div', attrs={'class': "ep-content-main"})
            # First page layout
            if main is not None:
                # Get the title
                Ttitle = main.find('h1')
                if Ttitle is None:
                    self.logger.error('[NeteaseNews] Missing title; cannot build article (page layout may have changed), skipping this article')
                    return
                else:
                    Ttitle = Ttitle.text.strip()
                # Get the publish time
                Ttime = main.find('div', attrs={'class': "post_time_source"})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                    Ttime = re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', Ttime)[0]
                else:
                    self.logger.error('[NeteaseNews] Missing publish time; cannot build article (page layout may have changed), skipping this article')
                    return
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'
                # Get the publisher
                Tauthor = main.find('div', attrs={'class': "post_time_source"})
                if Tauthor is not None:
                    Tauthor = Tauthor.find('a')
                    if Tauthor is not None:
                        Tauthor = Tauthor.text.strip()
                    else:
                        Tauthor = None
                # Get the content
                Tcontent = main.find('div', attrs={'class': "post_text"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                    dr = re.compile(r'<[^>]+>', re.S)
                    Tcontent = dr.sub('', Tcontent)
                else:
                    self.logger.error('[NeteaseNews] Missing article content; cannot build article (page layout may have changed), skipping this article')
                    return
                # Get the comment count
                articleid = ""
                try:
                    articleid = re.findall(r'"docId" : "(.*)",', html['html'])[0]
                    re_url = 'http://sdk.comment.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/' + articleid
                    html1 = self.session.download(url=re_url, encoding='utf-8', data=None, isJson=True, timeout=10, retry=3)
                    Treply = html1["tcount"]
                except:
                    Treply = None
                    self.logger.error('[NeteaseComment]url:' + article_url + ', tid:' + articleid + ', ' + traceback.format_exc())
                finally:
                    article = Article(tid=articleid, channel_id=self.channel.channel_id, title=Ttitle,
                                      content=Tcontent, publish_datetime=Ttime, url=article_url,
                                      author_name=Tauthor)
                    article.statistics.reply_count = Treply
                    return article
            # Second page layout
            elif main1 is not None:
                # Title
                Ttitle = main1.find('h1')
                if Ttitle is None:
                    self.logger.error('[NeteaseNews] Missing title; cannot build article (page layout may have changed), skipping this article')
                    return
                else:
                    Ttitle = Ttitle.text.strip()
                # Publish time
                Ttime = main1.find('div', attrs={'class': "ep-time-source cDGray"})
                Ttime1 = main1.find('div', attrs={'class': "ep-info cDGray"})
                if Ttime is not None:
                    Ttime = Ttime.text.strip()
                    Ttime = re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', Ttime)[0]
                elif Ttime1 is not None:
                    Ttime = Ttime1.text.strip()
                    Ttime = re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', Ttime)[0]
                else:
                    self.logger.error('[NeteaseNews] Missing publish time; cannot build article (page layout may have changed), skipping this article')
                    return
                if len(Ttime) == 16:
                    Ttime = Ttime + ':00'
                # Author information
                Tauthor = main1.find('div', attrs={'class': "ep-time-soure cDGray"})
                Tauthor1 = main1.find('div', attrs={'class': "ep-source cDGray"})
                if Tauthor is not None:
                    Tauthor = Tauthor.find('a')
                    if Tauthor is not None:
                        Tauthor = Tauthor.text.strip()
                    else:
                        Tauthor = None
                elif Tauthor1 is not None:
                    Tauthor = Tauthor1.find('span')
                    if Tauthor is not None:
                        Tauthor = Tauthor.text.strip()
                        self.logger.debug(Tauthor)
                        Tauthor = re.findall(r'来源:(.*)"', Tauthor)[0]
                    else:
                        Tauthor = None
                else:
                    Tauthor = None
                # Content
                Tcontent = main1.find('div', attrs={'id': "endText"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                    dr = re.compile(r'<[^>]+>', re.S)
                    Tcontent = dr.sub('', Tcontent)
                else:
                    self.logger.error('[NeteaseNews] Missing article content; cannot build article (page layout may have changed), skipping this article')
                    return
                # Comment count
                articleid = ""
                try:
                    articleid = re.findall(r'"docId" : "(.*)",', html['html'])[0]
                    re_url = 'http://sdk.comment.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/' + articleid
                    html1 = self.session.download(re_url, encoding='utf-8', data=None, isJson=True, timeout=10, retry=3)
                    Treply = html1['tcount']
                except:
                    Treply = None
                    self.logger.error('[NeteaseComment]url:' + article_url + ', tid:' + articleid + ', ' + traceback.format_exc())
                finally:
                    article = Article(tid=articleid, channel_id=self.channel.channel_id, title=Ttitle,
                                      content=Tcontent, publish_datetime=Ttime, url=article_url,
                                      author_name=Tauthor)
                    self.logger.debug("[crawlArticle]" + article.tid)
                    article.statistics.reply_count = Treply
                    return article

    def refreshSearch(self):
        '''
        Reset the search
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset comment crawling
        '''
        pass

    def crawlComment(self, article):
        '''
        Crawl the comments of the given article and return the comment list
        @return: a (commentList, hasnext) tuple; commentList is a list of Comment instances, hasnext indicates whether more comments remain to be crawled
        '''
        cookies = 'Province=020; City=020; usertrack=c+5+hljsm+B+cg5MA7YDAg==; vjuids=-7517fab.15b5c40e631.0.a042d54907b81; _ntes_nnid=5e90ea8f4ef321150e3b5d43f68870c8,1491901408828; _ntes_nuid=5e90ea8f4ef321150e3b5d43f68870c8; UM_distinctid=15b5c41b7836eb-0fd2f7e510ef22-4e45042e-100200-15b5c41b78461b; __gads=ID=18c804c9f3ead780:T=1491901995:S=ALNI_MYWNxLkcHVgXyExP9eeFcD-mj7SiQ; afpCT=1; CNZZDATA1256734798=337963631-1491900970-http%253A%252F%252Fnews.163.com%252F%7C1492767097; CNZZDATA1256336326=1559830613-1491900088-http%253A%252F%252Fnews.163.com%252F%7C1492765460; vjlast=1491901409.1492754596.11; ne_analysis_trace_id=1492768109053; vinfo_n_f_l_n3=09c375e3d4394d15.1.13.1491901408836.1492766182939.1492768266676; s_n_f_l_n3=09c375e3d4394d151492768109056'
        if len(article.tid) != 16:
            articleid = article.tid[3:len(article.tid) - 2]
        else:
            articleid = article.tid
        re_url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/' + articleid + '/comments/newList'
        commentList = list()
        add_datetime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        page = 0
        while page < 750:
            data1 = {
                'offset': page,
                'limit': 30,
                'showLevelThreshold': 72,
                'headLimit': 1,
                'tailLimit': 2,
                'ibc': 'newspc'
            }
            try:
                html1 = self.session.download(url=re_url, encoding='utf-8', cookies=cookies, data=data1, timeout=10, retry=3)
                html1 = json.loads(html1)
                totalcount = html1["newListSize"]
                if totalcount == 0:
                    break
                for i in html1['comments'].itervalues():
                    cid = i['commentId']
                    user_id = i['user']['userId']
                    if user_id == 0:
                        user_name = ''
                    else:
                        user_name = i['user']['nickname']
                    user_ip = ''
                    location = i['user']['location'].replace(u'市', ':').replace(u'自治', ':').replace(
                        u'新区', ':').replace(u'区', ':').replace(u'洲', ':')
                    location_list = location.split(':')
                    location_country = location_list[0]
                    if len(location_list) > 1:
                        location_region = location_list[1]
                    else:
                        location_region = ''
                    if len(location_list) > 2:
                        location_city = location_list[2]
                    else:
                        location_city = ''
                    publish_datetime = i['createTime']
                    like_count = i['vote']
                    unlike_count = i['against']
                    dr = re.compile(r'<[^>]+>', re.S)
                    content = dr.sub('', i['content'])
                    commentList.append(Comment(articleid, self.channel.channel_id, cid, add_datetime,
                                               publish_datetime, user_ip, location_country, location_region,
                                               location_city, user_id, user_name, content,
                                               None, None, like_count, None, unlike_count))
                # print page, totalcount
                if page > int(totalcount):
                    break
                page = page + 30
            except:
                self.logger.error('[NeteaseComment]url:' + article.url + ', tid:' + article.tid + ', ' + traceback.format_exc())
                break
        return (commentList, False)
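# --- Reference sketch (not used above) ------------------------------------------------
# Shows, in isolation, the Netease comment-count lookup used by NeteaseNewsCrawler: the
# article page embeds '"docId" : "<id>"', and the sdk.comment.163.com threads endpoint
# returns JSON whose "tcount" field is the total comment count. The function name and the
# plain urllib2 call are illustrative assumptions; the crawler itself goes through its
# SessionCrawler session with cookies.
def _sketch_fetch_netease_comment_count(doc_id):
    import json
    import urllib2
    re_url = ('http://sdk.comment.163.com/api/v1/products/'
              'a2869674571f77b5a0867c3d71db5856/threads/' + doc_id)
    raw = urllib2.urlopen(re_url, timeout=10).read()
    return json.loads(raw)['tcount']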