def crawlArticle(self, url):
    '''
    Crawl article content and statistics from the given url.
    @return: an Article instance
    '''
    # Check the url format: results coming from a Baidu search are not necessarily
    # question pages, and only question pages are crawled for now.
    if url.find('question') < 0:
        self.logger.warn('Question supported only:%s', url)
        return None
    article_id = re.findall(r'question/(\d+)', url)[0]
    self.session.randomSleep()
    response = self.session.get(url, headers=CRAWL_ARTICLE_HEADERS)
    soup = BeautifulSoup(response)
    main = soup.find('div', attrs={'id': "data"}).attrs['data-state']
    articleJson = json.loads(main)
    questionJson = articleJson['entities']['questions'][article_id]
    title = questionJson['title']
    contentSoup = BeautifulSoup(questionJson['editableDetail'])
    content = contentSoup.text
    author_id = questionJson['author']['id']
    author_name = questionJson['author']['name']
    createTimeInFloat = questionJson['created']
    publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(createTimeInFloat))
    reply_count = questionJson['commentCount']
    read_count = questionJson['visitCount']
    collect_count = questionJson['followerCount']
    article = Article(article_id, self.channel.channel_id, title, content,
                      publish_datetime, url, author_id, author_name)
    article.statistics.reply_count = reply_count
    article.statistics.read_count = read_count
    article.statistics.collect_count = collect_count
    return article
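# A minimal, standalone sketch (hypothetical helper, not part of the crawler class) of the two
# parsing steps used above: extracting the question id from a Zhihu question URL and converting
# the epoch-seconds 'created' field into the '%Y-%m-%d %H:%M:%S' string stored on the Article.
import re
import time

def parse_question_id_and_time(url, created_epoch_seconds):
    match = re.findall(r'question/(\d+)', url)
    if not match:
        return None, None  # not a question url; crawlArticle() returns None in this case
    question_id = match[0]
    publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(created_epoch_seconds))
    return question_id, publish_datetime

# e.g. parse_question_id_and_time('https://www.zhihu.com/question/123456', 1500000000.0)
# -> ('123456', '2017-07-14 ...'), the exact time depending on the local timezone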
def __filterRemovedArticle(self, articleList, entityId, eventId=None):
    '''
    Compare against the remove table and filter out articles already recorded there.
    Returns the list of articles that are NOT present in the remove table.
    '''
    if len(articleList) == 0:
        return []
    if eventId is not None:
        tableName = Constants.TABLE_SA_EVENT_ARTICLE_REMOVE + Constants.TABLE_NAME_DELIMITER + entityId
        eventCondition = ' event_id=%d and ' % eventId
    else:
        tableName = Constants.TABLE_SA_ARTICLE_REMOVE + Constants.TABLE_NAME_DELIMITER + entityId
        eventCondition = ''
    # Look the articles up in the remove table
    selectSql = '''
        SELECT TID, CHANNEL_ID FROM %s where %s (%s)
    '''
    whereClauseList = map(
        lambda article: '(TID="%s" and CHANNEL_ID=%d)' % (article.tid, article.channel_id),
        articleList)
    self.dbProxy.execute(
        selectSql % (tableName, eventCondition, ' or '.join(whereClauseList)))
    resultList = self.dbProxy.fetchall()  # result set of the query
    removedArticleList = map(lambda x: Article(x[0], x[1]), resultList)
    filteredArticle = filter(lambda x: x not in removedArticleList, articleList)
    # self.logger.debug('originalList:%s', map(lambda article: article.tid, articleList))
    # self.logger.debug('removedArticleList:%s', map(lambda article: article.tid, removedArticleList))
    # self.logger.debug('filteredArticle:%s', map(lambda article: article.tid, filteredArticle))
    return filteredArticle
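# A standalone sketch (simplified, hypothetical names, no real DB access) of the filtering idea
# used above: build one OR'ed where-clause per article, then drop every article whose
# (tid, channel_id) pair came back from the remove table. The set-based lookup is a simplification
# of the Article equality used in the method itself.
def build_remove_where_clause(articles):
    # articles: iterable of objects with .tid (str) and .channel_id (int)
    return ' or '.join('(TID="%s" and CHANNEL_ID=%d)' % (a.tid, a.channel_id) for a in articles)

def drop_removed(articles, removed_rows):
    # removed_rows: (tid, channel_id) rows fetched from the remove table
    removed = set((str(tid), int(channel_id)) for tid, channel_id in removed_rows)
    return [a for a in articles if (str(a.tid), int(a.channel_id)) not in removed]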
def __searchByPage(self, keywordList, startTime, endTime, page):
    # Search via Douban's internal group search box
    page = str(page)  # the url expects a str
    # cat: search by topic; sort: newest first; q: search keywords
    search_url = "https://www.douban.com/group/search?start=" + page + "&cat=1013&sort=time&q="
    query = urllib.quote(' '.join(keywordList).encode('utf-8'))
    search_url = search_url + str(query)
    url_page = self.session.get(search_url, headers=self.headers)
    soup = BeautifulSoup(url_page, "lxml")
    # self.logger.debug(soup)
    main_wrap = soup.find('div', attrs={'class': "article"})
    main_article_list = main_wrap.find('div', attrs={'class': "topics"})
    articleList = list()
    hasnext = True
    if main_article_list is not None:
        title_list = main_article_list.findAll('tr', {'class': 'pl'})
        for title in title_list:
            article_publishtime = title.find('td', attrs={'class': "td-time"}).attrs['title']
            urlTime = time.strptime(article_publishtime, "%Y-%m-%d %H:%M:%S")
            Y, M, D, H = urlTime[0:4]
            urlTime2 = datetime.datetime(Y, M, D, H)
            urlTime2 = time.mktime(urlTime2.timetuple())  # compare as a float timestamp
            # the window is fixed to the last 2 days (overrides the startTime argument)
            startTime = endTime - datetime.timedelta(days=2)
            startTimeIntSecond = time.mktime(startTime.timetuple())
            endTimeIntSecond = time.mktime(endTime.timetuple())
            # crawl the topic if it falls inside the time window
            if urlTime2 >= startTimeIntSecond and urlTime2 <= endTimeIntSecond:
                article_url = title.find('a').attrs['href']
                self.logger.debug(article_url)
                (content, author_name, tid, like_count) = self.crawlArticle(article_url)
                if content is None:  # the topic has been deleted or uses another format
                    continue
                article_title = title.find('td', attrs={'class': "td-subject"}).text
                article = Article(tid, self.channel.channel_id, article_title, content,
                                  article_publishtime, article_url, None, author_name)
                self.crawlComment(article)
                reply_count = title.find('td', attrs={'class': "td-reply"}).text.strip()
                reply_count = re.sub(u'回应', '', reply_count)  # strip the Chinese label, keep the digits
                article.statistics.reply_count = reply_count
                article.statistics.like_count = like_count
                if article not in articleList:
                    articleList.append(article)
            else:
                print len(articleList)
                hasnext = False
                break
    return (articleList, hasnext)
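# A minimal standalone sketch (hypothetical helper) of the time-window check used in the loop
# above: the page's publish-time string is parsed with strptime, truncated to the hour, turned
# into epoch seconds with mktime, and compared against the [startTime, endTime] window.
import time
import datetime

def in_time_window(publishtime_str, startTime, endTime):
    # publishtime_str: e.g. '2018-06-01 13:45:10'; startTime/endTime: datetime.datetime objects
    parsed = time.strptime(publishtime_str, '%Y-%m-%d %H:%M:%S')
    Y, M, D, H = parsed[0:4]
    publish_seconds = time.mktime(datetime.datetime(Y, M, D, H).timetuple())
    return time.mktime(startTime.timetuple()) <= publish_seconds <= time.mktime(endTime.timetuple())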
def crawlArticle(self, url, **kwargs):
    '''
    Crawl article content and statistics from the given url.
    @return: an Article instance
    '''
    # TBD: forwarded (reposted) articles are not handled yet
    searchTitle = ''
    referer = None
    if kwargs['flag'] == 0:
        if 'referer' not in kwargs:
            return None
    if 'title' in kwargs:
        searchTitle = kwargs['title']
    else:
        searchTitle = ''
    try:
        self.session_public_article.randomSleep()
        if kwargs['flag'] == 0:
            referer = kwargs['referer']
            headers = HEADERS_ARTICLE.copy()
            headers['Referer'] = referer
            response = self.session_public_article.get(url, textRspOnly=False, headers=headers)
        else:
            response = self.session_public_article.get(url, textRspOnly=False,
                                                       headers=HEADER_PUBLIC_ARTICLE)
        mid = re.findall(r'var mid = .*"(\d+)";', response.text)[0] + '-' + re.findall(
            r'var idx = .*"(\d+)";', response.text)[0]
        soup = BeautifulSoup(response.text, 'lxml')
        main = soup.find('div', {'id': "img-content"})
        title = main.find('h2').text.strip()
        content = main.find('div', {'id': "js_content"}).text.strip()
        profile = main.find('div', {'class': "profile_inner"})
        author_id = profile.find('span').text.strip()
        author_name = profile.find('strong').text.strip()
        publish_datetime = kwargs['publish_datetime']
        article = Article(mid, self.channel.channel_id, title, content,
                          publish_datetime=publish_datetime, url=url,
                          author_id=author_id, author_name=author_name,
                          meta_info='{refer="%s"}' % referer)
        self.logger.debug('Successfully parse article:%s', title)
        return article
    except:
        self.logger.error('Fail to get article for %s: %s due to %s', url, searchTitle,
                          traceback.format_exc())
        return None
def __fetchOldArticleList(self, articleList, articleCount=100):
    '''
    Fetch, from the global article table, articles that are not yet extinct and that are
    not among the articles crawled in this round.
    '''
    # query for the total row count (used to compute the number of pages)
    selectSql_count = 'SELECT COUNT(*) FROM %s where extinct="N" and channel_id=%d '
    sql2 = selectSql_count % (Constants.TABLE_SA_ARTICLE, self.channel.channel_id)
    # query that fetches the old articles themselves
    selectSql = 'SELECT TID,title, publish_datetime,url, meta_info FROM %s where extinct="N" and channel_id=%d '
    sql = selectSql % (Constants.TABLE_SA_ARTICLE, self.channel.channel_id)
    if len(articleList) > 0:
        whereClauseList = map(lambda article: ' tid<>"%s" ' % (article.tid), articleList)
        sql += ' and (%s)' % (' and '.join(whereClauseList))
        sql2 += ' and (%s)' % (' and '.join(whereClauseList))
    sql2 += ' order by add_datetime desc'
    self.dbProxy.execute(sql2)
    resultList2 = self.dbProxy.fetchall()
    # fetchall() returns something like ((53,),); pull the digits out as a list
    resultList2 = re.findall(r'\d+', str(resultList2))
    if int(resultList2[0]) > int(articleCount):
        randpage = random.randint(0, int(math.ceil(float(resultList2[0]) / articleCount)))
    else:
        randpage = 0
    # pick a random page of the result set; the LIMIT offset is page index * page size
    sql += ' order by add_datetime desc limit %d,%d' % (randpage * articleCount, articleCount)
    self.dbProxy.execute(sql)
    resultList = self.dbProxy.fetchall()
    return map(
        lambda item: Article(item[0], self.channel.channel_id, title=item[1],
                             publish_datetime=item[2], url=item[3], meta_info=item[4]),
        resultList)
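# A small standalone sketch (hypothetical names) of the random-page selection above: given the
# total row count and a page size, pick a random page index and derive the LIMIT offset/row-count
# pair that gets appended to the SQL.
import math
import random

def random_limit_clause(total_rows, page_size=100):
    if total_rows > page_size:
        page = random.randint(0, int(math.ceil(float(total_rows) / page_size)))
    else:
        page = 0
    return ' limit %d,%d' % (page * page_size, page_size)

# e.g. random_limit_clause(530, 100) -> ' limit 300,100' (the offset varies with the random page)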
def __filterRemovedArticle(self, articleList, entityId, eventId=None):
    '''
    Compare against the remove table and filter out articles already recorded there.
    When eventId is given, articles published outside the event's time window are dropped first.
    Returns the list of articles that are NOT present in the remove table.
    '''
    if len(articleList) == 0:
        return []
    if eventId is not None:
        tableName = Constants.TABLE_SA_EVENT_ARTICLE_REMOVE + Constants.TABLE_NAME_DELIMITER + entityId
        eventCondition = ' event_id=%d and ' % eventId
        start_datetime, end_datetime = self.__fetchEventTime(entityId, eventId)
        # drop articles published outside the event's start/end window
        article_new_list = list()
        for article in articleList:
            if (str(article.publish_datetime) > str(start_datetime)) and (
                    str(article.publish_datetime) < str(end_datetime)):
                article_new_list.append(article)
        articleList = article_new_list
        if len(articleList) == 0:  # nothing left to check against the remove table
            return []
    else:
        tableName = Constants.TABLE_SA_ARTICLE_REMOVE + Constants.TABLE_NAME_DELIMITER + entityId
        eventCondition = ''
    # Look the articles up in the remove table
    selectSql = '''
        SELECT TID, CHANNEL_ID FROM %s where %s (%s)
    '''
    whereClauseList = map(
        lambda article: '(TID="%s" and CHANNEL_ID=%d)' % (article.tid, article.channel_id),
        articleList)
    self.dbProxy.execute(
        selectSql % (tableName, eventCondition, ' or '.join(whereClauseList)))
    resultList = self.dbProxy.fetchall()  # result set of the query
    removedArticleList = map(lambda x: Article(x[0], x[1]), resultList)
    filteredArticle = filter(lambda x: x not in removedArticleList, articleList)
    return filteredArticle
def __seperateNewOldArticles(self, articleList, entityId=None):
    '''
    Query the global article table and split the input into existing and new articles.
    '''
    if len(articleList) == 0:
        return ([], [])
    if entityId is None:
        selectSql = 'select tid, channel_id from %s where ' % Constants.TABLE_SA_ARTICLE
    else:
        selectSql = 'select tid, channel_id from %s where ' % (
            Constants.TABLE_SA_ARTICLE + Constants.TABLE_NAME_DELIMITER + entityId)
    whereClauseList = map(
        lambda article: '(tid="%s" and channel_id=%d)' % (article.tid, article.channel_id),
        articleList)
    self.dbProxy.execute(selectSql + ' or '.join(whereClauseList))
    resultList = map(lambda x: Article(x[0], x[1]), self.dbProxy.fetchall())
    existingArticleList = filter(lambda x: x in resultList, articleList)
    newArticleList = filter(lambda x: x not in resultList, articleList)
    return (existingArticleList, newArticleList)
def crawlArticle(self, url): ''' 根据url爬取文章内容和统计信息 :param url: :return:返回一个Article实例 ''' cookies = None html = self.session.download(url=url, encoding='utf-8', data=None, timeout=10, retry=3, addr=True, cookies=cookies) if html: article_url = html['url'] article_url = re.findall( r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0] article_id = re.findall(r'c_\d+', article_url)[0] article_id = article_id[2:] soup = BeautifulSoup(html['html'], "html.parser") main1 = soup.find('div', attrs={'class': "widthMain main"}) main2 = soup.find('div', attrs={'class': "main pagewidth"}) main3 = soup.find('body', attrs={'class': "streamline-page"}) main4 = soup.find('div', attrs={'class': "h-title"}) main5 = soup.find('div', attrs={'id': "article"}) main6 = soup.find('div', attrs={'id': "Title"}) main7 = soup.find('div', attrs={'class': "article"}) if main1 is not None: self.logger.debug("main1") Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain1(main1) if (Ttitle != None and Ttime != None and Tcontent != None): article = Article(tid=article_id, channel_id=self.channel.channel_id, title=Ttitle, author_name=Tauthor, publish_datetime=Ttime, content=Tcontent, url=article_url) return article if main2 is not None: self.logger.debug("main2") Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain2(main2) if (Ttitle != None and Ttime != None and Tcontent != None): article = Article(tid=article_id, channel_id=self.channel.channel_id, title=Ttitle, author_name=Tauthor, publish_datetime=Ttime, content=Tcontent, url=article_url) return article if main3 is not None: self.logger.debug("main3") Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain3(main3) if (Ttitle != None and Ttime != None and Tcontent != None): article = Article(tid=article_id, channel_id=self.channel.channel_id, title=Ttitle, author_name=Tauthor, publish_datetime=Ttime, content=Tcontent, url=article_url) return article if main3 is None and main4 is not None: self.logger.debug("main4") Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain4(soup) if (Ttitle != None and Ttime != None and Tcontent != None): article = Article(tid=article_id, channel_id=self.channel.channel_id, title=Ttitle, author_name=Tauthor, publish_datetime=Ttime, content=Tcontent, url=article_url) return article if main5 is not None: self.logger.debug("main5") Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain5(main5) if (Ttitle != None and Ttime != None and Tcontent != None): article = Article(tid=article_id, channel_id=self.channel.channel_id, title=Ttitle, author_name=Tauthor, publish_datetime=Ttime, content=Tcontent, url=article_url) return article if main6 is not None: self.logger.debug("main6") Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain6(soup) if (Ttitle != None and Ttime != None and Tcontent != None): article = Article(tid=article_id, channel_id=self.channel.channel_id, title=Ttitle, author_name=Tauthor, publish_datetime=Ttime, content=Tcontent, url=article_url) return article if main7 is not None: self.logger.debug("main7") Ttitle, Ttime, Tauthor, Tcontent = self.crawlMain7(soup) if (Ttitle != None and Ttime != None and Tcontent != None): article = Article(tid=article_id, channel_id=self.channel.channel_id, title=Ttitle, author_name=Tauthor, publish_datetime=Ttime, content=Tcontent, url=article_url) return article if (main1 is None and main2 is None and main3 is None and main4 is None and main5 is None and main6 is None and main7 is None): self.logger.error(u"存在另外一种html格式::%s", url) return
def crawlArticle(self, url): ''' 根据url爬取文章内容和统计信息 @return: 返回一个Article实例 ''' article = None html = self.session.download(url, encoding='utf-8', data=None, isJson=False, timeout=10, retry=3, addr=True) article_url = html['url'] # if article_url.find(self.channel.url)<0: # self.logger.warn('Unrelated url found:%s',url) # continue # self.logger.debug(article_url) soup = BeautifulSoup(html['html'], 'html.parser') noResultDivList = soup.findAll('div', {'class': 'pl_noresult'}) if len(noResultDivList) > 0: hasnext = False self.logger.info('No result') return article main = soup.find('div', attrs={'class': "left_section"}) if main: Ttitle = main.find('div', attrs={ 'id': "j_core_title_wrap" }).find('h1') Ttitle1 = main.find('div', attrs={ 'id': "j_core_title_wrap" }).find('h3') if Ttitle: Ttitle = Ttitle.text.strip() elif Ttitle1: Ttitle = Ttitle1.text.strip() else: Ttitle = '' # self.logger.debug(u'标题%s',Ttitle) data_field = main.find('div', attrs={ 'id': "j_p_postlist" }).find('div').attrs['data-field'].strip() data_field = json.loads(data_field) publish_datetime = data_field['content'] if 'date' in publish_datetime.keys(): publish_datetime = publish_datetime['date'] else: publish_datetime = main.find('div', attrs={ 'id': "j_p_postlist" }).find('div').find_all( 'span', attrs={'class': "tail-info" })[-1].text.strip() publish_datetime = re.findall( r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)', publish_datetime)[0] if len(publish_datetime[4]) > 1: publish_datetime = publish_datetime[ 0] + '-' + publish_datetime[1] + '-' + publish_datetime[ 2] + ' ' + publish_datetime[3] + publish_datetime[4] else: publish_datetime = publish_datetime[ 0] + '-' + publish_datetime[1] + '-' + publish_datetime[ 2] + ' ' + publish_datetime[3] + ':00' Tid = data_field['author']['user_id'] Tauthor = data_field['author']['user_name'] Treply = soup.find('li', attrs={ 'class': "l_reply_num" }).find('span').text.strip() #总回复数 Tcontent = main.find('div', attrs={ 'id': "j_p_postlist" }).find('div').find('cc').text.strip() article = Article(Tid, self.channel.channel_id, Ttitle, Tcontent, publish_datetime, url=article_url, author_id=None, author_name=Tauthor, meta_info=None) article.statistics.reply_count = Treply else: self.logger.warn(u'很抱歉,该贴已被删除。%s', article_url) return article
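# A standalone sketch (hypothetical helper) of the publish-time normalisation this method aims at:
# the regex r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)' returns a tuple whose last group
# is the optional ':SS' part; when it is missing, ':00' is appended so every value becomes a full
# 'YYYY-MM-DD HH:MM:SS' string.
import re

def normalize_publish_time(raw):
    groups = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)', raw)[0]
    base = '%s-%s-%s %s' % (groups[0], groups[1], groups[2], groups[3])
    return base if len(groups[4]) > 1 else base + ':00'

# e.g. normalize_publish_time(u'2018-06-01 13:45') -> '2018-06-01 13:45:00'
#      normalize_publish_time(u'2018-06-01 13:45:27') -> '2018-06-01 13:45:27'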
def crawlArticle(self, url): ''' 根据url爬取文章内容和统计信息 @return: 返回一个Article实例 ''' html = self.session.download(url, encoding='utf-8', data=None, isJson=False, timeout=10, retry=3, addr=True) article_url = html['url'] self.logger.debug(article_url) soup = BeautifulSoup(html['html'], 'html.parser') main = soup.find('div', attrs={'id': "bd"}) main1 = soup.find('div', attrs={ 'class': "wd-question" }) #论坛提问帖子http://bbs.tianya.cn/post-730-5795-1-1.shtml article = None if main: Ttitle = main.find('h1').find('span').text Ttime = main.find('div', attrs={ 'class': "atl-info" }).find_all('span')[1].text.strip() Ttime = re.sub(u'[\u4e00-\u9fa5]+:', '', Ttime) Tid = main.find('div', attrs={ 'class': "atl-info" }).find_all('span')[0].find('a').attrs['uid'].strip() Tauthor = main.find('div', attrs={ 'class': "atl-info" }).find_all('span')[0].find('a').attrs['uname'].strip() Tclick = main.find('div', attrs={ 'class': "atl-info" }).find_all('span')[2].text.strip() Tclick = re.sub(u'[\u4e00-\u9fa5]+:', '', Tclick) Tclick = int(Tclick) Treply = main.find('div', attrs={ 'class': "atl-info" }).find_all('span')[3].text.strip() Treply = re.sub(u'[\u4e00-\u9fa5]+:', '', Treply) Treply = int(Treply) Tlike = main.find('a', attrs={'class': "zantop"}).attrs['_count'] Tcontent = main.find('div', attrs={ 'class': "bbs-content clearfix" }).text.strip() article = Article(Tid, self.channel.channel_id, Ttitle, Tcontent, Ttime, url=article_url, author_id=None, author_name=Tauthor) article.statistics.reply_count = Treply article.statistics.click_count = Tclick article.statistics.like_count = Tlike elif main1: Ttitle = main1.find('h1').find('span').text Ttime = main1.find('div').attrs['js_replytime'] Ttime = re.findall(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', Ttime)[0] Tid = main1.find('div').attrs['_host'] Tauthor = main1.find('div', attrs={ 'class': "q-info" }).find('a').text Tclick = main1.find('div').attrs['js_clickcount'] Treply = main1.find('div').attrs['js_powerreply'] Tcontent = main1.find('div', attrs={'class': "q-content atl-item"}) if Tcontent: Tcontent = Tcontent.find('div', attrs={ 'class': "text" }).text.strip() else: Tcontent = '' article = Article(Tid, self.channel.channel_id, Ttitle, Tcontent, Ttime, url=article_url, author_id=None, author_name=Tauthor) article.statistics.reply_count = Treply article.statistics.click_count = Tclick return article
def crawlArticle(self, req_data): hasnext = False req_url = 'http://120.55.151.61/Treehole/V4/Cave/getList.action' data_r = self.s.post(url=req_url, data=req_data, headers=self.headers) data_r = data_r.text true = True false = False data_j = eval(data_r) data_js = json.dumps(data_j) data_dict = simplejson.loads(data_js) data = data_dict['data'] timestamp = data['timestampLong'] timestampLong = data['timestampLong'] / 1000 timestampLong_struct = datetime.datetime.fromtimestamp(timestampLong) timestampLong_str = timestampLong_struct.strftime('%Y-%m-%d %H:%M:%S') messageBO = data['messageBOs'] for each in messageBO: # 20条记录保存在一组里 if 'studentBO' in each: content = each.get('content', False) if content: publish_timeStamp = each['issueTime'] / 1000 datetime_struct = datetime.datetime.fromtimestamp( publish_timeStamp) publish_time = datetime_struct.strftime( '%Y-%m-%d %H:%M:%S') if datetime.datetime.strptime( publish_time, '%Y-%m-%d %H:%M:%S') < self.startTime: hasnext = False break schoolName = each['schoolName'] readCount = each['readCount'] relayedCount = each['relayedCount'] reply_count = each['comments'] content = each['content'] if len(content) > 8: title = content[:8] + '...' elif len(content) <= 0: title = u'内容为图片' else: title = content studentBO = each['studentBO'] studentBO_id = studentBO['studentId'] studentBO_name = studentBO['nickName'] likeCount = each['likeCount'] article_id = each['messageId'] publish_method = each['source'] publish_method = publish_method[publish_method.find('_') + 1:] data2 = 'platform=1&phoneBrand=HONOR&phoneVersion=24&versionNumber=9.1.1&phoneModel=BLN-AL10&sharePlatform=4&channel=huaweiMarket&plateId=1&messageId=' + str( article_id) + '&' article_urlPOST = 'http://120.55.151.61/Treehole/V4/Message/preShare.action' data_urlPOST = self.s.post(url=article_urlPOST, data=data2, headers=self.headers) data3 = data_urlPOST.text data_j3 = eval(data3) data_js3 = json.dumps(data_j3) data_dict3 = simplejson.loads(data_js3) data4 = data_dict3['data'] article_url = data4['shareUrl'] article = Article(article_id, self.channel.channel_id, title, content, publish_time, article_url, author_id=studentBO_id, author_name=studentBO_name, publish_method=publish_method, entity=schoolName) article.statistics.reply_count = reply_count article.statistics.like_count = likeCount article.statistics.read_count = readCount article.statistics.forward_count = relayedCount # print article.__str__() if article is not None and article not in self.articleList: self.articleList.append(article) # print datetime.datetime.strptime(timestampLong_str,'%Y-%m-%d %H:%M:%S') # print '1111111' if datetime.datetime.strptime(timestampLong_str, '%Y-%m-%d %H:%M:%S') < self.startTime: hasnext = False return ('', hasnext) else: hasnext = True res_data = 'timestamp=' + str( timestamp ) + '&preMoodTimestap=' + str( timestamp ) + '&platform=1&phoneBrand=HONOR&phoneVersion=24&versionNumber=9.1.1&phoneModel=BLN-AL10&type=1&channel=huaweiMarket&' return (res_data, hasnext)
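# A standalone sketch (hypothetical helper) of the timestamp handling used above: the API returns
# millisecond epochs (issueTime, timestampLong), which are divided by 1000 and formatted with
# '%Y-%m-%d %H:%M:%S' before being compared against self.startTime.
import datetime

def millis_to_datetime_str(millis):
    return datetime.datetime.fromtimestamp(millis / 1000.0).strftime('%Y-%m-%d %H:%M:%S')

# e.g. millis_to_datetime_str(1530000000000) -> '2018-06-26 ...' depending on the local timezone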
def __searchByPage(self, keywordList, startTime, endTime, page): # 在豆瓣内部搜索框搜索 page = str(page) #url接收的是str格式 query = urllib.quote(' '.join(keywordList).encode('utf-8')) params = { 'c': 'blog', 'range': 'article', # 包含全部 'by': 'all', 'sort': 'time', 'col': '', 'source': '', 'from': '', 'country': '', 'size': '', 'time': '', 'a': '', 'isown': 'false', 'page': page, 'dpc': '', 'q': query } search_url = "http://search.sina.com.cn/" url_page = self.session.get(search_url, params=params, headers=self.headers) soup = BeautifulSoup(url_page, "lxml") # self.logger.debug(soup) main_wrap = soup.find('div', attrs={'class': "result-boxes"}) if main_wrap is None: #为了防止因网络问题而导致的查找不到 self.logger.debug(u'第一次查找没有结果再找一遍中') url_page = self.session.get(search_url, params=params, headers=self.headers) soup = BeautifulSoup(url_page, "lxml") main_wrap = soup.find('div', attrs={'class': "result-boxes"}) if main_wrap is None: #再找一遍如果还是没有才可以算没有结果 self.logger.debug(u'第二次查找没有结果,该关键词没有结果') return (list(), False) main_article_list = main_wrap.findAll( 'div', attrs={'class': "box-result clearfix"}) hasnext = True if main_article_list is not None: print '1' for title in main_article_list: print '2' article_publishtime_wrap = title.findAll('p')[1] article_publishtime = article_publishtime_wrap.find( 'span', attrs={ 'class': "fgray_time" }).text.strip() self.logger.error(article_publishtime) urlTime = time.strptime(article_publishtime, "%Y-%m-%d %H:%M:%S") Y, M, D, H = urlTime[0:4] urlTime2 = datetime.datetime(Y, M, D, H) urlTime2 = time.mktime(urlTime2.timetuple()) #转换成时间戳来比较 float类型 startTime = endTime - datetime.timedelta(days=5) startTimeIntSecond = time.mktime(startTime.timetuple()) endTimeIntSecond = time.mktime(endTime.timetuple()) #如果符合时间范围就爬取 if urlTime2 >= startTimeIntSecond and urlTime2 <= endTimeIntSecond: print '3' title_wrap = title.find('h2') article_url_parent = title_wrap.find('a').attrs['href'] article_url_index = article_url_parent.rfind('/') article_url_child = article_url_parent[article_url_index + 1:] article_url = 'http://blog.sina.com.cn/s/blog_' + article_url_child + '.html' Ttitle = title_wrap.find('a').text.strip() self.logger.debug(article_url) meta_info = article_url_child (content, reply_count, like_count, read_count, collect_count, forward_count) = self.crawlArticle( article_url, meta_info) self.logger.debug(like_count) if content is None: # 话题已被删除或则其他格式 print '756236' continue tid = 't_' + article_url_child author_name = article_publishtime_wrap.find( 'a', attrs={ 'class': "rib-author" }).text article = Article(tid, self.channel.channel_id, Ttitle, content, article_publishtime, article_url, None, author_name, meta_info=meta_info) # self.crawlComment(article) article.statistics.reply_count = reply_count article.statistics.like_count = like_count article.statistics.read_count = read_count article.statistics.collect_count = collect_count article.statistics.forward_count = forward_count # self.logger.debug(article) if article not in self.articleList: self.articleList.append(article) else: print '78956' print len(self.articleList) hasnext = False break return hasnext
class JianShuCrawler(object): def __init__(self, channel, logger=None): ''' :param channel: :param logger: ''' if logger is None: self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT) else: self.logger = logger self.channel = channel #获取媒体实体 # 设置请求头和代理 self.headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Host': 'www.jianshu.com', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36', } self.headers_1 = { 'Host': 'www.jianshu.com', 'Connection': 'keep-alive', 'Connection-Length': '0', 'Accept': 'application/json', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36', 'Accept-Encoding': 'gzip,deflate,br', 'Accept-Language': 'zh-CN,zh;q=0.9' } #用户信息 self.usename = '13432881156' self.password = '******' #用cookie进行登录 self.session = SessionCrawler() self.session.cookies = cookielib.LWPCookieJar( filename='cookie') #读取cookie try: self.session.cookies.load(ignore_discard=True) except: print('未能加载cookie') if self.islogin(): print('已经登录简书') else: self.login(self.usename, self.password) def searchArticle(self, keywordList, endTime): ''' 根据关键字数组,开始时间和结束时间范围搜索文章 :param keywordList: 关键字数组 :param endTime: 搜索时间范围结束 :return: ''' if self.islogin(): page = 0 # 页数 articleList = list() hasnext = True # 开始爬虫 while hasnext: page += 1 if page == 5: break (articleListInPage, hasnext) = self._searchByPage(keywordList, endTime, page) is_article = list() for article in articleListInPage: if article not in articleList: is_article.append(article) articleList.extend(is_article) print(len(articleList)) self.logger.debug('总共抓取文章有:%s' % len(articleList)) return articleList else: self.logger.error(u'未登录简书,请确认登录') return None def _searchByPage(self, keywordList, endTime, page): hasnext = True articleList = list() page = str(page) # 在简书内部搜索框搜索 search_url_1 = "https://www.jianshu.com/search/do?q=" search_url_2 = "&type=note&page=" + page + "&order_by=published_at&time_range=a_day" # q:是关键词搜索,page:页数,order_by是排序,time_range是按天排序 query = urllib.quote(' '.join(keywordList).encode('utf-8')) search_url = search_url_1 + str(query) + search_url_2 # 抓取页面的网址 csrf = self.getCsrf(keywordList, page) post_data = {'X-CSRF-Token': csrf} # 获取页面信息 url_page = self.session.post(search_url, headers=self.headers_1, data=post_data, textRspOnly=False) if url_page.status_code == 200: self.logger.debug(u'已经获取中大新闻页面') else: self.logger.debug(u'中大新闻获取完毕') return (articleList, False) #已经爬取到新闻界面的最后一页 # 以下对中大新闻页面进行解析 articleJson = None try: articleJson = json.loads(url_page.text) except Exception, e: self.logger.error(e) if articleJson is None: self.logger.error('articleJson is None') return allInfo = articleJson['entries'] # allinfo代表字典0:{}\1:{}..... 
for info in allInfo: pre_publishedTime = info["first_shared_at"] publishedTime_1 = ''.join( re.findall('[^A-Za-z]', pre_publishedTime[0:-5])) # 文章发表时间 publishedTime = publishedTime_1[0:10] + ' ' + publishedTime_1[10:18] #print(publishedTime) urlTime = time.strptime(publishedTime, '%Y-%m-%d %H:%M:%S') Y, M, D, H = urlTime[0:4] urlTime2 = datetime.datetime(Y, M, D, H) # 转化成时间戳来比较float urlTime2 = time.mktime(urlTime2.timetuple()) # 文章发表时间的时间戳 startTime = endTime - datetime.timedelta( days=self.channel.search_ranges) startTimeIntSecond = time.mktime(startTime.timetuple()) # 开始爬取的时间戳 endTimeIntSecond = time.mktime(endTime.timetuple()) # 结束爬取的时间错 if urlTime2 >= startTimeIntSecond and urlTime2 <= endTimeIntSecond: # 获取文章链接,爬取文章内容 pre_article_url = info['slug'] articleUrl = "https://www.jianshu.com/p/" + pre_article_url # 文章的链接 (content, authorName, title) = self.crawlArticle(articleUrl) #爬取文章内容 #print(title) if content is None: # 话题已被删除或则其他格式 print("没有爬到文章") continue authorId = info["user"]["id"] #作者id likeCount = info["likes_count"] # 点赞数 readCount = info["views_count"] # 文章阅读数 replyCount = info["public_comments_count"] # 文章评论数 tid = info["id"] # 文章的id article = Article(tid, self.channel.channel_id, title, content, publishedTime, articleUrl, authorId, authorName) #self.crawlComment(article) article.statistics.reply_count = replyCount article.statistics.like_count = likeCount article.statistics.read_count = readCount if (article is not None) and (article not in articleList): articleList.append(article) else: print('结束爬虫') hasnext = False break return (articleList, hasnext)
def crawlArticle(self, url): ''' 根据url爬取文章内容和统计信息 @return: 返回一个Article实例 ''' cookies = 'U_TRS1=000000fa.9fe376b4.58573ebc.bde2f2c3; UOR=,vip.stock.finance.sina.com.cn,; vjuids=3923fcfb8.15914cd122a.0.e347599b65a6; SINAGLOBAL=183.63.92.250_1482112700.861930; SUB=_2AkMvC7H0f8NhqwJRmP4WzWzrb4xwzgnEieLBAH7sJRMyHRl-yD83qlNetRBAqqE4nv4pjjxQaUfLZo_Os-Bxsw..; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFZzJ6nbHTRfVEqOXp-S.5z; SGUID=1482721389362_efec0e8d; vjlast=1488765553.1489054965.10; bdshare_firstime=1492414283526; _ct_uid=58f46f61.537a7929; lxlrtst=1492423120_o; rotatecount=2; Apache=59.42.29.149_1492670298.869113; ULV=1492670299361:18:6:6:59.42.29.149_1492670298.869113:1492670298484; afpCT=1; CNZZDATA1252916811=1442218969-1492654141-http%253A%252F%252Fnews.sina.com.cn%252F%7C1492664941; UM_distinctid=15b8a154522e79-0a3f79bddc9d05-4e45042e-100200-15b8a154523a49; CNZZDATA5399792=cnzz_eid%3D349789736-1492650802-http%253A%252F%252Fnews.sina.com.cn%252F%26ntime%3D1492667002; U_TRS2=00000095.1c285e96.58f85761.e07aa962; lxlrttp=1492423120' html = self.session.download(url, encoding='utf-8', data=None, timeout=10, retry=3, addr=True, cookies=cookies) if html: article_url = html['url'] article_url = re.findall(r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0] self.logger.info('[SinaNews]'+article_url) #获取发布时间 date = re.findall(r'/(\d{4}-\d{2}-\d{2})/', article_url) if len(date) == 0: return None # if date[0] < '2015-07-01': # html = self.session.download(url, encoding='gbk', data=None, timeout=10, retry=3, addr=True) soup = BeautifulSoup(html['html'], 'lxml') main = soup.find('div', attrs={'class': "wrap-inner"}) main1 = soup.find('div', attrs={'class': "Main clearfix"}) main2 = soup.find('div', attrs ={'class': "main-content w1240"}) #第一种网页格式 if main is not None: self.logger.debug('走第一种格式') #获取标题 Ttitle = main.find('h1', attrs={'id': "artibodyTitle"}) if Ttitle is None: self.logger.error('[SinaNews]' + '缺少标题,无法构成文章,可能已被修改格式,本文停止爬取') return else: Ttitle = Ttitle.text.strip() #获取发布时间 Ttime = main.find('span', attrs={'class': 'time-source'}) if Ttime is not None: Ttime = Ttime.text.strip() Ttime = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}).*', Ttime)[0] Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[2] + ' ' + Ttime[3] else: self.logger.error('[SinaNews]' + '缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取') return if len(Ttime) == 16: Ttime = Ttime + ':00' #获取作者信息 Tauthor = soup.find('span', attrs={'class': "time-source"}) if Tauthor is not None: Tauthor = Tauthor.find('a') if Tauthor is not None: Tauthor = Tauthor.text.strip() else: Tauthor = None else: Tauthor = None #获取内容 Tcontent = main.find('div', attrs={'id': "artibody"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n|\t', '', Tcontent) else: self.logger.error('[SinaNews]' + '缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取') return #获取评论数 try: channel = re.findall(r"channel: '(.*)',", html['html'])[0] newsid = re.findall(r"newsid: '(.*)',", html['html'])[0] data = { 'format': 'js', 'channel': channel, 'newsid': newsid, 'group': '', 'compress': '1', 'ie': 'gbk', 'oe': 'gbk', 'page': '1', 'page_size': '20' } re_url = 'http://comment5.news.sina.com.cn/page/info' html1 = self.session.download(re_url, encoding='utf-8', data=data, isJson=False, timeout=10, retry=3, addr=False) html1 = re.sub(r'(.*=)\{', '{', html1) html1 = json.loads(html1) totalcount = html1['result']['count']['show'] Treply = totalcount except: Treply = None # 获取文章的id articleid = re.findall(r'([a-z]{8}\d{7})', article_url)[0] article = Article(tid=articleid, 
channel_id=self.channel.channel_id, title=Ttitle, publish_datetime=Ttime, content=Tcontent, url=article_url, author_name=Tauthor) article.statistics.reply_count = Treply return article #第二种网页格式 elif main1 is not None: self.logger.debug('走第二种格式') #获取标题 Ttitle = main1.find('h1', attrs={'id': "artibodyTitle"}) if Ttitle is None: self.logger.error('[SinaNews]' + '缺少标题,无法构成文章,可能已被修改格式,本文停止爬取') return else: Ttitle = Ttitle.text.strip() #获取时间 Ttime = main1.find('span', attrs={'id': "pub_date"}) if Ttime is not None: Ttime = Ttime.text.strip() Ttime = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2})', Ttime)[0] Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[2] + ' ' + Ttime[3] else: self.logger.error('[SinaNews]' + '缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取') return if len(Ttime) == 16: Ttime = Ttime + ':00' #获取发布者 Tauthor = main1.find('span', attrs={'id': "media_name"}) if Tauthor is not None: Tauthor = Tauthor.find('a').text.strip() else: Tauthor = None #获取内容 Tcontent = main1.find('div', attrs={'id': "artibody"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n|\t', '', Tcontent) else: self.logger.error('[SinaNews]' + '缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取') return try: channel = re.findall(r"channel: '(.*)',", html['html'])[0] newsid = re.findall(r"newsid: '(.*)',", html['html'])[0] data = { 'format': 'js', 'channel': channel, 'newsid': newsid, 'group': '', 'compress': '1', 'ie': 'gbk', 'oe': 'gbk', 'page': '1', 'page_size': '20' } re_url = 'http://comment5.news.sina.com.cn/page/info' html1 = self.session.download(re_url, encoding='utf-8', data=data, isJson=False, timeout=10, retry=3, addr=False) html1 = re.sub(r'(.*=)\{', '{', html1) html1 = json.loads(html1) totalcount = html1['result']['count']['show'] Treply = totalcount except: Treply = None # 获取文章的id articleid = re.findall(r'([a-z]{8}\d{7})', article_url)[0] article = Article(tid=articleid, channel_id=self.channel.channel_id,title=Ttitle, content=Tcontent, publish_datetime=Ttime, url=article_url, author_name=Tauthor) article.statistics.reply_count = Treply return article #第三种网页格式 elif main2 is not None: self.logger.debug(u'第三种格式') #获取标题 Ttitle = main2.find('div', attrs={'class': "second-title"}) if Ttitle is None: self.logger.error('[SinaNews]' + '缺少标题,无法构成文章,可能已被修改格式,本文停止爬取') return else: Ttitle = Ttitle.text.strip() # 获取时间 Ttime = main2.find('span', attrs={'class': "date"}) if Ttime is not None: Ttime = Ttime.text.strip() Ttime = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D ' '(\d{2}:\d{2}).*', Ttime)[0] Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[2] + ' ' + Ttime[3] else: self.logger.error('[SinaNews]' + '缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取') return if len(Ttime) == 16: Ttime = Ttime + ':00' # 获取发布者 Tauthor = main2.find('a', attrs={'class': "source"}) if Tauthor is not None: Tauthor = Tauthor.text.strip() else: Tauthor = None # 获取内容 Tcontent = main2.find('div', attrs={'id': "article"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n|\t', '', Tcontent) else: self.logger.error('[SinaNews]' + '缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取') return # 获取评论数 try: channel = re.findall(r"channel: '(.*)',", html['html'])[0] newsid = re.findall(r"newsid: '(.*)',", html['html'])[0] data = { 'format': 'js', 'channel': channel, 'newsid': newsid, 'group': '', 'compress': '1', 'ie': 'gbk', 'oe': 'gbk', 'page': '1', 'page_size': '20' } re_url = 'http://comment5.news.sina.com.cn/page/info' html1 = self.session.download(re_url, encoding='utf-8', data=data, isJson=False, timeout=10, retry=3, addr=False) html1 = 
re.sub(r'(.*=)\{', '{', html1) html1 = json.loads(html1) totalcount = html1['result']['count']['show'] Treply = totalcount except: Treply = None # 获取文章的id articleid = re.findall(r'([a-z]{8}\d{7})', article_url)[0] article = Article(tid=articleid, channel_id=self.channel.channel_id, title=Ttitle, publish_datetime=Ttime, content=Tcontent, url=article_url, author_name=Tauthor) article.statistics.reply_count = Treply return article
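# A standalone sketch (hypothetical helper, payload format assumed from the code above) of the
# comment-count parsing repeated in the three format branches: the comment5.news.sina.com.cn
# endpoint returns a "var xxx={...}" style payload, so everything up to the opening '{' is
# stripped before json.loads(), and the visible count is read from result.count.show.
import json
import re

def parse_sina_comment_count(payload):
    # e.g. payload = 'var data={"result": {"count": {"show": 12}}}'
    cleaned = re.sub(r'(.*=)\{', '{', payload)
    return json.loads(cleaned)['result']['count']['show']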
def crawlArticle(self, url): ''' 根据url爬取文章内容和统计信息 @return: 返回一个Article实例 ''' cookies = 'Province=020; City=020; usertrack=c+5+hljsm+B+cg5MA7YDAg==; vjuids=-7517fab.15b5c40e631.0.a042d54907b81; _ntes_nnid=5e90ea8f4ef321150e3b5d43f68870c8,1491901408828; _ntes_nuid=5e90ea8f4ef321150e3b5d43f68870c8; UM_distinctid=15b5c41b7836eb-0fd2f7e510ef22-4e45042e-100200-15b5c41b78461b; __gads=ID=18c804c9f3ead780:T=1491901995:S=ALNI_MYWNxLkcHVgXyExP9eeFcD-mj7SiQ; afpCT=1; CNZZDATA1256734798=337963631-1491900970-http%253A%252F%252Fnews.163.com%252F%7C1492767097; CNZZDATA1256336326=1559830613-1491900088-http%253A%252F%252Fnews.163.com%252F%7C1492765460; vjlast=1491901409.1492754596.11; ne_analysis_trace_id=1492768109053; vinfo_n_f_l_n3=09c375e3d4394d15.1.13.1491901408836.1492766182939.1492768266676; s_n_f_l_n3=09c375e3d4394d151492768109056' html = self.session.download(url, encoding='gbk', data=None, timeout=10, retry=3, addr=True, cookies=cookies) if html: article_url = html['url'] article_url = re.findall( r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0] self.logger.info(article_url) soup = BeautifulSoup(html['html'], 'html.parser') main = soup.find('div', attrs={'class': "post_content_main"}) main1 = soup.find('div', attrs={'class': "ep-content-main"}) #第一种网页格式 if main is not None: #获取标题 Ttitle = main.find('h1') if Ttitle is None: self.logger.error('[NeteaseNews]' + '缺少标题,无法构成文章,可能已被修改格式,本文停止爬取') return else: Ttitle = Ttitle.text.strip() #获取发布时间 Ttime = main.find('div', attrs={'class': "post_time_source"}) if Ttime is not None: Ttime = Ttime.text.strip() Ttime = re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', Ttime)[0] else: self.logger.error('[NeteaseNews]' + '缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取') return if len(Ttime) == 16: Ttime = Ttime + ':00' #获取发布作者 Tauthor = main.find('div', attrs={'class': "post_time_source"}) if Tauthor is not None: Tauthor = Tauthor.find('a') if Tauthor is not None: Tauthor = Tauthor.text.strip() else: Tauthor = None #获取发布内容 Tcontent = main.find('div', attrs={'class': "post_text"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n|\t', '', Tcontent) dr = re.compile(r'<[^>]+>', re.S) Tcontent = dr.sub('', Tcontent) else: self.logger.error('[NeteaseNews]' + '缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取') return # 获取评论数 articleid = "" try: articleid = re.findall(r'"docId" : "(.*)",', html['html'])[0] re_url = 'http://sdk.comment.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/' + articleid html1 = self.session.download(url=re_url, encoding='utf-8', data=None, isJson=True, timeout=10, retry=3) Treply = html1["tcount"] except: Treply = None self.logger.error('[NeteaseComment]url:' + article_url + ', tid:' + articleid + ', %s' + traceback.format_exc()) finally: article = Article(tid=articleid, channel_id=self.channel.channel_id, title=Ttitle, content=Tcontent, publish_datetime=Ttime, url=article_url, author_name=Tauthor) article.statistics.reply_count = Treply return article #第二种网页格式 elif main1 is not None: #标题 Ttitle = main1.find('h1') if Ttitle is None: self.logger.error('[NeteaseNews]' + '缺少标题,无法构成文章,可能已被修改格式,本文停止爬取') return else: Ttitle = Ttitle.text.strip() #发布的时间 Ttime = main1.find('div', attrs={'class': "ep-time-source cDGray"}) Ttime1 = main1.find('div', attrs={'class': "ep-info cDGray"}) if Ttime is not None: Ttime = Ttime.text.strip() Ttime = re.findall( r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')[0] elif Ttime1 is not None: Ttime = Ttime1.text.strip() Ttime = re.findall( r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')[0] else: 
self.logger.error('[NeteaseNews]' + '缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取') return if len(Ttime) == 16: Ttime = Ttime + ':00' #获取作者信息 Tauthor = main1.find('div', attrs={'class': "ep-time-soure cDGray"}) Tauthor1 = main1.find('div', attrs={'class': "ep-source cDGray"}) if Tauthor is not None: Tauthor = Tauthor.find('a') if Tauthor is not None: Tauthor = Tauthor.text.strip() else: Tauthor = None elif Tauthor1 is not None: Tauthor = Tauthor1.find('span') if Tauthor is not None: Tauthor = Tauthor.text.strip() print Tauthor Tauthor = re.findall(r'来源:(.*)"', Tauthor)[0] else: Tauthor = None else: Tauthor = None #获取内容 Tcontent = main1.find('div', attrs={'id': "endText"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n|\t', '', Tcontent) dr = re.compile(r'<[^>]+>', re.S) Tcontent = dr.sub('', Tcontent) else: self.logger.error('[SinaNews]' + '缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取') return #获取评论数 try: articleid = re.findall(r'"docId" : "(.*)",', html['html'])[0] re_url = 'http://sdk.comment.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/' + articleid html1 = json.load( self.session.download(re_url, encoding='utf-8', data=None, isJson=True, timeout=10, retry=3)) Treply = html1['tcount'] except: Treply = None self.logger.error('[NeteaseComment]url:' + article_url + ', tid:' + articleid + ', %s' + traceback.format_exc()) finally: article = Article(tid=articleid, channel_id=self.channel.channel_id, title=Ttitle, content=Tcontent, publish_datetime=Ttime, url=article_url, author_name=Tauthor) self.logger.debug("[crawlArticle]" + article.tid) article.statistics.reply_count = Treply return article
def crawlArticle(self, url):
    '''
    Crawl article content and statistics from the given url.
    :param url:
    :return: an Article instance
    '''
    article_id = re.findall(r'\d+', url)[0]
    html = self.session.get(url=url, headers=CRAWL_ARTICLE_HEADERS)
    soup = BeautifulSoup(html.text, 'lxml')
    if url.find('zhuanlan') != -1:
        main = soup.find('textarea', attrs={'id': "preloadedState"})
        try:
            if main is None:
                return None
            # replace the embedded `new Date("...")` expressions so the blob becomes valid JSON
            sub = re.findall(r'new Date\(\".*\"\)', main.text)[0]
            sub = re.findall(r'".*"', sub)[0]
            maintext = re.subn(r'new Date\(\".*\"\)', sub, main.text)[0]
            try:
                articleJson = json.loads(maintext)["database"]["Post"][article_id]
            except:
                return None
            # get the title
            title = articleJson["title"]
            self.logger.info(title)
            authorName = articleJson["author"]
            contentSoup = BeautifulSoup(articleJson["content"])
            content = contentSoup.text
            commentCount = articleJson["commentCount"]
            collapsedCount = articleJson["collapsedCount"]
            likeCount = articleJson["likeCount"]
            publishedTime = articleJson["publishedTime"][0:18]
            timeArray = time.strptime(publishedTime, "%Y-%m-%dT%H:%M:%S")
            publishedTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
            article = Article(article_id, self.channel.channel_id, title, content,
                              publishedTime, url, None, authorName)
            article.statistics.reply_count = commentCount
            article.statistics.like_count = likeCount
            article.statistics.collect_count = collapsedCount
            self.session.close()
            return article
        except:
            return None
    if url.find('question') != -1:
        try:  # guard against 404 pages
            main = soup.find('div', attrs={'id': "data"}).attrs['data-state']
            articleJson = json.loads(main)
            questionJson = articleJson['entities']['questions'][article_id]
            title = questionJson['title']
            self.logger.info(title)
            contentSoup = BeautifulSoup(questionJson['editableDetail'], 'lxml')
            content = contentSoup.text
            author_id = questionJson['author']['id']
            author_name = questionJson['author']['name']
            createTimeInFloat = questionJson['created']
            publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime(createTimeInFloat))
            reply_count = questionJson['commentCount']
            read_count = questionJson['visitCount']
            collect_count = questionJson['followerCount']
            article = Article(article_id, self.channel.channel_id, title, content,
                              publish_datetime, url, author_id, author_name)
            article.statistics.reply_count = reply_count
            article.statistics.read_count = read_count
            article.statistics.collect_count = collect_count
            return article
        except:
            return None
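# A standalone sketch (hypothetical helper) of the cleanup step used in the zhuanlan branch above:
# the preloadedState blob embeds JavaScript `new Date("...")` expressions, which are replaced by
# their quoted string argument so the whole blob can be fed to json.loads().
import re

def strip_new_date(blob):
    # e.g. '{"publishedTime": new Date("2018-01-02T03:04:05.000Z")}'
    #   -> '{"publishedTime": "2018-01-02T03:04:05.000Z"}'
    return re.sub(r'new Date\((\"[^\"]*\")\)', r'\1', blob)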
def crawlArticle(self, url): ''' 根据url爬取文章内容和统计信息 @return: 返回一个Article实例 ''' html = self.session.download(url, encoding='utf-8', data=None, timeout=10, retry=3, addr=True) if html: article_url = html['url'] if article_url.find(self.channel.url) < 0: self.logger.warn('Unrelated url found:%s', url) return None article_url = re.findall( r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0] self.logger.debug('[iFengnews]' + article_url) soup = BeautifulSoup(html['html'], 'lxml') # 'html.parser' 解析器 main = soup.find('div', attrs={'class': "main"}) main1 = soup.find('div', attrs={'class': "yc_main"}) if main is not None: self.logger.debug(u'走第一种格式') Ttitle = main.find('h1').text.strip() # 标题 if Ttitle is None: self.logger.error(u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', url) return else: Tpublishtime = main.find('span', attrs={ 'class': "ss01" }).text.strip() if Tpublishtime is None: self.logger.error( u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', url) return else: Tpublishtime = Tpublishtime.replace(u'年', '-').replace( u'月', '-').replace(u'日', '') Tauthor = main.find('a', attrs={'target': "_blank"}) if Tauthor is not None: Tauthor = Tauthor.text.strip() else: Tauthor = 'None' Tcontent = main.find('div', attrs={'id': "main_content"}) # print Tcontent # Tcontent = Tcontent.find('p') if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n|\t', '', Tcontent) else: self.logger.error( u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', url) return doc_url = re.findall(r'"commentUrl":"(.*)",', html['html']) if doc_url: doc_url = doc_url[0] else: doc_url = url data1 = { 'callback': 'newCommentListCallBack', # 'doc_url': 'http://gd.ifeng.com/a/20171010/6053241_0.shtml', 'doc_url': doc_url, 'job': '1', 'callback': 'newCommentListCallBack' } re_url = 'http://comment.ifeng.com/get.php' html1 = self.session.download(re_url, encoding='gbk', data=data1, timeout=10, retry=3, addr=False, isJson=True) # html1 = json.loads( html1[html1.find('=') + 1:html1.rfind(';c')] ) Treply = html1['count'] if len(html1['comments']) is not 0: articleid = html1['comments'][0]['article_id'] else: articleid = article_url articleid = articleid[articleid.find('a/') + 2: -6] # 由于数据库字段长度有限,所以截取部分作为ID self.logger.warn(u'地址 %s 没有评论因此以url地址部分字段作为tid', article_url) meta_info = doc_url article = Article(articleid, self.channel.channel_id, Ttitle, Tcontent, Tpublishtime, article_url, None, Tauthor, meta_info=meta_info) article.statistics.reply_count = Treply self.logger.info(article) return article ##对第二种格式的爬取 if main1 is not None: self.logger.debug(u'走第二种格式') Ttitle = main1.find('h1').text.strip() if Ttitle is None: self.logger.error(u'缺少标题,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', url) return else: Tpublishtime = main1.find('span').text.strip() if Tpublishtime is None: self.logger.error( u'缺少文章发布时间,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', url) return # return Tpublishtime Tauthor = main1.find('a', attrs={'target': "_blank"}) if Tauthor is not None: Tauthor = Tauthor.text.strip() else: Tauthor = 'None' Tcontent = main1.find('div', attrs={'class': "yc_con_txt"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n|\t', '', Tcontent) else: self.logger.warn( u'缺少文章内容,无法构成文章,可能已被修改格式,本文停止爬取::该网站为 %s ', url) return doc_url = re.findall(r'"commentUrl":"(.*)",', html['html']) if doc_url: doc_url = doc_url[0] else: doc_url = url data1 = { 'callback': 'newCommentListCallBack', # 'doc_url': 'http://gd.ifeng.com/a/20171010/6053241_0.shtml', 'doc_url': doc_url, 'job': '1', 'callback': 'newCommentListCallBack' } 
re_url = 'http://comment.ifeng.com/get.php' html1 = self.session.download(re_url, encoding='gbk', data=data1, timeout=10, retry=3, addr=False, isJson=True) # html1 = json.loads( html1[html1.find('=') + 1:html1.rfind(';c')] ) try: Treply = html1['count'] except: Treply = None if len(html1['comments']) is not 0: articleid = html1['comments'][0]['article_id'] else: articleid = url.strip() articleid = articleid[articleid.find('a/') + 2:-6] # 由于数据库字段长度有限,所以截取部分作为ID self.logger.warn(u'地址 %s 没有评论因此以url地址部分字段作为tid', article_url) meta_info = doc_url article = Article(articleid, self.channel.channel_id, Ttitle, Tcontent, Tpublishtime, article_url, None, Tauthor, meta_info=meta_info) article.statistics.reply_count = Treply self.logger.info(article) return article if (main is None) and (main1 is None): self.logger.warn(u"存在另外一种html格式::%s", url) return
def __parseSearchPage(self, data): ''' @return: (articleList,hasnext) ''' articleList = list() hasnext = False soup = BeautifulSoup(data, "lxml") self.logger.info(soup) # check if no result noResultDivList = soup.findAll('div', {'class': 'pl_noresult'}) if len(noResultDivList) > 0: hasnext = False self.logger.info('No result') return (articleList, hasnext) # find page bar to check if more pageDivList = soup.findAll('div', {'class': 'W_pages'}) if len(pageDivList) > 0: pageDiv = pageDivList[0] if len(pageDiv.findAll('a', {'class': 'page next S_txt1 S_line1'})) > 0: hasnext = True if hasnext is False: self.logger.info('The last page') root_1 = soup.findAll('div', {"action-type": "feed_list_item"}) # self.logger.debug(root_1) for r in root_1: root_2 = r.find('div', {'class': "content clearfix"}) mid = r.attrs['mid'] article_url = root_2.find('div', { 'class': "feed_from W_textb" }).findNext('a').attrs['href'] self.logger.debug('1 %s', article_url) if not article_url.startswith('http:'): article_url = 'http:' + article_url # self.logger.debug(article_url) root_content = root_2.find('p', {'class': "comment_txt"}) long_content = root_content.find('a', {'action-type': "fl_unfold"}) try: link_content = root_content.find('a').attrs['href'] link_content = ' 原文链接: ' + link_content except: link_content = '' if long_content: content_url = 'http://s.weibo.com/ajax/direct/morethan140?' + long_content.attrs[ 'action-data'] self.session.randomSleep() response = self.session.get(content_url, textRspOnly=False) try: content_html = response.json()['data']['html'] content = BeautifulSoup( content_html, 'html.parser').text.strip().replace( "'", "''").replace("%", "\%").replace(":", "\:") except Exception, e: self.logger.debug('Exception: %s' % e) continue else: content = root_content.text.strip().replace("'", "''").replace( "%", "\%").replace(":", "\:") content = content + link_content # self.logger.error(content) title = content[:30].replace("'", "''").replace("%", "\%").replace( ":", "\:") + ' ' author_id = r.attrs['tbinfo'] author_id = re.findall(r'ouid=(\d+)', author_id)[0] author_name = root_2.find('a').attrs['nick-name'] publish_datetime = root_2.find('a', { 'class': "W_textb" }).attrs['date'] try: publish_datetime = time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime(float(publish_datetime) / 1000)) except: continue article = Article(mid, self.channel.channel_id, title, content, publish_datetime, url=article_url, author_id=author_id, author_name=author_name) # fetch statistics root_3 = r.find('div', {'class': "feed_action clearfix"}) soup_li_list = root_3.findAll('li') self.__parseStatistics(article, soup_li_list) # print mid, article_url, add_datetime, channeltype, channel, title, content, author_id, author_name, \ # publish_datetime, reply_count, read_count, like_count, collect_count, forward_count if article not in articleList: articleList.append(article)
def crawlNewArticle(self, url):
    '''
    Crawl articles whose url prefix is new.qq.com.
    :param url:
    :return:
    '''
    html = self.session.download(url, encoding='gbk', data=None, timeout=10, retry=3, addr=True)
    if html:
        article_url = html['url']
        if article_url.find('new.qq.com/omn') < 0:
            self.logger.warn('Unrelated url found:%s', url)
            return
        article_url = re.findall(r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
        self.logger.debug('[TencentNew]' + article_url)
        soup = BeautifulSoup(html['html'], 'html.parser')
        # article metadata is embedded as a `window.DATA = {...}` assignment in a <script> tag
        script_tags = soup.head.find_all('script')
        data = dict()
        for tag in script_tags:
            text = re.search(r'window.DATA = (.*)', tag.text, re.S)
            if text:
                data = json.loads(text.group(1))
        tid = data['article_id']
        title = data['title']
        author_name = data['media']
        author_id = data['media_id']
        publish_datetime = data['pubtime']
        comment_id = data['comment_id']
        main = soup.find('div', attrs={'class': 'qq_conent clearfix'})
        t_content = ''
        if main is not None:
            contents = main.find_all('p', {'class': 'one-p'})
            for content in contents:
                if content.string is None:
                    continue
                t_content += str(content.get_text().strip())
        # fetch the comment count via the coral.qq.com JSONP endpoint
        get_comment_count_url = 'https://coral.qq.com/article/%s/commentnum?callback=_article%scommentnum' % (
            comment_id, comment_id)
        comment_data = self.session.download(get_comment_count_url)
        comment_data = re.search(r'_article%scommentnum\((.*)\)' % comment_id, comment_data)
        comment_dict = eval(comment_data.group(1))
        reply_count = comment_dict['data']['commentnum']
        meta_info = '{"commentid":"%s"}' % comment_id
        article = Article(tid=tid, channel_id=self.channel.channel_id, title=title,
                          content=t_content, publish_datetime=publish_datetime,
                          url=article_url, author_id=author_id, author_name=author_name,
                          meta_info=meta_info)
        article.statistics.reply_count = reply_count
        return article
    return None
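# A standalone sketch (hypothetical helper) of the metadata extraction above: pull the
# `window.DATA = {...}` assignment out of a script tag's text with a regex and parse it as JSON.
import json
import re

def extract_window_data(script_text):
    match = re.search(r'window.DATA = (.*)', script_text, re.S)
    if not match:
        return {}
    return json.loads(match.group(1))

# e.g. extract_window_data('window.DATA = {"article_id": "20180601A0ABCD", "title": "..."}')
# -> {'article_id': '20180601A0ABCD', 'title': '...'}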
def crawlNewsArticle(self, url): ''' 爬取url前缀为news.qq.com和gd.qq.com的文章 :param url: :return: ''' html = self.session.download(url, encoding='gbk', data=None, timeout=10, retry=3, addr=True) if html: article_url = html['url'] if article_url.find('news.qq.com') < 0 and article_url.find( 'gd.qq.com') < 0: self.logger.warn('Unrelated url found:%s', url) return None article_url = re.findall( r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0] self.logger.debug('[TencentNews]' + article_url) soup = BeautifulSoup(html['html'], 'html.parser') main = soup.find('div', attrs={'id': "Main-Article-QQ"}) main1 = soup.find('div', attrs={'id': "Main-P-QQ"}) if main is not None: Ttitle = main.find('h1').text.strip() #标题 Ttime = main.find('span', attrs={'class': "article-time"}) #发布时间 Ttime1 = main.find('span', attrs={'class': "a_time"}) Ttime2 = main.find('span', attrs={'class': "pubTime"}) if Ttime is not None: Ttime = Ttime.text.strip() elif Ttime1 is not None: Ttime1 = Ttime1.text.strip() Ttime = Ttime1 elif Ttime2 is not None: Ttime2 = Ttime2.text.strip() Ttime = Ttime2 else: Ttime = datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S') if len(Ttime) == 16: Ttime = Ttime + ':00' Tauthor = main.find('span', attrs={'class': "a_source"}) Tauthor1 = main.find('span', attrs={'class': "color-a-1"}) if Tauthor is not None: #Tauthor = Tauthor.find('a').text.strip() Tauthor = Tauthor.text.strip() elif Tauthor1 is not None: #Tauthor1 = Tauthor1.find('a').text.strip() Tauthor1 = Tauthor1.text.strip() Tauthor = Tauthor1 else: Tauthor = None Tcontent = main.find('div', attrs={'id': "Cnt-Main-Article-QQ"}) if Tcontent is not None: Tcontent = Tcontent.text.strip() Tcontent = re.sub(r'\n|\t', '', Tcontent) else: Tcontent = None articleid = re.findall(r'id:\'(\d+)\',', html['html'])[0] try: commentid = re.findall(r'cmt_id = (\d+);', html['html'])[0] meta_info = '{"commentid":"%s"}' % commentid except: commentid = None meta_info = None article = Article(articleid, self.channel.channel_id, Ttitle, Tcontent, Ttime, article_url, None, Tauthor, meta_info=meta_info) if commentid is not None: try: re_url = 'http://coral.qq.com/article/' + commentid + '/commentnum' html1 = json.loads( self.session.download(re_url, encoding='utf-8', data=None, timeout=10, retry=3)) Treply = int(html1['data']['commentnum']) except Exception: traceInfo = traceback.format_exc() self.logger.error( 'Faile to parse comment for %s (cid=%s):%s', articleid, commentid, traceInfo) Treply = None article.statistics.reply_count = Treply return article elif main1 is not None: Ttitle = soup.find('meta', attrs={ 'name': "Description" }).attrs['content'] # 标题 Ttime = re.findall( r"pubtime\D+(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2})\',", html['html']) if Ttime is not None: Ttime = Ttime[0] Ttime = Ttime[0] + '-' + Ttime[1] + '-' + Ttime[ 2] + ' ' + Ttime[3] else: Ttime = datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S') if len(Ttime) == 16: Ttime = Ttime + ':00' Tauthor = re.findall(r'para = {\s+name: \"(.*)\",', html['html']) if Tauthor is not None: Tauthor = Tauthor[0] else: Tauthor = None con_url = re.sub(r'\.htm\?.*', '.hdBigPic.js', article_url) con_html = self.session.download(con_url, encoding='gbk', data=None, timeout=10, retry=3) con_list = re.findall(r'<p>(.*?)</p>', con_html) if con_list is not None: TT = [] for i in con_list: if i.strip() not in TT: TT.append(i) Tcontent = ''.join(TT) else: Tcontent = None articleid = re.findall(r'id:\'(\d+)\',', html['html'])[0] try: commentid = re.findall(r'aid\D+(\d+)\",', html['html'])[0] meta_info = 
'{"commentid":"%s"}' % commentid except: commentid = None meta_info = None article = Article(articleid, self.channel.channel_id, Ttitle, Tcontent, Ttime, article_url, None, Tauthor, meta_info=meta_info) try: if commentid is not None: re_url = 'http://coral.qq.com/article/batchcommentnum' data1 = {'targetid': articleid} html1 = json.loads( self.session.download(re_url, encoding='utf-8', data=data1, timeout=10, retry=3)) Treply = int(html1['data'][0]['commentnum']) else: Treply = None except: Treply = None article.statistics.reply_count = Treply return article return None
def crawlArticle(self, url):
    '''
    Crawl the article content and statistics for the given url
    @return: an Article instance, or None on failure
    '''
    cookies = '_HY_lvt_c5928ed57343443aa175434a09ea2804=1492582419784; _HY_CTK_c5928ed57343443aa175434a09ea2804=0acc9a79b115c4ca1931c41e303bec28; BAIDU_SSP_lcr=https://www.baidu.com/link?url=NszYD2w_HgkPWqrzDQ3WKApYldw_9MpUVun9r-R09M7r0dh09MUwTHzG087WaJrhBwMCY-7pDfds4xjtWArRf2xh01DHOWWWd9DpBnHwZ03&wd=&eqid=84d5edfe0003dbdd0000000658f6c280; ALLYESID4=0DB901C6E627D980; sso_c=0; wdcid=5838509dcecc0a53; _people_ip_new_code=510000; UM_distinctid=15b802cdbd364d-02e7f218c26ae6-4e45042e-100200-15b802cdbd49ce; wdses=62d3f0f698d07532; sfr=1; CNZZDATA1260954200=1457761991-1492499618-null%7C1492580619; CNZZDATA1260954203=33096124-1492503288-null%7C1492578888; CNZZDATA1256327855=1768205365-1492503342-null%7C1492578947; wdlast=1492582420'
    html = self.session.download(url, data=None, isJson=False, timeout=10, retry=3, addr=True, cookies=cookies)
    soup = BeautifulSoup(html['html'], 'html.parser')  # built-in 'html.parser' parser
    try:
        meta = soup.find('meta').attrs['content']
    except:
        self.logger.warn(u'content attribute not found in the meta tag')
        return
    # pages on this site use different encodings, so detect it and download again
    if "GB2312" in meta:
        encoding1 = 'GB2312'
    elif "UTF-8" in meta:
        encoding1 = 'UTF-8'
    elif "utf-8" in meta:
        encoding1 = 'utf-8'
    else:
        encoding1 = 'gbk'
    html = self.session.download(url, encoding=encoding1, data=None, isJson=False, timeout=10, retry=3, addr=True, cookies=cookies)
    soup = BeautifulSoup(html['html'], 'html.parser')
    main = soup.find('body')
    if html:
        article_url = html['url']
        if article_url.find(self.channel.url) < 0:
            self.logger.warn('Unrelated url found:%s', url)
            return None
        try:
            article_url = re.findall(r'.*?\.html|.*?\.htm|.*?\.shtml|.*?\.shtm', article_url)[0]
        except:
            self.logger.error(u'Unexpected url suffix: %s', article_url)
            return
        self.logger.debug('[peoplenews]' + article_url)
        # the database field length is limited, so a slice of the url serves as the id
        articleid = article_url[article_url.find('cn/') + 3:-5]
        self.logger.warn(u'Using part of the url %s as the tid', article_url)
        if 'bbs1' not in article_url:
            # e.g. http://health.people.com.cn/n1/2017/1011/c14739-29579836.html
            main1 = soup.find('div', attrs={'class': "i_cont"})
            # e.g. http://rencai.people.com.cn/n/2014/0721/c244800-25311391.html
            main2 = soup.find('div', attrs={'class': "text_c"})
            if (main1 is None) and (main2 is None):
                self.logger.debug(u'Using the main layout')
                try:
                    Ttitle = main.find('h1').text.strip()  # title
                except:
                    self.logger.error(u'Url went through the main branch but does not match its title format: %s', article_url)
                    return
                if Ttitle is None:
                    self.logger.error(u'Missing title, cannot build the article; the page format may have changed, stop crawling: %s', article_url)
                    return
                Ttitle_crawl = soup.find('div', attrs={'class': "box01"})  # one of the known layouts
                if Ttitle_crawl is None:
                    self.logger.error(u'box01 block not found although the url went through the main branch: %s', article_url)
                    return
                try:
                    Tpublishtime = Ttitle_crawl.find('div', attrs={'class': "fl"}).text.strip()
                except:
                    self.logger.error(u'Publish time did not match in the main branch')
                    return
                if Tpublishtime is None:
                    self.logger.error(u'Missing publish time, cannot build the article; the page format may have changed, stop crawling: %s', article_url)
                    return
                Tpublishtime = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)', Tpublishtime)[0]
                if len(Tpublishtime[4]) > 1:
                    # the matched time already includes the seconds
                    Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + Tpublishtime[3]
                else:
                    Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + Tpublishtime[3] + ':00'
                Tauthor = Ttitle_crawl.find('a', attrs={'target': "_blank"})
                if Tauthor is not None:
                    Tauthor = Tauthor.text.strip()
                else:
                    Tauthor = 'None'
                Tcontent = soup.find('div', attrs={'class': "box_con"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    self.logger.error(u'Missing article content, cannot build the article; the page format may have changed, stop crawling: %s', article_url)
                    return
                Treply = None  # no statistics are available in this layout
                meta_info = None
                article = Article(articleid, self.channel.channel_id, Ttitle, Tcontent, Tpublishtime, article_url, None, Tauthor, meta_info=meta_info)
                article.statistics.reply_count = Treply
                return article
            elif main1 is not None:
                self.logger.debug(u'Using the main1 layout')
                Ttitle = main1.find('h2')  # title
                if Ttitle is None:
                    self.logger.error(u'Missing title, cannot build the article; the page format may have changed, stop crawling: %s', article_url)
                    return
                Ttitle = Ttitle.text.strip()
                try:
                    Tpublishtime = main1.find('div', attrs={'class': "artOri"}).text.strip()
                except:
                    self.logger.error(u'Publish time did not match in the main1 branch')
                    return
                if Tpublishtime is None:
                    self.logger.error(u'Missing publish time, cannot build the article; the page format may have changed, stop crawling: %s', article_url)
                    return
                Tpublishtime = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)', Tpublishtime)[0]
                if len(Tpublishtime[4]) > 1:
                    Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + Tpublishtime[3]
                else:
                    Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + Tpublishtime[3] + ':00'
                Tauthor = main1.find('div', attrs={'class': "artOri"}).find('a', attrs={'target': "_blank"})
                if Tauthor is not None:
                    Tauthor = Tauthor.text.strip()
                else:
                    Tauthor = 'None'
                Tcontent = main1.find('div', attrs={'class': "artDet"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    self.logger.error(u'Missing article content, cannot build the article; the page format may have changed, stop crawling: %s', article_url)
                    return
                Treply = None  # no statistics are available in this layout
                meta_info = None
                article = Article(articleid, self.channel.channel_id, Ttitle, Tcontent, Tpublishtime, article_url, None, Tauthor, meta_info=meta_info)
                article.statistics.reply_count = Treply
                self.logger.info(article)
                return article
            elif main2 is not None:
                self.logger.debug(u'Using the main2 layout')
                Ttitle = main2.find('h2', attrs={'class': "one"})  # title
                if Ttitle is None:
                    self.logger.error(u'Missing title, cannot build the article; the page format may have changed, stop crawling: %s', article_url)
                    return
                Ttitle = Ttitle.text.strip()
                try:
                    Tpublishtime = main2.find('span', attrs={'id': "p_publishtime"}).text.strip()
                except:
                    self.logger.error(u'Publish time did not match in the main2 branch')
                    return
                if Tpublishtime is None:
                    self.logger.error(u'Missing publish time, cannot build the article; the page format may have changed, stop crawling: %s', article_url)
                    return
                Tpublishtime = re.findall(r'(\d{4})\D(\d{2})\D(\d{2})\D(\d{2}:\d{2}(:\d{2})?)', Tpublishtime)[0]
                if len(Tpublishtime[4]) > 1:
                    Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + Tpublishtime[3]
                else:
                    Tpublishtime = Tpublishtime[0] + '-' + Tpublishtime[1] + '-' + Tpublishtime[2] + ' ' + Tpublishtime[3] + ':00'
                Tauthor = main2.find('span', attrs={'id': "p_origin"}).find('a', attrs={'target': "_blank"})
                if Tauthor is not None:
                    Tauthor = Tauthor.text.strip()
                else:
                    Tauthor = 'None'
                Tcontent = main2.find('div', attrs={'class': "show_text"})
                if Tcontent is not None:
                    Tcontent = Tcontent.text.strip()
                    Tcontent = re.sub(r'\n|\t', '', Tcontent)
                else:
                    self.logger.error(u'Missing article content, cannot build the article; the page format may have changed, stop crawling: %s', article_url)
                    return
                Treply = None  # no statistics are available in this layout
                meta_info = None
                article = Article(articleid, self.channel.channel_id, Ttitle, Tcontent, Tpublishtime, article_url, None, Tauthor, meta_info=meta_info)
                article.statistics.reply_count = Treply
                self.logger.info(article)
                return article
            else:
                self.logger.warn(u'Another html layout exists: %s', article_url)
def __updateOldArticleToArticleHistoryTable(self, articleList, currentTableName, historyTableName, isEventTable=False):
    '''
    Save the statistics of already-known articles into the article history table
    @param currentTableName: current article table: the global, entity or entity-event article table
    @param historyTableName: history article table: the global, entity or entity-event article table
    @param isEventTable: True if the tables are entity-event article tables, which carry an EVENT_ID column
    '''
    if len(articleList) > 0:
        if isEventTable is False:
            eventIdFieldName = ''
        else:
            eventIdFieldName = ',EVENT_ID'
        # look up the articles that already exist in the current table
        selectSql = '''
        SELECT TID, CHANNEL_ID %s FROM %s where %s
        '''
        whereClauseList = map(
            lambda article: '(TID="%s" and CHANNEL_ID=%d)' % (article.tid, article.channel_id),
            articleList)
        self.dbProxy.execute(selectSql % (eventIdFieldName, currentTableName, ' or '.join(whereClauseList)))
        resultList = self.dbProxy.fetchall()
        if isEventTable:
            existingArticleList = map(lambda item: Article(item[0], item[1], eventId=item[2]), resultList)
        else:
            existingArticleList = map(lambda item: Article(item[0], item[1]), resultList)
        toBeUpdateArticleList = list()
        for item in existingArticleList:
            index = articleList.index(item)
            obj = copy.copy(articleList[index])
            obj.eventId = item.eventId
            toBeUpdateArticleList.append(obj)
        if len(toBeUpdateArticleList) > 0:
            n = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            if isEventTable is False:
                eventIdFieldName = ''
            else:
                eventIdFieldName = 'EVENT_ID,'
            insertSql = '''
            INSERT INTO %s (TID, %s CHANNEL_ID, READ_COUNT, LIKE_COUNT, REPLY_COUNT, FORWARD_COUNT, COLLECT_COUNT, HEAT, ADD_DATETIME)
            VALUES %s
            '''
            valueList = list()
            for article in toBeUpdateArticleList:
                statistics = article.statistics
                if isEventTable is False:
                    eventIdFieldValue = ''
                else:
                    eventIdFieldValue = str(article.eventId) + ','
                valueList.append('("%s", %s %d, %s, %s, %s, %s, %s, %s, "%s")' % (
                    article.tid,
                    eventIdFieldValue,
                    article.channel_id,
                    statistics.read_count if statistics.read_count is not None else Constants.DEFAULT_NUM,
                    statistics.like_count if statistics.like_count is not None else Constants.DEFAULT_NUM,
                    statistics.reply_count if statistics.reply_count is not None else Constants.DEFAULT_NUM,
                    statistics.forward_count if statistics.forward_count is not None else Constants.DEFAULT_NUM,
                    statistics.collect_count if statistics.collect_count is not None else Constants.DEFAULT_NUM,
                    statistics.heat if statistics.heat is not None else Constants.DEFAULT_NUM,
                    n))
            if len(valueList) > 0:
                self.dbProxy.execute(insertSql % (historyTableName, eventIdFieldName, ','.join(valueList)))
                self.dbProxy.commit()
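# --- Editor's sketch (assumption-labelled, not part of the original) ---------
# The history insert above interpolates values directly into the SQL string.
# If a DB-API compatible cursor were available (the original only exposes
# self.dbProxy, whose interface is not shown here), the same rows could be
# bound as parameters instead. `build_history_rows` and `default_num` are
# invented names; `default_num` stands in for Constants.DEFAULT_NUM, and the
# EVENT_ID column handled above is left out to keep the sketch short.
import datetime

def build_history_rows(articles, default_num=0):
    """Turn Article objects into parameter tuples for a parameterized INSERT."""
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    rows = []
    for article in articles:
        s = article.statistics
        rows.append((
            article.tid,
            article.channel_id,
            s.read_count if s.read_count is not None else default_num,
            s.like_count if s.like_count is not None else default_num,
            s.reply_count if s.reply_count is not None else default_num,
            s.forward_count if s.forward_count is not None else default_num,
            s.collect_count if s.collect_count is not None else default_num,
            s.heat if s.heat is not None else default_num,
            now,
        ))
    return rows

# Hypothetical usage with a DB-API cursor (e.g. pymysql); the table name still
# has to be formatted in, since placeholders cannot stand in for identifiers:
#   sql = ('INSERT INTO ' + historyTableName +
#          ' (TID, CHANNEL_ID, READ_COUNT, LIKE_COUNT, REPLY_COUNT,'
#          ' FORWARD_COUNT, COLLECT_COUNT, HEAT, ADD_DATETIME)'
#          ' VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)')
#   cursor.executemany(sql, build_history_rows(toBeUpdateArticleList))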
def __parse_page_url(self, page_url, count_page=0):
    '''
    Parse the weibo posts on one search-result page
    @return: article_list
    '''
    user_agent = random.choice(AGENTS)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Host': 's.weibo.com',
        'Referer': 'http://s.weibo.com/',
        'User-Agent': user_agent
    }
    self.change_cookie()
    cookies = self.__load_cookies_from_lwp(self.user_name)
    response = self.session.get(url=page_url, headers=headers, cookies=cookies)
    # if this cookie is no longer valid, log in again and fetch a fresh one
    if re.match(r'File not .*', response) is not None:
        self.__login(self.user_name)
        cookies = self.__load_cookies_from_lwp(self.user_name)
        response = self.session.get(url=page_url, headers=headers, cookies=cookies)
    html = etree.HTML(response)
    # feed cards on the page
    card_wrap_list = html.xpath('//div[@class="m-wrap"]/div[@id="pl_feedlist_index"]//div[@class="card-wrap"]')
    article_list = list()
    for card_wrap in card_wrap_list:
        try:
            mid = card_wrap.xpath('./@mid')[0]
        except Exception:
            continue  # skip cards without a mid attribute
        info = card_wrap.xpath('.//div[@class="content"]')[0]
        author_name = info.xpath('.//a[@class="name"]/text()')[0]
        author_id = re.search(r'/(\d+).?refer_flag', info.xpath('.//a[@class="name"]/@href')[0]).group(1)
        txt = info.xpath('.//p[@class="txt"]')[0]
        content = txt.xpath('string(.)').strip()
        title = content[:30].replace("'", "''").replace("%", "\\%").replace(":", "\\:") + ' '
        article_info = info.xpath('.//p[@class="from"]/a')[0]
        article_url = self.HTTP + article_info.xpath('./@href')[0]
        statistics = card_wrap.xpath('.//div[@class="card-act"]//li/a/text()')
        collect_count = re.search(r'(\d+)', statistics[0])
        forward_count = re.search(r'(\d+)', statistics[1])
        reply_count = re.search(r'(\d+)', statistics[2])
        like_count = card_wrap.xpath('.//div[@class="card-act"]//li/a/em/text()')
        if collect_count:
            collect_count = collect_count.group(1).strip()
        else:
            collect_count = 0
        if forward_count:
            forward_count = forward_count.group(1).strip()
        else:
            forward_count = 0
        if reply_count:
            reply_count = reply_count.group(1).strip()
        else:
            reply_count = 0
        if len(like_count):
            like_count = like_count[0]
        else:
            like_count = 0
        publish_datetime = self.__parse_time(article_url, cookies)
        article = Article(mid, self.channel.channel_id, title, content,
                          url=article_url, publish_datetime=publish_datetime,
                          author_id=author_id, author_name=author_name)
        article.statistics.collect_count = collect_count
        article.statistics.forward_count = forward_count
        article.statistics.reply_count = reply_count
        article.statistics.like_count = like_count
        if article not in article_list:
            article_list.append(article)
    self.logger.debug('page %d crawled %d articles' % (count_page, len(article_list)))
    return article_list
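# --- Editor's sketch (hypothetical helper, not part of the original) ---------
# The count parsing above repeats the same "first run of digits, else 0"
# pattern for the collect/forward/reply counts. A tiny helper keeps that in one
# place; the regex is taken from the code above, while `parse_count` and the
# sample action-bar texts below (e.g. u'转发 35') are assumptions for
# illustration. Unlike the original, the sketch returns ints rather than
# digit strings.
import re

def parse_count(text, default=0):
    """Return the first integer found in text, or `default` when none exists."""
    match = re.search(r'(\d+)', text or '')
    return int(match.group(1)) if match else default

# Assumed examples:
#   parse_count(u'转发 35') -> 35
#   parse_count(u'收藏')    -> 0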