class WechatCrawler(object): ''' classdocs ''' def __init__(self, channel, logger=None): ''' Constructor ''' if logger is None: self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT) else: self.logger = logger self.session = SessionCrawler(logger=self.logger) self.session_public = SessionCrawler(logger=self.logger) self.session_public_article = SessionCrawler(logger=self.logger) self.channel = channel self.entityId = 'SYSU' self.ip_list = None self.proxies = None self.monitor_title = '微信爬虫监控' self.email = SendEmail() self.db = InsertDB() def searchArticle(self, keywordList, endTime): ''' 根据关键字数组,爬取一天内的文章 @param keywordList: 关键字数组 @:param endTime: 搜索结束时间 ''' run_msg = '微信爬虫开始运行' self.db.Insert(self.channel.channel_id, self.entityId, run_msg) startTime = endTime - datetime.timedelta( days=self.channel.search_ranges) startTimeStr = startTime.strftime('%Y-%m-%d') endTimeStr = endTime.strftime('%Y-%m-%d') # startTime = endTime - datetime.timedelta(days=1) # startTimeStr = startTime.strftime('%Y-%m-%d') # endTimeStr=startTime.strftime('%Y-%m-%d') self.logger.debug('startTime:%s', startTimeStr) self.logger.debug('endTime:%s', endTimeStr) # 随机选取一个代理 # proxy_crawler = ProxyCrawler() # proxies = proxy_crawler.get_random_proxy() # publicList = self.getPublic() # articleList = self.searchPublic(publicList) articleList = list() urlList = list() for keyword in keywordList: # 忽略第一次,第一次不带时间范围 pageUrl = (SOUGO_WEIXIN_URL_INIT % (urllib.quote( keyword.encode('utf-8')), int(time.time() * 1000))).replace( '#', '%') self.logger.debug('pageUrl:%s', pageUrl) self.session.randomSleep() lastPageUrl = pageUrl # 爬取微信一天时间的内容 pageUrl = SOUGO_WEIXIN_URL_SUB_PAGE % (urllib.quote( keyword.encode('utf-8')), startTimeStr, endTimeStr) self.logger.debug('pageUrl:%s', pageUrl) # 得到ip队列 self.ip_list = getIp() ip = self.ip_list.dequeue() self.proxies = {"http": "http://" + ip} while True: # proxies = {"http": "http://" + ip} headers = HEADERS_SOGOU.copy() headers['Referer'] = lastPageUrl try: response = self.session.get(pageUrl, allow_redirects=False, headers=headers, proxies=self.proxies) soup = BeautifulSoup(response, 'lxml') main = soup.find('ul', {'class': "news-list"}) while True: if main is None: # self.logger.error('Fail to parse: ip被封,更新ip') content = 'ip被封,更新ip' # self.email.send(self.monitor_title, content) self.db.Insert(self.channel.channel_id, self.entityId, content) temp = self.ip_list.dequeue() if self.ip_list.isempty(): self.ip_list = getIp() self.proxies = {"http": "http://" + temp} # while True: # try: response = self.session.get(pageUrl, allow_redirects=False, headers=headers, proxies=self.proxies) soup = BeautifulSoup(response, 'lxml') main = soup.find('ul', {'class': "news-list"}) # break # except: # ip_unuseful_content = '此ip是不合格的ip,更新ip' # # self.email.send(self.monitor_title, ip_unuseful_content) # self.db.Insert(self.channel.channel_id,self.entityId,ip_unuseful_content) # tmp = self.ip_list.dequeue() # if self.ip_list.isempty(): # self.ip_list = getIp() # self.proxies = {"http": "http://" + tmp} else: break li_list = main.findAll('li') # li_list有可能为空,但还可以翻页 for li in li_list: a_list = li.findAll('a') try: publish_datetime = li.select_one('.s-p').get('t') publish_datetime = time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime(int(publish_datetime))) except Exception as e: self.logger.debug( 'Publish_datetime crawl failed, use now time') publish_datetime = datetime.datetime.now( ).strftime('%Y-%m-%d %H:%M:%S') print publish_datetime for a in a_list: if a['uigs'].startswith('article_title'): # 
self.logger.debug('Article title:%s',a.text) urlList.append((a['href'], pageUrl, a.text, publish_datetime)) break pageBarList = soup.findAll('div', {'id': 'pagebar_container'}) if len(pageBarList) == 0: # 没有翻页,直接退出 break pageBar = pageBarList[0] aList = pageBar.findAll('a') foundNextPage = False for a in aList: if a['uigs'] == 'page_next': foundNextPage = True lastPageUrl = pageUrl pageUrl = SOGOU_URL + a['href'] self.logger.debug('Found next page:%s', a.text) break if foundNextPage is False: break except: ip_unuseful_content = '此ip是不合格的ip,更新ip' # self.email.send(self.monitor_title,ip_unuseful_content) self.db.Insert(self.channel.channel_id, self.entityId, ip_unuseful_content) tmp = self.ip_list.dequeue() if self.ip_list.isempty(): self.ip_list = getIp() self.proxies = {"http": "http://" + tmp} for item in urlList: article = self.crawlArticle(item[0], referer=item[1], title=item[2], publish_datetime=item[3], flag=0) if article is not None: if article not in articleList: # 同一文章可能会在搜索结果出现多次,在baidu的结果url是不重复,但是实际可能指向同一文章,需要去重 articleList.append(article) if articleList is None: monitor_content = '微信没有数据,或者微信爬虫挂了' self.email.send(self.monitor_title, monitor_content) self.db.Insert(self.channel.channel_id, self.entityId, monitor_content) end_msg = '微信爬虫结束' self.db.Insert(self.channel.channel_id, self.entityId, end_msg) return articleList def searchPublic(self, keywordList): ''' 根据关键字数组,开始时间和结束时间范围搜索公众号 :param keywordList: :param endTime: :return: ''' articleList = list() for keyword in keywordList: self.logger.debug(keyword) pageUrl = SOUGO_WECHARTPUBLIC_URL_INIT % (keyword[0]) self.logger.info('pageUrl:%s', pageUrl) self.session.randomSleep() response = self.session.get(pageUrl, textRspOnly=False, headers=HEADERS_SOGOU) soup = BeautifulSoup(response.text, 'lxml') main = soup.find('ul', {'class': "news-list2"}) if main is None: self.logger.error('Fail to parse:%s', response.text) try: li_list = main.findAll('li') except Exception, e: print e continue for li in li_list: a_title = li.find('p', {'class': "tit"}) if a_title is not None: title = str(a_title.text.strip()) if title == keyword[0]: self.logger.debug(title) a_href = a_title.find('a')['href'] sub_articleList = self.crawlWetchartpublic(a_href) for article in sub_articleList: articleList.append(article) return articleList
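# The searchArticle flow above relies on an external getIp() helper that returns a
# queue-like object exposing dequeue()/isempty(), and builds {"http": "http://" + ip}
# from each entry. A minimal sketch of such a helper is shown below, assuming proxies
# come from a plain text list; getIp_sketch, ProxyQueue and the file name are
# illustrative assumptions, not the project's actual implementation.
import collections

class ProxyQueue(object):
    def __init__(self, ip_list):
        self._queue = collections.deque(ip_list)

    def dequeue(self):
        # Pop the next "host:port" string, mirroring how searchArticle consumes it.
        return self._queue.popleft()

    def isempty(self):
        return len(self._queue) == 0

def getIp_sketch(path='proxies.txt'):
    # Hypothetical loader: one "host:port" per line.
    with open(path) as f:
        return ProxyQueue([line.strip() for line in f if line.strip()])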
class WeiboCrawler(object): ''' classdocs ''' def __init__(self, channel, logger=None): ''' Constructor ''' if logger is None: self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT) else: self.logger = logger self.channel = channel self.entityId = 'SYSU' # self.user_name = "*****@*****.**" # self.session = SessionCrawler(sleepRange=[3,8]) # self.pass_word = "810214bee810214" # self.user_name = "15088137907" # self.pass_word = "4p2yhynrb7" self.user_name_password = self.get_username_password() self.user_name = self.user_name_password[0] self.pass_word = self.user_name_password[1] self.logger.info('username: %s' % self.user_name) self.email = SendEmail() self.db = InsertDB() self.monitor_title = '微博爬虫监控' self.proxies = '' self.session = SessionCrawler(sleepRange=[3, 8]) # user_name_password3 = '15767199023:j980216' # user_name_password1 = '13427287354:4ova7zixzj' # user_name_password2 = '13532011721:1emr41761u' # user_name_password3 = '13640792755:1eek9uuym4' # user_name_password4 = '13697726577:7hviv4old0'#### # user_name_password5 = '13794342903:6imuw2cdya' # 197的微博账号 # user_name_password1 = '17825769929:4ms7e2v3zx' # user_name_password2 = '18211493432:7fagvqyi9p' # user_name_password3 = '17827278983:0nenzag325' # user_name_password4 = '13922771190:5aqa10wvwf' # user_name_password5 = '15999916968:2i45j5b49y' # 15119820746 - ---0htkvsq5h6 # 15986585396 - ---5gsmhx3e8k # 13430915912 - ---8s1nif2d50 # 15012471375 - ---3qwlffw8vv # 17880567972 - ---6jrlzr2fqe # 17876156948 - ---5g5w4i43f3 # 15915132451 - ---2rl2v9hy9t # 13543985544 - ---8x0pqi3as7 # 13717382951 - ---5p2d39l19r # 13640695490 - ---6nxc4vou4o def change_cookie(self): ''' 随机获取一个cookie :return: ''' # usename_list = [ # '18814095644','13432881156','*****@*****.**','15018377821','*****@*****.**', # '15767199023','13427287354','13532011721','13640792755','13794342903', # '17825769929','18211493432','17827278983','13922771190','15999916968', # '15119820746','15986585396','13430915912','15012471375','17880567972', # '17876156948','15915132451','13543985544','13717382951','13640695490', # '15711707673','13680181412','13414759320','17820956139','18476072534', # '17806699214','13418852766','17827181603','15919354070','15088137907' # ] usename_list = [ '18814095644', '13432881156', '*****@*****.**', '15018377821', '*****@*****.**', ] usename = random.choice(usename_list) return usename def get_username_password(self): ''' 随机获取用户和密码 :return: ''' user_name_password1 = '18814095644:ljda.18814095644' user_name_password2 = '13432881156:liang452035397' user_name_password3 = '[email protected]:810214bee810214' user_name_password4 = '15018377821:zzm15331411' user_name_password5 = '15767199023:j980216' user_name_password6 = '[email protected]:uwinvip' user_list = [ user_name_password1, user_name_password2, user_name_password3, user_name_password4, user_name_password5, user_name_password6 ] user_choice = random.choice(user_list) user_name_password = user_choice.split(':') return user_name_password def searchArticle(self, keywordList, endTime): ''' 根据关键字数组,开始时间和结束时间范围搜索文章 @param keywordList: 关键字数组 @param endTime: 搜索时间范围结束 ''' run_msg = '微博爬虫开始运行' self.db.Insert(self.channel.channel_id, self.entityId, run_msg) startTime = endTime - datetime.timedelta(hours=2) # startTime=datetime.datetime(2017,11,20,23) page = 1 articleList = list() hasnext = True while hasnext: data = self.__searchByPage(keywordList, startTime, endTime, page) (articleListInPage, hasnext) = self.__parseSearchPage(data) articleList.extend(articleListInPage) page += 1 if 
articleList is None: article_msg = '微博没有爬取到数据' self.email.send(self.monitor_title, article_msg) self.db.Insert(self.channel.channel_id, self.entityId, article_msg) end_msg = '微博爬虫结束' self.db.Insert(self.channel.channel_id, self.entityId, end_msg) return articleList def __searchByPage(self, keywordList, startTime, endTime, page): query = urllib.quote(' '.join(keywordList).encode('utf-8')) params = { 'typeall': '1', 'suball': '1', # 包含全部 'timescope': 'custom:%s:%s' % (startTime.strftime("%Y-%m-%d"), (endTime.strftime("%Y-%m-%d"))), # 时间 # 微博搜索的时间范围格式不同 不能写%Y-%m-%d-%H 'Refer': 'SWeibo_box', 'page': page } user_agent = random.choice(AGENTS) headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.8', # 'Host': 's.weibo.com', # 'Referer': 'http://s.weibo.com/', 'User-Agent': user_agent } index_url = 'http://s.weibo.com/weibo/' + query # 搜索主页+ usename_cookie = self.change_cookie() self.logger.debug('Use Cookie %s' % usename_cookie) try: cookies = self.__load_cookies_from_lwp(usename_cookie) html = self.session.get(index_url, params=params, headers=headers, cookies=cookies) # 加载本地cookies lindex = html.find( '<script>STK && STK.pageletM && STK.pageletM.view({"pid":"pl_weibo_direct"' ) rindex = html[lindex:].find('</script>') rindex = lindex + rindex - 1 lindex = lindex + len( '<script>STK && STK.pageletM && STK.pageletM.view(') jo = json.loads(html[lindex:rindex]) data = jo['html'] # 实时微博页 self.logger.debug('Get data') return data except Exception as e: self.logger.debug('ERROR %s' % e) loginFlag = self.__login() self.logger.debug('Use username: %s' % self.user_name) if loginFlag is False: self.logger.error('Fail to logon') login_msg = '微博登录失败' self.email.send(self.monitor_title, login_msg) self.db.Insert(self.channel.channel_id, self.entityId, login_msg) return cookies = self.__load_cookies_from_lwp(self.user_name) self.logger.debug('Get a new Cookie: %s' % cookies) html = self.session.get(index_url, params=params, headers=headers, cookies=cookies) # 加载本地cookies lindex = html.find( '<script>STK && STK.pageletM && STK.pageletM.view({"pid":"pl_weibo_direct"' ) rindex = html[lindex:].find('</script>') rindex = lindex + rindex - 1 lindex = lindex + len( '<script>STK && STK.pageletM && STK.pageletM.view(') jo = json.loads(html[lindex:rindex]) data = jo['html'] # 实时微博页 return data # self.logger.warning('Crawler failed: %s' % e) # msg = '没有获取到json数据,说明微博爬虫挂了' # self.email.send(self.monitor_title,msg) # self.db.Insert(self.channel.channel_id,self.entityId,msg) def __parseSearchPage(self, data): ''' @return: (articleList,hasnext) ''' articleList = list() hasnext = False soup = BeautifulSoup(data, "lxml") self.logger.info(soup) # check if no result noResultDivList = soup.findAll('div', {'class': 'pl_noresult'}) if len(noResultDivList) > 0: hasnext = False self.logger.info('No result') return (articleList, hasnext) # find page bar to check if more pageDivList = soup.findAll('div', {'class': 'W_pages'}) if len(pageDivList) > 0: pageDiv = pageDivList[0] if len(pageDiv.findAll('a', {'class': 'page next S_txt1 S_line1'})) > 0: hasnext = True if hasnext is False: self.logger.info('The last page') root_1 = soup.findAll('div', {"action-type": "feed_list_item"}) # self.logger.debug(root_1) for r in root_1: root_2 = r.find('div', {'class': "content clearfix"}) mid = r.attrs['mid'] article_url = root_2.find('div', { 'class': "feed_from W_textb" }).findNext('a').attrs['href'] self.logger.debug('1 
%s', article_url) if not article_url.startswith('http:'): article_url = 'http:' + article_url # self.logger.debug(article_url) root_content = root_2.find('p', {'class': "comment_txt"}) long_content = root_content.find('a', {'action-type': "fl_unfold"}) try: link_content = root_content.find('a').attrs['href'] link_content = ' 原文链接: ' + link_content except: link_content = '' if long_content: content_url = 'http://s.weibo.com/ajax/direct/morethan140?' + long_content.attrs[ 'action-data'] self.session.randomSleep() response = self.session.get(content_url, textRspOnly=False) try: content_html = response.json()['data']['html'] content = BeautifulSoup( content_html, 'html.parser').text.strip().replace( "'", "''").replace("%", "\%").replace(":", "\:") except Exception, e: self.logger.debug('Exception: %s' % e) continue else: content = root_content.text.strip().replace("'", "''").replace( "%", "\%").replace(":", "\:") content = content + link_content # self.logger.error(content) title = content[:30].replace("'", "''").replace("%", "\%").replace( ":", "\:") + ' ' author_id = r.attrs['tbinfo'] author_id = re.findall(r'ouid=(\d+)', author_id)[0] author_name = root_2.find('a').attrs['nick-name'] publish_datetime = root_2.find('a', { 'class': "W_textb" }).attrs['date'] try: publish_datetime = time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime(float(publish_datetime) / 1000)) except: continue article = Article(mid, self.channel.channel_id, title, content, publish_datetime, url=article_url, author_id=author_id, author_name=author_name) # fetch statistics root_3 = r.find('div', {'class': "feed_action clearfix"}) soup_li_list = root_3.findAll('li') self.__parseStatistics(article, soup_li_list) # print mid, article_url, add_datetime, channeltype, channel, title, content, author_id, author_name, \ # publish_datetime, reply_count, read_count, like_count, collect_count, forward_count if article not in articleList: articleList.append(article) return (articleList, hasnext)
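# __searchByPage above recovers the search-result markup from the inline
# <script>STK && STK.pageletM && STK.pageletM.view({...})</script> block and takes the
# "html" field of that JSON. A minimal standalone sketch of that extraction step,
# assuming `html` is the raw page text already fetched (the function name is illustrative):
import json

def extract_weibo_feed_html(html):
    marker = '<script>STK && STK.pageletM && STK.pageletM.view('
    lindex = html.find(marker + '{"pid":"pl_weibo_direct"')
    if lindex < 0:
        return None  # layout changed or the session is not logged in
    rindex = lindex + html[lindex:].find('</script>') - 1   # drop the trailing ")"
    lindex = lindex + len(marker)
    jo = json.loads(html[lindex:rindex])
    return jo['html']  # the feed markup that __parseSearchPage consumes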
class WeiboCrawler(object): ''' classdocs ''' def __init__(self, channel, logger=None): ''' Constructor ''' if logger is None: self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT) else: self.logger = logger self.channel = channel # self.user_name = "*****@*****.**" # self.session = SessionCrawler(sleepRange=[3,8]) # self.pass_word = "810214bee810214" # self.user_name = "15018377821" # self.pass_word = "zzm15331411" self.user_name_password = self.get_username_password() self.user_name = self.user_name_password[0] self.pass_word = self.user_name_password[1] self.logger.info('username: %s' % self.user_name) self.session = SessionCrawler(sleepRange=[3, 8]) def get_username_password(self): ''' 随机获取用户和密码 :return: ''' user_name_password1 = '18814095644:ljda.18814095644' user_name_password2 = '13432881156:liang452035397' user_name_password3 = '[email protected]:810214bee810214' user_name_password4 = '15018377821:zzm15331411' user_name_password5 = '[email protected]:uwinvip' user_name_password6 = '15767199023:j980216' user_list = [ user_name_password1, user_name_password2, user_name_password3, user_name_password4, user_name_password5, user_name_password6 ] user_choice = random.choice(user_list) user_name_password = user_choice.split(':') return user_name_password def searchArticle(self, keywordList, endTime): ''' 根据关键字数组,开始时间和结束时间范围搜索文章 @param keywordList: 关键字数组 @param endTime: 搜索时间范围结束 ''' # time_now = time.time() # date = time.strftime('%Y-%m-%d', time.localtime(time.time())) # com_time_low = time.mktime(time.strptime(date + ' 07:00:00', '%Y-%m-%d %H:%M:%S')) # com_time_hig = time.mktime(time.strptime(date + ' 07:59:58', '%Y-%m-%d %H:%M:%S')) # if time_now > com_time_low and time_now < com_time_hig: # loginFlag = self.__login() # if loginFlag is False: # self.logger.error('Fail to logon') # return startTime = endTime - datetime.timedelta(hours=2) # startTime=datetime.datetime(2017,11,20,23) page = 1 articleList = list() hasnext = True while hasnext: data = self.__searchByPage(keywordList, startTime, endTime, page) (articleListInPage, hasnext) = self.__parseSearchPage(data) articleList.extend(articleListInPage) page += 1 return articleList def crawlStatistics(self, article): ''' 爬取统计信息 @return: 无需返回参数,统计信息写入article实例 ''' # return try: (data, check) = self.__fetchSingleArticle(article) if check == '0': soup = BeautifulSoup(data, 'lxml') ulList = soup.findAll( 'ul', {'class': 'WB_row_line WB_row_r4 clearfix S_line2'}) li_list = ulList[0].findAll('li') self.__parseStatistics(article, li_list) elif check == '1': self.logger.warning(u'要访问的网页404了:%s', article.url) return else: self.logger.warning(u'抱歉,你访问的页面地址有误,或者该页面不存在:%s', article.url) return except: self.logger.error('Fail to fetch statistics for:%s, %s', article.url, traceback.format_exc()) return def __save_cookies_lwp(self, cookiejar): """ 保存cookies到本地 """ filename = 'sinaweibocookies' lwp_cookiejar = cookielib.LWPCookieJar() for c in cookiejar: args = dict(vars(c).items()) args['rest'] = args['_rest'] del args['_rest'] c = cookielib.Cookie(**args) lwp_cookiejar.set_cookie(c) lwp_cookiejar.save(filename, ignore_discard=True) def __load_cookies_from_lwp(self): """ 读取本地cookies """ filename = 'sinaweibocookies' lwp_cookiejar = cookielib.LWPCookieJar() lwp_cookiejar.load(filename, ignore_discard=True) # self.logger.debug(lwp_cookiejar) return lwp_cookiejar def __parseStatistics(self, article, soup_li_list): # 新版 collect_count = soup_li_list[0].find('span').text collect_count = re.findall(r'\d+', collect_count) if len(collect_count) > 0: 
collect_count = int(collect_count[0]) else: collect_count = 0 forward_count = soup_li_list[1].find('span').text forward_count = re.findall(r'\d+', forward_count) if len(forward_count) > 0: forward_count = int(forward_count[0]) else: forward_count = 0 reply_count = soup_li_list[2].find('span').text reply_count = re.findall(r'\d+', reply_count) if len(reply_count) > 0: reply_count = int(reply_count[0]) else: reply_count = 0 like_count = soup_li_list[3].find('span').text like_count = re.findall(r'\d+', like_count) if len(like_count) > 0: like_count = int(like_count[0]) else: like_count = 0 article.statistics.reply_count = reply_count article.statistics.like_count = like_count article.statistics.collect_count = collect_count article.statistics.forward_count = forward_count def __parseSearchPage(self, data): ''' @return: (articleList,hasnext) ''' articleList = list() hasnext = False soup = BeautifulSoup(data, "lxml") # self.logger.info(soup) #check if no result noResultDivList = soup.findAll('div', {'class': 'pl_noresult'}) if len(noResultDivList) > 0: hasnext = False self.logger.info('No result') return (articleList, hasnext) #find page bar to check if more pageDivList = soup.findAll('div', {'class': 'W_pages'}) if len(pageDivList) > 0: pageDiv = pageDivList[0] if len(pageDiv.findAll('a', {'class': 'page next S_txt1 S_line1'})) > 0: hasnext = True if hasnext is False: self.logger.info('The last page') root_1 = soup.findAll('div', {"action-type": "feed_list_item"}) # self.logger.debug(root_1) for r in root_1: root_2 = r.find('div', {'class': "content clearfix"}) mid = r.attrs['mid'] article_url = root_2.find('div', { 'class': "feed_from W_textb" }).findNext('a').attrs['href'] self.logger.debug('1 %s', article_url) if not article_url.startswith('http:'): article_url = 'http:' + article_url # self.logger.debug(article_url) root_content = root_2.find('p', {'class': "comment_txt"}) long_content = root_content.find('a', {'action-type': "fl_unfold"}) try: link_content = root_content.find('a').attrs['href'] link_content = ' 原文链接: ' + link_content except: link_content = '' if long_content: content_url = 'http://s.weibo.com/ajax/direct/morethan140?' 
+ long_content.attrs[ 'action-data'] self.session.randomSleep() response = self.session.get(content_url, textRspOnly=False) try: content_html = response.json()['data']['html'] content = BeautifulSoup( content_html, 'html.parser').text.strip().replace( "'", "''").replace("%", "\%").replace(":", "\:") except Exception, e: self.logger.debug('Exception: %s' % e) continue else: content = root_content.text.strip().replace("'", "''").replace( "%", "\%").replace(":", "\:") content = content + link_content # self.logger.error(content) title = content[:30].replace("'", "''").replace("%", "\%").replace( ":", "\:") + ' ' author_id = r.attrs['tbinfo'] author_id = re.findall(r'ouid=(\d+)', author_id)[0] author_name = root_2.find('a').attrs['nick-name'] publish_datetime = root_2.find('a', { 'class': "W_textb" }).attrs['date'] try: publish_datetime = time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime(float(publish_datetime) / 1000)) except: continue article = Article(mid, self.channel.channel_id, title, content, publish_datetime, url=article_url, author_id=author_id, author_name=author_name) #fetch statistics root_3 = r.find('div', {'class': "feed_action clearfix"}) soup_li_list = root_3.findAll('li') self.__parseStatistics(article, soup_li_list) # print mid, article_url, add_datetime, channeltype, channel, title, content, author_id, author_name, \ # publish_datetime, reply_count, read_count, like_count, collect_count, forward_count if article not in articleList: articleList.append(article) return (articleList, hasnext)
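# __save_cookies_lwp / __load_cookies_from_lwp above persist the logged-in Weibo session
# as an LWP cookie file and read it back before later searches. A hedged round-trip
# sketch of the same idea against a plain requests-style cookie jar (the project's
# SessionCrawler is assumed to sit on top of one; the function names are illustrative):
import cookielib

def save_cookies(session, filename='sinaweibocookies'):
    jar = cookielib.LWPCookieJar()
    for c in session.cookies:
        args = dict(vars(c).items())
        args['rest'] = args.pop('_rest')   # cookielib.Cookie() expects "rest", not "_rest"
        jar.set_cookie(cookielib.Cookie(**args))
    jar.save(filename, ignore_discard=True)

def load_cookies(session, filename='sinaweibocookies'):
    jar = cookielib.LWPCookieJar()
    jar.load(filename, ignore_discard=True)
    session.cookies.update(jar)            # reuse the saved login without a fresh __login()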
class WeiboCrawler(object): ''' classdocs ''' def __init__(self, channel=None, logger=None): ''' Constructor ''' # if logger is None: # # self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT) # # else: # # self.logger = logger self.logger = logging.getLogger() self.channel = channel self.user_name_password = self.get_username_password() self.user_name = self.user_name_password[0] self.pass_word = self.user_name_password[1] # self.logger.info('username: %s' % self.user_name) print 'username: %s' % self.user_name self.session = SessionCrawler(sleepRange=[3, 8]) def get_username_password(self): ''' 随机获取用户和密码 :return: ''' # 超算1的微博账号 # user_name_password1 = '18814095644:ljda.18814095644' # user_name_password2 = '13432881156:liang452035397' # user_name_password3 = '[email protected]:810214bee810214' # user_name_password4 = '15018377821:zzm15331411' # user_name_password5 = '[email protected]:uwinvip' # 超算2的微博账号 # user_name_password1 = '13432881156:liang452035397' # user_name_password2 = '[email protected]:uwinvip' # user_name_password3 = '15767199023:j980216' # user_name_password1 = '13427287354:4ova7zixzj' # user_name_password2 = '13532011721:1emr41761u' # user_name_password3 = '13640792755:1eek9uuym4' # user_name_password4 = '13697726577:7hviv4old0' # user_name_password5 = '13794342903:6imuw2cdya' # 197的微博账号 user_name_password1 = '17825769929:4ms7e2v3zx' user_name_password2 = '18211493432:7fagvqyi9p' user_name_password3 = '17827278983:0nenzag325' user_name_password4 = '13922771190:5aqa10wvwf' user_name_password5 = '15999916968:2i45j5b49y' user_list = [ user_name_password1, user_name_password2, user_name_password3, user_name_password4, user_name_password5 ] # user_list = [user_name_password1, user_name_password2, user_name_password3] user_choice = random.choice(user_list) user_name_password = user_choice.split(':') return user_name_password # 随机选取User-Agent def get_random_agent(self): user_agent = [ "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2595.400 QQBrowser/9.6.10872.400", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; Trident/5.0)", ] random_header = random.choice(user_agent) return random_header def searchArticle(self, keywordList, endTime): ''' 根据关键字数组,开始时间和结束时间范围搜索文章 @param keywordList: 关键字数组 @param endTime: 搜索时间范围结束 ''' self.__login() # startTime = endTime - datetime.timedelta(hours=2) # # startTime=datetime.datetime(2017,11,20,23) # page = 1 # articleList = list() # hasnext = True # while hasnext: # data = self.__searchByPage(keywordList, startTime,endTime, page) # (articleListInPage,hasnext) = self.__parseSearchPage(data) # # articleList.extend(articleListInPage) # page+=1 # return articleList def crawlStatistics(self, article): ''' 爬取统计信息 @return: 无需返回参数,统计信息写入article实例 ''' # return try: (data, check) = self.__fetchSingleArticle(article) if check == '0': soup = BeautifulSoup(data, 'lxml') ulList = soup.findAll( 'ul', {'class': 'WB_row_line WB_row_r4 clearfix S_line2'}) li_list 
= ulList[0].findAll('li') self.__parseStatistics(article, li_list) elif check == '1': self.logger.warning(u'要访问的网页404了:%s', article.url) return else: self.logger.warning(u'抱歉,你访问的页面地址有误,或者该页面不存在:%s', article.url) return except: self.logger.error('Fail to fetch statistics for:%s, %s', article.url, traceback.format_exc()) return def __save_cookies_lwp(self, cookiejar): """ 保存cookies到本地 """ filename = 'sinaweibocookies' lwp_cookiejar = cookielib.LWPCookieJar() for c in cookiejar: args = dict(vars(c).items()) args['rest'] = args['_rest'] del args['_rest'] c = cookielib.Cookie(**args) lwp_cookiejar.set_cookie(c) lwp_cookiejar.save(filename, ignore_discard=True) def __load_cookies_from_lwp(self): """ 读取本地cookies """ filename = 'sinaweibocookies' lwp_cookiejar = cookielib.LWPCookieJar() lwp_cookiejar.load(filename, ignore_discard=True) return lwp_cookiejar def __parseStatistics(self, article, soup_li_list): # 新版 collect_count = soup_li_list[0].find('span').text collect_count = re.findall(r'\d+', collect_count) if len(collect_count) > 0: collect_count = int(collect_count[0]) else: collect_count = 0 forward_count = soup_li_list[1].find('span').text forward_count = re.findall(r'\d+', forward_count) if len(forward_count) > 0: forward_count = int(forward_count[0]) else: forward_count = 0 reply_count = soup_li_list[2].find('span').text reply_count = re.findall(r'\d+', reply_count) if len(reply_count) > 0: reply_count = int(reply_count[0]) else: reply_count = 0 like_count = soup_li_list[3].find('span').text like_count = re.findall(r'\d+', like_count) if len(like_count) > 0: like_count = int(like_count[0]) else: like_count = 0 article.statistics.reply_count = reply_count article.statistics.like_count = like_count article.statistics.collect_count = collect_count article.statistics.forward_count = forward_count def __parseSearchPage(self, data): ''' @return: (articleList,hasnext) ''' articleList = list() hasnext = False soup = BeautifulSoup(data, "lxml") # self.logger.info(soup) #check if no result noResultDivList = soup.findAll('div', {'class': 'pl_noresult'}) if len(noResultDivList) > 0: hasnext = False self.logger.info('No result') return (articleList, hasnext) #find page bar to check if more pageDivList = soup.findAll('div', {'class': 'W_pages'}) if len(pageDivList) > 0: pageDiv = pageDivList[0] if len(pageDiv.findAll('a', {'class': 'page next S_txt1 S_line1'})) > 0: hasnext = True if hasnext is False: self.logger.info('The last page') root_1 = soup.findAll('div', {"action-type": "feed_list_item"}) # self.logger.debug(root_1) for r in root_1: root_2 = r.find('div', {'class': "content clearfix"}) mid = r.attrs['mid'] article_url = root_2.find('div', { 'class': "feed_from W_textb" }).findNext('a').attrs['href'] self.logger.debug('1 %s', article_url) if not article_url.startswith('http:'): article_url = 'http:' + article_url # self.logger.debug(article_url) root_content = root_2.find('p', {'class': "comment_txt"}) long_content = root_content.find('a', {'action-type': "fl_unfold"}) try: link_content = root_content.find('a').attrs['href'] link_content = ' 原文链接: ' + link_content except: link_content = '' if long_content: content_url = 'http://s.weibo.com/ajax/direct/morethan140?' 
+ long_content.attrs[ 'action-data'] self.session.randomSleep() response = self.session.get(content_url, textRspOnly=False) try: content_html = response.json()['data']['html'] content = BeautifulSoup( content_html, 'html.parser').text.strip().replace( "'", "''").replace("%", "\%").replace(":", "\:") except Exception, e: self.logger.debug('Exception: %s' % e) continue else: content = root_content.text.strip().replace("'", "''").replace( "%", "\%").replace(":", "\:") content = content + link_content # self.logger.error(content) title = content[:30].replace("'", "''").replace("%", "\%").replace( ":", "\:") + ' ' author_id = r.attrs['tbinfo'] author_id = re.findall(r'ouid=(\d+)', author_id)[0] author_name = root_2.find('a').attrs['nick-name'] publish_datetime = root_2.find('a', { 'class': "W_textb" }).attrs['date'] try: publish_datetime = time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime(float(publish_datetime) / 1000)) except: continue article = Article(mid, self.channel.channel_id, title, content, publish_datetime, url=article_url, author_id=author_id, author_name=author_name) #fetch statistics root_3 = r.find('div', {'class': "feed_action clearfix"}) soup_li_list = root_3.findAll('li') self.__parseStatistics(article, soup_li_list) if article not in articleList: articleList.append(article) return (articleList, hasnext)
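# __parseStatistics above repeats the same "first integer in the span, else 0" pattern
# four times (collect/forward/reply/like). A small helper capturing that pattern; the
# helper name is illustrative only and is not part of the project:
import re

def first_int_or_zero(text):
    found = re.findall(r'\d+', text)
    return int(found[0]) if found else 0

# e.g. first_int_or_zero('forward 12') -> 12, first_int_or_zero('forward') -> 0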
class WeiXinCrawlerByTopic(object):
    def __init__(self, sleepRange, logger):
        '''
        Constructor
        '''
        self.logger = logger
        self.session = SessionCrawler(None, sleepRange, self.logger)

    def __querySogou(self, sougoUrl):
        '''
        Given the official account id, we expect to uniquely find one and only one item
        Return the url to the official account
        '''
        self.logger.debug('Query sougo %s', sougoUrl)
        content = self.session.get(sougoUrl, SOUGO_HEADER)
        soup = BeautifulSoup(content)
        ul = soup.findAll('ul', {'class': 'news-list'})[0]
        liList = ul.findAll('li')
        for li in liList:
            aList = li.findAll('a')
            articleUrl = None
            for a in aList:
                if a['uigs'].startswith('article_title'):
                    articleUrl = a['href']
                    break
            if articleUrl is not None:
                header = WX_HEADER.copy()
                header['Referer'] = sougoUrl
                self.session.randomSleep()
                content = self.session.get(articleUrl, header)
                article = self.parseArticle(content)
                if article is None:
                    # parseArticle returns None for banned or empty posts; skip them
                    continue
                article.contentUrl = articleUrl
                queryComment(self.session, articleUrl, article)

    def parseArticle(self, content):
        soup = BeautifulSoup(content)
        article = Article()
        # content
        div = soup.findAll('div', {'class': 'rich_media_content'})
        if len(div) == 0:
            # it may be due to that this post has been banned
            self.logger.warn('No content')
            return None
            # raise Exception('No content for %s' % article.title)
        article.content = div[0].text
        # title in <title> ... </title>
        title = soup.findNext('title')
        article.title = title.text
        article.wid = hash(article.title)
        # find meta list
        divMeta = soup.findAll('div', {'class': 'rich_media_meta_list'})[0]
        # post date
        emPostdate = divMeta.findAll('em', {'id': 'post-date'})[0]
        article.publishDateTime = time.mktime(
            datetime.datetime.strptime(emPostdate.text, '%Y-%m-%d').timetuple())
        # author
        emAuthorList = divMeta.findAll('em', {'class': 'rich_media_meta rich_media_meta_text'})
        for em in emAuthorList:
            if 'id' not in em.attrs:
                article.author = em.text
                break
        # profile
        divProfile = divMeta.findAll('div', {'class': 'profile_inner'})[0]
        # nickname
        strong = divProfile.findAll('strong', {'class': 'profile_nickname'})[0]
        article.userName = strong.text
        # userid
        article.userId = strong.findNext('span').text
        return article
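# A hedged usage sketch for the parser above: run parseArticle against a locally saved
# mp.weixin.qq.com page instead of a live Sogou fetch. The file name, logger setup and
# sleepRange value are illustrative assumptions.
import logging

def demo_parse_saved_page(path='sample_weixin_article.html'):
    crawler = WeiXinCrawlerByTopic(sleepRange=[3, 8], logger=logging.getLogger(__name__))
    with open(path) as f:
        article = crawler.parseArticle(f.read())
    if article is not None:
        print article.title, article.userName, article.publishDateTime
    return article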
class WeiXinCralwer(object): ''' classdocs ''' def __init__(self, sleepRange, logger): ''' Constructor ''' self.logger = logger self.session = SessionCrawler(None, sleepRange, self.logger) def __querySogou(self, sougoUrl): ''' Given the official account id, we expect to uniquely find one and only one item Return the url to the official account ''' self.logger.debug('Query sougo %s', sougoUrl) content = self.session.get(sougoUrl, SOUGO_HEADER) soup = BeautifulSoup(content) item = soup.findAll('a',{'uigs':'account_name_0'})[0] return item['href'] def __queryArticleList(self, sougoUrl, officialAccountUrl): self.logger.debug('Query ariticle list for %s', officialAccountUrl) header = WX_HEADER.copy() header['Referer'] = sougoUrl self.session.randomSleep() content = self.session.get(officialAccountUrl, header) lindex = content.find(WX_ARTICLE_LIST_PREFIX)+len(WX_ARTICLE_LIST_PREFIX) rindex = content.find(WX_ARTICLE_LIST_SUFFIX) rindex = lindex + content[lindex:rindex].rfind(';') js = content[lindex:rindex] jo = json.loads(js) aList = jo['list'] articleList = list() for item in aList: app_msg_ext_info = item['app_msg_ext_info'] comm_msg_info = item['comm_msg_info'] article = self.__fetchArticle(app_msg_ext_info) article.publishDateTime = comm_msg_info['datetime'] articleList.append(article) if 'multi_app_msg_item_list' in item: for embedItem in item['multi_app_msg_item_list']: article = self.__fetchArticle(embedItem) article.publishDateTime = comm_msg_info['datetime'] articleList.append(article) return articleList def __fetchArticle(self, item): article = Article() article.title = item['title'] article.wid = str(hash(article.title)) article.author = item['author'] article.contentUrl = item['content_url'] article.digest = item['digest'] article.fileid = item['fileid'] article.sourceUrl = item['source_url'] #print article.title,":",article.contentUrl,'\n' return article def __queryComment(self, articleList, referer): mainPageHeader = WX_HEADER.copy() mainPageHeader['Referer'] = referer for article in articleList: self.logger.debug('Query comment for %s', article.title) #find the signature lindex= article.contentUrl.find('signature=') rindex = article.contentUrl[lindex:].find('&') if rindex>0: signature = article.contentUrl[lindex:rindex] else: signature = article.contentUrl[lindex:] #find the timestamp lindex= article.contentUrl.find('timestamp=')+len('timestamp=') timestamp = int(article.contentUrl[lindex:lindex+10]) self.session.randomSleep() #query main page mainUrl = 'http://mp.weixin.qq.com'+article.contentUrl.replace('&','&') self.session.randomSleep() content = self.session.get(mainUrl, mainPageHeader) soup = BeautifulSoup(content) div = soup.findAll('div',{'class':'rich_media_content'}) if len(div)==0: #it may be due to that this post has been banned self.logger.warn('No content for %s', article.title) continue #raise Exception('No content for %s'% article.title) article.content = div[0].text #query comment page currentTime = int(time.time()) url = WX_COMMENT_URL % (timestamp, signature) #print url header = WX_COMMENT_HEADER.copy() header['Referer'] = mainUrl self.session.randomSleep() content = self.session.get(url, header) jo = json.loads(content) #print jo.keys() article.readCount = jo['read_num'] article.likeCount = jo['like_num'] commentList = jo['comment'] for item in commentList: comment = Comment() comment.commenterNickName = item['nick_name'] comment.likeCount = item['like_num'] comment.content = item['content'] comment.contentId = item['content_id'] comment.createTime = 
item['create_time'] for replyItem in item['reply']['reply_list']: reply = Reply() reply.content = replyItem['content'] reply.createTime = replyItem['create_time'] reply.uin = replyItem['uin'] reply.toUin = replyItem['to_uin'] reply.replyId = replyItem['reply_id'] comment.replyList.append(reply) article.commentList.append(comment) def crawl(self, officialAccountId): sougoUrl = SOUGO_QUERY_URL % (1, quote(officialAccountId)) officialAccountUrl = self.__querySogou(sougoUrl) articleList = self.__queryArticleList(sougoUrl, officialAccountUrl) self.__queryComment(articleList, officialAccountUrl) return articleList #self.__writeCsv(officialAccountId+'.csv', articleList) def writeDb(self, dbConf, officialAccountId, articleList): dbProxy = MySqlProxy(host=dbConf['dbHost'], port=3306, user=dbConf['dbUser'], passwd=dbConf['dbPasswd'], db=dbConf['dbName']) weixinSql = 'INSERT INTO T_WEIXIN (pid, wid, author, title, digest, content, publish_datetime, read_count, like_count) values ' commentSql = 'INSERT INTO T_WEIXIN_COMMENT(pid, cid, wid, content, publisher_name, publish_datetime,like_count) values ' replySql = 'INSERT INTO T_WEIXIN_REPLY (rid, cid, content, publish_datetime, uin, touin) values ' weixinValueList = list() commentValueList = list() replyValueList = list() widSet = set() for article in articleList: weixinValueList.append('("%s","%s","%s","%s","%s","%s","%s",%d,%d)'%( officialAccountId, str(article.wid), article.author.replace('"','\\"'), article.title.replace('"','\\"'), article.digest.replace('"','\\"'), article.content.replace('"','\\"'), time.strftime(ISOTIMEFORMAT, time.localtime(article.publishDateTime)), article.readCount, article.likeCount )) widSet.add(article.fileid) for comment in article.commentList: commentValueList.append('("%s","%s","%s","%s","%s","%s",%d)'%( officialAccountId, str(comment.contentId), str(article.wid), comment.content.replace('"','\\"'), comment.commenterNickName.replace('"','\\"'), time.strftime(ISOTIMEFORMAT, time.localtime(comment.createTime)), comment.likeCount )) for reply in comment.replyList: replyValueList.append('("%s","%s","%s","%s","%s","%s")'%( str(reply.replyId), str(comment.contentId), reply.content.replace('"','\\"'), time.strftime(ISOTIMEFORMAT, time.localtime(reply.createTime)), reply.uin, reply.toUin )) #clear the db firstly sql = 'delete from T_WEIXIN where wid in (%s) and pid="%s"' % (','.join(map(lambda x: '"'+str(x)+'"', widSet)), officialAccountId) dbProxy.execute(sql) sql = 'delete from T_WEIXIN_REPLY where cid in (select cid from T_WEIXIN_COMMENT where wid in (%s) and pid="%s")' % (','.join(map(lambda x: '"'+str(x)+'"', widSet)), officialAccountId) dbProxy.execute(sql) sql = 'delete from T_WEIXIN_COMMENT where wid in (%s) and pid="%s"' % (','.join(map(lambda x: '"'+str(x)+'"', widSet)), officialAccountId) dbProxy.execute(sql) #insert to db if len(weixinValueList)>0: self.logger.info('Insert %d records to weixin', len(weixinValueList)) dbProxy.execute(weixinSql +','.join(weixinValueList)) if len(commentValueList)>0: self.logger.info('Insert %d records to comment', len(commentValueList)) dbProxy.execute(commentSql +','.join(commentValueList)) if len(replyValueList)>0: self.logger.info('Insert %d records to reply', len(replyValueList)) dbProxy.execute(replySql +','.join(replyValueList)) def __writeCsv(self, fileName, articleList): #f = codecs.open(fileName,"w","utf-8") csvfile = file(fileName,'w') csvfile.write(codecs.BOM_UTF8) writer = csv.writer(csvfile) header = ['Title', 'Digest', 'Author', 'readCount', 'likeCount', 
'publishDateTime', 'Comment-NickName', 'Comment-Content', 'Comment-likeCount', 'Comment-CreateTime'] writer.writerow(header) for article in articleList: writer.writerow( ( article.title.encode('utf8'), article.digest.encode('utf8'), article.author.encode('utf8'), article.readCount, article.likeCount, time.strftime(ISOTIMEFORMAT, time.localtime(article.publishDateTime)), '', '', '', '' ) ) for comment in article.commentList: writer.writerow( ( '', '', '', '', '', '', comment.commenterNickName.encode('utf8'), comment.content.encode('utf8'), comment.likeCount, time.strftime(ISOTIMEFORMAT, time.localtime(comment.createTime)) ) ) csvfile.close()
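# __queryComment above pulls "signature" and "timestamp" out of article.contentUrl by
# manual find()/slicing; note that its rindex is computed on a slice of the string but
# applied to the full string, so that extraction is fragile. A hedged alternative sketch
# with the standard-library query-string parser (Python 2 urlparse); the function name
# is illustrative only:
import urlparse

def extract_signature_and_timestamp(content_url):
    query = urlparse.urlparse(content_url.replace('&amp;', '&')).query
    params = urlparse.parse_qs(query)
    signature = params.get('signature', [''])[0]
    timestamp = int(params.get('timestamp', ['0'])[0])
    return signature, timestamp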
class WetchartpublicCrawler(object):
    '''
    classdocs
    '''

    def __init__(self, channel, logger=None):
        '''
        Constructor
        :param channel:
        :param logger:
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.session = SessionCrawler(logger=self.logger)
        self.channel = channel

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles by keyword list within the time range ending at endTime
        :param keywordList: list of keywords
        :param endTime: end of the search time range
        :return:
        '''
        for keyword in keywordList:
            pageUrl = SOUGO_WECHARTPUBLIC_URL_INIT % (keyword)
            self.logger.debug('pageUrl:%s', pageUrl)
            response = self.session.get(pageUrl, textRspOnly=False, headers=HEADERS_SOGOU)
            soup = BeautifulSoup(response.text)
            main = soup.find('ul', {'class': "news-list2"})
            if main is None:
                self.logger.error('Fail to parse:%s', response.text)
                continue  # nothing to parse for this keyword
            li_list = main.findAll('li')
            for li in li_list:
                a_title = li.find('p', {'class': "tit"})
                if a_title is not None:
                    title = str(a_title.text.strip())
                    if title == keyword:
                        a_href = a_title.find('a')['href']
                        self.logger.debug(a_href)
                        self.crawlWetchartpublic(a_href)

    def crawlWetchartpublic(self, url):
        response = self.session.get(url, textRspOnly=False, headers=HEADERS_SOGOU_PUBLIC)
        soup = BeautifulSoup(response.text)
        script_list = soup.findAll('script')
        for li in script_list:
            li_str = str(li.text)
            sub_str1 = "msgList = "
            sub_str2 = '}]};'
            if li_str.find(sub_str1) != -1:
                index1 = li_str.find(sub_str1)
                index2 = li_str.find(sub_str2)
                main = str(li.text)[index1 + len(sub_str1):index2 + 3]
                articleJson = json.loads(main)
                articlelist = articleJson['list']
                for item in articlelist:
                    mes_info = item['app_msg_ext_info']
                    url = 'https://mp.weixin.qq.com' + mes_info['content_url']
                    url = url.replace('amp;', '')
                    self.crawlArticle(url)
                    multi_item_list = mes_info['multi_app_msg_item_list']
                    for multi_item in multi_item_list:
                        multi_url = 'https://mp.weixin.qq.com' + multi_item['content_url']
                        multi_url = multi_url.replace('amp;', '')
                        self.crawlArticle(multi_url)

    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        :return: an Article instance
        '''
        self.session.randomSleep()
        response = self.session.get(url, textRspOnly=False)

    def crawlStatistics(self, article):
        '''
        Crawl statistics
        :param article:
        :return: nothing is returned; statistics are written into the article instance
        '''
        pass

    def refreshSearch(self):
        '''
        Reset the search
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset the comment crawler
        '''
        pass

    def crawlComment(self, article):
        '''
        Crawl the comments of an article and return the comment list
        :param article:
        :return: (commentList, hasnext) tuple; commentList is a list of Comment
                 instances, hasnext indicates whether more comments remain to crawl
        '''
        pass
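# crawlWetchartpublic above recovers the account's history feed from the inline
# "msgList = {...};" assignment inside a <script> tag. A minimal standalone sketch of
# that extraction step, assuming page_html is the raw history-page text (the function
# name is illustrative):
import json

def extract_msg_list(page_html):
    start_marker = 'msgList = '
    end_marker = '}]};'
    start = page_html.find(start_marker)
    if start < 0:
        return []
    end = page_html.find(end_marker, start)
    raw_json = page_html[start + len(start_marker):end + 3]   # keep the closing "}]}"
    return json.loads(raw_json)['list']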
class ZhihuCrawler(object):
    '''
    classdocs
    '''

    def __init__(self, channel, logger=None):
        '''
        Constructor
        '''
        if logger is None:
            self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT)
        else:
            self.logger = logger
        self.baiduCrawler = BaiduCrawler(self.logger)
        self.session = SessionCrawler()
        self.channel = channel
        self.nextCommentUrl = None

    def searchArticle(self, keywordList, endTime):
        '''
        Search articles by keyword list within the time range ending at endTime
        @param keywordList: list of keywords
        @param endTime: end of the search time range
        '''
        startTime = endTime - datetime.timedelta(days=self.channel.search_ranges)
        startTimeIntSecond = time.mktime(startTime.timetuple())
        endTimeIntSecond = time.mktime(endTime.timetuple())
        urls = self.baiduCrawler.search(self.channel.url, keywordList,
                                        startTimeIntSecond, endTimeIntSecond)
        articleList = list()
        for baiduUrl in urls:
            url = self.__fetchRealUrlFromBaiduUrl(baiduUrl)
            article = self.crawlArticle(url)
            if article is not None and article not in articleList:
                # The same article may appear several times in the search results: the Baidu
                # result urls are unique but may point to the same article, so deduplicate here
                articleList.append(article)
        return articleList

    def __fetchRealUrlFromBaiduUrl(self, baiduUrl):
        '''
        Resolve the Baidu redirect url to the real target url
        '''
        response = self.session.session.get(baiduUrl, allow_redirects=False)
        if response.status_code == 302:
            return response.headers['Location']

    def crawlArticle(self, url):
        '''
        Crawl the article content and statistics for the given url
        @return: an Article instance
        '''
        # Check the url format: Baidu results are not necessarily questions,
        # and only questions are crawled for now
        if url.find('question') < 0:
            self.logger.warn('Question supported only:%s', url)
            return None
        article_id = re.findall(r'question/(\d+)', url)[0]
        self.session.randomSleep()
        response = self.session.get(url, headers=CRAWL_ARTICLE_HEADERS)
        soup = BeautifulSoup(response)
        main = soup.find('div', attrs={'id': "data"}).attrs['data-state']
        articleJson = json.loads(main)
        questionJson = articleJson['entities']['questions'][article_id]
        title = questionJson['title']
        contentSoup = BeautifulSoup(questionJson['editableDetail'])
        content = contentSoup.text
        author_id = questionJson['author']['id']
        author_name = questionJson['author']['name']
        createTimeInFloat = questionJson['created']
        publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime(createTimeInFloat))
        reply_count = questionJson['commentCount']
        read_count = questionJson['visitCount']
        collect_count = questionJson['followerCount']
        article = Article(article_id, self.channel.channel_id, title, content,
                          publish_datetime, url, author_id, author_name)
        article.statistics.reply_count = reply_count
        article.statistics.read_count = read_count
        article.statistics.collect_count = collect_count
        return article

    def crawlStatistics(self, article):
        '''
        Crawl statistics
        @return: nothing is returned; statistics are written into the article instance
        '''
        articleCopy = self.crawlArticle(article.url)
        if articleCopy is None:
            # crawlArticle only handles question pages; skip anything it rejects
            return
        article.statistics.reply_count = articleCopy.statistics.reply_count
        article.statistics.read_count = articleCopy.statistics.read_count
        article.statistics.collect_count = articleCopy.statistics.collect_count

    def refreshSearch(self):
        '''
        Reset the search
        '''
        pass

    def refreshCommentCrawler(self):
        '''
        Reset the comment crawler
        '''
        self.nextCommentUrl = None

    def crawlComment(self, article):
        '''
        Crawl the comments of an article and return the comment list
        @return: (commentList, hasnext) tuple; commentList is a list of Comment
                 instances, hasnext indicates whether more comments remain to crawl
        '''
        if self.nextCommentUrl is None:
            curl = COMMENT_URL_TEMPLATE % (article.tid, COMMENT_PAGE_SIZE, 0)
            curl = curl.replace('#', '%')
        else:
            curl = self.nextCommentUrl
        self.session.randomSleep()
        result = self.session.get(curl, headers=CRAWL_COMMENT_HEADERS)
        jo = json.loads(result)
        paging = jo['paging']
        hasnext = not paging['is_end']
        self.nextCommentUrl = paging['next']
        dataList = jo['data']
        add_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        commentList = list()
        for data in dataList:
            # self.logger.debug('[Zhihu]Comment data keys:%s', data.keys())
            # self.logger.debug('[ZHIHU]Comment url for %s:%s', article.title, data['url'])
            publish_datetime = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime(data['updated_time']))
            comment = Comment(article.tid, article.channel_id, data['id'],
                              add_datetime, publish_datetime,
                              ip_address=None, location_country=None,
                              location_region=None, location_city=None,
                              author_id=data['author']['id'],
                              author_name=data['author']['name'],
                              content=data['content'],
                              reply_author_id=None, read_count=None,
                              like_count=data['voteup_count'],
                              reply_count=data['comment_count'],
                              dislike_count=None)
            commentList.append(comment)
        return (commentList, hasnext)
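# crawlComment above is stateful: it follows paging['next'] stored in self.nextCommentUrl
# until the API reports is_end. A hedged driver sketch for draining all comments of one
# article; the function name is illustrative and `article` is assumed to already carry
# the tid/channel_id fields the class expects:
def fetch_all_zhihu_comments(crawler, article):
    crawler.refreshCommentCrawler()          # start from the first page
    all_comments = []
    hasnext = True
    while hasnext:
        commentList, hasnext = crawler.crawlComment(article)
        all_comments.extend(commentList)
    return all_comments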
class WechatCrawler(object): ''' classdocs ''' def __init__(self, channel, logger=None): ''' Constructor ''' if logger is None: self.logger = Logging.getLogger(Logging.LOGGER_NAME_DEFAULT) else: self.logger = logger self.session = SessionCrawler(logger=self.logger) self.session_public = SessionCrawler(logger=self.logger) self.session_public_article = SessionCrawler(logger=self.logger) self.channel = channel self.entityId = 'SYSU' def searchArticle(self, keywordList, endTime): ''' 根据关键字数组,开始时间和结束时间范围搜索文章 @param keywordList: 关键字数组 @param endTime: 搜索时间范围结束 ''' startTime = endTime - datetime.timedelta( days=self.channel.search_ranges) startTimeStr = startTime.strftime('%Y-%m-%d') endTimeStr = endTime.strftime('%Y-%m-%d') self.logger.debug('startTime:%s', startTimeStr) self.logger.debug('endTime:%s', endTimeStr) urlList = list() publicList = self.getPublic() articleList = self.searchPublic(publicList, endTime) for keyword in keywordList: # 忽略第一次,第一次不带时间范围 pageUrl = (SOUGO_WEIXIN_URL_INIT % (urllib.quote( keyword.encode('utf-8')), int(time.time() * 1000))).replace( '#', '%') self.logger.debug('pageUrl:%s', pageUrl) self.session.randomSleep() response = self.session.get(pageUrl, textRspOnly=False, headers=HEADERS_SOGOU) lastPageUrl = pageUrl pageUrl = SOUGO_WEIXIN_URL_SUB_PAGE % (urllib.quote( keyword.encode('utf-8')), startTimeStr, endTimeStr) pageUrl = SOUGO_WEIXIN_URL_SUB_PAGE % (urllib.quote( keyword.encode('utf-8'))) self.logger.debug('pageUrl:%s', pageUrl) while True: # 用上一次url作为这次的referer headers = HEADERS_SOGOU.copy() headers['Referer'] = lastPageUrl self.session.randomSleep() response = self.session.get(pageUrl, textRspOnly=False, headers=headers) soup = BeautifulSoup(response.text, 'lxml') main = soup.find('ul', {'class': "news-list"}) if main is None: self.logger.error('Fail to parse:%s', response.text) return [] li_list = main.findAll('li') #li_list有可能为空,但还可以翻页 for li in li_list: a_list = li.findAll('a') for a in a_list: if a['uigs'].startswith('article_title'): #self.logger.debug('Article title:%s',a.text) urlList.append((a['href'], pageUrl, a.text)) break pageBarList = soup.findAll('div', {'id': 'pagebar_container'}) if len(pageBarList) == 0: #没有翻页,直接退出 break pageBar = pageBarList[0] aList = pageBar.findAll('a') foundNextPage = False for a in aList: if a['uigs'] == 'page_next': foundNextPage = True lastPageUrl = pageUrl pageUrl = SOGOU_URL + a['href'] self.logger.debug('Found next page:%s', a.text) break if foundNextPage is False: break for item in urlList: article = self.crawlArticle(item[0], referer=item[1], title=item[2], flag=0) if article is not None: if article not in articleList: #同一文章可能会在搜索结果出现多次,在baidu的结果url是不重复,但是实际可能指向同一文章,需要去重 articleList.append(article) return articleList def searchPublic(self, keywordList, endTime): ''' 根据关键字数组,开始时间和结束时间范围搜索公众号 :param keywordList: :param endTime: :return: ''' articleList = list() for keyword in keywordList: self.logger.debug(keyword) pageUrl = SOUGO_WECHARTPUBLIC_URL_INIT % (keyword[0]) self.logger.info('pageUrl:%s', pageUrl) self.session.randomSleep() response = self.session.get(pageUrl, textRspOnly=False, headers=HEADERS_SOGOU) soup = BeautifulSoup(response.text, 'lxml') main = soup.find('ul', {'class': "news-list2"}) if main is None: self.logger.error('Fail to parse:%s', response.text) li_list = main.findAll('li') for li in li_list: a_title = li.find('p', {'class': "tit"}) if a_title is not None: title = str(a_title.text.strip()) if title == keyword[0]: self.logger.debug(title) a_href = a_title.find('a')['href'] sub_articleList = 
self.crawlWetchartpublic(a_href) for article in sub_articleList: articleList.append(article) return articleList def crawlWetchartpublic(self, url): ''' 按公众号爬取文章 :param url: :return: ''' self.logger.debug(url) self.session_public.randomSleep() response = self.session_public.get(url, textRspOnly=False, headers=HEADERS_ARTICLE) soup = BeautifulSoup(response.text, 'lxml') self.logger.debug(soup) script_list = soup.findAll('script') # if len(script_list) == 0: # script_list = self.isCrawlerPublic(url) articleList = list() for li in script_list: li_str = str(li.text) sub_str1 = "msgList = " sub_str2 = '}]};' if li_str.find(sub_str1) != -1: index1 = li_str.find(sub_str1) index2 = li_str.find(sub_str2) main = str(li.text)[index1 + len(sub_str1):index2 + 3] articleJson = json.loads(main) articlelistJson = articleJson['list'] for item in articlelistJson: mes_info = item['app_msg_ext_info'] url = 'https://mp.weixin.qq.com' + mes_info['content_url'] url = url.replace('amp;', '') self.logger.debug('article_url:' + url) article = self.crawlArticle(url, flag=1) articleList.append(article) multi_item_list = mes_info['multi_app_msg_item_list'] for multi_item in multi_item_list: multi_url = 'https://mp.weixin.qq.com' + multi_item[ 'content_url'] multi_url = multi_url.replace('amp;', '') self.logger.debug('article_url:' + multi_url) article = self.crawlArticle(multi_url, flag=1) if article is not None: articleList.append(article) return articleList def crawlStatistics(self, article): ''' 爬取统计信息 @return: 无需返回参数,统计信息写入article实例 ''' pass def crawlArticle(self, url, **kwargs): ''' 根据url爬取文章内容和统计信息 @return: 返回一个Article实例 ''' #TBD, 转发情况目前先不考虑 searchTitle = '' referer = None if (kwargs['flag'] == 0): if 'referer' not in kwargs: return None if 'title' in kwargs: searchTitle = kwargs['title'] else: searchTitle = '' try: self.session_public_article.randomSleep() if (kwargs['flag'] == 0): referer = kwargs['referer'] headers = HEADERS_ARTICLE.copy() headers['Referer'] = referer response = self.session_public_article.get( url, textRspOnly=False, headers=HEADER_PUBLIC_ARTICLE) else: response = self.session_public_article.get( url, textRspOnly=False, headers=HEADER_PUBLIC_ARTICLE) mid = re.findall(r'var mid = .*"(\d+)";', response.text)[0] + '-' + re.findall( r'var idx = .*"(\d+)";', response.text)[0] soup = BeautifulSoup(response.text, 'lxml') main = soup.find('div', {'id': "img-content"}) title = main.find('h2').text.strip() content = main.find('div', {'id': "js_content"}).text.strip() profile = main.find('div', {'class': "profile_inner"}) author_id = profile.find('span').text.strip() author_name = profile.find('strong').text.strip() try: publish_datetime = main.find('em', { 'id': "post-date" }).text.strip() + ' 00:00:00' publish_datetime = datetime.datetime.strptime( publish_datetime, '%Y-%m-%d %H:%M:%S') except: self.logger.warn( 'Fail to parse publish_datetime, use current time as time') publish_datetime = datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S') article = Article(mid, self.channel.channel_id, title, content, publish_datetime=publish_datetime, url=url, author_id=author_id, author_name=author_name, meta_info='{refer="%s"}' % referer) self.logger.debug('Successfully parse article:%s', title) return article except: self.logger.error('Fail to get article for %s: %s due to %s', url, searchTitle, traceback.format_exc()) return None def refreshSearch(self): ''' 重置搜索 ''' pass def refreshCommentCrawler(self): ''' 重置评论爬取 ''' pass def crawlComment(self, article): ''' 根据文章,爬取文章的评论,返回评论列表 @return: (commentList, 
hasnext)二元组,commentList是指评论数组(每个元素是Comment实例),hasnext表示是否还有要爬取 ''' return (list(), False) def getPublic(self): self.conf = self.__readDbConf() publictablename = Constants.TABLE_SA_WETCHARTPUBLIC + Constants.TABLE_NAME_DELIMITER + self.entityId sql = ''' SELECT public_name FROM %s ''' % (publictablename) dbProxy = MySqlProxy(self.conf[CONF_FILE_DBCONF][CONF_FILE_DBHOST], 3306, self.conf[CONF_FILE_DBCONF][CONF_FILE_DBUSER], self.conf[CONF_FILE_DBCONF][CONF_FILE_DBPASS], self.conf[CONF_FILE_DBCONF][CONF_FILE_DBNAME]) # dbProxy = MySqlProxy('localhost', 3306, 'root', 'zzm15331411', 'sentiment_re') # dbProxy = MySqlProxy('112.124.47.197', 3306, 'test', 'test', 'sa2') dbProxy.execute(sql) resultList = dbProxy.fetchall() # resultList = [(u'今日中大',),] return resultList def __readDbConf(self): fileName = os.path.join('conf', CONF_FILE_NAME) c = Configuration(fileName) return c.readConfig() def getProxyPageList(self, url): url_list = [] for i in range(1, 100): url_new = url + str(i) url_list.append(url_new) return url_list def getProxyIpList(self, url): response = self.session.get(url, textRspOnly=False, headers=PROXY_IP_URL_HEADER, timeout=2) host_list = etree.HTML(response.text).xpath( '//table[contains(@id,"ip_list")]/tr/td[2]/text()') port_list = etree.HTML(response.text).xpath( '//table[contains(@id,"ip_list")]/tr/td[3]/text()') ip_list = list() for i in range(0, len(host_list)): ip = host_list[i] + r':' + port_list[i] ip_list.append(ip) return ip_list def verifyIp(self, ip): proxy = {'http': 'http://' + ip} proxy_handler = urllib2.ProxyHandler(proxy) opener = urllib2.build_opener(proxy_handler) urllib2.install_opener(opener) test_url = 'http://www.baidu.com' req = urllib2.Request(url=test_url, headers=PROXY_IP_URL_HEADER) try: res = urllib2.urlopen(req) content = res.read() if content: self.logger.debug(r'https://' + ip + ' is OK') else: self.logger.debug(r'https://' + ip + ' is BAD') ip = "" return ip except urllib2.URLError as e: self.logger.debug(r'https://' + ip + ' ' + str(e.reason)) return "" except: self.logger.debug(r'https://' + ip + ' Other Error') return "" def getVaildIp(self): page_url_list = self.getProxyPageList(PROXY_IP_URL) ip_list = list() for page_url in page_url_list: page_ip_list = self.getProxyIpList(page_url) for ip in page_ip_list: ip = self.verifyIp(ip) if ip != "": ip_list.append(ip) return ip_list def isCrawlerPublic(self, url): data_script_list = list() page_url_list = self.getProxyPageList(PROXY_IP_URL) flag = 0 for page_url in page_url_list: page_ip_list = self.getProxyIpList(page_url) flag = 0 for ip in page_ip_list: flag = 0 ip = self.verifyIp(ip) if ip != "": # self.session_public.randomSleep() proxy = {'http': r'http://' + ip} try: response = self.session_public.get( url, textRspOnly=True, headers=HEADERS_ARTICLE, proxies=proxy) soup = BeautifulSoup(response.text, 'lxml') self.logger.debug(response.text) script_list = soup.findAll('script') if len(script_list) != 0: flag = 0 for li in script_list: li_str = str(li.text) sub_str1 = "msgList = " if li_str.find(sub_str1) != -1: data_script_list.append(li) flag = 1 break if flag == 1: break except: self.logger.debug( 'The ip can not be used to crawler public') if flag == 1: break if (flag == 0): self.logger.debug('The ip can not be used to crawler public') return data_script_list
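# A hedged end-to-end sketch of driving the WechatCrawler above: the channel object and
# keyword values are illustrative assumptions, and `channel` must provide the channel_id
# and search_ranges attributes the class reads.
import datetime

def run_wechat_search(channel, keywords):
    crawler = WechatCrawler(channel)
    endTime = datetime.datetime.now()
    articles = crawler.searchArticle(keywords, endTime)
    for article in articles:
        crawler.crawlComment(article)        # currently a stub returning (list(), False)
    return articles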