def get(self):
    args_dict = self.args2dict()
    user_id = args_dict.get('user_id')
    if user_id is None:  # the URL lacks the required query parameter
        self.write(WeiboCurlError.REQUEST_LACK_ARGS)
        return
    try:
        # crawl the user's index page
        idx_curl_result = yield weibo_web_curl(SpiderAim.users_show, user_id=user_id)
        if not idx_curl_result['error_code']:
            # build a parser for the index page
            idxParser = IndexParser(user_id, idx_curl_result.get('response'))
            try:
                user_id = idxParser.get_user_id()  # resolve the real user_id
                max_page_num = idxParser.get_page_num()  # number of weibo pages
            except CookieInvalidException:
                self.write(WeiboCurlError.COOKIE_INVALID)
                return
            # crawl the user's info page
            info_curl_result = yield weibo_web_curl(SpiderAim.users_info, user_id=user_id)
            if not info_curl_result['error_code']:
                infoParser = InfoParser(info_curl_result.get('response'))  # info-page parser
                user_info = infoParser.extract_user_info()
                user = idxParser.get_user(user_info)
                user['max_page'] = max_page_num  # maximum page count of the user's weibos
                success = settings.SUCCESS.copy()
                try:
                    success['data'] = {'result': user, 'cursor': ''}
                except AttributeError:  # user has no __dict__, i.e. no user was crawled
                    self.write(WeiboCurlError.REQUEST_ARGS_ERROR)  # report an argument error
                    return
                self.write(success)
                return
            else:
                error_res = curl_result_to_api_result(info_curl_result)
                self.write(error_res)
                return
        else:
            error_res = curl_result_to_api_result(idx_curl_result)
            self.write(error_res)
            return
    except HTMLParseException:
        self.write(WeiboCurlError.HTML_PARSE_ERROR)
        return
    except Exception as e:
        report_log(e)
        self.write(WeiboCurlError.UNKNOWN_ERROR)
        return
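# ---------------------------------------------------------------------------
# The handler above uses `yield weibo_web_curl(...)`, which only works when
# the method runs as a Tornado generator coroutine. A minimal sketch of how
# such a handler is presumably wired up; the class name, route, and port are
# illustrative assumptions, not taken from this project:
import tornado.gen
import tornado.ioloop
import tornado.web


class UsersShowHandler(tornado.web.RequestHandler):

    @tornado.gen.coroutine  # lets the method use `yield future`
    def get(self):
        ...  # body as in the handler above


def make_app():
    return tornado.web.Application([(r'/users/show', UsersShowHandler)])


if __name__ == '__main__':
    make_app().listen(8000)
    tornado.ioloop.IOLoop.current().start()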
def get_original_weibo(self, info, weibo_id):
    """Get an original (non-retweet) weibo."""
    try:
        weibo_content = utils.handle_garbled(info)
        weibo_content = weibo_content[:weibo_content.rfind(u'赞')]
        a_text = info.xpath('div//a/text()')
        if u'全文' in a_text:  # '全文' marks a truncated post with a "full text" link
            # build a CommentParser from the weibo's comment page
            comment_resp = None
            for i in range(settings.RETRY_TIME):
                comment_curl_result = yield weibo_web_curl(
                    SpiderAim.weibo_comment, weibo_id=weibo_id)
                if not comment_curl_result['error_code']:
                    comment_resp = comment_curl_result['response']
                    break
                if i == settings.RETRY_TIME - 1:
                    raise CurlError
            commentParser = CommentParser(weibo_id, comment_resp)
            wb_content = commentParser.get_long_weibo()
            if wb_content:
                weibo_content = wb_content
        # get topics and at_users
        at_users, topics = PageParser.__get_atusers_and_topics(info)
        return weibo_content, topics, at_users
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
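# ---------------------------------------------------------------------------
# The retry loop above (fetch, break on success, raise CurlError on the last
# failed attempt) reappears verbatim in get_retweet below. A sketch of a
# shared helper; `curl_comment_page_with_retry` is a hypothetical name, and
# the project-level names (weibo_web_curl, SpiderAim, settings, CurlError)
# are assumed to be importable:
import tornado.gen


@tornado.gen.coroutine
def curl_comment_page_with_retry(weibo_id):
    """Fetch a weibo's comment page, retrying up to settings.RETRY_TIME times."""
    for _ in range(settings.RETRY_TIME):
        comment_curl_result = yield weibo_web_curl(
            SpiderAim.weibo_comment, weibo_id=weibo_id)
        if not comment_curl_result['error_code']:
            raise tornado.gen.Return(comment_curl_result['response'])
    raise CurlError  # every attempt failed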
def get(self):
    # parse the query arguments
    args_dict = self.args2dict()
    keyword, cursor = args_dict.get('keyword'), args_dict.get('cursor', '1')
    if keyword is None:  # missing required argument
        self.write(WeiboCurlError.REQUEST_LACK_ARGS)
        return
    try:
        cursor = 1 if not cursor else int(cursor)
    except ValueError:
        self.write(WeiboCurlError.REQUEST_ARGS_ERROR)
        return
    if cursor > SEARCH_LIMIT_PAGES:
        result = settings.SUCCESS.copy()
        result['data'] = {'result': [], 'cursor': '0'}
        self.write(result)
        return
    user_type = args_dict.get('user_type')
    gender = args_dict.get('gender')
    age_limit = args_dict.get('age_limit')
    # crawl the search result page
    search_users_curl_result = yield weibo_web_curl(
        SpiderAim.search_users, keyword=keyword, user_type=user_type,
        gender=gender, age_limit=age_limit, page_num=cursor)
    if not search_users_curl_result['error_code']:
        self.response = search_users_curl_result['response']
    else:
        error_res = curl_result_to_api_result(search_users_curl_result)
        self.write(error_res)
        return
    # build the parser
    searchUsersParser = SearchUsersParser(self.response)
    # extract the user list
    try:
        user_list = searchUsersParser.parse_page()
    except HTMLParseException:
        self.write(WeiboCurlError.HTML_PARSE_ERROR)
        return
    # return the result
    if user_list:
        success = settings.SUCCESS.copy()
        success['data'] = {
            'result': user_list,
            'cursor': str(cursor + 1) if cursor < SEARCH_LIMIT_PAGES else '0'
        }
        self.write(success)
        return
    self.write(WeiboCurlError.UNKNOWN_ERROR)
    return
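# ---------------------------------------------------------------------------
# Client-side view of the cursor convention used by this handler: `cursor`
# counts pages as a string and becomes '0' once SEARCH_LIMIT_PAGES is
# exceeded. An illustrative consumer; host, port, and route are assumptions:
import requests


def iter_search_users(keyword, base_url='http://localhost:8000/users/search'):
    """Yield users page by page until the server returns cursor '0'."""
    cursor = '1'
    while cursor != '0':
        resp = requests.get(base_url, params={'keyword': keyword, 'cursor': cursor})
        data = resp.json().get('data', {})
        for user in data.get('result', []):
            yield user
        cursor = data.get('cursor', '0')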
def _build_selector(self):
    """Build self.selector from the comment page if it has not been built yet."""
    if self.selector is None:
        comment_curl_result = yield weibo_web_curl(SpiderAim.weibo_comment,
                                                   weibo_id=self.weibo_id)
        if not comment_curl_result['error_code']:
            self.selector = etree.HTML(comment_curl_result['response'].body)
            self.info_node = self.selector.xpath("//div[@id='M_']")[0]
        else:
            self.selector = None
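# ---------------------------------------------------------------------------
# Because _build_selector contains a `yield`, calling it returns a generator
# and performs no work by itself; under tornado.gen it has to be driven with
# `yield self._build_selector()` from another coroutine. A standalone
# demonstration of that pitfall:
def build():
    print('fetching...')  # side effect does not happen at call time
    yield 'page'


gen = build()      # nothing printed yet
page = next(gen)   # 'fetching...' runs only now, and 'page' is produced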
def get(self):
    args_dict = self.args2dict()
    user_id = args_dict.get('user_id')
    if user_id is None:  # missing required argument
        self.write(WeiboCurlError.REQUEST_LACK_ARGS)
        return
    cursor = args_dict.get('cursor', '1')
    try:
        cursor = 1 if not cursor else int(cursor)
    except ValueError:
        self.write(WeiboCurlError.REQUEST_ARGS_ERROR)
        return
    filter = args_dict.get('filter', 0)  # default: crawl all weibos (original + retweets)
    page_curl_result = yield weibo_web_curl(SpiderAim.users_weibo_page,
                                            user_id=user_id, page_num=cursor)
    if not page_curl_result['error_code']:
        pageParser = PageParser(user_id, page_curl_result['response'], filter)
    else:
        error_res = curl_result_to_api_result(page_curl_result)
        self.write(error_res)
        return
    try:
        weibos, max_page = yield pageParser.get_one_page()
        if cursor == 1:
            user = pageParser.get_user_info_when_first_page()
        else:
            user = pageParser.get_user_info_except_first_page()
    except HTMLParseException:
        self.write(WeiboCurlError.HTML_PARSE_ERROR)
        return
    success = settings.SUCCESS.copy()
    try:
        success['data'] = {
            'result': {
                'user': user,
                'weibos': [weibo.__dict__ for weibo in weibos]
            },
            'cursor': str(cursor + 1) if cursor < max_page else '0'
        }
    except AttributeError:  # user has no __dict__, i.e. no user was crawled
        self.write(WeiboCurlError.REQUEST_ARGS_ERROR)  # report an argument error
        return
    self.write(success)
    return
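# ---------------------------------------------------------------------------
# The handler above serializes each crawled post via `weibo.__dict__`, so a
# Weibo instance is expected to carry plain data attributes. A minimal
# illustration of the pattern; the exact field set of the project's Weibo
# class is an assumption:
class DemoWeibo:
    def __init__(self, weibo_id, text):
        self.weibo_id = weibo_id
        self.text = text
        self.topics = ''
        self.at_users = ''


weibos = [DemoWeibo('Abc123', 'hello'), DemoWeibo('Def456', 'world')]
payload = {'weibos': [w.__dict__ for w in weibos]}  # JSON-serializable dicts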
def get(self):
    # parse the query arguments
    args_dict = self.args2dict()
    user_id, cursor = args_dict.get('user_id'), args_dict.get('cursor', '1')
    if user_id is None:
        self.write(WeiboCurlError.REQUEST_LACK_ARGS)
        return
    try:
        cursor = 1 if not cursor else int(cursor)
    except ValueError:
        self.write(WeiboCurlError.REQUEST_ARGS_ERROR)
        return
    # crawl the follow page
    follow_curl_result = yield weibo_web_curl(SpiderAim.follow,
                                              user_id=user_id, page_num=cursor)
    if not follow_curl_result['error_code']:
        self.response = follow_curl_result['response']
    else:
        error_res = curl_result_to_api_result(follow_curl_result)
        self.write(error_res)
        return
    # build the parser
    followParser = FollowParser(self.response)
    # extract the information and return the result
    try:
        follow_list = followParser.get_follows()  # list of followed users
        max_page_num = followParser.get_max_page_num()  # total number of pages
        # keep the cursor convention of the other handlers:
        # a string, '0' once the last page has been reached
        cursor = str(cursor + 1) if cursor < max_page_num else '0'
        success = settings.SUCCESS.copy()
        success['data'] = {
            'result': {
                'friend_list': follow_list,
                'max_page_num': max_page_num
            },
            'cursor': cursor
        }
        self.write(success)
        return
    except HTMLParseException:
        self.write(WeiboCurlError.HTML_PARSE_ERROR)
        return
    except Exception as e:
        report_log(e)
        self.write(WeiboCurlError.UNKNOWN_ERROR)
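# ---------------------------------------------------------------------------
# The next-cursor rule shared by these handlers, extracted for clarity
# (hypothetical helper, not part of the project):
def next_cursor(cursor, max_page):
    """Return the follow-up cursor string: page + 1, or '0' when exhausted."""
    return str(cursor + 1) if cursor < max_page else '0'


assert next_cursor(1, 3) == '2'
assert next_cursor(3, 3) == '0'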
def extract_picture_urls(info, weibo_id):
    """Extract the original picture URLs of a weibo."""
    try:
        first_pic = '/mblog/pic/' + weibo_id
        all_pic = '/mblog/picAll/' + weibo_id
        picture_urls = list()
        a_list = info.xpath('div/a/@href')
        all_href = ''.join(a_list)
        if first_pic in all_href:  # the weibo has at least one thumbnail
            if all_pic in all_href:  # the weibo has multiple pictures
                mblog_picall_curl_result = yield weibo_web_curl(
                    SpiderAim.mblog_pic_all, weibo_id=weibo_id)
                mblogPicAllParser = None
                if not mblog_picall_curl_result['error_code']:
                    mblogPicAllParser = MblogPicAllParser(
                        mblog_picall_curl_result['response'])
                # if the fetch failed, the parser stays None and the
                # AttributeError below is swallowed by the outer except
                preview_picture_list = mblogPicAllParser.extract_preview_picture_list()
                picture_urls = [
                    p.replace('/thumb180/', '/large/')
                    for p in preview_picture_list
                ]
            elif info.xpath('.//img/@src'):
                # single picture: find the link wrapping the thumbnail
                for link in info.xpath('div/a'):
                    if len(link.xpath('@href')) > 0:
                        if first_pic in link.xpath('@href')[0]:
                            if len(link.xpath('img/@src')) > 0:
                                preview_picture = link.xpath('img/@src')[0]
                                picture_urls = [
                                    preview_picture.replace('/wap180/', '/large/')
                                ]
                                break
            else:
                LOGGING.warning(
                    u'The crawler account may have pictures set to "hidden"; '
                    u'please visit "https://weibo.cn/account/customize/pic" '
                    u'and switch the option to "show".')
                sys.exit()
        return picture_urls
    except Exception as e:
        utils.report_log(e)
        return u'无'  # '无' ("none") is the sentinel the callers expect
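# ---------------------------------------------------------------------------
# The thumbnail-to-original rewrites used above ('/thumb180/' and '/wap180/'
# both map to '/large/'), as a standalone, testable function; the sample URL
# is illustrative:
def to_large(url):
    """Rewrite a weibo thumbnail URL to its full-size '/large/' variant."""
    for thumb in ('/thumb180/', '/wap180/'):
        if thumb in url:
            return url.replace(thumb, '/large/')
    return url


assert (to_large('https://wx1.sinaimg.cn/thumb180/abc.jpg')
        == 'https://wx1.sinaimg.cn/large/abc.jpg')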
def get(self):
    # parse the query arguments
    args_dict = self.args2dict()  # query string -> argument dict
    keyword = args_dict.get('keyword')
    cursor = args_dict.get('cursor', '1')
    is_hot = args_dict.get('is_hot', False)
    if keyword is None:  # missing required argument
        self.write(WeiboCurlError.REQUEST_LACK_ARGS)
        return
    try:
        cursor = 1 if not cursor else int(cursor)
    except ValueError:
        self.write(WeiboCurlError.REQUEST_ARGS_ERROR)
        return
    # crawl the search result page
    search_weibo_curl_result = yield weibo_web_curl(
        SpiderAim.search_weibo, keyword=keyword, page_num=cursor, is_hot=is_hot)
    if not search_weibo_curl_result['error_code']:
        self.response = search_weibo_curl_result['response']
    else:
        error_res = curl_result_to_api_result(search_weibo_curl_result)
        self.write(error_res)
        return
    # build the parser
    searchWeiboParser = SearchWeiboParser(self.response)
    # extract the weibo list
    try:
        weibo_list = searchWeiboParser.parse_page()
    except HTMLParseException:
        self.write(WeiboCurlError.HTML_PARSE_ERROR)
        return
    if weibo_list is None:
        self.write(WeiboCurlError.PAGE_NOT_FOUND)  # page not found
        return
    # success: return the result
    success = settings.SUCCESS.copy()
    success['data'] = {
        'result': weibo_list,
        # weibo search results are capped at 50 pages
        'cursor': str(cursor + 1) if cursor < 50 else '0'
    }
    self.write(success)
    return
def get(self):
    # parse the query arguments
    args_dict = self.args2dict()
    weibo_id = args_dict.get('weibo_id')
    if weibo_id is None:
        self.write(WeiboCurlError.REQUEST_LACK_ARGS)
        return
    hot = args_dict.get('hot', False)  # whether to fetch hot comments
    cursor = args_dict.get('cursor', '1')
    try:
        cursor = 1 if not cursor else int(cursor)
    except ValueError:
        self.write(WeiboCurlError.REQUEST_ARGS_ERROR)
        return
    if cursor > SEARCH_LIMIT_PAGES:
        results = settings.SUCCESS.copy()
        results['data'] = {'result': [], 'cursor': '0'}
        self.write(results)
        return
    # crawl the comment page
    comment_curl_result = yield weibo_web_curl(SpiderAim.weibo_comment,
                                               weibo_id=weibo_id, page_num=cursor)
    if not comment_curl_result['error_code']:
        self.response = comment_curl_result['response']
    else:
        error_res = curl_result_to_api_result(comment_curl_result)
        self.write(error_res)
        return
    # build the parser
    try:
        commentParser = CommentParser(weibo_id, response=self.response)
    except CookieInvalidException:
        self.write(WeiboCurlError.COOKIE_INVALID)
        return
    try:
        weibo_detail = yield commentParser.parse_one_weibo()
    except HTMLParseException as e:
        report_log(e)
        self.write(WeiboCurlError.HTML_PARSE_ERROR)
        return
    except Exception as e:
        report_log(e)
        self.write(WeiboCurlError.UNKNOWN_ERROR)
        return
    # the hot argument decides how comment_list is fetched
    if not hot:
        comment_list = commentParser.get_all_comment()
    else:
        hot_comment_curl_result = yield weibo_web_curl(
            SpiderAim.hot_comment, weibo_id=weibo_id, page_num=cursor)
        if not hot_comment_curl_result['error_code']:
            self.hot_comment_response = hot_comment_curl_result['response']
        else:
            # report the failure of the hot-comment request itself
            error_res = curl_result_to_api_result(hot_comment_curl_result)
            self.write(error_res)
            return
        try:
            comment_list = HotCommentParser(
                weibo_id, self.hot_comment_response).get_all_comment()
        except HTMLParseException:
            self.write(WeiboCurlError.HTML_PARSE_ERROR)
            return
        except Exception as e:
            report_log(e)
            self.write(WeiboCurlError.UNKNOWN_ERROR)
            return
    # success: assemble the result
    weibo_detail['weibo_id'] = weibo_id
    weibo_detail['comments'] = comment_list
    success = settings.SUCCESS.copy()
    success['data'] = {
        'result': weibo_detail,
        'cursor': str(cursor + 1) if cursor < weibo_detail['max_page'] else '0'
    }
    self.write(success)
    return
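# ---------------------------------------------------------------------------
# Note that `hot` arrives from the query string, so any non-empty value
# (including '0' or 'false') is truthy; a client that wants the plain
# comment list should omit the parameter entirely. Illustrative request;
# host, port, and route are assumptions:
import requests

if __name__ == '__main__':
    resp = requests.get('http://localhost:8000/statuses/show',
                        params={'weibo_id': 'Abc123', 'hot': '1', 'cursor': '1'})
    detail = resp.json()['data']['result']
    print(detail['max_page'], len(detail['comments']))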
def get_retweet(self, info, weibo_id, weibo: Weibo):
    """Get a retweeted weibo."""
    try:
        weibo_content = utils.handle_garbled(info)
        weibo_content = weibo_content[weibo_content.find(':') + 1:
                                      weibo_content.rfind(u'赞')]
        # trim again: the retweet block contains two '赞' counters
        # (one for the original post, one for the retweet footer)
        weibo_content = weibo_content[:weibo_content.rfind(u'赞')]
        # check whether the full text is already present
        a_text = info.xpath('div//a/text()')
        if u'全文' in a_text:
            # build a CommentParser from the weibo's comment page
            comment_resp = None
            for i in range(settings.RETRY_TIME):
                comment_curl_result = yield weibo_web_curl(
                    SpiderAim.weibo_comment, weibo_id=weibo_id)
                if not comment_curl_result['error_code']:
                    comment_resp = comment_curl_result['response']
                    break
                if i == settings.RETRY_TIME - 1:
                    raise CurlError
            commentParser = CommentParser(weibo_id, comment_resp)
            wb_content = commentParser.get_long_retweet(rev_type=dict)
            if wb_content:
                weibo_content = wb_content
        # extract the retweet reason
        if isinstance(weibo_content, dict):
            retweet_reason = weibo_content.get('retweet_reason')
            retweet_id = weibo_content.get('retweet_id')
            weibo_content = weibo_content.get('retweet')
        else:
            original_div = utils.handle_garbled(info.xpath('div')[-1])
            retweet_reason = original_div[original_div.find(':') + 1:
                                          original_div.rindex(u'赞')]
            retweet_id = self.get_retweet_id(info)
        # extract the original poster
        original_user_node = info.xpath('./div/span[@class="cmt"]/a')[0]
        original_user = ''.join(original_user_node.xpath("./text()"))
        original_user_id = original_user_node.get('href')
        if original_user_id is not None:
            original_user_id = original_user_id[original_user_id.rfind(r'/') + 1:]
        # get the footers of the original weibo
        original_footer_div = info.xpath(r'./div')[-2]
        footer_nodes = original_footer_div.xpath(
            r'.//span[@class="cmt"] | .//a[@class="cc"]')[-3:]
        original_like_num = 0
        original_retweet_num = 0
        original_comment_num = 0
        for i, footer_node in enumerate(footer_nodes):
            num = ''.join(footer_node.xpath('./text()'))
            try:
                num = int(num[num.find('[') + 1:num.rfind(']')])
            except BaseException:
                pass
            if i == 0:
                original_like_num = num
            elif i == 1:
                original_retweet_num = num
            elif i == 2:
                original_comment_num = num
        # get the topics
        original_div = info.xpath('./div')[0]
        retweet_div = info.xpath('./div')[-1]
        retweet_at_users, retweet_topics = PageParser.__get_atusers_and_topics(retweet_div)
        original_at_users, original_topics = PageParser.__get_atusers_and_topics(original_div)
        # fill the Weibo object
        weibo.retweet['weibo_id'] = retweet_id
        weibo.retweet['user_id'] = original_user_id
        weibo.retweet['screen_name'] = original_user
        weibo.retweet['text'] = weibo_content
        weibo.retweet['topics'] = original_topics
        weibo.retweet['at_users'] = original_at_users
        weibo.retweet['attitudes_count'] = original_like_num
        weibo.retweet['comments_count'] = original_comment_num
        weibo.retweet['reposts_count'] = original_retweet_num
        weibo.topics = retweet_topics
        weibo.at_users = retweet_at_users
        weibo.text = retweet_reason
    except Exception as e:
        utils.report_log(e)
        raise HTMLParseException
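# ---------------------------------------------------------------------------
# The bracketed-count extraction used for the footer nodes above, as a
# standalone snippet (sample labels are illustrative); note that on a parse
# failure the handler keeps the raw text instead of a number:
def parse_count(text):
    """Extract N from a footer label such as '赞[12]'."""
    try:
        return int(text[text.find('[') + 1:text.rfind(']')])
    except ValueError:
        return text


assert parse_count(u'赞[12]') == 12
assert parse_count(u'转发[3]') == 3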