예제 #1
0
    def get(self):
        """Serve a user's profile built from the index page and the info page.

        Generator-style handler: each ``yield weibo_web_curl(...)`` hands a
        crawl request to the driver and receives the crawl result back.

        Query arguments (read through ``self.args2dict()``):
            user_id: required; ``REQUEST_LACK_ARGS`` is written when absent.

        Every outcome is reported through ``self.write``:
            * a copy of ``settings.SUCCESS`` whose ``data`` holds the user
              under ``result`` and an empty ``cursor``;
            * the converted crawl error when either request fails;
            * ``COOKIE_INVALID``, ``HTML_PARSE_ERROR`` or ``UNKNOWN_ERROR``
              for the matching exception.
        """
        user_id = self.args2dict().get('user_id')
        if user_id is None:
            # The URL carries no user_id query parameter.
            self.write(WeiboCurlError.REQUEST_LACK_ARGS)
            return

        try:
            # Step 1: crawl the index (home) page.
            index_result = yield weibo_web_curl(SpiderAim.users_show,
                                                user_id=user_id)
            if index_result['error_code']:
                self.write(curl_result_to_api_result(index_result))
                return

            index_parser = IndexParser(user_id, index_result.get('response'))
            try:
                # The index page yields the real user_id and the number of
                # weibo pages.
                user_id = index_parser.get_user_id()
                page_total = index_parser.get_page_num()
            except CookieInvalidException:
                self.write(WeiboCurlError.COOKIE_INVALID)
                return

            # Step 2: crawl the info page with the resolved user_id.
            info_result = yield weibo_web_curl(SpiderAim.users_info,
                                               user_id=user_id)
            if info_result['error_code']:
                self.write(curl_result_to_api_result(info_result))
                return

            info_parser = InfoParser(info_result.get('response'))
            profile = index_parser.get_user(info_parser.extract_user_info())
            profile['max_page'] = page_total  # largest weibo page number

            payload = settings.SUCCESS.copy()
            try:
                payload['data'] = {'result': profile, 'cursor': ''}
            except AttributeError:
                # Kept from the original, which treats this as "no user was
                # crawled" and reports an argument error.
                self.write(WeiboCurlError.REQUEST_ARGS_ERROR)
                return
            self.write(payload)
        except HTMLParseException:
            self.write(WeiboCurlError.HTML_PARSE_ERROR)
        except Exception as e:
            report_log(e)
            self.write(WeiboCurlError.UNKNOWN_ERROR)
예제 #2
0
    def get_original_weibo(self, info, weibo_id):
        """Extract text, topics and @-mentioned users of an original weibo.

        Generator-style method: it yields crawl requests and delivers its
        result as the generator's return value.

        Args:
            info: node of the weibo; must support ``xpath``.
            weibo_id: id used to request the comment page when the text is
                truncated.

        Returns:
            A ``(weibo_content, topics, at_users)`` tuple.

        Raises:
            HTMLParseException: for any exception raised while extracting
                (the original exception is passed to ``utils.report_log``).
        """
        try:
            text = utils.handle_garbled(info)
            # Drop everything from the last "赞" onward.
            text = text[:text.rfind(u'赞')]
            link_texts = info.xpath('div//a/text()')
            if u'全文' in link_texts:
                # A "全文" link is present: request the comment page, retrying
                # up to settings.RETRY_TIME times, and read the long text
                # from it.
                full_page = None
                for attempt in range(settings.RETRY_TIME):
                    curl_result = yield weibo_web_curl(
                        SpiderAim.weibo_comment, weibo_id=weibo_id)
                    if curl_result['error_code']:
                        if attempt == settings.RETRY_TIME - 1:
                            # Last attempt failed as well.
                            raise CurlError
                        continue
                    full_page = curl_result['response']
                    break

                long_text = CommentParser(weibo_id,
                                          full_page).get_long_weibo()
                if long_text:
                    text = long_text
            # Collect the @-mentioned users and the topics.
            at_users, topics = PageParser.__get_atusers_and_topics(info)

            return text, topics, at_users
        except Exception as e:
            utils.report_log(e)
            raise HTMLParseException
예제 #3
0
 def get(self):
     """Serve one page of user-search results for a keyword.

     Generator-style handler: ``yield weibo_web_curl(...)`` hands the crawl
     request to the driver and receives the crawl result back.

     Query arguments (read through ``self.args2dict()``):
         keyword: required; ``REQUEST_LACK_ARGS`` is written when absent.
         cursor: page number, defaults to 1; a non-integer value produces
             ``REQUEST_ARGS_ERROR``.
         user_type, gender, age_limit: optional, forwarded to the crawl.

     The success payload's ``cursor`` is the next page number as a string,
     or ``'0'`` once ``SEARCH_LIMIT_PAGES`` is reached. A cursor beyond
     that limit gets an empty result without crawling.
     """
     # Read the query parameters.
     params = self.args2dict()
     keyword = params.get('keyword')
     cursor = params.get('cursor', '1')
     if keyword is None:
         self.write(WeiboCurlError.REQUEST_LACK_ARGS)  # missing parameter
         return
     try:
         cursor = int(cursor) if cursor else 1
     except ValueError:
         self.write(WeiboCurlError.REQUEST_ARGS_ERROR)
         return
     if cursor > SEARCH_LIMIT_PAGES:
         empty = settings.SUCCESS.copy()
         empty['data'] = {'result': [], 'cursor': '0'}
         self.write(empty)
         return
     user_type = params.get('user_type')
     gender = params.get('gender')
     age_limit = params.get('age_limit')
     # Crawl the search page.
     curl_result = yield weibo_web_curl(SpiderAim.search_users,
                                        keyword=keyword,
                                        user_type=user_type,
                                        gender=gender,
                                        age_limit=age_limit,
                                        page_num=cursor)
     if curl_result['error_code']:
         self.write(curl_result_to_api_result(curl_result))
         return
     self.response = curl_result['response']
     # Build the parser and extract the users.
     parser = SearchUsersParser(self.response)
     try:
         user_list = parser.parse_page()
     except HTMLParseException:
         self.write(WeiboCurlError.HTML_PARSE_ERROR)
         return
     # An empty/falsy result is reported as an unknown error.
     if not user_list:
         self.write(WeiboCurlError.UNKNOWN_ERROR)
         return
     if cursor < SEARCH_LIMIT_PAGES:
         next_cursor = str(cursor + 1)
     else:
         next_cursor = '0'
     success = settings.SUCCESS.copy()
     success['data'] = {'result': user_list, 'cursor': next_cursor}
     self.write(success)
예제 #4
0
 def _build_selector(self):
     """构造self.selector,如果"""
     if self.selector is None:
         comment_curl_result = yield weibo_web_curl(SpiderAim.weibo_comment,
                                                    weibo_id=self.weibo_id)
         if not comment_curl_result['error_code']:
             self.selector = etree.HTML(
                 comment_curl_result['response'].body)
             self.info_node = self.selector.xpath("//div[@id='M_']")[0]
         else:
             self.selector = None
예제 #5
0
    def get(self):
        """Serve one page of a user's weibos together with the user info.

        Generator-style handler: it yields the crawl request and the page
        parser's ``get_one_page()`` and receives their results back.

        Query arguments (read through ``self.args2dict()``):
            user_id: required; ``REQUEST_LACK_ARGS`` is written when absent.
            cursor: page number, defaults to 1; a non-integer value produces
                ``REQUEST_ARGS_ERROR``.
            filter: optional, defaults to 0 (all weibos, original and
                retweeted); handed to ``PageParser``.

        The success payload's ``cursor`` is the next page number as a
        string, or ``'0'`` when the current page is the last one.
        """
        params = self.args2dict()
        user_id = params.get('user_id')
        if user_id is None:  # missing parameter
            self.write(WeiboCurlError.REQUEST_LACK_ARGS)
            return
        cursor = params.get('cursor', '1')
        try:
            cursor = int(cursor) if cursor else 1
        except ValueError:
            self.write(WeiboCurlError.REQUEST_ARGS_ERROR)
            return
        # Default 0: crawl every weibo (original + retweeted).
        weibo_filter = params.get('filter', 0)

        curl_result = yield weibo_web_curl(SpiderAim.users_weibo_page,
                                           user_id=user_id,
                                           page_num=cursor)
        if curl_result['error_code']:
            self.write(curl_result_to_api_result(curl_result))
            return
        parser = PageParser(user_id, curl_result['response'], weibo_filter)

        try:
            weibos, max_page = yield parser.get_one_page()
            # The first page uses a different user-info extractor.
            user = (parser.get_user_info_when_first_page() if cursor == 1
                    else parser.get_user_info_except_first_page())
        except HTMLParseException:
            self.write(WeiboCurlError.HTML_PARSE_ERROR)
            return

        success = settings.SUCCESS.copy()
        try:
            weibo_dicts = [item.__dict__ for item in weibos]
            if cursor < max_page:
                next_cursor = str(cursor + 1)
            else:
                next_cursor = '0'
            success['data'] = {
                'result': {'user': user, 'weibos': weibo_dicts},
                'cursor': next_cursor,
            }
        except AttributeError:
            # An item without __dict__ is reported as an argument error.
            self.write(WeiboCurlError.REQUEST_ARGS_ERROR)
            return
        self.write(success)
예제 #6
0
 def get(self):
     """Serve one page of the accounts a user follows.

     Generator-style handler: ``yield weibo_web_curl(...)`` hands the crawl
     request to the driver and receives the crawl result back.

     Query arguments (read through ``self.args2dict()``):
         user_id: required; ``REQUEST_LACK_ARGS`` is written when absent.
         cursor: page number, defaults to 1; a non-integer value produces
             ``REQUEST_ARGS_ERROR``.

     The success payload holds ``friend_list`` and ``max_page_num`` under
     ``result``. Its ``cursor`` is always a string: the next page number,
     or ``'0'`` when the current page is the last one. Previously the last
     page returned the current page number as an ``int``, so a client
     following the cursor would re-request the same page.
     """
     # Read the query parameters.
     args_dict = self.args2dict()
     user_id, cursor = args_dict.get('user_id'), args_dict.get(
         'cursor', '1')
     if user_id is None:
         self.write(WeiboCurlError.REQUEST_LACK_ARGS)
         return
     try:
         cursor = 1 if not cursor else int(cursor)
     except ValueError:
         self.write(WeiboCurlError.REQUEST_ARGS_ERROR)
         return
     # Crawl the follow page.
     follow_curl_result = yield weibo_web_curl(SpiderAim.follow,
                                               user_id=user_id,
                                               page_num=cursor)
     if follow_curl_result['error_code']:
         error_res = curl_result_to_api_result(follow_curl_result)
         self.write(error_res)
         return
     self.response = follow_curl_result['response']
     # Build the parser.
     followParser = FollowParser(self.response)
     # Extract the data and return the result.
     try:
         follow_list = followParser.get_follows()  # list of followed users
         max_page_num = followParser.get_max_page_num()  # total page count
         success = settings.SUCCESS.copy()
         success['data'] = {
             'result': {
                 'friend_list': follow_list,
                 'max_page_num': max_page_num
             },
             # '0' marks the end of pagination, as in the other handlers.
             'cursor': str(cursor + 1) if cursor < max_page_num else '0'
         }
         self.write(success)
         return
     except HTMLParseException:
         self.write(WeiboCurlError.HTML_PARSE_ERROR)
         return
     except Exception as e:
         report_log(e)
         self.write(WeiboCurlError.UNKNOWN_ERROR)
예제 #7
0
    def extract_picture_urls(info, weibo_id):
        """Extract the original-size picture urls of a weibo.

        Generator-style function: it yields a crawl request when the weibo
        has a multi-picture page, and delivers its result as the generator's
        return value.

        Args:
            info: node of the weibo; must support ``xpath``.
            weibo_id: id of the weibo, used to recognise its picture links.

        Returns:
            A list of picture urls (empty when the weibo has no picture
            link), or the string ``u'无'`` when any exception was raised
            (the exception is passed to ``utils.report_log``).

        Note:
            When a picture link exists but the node contains no ``img``
            source, a warning is logged and ``sys.exit()`` is called.
        """
        try:
            single_link = '/mblog/pic/' + weibo_id
            album_link = '/mblog/picAll/' + weibo_id
            picture_urls = []
            joined_hrefs = ''.join(info.xpath('div/a/@href'))
            # No single-thumbnail link: the weibo has no pictures.
            if single_link not in joined_hrefs:
                return picture_urls

            if album_link in joined_hrefs:
                # Several pictures: read them from the picture-all page.
                album_result = yield weibo_web_curl(
                    SpiderAim.mblog_pic_all, weibo_id=weibo_id)
                album_parser = None
                if not album_result['error_code']:
                    album_parser = MblogPicAllParser(
                        album_result['response'])

                # If the request failed album_parser is still None; the
                # resulting AttributeError is handled by the except below.
                previews = album_parser.extract_preview_picture_list()
                picture_urls = [
                    url.replace('/thumb180/', '/large/') for url in previews
                ]
            elif info.xpath('.//img/@src'):
                # One picture: take the image inside the first link that
                # points at this weibo's picture page.
                for link in info.xpath('div/a'):
                    hrefs = link.xpath('@href')
                    if not hrefs or single_link not in hrefs[0]:
                        continue
                    sources = link.xpath('img/@src')
                    if sources:
                        picture_urls = [
                            sources[0].replace('/wap180/', '/large/')
                        ]
                        break
            else:
                LOGGING.warning(
                    u'爬虫微博可能被设置成了"不显示图片",请前往'
                    u'"https://weibo.cn/account/customize/pic",修改为"显示"'
                )
                sys.exit()
            return picture_urls
        except Exception as e:
            utils.report_log(e)
            return u'无'
예제 #8
0
    def get(self):
        """Serve one page of weibo-search results for a keyword.

        Generator-style handler: ``yield weibo_web_curl(...)`` hands the
        crawl request to the driver and receives the crawl result back.

        Query arguments (read through ``self.args2dict()``):
            keyword: required; ``REQUEST_LACK_ARGS`` is written when absent.
            cursor: page number, defaults to 1; a non-integer value produces
                ``REQUEST_ARGS_ERROR``.
            is_hot: optional, defaults to False; forwarded to the crawl.

        The success payload's ``cursor`` is the next page number as a
        string, or ``'0'`` from page 50 onward. A ``None`` parse result is
        reported as ``PAGE_NOT_FOUND``.
        """
        # Read the query parameters into a dict.
        params = self.args2dict()
        keyword = params.get('keyword')
        cursor = params.get('cursor', '1')
        is_hot = params.get('is_hot', False)
        if keyword is None:
            self.write(WeiboCurlError.REQUEST_LACK_ARGS)  # missing parameter
            return
        try:
            cursor = int(cursor) if cursor else 1
        except ValueError:
            self.write(WeiboCurlError.REQUEST_ARGS_ERROR)
            return
        # Crawl the search page.
        curl_result = yield weibo_web_curl(SpiderAim.search_weibo,
                                           keyword=keyword,
                                           page_num=cursor,
                                           is_hot=is_hot)
        if curl_result['error_code']:
            self.write(curl_result_to_api_result(curl_result))
            return
        self.response = curl_result['response']
        # Build the parser and extract the weibos.
        parser = SearchWeiboParser(self.response)
        try:
            weibo_list = parser.parse_page()
        except HTMLParseException:
            self.write(WeiboCurlError.HTML_PARSE_ERROR)
            return

        if weibo_list is None:
            self.write(WeiboCurlError.PAGE_NOT_FOUND)  # page not found
            return
        # Return the result.
        if cursor < 50:
            next_cursor = str(cursor + 1)
        else:
            next_cursor = '0'
        success = settings.SUCCESS.copy()
        success['data'] = {'result': weibo_list, 'cursor': next_cursor}
        self.write(success)
예제 #9
0
    def get(self):
        """Serve one weibo's detail together with one page of its comments.

        Generator-style handler: it yields crawl requests and the parser's
        ``parse_one_weibo()`` and receives their results back.

        Query arguments (read through ``self.args2dict()``):
            weibo_id: required; ``REQUEST_LACK_ARGS`` is written when absent.
            hot: optional, defaults to False; when truthy the comments come
                from the hot-comment page instead of the regular one.
            cursor: page number, defaults to 1; a non-integer value produces
                ``REQUEST_ARGS_ERROR``. A cursor beyond
                ``SEARCH_LIMIT_PAGES`` gets an empty result without crawling.

        The success payload's ``result`` is the weibo detail extended with
        ``weibo_id`` and ``comments``. Its ``cursor`` is the next page number
        as a string, or ``'0'`` once ``weibo_detail['max_page']`` is reached.
        """
        # Read the query parameters.
        args_dict = self.args2dict()
        weibo_id = args_dict.get('weibo_id')
        if weibo_id is None:
            self.write(WeiboCurlError.REQUEST_LACK_ARGS)
            return
        hot = args_dict.get('hot', False)  # whether to fetch hot comments
        cursor = args_dict.get('cursor', '1')
        try:
            cursor = 1 if not cursor else int(cursor)
        except ValueError:
            self.write(WeiboCurlError.REQUEST_ARGS_ERROR)
            return
        if cursor > SEARCH_LIMIT_PAGES:
            results = settings.SUCCESS.copy()
            results['data'] = {'result': [], 'cursor': '0'}
            self.write(results)
            return
        # Crawl the comment page.
        comment_curl_result = yield weibo_web_curl(SpiderAim.weibo_comment,
                                                   weibo_id=weibo_id,
                                                   page_num=cursor)
        if not comment_curl_result['error_code']:
            self.response = comment_curl_result['response']
        else:
            error_res = curl_result_to_api_result(comment_curl_result)
            self.write(error_res)
            return
        # Build the parser.
        try:
            commonParser = CommentParser(weibo_id, response=self.response)
        except CookieInvalidException:
            self.write(WeiboCurlError.COOKIE_INVALID)
            return

        try:
            weibo_detail = yield commonParser.parse_one_weibo()
        except HTMLParseException as e:
            report_log(e)
            self.write(WeiboCurlError.HTML_PARSE_ERROR)
            return
        except Exception as e:
            report_log(e)
            self.write(WeiboCurlError.UNKNOWN_ERROR)
            return

        # The hot parameter decides where comment_list comes from.
        if not hot:
            comment_list = commonParser.get_all_comment()
        else:
            hot_comment_curl_result = yield weibo_web_curl(
                SpiderAim.hot_comment, weibo_id=weibo_id, page_num=cursor)
            if not hot_comment_curl_result['error_code']:
                self.hot_comment_response = hot_comment_curl_result['response']
            else:
                # Report the request that failed. The original converted
                # comment_curl_result here, which is the earlier request
                # that succeeded.
                error_res = curl_result_to_api_result(hot_comment_curl_result)
                self.write(error_res)
                return

            try:
                comment_list = HotCommentParser(
                    weibo_id, self.hot_comment_response).get_all_comment()
            except HTMLParseException:
                self.write(WeiboCurlError.HTML_PARSE_ERROR)
                return
            except Exception as e:
                report_log(
                    (__class__.__name__, StatusesShowHandler.get.__name__), e)
                self.write(WeiboCurlError.UNKNOWN_ERROR)
                return
        # Return the result on success.
        weibo_detail['weibo_id'] = weibo_id
        weibo_detail['comments'] = comment_list
        success = settings.SUCCESS.copy()
        success['data'] = {
            'result': weibo_detail,
            'cursor':
            str(cursor + 1) if cursor < weibo_detail['max_page'] else '0'
        }
        self.write(success)
        return
예제 #10
0
    def get_retweet(self, info, weibo_id, weibo: Weibo):
        """Fill ``weibo`` with the data of a retweeted (forwarded) weibo.

        Generator-style method: it yields crawl requests when the retweeted
        text is truncated. It returns nothing; results are written to
        ``weibo``.

        Args:
            info: node of the weibo; must support ``xpath``.
            weibo_id: id used to request the comment page when the text is
                truncated.
            weibo: object to fill. ``weibo.retweet`` receives ``weibo_id``,
                ``user_id``, ``screen_name``, ``text``, ``topics``,
                ``at_users``, ``attitudes_count``, ``comments_count`` and
                ``reposts_count`` of the retweeted weibo. ``weibo.topics``,
                ``weibo.at_users`` and ``weibo.text`` receive the topics,
                mentions and retweet reason of the retweet itself.

        Raises:
            HTMLParseException: for any exception raised while extracting
                (the original exception is passed to ``utils.report_log``).
        """
        try:
            weibo_content = utils.handle_garbled(info)
            # Keep the text between the first ':' and the last "赞" ...
            weibo_content = weibo_content[weibo_content.find(':') +
                                          1:weibo_content.rfind(u'赞')]
            # ... then cut again at the last "赞" that remains.
            weibo_content = weibo_content[:weibo_content.rfind(u'赞')]
            # Check whether the full weibo text is already present.
            a_text = info.xpath('div//a/text()')
            if u'全文' in a_text:
                # A "全文" link is present: request the comment page,
                # retrying up to settings.RETRY_TIME times.
                comment_resp = None
                for i in range(settings.RETRY_TIME):
                    comment_curl_result = yield weibo_web_curl(
                        SpiderAim.weibo_comment, weibo_id=weibo_id)
                    if not comment_curl_result['error_code']:
                        comment_resp = comment_curl_result['response']
                        break
                    if i == settings.RETRY_TIME - 1:
                        raise CurlError

                commentParser = CommentParser(weibo_id, comment_resp)
                wb_content = commentParser.get_long_retweet(rev_type=dict)
                if wb_content:
                    weibo_content = wb_content

            # Extract the retweet reason.
            if isinstance(weibo_content, dict):
                # The long-retweet lookup returned a dict carrying all three
                # values.
                retweet_reason = weibo_content.get('retweet_reason')
                retweet_id = weibo_content.get('retweet_id')
                weibo_content = weibo_content.get('retweet')
            else:
                original_div = utils.handle_garbled(info.xpath('div')[-1])
                retweet_reason = original_div[original_div.find(':') +
                                              1:original_div.rindex(u'赞')]
                retweet_id = self.get_retweet_id(info)

            # Extract the original user.
            original_user_node = info.xpath('./div/span[@class="cmt"]/a')[0]
            original_user = ''.join(original_user_node.xpath("./text()"))
            original_user_id = original_user_node.get('href')
            if original_user_id is not None:
                # Keep only the part after the last '/' of the href.
                original_user_id = original_user_id[original_user_id.
                                                    rfind(r'/') + 1:]
            # Read the footers of the original weibo.
            original_footer_div = info.xpath(r'./div')[-2]

            footer_nodes = original_footer_div.xpath(
                r'.//span[@class="cmt"] | .//a[@class="cc"]')[-3:]
            original_like_num = 0
            original_retweet_num = 0
            original_comment_num = 0
            for i, footer_node in enumerate(footer_nodes):
                num = ''.join(footer_node.xpath('./text()'))
                try:
                    # The count is the integer between '[' and ']'.
                    num = int(num[num.find('[') + 1:num.rfind(']')])
                except ValueError:
                    # No parsable "[n]" counter: keep the 0 default instead
                    # of storing the raw footer text as a count.
                    continue
                if i == 0:
                    original_like_num = num
                elif i == 1:
                    original_retweet_num = num
                elif i == 2:
                    original_comment_num = num

            # Collect the topics and @-mentioned users.
            original_div = info.xpath('./div')[0]
            retweet_div = info.xpath('./div')[-1]
            retweet_at_users, retweet_topics = PageParser.__get_atusers_and_topics(
                retweet_div)
            original_at_users, original_topics = PageParser.__get_atusers_and_topics(
                original_div)

            weibo.retweet['weibo_id'] = retweet_id
            weibo.retweet['user_id'] = original_user_id
            weibo.retweet['screen_name'] = original_user
            weibo.retweet['text'] = weibo_content
            weibo.retweet['topics'] = original_topics
            weibo.retweet['at_users'] = original_at_users
            weibo.retweet['attitudes_count'] = original_like_num
            weibo.retweet['comments_count'] = original_comment_num
            weibo.retweet['reposts_count'] = original_retweet_num
            weibo.topics = retweet_topics
            weibo.at_users = retweet_at_users
            weibo.text = retweet_reason

        except Exception as e:
            utils.report_log(e)
            raise HTMLParseException