Example #1
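The three callbacks below share one trick: Twitter's legacy pagination endpoints return JSON whose conversation_html / items_html field carries an HTML fragment, and wrapping that fragment in a scrapy HtmlResponse lets the spider reuse its existing XPath-based parsers. A minimal standalone sketch of that step (the field names and encoding come from the code below; wrap_fragment itself is a hypothetical helper):

    import json
    from scrapy.http import HtmlResponse

    def wrap_fragment(response, field):
        # Hypothetical helper: re-wrap an HTML fragment embedded in a JSON
        # payload so it can be parsed with the usual response.xpath() calls.
        payload = json.loads(response.text)
        return HtmlResponse(url=response.url,
                            body=payload.get(field) or '',
                            encoding='utf-8')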
    def parse_show_more_more_replies(self, response):
        """每个二级评论块可能有--另外?条回复"""
        try:
            res = json.loads(response.text)
        except ValueError:  # includes json.JSONDecodeError
            return
        comment_list3_response = HtmlResponse(url=response.url,
                                              body=res.get('conversation_html'),
                                              encoding='utf-8')
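        # .meta2 is a spider-local attribute: a deep copy of the request meta,
        # stashed on the synthetic response before it is handed to parse_secondary_comment.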
        comment_list3_response.meta2 = copy.deepcopy(response.meta)
        comments = self.parse_secondary_comment(comment_list3_response)
        for comment in comments:  # create comment items and update social relations
            if isinstance(comment, scrapy.Request):
                yield comment
            else:
                comment.pop("is_end_comment")  # not checked in this callback, just discarded
                # info about the users being replied to
                data_reply_to_users = comment.pop("data_reply_to_users")
                new_relations_dct = comment.pop("NEW_RELATIONS")
                comment_item = CommentItem(**comment)  # build the item only after popping non-item keys
                yield comment_item
                # social_relations = self.make_social_relation_dict(data_reply_to_users, comment)
                # for relation in social_relations:
                #     relation_item = SocialRelationItem(ListSocialRelation=[])
                #     relation_item["ListSocialRelation"].append(relation)
                #     yield relation_item

                social_relation = self.make_social_relation_dict_bad(new_relations_dct, comment)
                relation_item = SocialRelationItem(**social_relation)
                yield relation_item

    def parse_other_comment_page(self, response):
        """Parse subsequent pages of top-level comments (JSON format)."""
        # prepare for pagination
        commentator_account = response.meta.get("commentator_account")
        comment_id = response.meta.get("comment_id")
        # comment-page counter
        current_comment_page = response.meta.get("current_comment_page")
        is_new = response.meta.get("is_new")
        # if self.increment_crawl and not is_new and current_comment_page >= 3:
        #     return []
        try:
            res = json.loads(response.text)
        except ValueError:  # includes json.JSONDecodeError
            return
        comment_list_response = HtmlResponse(url=response.url,
                                             body=res.get('items_html'),
                                             encoding='utf-8')
        comment_list_response.meta2 = copy.deepcopy(response.meta)
        comments = self.parse_comment(comment_list_response)
        for comment in comments:  # create comment items and update social relations
            if isinstance(comment, scrapy.Request):
                yield comment
            else:
                if comment.pop("is_end_comment"):  # incremental crawl: stop collecting comments
                    return
                # info about the users being replied to
                data_reply_to_users = comment.pop("data_reply_to_users")
                new_relations_dct = comment.pop("NEW_RELATIONS")
                comment_item = CommentItem(**comment)  # build the item only after popping non-item keys
                yield comment_item
                # social_relations = self.make_social_relation_dict(data_reply_to_users, comment)
                # for relation in social_relations:
                #     relation_item = SocialRelationItem(ListSocialRelation=[])
                #     relation_item["ListSocialRelation"].append(relation)
                #     yield relation_item

                social_relation = self.make_social_relation_dict_bad(
                    new_relations_dct, comment)
                relation_item = SocialRelationItem(**social_relation)
                yield relation_item

        # paginate further through the top-level comment pages
        has_more_comment = self.has_more_comment(comment_list_response)
        has_next = res.get('min_position') or has_more_comment
        if has_next:  # is there a next page?
            next_url = ('https://twitter.com/i/' + commentator_account +
                        '/conversation/' + comment_id +
                        '?include_available_features=1&include_entities=1'
                        '&max_position=' + has_next)
            if len(next_url) >= 10000:  # bail out on an abnormally long cursor
                return
            else:
                meta = copy.deepcopy(response.meta)
                meta["current_comment_page"] = current_comment_page + 1
                yield scrapy.Request(url=next_url,
                                     callback=self.parse_other_comment_page,
                                     meta=meta)

    def parse_secondary_comment_page(self, response):
        """Parse the first page (HTML) of a comment with more than one reply
           and extract its secondary comments.
        """
        # prepare for pagination
        commentator_account = response.meta.get("commentator_account")
        comment_id = response.meta.get("comment_id")

        current_secondary_comment_page = 1
        response.meta2 = copy.deepcopy(response.meta)
        response.meta2["current_secondary_comment_page"] = current_secondary_comment_page
        comments = self.parse_secondary_comment(response)  # parse secondary comments
        for comment in comments:  # create comment items and update social relations
            if isinstance(comment, scrapy.Request):
                yield comment
            else:
                if comment.pop("is_end_comment"):
                    return
                # info about the users being replied to
                data_reply_to_users = comment.pop("data_reply_to_users")
                new_relations_dct = comment.pop("NEW_RELATIONS")
                comment_item = CommentItem(**comment)  # build the item only after popping non-item keys
                yield comment_item
                # social_relations = self.make_social_relation_dict(data_reply_to_users, comment)
                # for relation in social_relations:
                #     relation_item = SocialRelationItem(ListSocialRelation=[])
                #     relation_item["ListSocialRelation"].append(relation)
                #     yield relation_item

                social_relation = self.make_social_relation_dict_bad(
                    new_relations_dct, comment)
                relation_item = SocialRelationItem(**social_relation)
                yield relation_item
        """二级评论首页翻页 """
        # 下一页评论地址(第二页)
        xpath_next_page = "//div[@class='ThreadedDescendants']/div[contains(@class,'stream-container')]/@data-min-position"
        has_next_page = response.xpath(xpath_next_page).extract_first()
        # 显示更多回复
        xpath_more_comment = "//li[@class='ThreadedConversation-showMoreThreads']/button/@data-cursor"
        has_more_comment = response.xpath(xpath_more_comment).extract_first()
        has_next = has_next_page or has_more_comment
        if has_next:  # 情况1,还有评论就继续传递更新社交关系, 没有更多二级评论就不用管
            next_url = 'https://twitter.com/i/' + commentator_account + '/conversation/' + comment_id + '?include_available_features=1&include_entities=1&max_position=' + has_next
            meta = copy.deepcopy(response.meta)
            meta[
                "current_secondary_comment_page"] = current_secondary_comment_page + 1
            yield scrapy.Request(
                url=next_url,
                callback=self.parse_other_secondary_comment_page,
                meta=meta)
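Pagination in all of these callbacks is cursor-based: the min_position / data-min-position / data-cursor value scraped from one page is fed back as max_position on the next request. The URL is built the same way in each callback; if it were factored out, a helper could look like this (a sketch; next_conversation_url is a hypothetical name, while the endpoint shape is copied verbatim from the code above):

    def next_conversation_url(account, conversation_id, cursor):
        # Legacy twitter.com conversation-pagination endpoint, as used above.
        return ('https://twitter.com/i/' + account +
                '/conversation/' + conversation_id +
                '?include_available_features=1&include_entities=1'
                '&max_position=' + cursor)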
Example #4
    def parse_article_detail(self, response):
        """解析推特帖子详情页 以及首页评论"""
        current_page = response.meta.get("current_page")
        article_count = response.meta.get("article_count")

        is_new = response.meta.get("is_new")
        user_id = response.meta.get("user_id")
        user_account = response.meta.get("UserAccount")  # queried account (not necessarily the post's original author)
        user_nick = response.meta.get("AuthorNick")  # nickname of the queried account (not necessarily the original author)
        XXX_UserAccount = response.meta.get("XXX_UserAccount")
        XXX_AuthorNick = response.meta.get("XXX_AuthorNick")

        # publish time
        xpath_publish_time = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//small[@class='time']//span[@data-time]/@data-time"
        ptime = response.xpath(xpath_publish_time).extract_first()
        if not ptime:
            return
        p_time = int(ptime.strip())
        publish_time = str(datetime.datetime.fromtimestamp(p_time))
        if current_page > 1:  # at least one page is always collected per account
            if not self.increment_crawl or is_new:  # first crawl, or a newly added account
                if p_time < self.e_time and article_count["count"] >= self.article_count_limit:
                    article_count["end_collect"] = True
                    return
            else:  # incremental crawl of an existing account: beyond page 1, only posts within the last self.inc_e_time days
                if p_time < self.inc_e_time:
                    article_count["end_collect"] = True
                    return

        # still collecting: count this article
        article_count["count"] += 1

        # post URL
        article_url = response.url
        # article ID
        # article_id = url.split('/')[-1]
        xpath_article_id = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'js-actionable-user')]/@data-item-id"
        article_id = response.xpath(xpath_article_id).extract_first()

        # (original) author account (used for: 1. building the next-page URL  2. social relations)
        xpath_origin_author = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'js-actionable-user')]/@data-screen-name"
        origin_author = response.xpath(xpath_origin_author).extract_first()

        # (original) author nickname
        xpath_origin_author_nick = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'js-actionable-user')]/@data-name"
        origin_author_nickname = response.xpath(xpath_origin_author_nick).extract_first()

        # (original) author ID
        xpath_origin_author_id = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'js-actionable-user')]/@data-user-id"
        origin_author_id = response.xpath(xpath_origin_author_id).extract_first()

        # post content (1. text  2. video  3. (not collected) footer details such as time, retweets, likes [js-tweet-details-fixer tweet-details-fixer])
        xpath_content = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[@class='js-tweet-text-container'] | //div[contains(@class,'permalink-inner permalink-tweet-container')]//div[@class='AdaptiveMediaOuterContainer'] | //div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'u-block js-tweet-details-fixer')]"
        content = ''.join(response.xpath(xpath_content).extract())

        # language code of the post content
        xpath_content_language_code = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//p/@lang"
        content_language_code = response.xpath(xpath_content_language_code).extract_first()

        # comment count
        xpath_comment = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'stream-item-footer')]//div[contains(@class,'ProfileTweet-actionCountList')]//span[contains(@class,'ProfileTweet-action--reply')]/span[@class='ProfileTweet-actionCount']/@data-tweet-stat-count"
        comments = response.xpath(xpath_comment).extract_first()
        comments_count = int(comments.strip()) if comments else 0

        # retweet count
        xpath_retweets = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'stream-item-footer')]//div[contains(@class,'ProfileTweet-actionCountList')]//span[contains(@class,'ProfileTweet-action--retweet')]/span[@class='ProfileTweet-actionCount']/@data-tweet-stat-count"
        retweets = response.xpath(xpath_retweets).extract_first()
        retweets_count = int(retweets) if retweets else 0

        # like count
        xpath_likes = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'stream-item-footer')]//div[contains(@class,'ProfileTweet-actionCountList')]//span[contains(@class,'ProfileTweet-action--favorite')]/span[@class='ProfileTweet-actionCount']/@data-tweet-stat-count"
        likes = response.xpath(xpath_likes).extract_first()
        likes_count = int(likes) if likes else 0

        # HTML from the iframe below the post content
        xpath_additional_url = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'js-macaw-cards-iframe-container')]/@data-full-card-iframe-url"
        additional_url = response.xpath(xpath_additional_url).extract_first()
        additional_content = self.parse_related_article(url=response.urljoin(additional_url))
        content = content + additional_content if additional_content else content

        # video URL
        video_url = response.xpath("//meta[@property='og:video:secure_url']/@content").extract_first()

        if not content:  # discard articles with no content
            return

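        # self.user_msg_choose selects whose identity is attached to the article:
        #   1 -> "fake":   the queried user's id/account plus the XXX_* names from meta
        #   2 -> "pseudo": the queried user's own id, account and nick
        #   else "real":   the original author scraped from this page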
        if self.user_msg_choose == 1:  # "fake" identity
            user_id_choose = user_id
            account = user_account
            user_account_choose = XXX_UserAccount
            user_nick_choose = XXX_AuthorNick
        elif self.user_msg_choose == 2:  # "pseudo" identity
            user_id_choose = user_id
            account = user_account
            user_account_choose = user_account
            user_nick_choose = user_nick
        else:  # "real" identity
            user_id_choose = origin_author_id
            account = origin_author
            user_account_choose = origin_author
            user_nick_choose = origin_author_nickname

        article_data = {
            "PNID": article_id,
            "Url": article_url,
            "Author": user_account_choose,  # 传过来的账号
            "AuthorNick": user_nick_choose,  # 传过来的昵称
            "PublishTime": publish_time,
            "Content": content,
            "CommentCount": comments_count,  # 评论数
            "ForwardNum": retweets_count,  # 转载数
            "ClickCount": likes_count,  # 点击数(点赞数?)
            "LanguageCode": content_language_code or 'en',  # 语言编码 response.meta.get("LanguageCode")

            "Title": "",
            "Abstract": "",
            "Keywords": "",

            "VideoUrl": video_url,
            "MediaSourceUrl": video_url or "",

            "is_new": is_new,
            "user_id": user_id_choose,
            "account": account
        }
        article_item = ArticleItem(**article_data)
        yield article_item

        """解析帖子页-->首页评论"""
        current_comment_page = 1
        response.meta2 = copy.deepcopy(response.meta)
        response.meta2['article_id'] = article_id
        response.meta2['article_url'] = article_url
        # fake / pseudo / real identity
        response.meta2['user_id_choose'] = user_id_choose
        response.meta2['user_account_choose'] = user_account_choose
        response.meta2['user_nick_choose'] = user_nick_choose
        response.meta2['account'] = account

        response.meta2['current_comment_page'] = current_comment_page  # page number of top-level comments

        """解析评论"""
        comments = self.parse_comment(response)
        for comment in comments:  # create comment items and update social relations
            if isinstance(comment, scrapy.Request):
                yield comment
            else:
                if comment.pop("is_end_comment"):
                    return
                # info about the users being replied to
                data_reply_to_users = comment.pop("data_reply_to_users")
                new_relations_dct = comment.pop("NEW_RELATIONS")
                comment_item = CommentItem(**comment)  # build the item only after popping non-item keys
                yield comment_item
                # social_relations = self.make_social_relation_dict(data_reply_to_users, comment)
                # for relation in social_relations:
                #     relation_item = SocialRelationItem(ListSocialRelation=[])
                #     relation_item["ListSocialRelation"].append(relation)
                #     yield relation_item

                social_relation = self.make_social_relation_dict_bad(new_relations_dct, comment)
                relation_item = SocialRelationItem(**social_relation)
                yield relation_item

        # Cursor for the next comment page (page 2)
        xpath_next_page = "//div[@class='ThreadedDescendants']/div[contains(@class,'stream-container')]/@data-min-position"
        has_next_page = response.xpath(xpath_next_page).extract_first()
        # "Show more replies" cursor
        xpath_more_comment = "//li[@class='ThreadedConversation-showMoreThreads']/button/@data-cursor"
        has_more_comment = response.xpath(xpath_more_comment).extract_first()
        has_next = has_next_page or has_more_comment
        if has_next:
            next_url = ('https://twitter.com/i/' + origin_author +
                        '/conversation/' + article_id +
                        '?include_available_features=1&include_entities=1'
                        '&max_position=' + has_next)
            meta = copy.deepcopy(response.meta2)
            meta["commentator_account"] = origin_author
            meta["comment_id"] = article_id
            meta["current_comment_page"] = current_comment_page + 1  # 第二页的评论
            yield scrapy.Request(url=next_url, callback=self.parse_other_comment_page, meta=meta)
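Finally, every callback in these examples repeats the same dispatch loop: parse_comment / parse_secondary_comment yield a mixture of scrapy.Request objects and plain comment dicts, and the caller pops the bookkeeping keys before building items. A shared helper could look roughly like this (a sketch; _emit_comments is hypothetical, while CommentItem, SocialRelationItem and make_social_relation_dict_bad are the spider's own names):

    def _emit_comments(self, comments):
        # Hypothetical helper: convert parsed comment dicts into items,
        # passing follow-up Requests straight through.
        for comment in comments:
            if isinstance(comment, scrapy.Request):
                yield comment
                continue
            if comment.pop("is_end_comment"):
                return  # incremental crawl: stop collecting comments
            comment.pop("data_reply_to_users")  # used only by the commented-out relation code
            new_relations_dct = comment.pop("NEW_RELATIONS")
            yield CommentItem(**comment)
            relation = self.make_social_relation_dict_bad(new_relations_dct, comment)
            yield SocialRelationItem(**relation)

Callers would then write yield from self._emit_comments(...), with one caveat: the early return on is_end_comment would only end the helper, not abort the caller's pagination, which is why the original keeps the loop inline.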