Example #1
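The three callbacks below share one trick: Twitter's legacy pagination endpoints return JSON whose conversation_html / items_html field carries an HTML fragment, and wrapping that fragment in a scrapy HtmlResponse lets the spider reuse its existing XPath-based parsers. A minimal standalone sketch of that step (the field names and encoding come from the code below; wrap_fragment itself is a hypothetical helper):

    import json
    from scrapy.http import HtmlResponse

    def wrap_fragment(response, field):
        # Hypothetical helper: re-wrap an HTML fragment embedded in a JSON
        # payload so it can be parsed with the usual response.xpath() calls.
        payload = json.loads(response.text)
        return HtmlResponse(url=response.url,
                            body=payload.get(field) or '',
                            encoding='utf-8')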
    def parse_show_more_more_replies(self, response):
        """每个二级评论块可能有--另外?条回复"""
        try:
            res = json.loads(response.text)
        except ValueError:  # includes json.JSONDecodeError
            return
        comment_list3_response = HtmlResponse(url=response.url,
                                              body=res.get('conversation_html'),
                                              encoding='utf-8')
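        # .meta2 is a spider-local attribute: a deep copy of the request meta,
        # stashed on the synthetic response before it is handed to parse_secondary_comment.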
        comment_list3_response.meta2 = copy.deepcopy(response.meta)
        comments = self.parse_secondary_comment(comment_list3_response)
        for comment in comments:  # create comment items and update social relations
            if isinstance(comment, scrapy.Request):
                yield comment
            else:
                comment.pop("is_end_comment")  # not checked in this callback, just discarded
                # info about the users being replied to
                data_reply_to_users = comment.pop("data_reply_to_users")
                new_relations_dct = comment.pop("NEW_RELATIONS")
                comment_item = CommentItem(**comment)  # build the item only after popping non-item keys
                yield comment_item
                # social_relations = self.make_social_relation_dict(data_reply_to_users, comment)
                # for relation in social_relations:
                #     relation_item = SocialRelationItem(ListSocialRelation=[])
                #     relation_item["ListSocialRelation"].append(relation)
                #     yield relation_item

                social_relation = self.make_social_relation_dict_bad(new_relations_dct, comment)
                relation_item = SocialRelationItem(**social_relation)
                yield relation_item

    def parse_other_comment_page(self, response):
        """Parse subsequent pages of top-level comments (JSON format)."""
        # prepare for pagination
        commentator_account = response.meta.get("commentator_account")
        comment_id = response.meta.get("comment_id")
        # comment-page counter
        current_comment_page = response.meta.get("current_comment_page")
        is_new = response.meta.get("is_new")
        # if self.increment_crawl and not is_new and current_comment_page >= 3:
        #     return []
        try:
            res = json.loads(response.text)
        except ValueError:  # includes json.JSONDecodeError
            return
        comment_list_response = HtmlResponse(url=response.url,
                                             body=res.get('items_html'),
                                             encoding='utf-8')
        comment_list_response.meta2 = copy.deepcopy(response.meta)
        comments = self.parse_comment(comment_list_response)
        for comment in comments:  # create comment items and update social relations
            if isinstance(comment, scrapy.Request):
                yield comment
            else:
                if comment.pop("is_end_comment"):  # incremental crawl: stop collecting comments
                    return
                # info about the users being replied to
                data_reply_to_users = comment.pop("data_reply_to_users")
                new_relations_dct = comment.pop("NEW_RELATIONS")
                comment_item = CommentItem(**comment)  # build the item only after popping non-item keys
                yield comment_item
                # social_relations = self.make_social_relation_dict(data_reply_to_users, comment)
                # for relation in social_relations:
                #     relation_item = SocialRelationItem(ListSocialRelation=[])
                #     relation_item["ListSocialRelation"].append(relation)
                #     yield relation_item

                social_relation = self.make_social_relation_dict_bad(
                    new_relations_dct, comment)
                relation_item = SocialRelationItem(**social_relation)
                yield relation_item

        # paginate further through the top-level comment pages
        has_more_comment = self.has_more_comment(comment_list_response)
        has_next = res.get('min_position') or has_more_comment
        if has_next:  # is there a next page?
            next_url = ('https://twitter.com/i/' + commentator_account +
                        '/conversation/' + comment_id +
                        '?include_available_features=1&include_entities=1'
                        '&max_position=' + has_next)
            if len(next_url) >= 10000:  # bail out on an abnormally long cursor
                return
            else:
                meta = copy.deepcopy(response.meta)
                meta["current_comment_page"] = current_comment_page + 1
                yield scrapy.Request(url=next_url,
                                     callback=self.parse_other_comment_page,
                                     meta=meta)

    def parse_secondary_comment_page(self, response):
        """Parse the first page (HTML) of a comment with more than one reply
           and extract its secondary comments.
        """
        # prepare for pagination
        commentator_account = response.meta.get("commentator_account")
        comment_id = response.meta.get("comment_id")

        current_secondary_comment_page = 1
        response.meta2 = copy.deepcopy(response.meta)
        response.meta2["current_secondary_comment_page"] = current_secondary_comment_page
        comments = self.parse_secondary_comment(response)  # parse secondary comments
        for comment in comments:  # create comment items and update social relations
            if isinstance(comment, scrapy.Request):
                yield comment
            else:
                if comment.pop("is_end_comment"):
                    return
                # info about the users being replied to
                data_reply_to_users = comment.pop("data_reply_to_users")
                new_relations_dct = comment.pop("NEW_RELATIONS")
                comment_item = CommentItem(**comment)  # build the item only after popping non-item keys
                yield comment_item
                # social_relations = self.make_social_relation_dict(data_reply_to_users, comment)
                # for relation in social_relations:
                #     relation_item = SocialRelationItem(ListSocialRelation=[])
                #     relation_item["ListSocialRelation"].append(relation)
                #     yield relation_item

                social_relation = self.make_social_relation_dict_bad(
                    new_relations_dct, comment)
                relation_item = SocialRelationItem(**social_relation)
                yield relation_item
        """二级评论首页翻页 """
        # 下一页评论地址(第二页)
        xpath_next_page = "//div[@class='ThreadedDescendants']/div[contains(@class,'stream-container')]/@data-min-position"
        has_next_page = response.xpath(xpath_next_page).extract_first()
        # 显示更多回复
        xpath_more_comment = "//li[@class='ThreadedConversation-showMoreThreads']/button/@data-cursor"
        has_more_comment = response.xpath(xpath_more_comment).extract_first()
        has_next = has_next_page or has_more_comment
        if has_next:  # 情况1,还有评论就继续传递更新社交关系, 没有更多二级评论就不用管
            next_url = 'https://twitter.com/i/' + commentator_account + '/conversation/' + comment_id + '?include_available_features=1&include_entities=1&max_position=' + has_next
            meta = copy.deepcopy(response.meta)
            meta[
                "current_secondary_comment_page"] = current_secondary_comment_page + 1
            yield scrapy.Request(
                url=next_url,
                callback=self.parse_other_secondary_comment_page,
                meta=meta)
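Pagination in all of these callbacks is cursor-based: the min_position / data-min-position / data-cursor value scraped from one page is fed back as max_position on the next request. The URL is built the same way in each callback; if it were factored out, a helper could look like this (a sketch; next_conversation_url is a hypothetical name, while the endpoint shape is copied verbatim from the code above):

    def next_conversation_url(account, conversation_id, cursor):
        # Legacy twitter.com conversation-pagination endpoint, as used above.
        return ('https://twitter.com/i/' + account +
                '/conversation/' + conversation_id +
                '?include_available_features=1&include_entities=1'
                '&max_position=' + cursor)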
Example #4
    def parse_article_detail(self, response):
        """解析推特帖子详情页 以及首页评论"""
        current_page = response.meta.get("current_page")
        article_count = response.meta.get("article_count")

        is_new = response.meta.get("is_new")
        user_id = response.meta.get("user_id")
        user_account = response.meta.get("UserAccount")  # queried account (not necessarily the post's original author)
        user_nick = response.meta.get("AuthorNick")  # nickname of the queried account (not necessarily the original author)
        XXX_UserAccount = response.meta.get("XXX_UserAccount")
        XXX_AuthorNick = response.meta.get("XXX_AuthorNick")

        # publish time
        xpath_publish_time = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//small[@class='time']//span[@data-time]/@data-time"
        ptime = response.xpath(xpath_publish_time).extract_first()
        if not ptime:
            return
        p_time = int(ptime.strip())
        publish_time = str(datetime.datetime.fromtimestamp(p_time))
        if current_page > 1:  # at least one page is always collected per account
            if not self.increment_crawl or is_new:  # first crawl, or a newly added account
                if p_time < self.e_time and article_count["count"] >= self.article_count_limit:
                    article_count["end_collect"] = True
                    return
            else:  # incremental crawl of an existing account: beyond page 1, only posts within the last self.inc_e_time days
                if p_time < self.inc_e_time:
                    article_count["end_collect"] = True
                    return

        # still collecting: count this article
        article_count["count"] += 1

        # post URL
        article_url = response.url
        # article ID
        # article_id = url.split('/')[-1]
        xpath_article_id = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'js-actionable-user')]/@data-item-id"
        article_id = response.xpath(xpath_article_id).extract_first()

        # (original) author account (used for: 1. building the next-page URL  2. social relations)
        xpath_origin_author = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'js-actionable-user')]/@data-screen-name"
        origin_author = response.xpath(xpath_origin_author).extract_first()

        # (original) author nickname
        xpath_origin_author_nick = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'js-actionable-user')]/@data-name"
        origin_author_nickname = response.xpath(xpath_origin_author_nick).extract_first()

        # (original) author ID
        xpath_origin_author_id = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'js-actionable-user')]/@data-user-id"
        origin_author_id = response.xpath(xpath_origin_author_id).extract_first()

        # post content (1. text  2. video  3. (not collected) footer details such as time, retweets, likes [js-tweet-details-fixer tweet-details-fixer])
        xpath_content = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[@class='js-tweet-text-container'] | //div[contains(@class,'permalink-inner permalink-tweet-container')]//div[@class='AdaptiveMediaOuterContainer'] | //div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'u-block js-tweet-details-fixer')]"
        content = ''.join(response.xpath(xpath_content).extract())

        # language code of the post content
        xpath_content_language_code = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//p/@lang"
        content_language_code = response.xpath(xpath_content_language_code).extract_first()

        # comment count
        xpath_comment = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'stream-item-footer')]//div[contains(@class,'ProfileTweet-actionCountList')]//span[contains(@class,'ProfileTweet-action--reply')]/span[@class='ProfileTweet-actionCount']/@data-tweet-stat-count"
        comments = response.xpath(xpath_comment).extract_first()
        comments_count = int(comments.strip()) if comments else 0

        # retweet count
        xpath_retweets = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'stream-item-footer')]//div[contains(@class,'ProfileTweet-actionCountList')]//span[contains(@class,'ProfileTweet-action--retweet')]/span[@class='ProfileTweet-actionCount']/@data-tweet-stat-count"
        retweets = response.xpath(xpath_retweets).extract_first()
        retweets_count = int(retweets) if retweets else 0

        # like count
        xpath_likes = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'stream-item-footer')]//div[contains(@class,'ProfileTweet-actionCountList')]//span[contains(@class,'ProfileTweet-action--favorite')]/span[@class='ProfileTweet-actionCount']/@data-tweet-stat-count"
        likes = response.xpath(xpath_likes).extract_first()
        likes_count = int(likes) if likes else 0

        # HTML from the iframe below the post content
        xpath_additional_url = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'js-macaw-cards-iframe-container')]/@data-full-card-iframe-url"
        additional_url = response.xpath(xpath_additional_url).extract_first()
        additional_content = self.parse_related_article(url=response.urljoin(additional_url))
        content = content + additional_content if additional_content else content

        # video URL
        video_url = response.xpath("//meta[@property='og:video:secure_url']/@content").extract_first()

        if not content:  # discard articles with no content
            return

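        # self.user_msg_choose selects whose identity is attached to the article:
        #   1 -> "fake":   the queried user's id/account plus the XXX_* names from meta
        #   2 -> "pseudo": the queried user's own id, account and nick
        #   else "real":   the original author scraped from this page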
        if self.user_msg_choose == 1:  # "fake" identity
            user_id_choose = user_id
            account = user_account
            user_account_choose = XXX_UserAccount
            user_nick_choose = XXX_AuthorNick
        elif self.user_msg_choose == 2:  # "pseudo" identity
            user_id_choose = user_id
            account = user_account
            user_account_choose = user_account
            user_nick_choose = user_nick
        else:  # "real" identity
            user_id_choose = origin_author_id
            account = origin_author
            user_account_choose = origin_author
            user_nick_choose = origin_author_nickname

        article_data = {
            "PNID": article_id,
            "Url": article_url,
            "Author": user_account_choose,  # 传过来的账号
            "AuthorNick": user_nick_choose,  # 传过来的昵称
            "PublishTime": publish_time,
            "Content": content,
            "CommentCount": comments_count,  # 评论数
            "ForwardNum": retweets_count,  # 转载数
            "ClickCount": likes_count,  # 点击数(点赞数?)
            "LanguageCode": content_language_code or 'en',  # 语言编码 response.meta.get("LanguageCode")

            "Title": "",
            "Abstract": "",
            "Keywords": "",

            "VideoUrl": video_url,
            "MediaSourceUrl": video_url or "",

            "is_new": is_new,
            "user_id": user_id_choose,
            "account": account
        }
        article_item = ArticleItem(**article_data)
        yield article_item

        """解析帖子页-->首页评论"""
        current_comment_page = 1
        response.meta2 = copy.deepcopy(response.meta)
        response.meta2['article_id'] = article_id
        response.meta2['article_url'] = article_url
        # fake / pseudo / real identity
        response.meta2['user_id_choose'] = user_id_choose
        response.meta2['user_account_choose'] = user_account_choose
        response.meta2['user_nick_choose'] = user_nick_choose
        response.meta2['account'] = account

        response.meta2['current_comment_page'] = current_comment_page  # page number of top-level comments

        """解析评论"""
        comments = self.parse_comment(response)
        for comment in comments:  # create comment items and update social relations
            if isinstance(comment, scrapy.Request):
                yield comment
            else:
                if comment.pop("is_end_comment"):
                    return
                # info about the users being replied to
                data_reply_to_users = comment.pop("data_reply_to_users")
                new_relations_dct = comment.pop("NEW_RELATIONS")
                comment_item = CommentItem(**comment)  # build the item only after popping non-item keys
                yield comment_item
                # social_relations = self.make_social_relation_dict(data_reply_to_users, comment)
                # for relation in social_relations:
                #     relation_item = SocialRelationItem(ListSocialRelation=[])
                #     relation_item["ListSocialRelation"].append(relation)
                #     yield relation_item

                social_relation = self.make_social_relation_dict_bad(new_relations_dct, comment)
                relation_item = SocialRelationItem(**social_relation)
                yield relation_item

        # Cursor for the next comment page (page 2)
        xpath_next_page = "//div[@class='ThreadedDescendants']/div[contains(@class,'stream-container')]/@data-min-position"
        has_next_page = response.xpath(xpath_next_page).extract_first()
        # "Show more replies" cursor
        xpath_more_comment = "//li[@class='ThreadedConversation-showMoreThreads']/button/@data-cursor"
        has_more_comment = response.xpath(xpath_more_comment).extract_first()
        has_next = has_next_page or has_more_comment
        if has_next:
            next_url = ('https://twitter.com/i/' + origin_author +
                        '/conversation/' + article_id +
                        '?include_available_features=1&include_entities=1'
                        '&max_position=' + has_next)
            meta = copy.deepcopy(response.meta2)
            meta["commentator_account"] = origin_author
            meta["comment_id"] = article_id
            meta["current_comment_page"] = current_comment_page + 1  # 第二页的评论
            yield scrapy.Request(url=next_url, callback=self.parse_other_comment_page, meta=meta)
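Finally, every callback in these examples repeats the same dispatch loop: parse_comment / parse_secondary_comment yield a mixture of scrapy.Request objects and plain comment dicts, and the caller pops the bookkeeping keys before building items. A shared helper could look roughly like this (a sketch; _emit_comments is hypothetical, while CommentItem, SocialRelationItem and make_social_relation_dict_bad are the spider's own names):

    def _emit_comments(self, comments):
        # Hypothetical helper: convert parsed comment dicts into items,
        # passing follow-up Requests straight through.
        for comment in comments:
            if isinstance(comment, scrapy.Request):
                yield comment
                continue
            if comment.pop("is_end_comment"):
                return  # incremental crawl: stop collecting comments
            comment.pop("data_reply_to_users")  # used only by the commented-out relation code
            new_relations_dct = comment.pop("NEW_RELATIONS")
            yield CommentItem(**comment)
            relation = self.make_social_relation_dict_bad(new_relations_dct, comment)
            yield SocialRelationItem(**relation)

Callers would then write yield from self._emit_comments(...), with one caveat: the early return on is_end_comment would only end the helper, not abort the caller's pagination, which is why the original keeps the loop inline.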