def parse_show_more_more_replies(self, response):
    """Parse a "show N more replies" block under a second-level comment.

    The response is JSON whose ``conversation_html`` field carries an HTML
    fragment; that fragment is wrapped in an HtmlResponse so the regular
    second-level comment parser can run on it.

    Yields:
        scrapy.Request follow-ups, CommentItem, and SocialRelationItem.
    """
    try:
        res = json.loads(response.text)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError; TypeError guards a
        # non-string body. A bare except here would also swallow
        # KeyboardInterrupt/SystemExit.
        return []
    # Wrap the embedded HTML fragment for the shared comment parser.
    comment_list3_response = HtmlResponse(url=response.url, body=res.get('conversation_html'), encoding='utf-8')
    comment_list3_response.meta2 = copy.deepcopy(response.meta)
    comments = self.parse_secondary_comment(comment_list3_response)
    for comment in comments:
        # Emit the comment item and, alongside it, its social-relation item.
        if isinstance(comment, scrapy.Request):
            yield comment
        else:
            comment.pop("is_end_comment")  # end flag is not acted on for this page type
            # Information about the users being replied to.
            data_reply_to_users = comment.pop("data_reply_to_users")
            new_relations_dct = comment.pop("NEW_RELATIONS")
            # After the pops, the remaining keys map 1:1 onto item fields.
            comment_item = CommentItem(**comment)
            yield comment_item
            social_relation = self.make_social_relation_dict_bad(new_relations_dct, comment)
            relation_item = SocialRelationItem(**social_relation)
            yield relation_item
def parse_other_comment_page(self, response):
    """Parse a non-first page of first-level comments (JSON response).

    Yields comment and social-relation items from the ``items_html``
    fragment, then schedules the next comment page while the response
    advertises a pagination cursor.
    """
    # Context required to assemble the next-page URL.
    commentator_account = response.meta.get("commentator_account")
    comment_id = response.meta.get("comment_id")
    # Comment-page counter, propagated through meta.
    current_comment_page = response.meta.get("current_comment_page")
    is_new = response.meta.get("is_new")
    try:
        res = json.loads(response.text)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError; TypeError guards a
        # non-string body. Narrowed from a bare except that would also
        # swallow KeyboardInterrupt/SystemExit.
        return []
    comment_list_response = HtmlResponse(url=response.url, body=res.get('items_html'), encoding='utf-8')
    comment_list_response.meta2 = copy.deepcopy(response.meta)
    comments = self.parse_comment(comment_list_response)
    for comment in comments:
        # Emit the comment item and, alongside it, its social-relation item.
        if isinstance(comment, scrapy.Request):
            yield comment
        else:
            if comment.pop("is_end_comment"):
                # Incremental crawl reached already-collected comments: stop.
                return []
            # Information about the users being replied to.
            data_reply_to_users = comment.pop("data_reply_to_users")
            new_relations_dct = comment.pop("NEW_RELATIONS")
            # After the pops, the remaining keys map 1:1 onto item fields.
            comment_item = CommentItem(**comment)
            yield comment_item
            social_relation = self.make_social_relation_dict_bad(
                new_relations_dct, comment)
            relation_item = SocialRelationItem(**social_relation)
            yield relation_item
    # Pagination: follow further first-level comment pages.
    has_more_comment = self.has_more_comment(comment_list_response)
    has_next = res.get('min_position') or has_more_comment
    if has_next:  # a cursor for the next page exists
        next_url = 'https://twitter.com/i/' + commentator_account + '/conversation/' + comment_id + '?include_available_features=1&include_entities=1&max_position=' + has_next
        if len(next_url) >= 10000:
            # Defensive cap: an absurdly long cursor indicates a bad or
            # looping position value.
            return []
        else:
            meta = copy.deepcopy(response.meta)
            meta["current_comment_page"] = current_comment_page + 1
            yield scrapy.Request(url=next_url, callback=self.parse_other_comment_page, meta=meta)
def parse_secondary_comment_page(self, response):
    """Parse the first (HTML) page of replies under a comment whose reply
    count is greater than 1, producing second-level comment items.

    Yields comment and social-relation items, plus a request for the next
    reply page when a pagination cursor is present.
    """
    # Needed later when assembling the next-page URL.
    commentator_account = response.meta.get("commentator_account")
    comment_id = response.meta.get("comment_id")
    current_secondary_comment_page = 1
    meta_copy = copy.deepcopy(response.meta)
    meta_copy["current_secondary_comment_page"] = current_secondary_comment_page
    response.meta2 = meta_copy
    # Walk the parsed second-level comments, emitting items as we go.
    for parsed in self.parse_secondary_comment(response):
        if isinstance(parsed, scrapy.Request):
            yield parsed
            continue
        if parsed.pop("is_end_comment"):
            # Already-collected territory reached: stop this page entirely.
            return []
        # Users being replied to, plus the new-relations payload.
        data_reply_to_users = parsed.pop("data_reply_to_users")
        new_relations_dct = parsed.pop("NEW_RELATIONS")
        # With the extra keys popped, the dict maps straight onto the item.
        yield CommentItem(**parsed)
        relation_dict = self.make_social_relation_dict_bad(
            new_relations_dct, parsed)
        yield SocialRelationItem(**relation_dict)
    # --- pagination for the reply thread's first page ---
    # Cursor for the next reply page (page two).
    next_page_cursor = response.xpath(
        "//div[@class='ThreadedDescendants']/div[contains(@class,'stream-container')]/@data-min-position"
    ).extract_first()
    # Cursor behind a "show more replies" button.
    more_replies_cursor = response.xpath(
        "//li[@class='ThreadedConversation-showMoreThreads']/button/@data-cursor"
    ).extract_first()
    cursor = next_page_cursor or more_replies_cursor
    if cursor:
        # More replies exist, so keep following; otherwise nothing to do.
        next_url = ('https://twitter.com/i/' + commentator_account
                    + '/conversation/' + comment_id
                    + '?include_available_features=1&include_entities=1&max_position='
                    + cursor)
        meta = copy.deepcopy(response.meta)
        meta["current_secondary_comment_page"] = current_secondary_comment_page + 1
        yield scrapy.Request(url=next_url,
                             callback=self.parse_other_secondary_comment_page,
                             meta=meta)
def parse_article_detail(self, response):
    """Parse a tweet detail page plus the first page of its comments.

    Yields an ArticleItem for the tweet, then CommentItem /
    SocialRelationItem objects for the first comment page, and finally a
    request for the second comment page when a cursor is present.
    Mutates the shared ``article_count`` dict from meta to enforce crawl
    limits across pages.
    """
    current_page = response.meta.get("current_page")
    article_count = response.meta.get("article_count")  # shared counter dict: {"count", "end_collect"}
    is_new = response.meta.get("is_new")
    user_id = response.meta.get("user_id")
    user_account = response.meta.get("UserAccount")  # post author (the queried user, not necessarily the original author)
    user_nick = response.meta.get("AuthorNick")  # post author nickname (the queried user, not necessarily the original author)
    XXX_UserAccount = response.meta.get("XXX_UserAccount")
    XXX_AuthorNick = response.meta.get("XXX_AuthorNick")
    # Publish time (unix epoch seconds from the @data-time attribute).
    xpath_publish_time = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//small[@class='time']//span[@data-time]/@data-time"
    ptime = response.xpath(xpath_publish_time).extract_first()
    if not ptime:
        return []
    p_time = int(ptime.strip())
    publish_time = str(datetime.datetime.fromtimestamp(p_time))
    if current_page > 1:  # the first page of every account is always collected
        if not self.increment_crawl or is_new:  # first-time crawl / newly added account
            if p_time < self.e_time and article_count["count"] >= self.article_count_limit:
                article_count["end_collect"] = True
                return []
        else:  # incremental crawl on an existing account: beyond page 1 only take posts within self.inc_e_time
            if p_time < self.inc_e_time:
                article_count["end_collect"] = True
                return []
    # Still collecting: bump the shared counter.
    # NOTE(review): placed outside the page guard per the original comment
    # ("continue collecting +1") -- confirm against pre-mangling source.
    article_count["count"] += 1
    # Post URL.
    article_url = response.url
    # Article ID.
    # article_id = url.split('/')[-1]
    xpath_article_id = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'js-actionable-user')]/@data-item-id"
    article_id = response.xpath(xpath_article_id).extract_first()
    # Original author's account (used for: 1. next-page URL, 2. social relations).
    xpath_origin_author = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'js-actionable-user')]/@data-screen-name"
    origin_author = response.xpath(xpath_origin_author).extract_first()
    # Original author's nickname.
    xpath_origin_author_nick = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'js-actionable-user')]/@data-name"
    origin_author_nickname = response.xpath(xpath_origin_author_nick).extract_first()
    # Original author's ID.
    xpath_origin_author_id = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'js-actionable-user')]/@data-user-id"
    origin_author_id = response.xpath(xpath_origin_author_id).extract_first()
    # Post content (1. text, 2. media; 3. (not collected) other bottom-of-post
    # detail blocks [js-tweet-details-fixer tweet-details-fixer]).
    xpath_content = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[@class='js-tweet-text-container'] | //div[contains(@class,'permalink-inner permalink-tweet-container')]//div[@class='AdaptiveMediaOuterContainer'] | //div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'u-block js-tweet-details-fixer')]"
    content = ''.join(response.xpath(xpath_content).extract())
    # Language code of the post content.
    xpath_content_language_code = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//p/@lang"
    content_language_code = response.xpath(xpath_content_language_code).extract_first()
    # Comment count.
    xpath_comment = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'stream-item-footer')]//div[contains(@class,'ProfileTweet-actionCountList')]//span[contains(@class,'ProfileTweet-action--reply')]/span[@class='ProfileTweet-actionCount']/@data-tweet-stat-count"
    comments = response.xpath(xpath_comment).extract_first()
    comments_count = int(comments.strip()) if comments else 0
    # Retweet count.
    xpath_retweets = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'stream-item-footer')]//div[contains(@class,'ProfileTweet-actionCountList')]//span[contains(@class,'ProfileTweet-action--retweet')]/span[@class='ProfileTweet-actionCount']/@data-tweet-stat-count"
    retweets = response.xpath(xpath_retweets).extract_first()
    retweets_count = int(retweets) if retweets else 0
    # Like count.
    xpath_likes = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'stream-item-footer')]//div[contains(@class,'ProfileTweet-actionCountList')]//span[contains(@class,'ProfileTweet-action--favorite')]/span[@class='ProfileTweet-actionCount']/@data-tweet-stat-count"
    likes = response.xpath(xpath_likes).extract_first()
    likes_count = int(likes) if likes else 0
    # HTML inside the iframe card below the post content.
    xpath_additional_url = "//div[contains(@class,'permalink-inner permalink-tweet-container')]//div[contains(@class,'js-macaw-cards-iframe-container')]/@data-full-card-iframe-url"
    additional_url = response.xpath(xpath_additional_url).extract_first()
    # NOTE(review): additional_url may be None here; presumably
    # parse_related_article tolerates that -- confirm.
    additional_content = self.parse_related_article(url=response.urljoin(additional_url))
    content = content + additional_content if additional_content else content
    # Video URL.
    video_url = response.xpath("//meta[@property='og:video:secure_url']/@content").extract_first()
    if not content:
        # Drop articles without content.
        return
    # Choose which identity to attribute the article to.
    if self.user_msg_choose == 1:  # "fake": queried user's id, XXX override names
        user_id_choose = user_id
        account = user_account
        user_account_choose = XXX_UserAccount
        user_nick_choose = XXX_AuthorNick
    elif self.user_msg_choose == 2:  # "pseudo": queried user's own account/nick
        user_id_choose = user_id
        account = user_account
        user_account_choose = user_account
        user_nick_choose = user_nick
    else:  # "real": the original author of the tweet
        user_id_choose = origin_author_id
        account = origin_author
        user_account_choose = origin_author
        user_nick_choose = origin_author_nickname
    article_data = {
        "PNID": article_id,
        "Url": article_url,
        "Author": user_account_choose,  # account passed through / chosen above
        "AuthorNick": user_nick_choose,  # nickname passed through / chosen above
        "PublishTime": publish_time,
        "Content": content,
        "CommentCount": comments_count,  # number of comments
        "ForwardNum": retweets_count,  # number of retweets
        "ClickCount": likes_count,  # click count (likes?)
        "LanguageCode": content_language_code or 'en',  # language code; alt: response.meta.get("LanguageCode")
        "Title": "",
        "Abstract": "",
        "Keywords": "",
        "VideoUrl": video_url,
        "MediaSourceUrl": video_url or "",
        "is_new": is_new,
        "user_id": user_id_choose,
        "account": account
    }
    article_item = ArticleItem(**article_data)
    yield article_item
    # --- parse the first page of comments on this post ---
    current_comment_page = 1
    response.meta2 = copy.deepcopy(response.meta)
    response.meta2['article_id'] = article_id
    response.meta2['article_url'] = article_url
    # fake / pseudo / real identity choice, passed down to comment parsing
    response.meta2['user_id_choose'] = user_id_choose
    response.meta2['user_account_choose'] = user_account_choose
    response.meta2['user_nick_choose'] = user_nick_choose
    response.meta2['account'] = account
    response.meta2['current_comment_page'] = current_comment_page  # first-level comment page counter
    # Parse the comments.
    comments = self.parse_comment(response)
    for comment in comments:
        # Emit the comment item and, alongside it, its social-relation item.
        if isinstance(comment, scrapy.Request):
            yield comment
        else:
            if comment.pop("is_end_comment"):
                return []
            # Information about the users being replied to.
            data_reply_to_users = comment.pop("data_reply_to_users")
            new_relations_dct = comment.pop("NEW_RELATIONS")
            comment_item = CommentItem(**comment)  # after the pops, keys map onto item fields
            yield comment_item
            # social_relations = self.make_social_relation_dict(data_reply_to_users, comment)
            # for relation in social_relations:
            #     relation_item = SocialRelationItem(ListSocialRelation=[])
            #     relation_item["ListSocialRelation"].append(relation)
            #     yield relation_item
            social_relation = self.make_social_relation_dict_bad(new_relations_dct, comment)
            relation_item = SocialRelationItem(**social_relation)
            yield relation_item
    # Cursor for the next comment page (page two).
    xpath_next_page = "//div[@class='ThreadedDescendants']/div[contains(@class,'stream-container')]/@data-min-position"
    has_next_page = response.xpath(xpath_next_page).extract_first()
    # Cursor behind a "show more replies" button.
    xpath_more_comment = "//li[@class='ThreadedConversation-showMoreThreads']/button/@data-cursor"
    has_more_comment = response.xpath(xpath_more_comment).extract_first()
    has_next = has_next_page or has_more_comment
    if has_next:
        next_url = 'https://twitter.com/i/' + origin_author + '/conversation/' + article_id + '?include_available_features=1&include_entities=1&max_position=' + has_next
        meta = copy.deepcopy(response.meta2)
        meta["commentator_account"] = origin_author
        meta["comment_id"] = article_id
        meta["current_comment_page"] = current_comment_page + 1  # second page of comments
        yield scrapy.Request(url=next_url, callback=self.parse_other_comment_page, meta=meta)