import re

import scrapy
from lxml.html import fromstring


def is_dup_detail(detail_url, spider_name, channel_id=0):
    """
    Check whether a detail page has already been seen.
    :param detail_url: URL of the detail page
    :param spider_name: name of the spider
    :param channel_id: channel the page belongs to
    :return: True if the URL fingerprint is already in the dedup set
    """
    detail_dup_key = 'dup:%s:%s' % (spider_name, channel_id)
    detail_url_finger = get_request_finger(detail_url)
    return redis_client.sismember(detail_dup_key, detail_url_finger)
def add_dup_detail(detail_url, spider_name, channel_id=0):
    """
    Add the current detail page to the dedup set.
    :param detail_url: URL of the detail page
    :param spider_name: name of the spider
    :param channel_id: channel the page belongs to
    :return: number of elements added to the set (0 if already present)
    """
    detail_dup_key = 'dup:%s:%s' % (spider_name, channel_id)
    detail_url_finger = get_request_finger(detail_url)
    return redis_client.sadd(detail_dup_key, detail_url_finger)
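# Both helpers rely on `get_request_finger` and `redis_client`, which are
# defined elsewhere in the project. A minimal sketch of plausible
# definitions, assuming the fingerprint is an MD5 hex digest of the URL and
# a local Redis instance (connection parameters are placeholders, not the
# project's actual configuration):

import hashlib

import redis

# Hypothetical connection; adjust host/port/db to the real deployment.
redis_client = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)


def get_request_finger(url):
    """Return a stable fingerprint for a URL (assumed: MD5 of the raw URL)."""
    return hashlib.md5(url.encode('utf-8')).hexdigest()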
def parse_article_list(self, response):
    """
    Parse the article list page.
    On the last page the "next page" link is disabled:
        <a class="page next S_txt1 S_line1 page_dis"><span>下一页</span>
    Link to extract:
        href="/p/1005051627825392/wenzhang?pids=Pl_Core_ArticleList__61&cfs=600&Pl_Core_ArticleList__61_filter=&Pl_Core_ArticleList__61_page=6#Pl_Core_ArticleList__61"
    """
    print('task_url: %s' % response.url)
    # Weibo embeds the markup in JS (FM.view), so the page cannot be parsed
    # directly; pull the HTML payload out of the matching <script> block.
    article_list_body = response.body_as_unicode()
    article_list_rule = r'<script>FM.view\({"ns":"pl.content.miniTab.index","domid":"Pl_Core_ArticleList__\d+".*?"html":"(.*?)"}\)</script>'
    article_list_re_parse = re.compile(article_list_rule, re.S).findall(article_list_body)
    if not article_list_re_parse:
        return
    article_list_html = ''.join(article_list_re_parse)
    # Undo the JS string escaping before feeding the HTML to lxml
    article_list_html = article_list_html.replace('\\r', '')
    article_list_html = article_list_html.replace('\\t', '')
    article_list_html = article_list_html.replace('\\n', '')
    article_list_html = article_list_html.replace('\\"', '"')
    article_list_html = article_list_html.replace('\\/', '/')
    article_list_doc = fromstring(article_list_html)
    article_list_doc_parse = article_list_doc.xpath('//div[@class="text_box"]')
    for article_item in article_list_doc_parse:
        article_detail_url = article_item.xpath(
            './div[@class="title W_autocut"]/a[@class="W_autocut S_txt1"]/@href')
        article_detail_title = article_item.xpath(
            './div[@class="title W_autocut"]/a[@class="W_autocut S_txt1"]/text()')
        article_detail_abstract = article_item.xpath(
            './div[@class="text"]/a[@class="S_txt1"]/text()')
        if not (article_detail_url and article_detail_title):
            continue
        article_detail_url = response.urljoin(article_detail_url[0].strip())
        article_detail_title = article_detail_title[0].strip()
        article_detail_abstract = article_detail_abstract[0].strip() if article_detail_abstract else ''
        meta_article_item = {
            'article_url': article_detail_url,
            'article_title': article_detail_title,
            'article_abstract': article_detail_abstract,
            'article_id': get_request_finger(article_detail_url),
        }
        meta = dict(response.meta, **meta_article_item)
        # Two different detail page types need different parsers
        if '/ttarticle/p/show?id=' in article_detail_url:
            yield scrapy.Request(url=article_detail_url,
                                 callback=self.parse_article_detail_html,
                                 meta=meta)
        else:
            yield scrapy.Request(url=article_detail_url,
                                 callback=self.parse_article_detail_js,
                                 meta=meta)
    # Pagination: follow the "next page" link if it is still enabled
    next_url_parse = article_list_doc.xpath('//a[@class="page next S_txt1 S_line1"]/@href')
    if not next_url_parse:
        print('Last page for the current list query: %s' % response.url)
    else:
        next_url = response.urljoin(next_url_parse[0])
        print(next_url)
        yield scrapy.Request(url=next_url,
                             callback=self.parse_article_list,
                             meta=response.meta)
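# `parse_article_list` does not call the dedup helpers itself; a sketch of
# how they could guard the detail requests, assuming `self.name` is the
# spider name and `channel_id` travels in `response.meta` (both are
# assumptions about the surrounding spider class). Note that SADD already
# returns 1 only on first insertion, so the check-and-mark pair can be
# collapsed into one atomic round trip:

def should_crawl_detail(detail_url, spider_name, channel_id=0):
    """True exactly once per URL: marks it seen and reports first sighting."""
    detail_dup_key = 'dup:%s:%s' % (spider_name, channel_id)
    detail_url_finger = get_request_finger(detail_url)
    return redis_client.sadd(detail_dup_key, detail_url_finger) == 1

# Inside the loop of parse_article_list, the yield could then be gated with:
#
#     if not should_crawl_detail(article_detail_url, self.name,
#                                response.meta.get('channel_id', 0)):
#         continue  # detail page already crawled, skip it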