def parse_detail(self, response):
    """Parse a Jobbole article page and yield a populated JobBoleArticleItem.

    All field extraction and cleaning is delegated to ArticleItemLoader,
    whose input/output processors (declared in items.py) handle stripping,
    number parsing, and date conversion.

    :param response: article detail page response; ``response.meta`` carries
        ``front_image_url`` forwarded from the listing-page request.
    :yields: the loaded JobBoleArticleItem (picked up by the pipelines).
    """
    # Cover image URL handed over from the listing request via meta.
    front_image_url = response.meta.get('front_image_url', '')

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
    # Image-download fields must be list-typed for the images pipeline.
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_css('praise_nums', '.vote-post-up h10::text')
    item_loader.add_css('comment_nums', 'a[href="#article-comment"] span::text')
    item_loader.add_css('fav_nums', '.bookmark-btn::text')
    item_loader.add_css('tags', '.entry-meta-hide-on-mobile a::text')
    item_loader.add_css('content', 'div.entry')

    article_item = item_loader.load_item()
    yield article_item  # hand the configured item to the pipelines
def parse_detail(self, response):
    """Extract article fields through an ItemLoader and yield the item.

    NOTE(review): the original body also extracted every field manually
    (CSS selectors + regex) into a JobBoleArticleItem, but that item was
    unconditionally overwritten by ``item_loader.load_item()`` before the
    yield — the dead manual extraction has been removed.

    :param response: article detail page; ``response.meta`` carries
        ``front_image_url`` from the listing request.
    :yields: the loaded JobBoleArticleItem (transferred to the pipelines).
    """
    # Cover image URL forwarded from the listing-page request.
    front_image_url = response.meta.get("front_image_url", "")

    item_loader = ItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # Image-download fields must be lists for the images pipeline.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")

    article_item = item_loader.load_item()
    # article_item will be transferred to the pipelines.
    yield article_item
def parse_datail(self, response):
    """Parse an article page with ArticleItemLoader and yield the item.

    NOTE(review): the method name keeps the original ``parse_datail``
    misspelling so callback references elsewhere keep working; rename to
    ``parse_detail`` together with its callers.

    BUG FIX: the original called
    ``item_loader.add_value('front_image_path', ArticleImagePipeline.item_completed)``
    which stored an unbound *method object* in the item field;
    ``front_image_path`` is filled in by ArticleImagePipeline itself, so
    that line is dropped.

    :param response: article detail page; ``response.meta`` carries
        ``front_image_url`` from the listing request.
    :yields: the loaded JobBoleArticleItem.
    """
    # Cover image URL forwarded from the listing request.
    front_image_url = response.meta.get('front_image_url', '')

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
    # Image-download fields must be list-typed for the images pipeline.
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_css('praise_nums', '.vote-post-up h10::text')
    item_loader.add_css('comment_nums', 'a[href="#article-comment"] span::text')
    item_loader.add_css('fav_nums', '.bookmark-btn::text')
    item_loader.add_css('tags', '.entry-meta a::text')
    item_loader.add_css('content', 'div.entry')

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse a cnblogs news page, then request the AJAX counters endpoint.

    The vote/comment/view counts are rendered asynchronously on the site,
    so the partially-filled loader is forwarded via ``meta`` to
    ``self.parse_nums``, which completes the item from
    ``/NewsAjax/GetAjaxNewsInfo``.

    BUG FIX: the original referenced ``post_id`` unconditionally after the
    regex match, raising NameError whenever the URL contained no digits;
    it now returns early instead. The debug ``print`` of the AJAX URL was
    also removed.

    :param response: news detail page; ``response.meta`` may carry
        ``front_image_url`` from the listing request.
    :yields: a Request for the counters endpoint, with the loader in meta.
    """
    # The numeric post id is embedded in the article URL.
    match_re = re.match(r".*?(\d+)", response.url)
    if not match_re:
        return
    post_id = match_re.group(1)

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", "#news_title a::text")
    item_loader.add_css("create_date", "#news_info .time::text")
    item_loader.add_css("content", "#news_content")
    item_loader.add_css("tags", ".news_tags a::text")
    item_loader.add_value("url", response.url)
    # Image-download fields must be list-typed, otherwise the images
    # pipeline raises "ValueError: Missing scheme in request url: h".
    if response.meta.get("front_image_url", []):
        item_loader.add_value("front_image_url",
                              response.meta.get("front_image_url", []))

    # Note: a synchronous requests.get() of the same endpoint would also
    # work, but a scheduled Request keeps the crawl fully asynchronous.
    yield Request(
        url=parse.urljoin(response.url,
                          "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
        meta={"article_item": item_loader, "url": response.url},
        callback=self.parse_nums,
    )
def parse_detail(self, response):
    """Extract the article's fields with ArticleItemLoader and yield the item.

    Cleaning/conversion of each field (strip, regex number extraction, date
    parsing) is handled by the loader's processors declared in items.py.

    :param response: article detail page; ``response.meta`` carries
        ``front_image_url`` from the listing request.
    :yields: the loaded JobBoleArticleItem.
    """
    front_image_url = response.meta.get("front_image_url", "")  # cover image

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    # Image-download fields must be lists for the images pipeline.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse an article page via an XPath-driven ArticleItemLoader.

    :param response: article detail page; ``response.meta`` carries
        ``image_url`` (the cover image) from the listing request.
    :yields: the loaded JobBoleArticleItem.
    """
    # Cover image URL forwarded from the listing request.
    image_url = response.meta.get('image_url', '')

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_xpath('create_time', '//p[@class="entry-meta-hide-on-mobile"]/text()')
    # Image-download fields must be list-typed for the images pipeline.
    item_loader.add_value('image_url', [image_url])
    # NOTE(review): image_path is normally written by the image pipeline
    # after download; seeding it with the URL looks suspicious — confirm
    # this is intentional before relying on the field.
    item_loader.add_value('image_path', image_url)
    item_loader.add_xpath('praise_nums', '//span[contains(@class,"vote-post-up")]/h10/text()')
    item_loader.add_xpath('comment_nums', '//span[contains(@class,"hide-on-480")]/text()')
    item_loader.add_xpath('shoucang_nums', '//span[contains(@class,"bookmark-btn")]/text()')
    item_loader.add_xpath('tag_lists', '//p[contains(@class,"entry-meta-hide-on-mobile")]/a/text()')
    item_loader.add_xpath('content', '//div[@class="entry"]')

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Extract article fields with CSS selectors and yield a filled item.

    FIX: ``praise_nums`` is now parsed to int like the other counters;
    the original stored the raw selector string.

    :param response: article detail page; ``response.meta`` carries
        ``front_image_url`` from the listing request.
    :yields: the populated JobBoleArticleItem.
    """
    article_item = JobBoleArticleItem()

    front_image_url = response.meta.get("front_image_url", "")  # cover image

    title = response.css(".entry-header h1::text").extract()[0]
    # Raw text looks like "2017/05/01 ·" — drop the separator dot.
    create_date = response.css(
        "p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace(
            "·", "").strip()

    praise_nums = response.css(".vote-post-up h10::text").extract()[0]
    match_re = re.match(r".*?(\d+).*", praise_nums)
    praise_nums = int(match_re.group(1)) if match_re else 0

    fav_nums = response.css(".bookmark-btn::text").extract()[0]  # e.g. " 2 收藏"
    match_re = re.match(r".*?(\d+).*", fav_nums)
    fav_nums = int(match_re.group(1)) if match_re else 0

    comment_nums = response.css(
        "a[href='#article-comment'] span::text").extract()[0]
    match_re = re.match(r".*?(\d+).*", comment_nums)
    comment_nums = int(match_re.group(1)) if match_re else 0

    content = response.css("div.entry").extract()[0]

    tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
    # Drop the pseudo-tag that is really the comment counter (ends with 评论).
    tag_list = [
        element for element in tag_list
        if not element.strip().endswith("评论")
    ]
    tags = ",".join(tag_list)

    article_item["url_object_id"] = get_md5(response.url)
    article_item["title"] = title
    article_item["url"] = response.url
    # Convert the create_date string to a date; fall back to today.
    try:
        create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
    except Exception:
        create_date = datetime.datetime.now().date()
    article_item["create_date"] = create_date
    # Image-download fields must be lists for the images pipeline.
    article_item["front_image_url"] = [front_image_url]
    article_item["praise_nums"] = praise_nums
    article_item["comment_nums"] = comment_nums
    article_item["fav_nums"] = fav_nums
    article_item["tags"] = tags
    article_item["content"] = content

    yield article_item
def parse_detail(self, response):
    """Extract article fields (XPath selectors) and yield a filled item.

    BUG FIX: ``front_image_url`` was assigned a *set* literal
    ``{front_image_url}``; image-download fields must be lists for the
    images pipeline, so it is now ``[front_image_url]``.

    :param response: article detail page; ``response.meta`` carries
        ``front_image_url`` from the listing request.
    :yields: the populated JobBoleArticleItem.
    """
    article_item = JobBoleArticleItem()

    # extract_first() is extract()[0] with a safe default instead of IndexError.
    title = response.xpath(
        '//*[@class="entry-header"]/h1/text()').extract_first().strip()
    # Raw text looks like "2017/05/01 ·" — drop the separator dot.
    create_date = response.xpath(
        "//p[@class='entry-meta-hide-on-mobile']/text()").extract_first(
        ).replace("·", "").strip()
    front_image_url = response.meta.get("front_image_url", "")  # cover image

    dianzan_nums = response.xpath(
        "//div[@class='post-adds']/span[1]/h10[1]/text()").extract_first().strip()

    match_re = re.match(
        r'.*?(\d+).*',
        response.xpath("//div[@class='post-adds']/span[2]/text()"
                       ).extract_first().strip())
    shoucang_nums = match_re.group(1) if match_re else '0'

    match_re = re.match(
        r'.*?(\d+).*',
        response.xpath("//div[@class='post-adds']/a[1]/span[1]/text()"
                       ).extract_first().strip())
    pinglun_nums = match_re.group(1) if match_re else '0'

    tag_list = response.xpath(
        "//p[@class='entry-meta-hide-on-mobile']/a/text()").extract()
    # Drop the pseudo-tag that is really the comment counter (ends with 评论).
    tag_list = [
        element for element in tag_list
        if not element.strip().endswith("评论")
    ]
    tags = ",".join(tag_list)

    # Join all text nodes of the article body into one plain-text string.
    content_list = response.css("div.entry ::text").extract()
    content = "".join(content_list).strip()

    article_item["title"] = title
    # Convert the create_date string to a date; fall back to today.
    try:
        create_date = datetime.datetime.strptime(create_date, '%Y/%m/%d').date()
    except Exception:
        create_date = datetime.datetime.now().date()
    article_item["create_date"] = create_date
    article_item["url"] = response.url
    article_item["url_object_id"] = get_md5(response.url)
    article_item["front_image_url"] = [front_image_url]
    article_item["dianzan_nums"] = int(dianzan_nums)
    article_item["shoucang_nums"] = int(shoucang_nums)
    article_item["pinglun_nums"] = int(pinglun_nums)
    article_item["tags"] = tags
    article_item["content"] = content

    yield article_item
def parse_detail(self, response):
    """Populate a JobBoleArticleItem through an ItemLoader and yield it.

    (The removed commented-out manual extraction contained a latent bug:
    it regex-matched the *literal strings* "fav_nums"/"comm_nums" instead
    of the variables.)

    :param response: article detail page; ``response.meta`` carries
        ``front_image_url`` from the listing request.
    :yields: the loaded JobBoleArticleItem.
    """
    front_image_url = response.meta.get("front_image_url", "")  # cover image

    item_loader = ItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1 ::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # Image-download fields must be lists for the images pipeline.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comm_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Extract the article's specific fields via ArticleItemLoader.

    :param response: article detail page; ``response.meta`` carries
        ``front_image_url`` from the listing request.
    :yields: the loaded JobBoleArticleItem (forwarded to the pipelines).
    """
    front_image_url = response.meta.get("front_image_url", "")  # cover image

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # Image-download fields must be lists for the images pipeline.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")

    article_item = item_loader.load_item()
    # Hand off to the pipelines.
    yield article_item
def parse_detail(self, response):
    """Extract article fields via CSS selectors and yield a filled item.

    FIXES vs. original:
    - ``create_date`` was stored as the raw string; it is now parsed to a
      date (falling back to today), consistent with the sibling parsers.
    - ``praise_nums`` is parsed to int like the other counters instead of
      being stored as the raw selector string.

    :param response: article detail page; ``response.meta`` carries
        ``front_image_url`` from the listing request.
    :yields: the populated JobBoleArticleItem (sent to pipelines).
    """
    article_item = JobBoleArticleItem()

    # Cover image URL forwarded from the listing request.
    front_image_url = response.meta.get("front_image_url", "")

    title = response.css(".entry-header h1::text").extract()[0]
    # Raw text looks like "2017/05/01 ·" — drop the separator dot.
    create_date = response.css(
        "p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace(
            "·", "").strip()

    praise_nums = response.css(".vote-post-up h10::text").extract()[0]
    match_re = re.match(r".*?(\d+).*", praise_nums)
    praise_nums = int(match_re.group(1)) if match_re else 0

    fav_nums = response.css(".bookmark-btn::text").extract()[0]
    match_re = re.match(r".*?(\d+).*", fav_nums)
    fav_nums = int(match_re.group(1)) if match_re else 0

    comment_nums = response.css(
        "a[href='#article-comment'] span::text").extract()[0]
    match_re = re.match(r".*?(\d+).*", comment_nums)
    comment_nums = int(match_re.group(1)) if match_re else 0

    content = response.css("div.entry").extract()[0]

    tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
    # Drop the pseudo-tag that is really the comment counter (ends with 评论).
    tag_list = [
        element for element in tag_list
        if not element.strip().endswith("评论")
    ]
    tags = ",".join(tag_list)

    article_item["url_object_id"] = get_md5(response.url)
    article_item["title"] = title
    article_item["url"] = response.url
    try:
        create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
    except Exception:
        create_date = datetime.datetime.now().date()
    article_item["create_date"] = create_date
    # Image-download fields must be lists for the images pipeline.
    article_item["front_image_url"] = [front_image_url]
    article_item["praise_nums"] = praise_nums
    article_item["comment_nums"] = comment_nums
    article_item["fav_nums"] = fav_nums
    article_item["tags"] = tags
    article_item["content"] = content

    # Send to the pipelines.
    yield article_item
def parse_detail(self, response):
    """Parse one article page and yield a JobBoleArticleItem via an item loader.

    Extraction rules are declared on the loader; the item's input/output
    processors (defined elsewhere) perform the actual cleanup/conversion.
    """
    # Cover image URL carried over from the listing request's meta.
    front_image_url = response.meta.get("front_image_url", "")

    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    loader.add_css("title", ".article-head h1::text")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_css("create_date", ".about-left span:nth-child(1)::text")
    # Wrapped in a list so the images pipeline can treat it as a URL batch.
    loader.add_value("front_image_url", [front_image_url])
    loader.add_css("read_nums", '.about-left span:nth-child(2)::text')
    loader.add_xpath("content", '//div[@class="article-main"]')

    article_item = loader.load_item()
    yield article_item
def parse_func(self, response):
    """Fill a JobBoleArticleItem through the ItemLoader mechanism and yield it.

    Field keys (``Title``, ``URL``, ...) match the item's declared fields;
    the loader's processors normalize each extracted value.
    """
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    loader.add_css("Title", ".entry-header h1::text")
    loader.add_value("URL", response.url)
    # Creation date comes from the meta paragraph; processors strip the "·".
    loader.add_xpath(
        "CreateTime", '//p[@class="entry-meta-hide-on-mobile"]/text()')
    loader.add_css("VoteNum", ".vote-post-up h10::text")
    loader.add_css("ArticleComment", "a[href='#article-comment'] span::text")
    loader.add_css("BookMarkNum", ".bookmark-btn::text")
    loader.add_css("Content", "div.entry")

    loaded_item = loader.load_item()
    yield loaded_item
def parse_detail(self, response):
    """Load one article into a JobBoleArticleItem via ArticleItemLoader.

    CSS/value extraction rules are registered on the loader; the item's
    processors normalize them, and the loaded item is yielded to pipelines.

    Fixes: the original ended with a dangling ``\"\"\"`` after ``yield``,
    which opened an unterminated string literal swallowing the code that
    followed; also removed a dead ``JobBoleArticleItem()`` instantiation
    that was immediately shadowed by ``load_item()``.
    """
    # Cover image forwarded via the listing request's meta.
    front_image_url = response.meta.get('front_image_url', '')

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                    response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_css("create_date", ".entry-meta-hide-on-mobile::text")
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("content", "div.entry")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    # List form so the images pipeline can download the cover image.
    item_loader.add_value("front_image_url", [front_image_url])

    # Apply the registered rules and build the item.
    article_item = item_loader.load_item()
    # Pass the item on to the pipelines.
    yield article_item
def parse_detail(self, response):
    """Extract an article's fields through ArticleItemLoader and yield the item.

    Fix: removed a dead ``article_item = JobBoleArticleItem()`` at the top —
    it was unconditionally overwritten by ``load_item()`` — and the large
    commented-out xpath/css extraction experiments.
    """
    # Article cover image, forwarded from the listing request via meta.
    front_image_url = response.meta.get("front_image_url", "")

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                    response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # List form for the images pipeline.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")

    article_item = item_loader.load_item()
    # Yielding the item sends it through the configured pipelines.
    yield article_item
def parse_detail(self, response):
    """Parse one article page with the custom ArticleItemLoader and yield the item.

    Fix: removed a dead ``article_item = JobBoleArticleItem()`` — the name
    was unconditionally rebound by ``load_item()`` — plus the retired
    XPath/CSS extraction drafts kept in comments.
    """
    # Article cover image from the listing request's meta.
    front_image_url = response.meta.get("front_image_url", "")

    # Custom loader: its processors trim/convert each extracted value.
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                    response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    # URL hashed to md5 for a fixed-length unique id.
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Build a JobBoleArticleItem for one article page via the item loader.

    Fix: dropped a dead ``article_item = JobBoleArticleItem()`` (shadowed
    by ``load_item()``) and the superseded manual-extraction code kept in
    comments; the ``comment_nums`` rule already targets the inner
    ``span::text`` (the bare-anchor selector was the broken variant).
    """
    # Cover image passed through meta by the listing callback.
    front_image_url = response.meta.get("front_image_url", "")

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                    response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", ".entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Populate a JobBoleArticleItem through ItemLoader and yield it.

    Fix: removed the dead ``article_item = JobBoleArticleItem()`` at the
    top (overwritten by ``load_item()``) and the commented-out manual
    extraction draft.

    NOTE(review): ``front_image_path`` is seeded with the image *URL* here;
    presumably the images pipeline later replaces it with the local path —
    confirm against the pipeline before relying on it.
    """
    # Cover image URL from the listing request's meta.
    front_image_url = response.meta.get("front_image_url", "")

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                    response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_value("front_image_path", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Scrape one article with CSS selectors, fill the item, and yield it.

    Fixes: regex patterns use raw strings (``"\\d"`` in a plain string is
    an invalid escape sequence on modern Python); removed the trailing dead
    ``pass`` and the commented-out xpath draft; renamed the ``create_data``
    typo local to ``create_date`` (the item key is unchanged).
    """
    article_item = JobBoleArticleItem()

    # Cover image URL from the listing request's meta.
    front_image_url = response.meta.get("front_image_url", "")

    title = response.css(".entry-header h1::text").extract()[0]
    # Date text ends with a "·" separator; strip it off.
    create_date = response.css(
        "p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace(
            "·", "").strip()
    praise_nums = response.css(".vote-post-up h10::text").extract()[0]

    fav_nums = response.css(".bookmark-btn::text").extract()[0]
    match_re = re.match(r".*?(\d+).*", fav_nums)
    if match_re:
        fav_nums = int(match_re.group(1))
    else:
        fav_nums = 0

    # NOTE(review): relies on the comment count being the third
    # ".hide-on-480" text node — fragile; verify against current markup.
    comments_nums = response.css(".hide-on-480::text").extract()[2].strip()
    match_re = re.match(r".*?(\d+).*", comments_nums)
    if match_re:
        comments_nums = int(match_re.group(1))
    else:
        comments_nums = 0

    content = response.css("div.entry").extract()[0]

    tag_list = response.css(".entry-meta-hide-on-mobile a::text").extract()
    # Filter out "N 评论" entries that share the tag selector.
    tag_list = [num for num in tag_list if not num.strip().endswith("评论")]
    tags = ",".join(tag_list)

    article_item["url_object_id"] = get_md5(response.url)
    article_item["title"] = title
    article_item["url"] = response.url
    article_item["create_date"] = create_date
    # List form for the images pipeline.
    article_item["front_image_url"] = [front_image_url]
    article_item["praise_nums"] = praise_nums
    article_item["fav_nums"] = fav_nums
    article_item["comments_nums"] = comments_nums
    article_item["tags"] = tags
    article_item["content"] = content

    yield article_item
def parse_detail(self, response):
    """Extract article fields through ArticleItemLoader and yield the item.

    Fix: removed a dead ``article_item = JobBoleArticleItem()`` at the top
    (always overwritten by ``load_item()``) and the commented-out xpath/css
    drafts. Item keys ``prase_nums`` / ``comments_nums`` are kept as-is:
    they are the item's declared field names (``front_image_path`` is
    filled later in pipelines.py).
    """
    # Article cover image, forwarded via the listing request's meta.
    front_image_url = response.meta.get('front_image_url', '')

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                    response=response)
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_css('prase_nums', '.vote-post-up h10::text')
    item_loader.add_css('fav_nums', '.bookmark-btn::text')
    item_loader.add_css('comments_nums',
                        "a[href='#article-comment'] span::text")
    item_loader.add_css('content', 'div.entry')

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse a single article page via the item-loader and yield the result.

    The loader-based approach keeps all extraction rules in one place,
    which is easier to maintain than filling item fields by hand.
    """
    # Cover image extracted from the Request's meta by the listing callback.
    front_image_url = response.meta.get('front_image_url', '')

    loader = ArticleItemLoad(item=JobBoleArticleItem(), response=response)
    loader.add_css("title", "div.entry-header h1::text")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_css("create_date", 'p.entry-meta-hide-on-mobile::text')
    loader.add_value("front_image_url", [front_image_url])
    loader.add_css("praise_nums", '.vote-post-up h10::text')
    loader.add_css("comments_nums", 'a[href="#article-comment"] span::text')
    loader.add_css(
        "fav_nums",
        '.btn-bluet-bigger.href-style.bookmark-btn.register-user-only::text'
    )
    loader.add_css("tags", 'p.entry-meta-hide-on-mobile a::text')
    loader.add_css("content", '.entry')

    # load_item() is the mandatory final step that applies all rules.
    article_item = loader.load_item()
    yield article_item