def parse_details(self, response):
    """Extract the article fields from a detail page and yield the item.

    The fields are collected through ``ArticleItemLoader``: first the values
    selected with CSS, then the values taken directly from ``response``.

    :param response: detail-page response; ``response.meta`` may carry
        ``front_image_url`` (an empty string is used when it is absent).
    :return: generator yielding the single loaded item.
    """
    # Article cover image.
    cover_url = response.meta.get("front_image_url", "")

    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)

    # (item field, CSS selector) pairs, registered in this order.
    css_fields = (
        ("title", ".entry-header h1::text"),
        ("create_date", ".entry-meta-hide-on-mobile::text"),
        ("tag", ".entry-meta-hide-on-mobile a::text"),
        ("content", "div.entry"),
        ("fav_nums", ".bookmark-btn::text"),
    )
    for field_name, selector in css_fields:
        loader.add_css(field_name, selector)

    loader.add_value("front_image_url", [cover_url])
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))

    # Yielding passes the item on to the pipelines (pipelines.py).
    yield loader.load_item()
def parse_detail(self, response):
    """Load a ``QiushibaikeItem`` from an article detail page and yield it.

    :param response: detail-page response; ``response.meta`` may carry
        ``front_image_url`` (cover image) and ``flag`` (marker value), both
        defaulting to an empty string when absent.
    :return: generator yielding the single loaded item.
    """
    front_image_url = response.meta.get("front_image_url", "")  # cover image
    flag_value = response.meta.get("flag", "")  # marker forwarded via meta

    # NOTE(review): plain concatenation; if the extracted href is absolute
    # or starts with "/", the resulting URL is malformed. Confirm against
    # the page markup before changing.
    source_href = response.css("span.from a::attr(href)").extract_first("")
    site_url = "http://www.tuicool.com/" + source_href

    item_loader = ArticleItemLoader(item=QiushibaikeItem(), response=response)
    item_loader.add_css("title", ".article_row_fluid div:nth-child(1) h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "span.timestamp::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_value("sites", site_url)
    item_loader.add_value("flag", flag_value)
    item_loader.add_css("original", "div.source a::text")
    item_loader.add_css("tags", "span.new-label::text")
    item_loader.add_css("content", "div.article_body")

    yield item_loader.load_item()
def parse_detail(self, response):
    """Load a ``JobBoleArticleItem`` from an article detail page and yield it.

    All fields go through ``ArticleItemLoader``; any post-processing of the
    raw selector results is left to the loader and the item definition.

    :param response: detail-page response; ``response.meta`` may carry
        ``front_image_url`` (an empty string is used when it is absent).
    :return: generator yielding the single loaded item.
    """
    front_image_url = response.meta.get("front_image_url", "")  # cover image

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    # NOTE(review): "create_data" looks like a misspelling of "create_date",
    # but it must match the field name declared on the item. Confirm there.
    item_loader.add_css("create_data", "p.entry-meta-hide-on-mobile ::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("fav_nums", ".bookmark-btn ::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] ::text")
    item_loader.add_css("tags", ".entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", ".entry")

    yield item_loader.load_item()
def parse_detail(self, response):
    """Load a ``JobboleArticleItem`` from an article detail page and yield it.

    :param response: detail-page response; ``response.meta`` may carry
        ``front_img_url`` (an empty string is used when it is absent).
    :return: generator yielding the single loaded item.
    """
    front_img_url = response.meta.get('front_img_url', '')

    # Load the item through the item loader.
    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_css('title', 'div.entry-header > h1::text')
    item_loader.add_css('create_date', '.entry-meta-hide-on-mobile::text')
    item_loader.add_css('praise_num', '.post-adds .vote-post-up h10::text')
    # The raw text of the next two fields still needs a regex to isolate the
    # number; presumably done by the field processors (confirm in items.py).
    item_loader.add_css('fav_num', '.post-adds .bookmark-btn::text')
    item_loader.add_css('comment_num', 'a[href="#article-comment"] span::text')
    # Tags need a processing function as well (see the same caveat above).
    item_loader.add_css('tag', '.entry-meta .entry-meta-hide-on-mobile a::text')
    item_loader.add_css('content', '.entry')
    item_loader.add_value('front_img_url', [front_img_url])
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))

    yield item_loader.load_item()
def parse_detail(self, response):
    """Load a ``JobBoleArticleItem`` from an article detail page and yield it.

    :param response: detail-page response; ``response.meta`` may carry
        ``front_image_url`` (an empty string is used when it is absent).
    :return: generator yielding the single loaded item.
    """
    # Load the item through the item loader.
    front_image_url = response.meta.get("front_image_url", "")  # cover image

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")

    yield item_loader.load_item()
def parse_detail(self, response):
    """Parse an article detail page.

    Loads a ``JobBoleArticleItem`` through ``ArticleItemLoader`` and yields it.

    :param response: detail-page response; ``response.meta`` may carry
        ``front_image_url`` (an empty string is used when it is absent).
    :return: generator yielding the single loaded item.
    """
    front_image_url = response.meta.get("front_image_url", "")

    # Load the item through the item loader.
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_css("create_date", ".entry-meta-hide-on-mobile::text")
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_value("front_image_url", [front_image_url])

    yield item_loader.load_item()
def parse_detail(self, response):
    """Load a ``JobboleArticleItem`` from an article detail page and yield it.

    Uses the project's custom ``ArticleItemLoader`` instead of the stock
    ``ItemLoader``. Per the original author's note, the stock loader leaves
    every value as a list and still needs per-field processing (regex
    extraction etc.); that is addressed in items.py with ``TakeFirst`` and
    ``MapCompose`` on the ``Field()`` definitions.

    :param response: detail-page response; ``response.meta`` may carry
        ``front_image_url`` (an empty string is returned when the key is
        missing).
    :return: generator yielding the single loaded item.
    """
    cover_url = response.meta.get('front_image_url', '')

    loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)

    loader.add_css("title", ".entry-header h1::text")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # The image field has to receive a list.
    loader.add_value("front_image_url", [cover_url])
    loader.add_css("praise_nums", ".vote-post-up h10::text")
    loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    loader.add_css("fav_nums", ".bookmark-btn::text")
    loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    loader.add_css("content", "div.entry")

    # Once yielded, the item is passed on to pipelines.py.
    yield loader.load_item()
def parse_detail(self, response):
    """Load a ``JobBoleArticleItem`` from an article detail page and yield it.

    :param response: detail-page response; ``response.meta`` may carry
        ``front_image_url`` (an empty string is used when it is absent).
    :return: generator yielding the single loaded item.
    """
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)

    # Article cover image.
    cover_url = response.meta.get("front_image_url", "")

    loader.add_css("title", ".entry-header h1::text")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", cutils.get_md5(response.url))
    loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    loader.add_value("front_image_url", [cover_url])

    # Remaining fields are all CSS-selected, registered in this order.
    remaining_css_fields = (
        ("praise_nums", ".vote-post-up h10::text"),
        ("comment_nums", "a[href='#article-comment'] span::text"),
        ("fav_nums", ".bookmark-btn::text"),
        ("tags", "p.entry-meta-hide-on-mobile a::text"),
        ("content", "div.entry"),
    )
    for field_name, selector in remaining_css_fields:
        loader.add_css(field_name, selector)

    article_item = loader.load_item()
    yield article_item
def parse_question(self, response):
    """Load a ``ZhihuQuestionItem`` and schedule the answers request.

    The item is loaded first; the generator then yields a request built from
    ``self.start_answer_url`` (callback ``self.parse_answer``) followed by
    the question item itself.

    :param response: question-page response; ``response.meta`` may carry
        ``question_id`` (an empty string is used when it is absent).
    :return: generator yielding one ``scrapy.Request`` and then the item.
    """
    question_id = response.meta.get("question_id", "")

    loader = ArticleItemLoader(item=ZhihuQuestionItem(), response=response)
    loader.add_css("title", "h1.QuestionHeader-title::text")
    loader.add_css("content", ".QuestionHeader-detail")
    loader.add_value("url", response.url)
    loader.add_value("zhihu_id", question_id)

    # Remaining fields are all CSS-selected, registered in this order.
    for field_name, selector in (
        ("answer_num", ".List-headerText span::text"),
        ("comments_num", ".QuestionHeader-Comment button::text"),
        ("watch_user_num", ".NumberBoard-itemValue::text"),
        ("topics", ".QuestionHeader-topics .Popover div::text"),
    ):
        loader.add_css(field_name, selector)

    loaded_question = loader.load_item()

    # NOTE(review): 20 and 0 are presumably page size and offset of the
    # answers API; confirm against the start_answer_url template.
    answers_url = self.start_answer_url.format(question_id, 20, 0)
    yield scrapy.Request(
        answers_url,
        headers=self.headers,
        callback=self.parse_answer,
    )
    yield loaded_question
def parse_content(self, response):
    """Load a ``JobboleArticleItem`` from an article page and yield it.

    The custom ``ArticleItemLoader`` is used; per the original author's
    note it turns the loaded values from lists into strings.

    :param response: article-page response; ``response.meta`` may carry
        ``front_image_url`` (an empty string is used when it is absent).
    :return: generator yielding the single loaded item.
    """
    # Article cover image.
    cover_url = response.meta.get("front_image_url", "")

    loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)

    loader.add_css("title", ".entry-header h1::text")
    loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")

    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [cover_url])

    # Remaining fields are all CSS-selected, registered in this order.
    for field_name, selector in (
        ("praise_nums", ".vote-post-up h10::text"),
        ("comment_nums", "a[href='#article-comment'] span::text"),
        ("fav_nums", ".bookmark-btn::text"),
        ("tags", "p.entry-meta-hide-on-mobile a::text"),
        ("content", "div.entry"),
    ):
        loader.add_css(field_name, selector)

    yield loader.load_item()
def parse_detail(self, response):
    """Load a ``JobBoleArticleItem`` through the item loader and yield it.

    :param response: detail-page response; ``response.meta`` may carry
        ``front_image_url`` (an empty string is used when it is absent).
    :return: generator yielding the single loaded item.
    """
    cover_url = response.meta.get('front_image_url', '')

    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)

    loader.add_css('title', '.entry-header h1::text')
    loader.add_value('url_object_id', get_md5(response.url))
    loader.add_css('create_date', '.entry-meta-hide-on-mobile::text')
    loader.add_value('url', response.url)
    loader.add_value('front_image_url', [cover_url])

    # Remaining fields are all CSS-selected, registered in this order.
    trailing_css_fields = (
        ('praise_num', '.vote-post-up h10::text'),
        ('fav_num', '.bookmark-btn::text'),
        ('com_num', "a[href='#article-comment'] span::text"),
        ('content', 'div.entry'),
        ('tags', 'p.entry-meta-hide-on-mobile a::text'),
    )
    for field_name, selector in trailing_css_fields:
        loader.add_css(field_name, selector)

    yield loader.load_item()
def parse_detail(self, response):
    """Extract the article fields from a detail page (spider callback).

    The item is populated through ``ArticleItemLoader``, which keeps the
    extraction declarative and configurable.

    :param response: detail-page response; ``response.meta`` may carry
        ``front_image_url`` (an empty string is used when it is absent).
    :return: generator yielding the single loaded ``JobBoleArticleItem``.
    """
    # Article cover image.
    front_image_url = response.meta.get("front_image_url", "")

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # Scrapy's image download expects a list of URLs.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("vote_numbers", ".vote-post-up h10::text")
    item_loader.add_css("bookmark_numbers", ".bookmark-btn::text")
    item_loader.add_css("comment_numbers", "a[href='#article-comment'] span::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    # The whole HTML structure of the body is kept, not just its text.
    item_loader.add_css("content", "div.entry")

    yield item_loader.load_item()
def parse_job(self, response):
    """Load a ``LagouJobItem`` from a job detail page and return it.

    :param response: job-page response the selectors are evaluated against.
    :return: the item produced by ``ArticleItemLoader.load_item()``.
    """
    loader = ArticleItemLoader(item=LagouJobItem(), response=response)

    loader.add_css("title", ".job-name span::text")
    loader.add_value("url", response.url)
    loader.add_css("salary", ".salary::text")

    # Fields read positionally from the spans of the "job_request" block.
    request_span_fields = (
        ("job_city", "//*[@class='job_request']/p/span[2]/text()"),
        ("work_years", "//*[@class='job_request']/p/span[3]/text()"),
        ("degree_need", "//*[@class='job_request']/p/span[4]/text()"),
        ("job_type", "//*[@class='job_request']/p/span[5]/text()"),
    )
    for field_name, xpath in request_span_fields:
        loader.add_xpath(field_name, xpath)

    # Remaining fields are all CSS-selected, registered in this order.
    trailing_css_fields = (
        ("publish_time", ".publish_time::text"),
        ("job_advantage", ".job-advantage p::text"),
        ("job_desc", ".job_bt div"),
        ("job_addr", ".work_addr"),
        ("company_url", "#job_company dt a::attr(href)"),
        ("company_name", "#job_company dt a div h2::text"),
    )
    for field_name, selector in trailing_css_fields:
        loader.add_css(field_name, selector)

    return loader.load_item()