def parse_job(self, response):
    """Build a ``LagouJobItem`` from a job detail page.

    Every field is fed into an ``ArticleItemLoader`` bound to ``response``;
    the loader is then asked to produce the finished item.

    Args:
        response: The response for the job page. Its ``url`` is stored on
            the item and its body is queried by the selectors below.

    Returns:
        The item produced by ``load_item()``.
    """
    loader = ArticleItemLoader(item=LagouJobItem(), response=response)

    # Ordered (adder, field, selector-or-value) steps; the order mirrors the
    # order in which the fields are registered on the loader.
    steps = (
        (loader.add_css, "title", ".job-name span::text"),
        (loader.add_value, "url", response.url),
        (loader.add_css, "salary", ".salary::text"),
        # The "job_request" paragraph holds the next four facts in
        # consecutive <span> elements (positions 2 through 5).
        (loader.add_xpath, "job_city",
         "//*[@class='job_request']/p/span[2]/text()"),
        (loader.add_xpath, "work_years",
         "//*[@class='job_request']/p/span[3]/text()"),
        (loader.add_xpath, "degree_need",
         "//*[@class='job_request']/p/span[4]/text()"),
        (loader.add_xpath, "job_type",
         "//*[@class='job_request']/p/span[5]/text()"),
        (loader.add_css, "publish_time", ".publish_time::text"),
        (loader.add_css, "job_advantage", ".job-advantage p::text"),
        # No ``::text`` on the next two selectors, so whole elements are
        # collected -- presumably cleaned up by the item's processors
        # (TODO confirm in the items module).
        (loader.add_css, "job_desc", ".job_bt div"),
        (loader.add_css, "job_addr", ".work_addr"),
        (loader.add_css, "company_url", "#job_company dt a::attr(href)"),
        (loader.add_css, "company_name", "#job_company dt a div h2::text"),
    )
    for add, field, source in steps:
        add(field, source)

    return loader.load_item()
def get_detail_use_item_loader(self, response):
    """Extract an article's fields through an item loader.

    NOTE: the original author remarked that fields collected this way are
    lists; whether they remain lists in the final item depends on the
    processors configured on ``ArticleItemLoader`` / ``JobBoleArticleItem``,
    which are not visible here.

    Args:
        response: The article page response. ``response.meta`` may carry a
            ``"front_image_url"`` entry holding the cover image URL.

    Yields:
        The item produced by the loader's ``load_item()``.
    """
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)

    # Values that come from the request itself rather than from the markup.
    page_url = response.url
    loader.add_value("url", page_url)
    loader.add_value("url_object_id", get_md5(page_url))

    # Article cover image; falls back to "" when the meta key is missing.
    cover_url = response.meta.get("front_image_url", "")
    loader.add_value("front_image_url", [cover_url])

    # Values scraped from the page, as ordered (field, XPath) pairs.
    xpath_fields = (
        # title
        ("title", "//div[@class = 'entry-header']/h1/text()"),
        ("create_date", "//div[@class='entry-meta']/p/text()"),
        # number of upvotes
        ("praise_nums", "//div[@class='post-adds']//h10/text()"),
        # number of bookmarks
        ("fav_nums", "//div[@class='post-adds']/span[2]/text()"),
        # number of comments
        ("comment_nums",
         "//span[@class='btn-bluet-bigger href-style hide-on-480']/text()"),
        # article body (whole element, not just its text nodes)
        ("content", "//div[@class='entry']"),
        # tags
        ("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()"),
    )
    for field, query in xpath_fields:
        loader.add_xpath(field, query)

    yield loader.load_item()
def parse_detail(self, response):
    """Populate a ``JobBoleArticleItem`` from an article detail page.

    The project's custom ``ArticleItemLoader`` is used instead of the stock
    ``ItemLoader``. Any value clean-up (digit extraction, date parsing,
    joining of tags) is presumably done by the loader's / item's processors
    rather than here -- confirm in the items module.

    Args:
        response: The article page response. ``response.meta`` may carry a
            ``'front_image_url'`` entry holding the cover image URL.

    Yields:
        The item produced by the loader's ``load_item()``.
    """
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    add_xpath = loader.add_xpath
    add_value = loader.add_value

    add_xpath('title', '//div[@class="entry-header"]/h1/text()')
    add_xpath('tag', '//div/p[@class="entry-meta-hide-on-mobile"]/a/text()')

    # ``meta.get`` avoids an error when the key is absent (default: '').
    # Per the original author's note, the image URL must be wrapped in a
    # list, otherwise the automatic image downloader configured through the
    # images-URL field setting raises.
    cover_url = response.meta.get('front_image_url', '')
    add_value('front_image_url', [cover_url])
    add_value('url', response.url)

    add_xpath('content', '//div[@class="entry"]/p/text()')
    add_xpath('praise_nums', '//div[@class="post-adds"]/span/h10/text()')
    add_xpath('comment_nums', '//div[@class="post-adds"]/a/span/text()')
    add_xpath('fav_nums', '//div[@class="post-adds"]/span[2]/text()')
    add_xpath('create_date',
              '//div/p[@class="entry-meta-hide-on-mobile"]/text()[1]')

    add_value('url_object_id', common.get_md5(response.url))

    yield loader.load_item()
def parse_detail(self, response):
    """Extract the fields of a single article page into a ``JobboleArticleItem``.

    All fields are registered on an ``ArticleItemLoader`` bound to
    ``response``. Conversions such as turning counts into integers or
    parsing the creation date are not performed here; they are presumably
    handled by the loader's / item's processors -- confirm in the items
    module.

    Args:
        response: The article page response. ``response.meta`` may carry a
            ``"front_img"`` entry holding the cover image URL.

    Yields:
        The item produced by the loader's ``load_item()``.
    """
    # Cover image URL handed over by the listing-page callback; "" if absent.
    image = response.meta.get("front_img", "")

    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)

    # Values taken from the request/response metadata.
    item_loader.add_value('url', response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    # Wrapped in a list: the image field is populated with a list of URLs.
    item_loader.add_value("front_img_url", [image])

    # Values scraped from the page markup.
    item_loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
    item_loader.add_xpath("create_time",
                          "//p[@class='entry-meta-hide-on-mobile']/text()")
    item_loader.add_xpath("praise_num",
                          "//span[contains(@class,'vote-post-up')]/h10/text()")
    item_loader.add_xpath("fav_num",
                          "//span[contains(@class,'bookmark-btn')]/text()")
    item_loader.add_xpath("comment_num",
                          "//a[@href='#article-comment']/span/text()")
    item_loader.add_xpath("content", "//div[@class='entry']//text()")
    item_loader.add_xpath("tags",
                          "//p[@class='entry-meta-hide-on-mobile']/a/text()")

    yield item_loader.load_item()