def parse_content(self, response):
    """Parse a Douban movie detail page and yield a populated DouBanItem.

    Extracts title, director, score, introduction and cover image through
    the project ItemLoader, then pulls area/language/nickname out of the
    free-form ``#info`` text nodes with ``self.info_rule`` and the release
    year out of the page title with ``self.time_rule``.
    """
    item_loader = ArticleItemLoader(item=DouBanItem(), response=response)
    item_loader.add_value("url", response.url)
    item_loader.add_xpath("title", "//div[@id='content']/h1/span[1]/text()")
    item_loader.add_xpath("director", "//div[@id='info']/span[1]/span[2]/a/text()")
    item_loader.add_css("score", "div.rating_self strong::text")
    item_loader.add_xpath("introduction", "//span[@property='v:summary']/text()")
    item_loader.add_xpath("front_image_url", "//*[@id='mainpic']/a/img/@src")

    # The "#info" block is plain text; keep only fragments matching the
    # spider-level info_rule pattern (area / language / nickname).
    infos = response.xpath("//*[@id='info']/text()").extract()
    info_list = []
    for info in infos:
        match_re = re.match(self.info_rule, info.strip())
        if match_re:
            info_list.append(match_re.group(1))

    # extract_first avoids the IndexError that extract()[0] raised on
    # pages without a year span next to the title.
    time = response.xpath("//div[@id='content']/h1/span[2]/text()").extract_first("")
    match_re = re.match(self.time_rule, time)
    if match_re:
        item_loader.add_value("time", match_re.group(1))

    # Guard the positional lookups: some pages omit one of these fields,
    # and the bare info_list[n] indexing used to raise IndexError.
    if len(info_list) > 0:
        item_loader.add_value("area", info_list[0])
    if len(info_list) > 1:
        item_loader.add_value("language", info_list[1])
    if len(info_list) > 2:
        item_loader.add_value("nickname", info_list[2])

    douban_item = item_loader.load_item()
    yield douban_item
def parse_detail(self, response):
    """Populate a JobBoleArticleItem from an article detail page.

    All per-field cleaning is delegated to the processors configured on
    the project's ArticleItemLoader / item definition.
    """
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)

    # Values taken straight from the response object.
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", response.meta.get("front_image_url", ""))

    # Values selected out of the page markup.
    loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
    loader.add_xpath("create_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
    loader.add_xpath("comment_nums", "//a[@href='#article-comment']/span/text()")
    loader.add_css("content", "div.entry")

    yield loader.load_item()
def parse_detail(self, response):
    """Parse a news detail page into an ArticlespiderItem, then chain a
    request for the ajax counters (digg/view/comment) handled by parse_nums.
    """
    match_re = re.match(r".*?(\d+)", response.url)
    if not match_re:
        # Without a numeric post id we cannot build the ajax URL below
        # (previously this fell through and raised NameError on post_id).
        return
    post_id = match_re.group(1)

    item_loader = ArticleItemLoader(item=ArticlespiderItem(), response=response)
    item_loader.add_css("title", "#news_title a::text")
    item_loader.add_css("content", "#news_content")
    item_loader.add_css("tags", ".news_tags a::text")
    item_loader.add_css("create_time", "#news_info .time::text")
    item_loader.add_value("url", response.url)

    article_item = item_loader.load_item()

    # BUG FIX: the meta key was misspelled as "front_image_ur;", so the
    # cover image was always reset to [] after load_item().
    if response.meta.get("front_image_url", ""):
        article_item["front_image_url"] = [response.meta.get("front_image_url", "")]
    else:
        article_item["front_image_url"] = []

    # The counters are rendered by ajax; fetch them asynchronously instead
    # of a blocking requests.get call.  The leading "/" anchors the path
    # at the site root.
    yield Request(
        url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
        meta={"article_item": article_item},
        callback=self.parse_nums,
    )
def parse_detail(self, response):
    """Parse a TuiCool article page and yield a populated TuiCoolArticleItem."""
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    flagTrue = response.meta.get("flag", "")  # flag forwarded by the list page
    # Source link is relative on the page; prefix the site host.
    original = "http://www.tuicool.com/" + response.css(
        "span.from a::attr(href)").extract_first("")

    item_loader = ArticleItemLoader(item=TuiCoolArticleItem(), response=response)
    item_loader.add_css("title", ".article_row_fluid div:nth-child(1) h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "span.timestamp::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_value("sites", original)
    item_loader.add_value("flag", flagTrue)
    item_loader.add_css("original", "div.source a::text")
    item_loader.add_css("tags", "span.new-label::text")
    item_loader.add_css("content", "div.article_body")

    article_item = item_loader.load_item()
    # BUG FIX: the item was built but never emitted, so it never reached
    # the pipelines.  (Also removed a dead TuiCoolArticleItem() that was
    # immediately shadowed.)
    yield article_item
def parse_detail(selfs, response):
    """Load a CnblogsArticleItem (title / create_date / author / url)
    from a post page and hand it to the pipelines.

    View/comment counts are rendered dynamically and are not scraped here.
    """
    loader = ArticleItemLoader(item=CnblogsArticleItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_css("title", "#cb_post_title_url::text")
    loader.add_css("create_date", "#post-date::text")
    loader.add_css("author", ".postDesc a::text")
    yield loader.load_item()
def parse_detail(self, response):
    """Extract a JobBole article (title, date, content, cover) into a
    JobBoleArticleItem via the project's ItemLoader and yield it.
    """
    cover = response.meta.get("front_image_url", "")  # cover image from list page

    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [cover])
    loader.add_css("title", ".entry-header h1::text")
    loader.add_css("date", "p.entry-meta-hide-on-mobile::text")
    loader.add_css("content", "div.entry")

    yield loader.load_item()
def parse_detail(self, response):
    """Fill an ItemLoader for a JoBoleArticleItem from the detail page and
    chain an ajax request for the vote/view/comment counts (parse_nums).
    """
    match_re = re.match(r".*?(\d+)", response.url)
    if not match_re:
        # No numeric id in the URL: the ajax counters URL below cannot be
        # built (previously post_id was unbound and yield raised NameError).
        return
    post_id = match_re.group(1)

    # ArticleItemLoader is the project-defined loader from items.py.
    item_loader = ArticleItemLoader(item=JoBoleArticleItem(), response=response)
    item_loader.add_css("title", "#news_title a::text")
    item_loader.add_css("content", "#news_content")
    item_loader.add_css("tags", ".news_tags a::text")
    item_loader.add_css("time", "#news_info .time::text")
    item_loader.add_value("url", response.url)
    if response.meta.get("front_image_url", []):
        item_loader.add_value("front_image_url", response.meta.get("front_image_url", []))

    # The counts are rendered via ajax; fetch them asynchronously rather
    # than with a blocking requests.get call.
    yield Request(
        url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
        meta={"article_item": item_loader, "url": response.url},
        callback=self.parse_nums,
    )
def parse_detail(self, response):
    """Load an ArticleSpiderItem from the news page, then request the ajax
    counter endpoint and delegate completion to parse_nums.
    """
    id_match = re.match(r".*?(\d+)", response.url)
    if id_match:
        post_id = id_match.group(1)

    loader = ArticleItemLoader(item=ArticleSpiderItem(), response=response)
    loader.add_css("title", "#news_title a::text")
    loader.add_css("create_date", "#news_info .time::text")
    loader.add_css("content", "#news_content")
    loader.add_css("tags", ".news_tags a::text")
    loader.add_value("url", response.url)

    cover = response.meta.get("front_image_url", [])
    if cover:
        loader.add_value("front_image_url", cover)

    ajax_url = parse.urljoin(
        response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id))
    yield Request(
        url=ajax_url,
        meta={"article_item": loader, "url": response.url},
        callback=self.parse_nums,
    )
def parse_detail(self, response):
    """Populate a JobBoleArticleItem loader from the news page and chain
    the ajax request that carries the digg/view/comment counts.
    """
    id_match = re.match(r".*?(\d+)", response.url)
    if id_match:
        post_id = id_match.group(1)

    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_css("title", "#news_title a::text")
    loader.add_css("create_date", "#news_info .time::text")
    loader.add_css("content", "#news_content")
    loader.add_css("tags", ".news_tags a::text")

    img_url = response.meta.get("front_image_url", "")
    if img_url:
        # Protocol-relative image URLs ("//...") need an explicit scheme.
        full_url = img_url if img_url.startswith("http") else "https:" + img_url
        loader.add_value("front_image_url", [full_url])

    yield Request(
        url=parse.urljoin(response.url, f"/NewsAjax/GetAjaxNewsInfo?contentId={post_id}"),
        meta={"article_item": loader, "url": response.url},
        callback=self.parse_nums,
    )
def parse_detail(self, response):
    """Build a SohuItem from an article page and yield it to the pipelines."""
    loader = ArticleItemLoader(item=SohuItem(), response=response)

    # Values forwarded from the listing page or computed here.
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", response.meta.get("front_image_url", ""))
    loader.add_value("article_type", response.meta.get("article_type", ""))
    loader.add_value("crawl_time", datetime.now())

    # Values selected from the page markup.
    loader.add_css("title", ".text-title h1::text")
    loader.add_css("author_name", ".user-info h4 a::text")
    loader.add_css("publish_time", ".article-info span::text")
    loader.add_css("content", "article")

    yield loader.load_item()
def parse_detail(self, response):
    """Fill an ItemLoader from the news detail page, then request the ajax
    endpoint holding the counters and continue in parse_nums.
    """
    id_match = re.match(r".*?(\d+)", response.url)
    if id_match:
        post_id = id_match.group(1)

    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_css("title", "#news_title a::text")
    loader.add_css("create_date", "#news_info .time::text")
    loader.add_css("content", "#news_content")
    loader.add_css("tags", ".news_tags a::text")

    cover = response.meta.get("front_image_url", [])
    if cover:
        loader.add_value("front_image_url", cover)

    # The loader travels in meta; parse_nums finishes loading the counts.
    yield Request(
        url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
        meta={"article_item": loader, "url": response.url},
        callback=self.parse_nums,
    )
def parse_detail(self, response):
    """Load a CnblogsArticleItem from the news page and chain the ajax
    request whose response (the counters) is handled by parse_nums.
    """
    id_match = re.match(r".*?(\d+)", response.url)
    if id_match:
        post_id = id_match.group(1)

    loader = ArticleItemLoader(item=CnblogsArticleItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_css("title", "#news_title a::text")
    loader.add_css("create_time", "#news_info .time::text")
    loader.add_css("content", "#news_content")
    loader.add_css("tags", ".news_tags a::text")

    cover = response.meta.get("front_image_url", [])
    if cover:
        loader.add_value("front_image_url", cover)

    ajax_url = parse.urljoin(
        response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id))
    yield Request(
        url=ajax_url,
        meta={"article_item": loader, "url": response.url},
        callback=self.parse_nums,
    )
def parse_zl(self, response):
    """Parse a ZhiLian job detail page into a ZhiLianItem and return it."""
    loader = ArticleItemLoader(item=ZhiLianItem(), response=response)

    # Job basics from the left-hand detail panel.
    loader.add_css("job_name", '.fixed-inner-box h1::text')
    loader.add_xpath("salary", "//div[@class='terminalpage-left']/ul/li[1]/strong/text()")
    loader.add_xpath("work_city", "//div[@class='terminalpage-left']/ul/li[2]/strong/a/text()")
    loader.add_xpath("create_date", "//div[@class='terminalpage-left']/ul/li[3]/strong")
    loader.add_xpath("job_type", "//div[@class='terminalpage-left']/ul/li[4]/strong/text()")
    loader.add_xpath("job_exp", "//div[@class='terminalpage-left']/ul/li[5]/strong/text()")
    loader.add_xpath("edu", "//div[@class='terminalpage-left']/ul/li[6]/strong/text()")
    loader.add_xpath("tag", "//div[@class='terminalpage-left']/ul/li[8]/strong/a/text()")

    # Company / posting details.
    loader.add_css("company_name", ".inner-left a ::text")
    loader.add_css("company_url", ".inner-left a::attr(href)")
    loader.add_css("work_addr", ".terminalpage-main h2::text")
    loader.add_css("job_advantage", ".welfare-tab-box ::text")
    loader.add_xpath("job_desc", "//div[@class='tab-inner-cont'][1]/p")

    # Identity fields derived from the URL.
    loader.add_value("job_url", response.url)
    loader.add_value("job_url_id", get_md5(response.url))

    return loader.load_item()
def parse_question(self, response):
    """Yield a ZhiHuQuestionItem for this question page, then kick off the
    first answers-API request (limit 20, offset 0).
    """
    question_id = response.meta.get('question_id', 0)

    loader = ArticleItemLoader(item=ZhiHuQuestionItem(), response=response)
    loader.add_value("question_id", question_id)
    loader.add_css('title', "h1.QuestionHeader-title::text")
    loader.add_css("question_detail", ".QuestionHeader-detail")
    loader.add_css("tags", ".Tag-content .Popover div::text")
    yield loader.load_item()

    yield Request(
        self.query_next_answer_url.format(question_id, 20, 0),
        headers=self.headers,
        callback=self.parse_answers,
    )
def parse_content(self, response):
    """Assemble a JobBoleArticleItem with the custom ItemLoader and yield it.

    The project-level ArticleItemLoader / item definitions are responsible
    for flattening the list values and applying per-field processors.
    """
    cover = response.meta.get('front_image_url', '')

    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    loader.add_value('url', response.url)
    # NOTE(review): the raw URL is stored as url_object_id (no hashing) —
    # confirm this is intentional.
    loader.add_value('url_object_id', response.url)
    loader.add_value('front_image_url', cover)
    loader.add_css('title', 'h1::text')
    loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    loader.add_css('content', 'div.entry')

    yield loader.load_item()
def parse_job(self, response):
    """Load an ArticleSpiderItem from a cnblogs post page and yield it.

    load_item() returns list-valued fields; cleaning/conversion is done by
    the processors declared on the item/loader classes.
    """
    cover = response.meta.get("front_image_url", "")  # cover image from list page

    loader = ArticleItemLoader(item=ArticleSpiderItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('url_object_id', get_md5(response.url))
    loader.add_value('front_image_url', [cover])
    loader.add_css('title', '#cb_post_title_url::text')
    loader.add_css('date', '#post-date::text')
    loader.add_css('content', '#cnblogs_post_body')

    yield loader.load_item()
def parse_detail(self, response):
    """Extract one JobBole article into a JobBoleAritcleItem via ItemLoader
    and yield it; date/count parsing lives in the loader's processors.
    """
    cover = response.meta.get("front_image_url", "")  # article cover image

    loader = ArticleItemLoader(item=JobBoleAritcleItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('url_object_id', get_md5(response.url))
    loader.add_value('front_image_url', [cover])
    loader.add_css('title', '.entry-header h1::text')
    loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
    loader.add_css('praise_nums', '.vote-post-up h10::text')
    loader.add_css('fav_nums', 'span.bookmark-btn::text')
    loader.add_css('comment_nums', 'span.hide-on-480::text')
    loader.add_css('content', 'div.entry')

    yield loader.load_item()
def parse_detail(self, response):
    """Parse a JobBole article page into a JobBoleArticleItem and yield it.

    Field cleaning (dates, counts, tag filtering) is delegated to the
    processors configured on ArticleItemLoader / the item definition.
    """
    front_image_url = response.meta.get("front_image_url", "")  # article cover image

    # Removed a dead `article_item = JobBoleArticleItem()` that was
    # immediately shadowed by load_item() below.
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")

    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Load a JobBoleArticleItem from an article page and pass it to the
    pipelines; per-field parsing lives in the ItemLoader's processors.
    """
    # NOTE(review): the meta key is the hyphenated "front-img-url" — must
    # match whatever the listing-page callback put into meta.
    cover = response.meta.get("front-img-url", "")

    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [cover])
    loader.add_xpath("title", "//div[@class='entry-header']/h1/text()")
    loader.add_xpath("creat_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
    loader.add_xpath("praise_num", "//span[contains(@class, 'vote-post-up')]/h10/text()")
    loader.add_xpath("collect_num", "//span[contains(@class, 'bookmark-btn')]/text()")
    loader.add_css("comment_num", ".btn-bluet-bigger.href-style.hide-on-480::text")
    loader.add_xpath("content", "//div[@class='entry']")
    loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")

    yield loader.load_item()
def parse_detail(self, response):
    """Build an ArticleItem from an article detail page using an ItemLoader.

    ArticleItemLoader is the project's ItemLoader subclass (defined in
    items.py); it applies the field processors when load_item() runs.
    """
    # Cover-image URL passed along from the listing page via request meta.
    front_image_url = response.meta.get('format_image', '')
    item_loader = ArticleItemLoader(item=ArticleItem(), response=response)
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_css('post_time', 'p.entry-meta-hide-on-mobile::text')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    # Image pipelines expect a list of URLs.
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_css('praise_nums', '.vote-post-up h10::text')
    item_loader.add_css('fav_nums', '.bookmark-btn::text')
    item_loader.add_css('comment_nums', 'a[href="#article-comment"] span::text')
    item_loader.add_css('tags', 'p.entry-meta-hide-on-mobile a::text')
    item_loader.add_css('content', 'div.entry')
    # load_item() applies the declared processors and yields the final item.
    article = item_loader.load_item()
    yield article
def parse_detail(self, response):
    """Extract one article's fields through an ItemLoader and yield the item.

    Field normalization (first-element selection, input processors such as
    MapCompose chains) is configured on the item fields in items.py, so this
    method only declares where each value comes from.
    """
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    # Cover image forwarded from the listing page via request meta.
    front_image_url = response.meta.get("front_image_url", "")
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # Image pipelines expect a list of URLs.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("content", "div.entry")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    article_item = item_loader.load_item()
    # Pass the item on to the pipelines.
    yield article_item
def parse_detail(self, response):
    """Load a JobBoleArticleItem from the article detail page.

    Uses the project's ArticleItemLoader (an ItemLoader subclass) so that
    extraction, first-element selection and value cleaning are handled by
    the processors declared on the item fields.
    """
    # Cover image URL forwarded from the listing page through request meta.
    front_image_url = response.meta.get("front_image_url", "")
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # Image pipelines expect a list of URLs.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    # Resolve the declared rules into a populated item.
    article_item = item_loader.load_item()
    # Yielded items are routed to the pipelines.
    yield article_item
def parse_detail(self, response):
    """Populate a JobboleArticleItem via ItemLoader and yield it.

    The loader's field processors (defined in items.py) perform the
    cleaning/number-parsing that would otherwise be inline here.
    """
    # Article cover image, forwarded from the listing page via meta.
    front_image_url = response.meta.get("front_image_url", "")
    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # Image pipelines expect a list of URLs.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", "span.bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    article_item = item_loader.load_item()
    # The yielded item is handed to the pipeline classes.
    yield article_item
def parse_detail(self, response):
    """Fill a JobBoleArticleItem through an ItemLoader and yield it.

    ArticleItemLoader (subclass of ItemLoader) takes care of extracting the
    first element of each selected list; the CSS selectors below only say
    where each field's raw value lives on the page.
    """
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    # add_css: first argument is the item field, second the CSS selector.
    item_loader.add_css("title", ".entry-header h1::text")
    # Was added twice in the original; one call is sufficient.
    item_loader.add_css("context", ".entry")
    item_loader.add_css("datetime", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_css("like_num", ".vote-post-up h10::text")
    item_loader.add_css("collect_num", ".bookmark-btn::text")
    # Cover image URL forwarded from the listing page via request meta.
    front_image_url = response.meta.get("front_image_url", "")
    # Values that don't come from the page are filled with add_value.
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    # Image pipelines expect a list of URLs.
    item_loader.add_value("front_image_url", [front_image_url])
    # load_item() resolves the rules declared above into a populated item.
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Extract article fields with the custom ItemLoader and yield the item.

    By default every loaded field is a list; the ArticleItemLoader subclass
    and the field processors in items.py normalize that for later pipeline
    maintenance.
    """
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_css('create_date', ".entry-meta-hide-on-mobile::text")
    item_loader.add_css('praise_nums', '.vote-post-up h10::text')
    item_loader.add_css('fav_nums', '.post-adds span:nth-child(2)::text')
    item_loader.add_css('comment_nums', '.post-adds a span::text')
    item_loader.add_css('tags', '.entry-meta-hide-on-mobile a::text')
    item_loader.add_css('content', '.hentry')
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    # Cover image forwarded via meta; default '' guards against a missing key.
    item_loader.add_value("front_image_url", response.meta.get('front_image_url', ''))
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Load a JobBoleArticleItem from an article detail page via ItemLoader."""
    # Values forwarded by the listing-page request through meta.
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    page_url = response.meta.get("page_url", "")  # URL of the listing page the article came from
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_xpath("title", '//div[@class="entry-header"]/h1/text()')
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_xpath("create_date", '//p[@class="entry-meta-hide-on-mobile"]/text()')
    # Image pipelines expect a list of URLs.
    item_loader.add_value("front_image_url", [front_image_url])
    # FIX: the original passed the literal "-" and left page_url unused;
    # the commented-out manual version assigned page_url, so use it here.
    item_loader.add_value("page_url", page_url)
    item_loader.add_xpath("comment_nums", '//a[@href="#article-comment"]/span/text()')
    item_loader.add_xpath("praise_nums", '//span[@class=" btn-bluet-bigger href-style vote-post-up register-user-only "]/h10/text()')
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_xpath("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    item_loader.add_xpath("content", '//div[@class="entry"]/text()')
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Callback for an article detail page: load fields into the item.

    The loader performs .extract() automatically; cleaning (date parsing,
    digit extraction) is done by the field processors in items.py.
    """
    # Cover image URL forwarded from the listing page via request meta.
    front_image_url = response.meta.get("front_image_url", "")
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_css("createdate", ".entry-meta-hide-on-mobile::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", common.get_md5(response.url))
    # Image pipelines expect a list of URLs.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("thumbs", "span[class*='vote-post-up'] h10::text")
    item_loader.add_css("bookmark", "span[class*='bookmark-btn']::text")
    item_loader.add_css("comments", "div.post-adds a span::text")
    item_loader.add_css("contents", "div.entry")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("author", ".copyright-area > a::text")
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Extract one article's fields with an ItemLoader and emit the item."""
    cover_url = response.meta.get("front_image_url", "")
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    # Values not taken from the page body are filled directly.
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [cover_url])
    # CSS-selected fields, kept in one table for readability.
    css_fields = (
        ("title", ".entry-header h1::text"),
        ("create_date", "p.entry-meta-hide-on-mobile::text"),
        ("praise_nums", ".vote-post-up h10::text"),
        ("comment_nums", "a[href='#article-comment'] span::text"),
        ("fav_nums", ".bookmark-btn::text"),
        ("tags", "p.entry-meta-hide-on-mobile a::text"),
        ("content", "div.entry"),
    )
    for field_name, selector in css_fields:
        loader.add_css(field_name, selector)
    yield loader.load_item()
def parse_detail(self, response):
    """Fill a JobBoleArticleItem through an ItemLoader and yield it.

    Extraction rules are declared here; the field processors on the item
    handle date conversion, digit parsing, and first-element selection.
    """
    # FIX: use .get with a default instead of response.meta["front_image_url"],
    # which raised KeyError when the cover image was not forwarded in meta.
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    # MD5 of the URL is used as a compact unique id.
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # Image pipelines expect a list of URLs.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", "span.vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", "span.bookmark-btn::text")
    item_loader.add_css("tags", ".entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    # Resolve the declared rules into a populated item.
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Populate a JobBoleArticleItem from an article detail page.

    All per-field cleanup (date parsing, digit extraction, tag joining)
    happens in the processors attached to the item fields; this method
    only maps CSS selectors / literal values onto field names.

    :param response: detail-page response; ``response.meta`` may carry
        ``front_image_url`` forwarded from the list page.
    :yields: the populated JobBoleArticleItem.
    """
    # Cover-image URL attached to the request meta by the list-page parser.
    front_image_url = response.meta.get("front_image_url", "")

    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    # Fixed-length hash of the URL serves as the primary key.
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # NOTE(review): the sibling parse_detail passes [front_image_url] (a list)
    # for the images pipeline — confirm which form the front_image_url
    # field processor expects before unifying.
    item_loader.add_value("front_image_url", front_image_url)
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")

    # load_item() applies the field processors and returns the item.
    article_item = item_loader.load_item()
    yield article_item
def parse_question(self, response):
    """Build a ZhihuQuestionItem from a question page and schedule the
    first answers-API request for that question.

    :param response: question-page response; ``response.meta`` may carry
        ``question_id`` set by the caller.
    :yields: a Request for the answers API, then the question item.
    """
    qid = response.meta.get("question_id", "")
    loader = ArticleItemLoader(item=ZhihuQuestionItem(), response=response)

    # Header fields taken straight from the page markup.
    for field, css in (
        ("title", "h1.QuestionHeader-title::text"),
        ("content", ".QuestionHeader-detail"),
    ):
        loader.add_css(field, css)

    loader.add_value("url", response.url)
    loader.add_value("zhihu_id", qid)

    # Remaining selector-driven fields, in declaration order.
    for field, css in (
        ("answer_num", ".List-headerText span::text"),
        ("comments_num", ".QuestionHeader-Comment span::text"),
        ("watch_user_num", ".NumberBoard:first-child .NumberBoard-itemValue::text"),
        ("click_num", ".NumberBoard:last-child .NumberBoard-itemValue::text"),
        ("topics", ".QuestionHeader-topics .Popover div::text"),
    ):
        loader.add_css(field, css)

    question_item = loader.load_item()

    # Kick off the paged answers API (page size 5, offset 0) before
    # emitting the question item itself.
    first_answers_page = self.start_answer_url.format(qid, 5, 0)
    yield scrapy.Request(first_answers_page,
                         headers=self.headers,
                         callback=self.parse_answer)
    yield question_item