def parse_detail(self, response):
    """Parse an article detail page into a JobBoleArticleItem via ItemLoader.

    Two page layouts are handled: news-list style pages and generic pages.
    The populated item is yielded to the pipeline.
    """
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    # BUG FIX: str.find() returns -1 when the substring is absent (truthy!)
    # and 0 when it starts the string (falsy), so the original condition was
    # effectively inverted. Test substring membership instead.
    if "/Index/newslist" in response.url or "/index" in response.url:
        # Declare CSS extraction rules for the news-list layout.
        item_loader.add_css("title", ".article-title::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "#date-topic::text")
        item_loader.add_css("content", ".article-content")
    else:
        # Fallback rules for the generic layout.
        item_loader.add_css("title", "title::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "title::text")
        item_loader.add_css("content", "body::text")
    # BUG FIX: load_item() was only invoked on the first branch, so the else
    # branch yielded an empty, never-populated item. Build the item once here
    # so both branches produce a filled item.
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse a JobBole article page and yield a populated JobBoleArticleItem.

    Extraction rules are declared on an ItemLoader; all field cleaning is
    expected to live in the loader/item processors.

    DEAD-CODE FIX: the previous version first extracted every field by hand
    (title, create_date, praise/fav/comment counts, tags, content) and filled
    the item, then unconditionally overwrote it with
    ``article_item = item_loader.load_item()`` — discarding all of the manual
    work. The duplicated manual extraction has been removed.
    """
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse a cnblogs news detail page into a CnBlogsArticleItem and yield it."""
    article_item = CnBlogsArticleItem()
    title = response.css("#news_title a::text").extract_first()
    create_date = response.css(".time::text").extract_first()
    # The time node reads like "发布于 2019-01-01 12:34"; pull out the date
    # (group 1) — group 2 (the clock time) is currently unused.
    match_pattern = r".*(\d{4}-\d{2}-\d{2}).*?(\d+:\d+)"
    match_re = re.match(match_pattern, create_date)
    if match_re:
        create_date = match_re.group(1)
    content = response.css('#news_body').extract_first()
    article_item["url_object_id"] = get_md5(response.url)
    article_item["title"] = title
    article_item["url"] = response.url
    try:
        # BUG FIX: the original called datetime.strftime(<str>, ...) which
        # always raises TypeError, and used "%Y/%m/%d" even though the regex
        # above extracts "YYYY-MM-DD" — so every article silently fell back to
        # today's date. Parse with strptime and the matching dash format.
        create_date = datetime.datetime.strptime(create_date, "%Y-%m-%d").date()
    except Exception:
        # Fallback: keep crawling even if the date string is malformed.
        create_date = datetime.datetime.now().date()
    article_item["create_date"] = create_date
    article_item["content"] = content
    yield article_item
def parse_job(self, response):
    """Parse a Lagou job posting page and return the loaded LagouJobItem."""
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    # CSS-based extraction rules. NOTE: "pulish_time" matches the (misspelled)
    # field name declared on the item — do not "fix" it here alone.
    for field, rule in (
        ("title", "div.job-name::attr(title)"),
        ("salary", ".salary::text"),
        ("pulish_time", ".publish_time::text"),
        ("company_name", ".job_company dt img::attr(alt)"),
    ):
        loader.add_css(field, rule)
    # XPath-based extraction rules (positional spans inside .job_request).
    for field, rule in (
        ("job_city", "//*[@class='job_request']/p/span[2]/text()"),
        ("work_years", "//*[@class='job_request']/p/span[3]/text()"),
        ("degree_need", "//*[@class='job_request']/p/span[4]/text()"),
        ("job_type", "//*[@class='job_request']/p/span[5]/text()"),
        ("tags", "//*[@class='position-label clearfix']/li/text()"),
        ("job_advantage", "//*[@class='job-advantage']/p/text()"),
        ("job_desc", "//*[@class='job_bt']/div"),
        ("job_addr", "//*[@class='work_addr']/a/text()"),
        ("company_url", "//*[@class='c_feature']/li/a/@title"),
    ):
        loader.add_xpath(field, rule)
    loader.add_value("crawl_time", datetime.datetime.now())
    loader.add_value("crawl_update_time", datetime.datetime.now())
    return loader.load_item()
def parse_detail(self, response):
    """Parse a JobBole article detail page and yield the loaded item."""
    try:
        # Record a successful detail-page request in the crawler stats.
        # (The stat key's spelling matches existing recorded data.)
        self.crawler.stats.inc_value("ArticleDetail_Success_Reqeust")
    except Exception as e:
        _ = e  # stats are best-effort; never fail the parse over them
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [front_image_url])
    for field, rule in (
        ("title", ".entry-header h1::text"),
        ("create_date", "p.entry-meta-hide-on-mobile::text"),
        ("praise_nums", ".vote-post-up h10::text"),
        ("comment_nums", "a[href='#article-comment'] span::text"),
        ("fav_nums", ".bookmark-btn::text"),
        ("tags", "p.entry-meta-hide-on-mobile a::text"),
        ("content", "div.entry"),
    ):
        loader.add_css(field, rule)
    yield loader.load_item()
def parse_job(self, response):
    """Parse a Lagou job posting into a LaGouItem and return it.

    Locals renamed for clarity: the loader instance was previously called
    ``LaGouArticleItem`` and the loaded item ``ArticleItemLoder``.
    """
    loader = ArticleItemLoader(item=LaGouItem(), response=response)
    loader.add_value("job_url", response.url)
    loader.add_value("job_url_id", get_md5(response.url))
    loader.add_css("job_name", '.job-name::attr(title)')
    loader.add_css("salary", ".salary::text")
    # Positional spans inside dd.job_request carry experience/education/etc.
    loader.add_xpath("job_exp", "//dd[@class='job_request']/p/span[3]/text()")
    loader.add_xpath("edu", "//dd[@class='job_request']/p/span[4]/text()")
    loader.add_xpath("job_type", "//dd[@class='job_request']/p/span[5]/text()")
    loader.add_xpath("work_city", "//dd[@class='job_request']/p/span[2]/text()")
    loader.add_css("company_name", "#job_company .b2::attr(alt)")
    loader.add_css("company_url", ".job_company dt a::attr(href)")
    loader.add_css("work_addr", ".work_addr")
    # loader.add_xpath("feedback", "//div[@class='publisher_data']/div[2]/span[@class='tip']/i/text()")
    loader.add_css("create_date", ".publish_time::text")
    loader.add_css("job_advantage", ".job-advantage p::text")
    loader.add_css("job_desc", ".job_bt div")
    loader.add_css("tag", ".position-label li")
    return loader.load_item()
def parse_job(self, response):
    """Parse a Lagou job posting page and return the loaded LagouJobItem.

    Data cleaning is deliberately delegated to the ItemLoader processors;
    this method only declares extraction rules.
    """
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))  # md5 of the page URL, see jobbole4
    loader.add_value("crawl_time", datetime.now())
    for field, rule in (
        ("title", ".job-name::attr(title)"),
        ("salary", ".job_request .salary::text"),
        ("tags", ".position-label li::text"),
        ("publish_time", ".publish_time::text"),  # raw string; split/convert downstream
        ("job_advantage", ".job-advantage p::text"),
        ("job_desc", ".job_bt div"),
        ("job_addr", ".work_addr"),
        ("company_name", "#job_company dt a img::attr(alt)"),
        ("company_url", "#job_company a::attr(href)"),
    ):
        loader.add_css(field, rule)
    for field, rule in (
        ("job_city", "//*[@class='job_request']/p/span[2]/text()"),
        ("work_years", "//*[@class='job_request']/p/span[3]/text()"),
        ("degree_need", "//*[@class='job_request']/p/span[4]/text()"),
        ("job_type", "//*[@class='job_request']/p/span[5]/text()"),
    ):
        loader.add_xpath(field, rule)
    return loader.load_item()
def parse_detail(self,response):
    """Fill the article item (passed via request meta) from a detail page.

    The page's "contents_info" header is one concatenated text run such as
    "作者:X 来源:Y 发布时间:Z 阅读:N 推荐:M" (author/source/publish-time/
    reads/recommends); each field is carved out of it with regex searches.
    Two layouts exist: with and without a "来源" (source) segment.

    NOTE(review): each field runs the same re.search twice (once to test,
    once to extract) and relies on .group() + .replace() + .split()[0]
    chaining — order-sensitive and fragile; left untouched deliberately.
    """
    # Item pre-created by the caller and handed over through meta.
    article_item = response.meta.get("article_item","")
    re_url = response.url
    re_title = response.xpath('//div[@id="left_content_pages"]/h1[@class="contents_header"]/a/text()').extract_first("")
    # Collect every text node of the info block and flatten to one string.
    re_info = response.xpath('//div[@id="left_content_pages"]/div[@class="contents_info"]//text()').extract()
    re_info = ''.join(re_info)
    if re.search(r'作者:(.*?)来源',re_info):
        # Layout A: an explicit source segment sits between author and publish time.
        re_author = re.search(r'作者:(.*?)来源',re_info).group().replace('作者:','').replace('来源','').split()[0]
        re_source = re.search(r'来源:(.*?)发布时间', re_info).group().replace('来源:', '').replace('发布时间', '').split()[0]
        release_time = re.search(r'发布时间:(.*?)阅读', re_info).group().replace('发布时间:', '').replace('阅读', '').split()[0]
        re_read = re.search(r'阅读:(.*?)推荐', re_info).group().replace('阅读:', '').replace('推荐', '').split()[0]
        re_recommend = response.xpath('//div[@id="btnDetailDigg"]/span/text()').extract()[0]
    else:
        # Layout B: no source segment — author runs straight into publish time.
        re_author = re.search(r'作者:(.*?)发布时间', re_info).group().replace('作者:', '').replace('发布时间', '').split()[0]
        re_source = ""
        release_time = re.search(r'发布时间:(.*?)阅读', re_info).group().replace('发布时间:', '').replace('阅读', '').split()[0]
        re_read = re.search(r'阅读:(.*?)推荐', re_info).group().replace('阅读:', '').replace('推荐', '').split()[0]
        re_recommend = response.xpath('//div[@id="btnDetailDigg"]/span/text()').extract()[0]
    # Body text (all text nodes, joined below) and the first inline image URL.
    re_text = response.xpath('//div[@id="ArticleCnt"]//text()').extract()
    re_images_url = response.xpath('//div[@id="ArticleCnt"]/p/img/@src').extract_first("")
    article_item["re_title"] = re_title
    article_item["re_url"] = re_url
    article_item["re_author"] = re_author
    article_item["re_source"] = re_source
    article_item["release_time"] = release_time
    article_item["re_read"] = re_read
    article_item["re_text"] = "".join(re_text)
    article_item["url_object_id"] = common.get_md5(re_url)
    article_item["re_images_url"] = [re_images_url]
    article_item["re_recommend"] = re_recommend
    # print(re_url,re_info)
    yield article_item
def parse_detail(self, response):
    """Load one LvCha software detail page into a LvChaSoftItem and yield it."""
    front_image_url = response.meta.get("front_image_url", "")  # cover image from listing
    loader = ArticleItemLoader(item=LvChaSoftItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [front_image_url])
    # All page fields are positional XPath rules against the detail layout.
    xpath_rules = (
        ("title", "//div[@id='soft_title']/text()"),
        ("type", "//*[@id='main1k']/div[3]/a[3]/text()"),
        ("size", "//em[@id='ljdx']/text()"),
        ("update_time", "//*[@id='main1k']/div[4]/div[2]/div[2]/div[1]/p[6]/em/text()"),
        ("content", "//*[@class='rjjsbox']/p/text()"),
        ("tag", "//*[@class='fllist clearfix']/p[4]/em/text()"),
        ("fav_nums", "//*[@class='fllist clearfix']/p[5]/em/@class"),
        ("download_urls", "//*[@class='clearfix count_down']/dd/a[1]/@href"),
    )
    for field, rule in xpath_rules:
        loader.add_xpath(field, rule)
    yield loader.load_item()
def detail(self, response):
    """Populate an ArticlespiderItem from an article page via ItemLoader.

    CLEANUP: the previous version carried ~40 lines of fully commented-out
    hand-written xpath/css extraction (dead code, including hard-coded
    post-113778 selectors); it has been deleted. The ItemLoader below is the
    live implementation and is unchanged.
    """
    item_loader = ArticleItemLoder(item=ArticlespiderItem(), response=response)
    item_loader.add_css('title', 'div.entry-header h1::text')
    item_loader.add_css('pubtime', '.entry-meta-hide-on-mobile::text')
    item_loader.add_css('tags', '.entry-meta-hide-on-mobile a::text')
    item_loader.add_css('praise_num', '.post-adds span h10::text')
    item_loader.add_css('fav_num', '.bookmark-btn::text')
    item_loader.add_css('comment_num', 'a[href="#article-comment"] span::text')
    item_loader.add_css('content', 'div.entry p::text')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_value('image_urls', [response.meta.get("image_urls", '')])
    article_item = item_loader.load_item()
    return article_item
def parse_job(self, response):
    """Parse a Lagou job posting page and return the loaded LagouJobItem."""
    # CLEANUP: removed the unused `i = {}` left over from the scrapy template.
    item_loader = LagouItemLoader(item=LagouJobItem(), response=response)
    item_loader.add_xpath('title', '//div[@class="job-name"]/@title')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(url=response.url))
    item_loader.add_xpath('salary', '//span[@class="salary"]/text()')
    # Positional spans inside .job_request: city / experience / degree / type.
    item_loader.add_xpath('job_city', '//*[@class="job_request"]/p/span[2]/text()')
    item_loader.add_xpath('work_years', '//*[@class="job_request"]/p/span[3]/text()')
    item_loader.add_xpath('degree_need', '//*[@class="job_request"]/p/span[4]/text()')
    item_loader.add_xpath('job_type', '//*[@class="job_request"]/p/span[5]/text()')
    item_loader.add_xpath('publish_time', '//*[@class="publish_time"]/text()')
    item_loader.add_css('tags', '.position-label li::text')
    item_loader.add_xpath('job_advantage', '//*[@class="job-advantage"]/p/text()')
    item_loader.add_xpath('job_desc', '//*[@class="job_bt"]/div')
    item_loader.add_xpath('company_url', '//*[@class="job_company"]/dt/a/@href')
    item_loader.add_xpath('company_name', '//*[@class="job_company"]/dt/a/img/@alt')
    job_item = item_loader.load_item()
    return job_item
def parse_job(self, response):
    """Parse a Lagou job posting page and return the loaded LagouJobItem."""
    item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    # BUG FIX: the title selector was the empty string "" and extracted
    # nothing; the job name lives in the title attribute of .job-name
    # (same rule as the sibling Lagou spiders in this project).
    item_loader.add_css("title", ".job-name::attr(title)")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("salary", ".job_request .salary::text")
    # BUG FIX: the XPath ended in "/text" (an element name test, matching
    # nothing) instead of the "text()" node test, so job_city was always empty.
    item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text()")
    # css kept here intentionally for selector practice
    item_loader.add_css("work_years", ".job_request p span:nth-child(3)::text")
    item_loader.add_xpath("degree_need", "//dd[@class='job_request']/p/span[4]/text()")
    item_loader.add_xpath("job_type", "//dd[@class='job_request']/p/span[5]/text()")
    item_loader.add_css("publish_time", ".publish_time::text")
    item_loader.add_css("tags", ".position-label.clearfix li::text")
    item_loader.add_css("job_advantage", ".job-advantage p::text")
    item_loader.add_css("job_desc", ".job_bt div")
    item_loader.add_css("job_addr", ".work_addr")
    item_loader.add_css("company_url", "#job_company dt a::attr(href)")
    item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
    item_loader.add_value("crawl_time", datetime.datetime.now())
    # item_loader.add_css("crawl_update_time", datetime.datetime.now())
    # Assigned to a local (rather than returned inline) for easier debugging.
    job_item = item_loader.load_item()
    return job_item
def parse_detail(self, response):
    """Extract a JobBole article's fields with CSS selectors and yield the item."""
    article_item = JobBoleArticleItem()
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    title = response.css(".entry-header h1::text").extract_first()
    create_date = response.css(".entry-meta-hide-on-mobile::text").extract()[0].strip().replace('·', '').strip()
    praise_nums = response.css(".vote-post-up h10::text").extract_first().strip()
    fav_nums = response.css("span.bookmark-btn::text").extract_first()
    # BUG FIX: the greedy r".*(\d+).*" captures only the LAST digit of a
    # multi-digit count (e.g. "12 收藏" -> 2). Use the lazy prefix as the
    # sibling spiders do.
    match_re = re.match(r".*?(\d+).*", fav_nums)
    fav_nums = int(match_re.group(1)) if match_re else 0
    comment_nums = response.css('a[href="#article-comment"] span::text').extract_first()
    match_re = re.match(r".*?(\d+).*", comment_nums)
    comment_nums = int(match_re.group(1)) if match_re else 0
    content = response.css("div.entry").extract_first()
    # BUG FIX: the original took extract_first() (a single string) and then
    # iterated over its *characters*, and filtered on endswith('\0') instead
    # of the trailing "N 评论" entry. Extract the full tag list, drop the
    # comment-count pseudo-tag, and store the joined result.
    tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
    tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
    tags = ",".join(tag_list)
    article_item["url_object_id"] = get_md5(response.url)
    article_item["title"] = title
    article_item["url"] = response.url
    try:
        create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
    except Exception:
        # Malformed date string: fall back to the crawl date.
        create_date = datetime.datetime.now().date()
    article_item["create_date"] = create_date
    article_item["front_image_url"] = [front_image_url]
    article_item["praise_nums"] = praise_nums
    article_item["fav_nums"] = fav_nums
    article_item["comment_nums"] = comment_nums
    # BUG FIX: previously stored the raw first tag string; store the cleaned join.
    article_item["tags"] = tags
    article_item["content"] = content
    yield article_item  # hand off to pipelines.py
def parse_detail(self, response):
    """Build a JobBoleArticleItem from an article page and yield it to the pipeline."""
    cover_url = response.meta.get("front_image_url", "")  # article cover image
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [cover_url])
    # Declare the CSS extraction rules; the loader's processors do the cleaning.
    for field, rule in (
        ("title", ".entry-header h1::text"),
        ("create_date", "p.entry-meta-hide-on-mobile::text"),
        ("praise_nums", ".vote-post-up h10::text"),
        ("comment_nums", "a[href='#article-comment'] span::text"),
        ("fav_nums", ".bookmark-btn::text"),
        ("tags", "p.entry-meta-hide-on-mobile a::text"),
        ("content", "div.entry"),
    ):
        loader.add_css(field, rule)
    # load_item() resolves every declared rule into the final item.
    yield loader.load_item()
def parse_details(self, response):
    """Extract a news article's fields and yield an ArticlespiderItem."""
    news = ArticlespiderItem()
    img_url = response.meta.get("img_url", "")
    title = response.css(".entry-header h1::text").extract_first("")
    # BUG FIX: the local was named `datetime`, shadowing the datetime module.
    publish_date = response.css(".entry-meta-hide-on-mobile::text").extract_first("").replace(" ·", "").strip()
    praise_num_str = response.css("#114676votetotal::text").extract_first("")
    praise_num = int(praise_num_str) if praise_num_str else 0
    content = response.css(".entry p::text").extract()[0]
    news['title'] = title
    news['datetime'] = publish_date
    news['praise_num'] = praise_num
    news['content'] = content
    news['img_url'] = [img_url]
    # BUG FIX: url_object_id was computed from the *image* URL, so articles
    # without a cover collided on md5(""). Hash the page URL, as every other
    # spider in this project does.
    news['url_object_id'] = get_md5(response.url)
    # BUG FIX: removed a half-finished ItemLoader stub whose bare
    # add_css()/add_xpath()/add_value() calls (no arguments) raised TypeError
    # on every single page before the item could be yielded.
    yield news
def parse_detail(self, response):
    """Parse a cnblogs news detail page into a JobBoleArticleItem and yield it."""
    # CLEANUP: removed the unused post-id extraction — `post_id` was matched
    # from the URL but never read afterwards.
    article_item = JobBoleArticleItem()
    title = response.css("#news_title a::text").extract_first("")
    create_date = response.css("#news_info .time::text").extract_first("")
    # Strip any non-digit prefix (e.g. "发布于 ") from the time string.
    match_re = re.match(r".*?(\d+.*)", create_date)
    if match_re:
        create_date = match_re.group(1)
    content = response.css("#news_content").extract()[0]
    tag_list = response.css(".news_tags a::text").extract()
    tags = ",".join(tag_list)
    article_item["title"] = title
    article_item["create_date"] = create_date
    article_item["content"] = content
    article_item["tags"] = tags
    article_item["url"] = response.url
    # Store the cover image as a (possibly empty) list for the images pipeline.
    front_image_url = response.meta.get("front_image_url", "")
    article_item["front_image_url"] = [front_image_url] if front_image_url else []
    article_item["url_object_id"] = common.get_md5(response.url)
    yield article_item
def parse_job(self, response):
    """Parse a Lagou job posting page and return the loaded LagouJobItem."""
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("crawl_time", datetime.now())
    for field, rule in (
        ("title", ".job-name::attr(title)"),
        ("salary", ".job_request .salary::text"),
        ("publish_time", ".publish_time::text"),
        ("job_advantage", ".job-advantage p::text"),
        ("job_desc", ".job_bt div"),
        ("job_addr", ".work_addr"),
        ("company_name", "#job_company dt a img::attr(alt)"),
        ("company_url", "#job_company dt a::attr(href)"),
    ):
        loader.add_css(field, rule)
    for field, rule in (
        ("job_city", "//*[@class='job_request']/p/span[2]/text()"),
        ("work_years", "//*[@class='job_request']/p/span[3]/text()"),
        ("degree_need", "//*[@class='job_request']/p/span[4]/text()"),
        ("job_type", "//*[@class='job_request']/p/span[5]/text()"),
        ("tags", '//li[@class="labels"]/text()'),
    ):
        loader.add_xpath(field, rule)
    return loader.load_item()
def parse_nums(self, response):
    """Parse the AJAX counters JSON and finish populating the article loader.

    The partially-filled ItemLoader is handed over from the detail parse via
    request meta; this adds the digg/view/comment counts and the object id.
    """
    # The stats endpoint may wrap the JSON in markup; strip tags before parsing.
    stats = json.loads(remove_tags(response.text))
    item_loader = response.meta.get('article_item', '')
    item_loader.add_value('praise_nums', stats['DiggCount'])
    item_loader.add_value('fav_nums', stats['TotalView'])
    item_loader.add_value('comment_nums', stats['CommentCount'])
    item_loader.add_value('url_object_id', common.get_md5(response.meta.get('url', '')))
    yield item_loader.load_item()
def parse_detail(self, response):
    """Parse a cnblogs news page, then request its AJAX counters.

    The partially-filled item travels to parse_news_info via request meta.
    Pages whose URL carries no numeric content id are skipped entirely.
    """
    id_match = re.match(".*/(?P<contentId>[0-9]+)", response.url)
    if not id_match:
        return
    article_item = CnblogsArticleItem()
    article_item["title"] = response.xpath('//div[@id="news_title"]/a/text()').extract_first("")
    news_info_node = response.xpath('//div[@id="news_info"]')
    create_time = news_info_node.xpath('span[@class="time"]/text()').extract_first("")
    # Carve the "YYYY-MM-DD HH:MM" portion out of the raw time text.
    time_match = re.match(".*?(?P<create_time>[0-9:-]+[\s]*?[0-9:-]+).*", create_time)
    if time_match:
        create_time = time_match["create_time"]
    article_item["create_time"] = create_time
    article_item["content"] = response.xpath('//*[@id="news_content"]').extract_first("")
    tag_texts = response.xpath('//*[@id="news_more_info"]/div[@class="news_tags"]/a/text()').extract()
    article_item["tags"] = ",".join(tag_texts)
    article_item["url"] = response.url
    article_item["url_object_id"] = common.get_md5(response.url)
    article_item["front_image_url"] = response.meta.get("front_image_url", "")
    ajax_url = parse.urljoin(response.url, "https://news.cnblogs.com/NewsAjax/GetAjaxNewsInfo?contentId={}".format(id_match["contentId"]))
    yield Request(url=ajax_url, meta={"article_item": article_item}, callback=self.parse_news_info)
def parse_job(self, response):
    """Parse a Lagou job posting page and return the loaded LaGouJobItem."""
    loader = LaGouJobItemLoader(item=LaGouJobItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('url_object_id', get_md5(response.url))
    loader.add_value('crawl_time', datetime.now())
    loader.add_value('crawl_update_time', datetime.now())
    for field, rule in (
        ('title', '.job-name::attr(title)'),
        ('salary', '.job_request .salary::text'),
        ('publish_time', '.publish_time::text'),
        ('job_advantage', '.job-advantage p::text'),
        ('job_desc', '.job_bt div'),
        ('job_addr', '.work_addr'),
        ('company_url', '#job_company dt a::attr(href)'),
        ('company_name', '#job_company dt a img::attr(alt)'),
        ('tags', '.position-label li::text'),
    ):
        loader.add_css(field, rule)
    for field, rule in (
        ('job_city', "//*[@class='job_request']/p/span[2]/text()"),
        ('work_years', "//*[@class='job_request']/p/span[3]/text()"),
        ('degree_need', "//*[@class='job_request']/p/span[4]/text()"),
        ('job_type', "//*[@class='job_request']/p/span[5]/text()"),
    ):
        loader.add_xpath(field, rule)
    return loader.load_item()
def parse_detail(self,response):
    """Parse a JobBole article page via ItemLoader and yield the item.

    CLEANUP: the old hand-written CSS extraction (title, counts, tags,
    content plus the date-parsing try/except) was entirely commented out —
    dead code — and has been removed. The ItemLoader rules below are the
    live implementation and are unchanged.
    """
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    # Declare extraction rules; field cleaning lives in the loader processors.
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    # Resolve all declared rules into the final item object.
    article_item = item_loader.load_item()
    # Hand the populated item to the pipeline.
    yield article_item
def parse_job(self, response):
    """Parse a Lagou job posting page and return the loaded LagouJobItem."""
    item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    item_loader.add_xpath("title", '//div/span[@class="name"]/text()')
    # BUG FIX: url and url_object_id are literal values, not XPath
    # expressions — add_xpath() would try to evaluate the URL/md5 string as
    # XPath and fail at runtime. Use add_value(), as the sibling spiders do.
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_xpath("salary", '//dd/p/span[@class="salary"]/text()')
    # Positional spans: city / experience / degree / job type.
    item_loader.add_xpath("job_city", '//dd/p/span[2]/text()')
    item_loader.add_xpath("work_years", '//dd/p/span[3]/text()')
    item_loader.add_xpath("degree_need", '//dd/p/span[4]/text()')
    item_loader.add_xpath("job_type", '//dd/p/span[5]/text()')
    item_loader.add_xpath("tags", '//dd[@class="job_request"]/ul/li/text()')
    item_loader.add_xpath("publish_time", '//dd/p[@class="publish_time"]/text()')
    item_loader.add_xpath("job_advantage", '//dl/dd/p/text()')
    item_loader.add_xpath("job_desc", '//dd/div/p/text()')
    item_loader.add_xpath("job_addr", '//dd/div[@class="work_addr"]')
    item_loader.add_xpath("company_url", '//dl/dt/a/@href')
    item_loader.add_xpath("company_name", '//dl/dt/a/img/@alt')
    item_loader.add_value("crawl_time", datetime.now())
    job_item = item_loader.load_item()
    return job_item
def parse_job(self, response):
    """Parse a Lagou job posting page and yield the loaded LagouJobItem."""
    loader = LagouItemLoader(item=LagouJobItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("crawl_time", datetime.datetime.now())
    for field, rule in (
        ("title", ".job-name::attr(title)"),
        ("salary", ".job_request p .salary::text"),
        ("tags", ".position-label li::text"),
        ("publish_time", ".publish_time::text"),
        ("job_advantage", ".job-advantage p::text"),
        ("job_desc", ".job_bt div"),
        ("job_addr", ".work_addr"),
        ("company_url", ".c_feature li a::text "),
        ("company_name", "img.b2::attr(alt)"),
    ):
        loader.add_css(field, rule)
    for field, rule in (
        ("job_city", "//*[@class='job_request']/p/span[2]/text()"),
        ("work_years", "//*[@class='job_request']/p/span[3]/text()"),
        ("degree_need", "//*[@class='job_request']/p/span[4]/text()"),
        ("job_type", "//*[@class='job_request']/p/span[5]/text()"),
    ):
        loader.add_xpath(field, rule)
    yield loader.load_item()
def parse_detail(self, response):
    """Load one MOST (科技部) article detail page into a kjjysItem and yield it."""
    type_name = response.meta.get("type_name", "")
    publish_date = response.meta.get("publish_date", "")  # publish time from the listing page
    item_loader = kjjysItemLoader(item=kjjysItem(), response=response)
    image_urls = response.css("#UCAP-CONTENT img::attr(src)").extract()
    content = response.css(".Zoom").extract_first("")
    title = response.meta.get("title", "")
    # Resolve every inline image src to an absolute URL.
    absolute_image_urls = [parse.urljoin(response.url, src) for src in image_urls]
    if not absolute_image_urls:
        # No images on this page: mark the image path explicitly.
        item_loader.add_value("front_image_path", '--')
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    if absolute_image_urls:
        item_loader.add_value("front_image_url", absolute_image_urls)
    # else:
    #     item_loader.add_value("front_image_url", [""])
    item_loader.add_value("source_net", self.start_urls[0])
    item_loader.add_value("source_name", '中华人民共和国科学技术部')
    item_loader.add_value("type_name", type_name)
    item_loader.add_value("title", title)
    item_loader.add_value("content", content)
    item_loader.add_value("publish_time", publish_date)
    item_loader.add_value("crawl_time", datetime.datetime.now())
    yield item_loader.load_item()
def parse_nums(self, response):
    """Merge AJAX-provided counters into the article item carried in meta.

    The stats endpoint returns a JSON object with DiggCount / TotalView /
    CommentCount; those counters are written onto the partially-filled
    item that the previous callback stashed in ``response.meta``.
    """
    stats = json.loads(response.text)
    if not stats:
        # Empty/falsy payload: nothing to merge.
        print("Error here")
        return

    article_item = response.meta.get('article_item', "")
    article_item["praise_nums"] = int(stats["DiggCount"])
    article_item['fav_nums'] = stats['TotalView']
    article_item['comment_nums'] = stats['CommentCount']
    article_item['url_obj_id'] = common.get_md5(article_item['url'])
    yield article_item
def parse_detail(self, response):
    """Extract article fields from a JobBole detail page via XPath rules."""
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)

    loader.add_xpath("title", '//div[@class="entry-header"]/h1/text()')
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_xpath("create_date", "//p[@class='entry-meta-hide-on-mobile']/text()")
    # Cover image URL was attached to the request by the listing-page callback.
    loader.add_value("front_image_url", [response.meta.get("front_image_url", "")])

    for field, xpath in (
        ("praise_nums", "//span[contains(@class, 'vote-post-up')]/h10/text()"),
        ("comment_nums", "//a[@href='#article-comment']/span/text()"),
        ("fav_nums", "//span[contains(@class, 'bookmark-btn')]/text()"),
        ("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()"),
        ("content", "//div[@class='entry']"),
    ):
        loader.add_xpath(field, xpath)

    yield loader.load_item()
def parse_item(self, response):
    """Parse one Lagou job page and return the loaded LagouJobItem."""
    loader = LagouItemLoader(item=LagouJobItem(), response=response)

    loader.add_css("title", '.job-name > .name::text')
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_css("salary", '.job_request .salary::text')

    # span[2..5] under .job_request hold city / experience / degree / type.
    for field, position in (
        ("job_city", 2),
        ("work_years", 3),
        ("degree_need", 4),
        ("job_type", 5),
    ):
        loader.add_xpath(field, '//*[@class="job_request"]/p/span[%d]/text()' % position)

    for field, selector in (
        ("publish_time", '.publish_time::text'),
        ("job_advantage", '.job-advantage p::text'),
        ("job_desc", '.job_bt div::text'),
        ("job_addr", '.work_addr'),
        ("company_name", '#job_company dt a img::attr(alt)'),
        ("company_url", '#job_company dt a::attr(href)'),
        ("tags", '.position-label li::text '),
    ):
        loader.add_css(field, selector)

    loader.add_value("crawl_time", datetime.datetime.now())
    return loader.load_item()
def parse_nums(self, response):
    """Fill in counter fields from the stats AJAX endpoint and finish the item.

    The partially-populated item loader travels in ``response.meta`` under
    the key "article_item"; this callback adds praise/view/comment counters
    from the JSON payload plus the url hash, then yields the loaded item.

    Fix: the previous version carried ~14 lines of dead, commented-out code
    inside a triple-quoted string literal that was built on every call; the
    string (and the dead code) is removed with no behavior change.
    """
    j_data = json.loads(response.text)
    item_loader = response.meta.get("article_item", "")
    item_loader.add_value("praise_nums", j_data["DiggCount"])
    item_loader.add_value("fav_nums", j_data["TotalView"])
    item_loader.add_value("comment_nums", j_data["CommentCount"])
    # NOTE(review): the sibling parse_nums above hashes the item's own 'url'
    # field; this one hashes response.meta["url"] — confirm the caller sets it.
    item_loader.add_value("url_object_id", common.get_md5(response.meta.get("url", "")))
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Extract the concrete fields of one article.

    :type response: HtmlResponse
    :param response: the downloaded article detail page
    :return: yields a loaded JobBoleArticleItem to the pipelines
    """
    # Custom item loader applies the per-field input/output processors.
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)

    loader.add_value('url_object_id', get_md5(response.url))
    loader.add_css('title', '.entry-header h1::text')
    loader.add_value('url', response.url)
    loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
    # Cover image URL forwarded from the listing page via request meta.
    loader.add_value('front_image_url', response.meta.get("front_image_url", ""))

    for field, selector in (
        ('praise_nums', '.vote-post-up h10::text'),
        ('comment_nums', "a[href='#article-comment'] span::text"),
        ('fav_nums', '.bookmark-btn::text'),
        ('tags', 'p.entry-meta-hide-on-mobile a::text'),
        ('content', 'div.entry'),
    ):
        loader.add_css(field, selector)

    yield loader.load_item()
def parse_detail(self, response):
    """Load one JobBole article into a JobBoleArticleItem via the item loader.

    The cover-image URL is forwarded from the listing-page callback through
    ``response.meta``; every other field is extracted from the page with CSS
    rules, and the loaded item is yielded to the pipelines.

    Fix: removed ~35 lines of commented-out manual extraction code and an
    unused ``JobBoleArticleItem()`` instantiation whose binding was
    immediately shadowed by ``item_loader.load_item()`` — no behavior change.
    """
    front_image_url = response.meta.get('front_image_url', '')  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_css('praise_nums', '.vote-post-up h10::text')
    item_loader.add_css('comment_nums', "a[href='#article-comment'] span::text")
    item_loader.add_css('fav_nums', '.bookmark-btn::text')
    item_loader.add_css('tags', 'p.entry-meta-hide-on-mobile a::text')
    item_loader.add_css('content', "div.entry")
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Load one JobBole article into an item using CSS extraction rules."""
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    # Cover image URL attached to the request by the listing-page callback.
    cover_url = response.meta.get("front_image_url", "")

    loader.add_css("title", ".entry-header h1::text")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    loader.add_value("front_image_url", [cover_url])

    for field, selector in (
        ("praise_nums", ".vote-post-up h10::text"),
        ("comment_nums", "a[href='#article-comment'] span::text"),
        ("fav_nums", ".bookmark-btn::text"),
        ("tags", "p.entry-meta-hide-on-mobile a::text"),
        ("content", "div.entry"),
    ):
        loader.add_css(field, selector)

    yield loader.load_item()