def parse_question(self, response):
    """Parse a Zhihu question page into a ZhihuQuestionItem.

    ``question_id`` arrives via request meta from the caller. Yields a
    request for the answers API, then the loaded question item.
    """
    # Extract the concrete question fields from the page.
    question_id = response.meta.get('question_id', '')
    item_loader = ArticleItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_css("title", "h1.QuestionHeader-title::text")
    item_loader.add_css("content", ".QuestionHeader-detail")
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)
    item_loader.add_css("answer_num", ".List-headerText span::text")
    item_loader.add_css("comments_num", ".QuestionHeader-Comment span::text")
    item_loader.add_css(
        "watch_user_num", ".NumberBoard:first-child .NumberBoard-itemValue::text")
    item_loader.add_css(
        "click_num", ".NumberBoard:last-child .NumberBoard-itemValue::text")
    item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    question_item = item_loader.load_item()
    # Request the answers API: (question_id, limit=5, offset=0).
    answer_url = self.start_answer_url.format(question_id, 5, 0)
    yield scrapy.Request(answer_url, headers=self.headers, callback=self.parse_answer)
    yield question_item
def parse_list(self, response):
    """Parse a Biquge book index page: yield the book-level item, then a
    Request per chapter link.

    Cleanups: renamed local ``list`` (shadowed the builtin), removed the
    dead ``post_urls`` accumulator, the redundant ``if list:`` guard
    (iterating an empty list is a no-op) and the trailing ``pass``.
    """
    # Cover image URL, resolved against the page URL.
    post_url = response.xpath('//*[@id="fmimg"]/img/@src').extract_first("")
    front_image_url = parse.urljoin(response.url, post_url)
    # Chapter links for the detail-page crawl below.
    chapter_links = response.xpath('//*[@id="list"]/dl/dd/a/@href').extract()
    # Load book-level fields via the project's customised ItemLoader.
    item_loader = ArticleItemLoader(item=BiQuGeListItem(), response=response)
    item_loader.add_value("url_object_id", get_md5(response.url))  # MD5 id
    item_loader.add_xpath("title", '//*[@id="info"]/h1/text()')  # book title
    item_loader.add_xpath("author", '//*[@id="info"]/p[1]/text()')  # author
    item_loader.add_xpath("last_update_time", '//*[@id="info"]/p[3]/text()')  # last update
    item_loader.add_value("front_image_url", [front_image_url])  # image download link
    article_item = item_loader.load_item()
    yield article_item
    # Crawl every chapter detail page, carrying the book id in meta.
    for each in chapter_links:
        yield Request(
            url=parse.urljoin(response.url, each),
            meta={"url_object_id": article_item['url_object_id']},
            callback=self.parse_details)
def parse_job(self, response):
    """Extract a Lagou job posting into a LaGouItem.

    Returns the loaded item; Scrapy treats the return value as spider
    output.
    """
    loader = ArticleItemLoader(item=LaGouItem(), response=response)
    loader.add_value("job_url", response.url)
    loader.add_value("job_url_id", get_md5(response.url))
    loader.add_css("job_name", '.job-name::attr(title)')
    loader.add_css("salary", ".salary::text")
    # The job_request <p> keeps city/exp/edu/type in fixed span positions.
    loader.add_xpath(
        "work_city", "//dd[@class='job_request']/p/span[2]/text()")
    loader.add_xpath(
        "job_exp", "//dd[@class='job_request']/p/span[3]/text()")
    loader.add_xpath(
        "edu", "//dd[@class='job_request']/p/span[4]/text()")
    loader.add_xpath(
        "job_type", "//dd[@class='job_request']/p/span[5]/text()")
    loader.add_css("company_name", "#job_company .b2::attr(alt)")
    loader.add_css("company_url", ".job_company dt a::attr(href)")
    loader.add_css("work_addr", ".work_addr")
    loader.add_css("create_date", ".publish_time::text")
    loader.add_css("job_advantage", ".job-advantage p::text")
    loader.add_css("job_desc", ".job_bt div")
    loader.add_css("tag", ".position-label li")
    return loader.load_item()
def parse_question(self, response):
    """Parse a V2EX topic page into a V2exQuItem.

    Content comes from the rendered markdown body when present, otherwise
    the raw topic content; the comment count defaults to 0 when the
    counter cell is absent.
    """
    question_loader = ArticleItemLoader(item=V2exQuItem(), response=response)
    question_loader.add_xpath("title", "//div[@class='header']/h1/text()")
    content = ""
    # Prefer the markdown-rendered body; fall back to the plain topic body.
    mar_content = response.xpath("//div[@class='markdown_body']").extract()
    if len(mar_content) == 0:
        content = "".join(response.xpath("//div[@class='topic_content']").extract()).replace("\n", "")
    else:
        content = "".join(mar_content).replace("\n", "")
    # content_rule strips the wrapping markup; only add when it matches.
    match_re1 = re.match(self.content_rule, content)
    if match_re1:
        question_loader.add_value("content", match_re1.group(1))
    comment_count = response.xpath("//div[@class='cell']/span[@class='gray']/text()").extract()
    if len(comment_count) == 0:
        question_loader.add_value("comment_count", 0)
    else:
        match_re2 = re.match(self.comment_rule, comment_count[0])
        if match_re2:
            question_loader.add_value("comment_count", match_re2.group(1))
    # NOTE(review): user_id is randomised (2..14) rather than scraped —
    # presumably to attach posts to seeded local users; confirm intent.
    question_loader.add_value("user_id", random.randint(2, 14))
    question_loader.add_value("created_date", time.time())
    question_item = question_loader.load_item()
    yield question_item
    pass
def parse_detail(self, response):
    """Parse a cnblogs article page into a CnblogsArticleItem.

    Fixes: the first parameter was misspelled ``selfs``; removed a dead
    ``CnblogsArticleItem()`` instantiation (immediately overwritten by
    ``load_item()``) and the superseded manual-extraction comments.
    """
    # Load the item through the project's customised ItemLoader.
    item_loader = ArticleItemLoader(item=CnblogsArticleItem(), response=response)
    item_loader.add_css("title", '#cb_post_title_url::text')
    item_loader.add_css('create_date', '#post-date::text')
    item_loader.add_css('author', '.postDesc a::text')
    item_loader.add_value('url', response.url)
    article_item = item_loader.load_item()
    yield article_item
def parse(self, response):
    """Queue a fixed SZSE PDF for download via the ``file_urls`` field.

    NOTE(review): ``res_json`` is parsed but never used, and ``url_list``
    is hard-coded — this looks like a placeholder; presumably the list
    should be built from the JSON payload. Confirm before relying on it.
    """
    res_json = json.loads(response.text)[0]["data"]
    item_loader = ArticleItemLoader(item=PDFItem(), response=response)
    url_list = [
        "http://reportdocs.static.szse.cn/UpFiles/fxklwxhj/CDD00079356200.pdf"
    ]
    item_loader.add_value("file_urls", url_list)
    pdf_item = item_loader.load_item()
    yield pdf_item
def parse_detail(self, response):
    """Parse a news detail page, then chain a request for its counters.

    BUG FIX: the post-load check used the misspelled meta key
    ``"front_image_ur;"`` (semicolon instead of ``l``), which never
    exists, so ``front_image_url`` was always reset to ``[]`` even when
    a cover image had been passed in. The correct key is used now.
    """
    # Numeric post id from the URL, needed for the counters AJAX endpoint.
    match_re = re.match(".*?(\d+)", response.url)
    if match_re:
        post_id = match_re.group(1)
    # NOTE(review): if the URL carries no digits, post_id stays unbound and
    # the Request below raises NameError — same as the original behaviour.
    item_loader = ArticleItemLoader(item=ArticlespiderItem(), response=response)
    item_loader.add_css("title", "#news_title a::text")
    item_loader.add_css("content", "#news_content")
    item_loader.add_css("tags", ".news_tags a::text")
    item_loader.add_css("create_time", "#news_info .time::text")
    item_loader.add_value("url", response.url)
    front_image_url = response.meta.get('front_image_url', "")
    if front_image_url:
        item_loader.add_value('front_image_url', front_image_url)
    article_item = item_loader.load_item()
    # Normalise the cover field to a list for the image pipeline.
    article_item['front_image_url'] = [front_image_url] if front_image_url else []
    # Async Request replaces a blocking requests.get() to the same endpoint.
    yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                  meta={"article_item": article_item},
                  callback=self.parse_nums)
def parse_detail(self, response):
    """Yield one GZZBItem per table row of the ``.ge2_content`` table.

    Robustness fix: rows are only processed when they contain at least
    two ``<td>`` cells — the original indexed ``person[1]`` and raised
    IndexError on malformed/short rows.
    """
    person_nodes = response.css(".ge2_content tr")
    for person_node in person_nodes:
        person = person_node.css("td::text").extract()
        # Skip header/empty/short rows instead of crashing.
        if len(person) >= 2:
            item_loader = ArticleItemLoader(item=GZZBItem(), response=response)
            item_loader.add_value("code", person[0])
            item_loader.add_value("name", person[1])
            item_loader.add_value("date", "2017-05")  # fixed crawl period
            gzzb_item = item_loader.load_item()
            yield gzzb_item
def parse_detail(self, response):
    """Parse a cnblogs news page and chain an async request for the
    digg/view/comment counters.

    The ItemLoader itself (not the loaded item) travels in meta so that
    ``parse_nums`` can keep adding fields before calling ``load_item()``.
    """
    # Numeric post id from the URL, used for the counters AJAX endpoint.
    match_re = re.match(".*?(\d+)", response.url)
    if match_re:
        post_id = match_re.group(1)
    # NOTE(review): if the URL has no digits, post_id is unbound and the
    # Request below raises NameError — confirm routed URLs always carry an id.
    item_loader = ArticleItemLoader(item=CnblogsArticleItem(), response=response)
    item_loader.add_xpath('title', "//*[@id='news_title']//a/text()")
    item_loader.add_xpath('content', "//*[@id='news_content']")
    item_loader.add_xpath('tags', "//*[@class='news_tags']//a/text()")
    item_loader.add_xpath('create_date', "//*[@id='news_info']//*[@class='time']/text()")
    item_loader.add_value('url', response.url)
    # Added unconditionally, even when meta has no cover (empty string).
    item_loader.add_value('front_image_url', response.meta.get('front_image_url', ''))
    # Async Request instead of a blocking requests.get() — would block the reactor.
    yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                  meta={"item_loader": item_loader, "url_object_id": get_md5(response.url)},
                  callback=self.parse_nums)
def parse_detail(self, response):
    """Parse a jobbole article page into a JobbboleItem via ItemLoader.

    Cleanups: removed an unused ``JobbboleItem()`` instantiation
    (overwritten by ``load_item()``), a no-op URL check
    (``if url == all-posts: pass``), and the superseded manual-extraction
    comment block.
    """
    item_loader = ArticleItemLoader(item=JobbboleItem(), response=response)
    item_loader.add_xpath("title", '//*[@class="entry-header"]/h1/text()')
    item_loader.add_value("url", response.url)
    item_loader.add_xpath("create_date", '//*[@class="entry-meta"]/p/text()')
    item_loader.add_xpath(
        "praise_nums", "//span[contains(@class,'vote-post-up')]/h10/text()")
    item_loader.add_xpath(
        "book_mark", "//span[contains(@class,'bookmark-btn')]/text()")
    item_loader.add_xpath("comment_nums",
                          "//a[@href='#article-comment']/span/text()")
    item_loader.add_xpath("content",
                          "//*[@class='entry']/p/text()")
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse a cnblogs news page, then chain a request to the counters API.

    The loader travels in meta under ``"article_item"`` so ``parse_nums``
    can finish populating it. Cleanups: removed a dead triple-quoted
    block of superseded extraction code and the commented synchronous
    ``requests.get`` experiment.
    """
    # Numeric post id from the URL — needed for the counters endpoint.
    match_re = re.match(".*?(\d+)", response.url)
    if match_re:
        post_id = match_re.group(1)
    item_loader = ArticleItemLoader(item=CdnBlogArtcleItem(), response=response)
    item_loader.add_xpath("title", '//*[@id="news_title"]//a/text()')
    item_loader.add_xpath("create_date", '//div[@id="news_info"]//span[@class="time"]/text()')
    item_loader.add_xpath("content", '//div[@id="news_content"]')
    item_loader.add_xpath("tags", '//div[@class="news_tags"]//a/text()')
    item_loader.add_value("url", response.url)
    if response.meta.get("front_image_url", []):
        item_loader.add_value("front_image_url", response.meta.get("front_image_url", []))
    # Async Request replaces a blocking requests.get() to the same endpoint;
    # the endpoint returns e.g. {"ContentID":..,"CommentCount":..,"TotalView":..,"DiggCount":..}.
    yield Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)),
                  meta={"article_item": item_loader, "url": response.url},
                  callback=self.parse_nums)
def parse_content(self, response):
    """Parse a Douban movie page into a DouBanItem.

    Robustness fixes: ``extract()[0]`` on the release-time span replaced
    with ``extract_first("")`` (was IndexError when absent), and the
    positional ``info_list`` fields are only added when all three are
    present. Also renamed the local ``time`` variable, which shadowed
    the ``time`` module.
    """
    item_loader = ArticleItemLoader(item=DouBanItem(), response=response)
    item_loader.add_value("url", response.url)
    item_loader.add_xpath("title", "//div[@id='content']/h1/span[1]/text()")
    item_loader.add_xpath("director", "//div[@id='info']/span[1]/span[2]/a/text()")
    item_loader.add_css("score", "div.rating_self strong::text")
    item_loader.add_xpath("introduction", "//span[@property='v:summary']/text()")
    item_loader.add_xpath("front_image_url", "//*[@id='mainpic']/a/img/@src")
    # The #info block is free text; info_rule picks out the values in order.
    # assumes the matches appear as [area, language, nickname] — TODO confirm.
    info_list = []
    for info in response.xpath("//*[@id='info']/text()").extract():
        match_re = re.match(self.info_rule, info.strip())
        if match_re:
            info_list.append(match_re.group(1))
    release_time = response.xpath("//div[@id='content']/h1/span[2]/text()").extract_first("")
    match_re = re.match(self.time_rule, release_time)
    if match_re:
        item_loader.add_value("time", match_re.group(1))
    if len(info_list) >= 3:
        item_loader.add_value("area", info_list[0])
        item_loader.add_value("language", info_list[1])
        item_loader.add_value("nickname", info_list[2])
    douban_item = item_loader.load_item()
    yield douban_item
def parse_detail(self, response):
    """Parse a jobbole article page into a JobboleArticleItem.

    Cleanup: removed a dead ``JobboleArticleItem()`` instantiation that
    was immediately overwritten by ``load_item()``.
    """
    # Cover image passed along by the list-page parser.
    front_image_url = response.meta.get('front_image_url', "")
    item_loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    item_loader.add_css("title", '.entry-header>h1::text')
    item_loader.add_value("url", response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
    item_loader.add_value('front_image_url', [front_image_url])
    # Some posts have no praise counter node; default the field to "0".
    if response.css('.vote-post-up>h10::text'):
        item_loader.add_css("praise_number", '.vote-post-up>h10::text')
    else:
        item_loader.add_value("praise_number", "0")
    item_loader.add_css("comment_nums", 'a[href="#article-comment"]>span::text')
    item_loader.add_css('fav_nums', '.bookmark-btn::text')
    item_loader.add_css('tags', 'p.entry-meta-hide-on-mobile>a::text')
    item_loader.add_css('content', 'div.entry')
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse an article page, choosing selectors by URL shape.

    BUG FIXES: ``str.find()`` returns -1 when absent (truthy) and 0 when
    the substring starts the string (falsy), so the original
    ``if url.find(a) or url.find(b):`` was almost always True regardless
    of the URL — compare against -1 explicitly. Also, the else branch
    never called ``load_item()``, so an empty item was yielded;
    ``load_item()`` now runs after either branch.
    """
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    if response.url.find("/Index/newslist") != -1 or response.url.find("/index") != -1:
        # News-list style page: dedicated title/date/content markup.
        item_loader.add_css("title", ".article-title::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "#date-topic::text")
        item_loader.add_css("content", ".article-content")
    else:
        # Fallback: generic <title>/<body> extraction.
        item_loader.add_css("title", "title::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "title::text")
        item_loader.add_css("content", "body::text")
    # Populate the item from whichever rules were added, then hand it to pipelines.
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Collect article fields into an ItemLoader and hand it to
    ``parse_nums`` along with a request for the counters API.
    """
    # Numeric post id from the URL, needed for the AJAX endpoint.
    id_match = re.match(".*?(\d+)", response.url)
    if id_match:
        post_id = id_match.group(1)
    loader = ArticleItemLoader(item=ArticleSpiderItem(), response=response)
    loader.add_css('title', '#news_title a::text')
    loader.add_css('create_date', '#news_info .time::text')
    loader.add_css('content', '#news_content')
    loader.add_css('tags', '.news_tags a::text')
    loader.add_value('url', response.url)
    # Only attach the cover when the list page actually supplied one.
    cover = response.meta.get('front_image_url', [])
    if cover:
        loader.add_value('front_image_url', cover)
    counters_url = parse.urljoin(
        response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id))
    yield Request(url=counters_url,
                  meta={"article_item": loader, "url": response.url},
                  callback=self.parse_nums)
def parse_detail(self, response): match_re = re.match(".*?(\d+)", response.url) # 先判断url有没有id if match_re: post_id = match_re.group(1) # 此处提取post_id来给下面的的id赋值 # article_Item = JoBoleArticleItem() # title = response.css("#news_title a::text").extract_first("") # time = response.css("#news_info .time::text").extract_first("") # match_re = re.match(".*?(\d+.*)", time) # if match_re: # time = match_re.group(1) # content = response.css("#news_content").extract()[0] # tag_list = response.css(".news_tags a::text").extract() # # # title = response.xpath("//*[@id='news_title']//a/text()").extract_first("") # # time = response.xpath("//*[@id='news_info']//*[@class='time']/text()").extract_first("") # # content = response.xpath("//*[@id='news_content']").extract()[0] # # tag_list = response.xpath("//*[@class='news_tags']//a/text()").extract() # tags = ",".join(tag_list) # # # # # html = requests.get(parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)))#注意因为requests是同步的而这里用的是一个异步的方法,所以换异步的方法实现, 得加/ # # j_data = json.loads(html.text) # # article_Item["title"] = title#这里面一定要确保key是在item里面存在的 # article_Item["time"] = time # article_Item["tags"] = tags # article_Item["content"] = content # article_Item["url"] = response.url # if response.meta.get("front_image_url", ""): # article_Item["front_image_url"] = [response.meta.get("front_image_url","")]#可以通过此方法获得传递过来的图片值 # else: # article_Item["front_image_url"] = [] item_loader = ArticleItemLoader(item=JoBoleArticleItem(), response=response) # 记住这里的ArticleItemLoader类为items里自定义的方法 item_loader.add_css("title", "#news_title a::text") item_loader.add_css("content", "#news_content") item_loader.add_css("tags", ".news_tags a::text") item_loader.add_css("time", "#news_info .time::text") item_loader.add_value("url", response.url) if response.meta.get("front_image_url", []): item_loader.add_value("front_image_url", response.meta.get("front_image_url", [])) # 用yeid把上方法换成异步 yield 
Request(url=parse.urljoin(response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id)), meta={"article_item": item_loader, "url": response.url}, callback=self.parse_nums)
def parse_question(self, response):
    """Load a Zhihu question into a ZhiHuQuestionItem, then request the
    first page of its answers.
    """
    question_id = response.meta.get('question_id', 0)
    loader = ArticleItemLoader(item=ZhiHuQuestionItem(), response=response)
    loader.add_css('title', "h1.QuestionHeader-title::text")
    loader.add_value("question_id", question_id)
    loader.add_css("question_detail", ".QuestionHeader-detail")
    loader.add_css("tags", ".Tag-content .Popover div::text")
    yield loader.load_item()
    # Answers API pagination: (question_id, limit=20, offset=0).
    yield Request(self.query_next_answer_url.format(question_id, 20, 0),
                  headers=self.headers,
                  callback=self.parse_answers)
def parse_detail(self, response):
    """Build a SohuItem from an article detail page and yield it."""
    # Values handed over from the list-page parser via meta.
    front_image_url = response.meta.get("front_image_url", "")
    article_type = response.meta.get("article_type", "")
    loader = ArticleItemLoader(item=SohuItem(), response=response)
    loader.add_css("title", ".text-title h1::text")
    loader.add_value("url", response.url)
    loader.add_value("front_image_url", front_image_url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("article_type", article_type)
    loader.add_css("author_name", ".user-info h4 a::text")
    loader.add_css("publish_time", ".article-info span::text")
    loader.add_css("content", "article")
    loader.add_value("crawl_time", datetime.now())  # crawl timestamp
    yield loader.load_item()
def parse_detail(self, response):
    """Fill an ItemLoader with the article fields and forward it to
    ``parse_nums`` together with a counters-API request.
    """
    id_match = re.match(".*?(\d+)", response.url)
    if id_match:
        post_id = id_match.group(1)  # numeric id for the AJAX endpoint
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    loader.add_css("title", "#news_title a::text")
    loader.add_css("content", "#news_content")
    loader.add_css("tags", ".news_tags a::text")
    loader.add_css("create_date", ".time::text")
    loader.add_value("url", response.url)
    # Attach the cover only when the list page supplied one.
    cover = response.meta.get("front_image_url", [])
    if cover:
        loader.add_value("front_image_url", cover)
    counters_url = parse.urljoin(
        response.url, "/NewsAjax/GetAjaxNewsInfo?contentId={}".format(post_id))
    yield Request(url=counters_url,
                  meta={"article_item": loader, "url": response.url},
                  callback=self.parse_nums)
def parse_detail(self, response):
    """Load a Jianshu article (title, url, cover image) and yield it."""
    # Cover image URL handed over from the list-page parser.
    front_end_url = response.meta["front_end_url"]
    loader = ArticleItemLoader(item=JianShuArticlespiderItem(), response=response)
    loader.add_xpath("title", "//div[@class='post']/div[@class='article']/h1[@class='title']/text()")
    loader.add_value("url", response.url)
    loader.add_value("front_image_url", [front_end_url])
    yield loader.load_item()
def parse_detail(self, response):
    """Populate a JobboleArticleItem from an article page via ItemLoader."""
    loader = ArticleItemLoader(item=JobboleArticleItem(), response=response)
    loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
    loader.add_value('url', response.url)
    loader.add_value('url_object_id', get_md5(response.url))
    loader.add_xpath(
        'create_date', '//p[@class="entry-meta-hide-on-mobile"]/text()')
    # Cover image comes from the list page via meta.
    cover = response.meta.get("front_image_url", "")
    loader.add_value('front_image_url', [cover])
    loader.add_xpath('comment_nums',
                     '//a[@href="#article-comment"]/span/text()')
    # Posts without bookmarks have no counter node; treat that as "0".
    bookmark_count = response.xpath(
        '//div[@class="post-adds"]/span[2]/h10/text()').extract_first()
    loader.add_value('fav_nums', bookmark_count if bookmark_count is not None else '0')
    loader.add_xpath('tags', '//div[@class="entry-meta"]/p/a/text()')
    loader.add_xpath('content', '//div[@class="entry"]')
    yield loader.load_item()
def parse_detail(self, response):
    """Parse a tuicool article page into a TuiCoolArticleItem.

    BUG FIX: the original loaded the item but never yielded or returned
    it, so nothing ever reached the pipelines. The loaded item is now
    yielded. Also removed a dead ``TuiCoolArticleItem()`` instantiation.
    """
    front_image_url = response.meta.get("front_image_url", "")  # cover image
    flagTrue = response.meta.get("flag", "")  # flag handed over by the list page
    # Absolute source link rebuilt from the relative "from" href.
    original = "http://www.tuicool.com/" + response.css(
        "span.from a::attr(href)").extract_first("")
    item_loader = ArticleItemLoader(item=TuiCoolArticleItem(), response=response)
    item_loader.add_css("title", ".article_row_fluid div:nth-child(1) h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "span.timestamp::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_value("sites", original)
    item_loader.add_value("flag", flagTrue)
    item_loader.add_css("original", "div.source a::text")
    item_loader.add_css("tags", "span.new-label::text")
    item_loader.add_css("content", "div.article_body")
    article_item = item_loader.load_item()
    yield article_item
def parse_zl(self, response):
    """Extract a Zhilian job posting into a ZhiLianItem.

    Returns the loaded item; Scrapy treats the return value as output.
    """
    loader = ArticleItemLoader(item=ZhiLianItem(), response=response)
    loader.add_value("job_url", response.url)
    loader.add_value("job_url_id", get_md5(response.url))
    loader.add_css("job_name", '.fixed-inner-box h1::text')
    # terminalpage-left <ul> holds the facts in fixed <li> positions.
    loader.add_xpath("salary", "//div[@class='terminalpage-left']/ul/li[1]/strong/text()")
    loader.add_xpath("work_city", "//div[@class='terminalpage-left']/ul/li[2]/strong/a/text()")
    loader.add_xpath("create_date", "//div[@class='terminalpage-left']/ul/li[3]/strong")
    loader.add_xpath("job_type", "//div[@class='terminalpage-left']/ul/li[4]/strong/text()")
    loader.add_xpath("job_exp", "//div[@class='terminalpage-left']/ul/li[5]/strong/text()")
    loader.add_xpath("edu", "//div[@class='terminalpage-left']/ul/li[6]/strong/text()")
    loader.add_xpath("tag", "//div[@class='terminalpage-left']/ul/li[8]/strong/a/text()")
    loader.add_css("company_name", ".inner-left a ::text")
    loader.add_css("company_url", ".inner-left a::attr(href)")
    loader.add_css("work_addr", ".terminalpage-main h2::text")
    loader.add_css("job_advantage", ".welfare-tab-box ::text")
    loader.add_xpath("job_desc", "//div[@class='tab-inner-cont'][1]/p")
    return loader.load_item()
def parse_detail(self, response):
    """Parse a JobBole article page with ItemLoader and yield the item.

    BUG FIX: the previous version first extracted every field by hand into
    ``article_item``, then threw all of that away when ``load_item()``
    reassigned it. Besides being dead code, the manual path used
    ``extract()[0]`` which raises IndexError on pages missing a field.
    Only the ItemLoader path (whose result was actually yielded) is kept.
    """
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse a LvCha software detail page and yield a LvChaSoftItem.

    Removed the unused ``article_item = LvChaSoftItem()`` local that was
    immediately overwritten by ``load_item()``.
    """
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = ArticleItemLoader(item=LvChaSoftItem(), response=response)
    item_loader.add_xpath("title", "//div[@id='soft_title']/text()")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_xpath("type", "//*[@id='main1k']/div[3]/a[3]/text()")
    item_loader.add_xpath("size", "//em[@id='ljdx']/text()")
    item_loader.add_xpath(
        "update_time",
        "//*[@id='main1k']/div[4]/div[2]/div[2]/div[1]/p[6]/em/text()")
    item_loader.add_xpath("content", "//*[@class='rjjsbox']/p/text()")
    item_loader.add_xpath("tag", "//*[@class='fllist clearfix']/p[4]/em/text()")
    # NOTE(review): fav_nums is scraped from a @class attribute — presumably
    # the class name encodes the count; confirm against the item processor.
    item_loader.add_xpath("fav_nums", "//*[@class='fllist clearfix']/p[5]/em/@class")
    item_loader.add_xpath(
        "download_urls",
        "//*[@class='clearfix count_down']/dd/a[1]/@href")
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse a JobBole article page and yield the populated item.

    The ~40-line commented-out manual-extraction block that preceded the
    ItemLoader logic has been deleted: it duplicated the loader exactly and
    only added noise. The live logic is unchanged.
    """
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    # Each rule below is parsed lazily; values materialize on load_item().
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    # Resolve all queued rules into a populated item.
    article_item = item_loader.load_item()
    # Hand the filled item to the pipelines.
    yield article_item
def parse_detail(self, response):
    """Parse a JobBole article page, record a success stat, yield the item.

    Removed the unused ``article_item = JobBoleArticleItem()`` local that was
    immediately overwritten by ``load_item()``.
    """
    try:
        # Record a successful detail-page request in the crawler stats.
        # NOTE(review): the stat key misspells "Request" as "Reqeust"; left
        # byte-identical because dashboards/readers may query this exact key.
        self.crawler.stats.inc_value("ArticleDetail_Success_Reqeust")
    except Exception as e:
        # Stats are best-effort; never let bookkeeping kill the parse.
        _ = e
    front_image_url = response.meta.get("front_image_url", "")  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Extract the article fields from a JobBole detail page.

    Fields scraped from the markup are declared in a table and registered in
    a loop; request-derived fields are added explicitly. Yields the loaded
    :class:`JobBoleArticleItem`.
    """
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    cover_url = response.meta.get("front_image_url", "")
    # (item field, XPath) pairs taken straight from the page markup.
    xpath_rules = (
        ("title", '//div[@class="entry-header"]/h1/text()'),
        ("create_date", "//p[@class='entry-meta-hide-on-mobile']/text()"),
        ("praise_nums", "//span[contains(@class, 'vote-post-up')]/h10/text()"),
        ("comment_nums", "//a[@href='#article-comment']/span/text()"),
        ("fav_nums", "//span[contains(@class, 'bookmark-btn')]/text()"),
        ("tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()"),
        ("content", "//div[@class='entry']"),
    )
    for field_name, xpath in xpath_rules:
        loader.add_xpath(field_name, xpath)
    # Fields derived from the request/response rather than the markup.
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [cover_url])
    yield loader.load_item()
def parse_detail(self, response):
    """Parse a JobBole article page and yield the populated item.

    Deleted the large commented-out manual-extraction block and the unused
    ``article_item = JobBoleArticleItem()`` local; the live ItemLoader logic
    (and every selector string) is unchanged.
    """
    front_image_url = response.meta.get('front_image_url', '')  # article cover image
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
    item_loader.add_value('front_image_url', [front_image_url])
    item_loader.add_css('praise_nums', '.vote-post-up h10::text')
    item_loader.add_css('comment_nums', "a[href='#article-comment'] span::text")
    item_loader.add_css('fav_nums', '.bookmark-btn::text')
    item_loader.add_css('tags', 'p.entry-meta-hide-on-mobile a::text')
    item_loader.add_css('content', "div.entry")
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Extract article fields from a JobBole detail page.

    :type response: HtmlResponse
    :param response: the article detail page response
    :return: yields the loaded JobBoleArticleItem to the pipelines

    CONSISTENCY FIX: ``front_image_url`` was previously added as a bare
    string while every sibling callback adds ``[front_image_url]``; Scrapy's
    images pipeline expects a list of URLs in that field, so the value is
    now wrapped in a single-element list.
    """
    # Load the item via the project's custom ItemLoader.
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_value('url', response.url)
    item_loader.add_css('create_date', 'p.entry-meta-hide-on-mobile::text')
    item_loader.add_value('front_image_url',
                          [response.meta.get("front_image_url", "")])
    item_loader.add_css('praise_nums', '.vote-post-up h10::text')
    item_loader.add_css('comment_nums', "a[href='#article-comment'] span::text")
    item_loader.add_css('fav_nums', '.bookmark-btn::text')
    item_loader.add_css('tags', 'p.entry-meta-hide-on-mobile a::text')
    item_loader.add_css('content', 'div.entry')
    article_item = item_loader.load_item()
    yield article_item  # hand off to pipelines.py
def parse_detail(self, response):
    """Populate a JobBoleArticleItem from an article detail page and yield it."""
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    cover = response.meta.get("front_image_url", "")
    # Values known without touching the page body.
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [cover])
    # (item field, CSS selector) pairs scraped from the page markup.
    css_rules = (
        ("title", ".entry-header h1::text"),
        ("create_date", "p.entry-meta-hide-on-mobile::text"),
        ("praise_nums", ".vote-post-up h10::text"),
        ("comment_nums", "a[href='#article-comment'] span::text"),
        ("fav_nums", ".bookmark-btn::text"),
        ("tags", "p.entry-meta-hide-on-mobile a::text"),
        ("content", "div.entry"),
    )
    for field_name, selector in css_rules:
        loader.add_css(field_name, selector)
    yield loader.load_item()