def download_picture(pic_url):
    """Download one picture into ./img_picture/<md5(url)>.jpg.

    Failures are logged (with the url's md5 for correlation) rather than
    propagated, so one broken image does not abort the crawl.
    """
    try:
        if not os.path.exists('img_picture'):
            os.mkdir('img_picture')
        # stream=True avoids buffering the whole body eagerly inside requests
        picture = requests.get(pic_url, headers=heders, stream=True)
        filename = (
            'E:/pythonwork/spiderworks/scraping/articlespider/articlespider/picture_spider/img_picture/'
            + get_md5(pic_url) + '.jpg')
        # `with` closes the file automatically; the old explicit close()
        # and the stray `pass` were removed.
        with open(filename, 'wb') as f:
            f.write(picture.content)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; network/IO errors are reported and swallowed.
        print('下载图片出错!', get_md5(pic_url))
def parse_detail(self, response):
    """Parse a news detail page and yield a populated hangjian_Item."""
    item = hangjian_Item()
    item['title'] = response.css(".zixun h1::text").extract_first("")
    item['create_date'] = response.xpath(
        '/html/body/div/div[2]/div[1]/div[2]/div[1]/div[1]/em/text()'
    ).extract_first("")
    item['author'] = response.xpath(
        '/html/body/div/div[2]/div[1]/div[2]/div[1]/div[6]/span/em'
    ).extract_first("")
    item['from_web'] = response.xpath(
        '/ html/body/div/div[2]/div[1]/div[2]/div[1]/div[5]/span/em/text()'
    ).extract_first("")
    # Keep the whole article container markup as the content.
    item['content'] = response.css(".zixun").extract_first("")
    item['url'] = response.url
    item['crawl_time'] = datetime.datetime.now()
    item['url_object_id'] = get_md5(response.url)
    yield item
def parse_job(self, response):
    """Fill a LagouJobItem from a job detail page and return it."""
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    # CSS-addressable fields, applied in bulk.
    for field, selector in (
            ("title", ".job-name::attr(title)"),
            ("salary", ".job_request p span.salary::text"),
            ("publish_time", ".job_request p.publish_time::text"),
            ("job_advantage", ".job-advantage p::text"),
            ("job_desc", ".job_bt div p"),
            ("job_addr", ".work_addr"),
            ("tags", ".position-label.clearfix li::text"),
            ("company_name", ".job_company dt a img::attr(alt)"),
            ("company_url", ".job_company dt a::attr(href)")):
        loader.add_css(field, selector)
    # The request strip (city / years / degree / type) is positional.
    for field, selector in (
            ("job_city", "//dd[@class='job_request']/p/span[2]/text()"),
            ("work_years", "//dd[@class='job_request']/p/span[3]/text()"),
            ("degree_need", "//dd[@class='job_request']/p/span[4]/text()"),
            ("job_type", "//dd[@class='job_request']/p/span[5]/text()")):
        loader.add_xpath(field, selector)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("crawl_time", datetime.datetime.now())
    return loader.load_item()
def get_detail_use_item_loader(self, response):
    """Populate a JobBoleArticleItem via ArticleItemLoader.

    Note: fields collected through the loader come back as lists.
    """
    loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    # Cover image url was stashed on request.meta by the listing parser.
    cover_url = response.meta.get("front_image_url", "")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("front_image_url", [cover_url])
    loader.add_xpath("title", "//div[@class = 'entry-header']/h1/text()")
    loader.add_xpath("create_date", "//div[@class='entry-meta']/p/text()")
    loader.add_xpath("praise_nums",
                     "//div[@class='post-adds']//h10/text()")  # like count
    loader.add_xpath("fav_nums",
                     "//div[@class='post-adds']/span[2]/text()")  # bookmarks
    loader.add_xpath(
        "comment_nums",
        "//span[@class='btn-bluet-bigger href-style hide-on-480']/text()"
    )  # comment count
    loader.add_xpath("content", "//div[@class='entry']")  # article body
    loader.add_xpath(
        "tags", "//p[@class='entry-meta-hide-on-mobile']/a/text()")
    yield loader.load_item()
def parse_job(self, response):
    """Extract a Lagou job posting into LagouJobItem via LagouItemLoader."""
    loader = LagouItemLoader(item=LagouJobItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('url_md5', get_md5(response.url))
    for field, css in (
            ('title', '.job-name .name::text'),
            ('salary', '.job_request .salary::text'),
            ('publish_time', '.publish_time::text'),
            ('job_advantage', '.job-advantage p::text'),
            # job_desc could also be read without a loader as:
            # response.css('.job_bt')[0].xpath('string()').extract_first()
            ('job_desc', '.job_bt div'),
            ('job_addr', '.work_addr'),
            ('company_name', '.job_company dt a img::attr(alt)'),
            ('company_url', '.job_company dt a::attr(href)'),
            ('tags', '.position-label li::text')):
        loader.add_css(field, css)
    # Positional spans inside the job_request strip.
    for field, xpath in (
            ('job_city', '//dd[@class="job_request"]/p/span[2]/text()'),
            ('work_years', '//dd[@class="job_request"]/p/span[3]/text()'),
            ('degree_need', '//dd[@class="job_request"]/p/span[4]/text()'),
            ('job_type', '//dd[@class="job_request"]/p/span[5]/text()')):
        loader.add_xpath(field, xpath)
    loader.add_value('crawl_time', datetime.datetime.now())
    return loader.load_item()
def parse_job(self, response):
    """Parse one Lagou job detail page into a LagouJobItem."""
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("crawl_time", datetime.now())
    loader.add_css("title", ".job-name::attr(title)")
    loader.add_css("salary", ".job_request .salary::text")
    # The request bar exposes city/years/degree/type only by position.
    loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text()")
    loader.add_xpath("work_years", "//*[@class='job_request']/p/span[3]/text()")
    loader.add_xpath("degree_need", "//*[@class='job_request']/p/span[4]/text()")
    loader.add_xpath("job_type", "//*[@class='job_request']/p/span[5]/text()")
    loader.add_css("tags", '.position-label li::text')
    loader.add_css("publish_time", ".publish_time::text")
    loader.add_css("job_advantage", ".job-advantage p::text")
    loader.add_css("job_desc", ".job_bt div")
    loader.add_css("job_addr", ".work_addr")
    # job_company is an id, hence the "#" selectors.
    loader.add_css("company_name", "#job_company dt a img::attr(alt)")
    loader.add_css("company_url", "#job_company dt a::attr(href)")
    return loader.load_item()
def parse_job(self, response):
    """Parse a Lagou job page; crawl_time is stored pre-formatted as text."""
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('url_object_id', get_md5(response.url))
    loader.add_css('title', '.job-name::attr("title")')
    loader.add_css('salary', '.job_request .salary::text')
    # nth-child positions inside the request strip carry the meaning.
    loader.add_css(
        'job_city',
        '.job_request > p:nth-child(1) > span:nth-child(2)::text')
    loader.add_css(
        'work_years',
        '.job_request > p:nth-child(1) > span:nth-child(3)::text')
    loader.add_css(
        'degree_need',
        '.job_request > p:nth-child(1) > span:nth-child(4)::text')
    loader.add_css(
        'job_type',
        '.job_request > p:nth-child(1) > span:nth-child(5)::text')
    loader.add_css('publish_time', '.publish_time::text')
    loader.add_css('tags', '.position-label .labels::text')
    loader.add_css('job_advantage', '.job-advantage p::text')
    loader.add_css('job_desc', '.job_bt div')
    loader.add_css('job_addr', '.work_addr')
    loader.add_css('company_name', '.b2::attr("alt")')
    loader.add_css('company_url', '#job_company dt a::attr("href")')
    loader.add_value('crawl_time',
                     datetime.now().strftime(SQL_DATETIME_FORMAT))
    job_item = loader.load_item()
    print('parse job 函数返回:', job_item)
    return job_item
def parse_job(self, response):
    """Parse a Liepin job detail page into a LiepinJobItem."""
    loader = LiepinJobItemLoader(item=LiepinJobItem(), response=response)
    loader.add_css("title", ".title-info h1::attr(title)")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value(
        "salary",
        response.css(".job-item-title::text").extract()[0].strip())
    loader.add_css("job_addr", ".basic-infor span a::text")
    # The fifth text node of the info bar is the publish date.
    loader.add_value(
        "publish_time",
        response.css(".basic-infor span::text").extract()[4].strip())
    # Qualification spans: [0] degree requirement, [1] experience.
    loader.add_value(
        "degree_need",
        response.css(".job-qualifications span::text").extract()[0])
    loader.add_value(
        "work_years",
        response.css(".job-qualifications span::text").extract()[1])
    loader.add_value(
        "tags", ",".join(response.css(".tag-list span::text").extract()))
    loader.add_css("company_name", ".title-info h3 a::text")
    loader.add_css("company_url", ".word::attr(href)")
    loader.add_value(
        "job_desc",
        "".join(response.css(".content.content-word::text").extract()))
    loader.add_value("crawl_time", datetime.now())
    return loader.load_item()
def parse_detail(self, response):
    """Parse a movie detail page into a xunyingItem and yield it.

    The bare /movie/ listing URL carries no detail data and is skipped.
    """
    if response.url == 'http://www.xunyingwang.com/movie/':
        print(
            "------------------url为www.xunyingwang.com/movie------------------"
        )
        return None
    # Values forwarded from the listing page through request.meta.
    front_image_url = response.meta.get("front_image_url", "")  # cover image
    tags = response.meta.get("tags", "")
    # XiaojianrenItemLoader replaces the default list output with TakeFirst().
    # (A dead no-arg loader instantiation that was immediately overwritten
    # has been removed.)
    item_loader = XiaojianrenItemLoader(
        item=xunyingItem(), response=response)
    item_loader.add_css("title", ".movie-info h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_xpath(
        "create_date",
        '/html/body/div[2]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[7]/td[2]'
    )
    item_loader.add_value("front_image_url", front_image_url)
    item_loader.add_value("tags", ','.join(str(n) for n in tags))
    item_loader.add_xpath(
        "duration",
        '/html/body/div[2]/div/div/div[1]/div[1]/div[2]/table/tbody/tr[8]/td[2]/text()'
    )
    item_loader.add_css("score", '.score::text')
    item_loader.add_xpath(
        "description",
        '/html/body/div[2]/div/div/div[1]/div[2]/div[2]/p/text()')
    movie_item = item_loader.load_item()
    yield movie_item  # handed off to the pipelines
def parse_job(self, response):
    """Parse a Lagou position page into a LagouJobItem."""
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    loader.add_css("title", ".job-name::attr(title)")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_css("salary", ".job_request .salary::text")
    # The next four fields live in positional spans, easier via XPath.
    for field, xpath in (
            ("job_city", "//*[@class='job_request']/p/span[2]/text()"),
            ("work_years", "//*[@class='job_request']/p/span[3]/text()"),
            ("degree_need", "//*[@class='job_request']/p/span[4]/text()"),
            ("job_type", "//*[@class='job_request']/p/span[5]/text()")):
        loader.add_xpath(field, xpath)
    loader.add_css("tags", '.position-label li::text')
    loader.add_css("publish_time", ".publish_time::text")
    loader.add_css("job_advantage", ".job-advantage p::text")
    # Keep the full HTML of the description.
    loader.add_css("job_desc", ".job_bt div")
    # Some addresses sit inside <a>, so grab the whole node and clean later.
    loader.add_css("job_addr", ".work_addr")
    # job_company is an id, hence "#" instead of ".".
    loader.add_css("company_name", "#job_company dt a img::attr(alt)")
    loader.add_css("company_url", "#job_company dt a::attr(href)")
    loader.add_value("crawl_time", datetime.now())
    # TODO(review): shouldn't this be yielded like the other parsers?
    return loader.load_item()
def parse_detail(self, response):
    """Parse a pet detail page and yield a populated PetItem.

    Tip: validate selectors interactively via `scrapy shell <url>` first.
    """
    item = PetItem()
    basic = response.css(".basic")
    # Pet name/kind from the basic-info header.
    kind = basic.xpath("h1/text()").extract()[0]
    # Hard to narrow here; keep the whole block and regex specifics later,
    # e.g. re.match(".*英文名.*", base_info).
    base_info = basic.extract()[0]
    # The introduction spans the first four divs of the pedia section.
    pedia = response.css(".j-pedia")
    intro_parts = [
        pedia.xpath("div[%d]" % index).extract()[0]
        for index in range(1, 5)
    ]
    item["url_object_id"] = get_md5(response.url)
    item["kind"] = kind
    item["url"] = response.url
    item["base_info"] = base_info
    item["intro"] = "".join(intro_parts)
    # Wrapped in a list because the image pipeline expects an iterable.
    item["image_url"] = [
        response.css(".pet-desc-l img").xpath("@src").extract()[0]
    ]
    # Sent on to the pipelines (enable ITEM_PIPELINES in settings).
    yield item
def parse_job(self, response):
    """Parse the job information page into a LagouJobItem."""
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    loader.add_css("title", '.job-name::attr(title)')
    # Min/max salary and min/max years are split later in the item itself.
    loader.add_css("salary", '.job_request .salary::text')
    for field, xpath in (
            ("job_city", '//*[@class="job_request"]/p/span[2]/text()'),
            ("work_years", '//*[@class="job_request"]/p/span[3]/text()'),
            ("degree_need", '//*[@class="job_request"]/p/span[4]/text()'),
            ("job_type", '//*[@class="job_request"]/p/span[5]/text()')):
        loader.add_xpath(field, xpath)
    loader.add_css("tags", '.position-label li::text')
    loader.add_css("publish_time", '.publish_time::text')
    loader.add_css("job_advantage", '.job-advantage p::text')
    loader.add_css("job_desc", '.job_bt div')
    # Address cleanup also happens in the item.
    loader.add_css("job_address", '.work_addr')
    loader.add_css("company_name", '.job_company dt a img::attr(alt)')
    loader.add_css("company_url", '.job_company dt a::attr(href)')
    loader.add_value("crawl_time", datetime.now())
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("url", response.url)
    return loader.load_item()
def parse_job(self, response):
    """Extract job data into a LagoujobItem through its item loader.

    :param response: detail-page response for one job posting
    :return: the populated LagoujobItem

    Bug fix: the original discarded the result of ``load_item()`` and
    returned the *loader* object itself; Scrapy expects the item.
    """
    # Note: the loader takes an item *instance*, not the class.
    job_itemloader = LagoujobItemLoader(item=LagoujobItem(),
                                        response=response)
    job_itemloader.add_value('url', response.url)
    job_itemloader.add_value('url_object_id', get_md5(response.url))
    job_itemloader.add_css('title', '.job-name::attr(title)')
    # min/max variants reuse the same selector; splitting happens in the item
    job_itemloader.add_css('salary_min', '.job_request .salary::text')
    job_itemloader.add_css('salary_max', '.job_request .salary::text')
    job_itemloader.add_css('work_years_min', '.job_request span::text')
    job_itemloader.add_css('work_years_max', '.job_request span::text')
    job_itemloader.add_css('job_city', '.job_request span::text')
    job_itemloader.add_css('job_type', '.job_request span::text')
    job_itemloader.add_css('degree_need', '.job_request span::text')
    job_itemloader.add_css('publish_time', '.publish_time::text')
    job_itemloader.add_css('tags', '.position-label li::text')
    job_itemloader.add_css('job_advantage', '.job-advantage p::text')
    # Keep the HTML wrapper of the description: useful for later querying.
    job_itemloader.add_css('job_descript', '.job_bt div')
    job_itemloader.add_css('job_address', '.work_addr a::text')
    job_itemloader.add_css('company_name', '#job_company a img::attr(alt)')
    job_itemloader.add_css('company_url', '#job_company a::attr(href)')
    job_itemloader.add_value('crawl_time', datetime.now())
    return job_itemloader.load_item()
def parse_detail(self, response):
    """Extract a Qiancheng (51job) posting into a QianchengJobItem."""
    loader = QianchengJobItemLoader(item=QianchengJobItem(),
                                    response=response)
    loader.add_css('title', '.in .cn h1::attr(title)')
    loader.add_value('url', response.url)
    loader.add_value('url_object_id', get_md5(response.url))
    loader.add_css('salary', '.in .cn strong::text')
    # The info strip holds city / years / (degree) / headcount / date;
    # postings without a degree requirement have one entry fewer.
    detail = response.css('.in .cn .msg.ltype::text').extract()
    if len(detail) >= 5:
        values = (detail[0], detail[1], detail[2], detail[3], detail[4])
    else:
        values = (detail[0], detail[1], '无学历要求', detail[2], detail[3])
    for field, value in zip(
            ('job_city', 'work_years', 'degree_need', 'people_need',
             'publish_time'), values):
        loader.add_value(field, value)
    loader.add_css('job_advantage', '.in .cn .jtag .t1 span::text')
    loader.add_css('job_desc', '.bmsg.job_msg.inbox')
    loader.add_xpath('job_addr', '//div[@class="bmsg inbox"]/p[1]/text()')
    loader.add_css('company_name', '.com_msg .com_name p::text')
    loader.add_css('company_url', '.com_msg .com_name::attr(href)')
    loader.add_value('crawl_time', datetime.now())
    return loader.load_item()
def parse_job(self, response):
    """Parse a Lagou job page; most selectors are currently disabled."""
    loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    loader.add_css("title", ".job-name::attr(title)")
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_css("salary", ".job_request .salary::text")
    loader.add_css("job_desc", ".job-detail")
    loader.add_css("job_addr", ".work_addr")
    # Disabled selectors kept for reference (site markup pending review):
    # job_city      css  ".job_request span:nth-child(2)::text"
    # work_years    css  ".job_request span:nth-child(3)::text"
    # degree_need   css  ".job_request span:nth-child(4)::text"
    # job_type      css  ".job_request span:nth-child(5)::text"
    # tags          css  ".position-label li::text"
    # publish_time  css  ".publish_time::text"
    # job_advantage css  ".job-advantage p::text"
    # company_name  css  "#job_company img::attr(alt)"
    # company_url   xpath "//*[@id='job_company']/dt/a/@href"
    # crawl_time    value datetime.now()
    return loader.load_item()
def parse_detail(self, response):
    """Build a PaizaArticleItem from a job detail page.

    Fields: company name, position, income, cover images, content text,
    plus url / url_object_id (used as the primary key) / create_date.
    """
    item = PaizaArticleItem()
    item['url_object_id'] = get_md5(response.url)
    item['url'] = response.url
    item['name'] = response.xpath(
        "//h2[@class='ttl mt0 mb0']/text()").get()
    item['position'] = response.xpath(
        "//td[@class='font16']/strong/text()").get()
    item['income'] = response.xpath(
        "//div[@class='strong font18 color_blue']/text()").get()
    # Cover image forwarded via request.meta; pipelines expect a list.
    item['images'] = [response.meta.get('front_img_url', '')]
    item['content'] = ''.join(response.xpath(
        "//div[@class='rBox font13 lineHeight17']/p/text()").getall())
    item['create_date'] = datetime.now().date()
    yield item
def parse_detail(self,response):
    """Parse one Xiamen Talent (xmrc) job detail page into an XmrcItem.

    Constant/metadata fields come from request.meta, settings, and the
    spider itself; page fields are pulled with absolute XPaths into the
    site's legacy nested-table layout, so they are fragile by nature.
    NOTE(review): several selectors match cells by their Chinese label
    text (e.g. "参考月薪") — verify against the live page before editing.
    """
    item_loader = XmrcItemLoader(item=XmrcItem(), response=response)
    # Values carried over from the listing request.
    item_loader.add_value('zhuanye',response.meta.get('zhuanye'))
    item_loader.add_value('job_type',response.meta.get('job_type'))
    item_loader.add_value('object_id',get_md5(response.url))
    item_loader.add_value('link',response.url)
    item_loader.add_value('addr','厦门')
    item_loader.add_value('select_time',time.strftime(settings.TIME_SELECT_FORMAT))
    item_loader.add_value('crawl_name',self.name)
    item_loader.add_value('crawl_time',time.strftime(settings.SQL_DATE_FORMAT))
    item_loader.add_value('ident',settings.IDENT)
    # Company attributes are not published on this page; store empties.
    item_loader.add_value('company_type','')
    item_loader.add_value('company_size','')
    item_loader.add_value('company_industry','')
    # Page-scraped fields (absolute paths into the legacy table layout).
    item_loader.add_xpath('title','//tr[1]/td/font[1]/a/u/text()')
    item_loader.add_xpath('company_name','//*[@id="container"]/table[2]/tr/td[3]/table[4]/tr[1]/td[2]/table[1]/tr[2]/td[2]/table/tr/td[contains(text(),"招聘单位")]/text()')
    item_loader.add_xpath('salarys','//tr[1]/td[2]/table[1]/tr[last()-3]/td/table/tr/td[contains(text(),"参考月薪")]/text()')
    item_loader.add_xpath('experience','//tr[1]/td[2]/table[1]/tr[last()-3]/td/table/tr/td[contains(text(),"招聘对象")]/text()')
    item_loader.add_xpath('education','//tr[1]/td[2]/table[1]/tr[last()-3]/td/table/tr/td[contains(text(),"学历要求")]/text()')
    item_loader.add_xpath('job_nums','//tr[1]/td/font[1]/font/b/text()')
    item_loader.add_xpath('job_desc','//*[@id="container"]/table[2]/tr/td[3]/table[4]/tr[1]/td[2]/table[1]/tr[last()-1]/td[2]/text()')
    # ASP.NET control ids below are auto-generated and brittle.
    item_loader.add_xpath('company_addr','//*[@id="ctl00_Body_Repeater1_ctl00_ctl02_Repeater1_ctl00_ctl03_ctl00_Tr2"]/td[2]/text()')
    item_loader.add_xpath('phone',"//tr[@id='ctl00_Body_Repeater1_ctl00_ctl02_Repeater1_ctl00_ctl03_ctl00_Tr1']/following-sibling::*[1]/td[2]/text()")
    item_loader.add_xpath('contact','//*[@id="ctl00_Body_Repeater1_ctl00_ctl02_Repeater1_ctl00_ctl03_ctl00_Tr1"]/td[2]/text()')
    item_loader.add_xpath('release_time','//*[@id="container"]/table[2]/tr/td[3]/table[4]/tr[1]/td[2]/table[1]/tr/td[contains(text(),"招聘期限")]/text()')
    # max/min salary reuse the same raw cell; the split happens downstream.
    item_loader.add_xpath('max_salary','//tr[1]/td[2]/table[1]/tr[last()-3]/td/table/tr/td[contains(text(),"参考月薪")]/text()')
    item_loader.add_xpath('min_salary','//tr[1]/td[2]/table[1]/tr[last()-3]/td/table/tr/td[contains(text(),"参考月薪")]/text()')
    yield item_loader.load_item()
def parse_detail(self, response):
    """Parse a Shixiseng internship posting into a ShixisengJobItem."""
    loader = ShixisengJobItemLoader(item=ShixisengJobItem(),
                                    response=response)
    loader.add_css('title', '.job-header .new_job_name span::text')
    loader.add_value('url', response.url)
    loader.add_value('url_object_id', get_md5(response.url))
    loader.add_css('salary', '.job_money.cutom_font::text')
    loader.add_css('job_city', '.job_msg .job_position::attr(title)')
    loader.add_css('work_days', '.job_msg .job_week.cutom_font::text')
    loader.add_css('degree_need', '.job_msg .job_academic::text')
    loader.add_xpath('shixi_needed',
                     '//div[@class="job_msg"]/span[5]/text()')
    loader.add_css('publish_time', '.job_date .cutom_font::text')
    loader.add_css('job_advantage', '.job_good_list span::text')
    loader.add_xpath('job_desc', '//div[@class="content_left"]/div[1]')
    loader.add_css('job_addr', '.con-job.job_city .com_position::text')
    loader.add_css('company_name', '.com-name::text')
    # The company link on the page is relative; prefix the site host.
    company_link = response.css('.com-name::attr(href)').extract_first()
    loader.add_value('company_url',
                     'www.shixiseng.com{}'.format(company_link))
    loader.add_value('crawl_time', datetime.now())
    return loader.load_item()
def parse_job(self, response):
    """Parse a Lagou position page into a LaGouItem."""
    loader = LaGouItemLoad(item=LaGouItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('url_object_id', get_md5(response.url))
    loader.add_css('title', '.job-name::attr(title)')
    loader.add_css('salary', '.salary::text')
    # Positional spans inside the request strip.
    for field, xpath in (
            ('job_city', "//*[@class='job_request']/p/span[2]/text()"),
            ('work_years', "//*[@class='job_request']/p/span[3]/text()"),
            ('degree_need', "//*[@class='job_request']/p/span[4]/text()"),
            ('job_type', "//*[@class='job_request']/p/span[5]/text()")):
        loader.add_xpath(field, xpath)
    loader.add_css('publish_time', '.publish_time::text')
    loader.add_css('tags', ".position-label.clearfix li::text")
    loader.add_css('job_advantage', ".job-advantage p::text")
    loader.add_css('job_desc', ".job_bt div")
    loader.add_css('job_addr', ".work_addr")
    # NOTE(review): company_url reads the logo's src, not an href —
    # confirm that is intentional.
    loader.add_css('company_url', "#job_company dt a img::attr(src)")
    loader.add_css('company_name', "#job_company dt a img::attr(alt)")
    loader.add_value('crawl_time', datetime.now())
    return loader.load_item()
def parse_item(self, response):
    """Parse a Lagou position page into a LagouItem (XPath-only version)."""
    loader = LagouLoader(item=LagouItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('url_object_id', get_md5(response.url))
    for field, xpath in (
            ('title', '//div[@class="job-name"]/@title'),
            ('salary',
             '//dd[@class="job_request"]//span[@class="salary"]/text()'),
            ('job_city', '//dd[@class="job_request"]//span[2]/text()'),
            ('work_year', '//dd[@class="job_request"]//span[3]/text()'),
            ('degree_need', '//dd[@class="job_request"]//span[4]/text()'),
            ('job_type', '//dd[@class="job_request"]//span[5]/text()'),
            ('publish_time', '//p[@class="publish_time"]/text()'),
            ('tags', '//ul[contains(@class, "position-label")]/li/text()'),
            ('job_advantage', '//dd[@class="job-advantage"]/p/text()'),
            ('job_desc', '//dd[@class="job_bt"]/div'),
            ('job_addr', '//div[@class="work_addr"]'),
            ('company_name', '//dl[@id="job_company"]/dt/a/img/@alt'),
            ('company_url', '//dl[@id="job_company"]/dt/a/@href')):
        loader.add_xpath(field, xpath)
    loader.add_value('crawl_time', datetime.now())
    return loader.load_item()
def parse_detail(self, response):
    """Parse a JobBole article page into a JobBoleArticleItem.

    Uses the custom ArticleItemLoader for all extraction/cleaning; the
    old hand-written per-field parsing (regex date/number extraction,
    manual item assignment) was dead commented-out code and was removed.
    """
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                    response=response)
    item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
    item_loader.add_xpath(
        'tag', '//div/p[@class="entry-meta-hide-on-mobile"]/a/text()')
    # Cover image url forwarded via request.meta; wrapped in a list
    # because the image pipeline expects an iterable of urls.
    item_loader.add_value('front_image_url',
                          [response.meta.get('front_image_url', '')])
    item_loader.add_value('url', response.url)
    item_loader.add_xpath('content', '//div[@class="entry"]/p/text()')
    item_loader.add_xpath('praise_nums',
                          '//div[@class="post-adds"]/span/h10/text()')
    item_loader.add_xpath('comment_nums',
                          '//div[@class="post-adds"]/a/span/text()')
    item_loader.add_xpath('fav_nums',
                          '//div[@class="post-adds"]/span[2]/text()')
    item_loader.add_xpath(
        'create_date',
        '//div/p[@class="entry-meta-hide-on-mobile"]/text()[1]')
    item_loader.add_value('url_object_id', common.get_md5(response.url))
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse a JobBole article page into a JobboleArticleItem.

    Extraction and cleanup run through ArticleItemLoader; the previous
    manual css/regex parsing block was fully commented out (dead code)
    and has been removed.
    """
    # Cover image url forwarded from the listing request.
    front_img_url = response.meta.get('front_img_url', '')
    item_loader = ArticleItemLoader(item=JobboleArticleItem(),
                                    response=response)
    item_loader.add_css('title', 'div.entry-header > h1::text')
    item_loader.add_css('create_date', '.entry-meta-hide-on-mobile::text')
    item_loader.add_css('praise_num', '.post-adds .vote-post-up h10::text')
    # fav_num / comment_num carry extra text; regexes live in the loader.
    item_loader.add_css('fav_num', '.post-adds .bookmark-btn::text')
    item_loader.add_css('comment_num',
                        'a[href="#article-comment"] span::text')
    # Tag list is de-duplicated/filtered by the loader's processors.
    item_loader.add_css('tag',
                        '.entry-meta .entry-meta-hide-on-mobile a::text')
    item_loader.add_css('content', '.entry')
    # List-wrapped because the image pipeline expects an iterable of urls.
    item_loader.add_value('front_img_url', [front_img_url])
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Parse a funded-project detail page into a FundsciencenetItem."""
    item_loader = FundsciencenetItemLoader(item=FundsciencenetItem(),
                                           response=response)

    def cell_text(cell, default=''):
        # First text node of a table cell, with an explicit fallback.
        return cell.xpath('text()').extract_first(default)

    item_loader.add_css('title', '.v_con h1::text')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    # Main attribute table: one value per cell, addressed by position.
    vcon_cells = response.xpath('//*[@class="v_con"]/table/tr/td')
    item_loader.add_value('approval_number', cell_text(vcon_cells[0]))
    item_loader.add_value('subject_classification', cell_text(vcon_cells[1]))
    item_loader.add_value('project_leader', cell_text(vcon_cells[2]))
    item_loader.add_value('title_of_leader', cell_text(vcon_cells[3], 'NA'))
    item_loader.add_value('dependent_unit', cell_text(vcon_cells[4]))
    item_loader.add_value('subsidized_amount', cell_text(vcon_cells[5]))
    item_loader.add_value('project_category', cell_text(vcon_cells[6], 'NA'))
    # Start/end dates share one cell; text()[1] and text()[2] split them.
    item_loader.add_value(
        'time_start',
        response.xpath('//*[@class="v_con"]/table/tr[3]/td[3]/text()[1]'
                       ).extract_first(''))
    item_loader.add_value(
        'time_end',
        response.xpath('//*[@class="v_con"]/table/tr[3]/td[3]/text()[2]'
                       ).extract_first(''))
    item_loader.add_value('chinese_keywords', cell_text(vcon_cells[8], 'NA'))
    item_loader.add_value('english_keywords', cell_text(vcon_cells[9], 'NA'))
    # Abstract table lives in a separate container.
    usual_cells = response.xpath('//*[@class="usual"]/div/table/tr/td')
    item_loader.add_value('chinese_abstract', cell_text(usual_cells[0], 'NA'))
    item_loader.add_value('english_abstract', cell_text(usual_cells[1], 'NA'))
    item_loader.add_value('summary_abstract', cell_text(usual_cells[2], 'NA'))
    yield item_loader.load_item()
def parse_content(self, response):
    """Extract a Jobbole article through the custom ArticleItemLoader.

    The loader post-processes the raw CSS selections (the comment in the
    original notes it converts list results to ``str``).  The older manual
    extraction code that was left commented out here has been removed.

    :param response: article detail-page response
    :return: yields the loaded JobboleArticleItem
    """
    # Cover image URL is forwarded from the listing page via request meta.
    front_image_url = response.meta.get("front_image_url", "")
    item_loader = ArticleItemLoader(item=JobboleArticleItem(),
                                    response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    article_item = item_loader.load_item()
    yield article_item
def parse_detail(self, response):
    """Extract a Jobbole article's fields via ArticleItemLoader.

    The superseded manual-extraction code that was commented out here has
    been removed; the ItemLoader path below is the live implementation.

    :param response: article detail-page response
    :return: yields the loaded JobBoleArticleItem
    """
    # Cover image URL is passed along from the listing page in request meta.
    front_image_url = response.meta.get("front_image_url", "")
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(),
                                    response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    # "content" extraction (selector "div.entry") was disabled in the
    # original code; it is intentionally left out here.
    article_item = item_loader.load_item()
    yield article_item
def parse_job(self, response):
    """Load one job-posting page into a JobItem and return it.

    :param response: job detail-page response
    :return: the loaded JobItem, which flows into the pipeline
    """
    loader = JobItemLoader(item=JobItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    # CSS-selected fields, driven by a field -> selector table.
    css_fields = (
        ("title", ".job-title.clearfix .job-name::text"),
        ("salary", ".job-brief .job-info .salary::text"),
        ("job_desc", ".job-desc"),
        ("job_addr", ".job-brief .job-info .where::text"),
    )
    for field, selector in css_fields:
        loader.add_css(field, selector)
    return loader.load_item()
def parse_detail(self, response):
    """Scrape one archina article page and yield an archina_Item."""
    loader = archinaItemLoader(item=archina_Item(), response=response)
    loader.add_css('title', ".col-left h1::text")  # first match in the list
    loader.add_xpath('create_date', '//*[@id="Article"]/h1/span/text()')
    loader.add_css('content', ".col-left .content")
    loader.add_value('url', response.url)
    loader.add_value("crawl_time", datetime.now())
    loader.add_value('url_object_id', get_md5(response.url))
    yield loader.load_item()
def parse_job(self, response):
    """Load a job detail page into a JobItem.

    :param response: job detail-page response
    :return: the loaded JobItem, passed on to the pipeline
    """
    loader = JobItemLoader(item=JobItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    for field, selector in (
            ("title", ".job-primary .info-primary .name h1::text"),
            ("salary", ".job-primary .info-primary .name .salary::text"),
            ("job_desc", ".detail-content .job-sec .text"),
            ("job_addr", ".job-primary .info-primary p")):
        loader.add_css(field, selector)
    return loader.load_item()
def parse_page(self, response):
    """Scrape a Tianyancha company profile page into a TianYanChaItem.

    Uses the custom TianYanChaItemLoader.  Most selectors are positional
    XPaths into the "base info" tables, so they are brittle against
    page-layout changes; they are kept verbatim here.
    """
    loader = TianYanChaItemLoader(item=TianYanChaItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('object_id', common.get_md5(response.url))

    # item field -> XPath into the company page
    xpath_fields = (
        # company name
        ('compname', "//div[@class='header']/h1/text()"),
        ('phone', "//div[@class='detail ']/div[1]/div[1]/span[2]/text()|//div[@class='detail']/div[1]/div[1]/span[2]/text()"),
        ('email', "//div[@class='detail']/div/div[2]/span[@class='email']/text()"),
        # legal representative
        ('fddb', "//div[@id='_container_baseInfo']/table/tbody/tr/td[1]/div[1]/div/div[2]/div/a/@title"),
        # registered capital
        ('zczb', "//div[@id='_container_baseInfo']/table/tbody/tr/td[2]/div[2]/@title"),
        # registration date
        ('zctime', "//div[@id='_container_baseInfo']/table[1]/tbody/tr[2]/td[1]/div[2]/text/text()"),
        # company status
        ('gszt', "//div[@id='_container_baseInfo']/table/tbody/tr[3]/td/div[2]/@title"),
        # business-registration id
        ('gsid', "//div[@id='_container_baseInfo']/table[2]/tbody/tr[1]/td[2]/text()"),
        # organization id
        ('orgid', "//div[@id='_container_baseInfo']/table[2]/tbody/tr[1]/td[4]/text()"),
        # credit id
        ('xyid', "//div[@id='_container_baseInfo']/table[2]/tbody/tr[2]/td[2]/text()"),
        # company type
        ('gstype', "//div[@id='_container_baseInfo']/table[2]/tbody/tr[2]/td[4]/text()"),
        # taxpayer id
        ('nsrid', "//div[@id='_container_baseInfo']/table[2]/tbody/tr[3]/td[2]/text()"),
        # industry
        ('hy', "//div[@id='_container_baseInfo']/table[2]/tbody/tr[3]/td[4]/text()"),
        # business term
        ('yyqx', "//div[@id='_container_baseInfo']/table[2]/tbody/tr[4]/td[2]/span/text()"),
        # approval date
        ('hzrq', "//div[@id='_container_baseInfo']/table[2]/tbody/tr[4]/td[4]/text/text()"),
        # company size
        ('size', "//div[@id='_container_baseInfo']/table[2]/tbody/tr[5]/td[4]/text()"),
        # paid-in capital
        ('sjzb', "//div[@id='_container_baseInfo']/table[2]/tbody/tr[6]/td[2]/text()"),
        # registration authority
        ('djjg', "//div[@id='_container_baseInfo']/table[2]/tbody/tr[6]/td[4]/text()"),
        # address
        ('addr', "//div[@id='_container_baseInfo']/table[2]/tbody/tr[8]/td[2]/text()"),
        # business scope
        ('jyfw', "//div[@id='_container_baseInfo']/table[2]/tbody/tr[9]/td[2]/span/span/span[1]/text()"),
        # executives' names
        ('zyry', "//div[@id='_container_staff']/div/table/tbody/tr/td/div/a[1]/text()"),
        # shareholder information
        ('gdxx', "//div[@id='_container_holder']/table/tbody/tr/td/div/div[2]/a/text()"),
    )
    for field, xpath in xpath_fields:
        loader.add_xpath(field, xpath)

    yield loader.load_item()
def parse_job(self, response):
    """Parse a Zhilian job detail page and return the loaded item."""
    loader = ZhilianJobItemLoader(item=ZhilianJobItem(), response=response)
    loader.add_value("url", response.url)
    loader.add_value("url_object_id", get_md5(response.url))
    loader.add_value("crawl_time", datetime.now())
    for field, selector in (("title", ".job-title.clearfix .job-name::text"),
                            ("salary", ".job-brief .job-info .salary::text")):
        loader.add_css(field, selector)
    return loader.load_item()
def generate_time_md5():
    """Return the MD5 digest of the current Unix timestamp (as a string)."""
    return common.get_md5(str(time.time()))
def get_user_file(username, filename):
    """Return the path of *filename* inside the per-user cache directory.

    The directory is ``<xdg_cache_home>/<PROGRAM_NAME>/<md5(username)>``;
    hashing the username keeps arbitrary names filesystem-safe.  The
    directory is created on first use.

    :param username: account name used to derive the cache subdirectory
    :param filename: file name to join onto the user's cache directory
    :return: full path to the requested file
    """
    d = os.path.join(xdg_cache_home, PROGRAM_NAME, common.get_md5(username))
    # exist_ok=True avoids the check-then-create race of the original
    # isdir() + makedirs() sequence.
    os.makedirs(d, exist_ok=True)
    return os.path.join(d, filename)