def parse_job(self, response):
    """
    Parse a Lagou job posting.
    :param response:
    :return:
    """
    item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    item_loader.add_css("title", ".job-name span::text")
    item_loader.add_value("url", response.url)
    item_loader.add_css("salary", ".salary::text")
    item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text()")
    item_loader.add_xpath("work_years", "//*[@class='job_request']/p/span[3]/text()")
    item_loader.add_xpath("degree_need", "//*[@class='job_request']/p/span[4]/text()")
    item_loader.add_xpath("job_type", "//*[@class='job_request']/p/span[5]/text()")
    item_loader.add_css("publish_time", ".publish_time::text")
    item_loader.add_css("job_advantage", ".job-advantage p::text")
    item_loader.add_css("job_desc", ".job_bt div")
    item_loader.add_css("job_addr", ".work_addr")
    item_loader.add_css("company_url", "#job_company dt a::attr(href)")
    item_loader.add_css("company_name", "#job_company dt a div h2::text")

    job_item = item_loader.load_item()
    return job_item
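# All of the parse_job/parse_item variants in this file rely on a LagouJobItem
# and a LagouJobItemLoader defined elsewhere in the project. A minimal sketch
# of what they are assumed to look like (field names sampled from the loaders
# in this file; each variant's real item would declare whatever fields its
# loader populates, plus custom input/output processors):
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst  # scrapy.loader.processors in older Scrapy


class LagouJobItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    salary = scrapy.Field()
    job_city = scrapy.Field()
    work_years = scrapy.Field()
    degree_need = scrapy.Field()
    job_type = scrapy.Field()
    publish_time = scrapy.Field()
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    job_addr = scrapy.Field()
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    tags = scrapy.Field()
    crawl_time = scrapy.Field()


class LagouJobItemLoader(ItemLoader):
    # Each add_css/add_xpath call collects a list of matches; taking the first
    # value by default matches how the single-valued fields are used above.
    default_output_processor = TakeFirst()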
def parse_job(self, response):
    item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    item_loader.add_value("post_url", response.url)
    item_loader.add_css("title", "div .job-name::attr(title)")
    item_loader.add_css("company", "div .company::text")
    item_loader.add_css("min_salary", "span.salary::text")
    item_loader.add_xpath("city", "//*[@class='job_request']/p/span[2]/text()")
    item_loader.add_xpath("min_work_years", "//*[@class='job-request']/p/span[3]/text()")
    item_loader.add_xpath("degree_req", "//*[@class='job-request']/p/span[4]/text()")
    item_loader.add_xpath("job_type", "//*[@class='job-request']/p/span[5]/text()")
    item_loader.add_css("tags", ".position-label li::text")
    item_loader.add_css("publish_time", ".publish_time::text")
    item_loader.add_css("advantage", ".job-advantage p::text")
    item_loader.add_css("description", ".job_bt div")  # hot about .job_bt p
    item_loader.add_css("addr", ".work_addr")

    job_item = item_loader.load_item()
    return job_item
def parse_job(self, response):
    item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    item_loader.add_xpath("title", '//div/span[@class="name"]/text()')
    # url and url_object_id are plain values, not selectors, so use add_value
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_xpath("salary", '//dd/p/span[@class="salary"]/text()')
    item_loader.add_xpath("job_city", '//dd/p/span[2]/text()')
    item_loader.add_xpath("work_years", '//dd/p/span[3]/text()')
    item_loader.add_xpath("degree_need", '//dd/p/span[4]/text()')
    item_loader.add_xpath("job_type", '//dd/p/span[5]/text()')
    item_loader.add_xpath("tags", '//dd[@class="job_request"]/ul/li/text()')
    item_loader.add_xpath("publish_time", '//dd/p[@class="publish_time"]/text()')
    item_loader.add_xpath("job_advantage", '//dl/dd/p/text()')
    item_loader.add_xpath("job_desc", '//dd/div/p/text()')
    item_loader.add_xpath("job_addr", '//dd/div[@class="work_addr"]')
    item_loader.add_xpath("company_url", '//dl/dt/a/@href')
    item_loader.add_xpath("company_name", '//dl/dt/a/img/@alt')
    item_loader.add_value("crawl_time", datetime.now())

    job_item = item_loader.load_item()
    return job_item
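# get_md5, used for url_object_id throughout this file, is a project helper
# that is not shown in this section. A plausible implementation (an assumption,
# not necessarily the original), hashing the URL into a fixed-length key:
import hashlib


def get_md5(url):
    # md5 of the URL, usable as a fixed-width primary key in the database.
    if isinstance(url, str):
        url = url.encode("utf-8")
    return hashlib.md5(url).hexdigest()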
def parse_job(self, response):
    # Parse a Lagou job posting
    print(response.text)
    item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    item_loader.add_css("title", ".job-name::attr(title)")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("salary", ".job_request .salary::text")
    item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text()")
    item_loader.add_xpath("work_years", "//*[@class='job_request']/p/span[3]/text()")
    item_loader.add_xpath("degree_need", "//*[@class='job_request']/p/span[4]/text()")
    item_loader.add_xpath("job_type", "//*[@class='job_request']/p/span[5]/text()")
    item_loader.add_css("tags", '.position-label li::text')
    item_loader.add_css("publish_time", ".publish_time::text")
    item_loader.add_css("job_advantage", ".job-advantage p::text")
    item_loader.add_css("job_desc", ".job_bt div")
    item_loader.add_css("job_addr", ".work_addr")
    item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
    item_loader.add_css("company_url", "#job_company dt a::attr(href)")
    # item_loader.add_value("crawl_time", datetime.datetime.now())

    job_item = item_loader.load_item()
    return job_item
def parse_job(self, response):
    item_load = LagouJobItemLoader(item=LagouJobItem(), response=response)
    item_load.add_value("url", response.url)
    item_load.add_value("url_object_id", get_md5(response.url))
    item_load.add_css("title", "div.job-name::attr(title)")
    item_load.add_css("salary", ".salary::text")
    item_load.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text()")
    item_load.add_xpath("work_years", "//*[@class='job_request']/p/span[3]/text()")
    item_load.add_xpath("degree_need", "//*[@class='job_request']/p/span[4]/text()")
    item_load.add_xpath("job_type", "//*[@class='job_request']/p/span[5]/text()")
    item_load.add_css("publish_time", ".publish_time::text")
    item_load.add_xpath("tags", "//*[@class='position-label clearfix']/li/text()")
    item_load.add_xpath("job_advantage", "//*[@class='job-advantage']/p/text()")
    item_load.add_xpath("job_desc", "//*[@class='job_bt']/div")
    item_load.add_xpath("job_addr", "//*[@class='work_addr']/a/text()")
    item_load.add_xpath("company_url", "//*[@class='c_feature']/li/a/@title")
    item_load.add_css("company_name", ".job_company dt img::attr(alt)")
    item_load.add_value("crawl_time", datetime.datetime.now())
    item_load.add_value("crawl_update_time", datetime.datetime.now())

    lagou_item = item_load.load_item()
    return lagou_item
def parse_job(self, response):
    # Parse a Lagou job posting
    # (selectors follow the same page structure used by the other parse_job variants in this file)
    item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    item_loader.add_css('title', '.job-name::attr(title)')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_obj_id', get_md5(response.url))
    item_loader.add_css('salary', '.job_request .salary::text')
    item_loader.add_css('job_city', '.job_request p span:nth-child(2)::text')
    item_loader.add_css('work_years', '.job_request p span:nth-child(3)::text')
    item_loader.add_css('degree_need', '.job_request p span:nth-child(4)::text')
    item_loader.add_css('job_type', '.job_request p span:nth-child(5)::text')
    item_loader.add_css('publish_time', '.publish_time::text')
    item_loader.add_css('job_advantage', '.job-advantage p::text')
    item_loader.add_css('job_desc', '.job_bt div')
    item_loader.add_css('job_addr', '.work_addr')
    item_loader.add_css('company_name', '#job_company dt a img::attr(alt)')
    item_loader.add_css('company_url', '#job_company dt a::attr(href)')
    item_loader.add_css('tags', '.position-label li::text')
    item_loader.add_value('crawl_time', datetime.now())

    job_item = item_loader.load_item()
    return job_item
def parse_job(self, response):
    # Parse a Lagou job posting
    item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    item_loader.add_css('title', '.job-name::attr(title)')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_css('salary', '.salary::text')
    item_loader.add_xpath('job_city', '//*[@class="job_request"]/p/span[2]/text()')
    item_loader.add_xpath('work_years', '//*[@class="job_request"]/p/span[3]/text()')
    item_loader.add_xpath('degree_need', '//*[@class="job_request"]/p/span[4]/text()')
    item_loader.add_xpath('job_type', '//*[@class="job_request"]/p/span[5]/text()')
    item_loader.add_css('tags', '.position-label li::text')
    item_loader.add_css('publish_time', '.publish_time::text')
    item_loader.add_css('job_advantage', '.job-advantage p::text')
    item_loader.add_css('job_desc', '.job_bt div')
    item_loader.add_css('job_addr', '.work_addr')
    item_loader.add_css('company_name', '#job_company dt a img::attr(alt)')
    item_loader.add_css('company_url', '#job_company dt a::attr(href)')
    item_loader.add_value('crawl_time', datetime.datetime.now())

    job_item = item_loader.load_item()
    return job_item
def parse_job(self, response):
    # Parse a Lagou job posting
    item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    item_loader.add_css("title", ".job-name::attr(title)")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("salary", ".job_request .salary::text")
    item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text()")
    # CSS is used here (instead of XPath) to practice CSS selectors while learning
    item_loader.add_css("work_years", ".job_request p span:nth-child(3)::text")
    item_loader.add_xpath("degree_need", "//dd[@class='job_request']/p/span[4]/text()")
    item_loader.add_xpath("job_type", "//dd[@class='job_request']/p/span[5]/text()")
    item_loader.add_css("publish_time", ".publish_time::text")
    item_loader.add_css("tags", ".position-label.clearfix li::text")
    item_loader.add_css("job_advantage", ".job-advantage p::text")
    item_loader.add_css("job_desc", ".job_bt div")
    item_loader.add_css("job_addr", ".work_addr")
    item_loader.add_css("company_url", "#job_company dt a::attr(href)")
    item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
    item_loader.add_value("crawl_time", datetime.datetime.now())
    # item_loader.add_css("crawl_update_time", datetime.datetime.now())

    # Assign to a variable first for easier debugging and readability,
    # rather than returning load_item() directly just to keep the code short
    job_item = item_loader.load_item()
    return job_item
def parse_job(self, response):
    # Parse a Lagou job posting
    item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    item_loader.add_css("title", ".job-name::attr(title)")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("salary", ".job_request .salary::text")
    item_loader.add_xpath("job_city", "//*[@class='job_request']/p/span[2]/text()")
    item_loader.add_xpath("work_years", "//*[@class='job_request']/p/span[3]/text()")
    item_loader.add_xpath("degree_need", "//*[@class='job_request']/p/span[4]/text()")
    item_loader.add_xpath("job_type", "//*[@class='job_request']/p/span[5]/text()")
    item_loader.add_css("tags", ".position-label li::text")
    item_loader.add_css("publish_time", ".publish_time::text")
    item_loader.add_css("job_advantage", ".job-advantage p::text")
    item_loader.add_css("job_desc", ".job_bt div")
    item_loader.add_css("job_addr", ".work_addr")
    item_loader.add_css("company_url", "#job_company dt a::attr(href)")
    item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
    item_loader.add_value("crawl_time", datetime.now())

    job_item = item_loader.load_item()
    return job_item
def parse_item(self, response):
    # if 'utrack/track' in response.url:
    #     # This was meant to work around the 302 redirect, but it does not work
    #     num = re.match('.*2F(\d+).html.*', response.url)
    #     num = str(num.group(1))
    #     url = 'https://www.lagou.com/jobs/' + num + '.html'
    #     print(url)
    #     time.sleep(2)
    #     return scrapy.Request(url, dont_filter=True, headers=self.headers)

    if response.status == 302:
        self.redirect_url.append(response.url)
        self.crawler.stats.inc_value("redirected_url")

    item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    item_loader.add_css('title', 'span.name:nth-child(2)::text')
    item_loader.add_value('url', response.url)
    item_loader.add_value('url_object_id', get_md5(response.url))
    item_loader.add_css('salary', '.job_request .salary::text')
    item_loader.add_xpath('job_city', '//*[@class="job_request"]/p/span[2]/text()')
    item_loader.add_xpath('work_years', '//*[@class="job_request"]/p/span[3]/text()')
    item_loader.add_xpath('degree_need', '//*[@class="job_request"]/p/span[4]/text()')
    item_loader.add_xpath('job_type', '//*[@class="job_request"]/p/span[5]/text()')
    item_loader.add_css('tags', '.position-label li::text')
    item_loader.add_css('publish_time', '.publish_time::text')
    item_loader.add_css('job_advantage', '.job-advantage p::text')
    item_loader.add_css('job_desc', '.job_bt div')
    item_loader.add_css('job_addr', '.work_addr')
    item_loader.add_css('company_name', '#job_company dt a img::attr(alt)')
    item_loader.add_css('company_url', '#job_company dt a::attr(href)')
    item_loader.add_value('crawl_time', datetime.datetime.now())

    job_item = item_loader.load_item()
    return job_item
def parse_item(self, response):
    """Parse a Lagou job posting."""
    # Instantiate a loader: LagouJobItem is the Lagou item,
    # LagouJobItemLoader is a custom ItemLoader for it
    # Lg = LagouJobItem()
    item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
    # print(response.text)
    item_loader.add_css("title", ".job-name::attr(title)")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("salary", ".job_request .salary::text")
    item_loader.add_xpath("job_city", "//*[@class='job_request']//span[2]/text()")
    item_loader.add_xpath("work_years", "//*[@class='job_request']//span[3]/text()")
    item_loader.add_xpath("degree_need", "//*[@class='job_request']//span[4]/text()")
    item_loader.add_xpath("job_type", "//*[@class='job_request']//span[5]/text()")
    item_loader.add_css("tags", '.position-label li::text')
    item_loader.add_css("publish_time", ".publish_time::text")
    item_loader.add_css("job_advantage", ".job-advantage p::text")
    item_loader.add_css("job_desc", ".job_bt div")
    # The extracted field contains a chunk of HTML, so ::text cannot be used here;
    # the unwanted HTML is stripped in the item loader
    item_loader.add_css("job_addr", ".work_addr")
    item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
    item_loader.add_css("company_url", "#job_company dt a::attr(href)")
    item_loader.add_value("crawl_time", datetime.now())

    job_item = item_loader.load_item()
    return job_item
def parse_job(self, response):
    # If the response status is not 200, delete the IP being used
    if delete_ip(response):
        url = response.url
        match_re = r'(https://www.lagou.com/jobs/?\d+.html).*$'
        match = re.match(match_re, url)
        # As before, the data is fetched via XPath and the extracted values are
        # cleaned up with regex and string methods
        if match:
            url = match.group(1)
            # Check whether this URL is already in the database
            if check_table_url('lagou_job', url):
                jobItemLoader = LagouJobItemLoader(item=LagouJob(), response=response)
                jobItemLoader.add_xpath('title', "//div[@class='job-name']//h1/text()")
                jobItemLoader.add_value('url', url)
                url_object_id = get_md5(url)
                jobItemLoader.add_value('url_object_id', url_object_id)
                job_request = response.xpath("//dd[@class='job_request']//span/text()").extract()
                salary = job_request[0].strip()
                jobItemLoader.add_value('max_salary', get_max_min_salary(salary, True))
                jobItemLoader.add_value('min_salary', get_max_min_salary(salary, False))
                job_city = get_city(job_request[1])
                jobItemLoader.add_value('job_city', job_city)
                work_years = job_request[2]
                jobItemLoader.add_value('work_years', work_years)
                degree_need = job_request[3]
                jobItemLoader.add_value('degree_need', degree_need)
                job_type = job_request[4]
                jobItemLoader.add_value('job_type', job_type)
                jobItemLoader.add_xpath('publish_time', "//p[@class='publish_time']/text()")
                jobItemLoader.add_xpath('job_advantage', "//dd[@class='job-advantage']//p/text()")
                jobItemLoader.add_xpath('job_desc', "//div[@class='job-detail']//text()")
                job_addr = ''.join(response.xpath("//div[@class='work_addr']//text()").extract())
                jobItemLoader.add_value('job_addr', clear_str(job_addr))
                jobItemLoader.add_value(
                    "company_name",
                    response.xpath("//h3[@class='fl']/em/text()").extract()[0].strip())
                jobItemLoader.add_xpath("company_url", "//dl[@class='job_company']//a/@href")
                jobItemLoader.add_xpath("company_url_id", "//dl[@class='job_company']//a/@href")
                tags = response.xpath("//ul[@class='position-label clearfix']//li/text()").extract()
                if len(tags) != 0:
                    tags = clear_str((',').join(tags))
                else:
                    tags = ''
                jobItemLoader.add_value('tags', tags)
                jobItemLoader.add_value('crawl_time', get_now())

                job_item = jobItemLoader.load_item()
                return job_item
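# delete_ip, check_table_url, get_city, get_max_min_salary and similar names
# above are project utilities that are not defined in this section. As an
# illustration only, the salary helper might look like the following sketch,
# assuming Lagou salary strings of the form "15k-25k" (hypothetical, not the
# original implementation):
import re


def get_max_min_salary(salary, want_max):
    # Extract the numeric bounds from a "15k-25k" style salary string.
    nums = [int(n) for n in re.findall(r"\d+", salary)]
    if not nums:
        return 0
    return max(nums) if want_max else min(nums)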
def parse_company(self, response):
    if delete_ip(response):
        url = response.url
        match_re = r'(https://www.lagou.com/gongsi/?\d+.html).*$'
        match = re.match(match_re, url)
        if match:
            url = match.group(1)
            if check_table_url('lagou_company', url):
                response = return_new_company_response(response.request, response)
                companyItemLoader = LagouJobItemLoader(item=LagouCompany(), response=response)
                companyItemLoader.add_value('url', url)
                companyItemLoader.add_value('url_object_id', get_md5(url))
                tags = response.xpath("//div[@id='tags_container']//li//text()").extract()
                if len(tags) != 0:
                    tags = clear_str((',').join(tags))
                else:
                    tags = ''
                companyItemLoader.add_value('tags', tags)
                company_name = clear_str(''.join(
                    response.xpath("//h1[@class='company_main_title']//text()").extract()))
                companyItemLoader.add_value('company_name', company_name)
                companyItemLoader.add_value(
                    'industry',
                    response.xpath(
                        "//div[@id='basic_container']//li//i[@class='type']/following-sibling::span[1]//text()"
                    ).extract_first(""))
                companyItemLoader.add_value(
                    'finance',
                    response.xpath(
                        "//div[@id='basic_container']//li//i[@class='process']/following-sibling::span[1]//text()"
                    ).extract_first(""))
                companyItemLoader.add_value(
                    'people_count',
                    response.xpath(
                        "//div[@id='basic_container']//li//i[@class='number']/following-sibling::span[1]//text()"
                    ).extract_first(""))
                companyItemLoader.add_value(
                    'city',
                    response.xpath(
                        "//div[@id='basic_container']//li//i[@class='address']/following-sibling::span[1]//text()"
                    ).extract_first(""))
                score = response.xpath("//span[@class='score']//text()").extract_first("0")
                companyItemLoader.add_value('score', score)
                create_date = response.xpath(
                    r"//div[@class='company_bussiness_info_container']//div[@class='content']//text()"
                ).extract()
                if len(create_date) != 0:
                    create_date = create_date[1]
                else:
                    create_date = ''
                companyItemLoader.add_value('create_date', create_date)
                company_desc = response.xpath(
                    "//div[@id='company_intro']/div[@class='item_content']/div[@class='company_intro_text']//text()"
                ).extract()
                company_desc = clear_str(('').join(company_desc))
                companyItemLoader.add_value('company_desc', company_desc.strip())
                companyItemLoader.add_value('crawl_time', get_now())
                company_data = response.xpath(
                    "//div[@class='company_data']//li//strong//text()").extract()
                companyItemLoader.add_value('review_count', company_data[3].strip())
                companyItemLoader.add_value('job_count', company_data[0].strip())

                company_item = companyItemLoader.load_item()
                return company_item
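# clear_str and get_now, used by the two callbacks above, are likewise project
# helpers not shown here. They are assumed to be small utilities along these
# lines (a sketch under that assumption, not the original code):
import datetime


def clear_str(value):
    # Remove the whitespace and newlines left over from //text() extraction.
    return value.replace("\n", "").replace(" ", "").strip()


def get_now():
    # Timestamp recorded in the crawl_time field.
    return datetime.datetime.now()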