def jobdetail(self, response):
    """Build a JobsearchItem for one job-detail page.

    Most fields come from the listing JSON forwarded in
    ``response.meta['post']``; only the address, head-count and benefit
    fields are scraped from the detail page itself. Yields one item.
    """
    # inspect_response(response, self)
    post = response.meta['post']
    item = JobsearchItem()
    item['postn_name'] = str(post['jobName'])
    # jobHref arrives protocol-relative in the listing JSON.
    item['postn_web'] = str('https:' + post['jobHref'])
    item['postn_area'] = str(post['pubCity'])
    item['postn_adds'] = str(
        response.xpath("//div [@class='ads-msg']/span/text()").extract())
    item['postn_salary'] = str(post['salary'])
    # Optional listing fields: dict.get replaces the original
    # `'key' in d.keys()` membership tests and their else-branches.
    item['postn_experience'] = str(post.get('pubEx', 'null'))
    item['postn_edu'] = str(post.get('pubEdu', 'null'))
    # Raw string so `\d` reaches the re module untouched (the non-raw
    # original emits a SyntaxWarning on Python 3.12+); `\xa0` (NBSP) is
    # resolved by re itself as a hex escape inside a raw string.
    item['postn_numHire'] = str(
        response.xpath("//li [@class='recruiting']/span/text()").re(
            r"(\d+)\xa0人"))
    item['postn_benifit'] = str(
        response.xpath(
            "//div [@class='job-msg-bottom']/ul/li/text()").extract())
    item['com_name'] = str(post['compName'])
    item['com_web'] = 'null'
    item['com_simInfo'] = str(post.get('industryName', 'null'))
    item['post_date'] = 'null'
    item['resource'] = self.name
    yield item
def jobdetail(self, response):
    """Parse a Lagou job-detail page, or re-queue the request when the
    site answered with a 302 (anti-crawler redirect).

    Yields either one JobsearchItem (200) or a retry Request (302).
    """
    # inspect_response(response, self)
    if response.status == 200:
        post = response.meta['post']
        item = JobsearchItem()
        item['postn_name'] = str(post['positionName'])
        item['postn_web'] = str('https://www.lagou.com/jobs/'
                                + str(post['positionId']) + '.html')
        item['postn_area'] = str(post['city'])
        # FIX: the original extracted the work address and dropped the
        # trailing link text, but never assigned the result anywhere —
        # the detail address was silently lost.
        addr = response.xpath("//div [@class='work_addr']/a/text()").extract()
        if addr:
            addr.pop()  # last <a> is the "view map" link, not address text
        item['postn_adds'] = str(addr)
        item['postn_salary'] = str(post['salary'])
        item['postn_experience'] = str(post['workYear'])
        item['postn_edu'] = str(post['education'])
        # lower-case 'null' for consistency with every other spider in
        # this file; the original wrote 'Null' here only.
        item['postn_numHire'] = 'null'
        benefits = list(post['companyLabelList'])
        benefits.append(post['positionAdvantage'])
        item['postn_benifit'] = str(benefits)
        item['com_name'] = str(post['companyFullName'])
        item['com_web'] = str(response.xpath(
            "//i [@class='icon-glyph-home']/following-sibling::a/@href"
        ).extract())
        item['com_simInfo'] = str(post['industryField'])
        item['post_date'] = str(post['formatCreateTime'])
        item['resource'] = self.name
        yield item
    elif response.status == 302:
        # inspect_response(response, self)
        # Retry the same URL; Request.replace() is the idiomatic way to
        # clone a request with dont_filter set so the dupe-filter
        # lets the retry through.
        yield response.request.replace(dont_filter=True)
def parse_job(self, response):
    """Collect the listing rows (result divs 5 through 57) of one
    search-results page into a single ItemLoader-built item.

    ItemLoader accumulates repeated add_xpath calls, so each field ends
    up holding a list with one entry per row. Returns the loaded item.
    """
    item_loader = ItemLoader(item=JobsearchItem(), response=response)
    for i in range(5, 58):
        # add_xpath() always returns None; the original bound each call
        # to an unused local (company/job/working/salary/time — the last
        # of which also shadowed the stdlib `time` module).
        item_loader.add_xpath(
            "company_name",
            "normalize-space(/html/body/div[2]/div[4]/div[%i]/span[1]/a/text())" % i)
        item_loader.add_xpath(
            "job_name",
            "normalize-space(/html/body/div[2]/div[4]/div[%i]/p/span/a/text())" % i)
        item_loader.add_xpath(
            "working_city",
            "normalize-space(/html/body/div[2]/div[4]/div[%i]/span[2]/text())" % i)
        item_loader.add_xpath(
            "salary",
            "normalize-space(/html/body/div[2]/div[4]/div[%i]/span[3]/text())" % i)
        item_loader.add_xpath(
            "release_time",
            "normalize-space(/html/body/div[2]/div[4]/div[%i]/span[4]/text())" % i)
    # Load once, after every row has been fed to the loader.
    return item_loader.load_item()
def parse_infos(self, response):
    """Scrape one job detail page into a JobsearchItem.

    Optional page sections (education/experience requirements, etc.) may
    be absent; ``extract_first()`` then yields None, which is stored
    as-is for those fields. Yields one item.
    """
    item = JobsearchItem()
    item['job_link'] = response.url
    item['job_name'] = response.xpath(
        '//div[@class="cn"]/h1/@title').extract_first()
    item['job_city'] = response.xpath(
        '//div[@class="cn"]/span[@class="lname"]/text()').extract_first()
    item['salary'] = response.xpath(
        '//div[@class="cn"]/strong/text()').extract_first()
    item['gs_name'] = response.xpath(
        '//div[@class="cn"]/p[@class="cname"]/a/@title').extract_first()
    item['gs_link'] = response.xpath(
        '//div[@class="cn"]/a/@href').extract_first()
    msg = response.xpath(
        '//p[contains(@class,"msg") and contains(@class,"ltype")]/text()'
    ).extract_first()
    # FIX: extract_first() returns None when the node is missing; the
    # original called .strip() on it unconditionally and crashed with
    # AttributeError. Raw r"\s" avoids the invalid-escape warning.
    item['gs_msg'] = re.sub(r"\s", "", msg.strip()) if msg else ''
    item['gs_fl'] = response.xpath(
        '//p[@class="t2"]/span/text()').extract()
    # Education requirement — may not exist; the <em> marker is located
    # first, then the parent node's text is read (../).
    item['req_xl'] = response.xpath(
        '//em[@class="i2"]/../text()').extract_first()
    # Experience requirement — may not exist.
    item['req_jy'] = response.xpath(
        '//em[@class="i1"]/../text()').extract_first()
    item['create_date'] = response.xpath(
        '//em[@class="i4"]/../text()').extract_first()
    infos = response.xpath(
        '//div[contains(@class,"job_msg")]/text()').extract()
    item['job_info'] = re.sub(r"\s", "", "".join(infos))
    address = response.xpath('//p[@class="fp"]/text()').extract()
    item['address'] = re.sub(r"\s", "", "".join(address))
    yield item
def jobdetail(self, response):
    """Scrape one 51job detail page into a JobsearchItem.

    The "msg ltype" paragraph mixes city, posting date, experience,
    education and head-count fragments; postInfo() classifies them by
    keyword. NOTE(review): the positional indexes ([0], [1], [2], [1:])
    below assume a fixed page layout — confirm against live pages.
    """
    # inspect_response(response, self)
    item = JobsearchItem()

    def postInfo(info):
        # Classify the remaining text fragments of the info paragraph.
        # Keywords: '发布' = published(date), '经验' = experience,
        # '招' = hiring/head-count; anything else is taken as education.
        post = {
            'experience': 'null',
            'edu': 'null',
            'numhire': 'null',
            'postdate': 'null'
        }
        for i in info:
            if '发布' in i:
                post['postdate'] = i.strip()
            elif '经验' in i:
                post['experience'] = i.strip()
            elif '招' in i:
                post['numhire'] = i.strip()
            else:
                # Fallback bucket: any unmatched fragment overwrites edu.
                post['edu'] = i.strip()
        return post

    # Skip fragment [0] (the city, stored separately below).
    postInfom = postInfo(
        response.xpath("//p [@class='msg ltype']/text()").extract()[1:])
    item['postn_name'] = str(response.xpath("//h1/@title").extract())
    item['postn_web'] = str(response.url)
    item['postn_area'] = str(
        response.xpath("//p [@class='msg ltype']/text()").extract()[0].strip())
    item['com_name'] = str(
        response.xpath("//p [@class='cname']/a/@title").extract())
    item['com_web'] = str(
        response.xpath("//p [@class='cname']/a/@href").extract())
    item['com_simInfo'] = str(
        response.xpath("//div [@class='com_tag']/p/@title").extract())
    item['post_date'] = str(postInfom['postdate'])
    # Third text node of the bottom message box holds the street address.
    item['postn_adds'] = str(
        response.xpath("//div [@class='bmsg inbox']/p//text()").extract()[2].strip())
    # Second <strong> on the page carries the salary figure.
    item['postn_salary'] = str(
        response.xpath("//strong/text()").extract()[1])
    item['postn_experience'] = str(postInfom['experience'])
    item['postn_edu'] = str(postInfom['edu'])
    item['postn_numHire'] = str(postInfom['numhire'])
    item['postn_benifit'] = str(
        response.xpath("//div [@class='t1']/span/text()").extract())
    item['resource'] = self.name
    yield item

@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    """Standard Scrapy hook: build the spider, then attach the
    spider_closed signal handler."""
    spider = super(Job51Spider, cls).from_crawler(crawler, *args, **kwargs)
    crawler.signals.connect(spider.spider_closed,
                            signal=signals.spider_closed)
    return spider

def spider_closed(self, spider):
    """Log a line when the crawl finishes (connected in from_crawler)."""
    spider.logger.info('Spider closed: %s', spider.name)
def parse(self, response):
    """Yield one item per '.e.eck' result row, then follow pagination.

    FIX: the original created a single JobsearchItem before the loop and
    mutated/yielded that same instance for every row; since Scrapy
    processes yielded items asynchronously, all of them could end up
    holding the last row's title. A fresh item is now built per row.
    """
    for row in response.css('.e.eck'):
        item = JobsearchItem()
        item['title'] = row.css('span::text').extract_first()
        yield item
    # Follow every "next page" link recursively.
    for href in response.css('a.next').xpath('./@href'):
        yield response.follow(href, callback=self.parse)
def jobDetail(self, response):
    """Scrape a job-detail page into a JobsearchItem.

    The primary-info paragraph mixes city/experience/education text
    fragments (classified by postInfo); the company paragraph mixes a
    website URL with descriptive text (split by comInfor).
    """
    # inspect_response(response, self)

    def postInfo(info):
        # Classify fragments by keyword: '城市' = city, '经验' = experience;
        # the remaining fragment is taken as education.
        post = {'city': 'null', 'experience': 'null', 'edu': 'null'}
        for i in info:
            if '城市' in i:
                post['city'] = getIt(i.strip())
            elif '经验' in i:
                post['experience'] = getIt(i.strip())
            else:
                post['edu'] = getIt(i.strip())
        return post

    def getIt(info):
        # Value after the full-width colon, e.g. "城市:北京" -> "北京".
        return info[info.find(':') + 1:]

    def comInfor(*info):
        # Split company text fragments into website vs. description.
        info = list(chain.from_iterable(info))
        company = {'info': 'null', 'web': 'null'}
        tem = []
        for i in info:
            # FIX: the original tested `('http' or 'www' or 'com') in i`,
            # which evaluates to just `'http' in i` and never matched
            # plain "www..." / "...com" fragments.
            if any(mark in i for mark in ('http', 'www', 'com')):
                company['web'] = i
            else:
                tem.append(i)
        company['info'] = tem
        return company

    item = JobsearchItem()
    post = postInfo(
        response.xpath("//div [@class='job-primary detail-box']/div [@class='info-primary']/p/text()").extract())
    company = comInfor(
        response.xpath("//div [@class='info-company']/p/text()").extract(),
        response.xpath("//div [@class='info-company']/p/a/text()").extract()
    )
    item['postn_name'] = str(response.xpath(
        "//div [@class='job-primary detail-box']/div [@class='info-primary']/div [@class='name']/h1/text()").extract())
    item['postn_web'] = str(response.request.url)
    item['postn_area'] = str(post['city'])
    item['postn_adds'] = str(
        response.xpath("//div [@class='location-address']/text()").extract())
    item['postn_salary'] = str(response.xpath(
        "normalize-space(//div [@class='job-primary detail-box']/div [@class='info-primary']/div [@class='name']/span/text())").extract())
    item['postn_experience'] = str(post['experience'])
    item['postn_edu'] = str(post['edu'])
    item['postn_numHire'] = 'null'
    item['postn_benifit'] = str(response.xpath(
        "//div [@class='detail-content']/div [@class='job-sec']/div [@class='job-tags']/span/text()").extract())
    item['com_name'] = str(
        response.xpath("//div [@class='info-company']/h3/a/text()").extract())
    item['com_web'] = str(company['web'])
    item['com_simInfo'] = str(company['info'])
    item['post_date'] = str(
        response.xpath("//div [@class='job-author']/span/text()").re(r"发布于(.*)"))
    item['resource'] = self.name
    yield item
def jobdetail(self, response):
    """Scrape a Liepin job-detail page into a JobsearchItem.

    Liepin serves two page layouts: corporate postings under
    https://www.liepin.com/job/ and a second layout for everything else;
    the two branches below differ only in xpaths. NOTE(review): the
    positional extract()[0]/[1] indexing assumes a fixed tag order —
    confirm against live pages.
    """
    # inspect_response(response, self)

    def comInfo(info):
        # Split the company-intro list items into the street address
        # ('公司地址' = company address) and the remaining description
        # lines; '行业' (= industry) is re-read from its <a> child
        # because the bare text node is empty for that entry.
        company = {'info': 'null', 'addrs': 'null'}
        tem = []
        for i in info:
            if '公司地址' in i:
                company['addrs'] = i.strip()
            elif '行业' in i:
                tem.append('行业:' + response.xpath("//ul [@class='new-compintro']/li/a/text()").extract()[0])
            else:
                if len(i.strip()) > 0:
                    tem.append(i.strip())
        company['info'] = tem
        return company

    item = JobsearchItem()
    companyinfo = comInfo(response.xpath("//ul [@class='new-compintro']/li/text()").extract())
    if 'https://www.liepin.com/job/' in response.request.url:
        # Corporate-posting layout.
        item['postn_name'] = str(response.xpath("//div [@class='title-info']/h1/@title").extract())
        item['postn_web'] = str(response.request.url)
        item['postn_area'] = str(response.xpath("//p [@class='basic-infor']/span/a/text()").extract())
        item['postn_adds'] = str(companyinfo['addrs'])
        item['postn_salary'] = str(response.xpath("normalize-space(//p [@class='job-item-title']/text())").extract())
        # span[1] = experience, span[0] = education in this layout.
        item['postn_experience'] = str(response.xpath("//div [@class='job-qualifications']/span/text()").extract()[1])
        item['postn_edu'] = str(response.xpath("//div [@class='job-qualifications']/span/text()").extract()[0])
        item['postn_numHire'] = 'null'
        item['postn_benifit'] = str(response.xpath("//div [@class='tag-list']/span/text()").extract())
        item['com_name'] = str(response.xpath("//div [@class='title-info']/h3/a/@title").extract())
        item['com_web'] = str(response.xpath("//div [@class='title-info']/h3/a/@href").extract())
        item['com_simInfo'] = str(companyinfo['info'])
        item['post_date'] = str(response.xpath("normalize-space(//p [@class='basic-infor']/time/text())").extract())
    else:
        # Alternate layout (note the trailing space in 'title-info ').
        item['postn_name'] = str(response.xpath("//div [@class='title-info ']/h1/@title").extract())
        item['postn_web'] = str(response.request.url)
        item['postn_area'] = str(response.xpath("normalize-space(//p [@class='basic-infor']/span/text())").extract())
        item['postn_adds'] = str(companyinfo['addrs'])
        item['postn_salary'] = str(response.xpath("normalize-space(//p [@class='job-main-title']/text())").extract())
        # Same span ordering as above: [1] experience, [0] education.
        item['postn_experience'] = str(response.xpath("//div [@class='resume clearfix']/span/text()").extract()[1])
        item['postn_edu'] = str(response.xpath("//div [@class='resume clearfix']/span/text()").extract()[0])
        item['postn_numHire'] = 'null'
        item['postn_benifit'] = str(response.xpath("//div [@class='tag-list']/span/text()").extract())
        item['com_name'] = str(response.xpath("//p [@class='company-name']/@title").extract())
        item['com_web'] = str(response.xpath("//div [@class='title-info']/h3/a/@href").extract())
        item['com_simInfo'] = str(companyinfo['info'])
        item['post_date'] = str(response.xpath("normalize-space(//p [@class='basic-infor']/time/text())").extract())
    item['resource'] = self.name
    yield item
def parseDetail(self, response):
    """Yield a JobsearchItem holding the job description, pay, required
    experience and location taken from one detail page.

    Each field uses XPath's string(...) to collapse the matched node's
    whole subtree into a single string.
    """
    def text_of(xpath_expr):
        # First (only) string value for the expression, or None.
        return response.xpath(xpath_expr).extract_first()

    item = JobsearchItem()
    # Full job description; surrounding whitespace trimmed.
    item['detail'] = text_of('string(//article)').strip()
    item['salary'] = text_of('string(//p[@class="jp"])')
    item['experience'] = text_of('string(//span[@class="s_n"])')
    item['area'] = text_of('string(//div[@class="jt"]/em)')
    yield item
def jobdetail(self, response):
    """Scrape one job-detail page into a JobsearchItem.

    Fields with no counterpart on this site (address, head-count) are
    filled with the literal sentinel 'null'. Yields one item.
    """
    def grab(xpath_expr):
        # str() of the extracted list mirrors the file-wide convention
        # of storing list reprs rather than joined text.
        return str(response.xpath(xpath_expr).extract())

    item = JobsearchItem()
    item['postn_name'] = grab("//div [@class='base_info']/div/h1/span/text()")
    item['postn_web'] = str(response.request.url)
    item['postn_area'] = grab(
        "//div [@class='job_require']/span [@class='job_loc']/text()")
    item['postn_adds'] = 'null'
    item['postn_salary'] = grab("//span [@class='job_price']/text()")
    item['postn_experience'] = grab("//span [@class='job_exp']/text()")
    # Education is the fourth <span> inside the requirements block.
    item['postn_edu'] = str(
        response.xpath("//div [@class='job_require']/span/text()").extract()[3])
    item['postn_numHire'] = 'null'
    item['postn_benifit'] = grab("//div [@class='job_fit_tags']/ul/li/text()")
    item['com_name'] = grab(
        "//div [@class='company_intro jpadding mt15']/h4/a/text()")
    item['com_web'] = grab(
        "//div [@class='company_intro jpadding mt15']/h4/a/@href")
    item['com_simInfo'] = grab("//div [@class='compny_tag']/span/text()")
    item['post_date'] = grab("//p [@class='updatetime']/text()")
    item['resource'] = self.name
    yield item