def parse(self, response):
    """Parse a neitui job-list page: yield one item per posting, then
    queue the next page request."""
    print("request -> " + response.url)
    jobs = response.css('ul.list-items > li')
    if jobs:
        print("neitui Nums:" + str(len(jobs)))
        for job in jobs:
            cols = job.css('div.positionleft > div')
            item = WwwJobComItem()
            # Column 0: title link (id in href), name, publish time.
            item['position_id'] = cols[0].css('a::attr(href)').extract_first().strip().replace("/j/", "")
            item["position_name"] = cols[0].css('a::text').extract_first().strip()
            item['time'] = cols[0].css('span::text').extract_first().strip()
            # Column 1: salary range plus city/experience/education spans.
            item["salary"] = cols[1].css('span.mr10::text').extract_first().strip().replace("k", "K")
            bounds = item["salary"].split("-")
            item["avg_salary"] = (int(bounds[0].replace("K", "")) + int(bounds[1].replace("K", ""))) / 2
            details = cols[1].css('span::text').extract()
            item['city'] = details[5].strip()
            item['work_year'] = details[1].strip()
            item['education'] = details[3].strip()
            # Column 2: company name and funding stage.
            item['company_name'] = cols[2].css('span >a::text').extract_first().strip()
            item['finance_stage'] = cols[2].css('span::text').extract()[1].strip()
            # Fields this site does not expose.
            item['industry_field'] = ""
            item['company_size'] = ""
            item['position_lables'] = ""
            item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            item['platform'] = "neitui"
            yield item
    yield self.next_request()
def parse(self, response):
    """Parse a 51job search-result page and yield one item per posting.

    Salary text is normalised to "K" (thousands of yuan per month):
      "a-b万/月" -> a*10 .. b*10 K
      "a-b万/年" -> a*10/12 .. b*10/12 K  (1万 == 10K, spread over 12 months)
      "a-b千/月" -> a .. b K
    Daily-wage rows ("元/天") and the header row are skipped.
    """
    print("request -> " + response.url)
    job_list = response.css('div.dw_table > div.el')
    if len(job_list) > 1:
        print("51job Nums:" + str(len(job_list)))
        for job in job_list:
            item = WwwJobComItem()
            str_time = job.css('span.t5::text').extract_first().strip()
            if str_time == "发布时间":
                # Header row of the result table.
                continue
            item['position_id'] = job.css('p.t1 > input::attr(value)').extract_first().strip()
            item["position_name"] = job.css('p.t1 > span > a::text').extract_first().strip()
            salary = job.css('span.t4::text').extract_first().strip()
            if salary.find("万/月") > -1:
                salary = salary.replace("万/月", "").split("-")
                item["salary"] = str(float(salary[0]) * 10) + "K-" + str(float(salary[1]) * 10) + "K"
                item["avg_salary"] = (float(salary[0]) * 10 + float(salary[1]) * 10) / 2
            elif salary.find("万/年") > -1:
                salary = salary.replace("万/年", "").split("-")
                # Fixed: convert 万 to K (×10) before dividing by 12 months.
                # The old code divided the 万 figure by 12 directly, which
                # under-reported yearly salaries tenfold relative to the
                # "万/月" branch above.
                item["salary"] = str(float(salary[0]) * 10 / 12) + "K-" + str(float(salary[1]) * 10 / 12) + "K"
                item["avg_salary"] = (float(salary[0]) * 10 / 12 + float(salary[1]) * 10 / 12) / 2
            elif salary.find("元/天") > -1:
                # Daily-paid gigs are not comparable: skip.
                continue
            else:
                salary = salary.replace("千/月", "").split("-")
                item["salary"] = salary[0] + "K-" + salary[1] + "K"
                item["avg_salary"] = (float(salary[0]) + float(salary[1])) / 2
            item['city'] = job.css('span.t3::text').extract_first().strip()
            item['work_year'] = ""
            item['education'] = ""
            item['company_name'] = job.css('span.t2 > a::text').extract_first().strip()
            item['industry_field'] = ""
            item['finance_stage'] = ""
            item['company_size'] = ""
            item['position_lables'] = ""
            item['time'] = str_time
            item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            item['platform'] = "51job"
            yield item
    yield self.next_request()
def parse(self, response):
    """Parse a BOSS zhipin job-list page and yield one item per posting."""
    print("request -> " + response.url)
    jobs = response.css('div.job-list > ul > li')
    if jobs:
        print("zhipin Nums:" + str(len(jobs)))
        for job in jobs:
            primary = job.css('div.job-primary')
            item = WwwJobComItem()
            item['position_id'] = job.css('div.info-primary > h3 > a::attr(data-jobid)').extract_first().strip()
            item["position_name"] = primary.css('div.info-primary > h3 > a > div::text').extract_first().strip()
            item["salary"] = primary.css('div.info-primary > h3 > a > span::text').extract_first()
            # Average salary is not derived for this site.
            item["avg_salary"] = ''
            # Primary paragraph text nodes: city, experience, education.
            info = primary.css('div.info-primary > p::text').extract()
            item['city'] = info[0].strip()
            item['work_year'] = info[1].strip()
            item['education'] = info[2].strip()
            item['company_name'] = primary.css('div.info-company > div.company-text > h3 > a::text').extract_first().strip()
            # Company paragraph has either 3 nodes (industry, funding, size)
            # or 2 (industry, size -- funding stage absent).
            company = primary.css('div.info-company > div.company-text > p::text').extract()
            item['industry_field'] = company[0].strip()
            if len(company) == 3:
                item['finance_stage'] = company[1].strip()
                item['company_size'] = company[2].strip()
            else:
                item['finance_stage'] = ""
                item['company_size'] = company[1].strip()
            item['position_lables'] = ""
            item['time'] = ''
            item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            item['platform'] = "zhipin"
            yield item
    yield self.next_request()
def parse(self, response):
    """Parse one page of dajie.com's JSON search API.

    Yields one item per result, bumps ``self.curPage`` and requests the
    next page while pages remain; on API failure sleeps 10s and retries.
    """
    print("request -> " + response.url)
    try:
        html = json.loads(response.body.decode("utf-8"))
    except ValueError:
        # Non-JSON body (e.g. an anti-crawler page): log it, schedule a
        # retry, and STOP -- `html` is unbound here, so falling through
        # (as the old code did) would raise NameError.
        print(response.body)
        yield self.next_request()
        return
    if html.get("result") == 0:
        print("dajie Num:" + str(html.get('data').get('total')))
        results = html.get('data').get('list')
        if len(results) > 0:
            for result in results:
                item = WwwJobComItem()
                item['salary'] = result.get('salary').replace(" ", "").replace("/月", "")
                if item["salary"].find("-") > -1:
                    salary = item["salary"].split("-")
                    item["avg_salary"] = (int(salary[0].replace("K", "")) + int(salary[1].replace("K", ""))) / 2
                else:
                    # Single figure such as "10K": use it as-is.
                    item["avg_salary"] = item["salary"].replace("K", "")
                item['city'] = result.get('pubCity')
                item['finance_stage'] = ""
                item['industry_field'] = result.get('industryName')
                item['position_lables'] = ""
                item['position_id'] = result.get('jobseq')
                item['company_size'] = result.get('scaleName')
                item['position_name'] = result.get('jobName')
                item['work_year'] = result.get('pubEx')
                item['education'] = result.get('pubEdu')
                item['company_name'] = result.get('compName')
                item['time'] = result.get("time")
                item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                item['platform'] = "dajie"
                yield item
        totalPage = html.get('data').get("totalPage")
        self.curPage = self.curPage + 1
        if self.curPage <= totalPage:
            self.url = 'https://so.dajie.com/job/ajax/search/filter?keyword=' + self.job_name + '&order=0&city=' + self.city_id + '&recruitType=&salary=&experience=&page=' + str(
                self.curPage) + '&positionFunction=&_CSRFToken=&ajax=1'
            yield self.next_request()
    else:
        # API signalled failure (likely throttled): back off and retry.
        time.sleep(10)
        yield self.next_request()
def parse(self, response):
    """Parse one page of Lagou's JSON position-search API.

    Yields one item per result, then either requests the next page or,
    when the API reports failure (anti-crawler throttling), sleeps 60s
    and retries the same request.
    """
    print("request -> " + response.url)
    try:
        html = json.loads(response.body)
    except ValueError:
        # Non-JSON body (blocked / captcha page): log it, schedule a
        # retry, and STOP -- `html` is unbound here, so falling through
        # (as the old code did) would raise NameError.
        print(response.body)
        yield self.next_request()
        return
    if html.get("success"):
        if html.get('content').get('positionResult').get('resultSize') != 0:
            results = html.get('content').get('positionResult').get('result')
            print('lagou Nums:' + str(len(results)))
            for result in results:
                item = WwwJobComItem()
                item['salary'] = result.get('salary').replace("k", "K")
                salary = item["salary"].split("-")
                item["avg_salary"] = (int(salary[0].replace("K", "")) + int(salary[1].replace("K", ""))) / 2
                item['city'] = result.get('city')
                item['finance_stage'] = result.get('financeStage')
                item['industry_field'] = result.get('industryField')
                item['position_lables'] = result.get('positionAdvantage')
                item['position_id'] = result.get('positionId')
                item['company_size'] = result.get('companySize')
                item['position_name'] = result.get('positionName')
                item['work_year'] = result.get('workYear')
                item['education'] = result.get('education')
                item['company_name'] = result.get('companyShortName')
                item['time'] = result.get("formatCreateTime")
                item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                item['platform'] = "lagou"
                yield item
        # Pages = totalCount / pageSize, truncated.
        totalPage = math.floor(
            int(html.get('content').get('positionResult').get("totalCount"))
            / int(html.get('content').get("pageSize")))
        self.curPage = self.curPage + 1
        if self.curPage <= totalPage:
            yield self.next_request()
    else:
        # API rejected us (throttling): back off, then retry.
        time.sleep(60)
        yield self.next_request()
def parse(self, response):
    """Parse a rendered Lagou list page (XPath-based) and follow pagination."""
    for card in response.xpath('//ul[@class="item_con_list" and @style="display: block;"]/li'):
        item = WwwJobComItem()
        item['position_id'] = card.xpath('./@data-positionid').extract()[0]
        item["position_name"] = card.xpath('./@data-positionname').extract()[0]
        item["salary"] = card.xpath('./@data-salary').extract()[0]
        item["avg_salary"] = ''
        item['city'] = card.xpath('.//span[@class="add"]/em/text()').extract()[0]
        # Third text node of the bottom-left block reads "experience / education".
        exp_edu = card.xpath('.//div[@class="p_bot"]/div[@class="li_b_l"]/text()').extract()[2].strip().split('/')
        item['work_year'] = exp_edu[0]
        item['education'] = exp_edu[1]
        item['company_name'] = card.xpath('.//div[@class="company_name"]/a/text()').extract()[0]
        # "industry / funding stage / headcount"
        company_meta = card.xpath('.//div[@class="industry"]/text()').extract()[0].strip().split('/')
        item['industry_field'] = company_meta[0]
        item['finance_stage'] = company_meta[1]
        item['company_size'] = company_meta[2]
        item['position_lables'] = ""
        item['time'] = card.xpath('.//span[@class="format-time"]/text()').extract()[0]
        item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        item['platform'] = "lagou"
        yield item
    self.totalPage = response.xpath('//div[@class="page-number"]/span[@class="span totalNum"]/text()').extract()[0]
    self.curPage += 1
    if int(self.curPage) <= int(self.totalPage):
        print('next')
        yield scrapy.Request(url=self.url, callback=self.parse, dont_filter=True)
def parse(self, response):
    """Parse a ganji part-time job-list page and yield one item per posting."""
    print("request -> " + response.url)
    postings = response.css('div.job-parttime > dl')
    if postings:
        print("ganji Nums:" + str(len(postings)))
        for posting in postings:
            item = WwwJobComItem()
            item['position_id'] = posting.css('dt > div > input::attr(value)').extract_first().strip().split(",")[0]
            item["position_name"] = "php开发工程师"
            unit = posting.css('em.unit::text').extract_first().strip()
            if unit == "面议":
                # "Negotiable" -- no numeric range to average.
                item["salary"] = "面议"
                item["avg_salary"] = 0
            else:
                bounds = posting.css('dt > div > p > em.lipay > i > strong::text').extract_first().strip().split("-")
                # Listed in yuan/month; round each bound up to whole K.
                item["salary"] = str(math.ceil(int(bounds[0]) / 1000)) + "K-" + str(math.ceil(int(bounds[1]) / 1000)) + "K"
                item["avg_salary"] = (int(bounds[0]) + int(bounds[1])) / 2000
            item['city'] = posting.css('dt > div > p.site > a::text').extract_first().strip().replace("地址:", "")
            item['work_year'] = posting.css('dt > div > p > em.liexp::text').extract_first().strip().replace("经验:", "")
            item['education'] = ""
            item['company_name'] = posting.css('div.j-comp > a::text').extract_first().strip()
            item['industry_field'] = ""
            item['finance_stage'] = ""
            item['company_size'] = ""
            item['position_lables'] = ""
            item['time'] = posting.css('p.time::text').extract_first().strip()
            item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            item['platform'] = "ganji"
            yield item
    yield self.next_request()
def parse(self, response):
    """Parse a 58.com job-list page and yield one item per real posting."""
    print("request -> " + response.url)
    rows = response.css('li.job_item')
    if rows:
        print("job58 Nums:" + str(len(rows)))
        for row in rows:
            item = WwwJobComItem()
            item['time'] = row.css('span.sign::text').extract_first().strip()
            # "优选"/"精准" mark promoted listings, not publish times: skip them.
            if item['time'] == "优选" or item['time'] == "精准":
                continue
            item['position_id'] = row.css('div.job_name > a::attr(urlparams)').extract_first().strip().replace(
                "psid=", "").replace("&entinfo=", "").replace("_p", "").replace("_j", "")
            # Requirement spans: [0] title, [1] education, [2] experience.
            requirements = row.css('div.job_comp > p.job_require > span::text').extract()
            item["position_name"] = requirements[0].strip()
            raw = row.css('p.job_salary::text').extract_first().strip()
            if raw == "面议":
                item["salary"] = raw
                item["avg_salary"] = 0
            elif raw == "1000":
                item["salary"] = "1K"
                item["avg_salary"] = 1.0
            else:
                bounds = raw.split("-")
                item["salary"] = str(math.ceil(int(bounds[0]) / 1000)) + "K-" + str(math.ceil(int(bounds[1]) / 1000)) + "K"
                item["avg_salary"] = (int(bounds[0]) + int(bounds[1])) / 2000
            item["city"] = "郑州"
            item['work_year'] = requirements[2].strip()
            item['education'] = requirements[1].strip()
            item['company_name'] = row.css('div.comp_name > a::text').extract_first().strip()
            item['industry_field'] = ""
            item['finance_stage'] = ""
            item['company_size'] = ""
            item['position_lables'] = ",".join(row.css("div.job_wel > span::text").extract())
            item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            item['platform'] = "job58"
            yield item
    yield self.next_request()
def parse(self, response):
    """Parse one page of Zhilian's JSON search API and yield job items."""
    print("request -> " + response.url)
    try:
        html = json.loads(response.body)
    except ValueError:
        # Non-JSON body: log it, schedule a retry, and STOP -- `html` is
        # unbound here, so falling through (as the old code did) would
        # raise NameError.
        print(response.body)
        yield self.next_request()
        return
    if 'data' in html.keys():
        if 'results' in html['data'].keys():
            results = html.get('data').get('results')
            print('zhilian Nums:' + str(len(results)))
            for result in results:
                item = WwwJobComItem()
                item['salary'] = result.get('salary').replace("k", "K")
                # Average salary is not derived for this site.
                item["avg_salary"] = ''
                item['city'] = result.get('city').get("display")
                item['finance_stage'] = ''
                item['industry_field'] = ''
                item['position_lables'] = result.get('jobType').get('items')[0].get('name')
                item['position_id'] = result.get('number')
                item['company_size'] = result.get('company').get('size').get('name')
                item['position_name'] = result.get('jobName')
                item['work_year'] = result.get('workingExp').get('name')
                item['education'] = result.get('eduLevel').get('name')
                item['company_name'] = result.get('company').get('name')
                item['time'] = result.get("updateDate")
                item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                item['platform'] = "zhilianzhaopin"
                yield item
    yield self.next_request()
def parse(self, response):
    """Parse a zhaopin search-result table, keeping only PHP postings.

    Data rows are the even-numbered rows after the header row; salaries
    are listed in yuan/month and converted to K.
    """
    print("request -> " + response.url)
    job_list = response.css('table.newlist > tr')
    if len(job_list) > 1:
        print("zhaopin Nums:" + str(len(job_list)))
        for i, job in enumerate(job_list, start=1):
            # Row 1 is the header; data lives in the even rows.
            if i > 1 and i % 2 == 0:
                item = WwwJobComItem()
                item['position_id'] = job.css('td.zwmc > input::attr(data-monitor)').extract_first().strip().replace("|", "")
                name = job.css('td.zwmc > div > a').extract_first().strip()
                # Only keep PHP postings (any capitalisation).
                if name.find("php") > -1 or name.find("Php") > -1 or name.find("PHP") > -1:
                    item["position_name"] = "php研发工程师"
                    salary = job.css('td.zwyx::text').extract_first().strip().split("-")
                    # Convert yuan/month bounds to K. Fixed: the upper bound
                    # was divided by 100 instead of 1000, inflating it 10x
                    # relative to the lower bound and avg_salary below.
                    item["salary"] = str(int(int(salary[0]) / 1000)) + "K-" + str(int(int(salary[1]) / 1000)) + "K"
                    item["avg_salary"] = (int(salary[0]) + int(salary[1])) / 2000
                    item['city'] = "郑州"
                    item['work_year'] = ""
                    item['education'] = ""
                    item['company_name'] = job.css('td.gsmc > a::text').extract_first().strip()
                    item['industry_field'] = ""
                    item['finance_stage'] = ""
                    item['company_size'] = ""
                    item['position_lables'] = ""
                    item['time'] = job.css('td.gxsj > span::text').extract_first().strip()
                    item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    item['platform'] = "zhaopin"
                    yield item
    yield self.next_request()
def parse(self, response):
    """Parse a chinahr job-list page and yield one item per posting."""
    print("request -> " + response.url)
    rows = response.css('div.jobList > ul')
    if rows:
        print("chinahr Nums:" + str(len(rows)))
        for row in rows:
            item = WwwJobComItem()
            # The job URL encodes the id; strip host prefix and city suffix.
            item['position_id'] = row.css('li.l1 > span.e1 > a::attr(href)').extract_first().strip().replace(
                ".html?searchplace=" + CITY_DICT[CITY], "").replace("http://www.chinahr.com/job/", "")
            item["position_name"] = row.css('li.l1 > span.e1 > a::text').extract_first().strip()
            item["salary"] = row.css('li.l2 > span.e2::text').extract_first()
            # Average salary is not derived for this site.
            item["avg_salary"] = ''
            # "province / city / experience / education" packed into one span.
            meta = row.css('li.l2 > span.e1::text').extract_first().strip().split("/")
            item['city'] = meta[0] + meta[1]
            item['work_year'] = meta[2].replace("]\r\n\t\t\t\t\t\t\t", "")
            item['education'] = meta[3]
            item['company_name'] = row.css('li.l1 > span.e3 > a::text').extract_first().strip()
            item['industry_field'] = ""
            item['finance_stage'] = ""
            item['company_size'] = ""
            item['position_lables'] = ""
            item['time'] = row.css('li.l1 > span.e2::text').extract_first().strip()
            item['updated_at'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            item['platform'] = "chinahr"
            yield item
    yield self.next_request()