def parse_item(self, response):
    """
    The job-list JSON response has been fetched; structure the data.
    :param response:
    :return:
    """
    text = response.text
    json_data = json.loads(text)
    results = json_data['content']['positionResult']['result']
    # 15 results per page; iterate over them
    for result in results:
        # Structure the data
        item = LagouItem()
        item['item_key'] = response.meta['item_key']
        item['positionName'] = result['positionName']
        item['workYear'] = result['workYear']
        item['education'] = result['education']
        item['jobNature'] = result['jobNature']
        item['createTime'] = result['createTime']
        item['salary'] = result['salary']
        item['city'] = result['city']
        item['financeStage'] = result['financeStage']
        item['industryField'] = result['industryField']
        item['companyFullName'] = result['companyFullName']
        item['district'] = result['district']
        item['positionAdvantage'] = result['positionAdvantage']
        # Hand the item off to the pipeline
        yield item
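# A minimal sketch of the LagouItem assumed by the snippets in this section
# (hypothetical items.py; real projects define whichever fields their
# callbacks fill, and this one mirrors the fields used in parse_item above).
import scrapy


class LagouItem(scrapy.Item):
    item_key = scrapy.Field()
    positionName = scrapy.Field()
    workYear = scrapy.Field()
    education = scrapy.Field()
    jobNature = scrapy.Field()
    createTime = scrapy.Field()
    salary = scrapy.Field()
    city = scrapy.Field()
    financeStage = scrapy.Field()
    industryField = scrapy.Field()
    companyFullName = scrapy.Field()
    district = scrapy.Field()
    positionAdvantage = scrapy.Field()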
def parse(self, response):
    '''
    Scrape the job categories from the start URL.
    parse() is a spider method. When called, the Response object generated
    after each initial URL finishes downloading is passed to it as the only
    argument. The method is responsible for parsing the response data,
    extracting items, and generating Request objects for further processing.
    :param response: HTML of the start page; each category URL is scraped from it
    :return: callback: iterate over each category's page numbers and yield a
             request for every page of every category into parse_url
    '''
    for item in response.xpath('//div[@class="menu_box"]/div/dl/dd/a'):
        # Use XPath expressions to extract each category's name and URL
        jobClass = item.xpath('text()').extract()[0]  # extract() converts the selector to a list; take its first string
        jobUrl = item.xpath("@href").extract_first()  # extract_first() converts the selector to a string
        oneItem = LagouItem()  # instantiate the LagouItem defined in items.py
        oneItem["jobClass"] = jobClass  # assign jobClass
        oneItem["jobUrl"] = jobUrl  # assign jobUrl
        # Each category has at most 30 pages; build the URL of every page for the next round of requests
        for i in range(self.page):
            jobUrl2 = jobUrl + str(i + 1)
            try:
                yield scrapy.Request(url=jobUrl2, cookies=self.cookie,
                                     meta={'oneItem': oneItem},
                                     callback=self.parse_url)
            except:
                pass
def parse(self, response):
    json_dic = json.loads(response.body)
    # print(json_dic)
    data_list = json_dic['content']['positionResult']['result']
    for data in data_list:
        item = LagouItem()  # one fresh item per result
        item['position'] = data['positionName']
        item['company'] = data['companyFullName']
        item['createTime'] = data['createTime']
        yield item
def parse_datas(self, response):
    """ Extract the data """
    # Parse the response as JSON
    datas = json.loads("[{}]".format(response.text))
    base_url = '/'.join(self.position_list_page.split('/')[0:-1])
    try:
        for positioninfo in datas[0]['content']['positionResult']['result']:
            loader = ItemLoader(item=LagouItem(), response=response)
            # Build the position ID
            loader.add_value('id', get_md5(base_url + '/{}.html'.format(str(positioninfo['positionId']))))
            # Position detail page URL
            loader.add_value('positiondetailurl', base_url + '/{}.html'.format(str(positioninfo['positionId'])))
            # Position name
            loader.add_value('positionname', positioninfo['positionName'])
            # Salary
            loader.add_value('salary', positioninfo['salary'])
            loader.add_value('avg_salary', positioninfo['salary'])
            # Publish time
            loader.add_value('publishtime', positioninfo['createTime'])
            # Work experience
            loader.add_value('workyear', positioninfo['workYear'])
            # Education
            loader.add_value('education', positioninfo['education'])
            # Job type
            loader.add_value('jobnature', positioninfo['jobNature'])
            # Position advantage
            loader.add_value('positionadvantage', positioninfo['positionAdvantage'])
            # Work city
            loader.add_value('city', positioninfo['city'])
            # Hiring company
            loader.add_value('companyfullName', positioninfo['companyFullName'])
            # Company detail page URL
            loader.add_value('companyurl', 'https://www.lagou.com/gongsi/{}.html'.format(str(positioninfo['companyId'])))
            yield loader.load_item()
            self.crawler.stats.inc_value("Success_Request")
    except Exception as e:
        self.logger.debug("GET ERROR: {}".format(e))
        self.crawler.stats.inc_value('Failed_Request')
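# A minimal sketch of the get_md5 helper assumed above (hypothetical utils
# module): hash the detail-page URL into a stable hex digest for use as a
# primary key.
import hashlib


def get_md5(url):
    if isinstance(url, str):
        url = url.encode('utf-8')
    return hashlib.md5(url).hexdigest()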
def parse(self, response):
    for menu_sub in response.xpath("//div[@class='menu_sub dn']/dl"):
        job_classify = menu_sub.xpath("./dt/span/text()").extract()[0]
        for job in menu_sub.xpath("./dd/a"):
            job_name = job.xpath("./text()").extract()[0]
            job_url = job.xpath("./@href").extract()[0] + "1/"
            item = LagouItem()
            item['job_classify'] = job_classify
            item['job_name'] = job_name
            item['job_url'] = job_url
            # yield item
            yield scrapy.Request(job_url, cookies=self.cookie,
                                 meta={'item': item},
                                 callback=self.parse_url)
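# A hedged sketch of how self.cookie might be prepared (hypothetical helper):
# Lagou rejects cookie-less requests, so a common workaround is to paste the
# browser's Cookie header and split it into the dict scrapy.Request expects.
def cookie_str_to_dict(cookie_str):
    """Turn 'k1=v1; k2=v2' into {'k1': 'v1', 'k2': 'v2'}."""
    return dict(
        pair.strip().split('=', 1)
        for pair in cookie_str.split(';')
        if '=' in pair
    )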
def parse(self, response):
    # print(response.text)
    result = json.loads(response.body)
    jobs = result['content']['positionResult']['result']
    for job in jobs:
        item = LagouItem()
        item['city'] = job['city']
        item['companyName'] = job['companyFullName']
        item['companyShortName'] = job['companyShortName']
        item['companySize'] = job['companySize']
        item['district'] = job['district']
        item['education'] = job['education']
        item['jobNature'] = job['jobNature']
        item['positionName'] = job['positionName']
        # Salary arrives as a range string such as '10k-20k' (or a single
        # '15k'); split off the numeric bounds and compute the average
        sal = job['salary']
        sal = sal.split('-')
        if len(sal) == 1:
            item['salaryMax'] = int(sal[0][:sal[0].find('k')])
        else:
            item['salaryMax'] = int(sal[1][:sal[1].find('k')])
        item['salaryMin'] = int(sal[0][:sal[0].find('k')])
        item['salaryAvg'] = (item['salaryMin'] + item['salaryMax']) / 2
        item['positionAdvantage'] = job['positionAdvantage']
        yield item
    if self.curpage <= self.total_page:
        self.curpage += 1
        # start_urls is a list; FormRequest needs the URL string itself
        yield scrapy.http.FormRequest(self.start_urls[0],
                                      method='POST',
                                      headers=self.headers,
                                      cookies=self.my_cookies,
                                      formdata={
                                          'first': 'true',
                                          'pn': str(self.curpage),
                                          'kd': self.kd
                                      },
                                      callback=self.parse)
def parse(self, response):
    # print(response.text)
    divs = response.xpath('//*[@id="s_position_list"]/ul/li/div[1]')
    for div in divs:
        item = LagouItem()  # one fresh item per listing
        title = div.xpath('./div[1]/div[1]/a/h3/text()').extract_first()
        location = div.xpath('./div[1]/div[1]/a/span/em/text()').extract_first()
        salary = div.xpath('./div[1]/div[2]/div/span/text()').extract_first()
        company = div.xpath('./div[2]/div[1]/a/text()').extract_first()
        fintance = div.xpath('./div[2]/div[2]/text()').extract_first()
        item['title'] = title.strip()
        item['location'] = location.strip()
        item['salary'] = salary.strip()
        item['company'] = company.strip()
        item['fintance'] = fintance.strip()
        yield item
def parse(self, response):
    # json.loads() converts the string to a dict; values are then looked up by key
    jsonBody = json.loads(response.body.decode())
    # jsonBody looks like: {'':'', '':'', '':'', '':{'':'', ...}}
    results = jsonBody['content']['positionResult']['result']
    items = []
    for result in results:
        item = LagouItem()
        item['positionName'] = result['positionName']  # position name
        item['month_salary'] = result['salary']  # salary
        item['companyName'] = result['companyFullName']  # company name
        item['companyField'] = result['industryField']  # company industry
        item['companySize'] = result['companySize']  # company size
        item['city'] = result['city']  # work location
        item['experience'] = result['workYear']  # experience requirement
        item['qualification'] = result['education']  # education requirement
        item['full_or_parttime'] = result['jobNature']  # full-time / part-time
        item['detailLink'] = "https://www.lagou.com/jobs/" + str(result['positionId']) + ".html"  # position detail page link
        # position description
        items.append(item)
    return items
def parse_url(self, response):
    '''
    From the URLs returned by parse(), scrape the job-posting URLs on every
    page of every category.
    :param response:
    :return: callback: for each job posting under a category URL, pass the
             generated jobDetailUrl into parse_Details()
    '''
    for sel2 in response.xpath('//ul[@class="item_con_list"]/li'):
        # XPath expressions: extract each job's detail URL and publish time
        jobDetailUrl = sel2.xpath('div[1]/div[1]/div[1]/a/@href').extract_first()
        jobReleaseTime = sel2.xpath('div[1]/div[1]/div[1]/span/text()').extract()[0]
        urlItem = LagouItem()  # instantiate the LagouItem defined in items.py
        urlItem["jobDetailUrl"] = jobDetailUrl  # job detail URL
        urlItem["jobReleaseTime"] = jobReleaseTime  # publish time
        urlItem["jobClass"] = response.meta['oneItem']['jobClass']
        '''
        URL dedup: each posting's jobDetailUrl is unique.
        url_check() looks the scraped jobDetailUrl up in the database via SQL.
        If the URL exists it returns True and the request is skipped;
        if it does not exist it returns False and scrapy.Request is yielded.
        '''
        # try:  # dedup disabled
        #     yield scrapy.Request(url=jobDetailUrl, cookies=self.cookie,
        #                          meta={'urlItem': urlItem},
        #                          callback=self.parse_Details)
        # except:
        #     pass
        code = url_check(jobDetailUrl)  # dedup enabled
        if code == False:
            print("New posting, continuing!")
            try:
                yield scrapy.Request(url=jobDetailUrl, cookies=self.cookie,
                                     meta={'urlItem': urlItem},
                                     callback=self.parse_Details)
            except:
                pass
        else:
            print("Posting already exists, skipping!")
    return
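# A hedged sketch of the url_check helper assumed above (hypothetical DB
# layer; the table and column names are illustrative): returns True when
# jobDetailUrl has already been stored, False otherwise.
import sqlite3


def url_check(job_detail_url, db_path='lagou.db'):
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.execute(
            "SELECT 1 FROM jobs WHERE jobDetailUrl = ? LIMIT 1",
            (job_detail_url,))
        return cur.fetchone() is not None
    finally:
        conn.close()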
def parse_url(self, response):
    """ Parse the listing page of each job category """
    item_base = response.meta['item']
    job_classify = item_base['job_classify']
    job_name = item_base['job_name']
    job_url = item_base['job_url']
    node_list = response.xpath("//ul[@class='item_con_list']/li[@class='con_list_item default_list']")
    for node in node_list:
        item = LagouItem()
        item['job_classify'] = job_classify
        item['job_name'] = job_name
        item['job_url'] = job_url
        item['company'] = \
            node.xpath("./div[@class='list_item_top']/div/div[@class='company_name']/a/text()").extract()[0]
        item['company_scale'] = str.strip(
            node.xpath("./div[@class='list_item_top']/div/div[@class='industry']/text()").extract()[0])
        item['position'] = \
            node.xpath("./div[@class='list_item_top']/div/div[@class='p_top']/a/h3/text()").extract()[0]
        item['address'] = \
            node.xpath("./div[@class='list_item_top']/div/div[@class='p_top']/a/span/em/text()").extract()[0]
        item['salary'] = \
            node.xpath("./div[@class='list_item_top']/div/div[@class='p_bot']/div/span/text()").extract()[0]
        item['experience'] = str.strip(
            node.xpath("./div[@class='list_item_top']/div/div[@class='p_bot']/div/text()[3]").extract()[0])
        item['work'] = node.xpath("./div[@class='list_item_bot']/div[@class='li_b_l']/span/text()").extract()
        yield item
    # Pagination ('下一页' is the site's "next page" link text)
    next_url = response.xpath(
        "//div[@class='pager_container']/a[@class='page_no' and text()='下一页']/@href").extract()
    if next_url:
        # yield scrapy.Request(response.urljoin(next_url[0] + "?filterOption=3"), self.parse)
        # yield scrapy.Request(next_url[0], cookies=self.cookie, callback=self.parse)
        item_base['job_url'] = next_url[0]
        yield scrapy.Request(next_url[0], cookies=self.cookie,
                             meta={'item': item_base},
                             callback=self.parse_url)
def parse(self, response):
    # chrome = webdriver.Chrome
    # datas = response.xpath('//ul[@class="item_con_list"]/li')
    # for data in datas:
    #     item = LagouItem()
    #     item['title'] = data.xpath('//div//div/a/h3[1]/text()').get()
    #     yield item
    soup = bs4.BeautifulSoup(response.text, features='lxml')
    for i in soup.find_all('a', class_='position_link'):
        link = i['href']
        s = requests.Session()
        s.get(link, headers=self.headers, timeout=3)
        cookie = s.cookies
        res = requests.get(link, headers=self.headers, cookies=cookie, timeout=5)
        bs = bs4.BeautifulSoup(res.text, 'lxml')
        # print(res.text)
        positionname = bs.find(class_='name').string
        salary = bs.select('.job_request h3 span:nth-child(1)')[0].get_text()
        location = bs.select('.job_request h3 span:nth-child(2)')[0].get_text()
        experience = bs.select('.job_request h3 span:nth-child(3)')[0].get_text()
        education = bs.select('.job_request h3 span:nth-child(4)')[0].get_text()
        detail = bs.select('.job-detail')[0].get_text()
        item = LagouItem()
        item['positionname'] = positionname
        item['salary'] = salary
        item['location'] = location
        item['experience'] = experience
        item['education'] = education
        item['detail'] = detail
        yield item
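# A hedged, non-blocking alternative to the requests.Session calls above:
# synchronous requests.get inside a Scrapy callback stalls the Twisted
# reactor, so the idiomatic version hands the detail links back to the
# scheduler. parse_detail here is a hypothetical companion callback that
# fills the same LagouItem fields.
def parse(self, response):
    for link in response.css('a.position_link::attr(href)').getall():
        yield scrapy.Request(link, headers=self.headers,
                             callback=self.parse_detail)

def parse_detail(self, response):
    item = LagouItem()
    item['positionname'] = response.css('.name::text').get()
    item['salary'] = response.css(
        '.job_request h3 span:nth-child(1)::text').get()
    yield item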
def parse_lagou(self, response):
    text = json.loads(response.text)
    print(text)
    res = []
    try:
        res = text["content"]["positionResult"]["result"]
    except:
        pass
    if len(res) > 0:
        for position in res:
            item = LagouItem()  # one fresh item per position
            try:
                item['title'] = position['positionName']
                item['education'] = position['education']
                item['company'] = position['companyFullName']
                item['experience'] = position['workYear']
                item['location'] = position['city']
                item['salary'] = position['salary']
                print(item)
            except:
                pass
            yield item
        self.page += 1
        url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
        formdata = {
            'first': 'false',
            'kd': str(self.kd),
            'pn': str(self.page),
            'city': str(self.ct)
        }
        print('===========================', formdata)
        yield scrapy.FormRequest(url, callback=self.parse_lagou,
                                 formdata=formdata, headers=self.headers)
    else:
        print('Spider finished!')
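# A minimal sketch of the spider state parse_lagou assumes (hypothetical
# attribute values): kd is the search keyword, ct the city, page the current
# page number; headers must at least carry a browser-like User-Agent and the
# Referer that Lagou's Ajax endpoint checks.
import scrapy


class LagouSpider(scrapy.Spider):
    name = 'lagou'
    kd = 'python'
    ct = '北京'
    page = 1
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Referer': 'https://www.lagou.com/jobs/list_python',
    }

    def start_requests(self):
        url = ('https://www.lagou.com/jobs/positionAjax.json'
               '?needAddtionalResult=false')
        yield scrapy.FormRequest(
            url,
            formdata={'first': 'true', 'kd': str(self.kd),
                      'pn': str(self.page), 'city': str(self.ct)},
            headers=self.headers,
            callback=self.parse_lagou)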
def parse_item(self, response):
    pn = response.meta['pn']
    print('Parsing JSON API page %s' % pn)
    print(response.text)
    data_list = json.loads(response.text)
    for position in data_list['content']['positionResult']['result']:
        loader = LagouItemLoader(item=LagouItem(), response=response)
        loader.add_value('position_name', position['positionName'])
        loader.add_value('exp_lvl', position['workYear'])
        loader.add_value('edu_lvl', position['education'])
        loader.add_value('position_type', position['jobNature'])
        loader.add_value('position_id', str(position['positionId']))
        loader.add_value('position_url',
                         'https://www.lagou.com/jobs/' + str(position['positionId']) + '.html')
        loader.add_value('finance_stage', position['financeStage'])
        loader.add_value('industry_field', position['industryField'])
        loader.add_value('company_name', position['companyFullName'])
        loader.add_value('work_city', position['city'])
        loader.add_value('salary', position['salary'])
        loader.add_value('position_advantage', position['positionAdvantage'])
        loader.add_value('publish_date', position['createTime'])
        if position['companyLabelList']:
            loader.add_value('company_attr', position['companyLabelList'])
        else:
            loader.add_value('company_attr', 'None')
        if position['skillLables']:
            loader.add_value('skill_label', position['skillLables'])
        else:
            loader.add_value('skill_label', 'None')
        item = loader.load_item()
        # print(item)
        # break
        yield item
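# A plausible sketch of the LagouItemLoader used above (hypothetical; the
# real project may attach field-specific processors). TakeFirst collapses
# the one-element lists that add_value produces into plain scalars.
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst


class LagouItemLoader(ItemLoader):
    default_output_processor = TakeFirst()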
def parse_job(self, response):
    item = LagouItem()
    item['title'] = response.xpath('//div[@class="job-name"]/@title').extract_first()
    item['url'] = response.url
    item['url_object_id'] = get_md5(response.url)
    item['publish_time'] = response.xpath("//*[@class='publish_time']/text()").extract_first()
    item['salary'] = response.xpath(
        '//dd[@class="job_request"]/p/span[@class="salary"]/text()').extract_first()
    item['job_city'] = response.xpath(
        "//*[@class='job_request']/p/span[2]/text()").extract_first()[1:-1]
    item['work_years'] = response.xpath(
        "//*[@class='job_request']/p/span[3]/text()").extract_first()[:-1]
    item['degree_need'] = response.xpath(
        "//*[@class='job_request']/p/span[4]/text()").extract_first()[:-1]
    item['job_type'] = response.xpath(
        "//*[@class='job_request']/p/span[5]/text()").extract_first()
    item['job_advantage'] = response.xpath(
        "//dd[@class='job-advantage']/p/text()").extract_first()
    job_desc = response.xpath('//dd[@class="job_bt"]/div/p/text()').extract()
    item['job_desc'] = "".join(job_desc)
    job_addr = response.xpath('//div[@class="work_addr"]/a/text()').extract()[:-1]
    item['job_addr'] = "-".join(job_addr)
    item['company_name'] = response.xpath(
        '//dl[@class="job_company"]/dt/a/img/@alt').extract_first()
    item['company_url'] = response.xpath(
        "//dl[@class='job_company']//a[@rel='nofollow']/@href").extract_first()
    tags = response.xpath('//*[@class="job_request"]/ul/li/text()').extract()
    item['tags'] = "/".join(tags)
    item['crawl_time'] = datetime.datetime.now()
    yield item
def parse(self, response):
    job = LagouItem()
    soup = BeautifulSoup(response.body, 'lxml')
    position_head_soup = soup.find('div', attrs={'class': 'position-head'})
    job['ID'] = soup.find('input', attrs={'class': 'erma_codep'}).get('value')
    job['companyId'] = position_head_soup.get('data-companyid')
    # get_text() returns the text inside the tag, e.g. name = "C软件工程师(西安)"
    name = position_head_soup.find('span', attrs={'class': 'name'}).get_text()
    job['name'] = name
    container_soup = soup.find('div', attrs={'class': 'container clearfix', 'id': 'container'})
    # Get the advantage of the job
    job_advantage_soup = container_soup.find('dd', attrs={'class': 'job-advantage'})
    job_advantage_des = job_advantage_soup.find('p').get_text()
    job['advantages'] = job_advantage_des
    job_description_soup = container_soup.find('dd', attrs={'class': 'job_bt'})
    requirements = []
    requirements_soup = job_description_soup.find_all('p')
    for requirement in requirements_soup:
        requirements.append(requirement.get_text().replace(' ', '').replace('\xa0', ''))
    job['description'] = requirements
    work_addr = container_soup.find('div', attrs={'class': 'work_addr'}).get_text()
    # Strip spaces and newlines
    work_addr = work_addr.replace(' ', '').replace('\n', '')
    job['workLocation'] = work_addr
    # Collect the company information (keys are the site's Chinese span labels)
    company = {}
    company_soup = soup.find('div', attrs={'class': 'content_r'}).find('ul').find_all('li')
    for info in company_soup:
        company[info.find('span').get_text()] = info.get_text().replace('\n', '').replace(' ', '')
    company_name = position_head_soup.find('div', attrs={'class': 'company'}).get_text()
    company['公司名称'] = company_name
    job['company'] = company
    job_briefs_soup = soup.find('dd', attrs={'class': 'job_request'})
    job_brief_soup = job_briefs_soup.find_all('span')
    job['salary'] = job_brief_soup[0].get_text().replace('/', '')
    job['location'] = job_brief_soup[1].get_text().replace('/', '')
    job['experience'] = job_brief_soup[2].get_text().replace('/', '')
    job['background'] = job_brief_soup[3].get_text().replace('/', '')
    job['isFulltime'] = job_brief_soup[4].get_text().replace('/', '')
    labels = []
    labels_soup = job_briefs_soup.find('ul', attrs={'class': 'position-label clearfix'}).find_all('li')
    for label in labels_soup:
        labels.append(label.get_text())
    job['labels'] = labels
    # Push the follow-up job IDs onto the redis start_urls queue
    idFollows = self.getNextID(soup)
    r = redis.Redis(host='127.0.0.1', port=6379)
    for ID in idFollows:
        url = "https://www.lagou.com/jobs/%s.html" % ID
        r.lpush('lagou:start_urls', url)
    yield job
def parse_Details(self, response):
    '''
    Scrape the required data from the job detail page
    :param response:
    :return: Item
    '''
    for detail in response.xpath('/html/body'):  # //*[@id="job_detail"]
        # Extract each field with scrapy.selector XPath / CSS expressions
        jobDivision = detail.css('.job-name .company::text').extract_first()  # company name + department
        jobName = detail.css('.job-name span::text').extract()[0]  # position name
        jobMoney = detail.css(".job_request .salary ::text").extract()[0]  # salary
        pattern_salary = re.compile(r'\d+')  # regex matching the digits
        salary_left = pattern_salary.findall(jobMoney)[0]  # lower salary bound
        salary_right = pattern_salary.findall(jobMoney)[1]  # upper salary bound
        jobReleaseTime = response.meta['urlItem']['jobReleaseTime']  # publish time
        jobReleaseTime = transport_time(jobReleaseTime)  # normalize to yyyy-mm-dd
        jobPlace = detail.xpath('div[2]/div/div[1]/dd/p[1]/span[2]/text()').extract()[0]  # city
        jobPlace = replace_splash(jobPlace)  # strip '/'
        jobNeed = detail.xpath('div[2]/div/div[1]/dd/p[1]/span[3]/text()').extract()[0]  # experience requirement
        jobNeed = replace_splash(jobNeed)  # strip '/'
        jobEducation = detail.xpath('div[2]/div/div[1]/dd/p[1]/span[4]/text()').extract()[0]  # education, e.g. bachelor or above
        jobEducation = replace_splash(jobEducation)  # strip '/'
        jobType = detail.xpath('div[2]/div/div[1]/dd/p[1]/span[5]/text()').extract()[0]  # full-time / part-time
        jobLabel = detail.css('.position-label li::text').extract()[0]  # position label
        jobSpeak = detail.xpath('//*[@id="job_detail"]/dd[1]/p/text()').extract()[0]  # position advantage
        city = detail.xpath('//*[@id="job_detail"]/dd[3]/div[1]/a[1]/text()').extract_first()  # e.g. 上海
        Area = detail.xpath('//*[@id="job_detail"]/dd[3]/div[1]/a[2]/text()').extract_first()  # e.g. 浦东新区
        address = detail.xpath('//*[@id="job_detail"]/dd[3]/div[1]/text()').extract()  # address
        address = address[3]  # take the element at index 3 of the list
        address = replace_spalsh2(address)  # strip the ' - ' from the address
        jobCompany = detail.css('.job_company img::attr(alt)').extract()[0]  # company name
        CompanyUrl = detail.css('.job_company a::attr(href)').extract()[0]  # company URL
        CompanyDomain = detail.xpath('//*[@id="job_company"]/dd/ul/li[1]/text()').extract()[1]  # company industry
        DevelopmentStage = detail.xpath('//*[@id="job_company"]/dd/ul/li[2]/text()').extract()[1]  # financing stage
        # Instantiate the LagouItem defined in items.py
        Item = LagouItem()
        Item["jobDivision"] = jobDivision
        Item["jobName"] = jobName
        Item["jobMoney"] = jobMoney
        Item["salary_left"] = salary_left
        Item["salary_right"] = salary_right
        Item["jobReleaseTime"] = jobReleaseTime
        Item["jobPlace"] = jobPlace
        Item["jobNeed"] = jobNeed
        Item["jobEducation"] = jobEducation
        Item["jobType"] = jobType
        Item["jobLabel"] = jobLabel
        Item["jobSpeak"] = jobSpeak
        Item["city"] = city
        Item["Area"] = Area
        Item["address"] = address
        Item["jobCompany"] = jobCompany
        Item["CompanyUrl"] = CompanyUrl
        Item["jobDetailUrl"] = response.meta['urlItem']['jobDetailUrl']
        Item["jobClass"] = response.meta['urlItem']['jobClass']
        Item["CompanyDomain"] = CompanyDomain
        Item["DevelopmentStage"] = DevelopmentStage
        print(Item)
        return Item
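# Hedged sketches of the helpers assumed above (hypothetical utils module;
# the exact input formats are assumptions). transport_time normalizes the
# listing page's publish-time strings to yyyy-mm-dd, and replace_splash
# strips the '/' decoration around the job_request spans.
import datetime


def transport_time(release_time):
    # Postings published today carry a clock time such as '15:30'; anything
    # else is assumed to already start with a yyyy-mm-dd date.
    if ':' in release_time:
        return datetime.date.today().strftime('%Y-%m-%d')
    return release_time[:10]


def replace_splash(text):
    return text.replace('/', '').strip()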