def page_parse(self, response):
    # pull the summary fields from the listing page
    page = response.meta['meta_0']
    offset = response.meta['meta_2']
    items = []
    for j in range(0, len(response.xpath("//div[@class='newlist_detail newlist']").extract())):
        item = ZhilianItem()
        # company name
        item['companyName'] = response.xpath('//form//ul/li[2]/a[1]/text()').extract()[j].replace('\'', '\"')
        # company ownership type
        item['companyNature'] = response.xpath('//form//ul/li[3]/span[2]/text()').extract()[j][5:].replace('\'', '\"')
        # company size
        item['companyScale'] = response.xpath("//li[@class='newlist_deatil_two']/span[3]/text()").extract()[j][5:].replace('\'', '\"')
        # detail-page link
        item['subUrl'] = response.xpath('//form//ul/li[1]/div//@href').extract()[j]
        items.append(item)
    for item in items:
        # request the detail page, pass the item along, and parse the response with detail_parse()
        print("request: " + item['subUrl'])
        yield scrapy.Request(url=item['subUrl'], meta={'meta_1': item}, callback=self.detail_parse)
    offset += 1
    if offset <= page:
        pos = response.url.rfind('=')
        yield scrapy.Request(url=response.url[0:pos + 1] + str(offset),
                             meta={'meta_0': page, 'meta_2': offset},
                             callback=self.page_parse)
def detail_parse(self, response):
    item = ZhilianItem()
    job = response.css('.top-fixed-box .fixed-inner-box .fl h1::text').extract_first()
    company = response.css('.top-fixed-box .fixed-inner-box .fl h2 a::text').extract_first()
    salary = response.css(
        '.terminalpage.clearfix .terminalpage-left > ul > li:nth-child(1) > strong::text').extract_first()
    educational = response.css(
        '.terminalpage.clearfix .terminalpage-left > ul > li:nth-child(6) > strong::text').extract_first()
    experience = response.css(
        '.terminalpage.clearfix .terminalpage-left > ul > li:nth-child(5) > strong::text').extract_first()
    requirement = ''
    for terminalpage in response.css(
            '.terminalpage-main .tab-cont-box .tab-inner-cont > p::text').extract():
        requirement += terminalpage.replace("\n", "").strip()
    item['job'] = job
    item['company'] = company
    item['salary'] = salary
    item['educational'] = educational
    item['experience'] = experience
    item['requirement'] = requirement
    yield item
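None of these callbacks show the item definition they populate; a minimal sketch, assuming a ZhilianItem declared in items.py whose field names match the ones page_parse and detail_parse reference above (the class body itself is an assumption):

import scrapy

class ZhilianItem(scrapy.Item):
    # fields referenced by page_parse
    companyName = scrapy.Field()
    companyNature = scrapy.Field()
    companyScale = scrapy.Field()
    subUrl = scrapy.Field()
    # fields referenced by detail_parse
    job = scrapy.Field()
    company = scrapy.Field()
    salary = scrapy.Field()
    educational = scrapy.Field()
    experience = scrapy.Field()
    requirement = scrapy.Field()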
def parse_detail(self, response):
    zhilian = ZhilianItem()
    zls = Selector(response)
    zhilian['bussiessname'] = zls.xpath('//div[@class="inner-left fl"]/h2/a/text()').extract()
    zhilian['bussiessurl'] = zls.xpath('//div[@class="inner-left fl"]/h2/a/@href').extract()
    zhilian['jobname'] = zls.xpath('//div[@class="inner-left fl"]/h1/text()').extract()
    bussiessaddress = zls.xpath('//div[contains(@class, "tab-inner-cont")]/h2/text()').extract()
    zhilian['bussiessaddress'] = ''.join(bussiessaddress).strip()
    zhilian['workaddress'] = zls.xpath(
        '//div[contains(@class, "terminalpage-left")]/ul[contains(@class, "terminal-ul clearfix")]/li[2]/strong/a/text()'
    ).extract()
    zhilian['money'] = zls.xpath(
        '//div[contains(@class, "terminalpage-left")]/ul[contains(@class, "terminal-ul clearfix")]/li[1]/strong/text()'
    ).extract()
    zhilian['education'] = zls.xpath(
        '//div[contains(@class, "terminalpage-left")]/ul[contains(@class, "terminal-ul clearfix")]/li[6]/strong/text()'
    ).extract()
    zhilian['releasetime'] = zls.xpath(
        '//div[contains(@class, "terminalpage-left")]/ul[contains(@class, "terminal-ul clearfix")]/li[5]/strong/text()'
    ).extract()
    zhilian['num'] = zls.xpath(
        '//div[contains(@class, "terminalpage-left")]/ul[contains(@class, "terminal-ul clearfix")]/li[7]/strong/text()'
    ).extract()
    description = zls.xpath('//div[contains(@class, "tab-inner-cont")]/p/text()').extract()
    descriptionStr = ''
    for des in description:
        currentdes = des.strip()
        descriptionStr += currentdes
    zhilian['description'] = descriptionStr
    yield zhilian
def parse(self, response):
    # ---------- main-page data handling ----------
    print('url=', self.start_urls)
    titles = response.xpath('//a[@name="itemlist-title"]/@title').extract()
    prices = response.xpath('//span[@class ="price_n"]/text()').extract()
    list = []
    for i in range(0, len(titles)):
        map = {}
        map['title'] = titles[i]
        map['price'] = prices[i]
        list.append(map)
    print('list=', len(list))
    print('current request url =', response.request.url)
    print('meta passed from the previous request =', response.meta)
    # dump the page body if needed
    # print('page =', response.body_as_unicode())
    str_url = response.request.url
    str_url = str_url[str_url.index('com/') + 4:str_url.index('-')]
    print('sliced url str1 =', str_url)
    zlitem = ZhilianItem()
    # never assign attributes directly (zlitem.list = list), or yield will not work
    zlitem['list'] = list
    # store one json file per page
    zlitem['page'] = str_url + '_data.json'
    zlitem['type'] = '主页'
    # yield hands the item to the engine, which passes it on to pipelines.py;
    # yield only queues the work
    yield zlitem
    # ------ child pages; there are many, so only take 2 here --------
    # link to each item's child page on the main page
    a_list = response.xpath(
        '//a[contains(@name,"itemlist-picture") and contains(@target,"_blank") and contains(@class,"pic") and contains(@dd_name,"单品图片")]/@href'
    ).getall()
    print('child page links =', len(a_list))
    a_list = a_list[0:2]
    for i in range(0, len(a_list)):
        # callback=self.childPage: the callback that handles the response
        # meta: data passed along to the next page
        yield scrapy.Request(url=a_list[i], callback=self.childPage,
                             meta={'info': '子页面', 'url': a_list[i], 'jsonName': str_url + '子页面'})
    print('child links =', a_list)
    # ------ next page --------
    # <a href="/pg3-cid4008154.html" title="下一页">下一页</a>
    # get() = /pg3-cid4008154.html
    url_str = response.xpath('//a[@title="下一页"]/@href').get()
    if not url_str:
        # no next page: stop
        return
    # build the absolute url only after the guard (avoids TypeError on None)
    next_url = 'http://category.dangdang.com' + url_str
    print('next_url=', next_url)
    # queue the request; callback=self.parse because the next page has the same layout
    # allowed_domains and start_urls must match, or scrapy.Request is dropped
    yield scrapy.Request(url=next_url, callback=self.parse)
def parse_list(self, response):
    # response.text is a plain string with no xpath(); query the response itself,
    # and keep the class-attribute quotes inside the XPath string
    title = response.xpath(
        '//div[@class="contentpile__content__wrapper__item clearfix"]'
        '//span[@class="contentpile__content__wrapper__item__info__box__jobname__title"]/@title'
    ).extract()
    company = response.xpath(
        '//div[@class="contentpile__content__wrapper__item clearfix"]'
        '//a[@class="contentpile__content__wrapper__item__info__box__cname__title company_title"]/text()'
    ).extract()
    item = ZhilianItem()
    item['title'] = title
    item['company'] = company
    yield item
def parse_job_info(self, response):
    """Parse the job listings in the result table."""
    zl_table_list = response.xpath(
        "//div[@id='newlist_list_content_table']/table[@class='newlist']")
    for zl_table in zl_table_list[1:]:
        # <tbody> is injected by the browser and absent from the raw HTML,
        # so it must not appear in these XPaths (run it / view source to verify)
        # zl_td_list = zl_table.xpath("tr[1]/td")
        # Problem: the row does not always have 5 <td> cells, so positional
        # indexing raises IndexError:
        # td1 = zl_table_list[0]
        # td2 = zl_table_list[1]
        # td3 = zl_table_list[2]
        # td4 = zl_table_list[3]
        # td5 = zl_table_list[4]
        # Prefer XPath predicates over positional indexes; fall back to
        # try/except only when the failure mode is unclear.
        # //text() grabs every text node under the tag.
        # extract() turns the matches into strings but still returns a list.
        # extract_first(default) returns the first match as text, or the
        # default when nothing matched.
        td1 = zl_table.xpath("tr/td[@class='zwmc']/div/a//text()").extract()
        # strip whitespace from every fragment (strip() only trims the ends)
        td1 = list(map(str.strip, td1))
        job_name = "".join(td1).replace(",", "/")
        fan_kui_lv = zl_table.xpath(
            "tr/td[@class='fk_lv']/span/text()").extract_first('没有反馈率').strip()
        job_company_name = zl_table.xpath(
            "tr/td[@class='gsmc']/a[1]/text()").extract_first('没有公司名称').strip()
        job_salary = zl_table.xpath(
            "tr/td[@class='zwyx']/text()").extract_first('面议').strip()
        job_place = zl_table.xpath(
            "tr/td[@class='gzdd']/text()").extract_first('没有工作地点').strip()
        print(job_name, fan_kui_lv, job_company_name, job_salary, job_place)
        item = ZhilianItem()
        item['job_name'] = job_name
        item['job_company_name'] = job_company_name
        item['job_place'] = job_place
        item['job_salary'] = job_salary
        item['job_time'] = "没有时间"
        item['job_type'] = "智联招聘"
        item['fan_kui_lv'] = fan_kui_lv
        yield item
    yield scrapy.Request(
        url=response.url,
        callback=self.parse_next_page,
        meta={},
        dont_filter=True,
    )
def index_parse(self, response):
    detail_urls = re.findall(r'<td class="zwmc".*?href="(.*?)"', response.text, re.S)
    for detail_url in detail_urls:
        item = ZhilianItem()
        item['detail_url'] = detail_url
        print("detail url:", detail_url)
        yield item
        yield Request(url=item['detail_url'], callback=self.detail_parse, dont_filter=True)
def parse(self, response):
    title = response.xpath(
        '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[1]/div[2]/p[1]/a/text()'
    ).extract()  # job title
    address = response.xpath(
        '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[1]/div[2]/p[2]/span[1]/span/em/text()'
    ).extract()  # address
    numbers = response.xpath(
        '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[1]/div[2]/p[2]/span[2]/span/em/text()'
    ).extract()  # headcount
    industry = response.xpath(
        '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[2]/p[4]/span[3]/span/em/text()'
    ).extract()  # industry
    introduction = response.xpath(
        '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[2]/p[6]'
    ).extract()  # unprocessed introduction html
    company_type = response.xpath(
        '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[2]/p[4]/span[1]/span/em/text()'
    ).extract()  # company type
    scale = response.xpath(
        '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[2]/p[4]/span[2]/span/em/text()'
    ).extract()  # company size
    job_type = response.xpath(
        '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[2]/p[4]/span[4]/span/em/text()'
    ).extract()  # job category
    company = response.xpath(
        '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[2]/p[2]/span/text()'
    ).extract()  # company name
    time = response.xpath(
        '/html/body/div[2]/div[3]/div[1]/div/div[2]/ul/li/div[1]/div[2]/p[3]/span/span/text()[2]'
    ).extract()  # post time
    for a, b, c, d, e, f, g, h, i, j in zip(title, address, numbers, industry,
                                            introduction, company_type, company,
                                            scale, job_type, time):
        item = ZhilianItem()  # fresh item per row, not one shared instance
        item['title'] = a
        item['time'] = j
        item['address'] = b
        item['numbers'] = c
        item['industry'] = d
        item['job_type'] = i
        item['company'] = g
        item['company_type'] = f
        item['scale'] = h
        item['introduction'] = re.findall(r'<span>.+?</span>', e)
        yield item
def parse(self, response):
    web_data = response.text
    soup = BeautifulSoup(web_data, 'lxml')
    job_name = soup.select("table.newlist > tr > td.zwmc > div > a:nth-of-type(1)")
    salary = soup.find_all('td', {'class': 'zwyx'})
    for name, salary in zip(job_name, salary):
        item = ZhilianItem()
        item["job_name"] = name.get_text()
        url = name.get('href')
        item["salary"] = salary.get_text()
        yield Request(url=url, meta={"item": item},
                      callback=self.parse_moive, dont_filter=True)
def parse_item(self, response):
    item = ZhilianItem()
    response = Selector(response)
    item['职位月薪'] = response.re('职位月.*>([\u4e00-\u9fa5/\d-]+)')
    item['发布日期'] = response.re('发布日期.*>([\d: -]+)')
    item['工作地点'] = response.re('工作地点.*?>-([\u4e00-\u9fa5]+)')
    item['工作性质'] = response.re('工作性质.*?>([\u4e00-\u9fa5]+)')
    item['工作经验'] = response.re('工作经验.*?>([\u4e00-\u9fa5]+)')
    item['最低学历'] = response.re('最低学历.*?>([\u4e00-\u9fa5]+)')
    item['招聘人数'] = response.re('招聘人数.*?>([\u4e00-\u9fa5\d]+)')
    item['职位类别'] = response.re('职位类别.*?>([\u4e00-\u9fa5]+)')
    # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
    # i['name'] = response.xpath('//div[@id="name"]').extract()
    # i['description'] = response.xpath('//div[@id="description"]').extract()
    return item
def childPage(self, response):
    print('------ child page start ------')
    title = response.xpath('//div[contains(@class,"name_info") and contains(@ddt-area,"001")]/h1/@title').get()
    price = response.xpath('//p[@id="dd-price"]/text()').get()
    map = response.meta
    print('child page meta =', map)
    list = {'title': title, 'price': price}
    print('child page data =', list)
    zlitem = ZhilianItem()
    zlitem['type'] = '子页'
    zlitem['list'] = list
    zlitem['page'] = map['jsonName']
    print('child page item =', zlitem)
    yield zlitem
    print('------ child page end ------')
def parse_zhilian(self, response):
    info = response.xpath('//table[@class="newlist"]')
    print(info)
    for each in info[1:]:
        item = ZhilianItem()
        item["post_name"] = each.xpath(
            './/td[@class="zwmc"]//a[@style="font-weight: bold"]/text()').extract()[0]
        item["company_name"] = each.xpath('.//td[@class="gsmc"]/a/text()').extract()[0]
        item["pay"] = each.xpath('.//td[@class="zwyx"]/text()').extract()[0]
        item["location"] = each.xpath('.//td[@class="gzdd"]/text()').extract()[0]
        yield item
def get_info(self, response):
    soup = BeautifulSoup(response.text, 'lxml')
    infos = soup.find('ul', class_='terminal-ul').find_all('li')
    item = ZhilianItem()
    item['zwyx'] = infos[0].find("strong").get_text().encode('utf-8')  # monthly salary
    item['gzdd'] = infos[1].find("strong").find('a').get_text().encode('utf-8')  # work location
    item['gzxz'] = infos[3].find("strong").get_text().encode('utf-8')  # job nature
    item['gzjy'] = infos[4].find("strong").get_text().encode('utf-8')  # work experience
    item['zdxl'] = infos[5].find("strong").get_text().encode('utf-8')  # minimum education
    item['zprs'] = infos[6].find("strong").get_text().encode('utf-8')  # number of openings
    item['zwlb'] = infos[7].find("strong").find('a').get_text().encode('utf-8')  # job category
    item['zwmc'] = soup.find('div', class_='top-fixed-box').find('h1').get_text().encode('utf-8')  # job title
    # fldy_list = soup.find('div', class_='welfare-tab-box').find_all('span')
    # print(item)
    return item
def parse(self, response):
    js = json.loads(response.text)
    results = js["data"]["results"]
    for r in results:
        zhilian = ZhilianItem()
        zhilian["number"] = r["number"]
        zhilian["jobname"] = r["jobName"]
        zhilian["company"] = r["company"]["name"]
        zhilian["position"] = r["city"]["display"]
        zhilian["size"] = r["company"]["size"]["name"]
        zhilian["edulevel"] = r["eduLevel"]["name"]
        zhilian["salary"] = r["salary"]
        zhilian["workexp"] = r["workingExp"]["name"]
        workUrl = "https://jobs.zhaopin.com/" + zhilian["number"] + ".htm"
        yield scrapy.Request(workUrl, meta={"zzll": copy.deepcopy(zhilian)},
                             callback=self.getResult)
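getResult is not included in this collection; a hypothetical sketch of the receiving callback (only the meta key "zzll" comes from the code above, the body is an assumption) showing how the deep-copied item comes back out. The deepcopy matters because concurrent detail requests would otherwise share and mutate one item instance:

def getResult(self, response):
    # recover the item stashed in meta by parse()
    zhilian = response.meta["zzll"]
    # ...detail-page fields would be filled in here...
    yield zhilian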
def parse(self, response):
    table_list = response.xpath('//table[@class="newlist"]')
    # print(table_list)
    for table in table_list[2:]:
        item = ZhilianItem()
        position = table.xpath('.//td[@class="zwmc"]/div/a[1]/text()').extract_first()
        item["position"] = position.strip() if position else None
        url = table.xpath('.//td[@class="zwmc"]/div/a[1]/@href').extract_first()
        item["link"] = url.strip() if url else None
        if url:
            yield scrapy.Request(url=url, meta={"item": item},
                                 callback=self.parse_link, dont_filter=True)
    next_page = response.xpath('//li[@class="pagesDown-pos"]/a/@href').extract_first()
    if next_page:
        next_url = response.urljoin(next_page)
        yield scrapy.Request(url=next_url, callback=self.parse, dont_filter=True)
def content_parse(self, response):
    item = ZhilianItem()
    city_name = self.city
    jobname = response.xpath(
        '//div[@class="main1 cl main1-stat"]/div/ul/li/h1/text()')[0].extract()
    salary = response.xpath(
        '//div[@class="main1 cl main1-stat"]/div/ul/li/div/strong/text()')[0].extract()
    position = response.xpath('//div[@class="info-three l"]/span/a/text()')[0].extract()
    span = response.xpath('//div[@class="info-three l"]/span/text()').extract()
    if len(span) > 3:
        workingExp = span[1]
        eduLevel = span[2]
    else:
        workingExp = response.xpath('//div[@class="info-three l"]/span/text()')[0].extract()
        eduLevel = response.xpath('//div[@class="info-three l"]/span/text()')[1].extract()
    company_name = response.xpath('//div[@class="company l"]/a/text()')[0].extract()
    update_time = response.meta['update_time']
    require = response.xpath('//div[@class="pos-ul"]/p/text()').extract()
    if require == []:
        require = response.xpath('//div[@class="pos-ul"]/p/span/text()').extract()
    job_require = require
    # hash company name + update time into a stable record id
    sha1 = hashlib.sha1()
    string = (company_name + '' + update_time)
    stri = string.encode('utf8')
    sha1.update(stri)
    hash_id = sha1.hexdigest()
    # every declared field must have a same-named local variable,
    # since eval(field) looks each value up by name
    for field in item.fields.keys():
        item[field] = eval(field)
    yield item
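The eval(field) trick above silently requires a same-named local for every declared field and fails with a bare NameError when one is missing; a minimal alternative sketch (same field names assumed) that builds the mapping explicitly instead:

values = {
    'city_name': city_name, 'jobname': jobname, 'salary': salary,
    'position': position, 'workingExp': workingExp, 'eduLevel': eduLevel,
    'company_name': company_name, 'update_time': update_time,
    'job_require': job_require, 'hash_id': hash_id,
}
for field in item.fields:
    item[field] = values[field]  # a KeyError here names the missing field directly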
def parse_xiangqin(self, response):
    item = ZhilianItem()
    # company name
    company = response.xpath('//h2//a[@target="_blank"]/text()').extract()[0]
    # salary
    pay = response.xpath('//ul[@class="terminal-ul clearfix"]//strong/text()').extract()[0]
    # work location
    site = response.xpath('//ul[@class="terminal-ul clearfix"]//strong//a/text()').extract()[0]
    # post time
    times = response.xpath('//ul[@class="terminal-ul clearfix"]//strong//span/text()').extract()[0]
    # job title
    gangwei = response.xpath('//ul[@class="terminal-ul clearfix"]//strong//a/text()').extract()[1]
    # url
    url = response.url
    # job description
    miaoshu = response.xpath(
        './/div[@class="tab-inner-cont"]/p/text() | .//div[@class="tab-inner-cont"]//span/text() | .//div[@class="tab-inner-cont"]//div/text()'
    ).extract()
    # concatenate the fragments onto miaoshu[0]; start at 1 so the first
    # fragment is not appended to itself
    num = 1
    while num < len(miaoshu):
        miaoshu[0] += miaoshu[num].strip()
        num += 1
    # print(miaoshu[0])
    # print(company, pay, site, times, gangwei, miaoshu)
    item['company'] = company
    item['pay'] = pay
    item['site'] = site
    item['times'] = times
    item['gangwei'] = gangwei
    item['url'] = self.md5(url)
    if miaoshu:
        item['miaoshu'] = miaoshu[0]
    else:
        item['miaoshu'] = ''
    print('item built')
    yield item
def parse_item(self, response):
    job = ZhilianItem()
    node = response.xpath('//div[@class="terminalpage-left"]')
    # company name
    job["company_name"] = response.xpath(
        '//div[@class="fixed-inner-box"]//h2/a/text()').extract()[0].strip()
    # job title
    job["job_name"] = response.xpath(
        '//div[contains(@class,"f")]/h1/text()').extract()[0].strip()
    # wages
    job["wages"] = node.xpath('.//li[1]/strong/text()').extract()[0].strip()
    # work location
    fulllocal = node.xpath('.//li[2]')
    job["local"] = fulllocal.xpath('string(.)').extract()[0].strip()
    # publish date
    job["push_date"] = node.xpath('.//li[3]/strong//text()').extract()[0].strip()
    # work experience
    job["experience"] = node.xpath('.//li[5]/strong/text()').extract()[0].strip()
    # education
    job["education"] = node.xpath('.//li[6]/strong/text()').extract()[0].strip()
    # number of openings
    job["quantity"] = node.xpath('.//li[7]/strong/text()').extract()[0].strip()
    # job requirements: the description is a run of sibling <p> tags, so match
    # them all under this node, then slice off the trailing footer lines
    rlist = node.xpath('.//div[@class="tab-inner-cont"][1]/p/text()').extract()[:-4]
    job["require"] = " ".join(rlist)
    # print("company name: " + job["company_name"])
    # print("job title: " + job["job_name"])
    # print("wages: " + job["wages"])
    # print("location: " + job["local"])
    # print("publish date: " + job["push_date"])
    # print("experience: " + job["experience"])
    # print("education: " + job["education"])
    # print("openings: " + job["quantity"])
    # print("requirements: " + job["require"])
    return job
def parse(self, response):
    # default callback for the downloaded response
    result = json.loads(response.text)
    results = result['data']['results']
    for items in results:
        # build a fresh item per result; reusing one instance would keep
        # mutating the object already handed to the engine
        item = ZhilianItem()
        item['jobName'] = items['jobName']
        item['salary'] = items['salary']
        item['welfare'] = items['welfare']
        item['url'] = items['positionURL']
        item['city_display'] = items['city']['display']
        item['company'] = items['company']['name']
        item['company_size'] = items['company']['size']['name']
        item['company_type'] = items['company']['type']['name']
        item['emplType'] = items['emplType']
        item['eduLevel'] = items['eduLevel']['name']
        item['updateDate'] = items['updateDate']
        item['workingExp'] = items['workingExp']['name']
        yield item
def parse(self, response):
    result = json.loads(response.text)
    if result.get("data"):
        results = result["data"]["results"]
        for i in results:
            item = ZhilianItem()
            item["city"] = i["city"]
            item["companyLogo"] = i["companyLogo"]
            item["createDate"] = i["createDate"]
            item["eduLevel"] = i["eduLevel"]
            item["emplType"] = i["emplType"]
            item["endDate"] = i["endDate"]
            item["positionURL"] = i["positionURL"]
            item["salary"] = i["salary"]
            item["updateDate"] = i["updateDate"]
            item["welfare"] = i["welfare"]
            item["workingExp"] = i["workingExp"]
            yield item
def parse(self, response):
    result = json.loads(response.text)
    # iterate over every result dict on the page (a fixed range(60) raises
    # IndexError whenever the final page holds fewer entries)
    for results in result['data']['results']:
        item = ZhilianItem()  # fresh item per result
        item['jobName'] = results['jobName']
        item['createDate'] = results['createDate']
        item['company'] = results['company']['name']
        item['eduLevel'] = results['eduLevel']['name']
        item['salary'] = results.get('salary')
        item['workingExp'] = results['workingExp']['name']
        item['city'] = results['city']['display']
        yield item
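The JSON callbacks above all consume the same search API; the response shape they rely on, reconstructed purely from the keys they index (every value is a placeholder, not real data):

# inferred skeleton of one entry in data["results"]
result_shape = {
    "jobName": "...", "salary": "...", "welfare": "...", "positionURL": "...",
    "emplType": "...", "updateDate": "...", "createDate": "...",
    "city": {"display": "..."},
    "company": {"name": "...", "number": "...", "size": {"name": "..."}, "type": {"name": "..."}},
    "eduLevel": {"name": "..."},
    "workingExp": {"name": "..."},
}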
def parse(self, response):
    # iterate over the selector list itself; extract_first() would return one
    # string and the loop would walk it character by character
    for x in response.xpath("//div[@id='listContent']/div"):
        # inner queries must be relative (.//) or they search the whole document
        name = x.xpath(".//span[@title]/text()").extract_first()
        salary = x.xpath(".//p[@class='job_saray']/text()").extract_first()
        company = x.xpath(".//*[@class='company_title']/text()").extract_first()
        city = x.xpath(".//ul/li[@class='demand_item']/text()").extract_first()
        welfare = x.xpath(".//div[@class='welfare_item']/text()").extract()
        url = x.xpath(".//div[@class='commpanyName']/a/@href").extract()
        item = ZhilianItem()
        item['name'] = name
        item['salary'] = salary
        item['company'] = company
        item['city'] = city
        item['welfare'] = welfare
        item['url'] = url
        yield item
def parse(self, response):
    soup = BeautifulSoup(response.body.decode('utf-8'), 'lxml')
    for site in soup.find_all('table', class_='newlist'):
        item = ZhilianItem()
        try:
            item['name'] = site.find('td', class_='zwmc').get_text().strip()
            # print(item['name'])
            item['rate'] = site.find('td', class_='fk_lv').get_text()
            # print(item['rate'])
            item['compy'] = site.find('td', class_='gsmc').get_text()
            # print(item['compy'])
            item['money'] = site.find('td', class_='zwyx').get_text()
            # print(item['money'])
            item['place'] = site.find('td', class_='gzdd').get_text()
            # print(item['place'])
            # print(item)
            yield item
        except:
            pass
def parse_urls(self, response):
    job_list = json.loads(response.text)['data']['results']
    for j in range(0, len(job_list)):
        items = ZhilianItem()
        items['date'] = job_list[j]['updateDate']
        items['job_name'] = job_list[j]['jobName']
        items['job_url'] = job_list[j]['positionURL']
        items['company_id'] = job_list[j]['company']['number']
        items['city'] = job_list[j]['city']['display']
        items['company'] = job_list[j]['company']['name']
        items['company_type'] = job_list[j]['company']['type']['name']
        items['salary'] = job_list[j]['salary']
        items['experience'] = job_list[j]['workingExp']['name']
        items['education'] = job_list[j]['eduLevel']['name']
        items['address'] = job_list[j]['businessArea']
        job_detail_url = job_list[j]['positionURL']
        yield scrapy.Request(job_detail_url, meta={'items': items},
                             callback=self.parse_job_detail_url, dont_filter=True)
def parse(self, response):
    titles = response.xpath('//a[@name="itemlist-title"]/@title').extract()
    prices = response.xpath('//span[@class ="price_n"]/text()').extract()
    list = []
    for i in range(0, len(titles)):
        map = {}
        map['title'] = titles[i]
        map['price'] = prices[i]
        list.append(map)
    print('list=', len(list))
    zlitem = ZhilianItem()
    # never assign attributes directly (zlitem.list = list), or yield will not work
    zlitem['price'] = prices
    zlitem['title'] = titles
    zlitem['list'] = list
    zlitem['data'] = list
    # yield hands the item to the engine, which passes it on to pipelines.py
    yield zlitem
def work_parse(self, response):
    print(response.url)
    print(response.status)
    logger.info('url:' + response.url)
    logger.info('url:' + str(response.status))
    item = ZhilianItem()
    item['JobTitle'] = response.xpath(
        "//div[@class='fixed-inner-box']/div[1]/h1/text()").extract()[0]
    item['company'] = response.xpath(
        "//div[@class='fixed-inner-box']/div[1]/h2/a/text()").extract()[0]
    item['JobTag'] = response.xpath(
        "//div[@class='fixed-inner-box']/div[1]/div[1]/span/text()").extract()
    item['MonthSalanry'] = response.xpath(
        "//div[@class='terminalpage-left']/ul/li[1]/strong/text()").extract()[0]
    item['WorkPlace'] = response.xpath(
        "//div[@class='terminalpage-left']/ul/li[2]/strong/a/text()").extract()[0]
    item['ReleaseData'] = response.xpath(
        "//div[@class='terminalpage-left']/ul/li[3]/strong/span/text()").extract()[0]
    item['WorkNature'] = response.xpath(
        "//div[@class='terminalpage-left']/ul/li[4]/strong/text()").extract()[0]
    item['WorkExperience'] = response.xpath(
        "//div[@class='terminalpage-left']/ul/li[5]/strong/text()").extract()[0]
    item['MinDegree'] = response.xpath(
        "//div[@class='terminalpage-left']/ul/li[6]/strong/text()").extract()[0]
    item['RecruitingNumbers'] = response.xpath(
        "//div[@class='terminalpage-left']/ul/li[7]/strong/text()").extract()[0]
    item['JobCategory'] = response.xpath(
        "//div[@class='terminalpage-left']/ul/li[8]/strong/a/text()").extract()[0]
    yield item
def job_parse(self, response):
    item = ZhilianItem()
    item['link'] = response.meta['link']
    item['company'] = response.meta['company']
    description = {}
    description["payment"] = re.compile(u'职位月薪:')
    description["place"] = re.compile(u'工作地点:')
    description["date"] = re.compile(u'发布日期:')
    description["prop"] = re.compile(u'工作性质:')
    description["exp"] = re.compile(u'工作经验:')
    description["academic"] = re.compile(u'最低学历:')
    description["num"] = re.compile(u'招聘人数:')
    description["job_type"] = re.compile(u'职位类别:')
    prefix = "//div[@class='terminalpage clearfix']/div[@class='terminalpage-left']/"
    for li in response.xpath(prefix + "ul[@class='terminal-ul clearfix']/li"):
        text = self.encode(li.xpath('span/text()').extract())
        for desc in description:
            if description[desc].search(text.decode('utf-8')):
                text = ''
                for childnode in li.xpath('strong/descendant-or-self::node()'):
                    t = self.encode(childnode.xpath('text()').extract())
                    text += t
                item[desc] = text
                break
    text = ''
    xpath = prefix
    xpath += "div[@class='terminalpage-main clearfix']/div[@class='tab-cont-box']/div[@class='tab-inner-cont']"
    xpath += "/p/descendant-or-self::node()"
    for t in response.xpath(xpath):
        t = self.encode(t.xpath("text()").extract())
        if len(t.strip()) > 9:
            text += t + '\n '.encode('utf-8')
    item['description'] = text
    return item
def parse(self, response):
    totalCount = response.xpath(
        "/html/body/div[3]/div[3]/div[2]/span[1]/em/text()").extract()[0]
    # at most 100 result pages of 60 entries each
    pages = min(int(int(totalCount) / 60), 100)
    for result in response.xpath('//*[@id="newlist_list_content_table"]/table')[1:]:
        item = ZhilianItem()
        try:
            item['jobName'] = result.xpath('.//tr[1]/td[1]/div/a/text()').extract()[0]
        except Exception as e:
            item['jobName'] = ''
        try:
            item['jobUrl'] = result.xpath('.//tr[1]/td[1]/div/a/@href').extract()[0]
        except Exception as e:
            item['jobUrl'] = ''
        try:
            item['companyName'] = result.xpath('.//tr[1]/td[3]/a[1]/text()').extract()[0]
        except Exception as e:
            item['companyName'] = ''
        try:
            item['companyUrl'] = result.xpath('.//tr[1]/td[3]/a[1]/@href').extract()[0]
        except Exception as e:
            item['companyUrl'] = ''
        try:
            item['companySize'] = (result.xpath(
                u'.//tr[2]/td/div/div/ul/li[1]//span[contains(text(),"公司规模")]/text()'
            ).extract()[0]).replace(u'公司规模:', '')
        except Exception as e:
            item['companySize'] = ''
        try:
            item['companyNature'] = (result.xpath(
                u'.//tr[2]/td/div/div/ul/li[1]//span[contains(text(),"公司性质")]/text()'
            ).extract()[0]).replace(u'公司性质:', '')
        except Exception as e:
            item['companyNature'] = ''
        try:
            item['salary'] = result.xpath('.//tr[1]/td[4]/text()').extract()[0]
            salaryList = re.split('-', item['salary'])
            item['minSalary'] = salaryList[0]
            item['maxSalary'] = salaryList[1]
        except Exception as e:
            item['salary'] = ''
            item['minSalary'] = ''
            item['maxSalary'] = ''
        try:
            item['location'] = result.xpath('.//tr[1]/td[5]/text()').extract()[0]
        except Exception as e:
            item['location'] = ''
        try:
            item['deatil'] = result.xpath('.//tr[2]/td/div/div/ul/li[2]/text()').extract()[0]
        except Exception as e:
            item['deatil'] = ''
        try:
            item['edu'] = (result.xpath(
                u'.//tr[2]/td/div/div/ul/li[1]//span[contains(text(),"学历")]/text()'
            ).extract()[0]).replace(u'学历:', '')
        except Exception as e:
            item['edu'] = ''
        try:
            item['workYear'] = (result.xpath(
                u'.//tr[2]/td/div/div/ul/li[1]//span[contains(text(),"经验")]/text()'
            ).extract()[0]).replace(u'经验:', '')
        except Exception as e:
            item['workYear'] = ''
        yield item
    p = int(response.meta.get('p')) + 1
    if p <= pages:
        yield scrapy.Request(self.url + str(p), method='GET', meta={'p': p},
                             headers=self.headers, callback=self.parse)
def parse_item(self, response):
    # campus listings can respond with a 301 redirect
    if response.status == 301:
        newurl = response.headers['Location'].decode('utf8')
        yield scrapy.Request(
            newurl,
            callback=self.parse_item,
            dont_filter=True,
            priority=3,
            meta={'post_time': response.meta['post_time']})
    # page parsing
    elif response.status == 200:
        try:
            selector = Selector(response)
            link = response.url
            place = 'Null'
            if '//xiaoyuan' in link:
                # parse a campus-recruitment detail page
                job_name = selector.xpath(
                    '//div[@class="cJobDetailInforWrap"]/h1/text()').extract_first()
                if job_name:
                    _id = ''.join(re.findall('com/job/(.*)', link))
                    job_data = selector.xpath(
                        '//ul[@class="cJobDetailInforBotWrap clearfix c3"]/li[@class="cJobDetailInforWd2 marb"]/text()'
                    ).extract()
                    place, job_kind, job_number, post_time, job_nature, education = \
                        job_data[0], job_data[1], job_data[2], job_data[3], job_data[4], job_data[5]
                    place = place.split('-')[0]
                    job_content = selector.xpath(
                        '//div[@class="cJob_Detail f14"]/p//text()').extract()
                    company_name = selector.xpath(
                        '//li[@id="jobCompany"]/a/text()').extract_first()
                    company_industry = selector.xpath(
                        '//ul[@class="cJobDetailInforTopWrap clearfix c3"]/li[4]/@title').extract_first()
                    company_size = selector.xpath(
                        '//ul[@class="cJobDetailInforTopWrap clearfix c3"]/li[6]/text()').extract_first()
                    company_nature = selector.xpath(
                        '//ul[@class="cJobDetailInforTopWrap clearfix c3"]/li[8]/text()').extract_first()
                    job_place = selector.xpath(
                        '//div[@class="clearfix p20"]/p[@class="c9"]/text()').extract_first()
                    company_address = selector.xpath(
                        '//div[@class="clearfix p20"]/p[@class="c9"]/text()').extract_first()
                    company_homepage = selector.xpath(
                        '//div[@class="clearfix p20"]/p[@class="c9 mt5"]/a/@href').extract_first()
                    item = XiaozhaoItem()
                else:
                    return
            else:
                # parse a regular (social) job detail page
                job_name = selector.xpath(
                    '//h3[@class="summary-plane__title"]/text()').extract_first()
                if job_name:
                    _id = ''.join(re.findall('com/(.*?).htm', link))
                    salary = selector.xpath(
                        '//span[@class="summary-plane__salary"]/text()').extract_first()
                    place = selector.xpath(
                        '//ul[@class="summary-plane__info"]/li[1]/a/text()').extract_first()
                    experience = selector.xpath(
                        '//ul[@class="summary-plane__info"]/li[2]/text()').extract_first()
                    education = selector.xpath(
                        '//ul[@class="summary-plane__info"]/li[3]/text()').extract_first()
                    job_number = selector.xpath(
                        '//ul[@class="summary-plane__info"]/li[4]/text()').extract_first()
                    job_kind = selector.xpath(
                        '//span[@class="pos-name"]/a/text()').extract_first()  # miss
                    job_content = selector.xpath(
                        '//div[@class="describtion__detail-content"]//text()').extract()
                    job_place = selector.xpath(
                        '//span[@class="job-address__content-text"]/text()').extract_first()
                    company_name = selector.xpath(
                        '//a[@class="company__title"]/text()').extract_first()
                    advantage = selector.xpath(
                        '//div[@class="highlights__content"]//text()').extract()
                    company_industry = selector.xpath(
                        '//button[@class="company__industry"]//text()').extract_first()
                    company_nature = selector.xpath(
                        '//ul[@class="promulgator-ul cl"]/li[2]/strong/text()').extract_first()  # miss
                    company_size = selector.xpath(
                        '//button[@class="company__size"]/text()').extract_first()
                    company_homepage = selector.xpath(
                        '//a[@class="company__home-page"]/@href').extract_first()
                    company_address = selector.xpath(
                        '//ul[@class="promulgator-ul cl"]/li[5]/strong/text()').extract_first()  # miss
                    job_content = ''.join(job_content).strip()
                    post_time = response.meta['post_time']
                    post_time = '20' + post_time if post_time else 'NULL'
                    job_nature = "全职"
                    salary = salary_formater(redundancy_cleaner(salary, '元/月'))
                    advantage = '/'.join(advantage)
                    item = ZhilianItem()
                    item["salary"] = salary if salary else "NULL"
                    item["experience"] = experience if experience else "NULL"
                    item["advantage"] = advantage if advantage else "NULL"
                else:
                    return
            # clean the fields shared by both page types
            place = redundancy_cleaner(place)
            job_content = content_cleaner(job_content)
            job_name = redundancy_cleaner(job_name)
            job_place = redundancy_cleaner(job_place)
            job_number = get_num(job_number)
            item["_id"] = _id
            item["job_name"] = job_name if job_name else "NULL"
            item["link"] = link if link else "NULL"
            item["place"] = place if place else "NULL"
            item["post_time"] = post_time
            item["job_number"] = job_number if job_number else "NULL"
            item["education"] = education if education else "NULL"
            item["job_nature"] = job_nature if job_nature else "NULL"
            item["job_kind"] = job_kind if job_kind else "NULL"
            item["job_place"] = job_place if job_place else "NULL"
            item["job_content"] = job_content if job_content else "NULL"
            item["company_size"] = company_size if company_size else "NULL"
            item["company_nature"] = company_nature if company_nature else "NULL"
            item["company_industry"] = company_industry if company_industry else "NULL"
            item["company_name"] = company_name if company_name else "NULL"
            item["company_homepage"] = company_homepage if company_homepage else "NULL"
            item["company_address"] = company_address.strip() if company_address else "NULL"
            yield item
        # except ValueError as ve:
        #     log.msg("Error: " + _id + ' ' + str(ve), level=log.ERROR)
        except Exception as ex:
            log.msg("Error when parsing {0}".format(response.url), level=log.ERROR)
            raise ex
    else:
        log.msg("Page Not Found, reason: {0}".format(response.status), level=log.WARNING)
def filter_Career(self, response):
    url = response.url
    print(url)
    company_url = response.xpath('//div[@class="inner-left fl"]/h2/a/@href').extract_first()
    name = response.xpath('//div[@class="inner-left fl"]/h1/text()').extract_first()
    company = response.xpath('//div[@class="inner-left fl"]/h2/a/text()').extract_first()
    welfare = ','.join(response.xpath('//div[@class="welfare-tab-box"]/span/text()').extract())
    ul_xpath = '//ul[@class="terminal-ul clearfix"]/li'
    salary = response.xpath(ul_xpath + '/strong/text()').extract_first()
    location = response.xpath(ul_xpath + '/strong/a/text()').extract_first()
    job = response.xpath(ul_xpath + '/strong/a/text()').extract()[1]
    job_kind = response.xpath(ul_xpath + '[4]/strong/text()').extract_first()
    job_time = response.xpath(ul_xpath + '[5]/strong/text()').extract_first()
    xueli = response.xpath(ul_xpath + '[6]/strong/text()').extract_first()
    push_time = response.xpath('//*[@id="span4freshdate"]/text()').extract_first()
    people_num = response.xpath(ul_xpath + '[7]/strong/text()').extract_first()
    bs4 = BeautifulSoup(response.text, 'lxml')
    for terminalpage in bs4.find_all('div', class_='terminalpage-main clearfix'):
        for box in terminalpage.find_all('div', class_='tab-cont-box'):
            cont = box.find_all('div', class_='tab-inner-cont')[0]
            ms = cont.contents
            list = []
            for i in ms:
                try:
                    # keep only fragments that still contain text after splitting
                    if i.text.split() == []:
                        pass
                    else:
                        job_ms = ''.join(i.text.split())
                        list.append(job_ms)
                except:
                    pass
            jog_location = ''.join(list).split('工作地址:').pop()
            job_mss = ''.join(list).split('工作地址:')[0]
            if job_mss == '':
                job_ms = bs4.find('div', class_="tab-inner-cont")
                job_mss = ''.join(job_ms.text.split()).split('工作地址')[0]
    loader = ItemLoader(item=ZhilianItem(), response=response)
    loader.add_value('url', url)
    loader.add_value('name', name)
    loader.add_value('company', company)
    loader.add_value('company_url', company_url)
    loader.add_value('welfare', welfare)
    loader.add_value('salary', salary)
    loader.add_value('location', location)
    loader.add_value('job', job)
    loader.add_value('job_kind', job_kind)
    loader.add_value('job_time', job_time)
    loader.add_value('xueli', xueli)
    loader.add_value('push_time', push_time)
    loader.add_value('people_num', people_num)
    loader.add_value('jog_location', jog_location)
    loader.add_value('job_mss', job_mss)
    print(loader.load_item())
    yield loader.load_item()
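One caveat on the ItemLoader used above: add_value collects every field as a list, so load_item() yields list-valued fields unless an output processor collapses them. A minimal sketch, assuming a recent Scrapy where the processors live in the itemloaders dependency (older releases expose them under scrapy.loader.processors):

from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader

class ZhilianLoader(ItemLoader):
    # collapse each field's collected values to the first non-null one
    default_output_processor = TakeFirst()

# usage: loader = ZhilianLoader(item=ZhilianItem(), response=response)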