Example #1
    def parse_item(self, response):
        """
            已经正确获取到 岗位列表 的json返回,进行结构化处理
        :param response:
        :return:
        """
        text = response.text
        json_data = json.loads(text)
        results = json_data['content']['positionResult']['result']

        # 15 results per page; iterate over them
        for result in results:

            # Structure the data
            item = LagouItem()
            item['item_key'] = response.meta['item_key']

            item['positionName'] = result['positionName']
            item['workYear'] = result['workYear']
            item['education'] = result['education']
            item['jobNature'] = result['jobNature']
            item['createTime'] = result['createTime']
            item['salary'] = result['salary']
            item['city'] = result['city']
            item['financeStage'] = result['financeStage']
            item['industryField'] = result['industryField']
            item['companyFullName'] = result['companyFullName']
            item['district'] = result['district']
            item['positionAdvantage'] = result['positionAdvantage']
            # Hand the item to the pipeline for processing
            yield item
Example #2
 def parse(self, response):
     '''
     Scrape the job categories from the start URL.
     parse() is a spider method: when each start URL finishes downloading, the
     resulting Response object is passed to it as the only argument. The method
     parses the response data, extracts items, and generates Request objects for
     URLs that need further processing.
     :param response: HTML of the start page; each category URL is scraped from it
     :return: yields Requests for every page number of every category, handled by parse_url
     '''
     for item in response.xpath('//div[@class="menu_box"]/div/dl/dd/a'):
         # XPath expressions: extract each category's URL and name
         jobClass = item.xpath('text()').extract()[0]  # extract() turns the selector into a list; take its first string
         jobUrl = item.xpath("@href").extract_first()  # extract_first() returns the first match as a string
         oneItem = LagouItem()  # instantiate LagouItem() from items.py
         oneItem["jobClass"] = jobClass  # store the category name
         oneItem["jobUrl"] = jobUrl  # store the category URL
         # Each category has at most 30 pages; build a URL per page number for the next requests
         for i in range(self.page):
             jobUrl2 = jobUrl + str(i + 1)
             try:
                 yield scrapy.Request(url=jobUrl2,
                                      cookies=self.cookie,
                                      meta={'oneItem': oneItem},
                                      callback=self.parse_url)
             except:
                 pass
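
parse() above assumes the spider defines self.page and self.cookie. A minimal skeleton of such a spider is sketched below; the name, start URL, and attribute values are illustrative assumptions, not the project's real settings:

import scrapy

class LagouSpider(scrapy.Spider):
    name = 'lagou'
    start_urls = ['https://www.lagou.com/']
    page = 30    # assumed cap; the comment above says each category has at most 30 pages
    cookie = {}  # session cookies captured elsewhere (e.g. exported from a logged-in browser)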
Example #3
File: lagou.py Project: zy145/Lagou
 def parse(self, response):
     json_dic = json.loads(response.body)
     # print(json_dic)
     data_list = json_dic['content']['positionResult']['result']
     for data in data_list:
         item = LagouItem()  # fresh item per position so successive yields don't share state
         item['position'] = data['positionName']
         item['company'] = data['companyFullName']
         item['createTime'] = data['createTime']
         yield item
Example #4
 def parse_datas(self, response):
     """
     获取数据
     """
     # 转换为JSON数据
     datas = json.loads("[{}]".format(response.text))
     base_url = '/'.join(self.position_list_page.split('/')[0:-1])
     try:
         for positioninfo in datas[0]['content']['positionResult'][
                 'result']:
             loader = ItemLoader(item=LagouItem(), response=response)
             # 构造职位ID
             loader.add_value(
                 'id',
                 get_md5(
                     base_url +
                     '/{}.html'.format(str(positioninfo['positionId']))))
             # 职位详情页URL
             loader.add_value(
                 'positiondetailurl', base_url +
                 '/{}.html'.format(str(positioninfo['positionId'])))
             # 职位
             loader.add_value('positionname', positioninfo['positionName'])
             # 薪水
             loader.add_value('salary', positioninfo['salary'])
             loader.add_value('avg_salary', positioninfo['salary'])
             # 发布时间
             loader.add_value('publishtime', positioninfo['createTime'])
             # 工作经验
             loader.add_value('workyear', positioninfo['workYear'])
             # 学历
             loader.add_value('education', positioninfo['education'])
             # 类型
             loader.add_value('jobnature', positioninfo['jobNature'])
             # 职位诱惑
             loader.add_value('positionadvantage',
                              positioninfo['positionAdvantage'])
             # 工作城市
             loader.add_value('city', positioninfo['city'])
             # 招聘公司
             loader.add_value('companyfullName',
                              positioninfo['companyFullName'])
             # 公司详情页URL
             loader.add_value(
                 'companyurl',
                 'https://www.lagou.com/gongsi/{}.html'.format(
                     str(positioninfo['companyId'])))
             yield loader.load_item()
         self.crawler.stats.inc_value("Success_Reqeust")
     except Exception as e:
         self.logger.debug("GET ERROR: {}".format(e))
         self.crawler.stats.inc_value('Failed_Request')
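
get_md5() is a project helper that is not shown in this snippet. A plausible minimal version, assuming it simply hashes the URL into a fixed-length key (the signature is an assumption):

import hashlib

def get_md5(url):
    # Hash the URL so it can serve as a fixed-length unique ID
    if isinstance(url, str):
        url = url.encode('utf-8')
    return hashlib.md5(url).hexdigest()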
Example #5
    def parse(self, response):
        for menu_sub in response.xpath("//div[@class='menu_sub dn']/dl"):
            job_classify = menu_sub.xpath("./dt/span/text()").extract()[0]
            for job in menu_sub.xpath("./dd/a"):
                job_name = job.xpath("./text()").extract()[0]
                job_url = job.xpath("./@href").extract()[0] + "1/"

                item = LagouItem()
                item['job_classify'] = job_classify
                item['job_name'] = job_name
                item['job_url'] = job_url
                # yield item
                yield scrapy.Request(job_url, cookies=self.cookie, meta={'item': item}, callback=self.parse_url)
Example #6
    def parse(self, response):
        # print(response.text)

        result = json.loads(response.body)

        jobs = result['content']['positionResult']['result']

        for job in jobs:
            item = LagouItem()

            item['city'] = job['city']
            item['companyName'] = job['companyFullName']
            item['companyShortName'] = job['companyShortName']
            item['companySize'] = job['companySize']
            item['district'] = job['district']
            item['education'] = job['education']
            item['jobNature'] = job['jobNature']
            item['positionName'] = job['positionName']
            # salary arrives as a string such as "10k-20k"; strip the trailing "k"
            sal = job['salary'].split('-')
            if len(sal) == 1:
                item['salaryMax'] = int(sal[0][:sal[0].find('k')])
            else:
                item['salaryMax'] = int(sal[1][:sal[1].find('k')])
            item['salaryMin'] = int(sal[0][:sal[0].find('k')])
            item['salaryAvg'] = (item['salaryMin'] + item['salaryMax']) / 2

            item['positionAdvantage'] = job['positionAdvantage']

            yield item

        if self.curpage <= self.total_page:
            self.curpage += 1
            # start_urls is a list; the request URL must be a string, so take the first entry
            yield scrapy.http.FormRequest(self.start_urls[0],
                                          method='POST',
                                          headers=self.headers,
                                          cookies=self.my_cookies,
                                          formdata={
                                              'first': 'true',
                                              'pn': str(self.curpage),
                                              'kd': self.kd
                                          },
                                          callback=self.parse)
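
The slice-on-find('k') approach above breaks on values such as "15k以上" or uppercase "15K". A more forgiving helper (hypothetical, not part of the original spider) can pull the numbers out with a regex:

import re

def parse_salary(salary):
    # Extract all numbers from strings like '10k-20k', '15k以上', '10K-20K'
    nums = [int(n) for n in re.findall(r'\d+', salary)]
    if not nums:
        return None, None, None
    lo, hi = nums[0], nums[-1]
    return lo, hi, (lo + hi) / 2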
Example #7
    def parse(self, response):
        # print(response.text)
        divs = response.xpath('//*[@id="s_position_list"]/ul/li/div[1]')
        for div in divs:
            item = LagouItem()  # fresh item per listing so successive yields don't share state
            title = div.xpath('./div[1]/div[1]/a/h3/text()').extract_first()
            location = div.xpath(
                './div[1]/div[1]/a/span/em/text()').extract_first()
            salary = div.xpath(
                './div[1]/div[2]/div/span/text()').extract_first()
            company = div.xpath('./div[2]/div[1]/a/text()').extract_first()
            fintance = div.xpath('./div[2]/div[2]/text()').extract_first()  # 'fintance' matches the (misspelled) LagouItem field name

            item['title'] = title.strip()
            item['location'] = location.strip()
            item['salary'] = salary.strip()
            item['company'] = company.strip()
            item['fintance'] = fintance.strip()

            yield item
Example #8
 def parse(self, response):
     # json.loads() converts the JSON string into a dict; values are then looked up by key
     jsonBody = json.loads(response.body.decode())
     # jsonBody has the shape: {'':'' , '':'' , '':'' , '':{'':'',  ......}}
     results = jsonBody['content']['positionResult']['result']
     items = []
     for result in results:
         item = LagouItem()
         item['positionName'] = result['positionName']  # position name
         item['month_salary'] = result['salary']  # salary
         item['companyName'] = result['companyFullName']  # company name
         item['companyField'] = result['industryField']  # company industry
         item['companySize'] = result['companySize']  # company size
         item['city'] = result['city']  # work location
         item['experience'] = result['workYear']  # experience requirement
         item['qualification'] = result['education']  # education requirement
         item['full_or_parttime'] = result['jobNature']  # full-time / part-time
         item['detailLink'] = "https://www.lagou.com/jobs/" + str(
             result['positionId']) + ".html"  # position detail page link
         items.append(item)
     return items  # returning a list also works: Scrapy accepts any iterable of items from a callback
Example #9
 def parse_url(self, response):
     '''
     From the pages requested in parse(), scrape each job-posting URL on every page of every category.
     :param response:
     :return: yields a Request per posting URL (jobDetailUrl), handled by parse_Details()
     '''
     for sel2 in response.xpath('//ul[@class="item_con_list"]/li'):
         # XPath expressions: extract each posting's detail URL and release time
         jobDetailUrl = sel2.xpath(
             'div[1]/div[1]/div[1]/a/@href').extract_first()
         jobReleaseTime = sel2.xpath(
             'div[1]/div[1]/div[1]/span/text()').extract()[0]
         urlItem = LagouItem()  # instantiate LagouItem() from items.py
         urlItem["jobDetailUrl"] = jobDetailUrl  # URL of the position detail page
         urlItem["jobReleaseTime"] = jobReleaseTime  # release time of the posting
         urlItem["jobClass"] = response.meta['oneItem']['jobClass']
         '''
         URL deduplication: each posting's jobDetailUrl is unique.
         url_check() looks the scraped jobDetailUrl up in the database via SQL.
         If the URL already exists it returns True and the posting is skipped;
         if it does not exist it returns False and a scrapy.Request is yielded.
         '''
         # try:                           # deduplication disabled
         #     yield scrapy.Request(url=jobDetailUrl, cookies=self.cookie, meta={'urlItem': urlItem},
         #                          callback=self.parse_Details)
         # except:
         #     pass
         code = url_check(jobDetailUrl)  # deduplication enabled
         if not code:
             print("Position not seen before, requesting it!")
             try:
                 yield scrapy.Request(url=jobDetailUrl,
                                      cookies=self.cookie,
                                      meta={'urlItem': urlItem},
                                      callback=self.parse_Details)
             except:
                 pass
         else:
             print("Position already stored, skipping!")
             continue  # skip only this posting; the original bare return aborted the rest of the page
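
url_check() is defined elsewhere in the project; the snippet only shows how it is called. A minimal sketch of such a lookup, assuming a SQLite table named seen_urls (the table, schema, and db path are all illustrative assumptions):

import sqlite3

def url_check(url, db_path='lagou.db'):
    # Return True if the URL was already stored, else record it and return False
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("CREATE TABLE IF NOT EXISTS seen_urls (url TEXT PRIMARY KEY)")
        if conn.execute("SELECT 1 FROM seen_urls WHERE url = ?", (url,)).fetchone():
            return True
        conn.execute("INSERT INTO seen_urls (url) VALUES (?)", (url,))
        conn.commit()
        return False
    finally:
        conn.close()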
Example #10
    def parse_url(self, response):
        """
        解析每个工作类的url
        """
        item_base = response.meta['item']
        job_classify = item_base['job_classify']
        job_name = item_base['job_name']
        job_url = item_base['job_url']
        node_list = response.xpath("//ul[@class='item_con_list']/li[@class='con_list_item default_list']")
        for node in node_list:
            item = LagouItem()

            item['job_classify'] = job_classify
            item['job_name'] = job_name
            item['job_url'] = job_url
            item['company'] = \
                node.xpath("./div[@class='list_item_top']/div/div[@class='company_name']/a/text()").extract()[0]
            item['company_scale'] = str.strip(
                node.xpath("./div[@class='list_item_top']/div/div[@class='industry']/text()").extract()[0])
            item['position'] = \
                node.xpath("./div[@class='list_item_top']/div/div[@class='p_top']/a/h3/text()").extract()[0]
            item['address'] = \
                node.xpath("./div[@class='list_item_top']/div/div[@class='p_top']/a/span/em/text()").extract()[0]
            item['salary'] = \
                node.xpath("./div[@class='list_item_top']/div/div[@class='p_bot']/div/span/text()").extract()[0]
            item['experience'] = str.strip(
                node.xpath("./div[@class='list_item_top']/div/div[@class='p_bot']/div/text()[3]").extract()[0])
            item['work'] = node.xpath("./div[@class='list_item_bot']/div[@class='li_b_l']/span/text()").extract()

            yield item

        # Pagination: follow the "next page" (下一页) link
        next_url = response.xpath(
            "//div[@class='pager_container']/a[@class='page_no' and text()='下一页']/@href").extract()
        if next_url:
            # yield scrapy.Request(response.urljoin(next_url[0] + "?filterOption=3"), self.parse)
            # yield scrapy.Request(next_url[0], cookies=self.cookie, callback=self.parse)
            item_base['job_url'] = next_url[0]
            yield scrapy.Request(next_url[0], cookies=self.cookie, meta={'item': item_base}, callback=self.parse_url)
Example #11
 def parse(self, response):
     # chrome = webdriver.Chrome
     # datas = response.xpath('//ul[@class="item_con_list"]/li')
     # for data in datas:
     #     item = LagouItem()
     #     item['title'] = data.xpath('//div//div/a/h3[1]/text()').get()
     #     yield item
     soup = bs4.BeautifulSoup(response.text, features='lxml')
     for i in soup.find_all('a', class_='position_link'):
         link = i['href']
         # Warm up a plain requests session to collect fresh cookies, then
         # re-fetch the detail page with them (note: blocking requests calls
         # inside a Scrapy callback stall the reactor)
         s = requests.Session()
         s.get(link, headers=self.headers, timeout=3)
         cookie = s.cookies
         res = requests.get(link,
                            headers=self.headers,
                            cookies=cookie,
                            timeout=5)
         bs = bs4.BeautifulSoup(res.text, 'lxml')
         # print(res.text)
         positionname = bs.find(class_='name').string
         salary = bs.select(
             '.job_request h3 span:nth-child(1)')[0].get_text()
         location = bs.select(
             '.job_request h3 span:nth-child(2)')[0].get_text()
         experience = bs.select(
             '.job_request h3 span:nth-child(3)')[0].get_text()
         education = bs.select(
             '.job_request h3 span:nth-child(4)')[0].get_text()
         detail = bs.select('.job-detail')[0].get_text()
         item = LagouItem()
         item['positionname'] = positionname
         item['salary'] = salary
         item['location'] = location
         item['experience'] = experience
         item['education'] = education
         item['detail'] = detail
         yield item
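
The requests calls above block Scrapy's event loop while each detail page downloads. A non-blocking sketch of the same fan-out, letting Scrapy schedule the detail pages itself (parse_detail is an assumed callback, not shown in the original):

 def parse(self, response):
     # Yield a Request per detail link instead of fetching synchronously
     for link in response.css('a.position_link::attr(href)').getall():
         yield scrapy.Request(link, headers=self.headers,
                              callback=self.parse_detail)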
Example #12
File: lagou.py Project: XHuskie/crawl
 def parse_lagou(self, response):
     text = json.loads(response.text)
     print(text)
     res = []
     try:
         res = text["content"]["positionResult"]["result"]
     except:
         pass
     if len(res) > 0:
         for position in res:
             item = LagouItem()  # fresh item per position so successive yields don't share state
             try:
                 item['title'] = position['positionName']
                 item['education'] = position['education']
                 item['company'] = position['companyFullName']
                 item['experience'] = position['workYear']
                 item['location'] = position['city']
                 item['salary'] = position['salary']
                 print(item)
             except:
                 pass
             yield item
         self.page += 1
         url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
         formdata = {
             'first': 'false',
             'kd': str(self.kd),
             'pn': str(self.page),
             'city': str(self.ct)
         }
         print('===========================', formdata)
         yield scrapy.FormRequest(url,
                                  callback=self.parse_lagou,
                                  formdata=formdata,
                                  headers=self.headers)
     else:
         print('Crawl finished!')
Example #13
    def parse_item(self, response):
        pn = response.meta['pn']
        print('Parsing JSON API page %s' % pn)

        print(response.text)
        data_list = json.loads(response.text)
        for position in data_list['content']['positionResult']['result']:
            loader = LagouItemLoader(item=LagouItem(), response=response)

            loader.add_value('position_name', position['positionName'])
            loader.add_value('exp_lvl', position['workYear'])
            loader.add_value('edu_lvl', position['education'])
            loader.add_value('position_type', position['jobNature'])
            loader.add_value('position_id', str(position['positionId']))
            loader.add_value(
                'position_url', 'https://www.lagou.com/jobs/' + str(position['positionId']) + '.html')
            loader.add_value('finance_stage', position['financeStage'])
            loader.add_value('industry_field', position['industryField'])
            loader.add_value('company_name', position['companyFullName'])
            loader.add_value('work_city', position['city'])
            loader.add_value('salary', position['salary'])
            loader.add_value('position_advantage',
                             position['positionAdvantage'])
            loader.add_value('publish_date', position['createTime'])
            if position['companyLabelList']:
                loader.add_value('company_attr', position['companyLabelList'])
            else:
                loader.add_value('company_attr', 'None')
            if position['skillLables']:
                loader.add_value('skill_label', position['skillLables'])
            else:
                loader.add_value('skill_label', 'None')

            item = loader.load_item()
            # print(item)
            # break
            yield item
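
LagouItemLoader is defined elsewhere in the project. A minimal definition along these lines would make add_value()/load_item() return scalar fields instead of lists; the TakeFirst choice is an assumption about the project's intent:

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

class LagouItemLoader(ItemLoader):
    # Collapse each field's collected values to the first one,
    # so load_item() yields scalars rather than lists
    default_output_processor = TakeFirst()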
Example #14
 def parse_job(self, response):
     item = LagouItem()
     item['title'] = response.xpath(
         '//div[@class="job-name"]/@title').extract_first()
     item['url'] = response.url
     item['url_object_id'] = get_md5(response.url)
     item['publish_time'] = response.xpath(
         "//*[@class='publish_time']/text()").extract_first()
     item['salary'] = response.xpath(
         '//dd[@class="job_request"]/p/span[@class="salary"]/text()'
     ).extract_first()
     item['job_city'] = response.xpath(
         "//*[@class='job_request']/p/span[2]/text()").extract_first()[1:-1]
     item['work_years'] = response.xpath(
         "//*[@class='job_request']/p/span[3]/text()").extract_first()[:-1]
     item['degree_need'] = response.xpath(
         "//*[@class='job_request']/p/span[4]/text()").extract_first()[:-1]
     item['job_type'] = response.xpath(
         "//*[@class='job_request']/p/span[5]/text()").extract_first()
     item['job_advantage'] = response.xpath(
         "//dd[@class='job-advantage']/p/text()").extract_first()
     job_desc = response.xpath(
         '//dd[@class="job_bt"]/div/p/text()').extract()
     item['job_desc'] = "".join(job_desc)
     job_addr = response.xpath(
         '//div[@class="work_addr"]/a/text()').extract()[:-1]
     item['job_addr'] = "-".join(job_addr)
     item['company_name'] = response.xpath(
         '//dl[@class="job_company"]/dt/a/img/@alt').extract_first()
     item['company_url'] = response.xpath(
         "//dl[@class='job_company']//a[@rel='nofollow']/@href"
     ).extract_first()
     tags = response.xpath(
         '//*[@class="job_request"]/ul/li/text()').extract()
     item['tags'] = "/".join(tags)
     item['crawl_time'] = datetime.datetime.now()
     yield item
Example #15
    def parse(self, response):
        job = LagouItem()

        soup = BeautifulSoup(response.body, 'lxml')
        position_head_soup = soup.find('div', attrs={'class': 'position-head'})
        job['ID'] = soup.find('input', attrs={
            'class': 'erma_codep'
        }).get('value')
        job['companyId'] = position_head_soup.get('data-companyid')
        # get_text() returns the text inside the tag, e.g. name = "C软件工程师(西安)"
        name = position_head_soup.find('span', attrs={
            'class': 'name'
        }).get_text()
        job['name'] = name
        container_soup = soup.find('div',
                                   attrs={
                                       'class': 'container clearfix',
                                       'id': 'container'
                                   })
        # Get the job's advantages
        job_advantage_soup = container_soup.find(
            'dd', attrs={'class': 'job-advantage'})
        job_advantage_des = job_advantage_soup.find('p').get_text()
        job['advantages'] = job_advantage_des

        job_description_soup = container_soup.find('dd',
                                                   attrs={'class': 'job_bt'})
        requirements = []
        requirements_soup = job_description_soup.find_all('p')
        for requirement in requirements_soup:
            requirements.append(requirement.get_text().replace(' ',
                                                               '').replace(
                                                                   '\xa0', ''))
        job['description'] = requirements

        work_addr = container_soup.find('div', attrs={
            'class': 'work_addr'
        }).get_text()
        # Strip spaces and newlines
        work_addr = work_addr.replace(' ', '').replace('\n', '')
        job['workLocation'] = work_addr

        # Collect the company info
        company = {}
        company_soup = soup.find('div', attrs={
            'class': 'content_r'
        }).find('ul').find_all('li')
        for info in company_soup:
            company[info.find('span').get_text()] = info.get_text().replace(
                '\n', '').replace(' ', '')
        company_name = position_head_soup.find('div',
                                               attrs={
                                                   'class': 'company'
                                               }).get_text()
        company['公司名称'] = company_name  # Chinese key kept to match the labels scraped from the page
        job['company'] = company

        job_briefs_soup = soup.find('dd', attrs={'class': 'job_request'})
        job_brief_soup = job_briefs_soup.find_all('span')
        job['salary'] = job_brief_soup[0].get_text().replace('/', '')
        job['location'] = job_brief_soup[1].get_text().replace('/', '')
        job['experience'] = job_brief_soup[2].get_text().replace('/', '')
        job['background'] = job_brief_soup[3].get_text().replace('/', '')
        job['isFulltime'] = job_brief_soup[4].get_text().replace('/', '')

        labels = []
        labels_soup = job_briefs_soup.find('ul',
                                           attrs={
                                               'class':
                                               'position-label clearfix'
                                           }).find_all('li')
        for label in labels_soup:
            labels.append(label.get_text())
        job['labels'] = labels

        idFollows = self.getNextID(soup)
        r = redis.Redis(host='127.0.0.1', port=6379)  # one connection reused for all pushes
        for ID in idFollows:
            url = "https://www.lagou.com/jobs/%s.html" % ID
            r.lpush('lagou:start_urls', url)

        yield job
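
Pushing follow-up URLs onto the 'lagou:start_urls' Redis list suggests this spider is driven by scrapy-redis. A minimal consumer, assuming the scrapy_redis package is installed (the class body here is illustrative):

from scrapy_redis.spiders import RedisSpider

class LagouRedisSpider(RedisSpider):
    # scrapy-redis pops start URLs from the same list that
    # parse() above pushes the next job IDs into
    name = 'lagou'
    redis_key = 'lagou:start_urls'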
Example #16
 def parse_Details(self, response):
     '''
     Scrape the required fields from the position detail page.
     :param response:
     :return: Item
     '''
     for detail in response.xpath('/html/body'):
         #//*[@id="job_detail"]
         # Extract each field with scrapy.selector XPath / CSS expressions
         jobDivision = detail.css(
             '.job-name .company::text').extract_first()  # company name + department
         jobName = detail.css('.job-name span::text').extract()[0]  # position name
         jobMoney = detail.css(".job_request .salary ::text").extract()[0]  # salary
         pattern_salary = re.compile(r'\d+')  # regex matching digits
         salary_left = pattern_salary.findall(jobMoney)[0]  # lower salary bound
         salary_right = pattern_salary.findall(jobMoney)[1]  # upper salary bound
         jobReleaseTime = response.meta['urlItem']['jobReleaseTime']  # release time
         jobReleaseTime = transport_time(jobReleaseTime)  # convert to yyyy-mm-dd
         jobPlace = detail.xpath('div[2]/div/div[1]/dd/p[1]/span[2]/text()'
                                 ).extract()[0]  # city
         jobPlace = replace_splash(jobPlace)  # strip the '/'
         jobNeed = detail.xpath('div[2]/div/div[1]/dd/p[1]/span[3]/text()'
                                ).extract()[0]  # experience requirement, e.g. "经验不限"
         jobNeed = replace_splash(jobNeed)  # strip the '/'
         jobEducation = detail.xpath(
             'div[2]/div/div[1]/dd/p[1]/span[4]/text()').extract()[0]  # education, e.g. "本科及以上"
         jobEducation = replace_splash(jobEducation)  # strip the '/'
         jobType = detail.xpath('div[2]/div/div[1]/dd/p[1]/span[5]/text()'
                                ).extract()[0]  # full-time / part-time
         jobLabel = detail.css('.position-label li::text').extract()[0]  # position label
         jobSpeak = detail.xpath(
             '//*[@id="job_detail"]/dd[1]/p/text()').extract()[0]  # position advantage
         city = detail.xpath(
             '//*[@id="job_detail"]/dd[3]/div[1]/a[1]/text()'
         ).extract_first()  # city, e.g. 上海
         Area = detail.xpath(
             '//*[@id="job_detail"]/dd[3]/div[1]/a[2]/text()'
         ).extract_first()  # district, e.g. 浦东新区
         address = detail.xpath(
             '//*[@id="job_detail"]/dd[3]/div[1]/text()').extract()  # address
         address = address[3]  # take index 3 of the extracted list
         address = replace_spalsh2(address)  # remove the ' - ' inside the address
         jobCompany = detail.css('.job_company img::attr(alt)').extract()[0]  # company name
         CompanyUrl = detail.css('.job_company a::attr(href)').extract()[0]  # company URL
         CompanyDomain = detail.xpath(
             '//*[@id="job_company"]/dd/ul/li[1]/text()').extract()[1]  # company field
         DevelopmentStage = detail.xpath(
             '//*[@id="job_company"]/dd/ul/li[2]/text()').extract()[1]  # funding stage
         # Instantiate LagouItem() from items.py
         Item = LagouItem()
         Item["jobDivision"] = jobDivision
         Item["jobName"] = jobName
         Item["jobMoney"] = jobMoney
         Item["salary_left"] = salary_left
         Item["salary_right"] = salary_right
         Item["jobReleaseTime"] = jobReleaseTime
         Item["jobPlace"] = jobPlace
         Item["jobNeed"] = jobNeed
         Item["jobEducation"] = jobEducation
         Item["jobType"] = jobType
         Item["jobLabel"] = jobLabel
         Item["jobSpeak"] = jobSpeak
         Item["city"] = city
         Item["Area"] = Area
         Item["address"] = address
         Item["jobCompany"] = jobCompany
         Item["CompanyUrl"] = CompanyUrl
         Item["jobDetailUrl"] = response.meta['urlItem']['jobDetailUrl']
         Item["jobClass"] = response.meta['urlItem']['jobClass']
         Item["CompanyDomain"] = CompanyDomain
         Item["DevelopmentStage"] = DevelopmentStage
         print(Item)
         return Item
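
transport_time(), replace_splash() and replace_spalsh2() are project helpers not shown in the snippet. Plausible minimal versions, assuming fields look like " 上海 /", addresses use " - " separators, and release times come as "HH:MM发布" (today) or "N天前发布" (N days ago); all of these formats are assumptions about the source data:

import datetime
import re

def replace_splash(text):
    # Drop the '/' separator and surrounding whitespace, e.g. ' 上海 /' -> '上海'
    return text.replace('/', '').strip()

def replace_spalsh2(text):
    # Name kept as in the spider: remove the ' - ' between address segments
    return text.replace(' - ', '').strip()

def transport_time(release_time):
    # Normalize a Lagou release time to yyyy-mm-dd
    today = datetime.date.today()
    match = re.search(r'(\d+)天前', release_time)
    if match:
        return str(today - datetime.timedelta(days=int(match.group(1))))
    return str(today)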