Example #1
    def parse(self, response):

        next_page_url = response.xpath('//li[@class="bk"][2]/a/@href')
        job_list = response.xpath('//*[@id="resultList"]/div[@class="el"]')
        # print(job_list.extract())
        for each_job in job_list:
            job_info = JobsItem()
            job_info['job_title'] = each_job.xpath(
                './/p[contains(@class,"t1")]/span/a/text()')
            job_info['company'] = each_job.xpath(
                './/span[contains(@class,"t2")]/a/text()')
            job_info['job_href'] = each_job.xpath(
                './/span[contains(@class,"t2")]/a/@href')
            job_info['location'] = each_job.xpath(
                './/span[contains(@class,"t3")]/text()')
            job_info['salary'] = each_job.xpath(
                './/span[contains(@class,"t4")]/text()')
            job_info['post_date'] = each_job.xpath(
                './/span[contains(@class,"t5")]/text()')  # mm-dd

            for k, v in job_info.items():
                if v:
                    job_info[k] = v.extract_first().strip()
                else:
                    job_info[k] = 'unknown'
            # print(job_info)
            yield job_info
        if next_page_url:  # a SelectorList; an XPath miss yields an empty list, never None
            abs_url = next_page_url.extract_first().strip()
            print('*' * 30)
            # time.sleep(1)
            yield response.follow(abs_url, callback=self.parse)
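Every snippet on this page fills a JobsItem without showing its definition. A minimal sketch of what the class could look like, assuming the field names used in Example #1 (the real projects likely declare more fields):

import scrapy

class JobsItem(scrapy.Item):
    # hypothetical field set, inferred from Example #1
    job_title = scrapy.Field()
    company = scrapy.Field()
    job_href = scrapy.Field()
    location = scrapy.Field()
    salary = scrapy.Field()
    post_date = scrapy.Field()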
Example #2
 def parse_totalUrl(self, response):
     item = JobsItem()
     headers = {
         'Content-Type':
         'application/x-www-form-urlencoded; charset=UTF-8',
         'Accept-Encoding':
         'gzip, deflate',
         'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
         "Referer":
         "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
     }
     res = json.loads(response.body)
     if res.get("success"):
         if res.get('content').get('positionResult').get('resultSize') != 0:
             results = res.get('content').get('positionResult').get(
                 'result')
             for result in results:
                 item['url'] = "https://www.lagou.com/jobs/{}.html".format(
                     result['positionId'])
                 item['pubdate'] = result['createTime']
                 item['company'] = result['companyFullName']
                 item['longitude'] = result['longitude']
                 item['latitude'] = result['latitude']
                 item['city'] = result['city']
                 item['jobtitle'] = result['positionName']
                 item['workyear'] = result['workYear']
                 item['salary'] = result['salary']
                 yield Request(url=item['url'],
                               meta={'item_1': item},
                               headers=headers,
                               callback=self.parse_detail)
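parse_totalUrl above consumes the JSON returned by Lagou's search API but does not show the request that triggers it. A minimal sketch, under the assumption that the data comes from Lagou's positionAjax.json endpoint with pn (page) and kd (keyword) form fields:

import scrapy

class LagouSketchSpider(scrapy.Spider):
    name = 'lagou_sketch'

    def start_requests(self):
        # assumed endpoint and form fields; Lagou is also picky about Referer and cookies
        yield scrapy.FormRequest(
            url='https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false',
            formdata={'first': 'true', 'pn': '1', 'kd': 'python'},
            headers={'Referer': 'https://www.lagou.com/jobs/list_python'},
            callback=self.parse_totalUrl)

    def parse_totalUrl(self, response):
        ...  # Example #2's method body goes here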
Example #3
 def mycontent(self, response):
     item = JobsItem()
     need_list = []
     content_list = []
     item['job'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/h1/text()').extract_first()
     item['money'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/strong/text()').extract_first()
     item['addr'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/span/text()').extract_first()
     item['company'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a/@title').extract_first()
     needs = response.xpath('/html/body/div[3]/div[2]/div[3]/div[1]/div/div/span')
     for need in needs:
         need_list.append(need.xpath('text()').extract_first())
     item['needs'] = need_list
     contents = response.xpath('/html/body/div[3]/div[2]/div[3]/div[2]/div/p')
     for content in contents:
         content1 = content.xpath('text()').extract_first()
         if not content1:
             content1 = content.xpath('span/text()').extract_first()
             if not content1:
                 content1 = '暂无信息'  # placeholder: "no information available"
         content_list.append(content1)
     item['content'] = content_list
     data = response.xpath('/html/body/div[3]/div[2]/div[3]/div[3]/div/p')
     # The <p> tags here contain nested tags, so text() would miss part of the text; string(.) grabs the whole subtree
     work_addr = data.xpath('string(.)').extract_first()
     # strip all \t, \n and \r with a regex; the r prefix marks a raw (unescaped) string
     item['work_addr'] = re.sub(r'\t|\n|\r', '', work_addr)
     yield item
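The string(.) trick in the comment above is easy to verify in isolation: text() returns only the direct text children of a node, while string(.) concatenates the text of the whole subtree. A quick standalone check on made-up markup:

from scrapy import Selector

sel = Selector(text='<p>Address: <span>Chengdu</span>, floor 3</p>')
print(sel.xpath('//p/text()').extract())         # ['Address: ', ', floor 3'] - the nested <span> text is lost
print(sel.xpath('string(//p)').extract_first())  # 'Address: Chengdu, floor 3'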
Example #4
    def parse(self, response):
        jdict = json.loads(response.body)
        jcontent = jdict['content']
        jposresult = jcontent['positionResult']
        jresult = jposresult['result']
        # integer page count; '//' avoids a float page index under Python 3
        self.totalPageCount = int(jposresult['totalCount']) // int(
            jcontent['pageSize']) + 1
        for entry in jresult:
            if len(entry) < 10:
                continue
            item = JobsItem()
            item['pid'] = str(entry['positionId']) + "_" + str(
                entry['publisherId'])
            item['keyword'] = self.keyword
            for fn in self.item_fns:
                item[fn] = entry[fn]

            yield item

        if self.curPage <= self.totalPageCount:
            self.curPage += 1
            yield self.next_request()
        elif self.kd_cur < len(self.keywords) - 1:
            self.curPage = 1
            self.totalPageCount = 0
            self.kd_cur += 1
            self.keyword = self.keywords[self.kd_cur]
            yield self.next_request()
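Note that totalCount // pageSize + 1 over-counts by one page whenever totalCount divides evenly by pageSize. If exactness matters, ceiling division is the usual fix; a small worked check:

total_count, page_size = 450, 15
print(total_count // page_size + 1)  # 31 - one phantom page
print(-(-total_count // page_size))  # 30 - ceiling division, exact
print(-(-451 // page_size))          # 31 - a partial last page still counts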
Example #5
    def parse(self, response):
        jdict = json.loads(response.body)
        jcontent = jdict['content']
        jposresult = jcontent['positionResult']
        jresult = jposresult['result']
        resultSize = int(jposresult['resultSize'])
        pageSize = int(jcontent['pageSize'])
        print('[lagou][%s]resultSize: %d, pageNo: %d, pageSize: %d' %
              (self.keyword, resultSize, self.curPage, pageSize))
        for entry in jresult:
            if len(entry) < 10:
                continue
            item = JobsItem()
            item['keyword'] = self.keyword
            item['pid'] = self.keyword + "_" + str(
                entry['positionId']) + "_" + str(entry['publisherId'])
            for fn in self.item_fns:
                item[fn] = entry[fn]

            yield item

        if pageSize <= resultSize:
            self.curPage += 1
            yield self.next_request()
        elif self.kd_cur < len(self.keywords) - 1:
            self.curPage = 1
            self.kd_cur += 1
            self.keyword = self.keywords[self.kd_cur]
            yield self.next_request()
Example #6
 def getDetail(self, response):
     print('got', response.url, response)
     title = response.xpath(
         '//span[@class="pos_title"]/text()')[0].extract()
     try:
         salary = response.xpath(
             '//span[@class="pos_salary"]/text()')[0].extract()
     except IndexError:
         salary = ''  # a string, not 0, so salary.strip() below cannot fail
     num = response.xpath(
         '//span[@class="item_condition pad_left_none"]/text()')[0].extract()
     edu = response.xpath(
         '//span[@class="item_condition"]/text()')[0].extract()
     exp = response.xpath(
         '//span[@class="item_condition border_right_None"]/text()'
     )[0].extract()
     area = response.xpath(
         '//span[@class="pos_area_span pos_address"]//a/text()').extract()
     # print(title,num,edu,area,exp,salary)
     item = JobsItem()
     item['title'] = title.strip()
     item['salary'] = salary.strip()
     item['num'] = num.strip('招人 ')  # strip the surrounding "招…人" characters ("hiring N people")
     item['edu'] = edu.strip()
     item['exp'] = exp.strip()
     item['area'] = area
     yield item
Example #7
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
     titles = hxs.xpath("//span[@class='article-title']")
     items = []
     for title in titles:  # the original 'for titles in titles' shadowed the list it iterated
         item = JobsItem()
         item["title"] = title.xpath("a/text()").extract()
         #item["link"] = title.xpath("a/@href").extract()
         items.append(item)
     return items
Example #8
 def vacansy_parse(self, response: HtmlResponse):
     name_job = response.xpath('//h1/text()').extract_first()
     salary_job = response.css(
         'div._3MVeX span._1OuF_.ZON4b span._3mfro._2Wp8I.PlM3e._2JVkc::text'
     ).extract()
     link = response.url
     source_link = response.css(
         'a.icMQ_._2JivQ::attr(href)').extract_first()
     yield JobsItem(name=name_job,
                    salary=salary_job,
                    link=link,
                    source_link=source_link)
Example #9
    def parse_list(self, response):
        for i in range(0, 20):
            joblist = self.driver.find_element_by_xpath(
                "//div[@id='uxItemLink_" + str(i + 1 + JobspiderSpider.get_jobcount()) + "']/table/tbody/tr/td[2]")
            joblist.click()
            jobselems = joblist.find_elements_by_tag_name("span")
            job = JobsItem()

            job['vacant'] = self.driver.find_element_by_id("ctl00_ContentPlaceHolder1_uxJobCard_uxNoOfVac").text
            job['company'] = self.driver.find_element_by_id("ctl00_ContentPlaceHolder1_uxJobCard_uxCompany").text
            job['name'] = jobselems[0].text
            job['job_id'] = self.driver.find_element_by_id("ctl00_ContentPlaceHolder1_uxJobCard_uxOrdNo").text
            details = self.driver.find_element_by_xpath("//div[@id='ctl00_ContentPlaceHolder1_uxJobCard_uxJcard']/table[2]/tbody/tr[5]/td").text

            if details:
                # collapse runs of whitespace to single spaces
                job['detail'] = ' '.join(details.split())
            saldig = jobselems[1].text.find("月薪")  # "monthly salary"
            if saldig > 0:
                job['salary'] = int(jobselems[1].text[1:7].replace(',', ''))
            else:
                job['salary'] = int(jobselems[1].text[1:3].replace(',', ''))
            job['area'] = jobselems[2].text
            pdate = self.driver.find_element_by_id("ctl00_ContentPlaceHolder1_uxJobCard_uxPostedDate").text
            self.logger.debug(pdate)
            # pdate arrives as dd/mm/yyyy
            job['date_posted'] = date(int(pdate.split('/')[2]), int(pdate.split('/')[1]), int(pdate.split('/')[0]))
#             self.logger.info(job)
#             self.LAW_LOG.append(job)
            yield job
        JobspiderSpider.update_jobcount(JobspiderSpider.get_jobcount() + 20)
        nexturl = self.driver.find_element_by_xpath("//span[@id='ctl00_ContentPlaceHolder1_uxPageNum']/a[6]").get_attribute('href')
        self.logger.debug(nexturl + ": expected partial")
#        self.LAW_LOG.append(nexturl + ": expected partial")
        pos = nexturl.find('?')
        if pos > -1:
            nexturl = nexturl[:pos]
            self.logger.debug(nexturl + ": processed")

        qsdict = {'SearchFor': 'simple',
                  'ID': '',
                  'SortBy': '',
                  'from': '',
                  'start': str(JobspiderSpider.get_jobcount())}
        qs = urlencode(qsdict)
        request = Request(nexturl + '?' + qs,
                          dont_filter=False,
                          callback=self.parse_list)
        yield request
Example #10
    def parse_detail_page(self, response):
        jobTitle = response.css('.jobtitle > font::text').extract()[0].strip()
        company = response.css('.company::text').extract()[0]
        location = response.css('.location::text').extract()[0]
        jobSummary = response.css('.summary *::text').extract()

        print('Got item...', response.url)

        item = JobsItem()
        item['jobTitle'] = jobTitle
        item['company'] = company
        item['location'] = location
        item['jobSummary'] = jobSummary
        item['url'] = response.url
        yield item
Example #11
    def parse_jobs(self, response):
        torrent = JobsItem()
        torrent['title'] = response.css('h1[itemprop="name"]::text').extract()
        torrent['location'] = response.css(
            '#post-location span[itemprop="name"]::text').extract()
        torrent['post_url'] = response.url
        torrent['organization'] = response.css(
            'strong[itemprop="name"]::text').extract()
        torrent['organization_url'] = response.css(
            '#post-location + a::text').extract()
        torrent['job_type'] = response.css('a[href^="/type"]::text').extract()
        torrent['job_category'] = response.css(
            'a[href^="/category"]::text').extract()
        torrent['salary'] = ''

        yield torrent
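The a[href^="/type"] and a[href^="/category"] selectors above rely on CSS attribute-prefix matching, which Scrapy supports through cssselect. A standalone check on made-up markup:

from scrapy import Selector

sel = Selector(text='<a href="/type/full-time">Full-time</a><a href="/category/it">IT</a>')
print(sel.css('a[href^="/type"]::text').extract())      # ['Full-time']
print(sel.css('a[href^="/category"]::text').extract())  # ['IT']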
Example #12
    def parse(self, response):

        data = json.loads(response.text)

        if len(data['data']['results']) > 0:

            self.page_count += 1

            for each_group in data['data']['results']:
                item = JobsItem()
                item['job_title'] = each_group['jobName']
                item['salary'] = each_group['salary']
                item['experience'] = each_group['workingExp']['name']
                item['location'] = each_group['city']['items'][0]['name']
                item['detail_url'] = each_group['positionURL']
                item['update_date'] = each_group['updateDate'].split()[0]
                item['welfare'] = ",".join(each_group['welfare'])
                item['key_word'] = self.query_list[self.query_count]
                item['company_title'] = each_group['company']['name']
                item['company_scale'] = each_group['company']['size']['name']
                item['company_nature'] = each_group['company']['type']['name']

                item['crawl_date'] = self.date_time
                item['crawl_url'] = response.request.url

                yield item

            url = "https://fe-api.zhaopin.com/c/i/sou?start={0}&pageSize=90&cityId={1}&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw={2}&kt=3&_v=0.34053159&x-zp-page-request-id=d1dd66ee655347939b69acf87870ccc1-1548983897497-953047".format(
                self.page_count * 90, self.city_list[self.city_count],
                self.query_list[self.query_count])
            yield scrapy.Request(url=url, callback=self.parse)
        else:
            if self.query_count < len(self.query_list) - 1:
                self.query_count += 1

            elif self.city_count < len(self.city_list) - 1:
                self.query_count = 0
                self.city_count += 1

            else:
                return

            url = "https://fe-api.zhaopin.com/c/i/sou?start={0}&pageSize=90&cityId={1}&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw={2}&kt=3&_v=0.34053159&x-zp-page-request-id=d1dd66ee655347939b69acf87870ccc1-1548983897497-953047".format(
                0, self.city_list[self.city_count],
                self.query_list[self.query_count])
            self.page_count = 0
            yield scrapy.Request(url=url, callback=self.parse)
Example #13
    def parse(self, response):
        try:
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
            }

            time.sleep(5)
            item = JobsItem()
            works = BeautifulSoup(response.text, 'lxml').find(
                "div", class_="infolist").find_all("dl")
            for work in works:
                time.sleep(1)
                job_url = work.find('dt').find('a')
                url = job_url['href']
                # urllib.request is the Python 3 home of urllib2; a new name avoids shadowing the Scrapy response
                request = urllib.request.Request(url=url, headers=headers)
                detail_response = urllib.request.urlopen(request)
                job_html = detail_response.read().decode('utf-8')
                html = BeautifulSoup(job_html, 'lxml')

                item['id'] = uuid.uuid4()
                item['title'] = html.find('div',
                                          class_='item_con pos_info').find(
                                              'span',
                                              class_='pos_name').get_text()

                item['salary'] = html.find('div', class_='pos_base_info').find(
                    'span', class_='pos_salary').get_text()

                item['company'] = html.find(
                    'div', class_='subitem_con company_baseInfo').find(
                        'p', class_='comp_baseInfo_title').find(
                            'a', class_='baseInfo_link').get_text()
                # item['company'] = "asd"
                item['scale'] = html.find(
                    'div', class_='subitem_con company_baseInfo').find(
                        'p', class_='comp_baseInfo_scale').get_text()

                item['address'] = html.find(
                    'div', class_='subitem_con work_adress').find(
                        'p', class_='detail_adress').get_text()
                item['city'] = u"成都"

                yield item
        except Exception as e:  # Python 3 syntax; e.message no longer exists
            print(e)
Example #14
    def parse_info(self, response):

        # instantiate the item
        job_item = JobsItem()

        selector = scrapy.Selector(response)

        item_cn = selector.xpath(
            '/html/body/div[@class="tCompanyPage"]/div[@class="tCompany_center clearfix"]/div[@class="tHeader tHjob"]/div/div[@class="cn"]'
        )
        # 1. Job title
        # job_name = selector.xpath('/html/body/div[@class="tCompanyPage"]/div[@class="tCompany_center clearfix"]/div[@class="tHeader tHjob"]/div/div[@class="cn"]/h1/@title').extract()
        job_name = item_cn.xpath('h1/@title').extract()[0]
        job_item['job_name'] = job_name  # scrapy.Item requires dict-style access, not attribute assignment
        # 2. Company name
        company_name = item_cn.xpath(
            'p[@class="cname"]/a[1]/@title').extract()[0]
        job_item['company_name'] = company_name
        # 3. Benefits
        welfares = item_cn.xpath('div/div/span/text()').extract()
        # 4. Monthly pay
        monthly_pay = item_cn.xpath('strong/text()').extract()[0]
        # salary formats: x-y 元/天 (yuan/day), 千/月 (k/month), 万/月 (10k/month), 万以上/月, 万/年 (10k/year), 万以上/年
        tmp_dict = {
            '元/天': 30,  # yuan/day, roughly 30 days per month
            '千/月': 1000,
            '千以上/月': 1000,
            '千以下/月': 1000,
            '万/月': 10000,
            '万以上/月': 10000,
            '万/年': 10000 / 12,  # annual 万 to monthly CNY; the original 1 / 12 dropped the 万 factor
            '万以上/年': 10000 / 12
        }
        money_min = 0
        money_max = 0
        for key, value in tmp_dict.items():
            if monthly_pay.strip() == '':
                money_min = money_max = 0
                break

            elif key in monthly_pay:
                temp_money = monthly_pay.strip(key)  # strip() removes the unit characters from both ends
                temp_money_list = temp_money.split('-')
                if len(temp_money_list) == 2:
                    money_min = float(temp_money_list[0]) * value
                    money_max = float(temp_money_list[1]) * value
                    break
                else:
                    money_min = money_max = float(temp_money) * value
                    break
        job_item['monthly_pay_min'] = money_min
        job_item['monthly_pay_max'] = money_max
        # print('------------- %d ~ %d' % (money_min, money_max))
        # 5. Job requirements
        requirement = item_cn.xpath('p[2]/@title').extract()[0].replace(
            u'\xa0', u' ').replace(' ', '').split('|')  # decode &nbsp;

        # Work location
        addrStr = requirement[0]
        addrList = addrStr.split('-')
        city = addrList[0]
        area = ''
        if len(addrList) == 2:
            area = addrList[1]
        job_item['area'] = area
        # Experience
        experience = '无工作经验'  # default: "no work experience"
        # Education
        educationArr = ['初中及以下', '高中/中技/中专', '大专', '本科', '硕士', '博士']
        education = '所有'  # default: "any"
        # Headcount
        count = 0
        for item in requirement:
            if '年经验' in item:  # "... years of experience"
                experience = item[:-3]
            elif item in educationArr:  # match against the education list; the original tested the string 'education'
                education = item
            elif re.match(r'招.*?人', item):  # "hiring ... people"
                count_str = item[1:-1]
                if count_str == '若干':  # "several"
                    count = 0
                else:
                    count = int(count_str)

        # print('city: %s, district: %s, experience: %s, education: %s, hiring: %s people' % (city, area, experience, education, ('若干' if count == 0 else str(count))))

        job_item['education'] = education
        job_item['experience'] = experience

        item_detail = selector.xpath(
            '/html/body/div[@class="tCompanyPage"]/div[@class="tCompany_center clearfix"]/div[3]'
        )
        # 6. Job description
        # job_details = item_detail.xpath('div[1]/div[1]/p/text()').extract()
        job_details = item_detail.xpath('string(div[1]/div[1])').extract()[
            0]  # the terminal truncates long prints
        # print('------------')
        # print(job_details)
        job_item['job_detail'] = job_details
        com = re.compile(
            u'(岗位职责|工作职责)[::]?(.*?)(任职资格|任职要求)[::]?(.*?)(职能类别)[::]?(.*?)(关键字)[::]?'
        )
        re_list = re.findall(com, job_details)  # Python 3: str is already unicode
        if re_list:
            print(re_list[0][0].strip())

        # print('++++++++++++')
        # job_detail = job_details.replace(' ', '').replace('\n', '')

        # job_details = item_detail.xpath('div[1]/div/')
        # for item in item_detail:
        #     title_h2 = item.xpath('h2/span/text()').extract()[0]
        #     detail_p = item.xpath('string(div)').extract()[0]
        #
        #     print ('------------')
        #     print(detail_p)
        #     print ('++++++++++++')

        # print(selector.xpath('/html/body/div[@class="tCompanyPage"]/div[@class="tCompany_center clearfix"]/div[3]/div[1]/h2/span/text()').extract()[0])

        # 7. Work address
        job_address = item_detail.xpath('div[2]/div/p/text()').extract()
        job_item['address'] = job_address
        # 8. Map coordinates
        job_positions = self.sub_string(
            item_detail.xpath('div[2]/div/a/@onclick').extract()[0])
        job_map = ''
        if job_positions:
            job_map = job_positions[0]
        item_company = selector.xpath('/html/body/div[3]/div[2]/div[4]/div[1]')
        # 9. Company name
        company_name = item_company.xpath('div[1]/a/p/@title').extract()[0]
        # 10. Company link
        company_link = ''
        company_links = item_company.xpath('div[1]/a/@href').extract()
        if company_links:
            company_link = company_links[0]
        job_item['company_link'] = company_link
        # 11. Company type
        company_nature = item_company.xpath('div[2]/p[1]/text()').extract()[0]
        # 12. Company size
        company_people = item_company.xpath('div[2]/p[2]/text()').extract()[0]
        # 13. Industry
        company_industry = item_company.xpath('div[2]/p[3]/a/text()').extract()
        # 14. Company profile
        company_info = item_detail.xpath('string(div[3]/div)').extract()[0]
        job_item['company_info'] = company_info
        # print(company_info)
        # print(job_name+"--"+company_name+','.join(welfares)+'--'+monthly_pay+'--'+','.join(requirement)+'\n'+job_address[1]+job_map+'\n'+' '.join(company_industry)+company_link)
        # yield job_item

        return job_item  # returning the item hands it to the pipeline, same as yield
Example #15
    def parse_item(self, response):
        item = JobsItem()
        item["post_url"] = response.url
        item["post_name"] = response.xpath("//h1//text()").extract_first()

        #

        salary = response.xpath(
            "//div[@class='cn']//strong//text()").extract_first()
        if salary:  # extract_first() can return None, so avoid len() on it
            if salary[-3:] == '万/月':
                min_salary = float(re.findall('(.*?)-(.*?)万',
                                              salary)[0][0]) * 10000
                max_salary = float(re.findall('(.*?)-(.*?)万',
                                              salary)[0][1]) * 10000
                avg_salary = (min_salary + max_salary) / 2
            elif salary[-3:] == '千/月':
                min_salary = float(re.findall('(.*?)-(.*?)千',
                                              salary)[0][0]) * 1000
                max_salary = float(re.findall('(.*?)-(.*?)千',
                                              salary)[0][1]) * 1000
                avg_salary = (min_salary + max_salary) / 2
            else:
                avg_salary = None  # unrecognized unit; float('N') would raise ValueError
            if avg_salary is not None:
                item["post_salary"] = float(avg_salary)

        temp = re.sub(
            r'\xa0', '',
            response.xpath(
                "//*[@class='msg ltype']/@title").extract_first()).split("|")
        if len(temp) >= 5:
            item["post_city"] = temp[0]
            item["post_experience"] = temp[1]
            item["post_education"] = temp[2]
            number = re.findall('招(.*?)人', temp[3])
            if len(number) == 0:
                number = "N"
                item["post_number"] = number
            else:
                item["post_number"] = int(number[0])
            item["post_release_time"] = temp[4]

        item["post_information"] = ''.join(
            response.xpath("//div[@class='bmsg job_msg inbox']//p//text()").
            extract()).strip("\n").strip('\r').strip('\t')
        item["post_category"] = ','.join(
            response.xpath("//div[@class='mt10']/p[1]//a//text()").extract())
        item["post_keywords"] = ','.join(
            response.xpath("//div[@class='mt10']/p[2]//a//text()").extract())

        item["company_url"] = response.xpath(
            "//div[@class='com_msg']//a/@href").extract_first()
        item["company_name"] = response.xpath(
            "//div[@class='com_msg']//a//text()").extract_first()
        item["company_nature"] = response.xpath(
            "//div[@class='com_tag']/p[1]//text()").extract_first()
        item["company_scale"] = response.xpath(
            "//div[@class='com_tag']/p[2]//text()").extract_first()
        item["company_category"] = re.sub(
            r'[\r\n\s]', '', ','.join(
                response.xpath(
                    "//div[@class='com_tag']/p[3]//a//text()").extract()))

        item["crawl_date"] = datetime.datetime.now().strftime('%Y-%m-%d')
        yield item
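Examples #14 and #15 both normalize Chinese salary strings, where 万 = 10,000 CNY, 千 = 1,000 CNY, and the period is /月 (per month) or /年 (per year). A compact standalone sketch of the same conversion, with a hypothetical helper name:

import re

def monthly_cny(salary):
    # '1.5-2万/月' -> (15000.0, 20000.0); returns None for unrecognized formats
    m = re.match(r'([\d.]+)-([\d.]+)([万千])/([月年])', salary)
    if not m:
        return None
    lo, hi, unit, period = m.groups()
    factor = 10000 if unit == '万' else 1000
    if period == '年':
        factor /= 12  # convert annual pay to a monthly figure
    return float(lo) * factor, float(hi) * factor

print(monthly_cny('1.5-2万/月'))  # (15000.0, 20000.0)
print(monthly_cny('12-18万/年'))  # (10000.0, 15000.0)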
Example #16
    def parse(self, response):
        self.page_count += 1
        self.total_page_count += 1
        print("lnkyzhang", self.total_page_count)

        if response.url.strip():

            groups = response.xpath('//div[@class="job-list"]/ul/li')
            for each_group in groups:
                item = JobsItem()
                item['job_title'] = each_group.xpath(
                    './/div[@class="job-title"]/text()').extract()[0]
                item['salary'] = each_group.xpath(
                    './/span[@class="red"]/text()').extract()[0]
                item['experience'] = each_group.xpath(
                    './/div[@class="info-primary"]/p/text()[2]').extract()[0]
                item['location'] = each_group.xpath(
                    './/div[@class="info-primary"]/p/text()[1]').extract(
                    )[0].split()[0]
                item['detail_url'] = each_group.xpath(
                    './/h3[@class="name"]/a/@href').extract()[0].split()[0]
                item['update_date'] = each_group.xpath(
                    './/div[@class="info-publis"]/p/text()').extract()[0]
                #item['welfare'] = ",".join(each_group['welfare'])
                item['key_word'] = self.query_list[self.query_count]
                item['company_title'] = each_group.xpath(
                    './/div[@class="company-text"]/h3[@class="name"]/a/text()'
                ).extract()[0]

                item['crawl_date'] = self.date_time
                item['crawl_url'] = response.request.url
                item['crawl_pages'] = self.total_page_count
                try:
                    item['company_scale'] = each_group.xpath(
                        './/div[@class="company-text"]/p/text()[3]').extract(
                        )[0]
                except (TypeError, IndexError):
                    pass
                item['company_nature'] = each_group.xpath(
                    './/div[@class="company-text"]/p/text()[2]').extract()[0]

                yield item

            if response.xpath(
                    '//div[@class="page"]/a[@ka="page-next"]/@class'):
                if response.xpath(
                        '//div[@class="page"]/a[@ka="page-next"]/@class'
                ).extract()[0] == 'next':
                    pass
                else:
                    self.page_count = 0

                    if self.query_count < len(self.query_list) - 1:
                        self.query_count += 1

                    elif self.city_count < len(self.city_list) - 1:
                        self.query_count = 0
                        self.city_count += 1
            else:
                self.page_count = 0

                if self.query_count < len(self.query_list) - 1:
                    self.query_count += 1

                elif self.city_count < len(self.city_list) - 1:
                    self.query_count = 0
                    self.city_count += 1

        url = 'https://www.zhipin.com/{}/?query={}&page={}'.format(
            self.city_list[self.city_count], self.query_list[self.query_count],
            self.page_count)
        yield scrapy.Request(url=url, callback=self.parse)
Example #17
    def parse(self, response):
        self.page_count += 1
        self.total_page_count += 1
        print("lnkyzhang", self.total_page_count)

        if response.url.strip():

            groups = response.xpath(
                '//div[@class="dw_table"]/div[@class="el"]')
            for each_group in groups:
                item = JobsItem()
                item['job_title'] = each_group.xpath(
                    './/a[@target="_blank"]/@title').extract()[0]

                try:
                    item['salary'] = each_group.xpath(
                        './/span[@class="t4"]/text()').extract()[0]
                except (TypeError, IndexError):
                    pass

                # item['experience'] = each_group.xpath(
                #     './/div[@class="info-primary"]/p/text()[2]').extract()[0]
                item['location'] = each_group.xpath(
                    './/span[@class="t3"]/text()').extract()[0].split()[0]
                item['detail_url'] = each_group.xpath(
                    './/a[@target="_blank"]/@href').extract()[0].split()[0]
                item['update_date'] = each_group.xpath(
                    './/span[@class="t5"]/text()').extract()[0]
                # item['welfare'] = ",".join(each_group['welfare'])
                item['key_word'] = self.query_list[self.query_count]
                item['company_title'] = each_group.xpath(
                    './/span[@class="t2"]/a/@title').extract()[0]

                item['crawl_date'] = self.date_time
                item['crawl_url'] = response.request.url
                item['crawl_pages'] = self.total_page_count
                # try:
                #     item['company_scale'] = each_group.xpath(
                #         './/div[@class="company-text"]/p/text()[3]').extract()[0]
                # except (TypeError, IndexError):
                #     pass
                # item['company_nature'] = each_group.xpath(
                #     './/div[@class="company-text"]/p/text()[2]').extract()[0]

                yield item

            if response.xpath('//li[@class="bk"][2]/span/text()'):
                if response.xpath('//li[@class="bk"][2]/span/text()').extract(
                )[0] == '下一页':
                    self.page_count = 1

                    if self.query_count < len(self.query_list) - 1:
                        self.query_count += 1

                    elif self.city_count < len(self.city_list) - 1:
                        self.query_count = 0
                        self.city_count += 1

        url = 'https://search.51job.com/list/{0},000000,0000,00,9,99,{1},2,{2}.html'.format(
            self.city_list[self.city_count],
            urllib.parse.quote(self.query_list[self.query_count]),
            self.page_count)
        yield scrapy.Request(url=url, callback=self.parse)
Example #18
    def parse_page(self, response):
        res = response.xpath(
            '//div[contains(@class, "jobsearch-SerpJobCard unifiedRow row result")]'
        )
        for i in range(1, len(res)):  # note: index 0 (the first result card) is skipped
            item = JobsItem()

            title = res[i].xpath(
                'div[@class="title"]/a/text() | div[@class="title"]/a/b/text()'
            ).extract()
            item['title'] = ''.join(title).replace('\n', '')

            company = res[i].xpath(
                'div/div/span[@class="company"]/a/text() | div/div/span[@class="company"]/text()'
            ).extract()
            item['company'] = ''.join(company).replace('\n', '')

            rating = res[i].xpath(
                'div/div/span[@class="ratingsDisplay"]/a/span/text() | div/div/a/span[@class="ratings"]/@aria-label'
            ).extract()
            item['rating'] = ''.join(rating).replace('\n', '').replace(
                ' out of 5 star rating', '')

            location = res[i].xpath(
                'div/div[contains(@class, "location")]/text() | div/span[contains(@class, "location")]/text()'
            ).extract()
            item['location'] = ''.join(location).replace('\n', '')

            salary = res[i].xpath(
                'div/span/span[@class="salaryText"]/text()').extract()
            salary = ''.join(salary).replace('\n', '')
            if salary == '':
                continue

            if 'year' in salary:
                item['salary_unit'] = 'year'
                salary = salary.replace('a year', '')
            elif 'hour' in salary:
                item['salary_unit'] = 'hour'
                salary = salary.replace('an hour', '')
            elif 'month' in salary:
                item['salary_unit'] = 'month'
                salary = salary.replace('a month', '')
            elif 'week' in salary:
                item['salary_unit'] = 'week'
                salary = salary.replace('a week', '')
            elif 'day' in salary:
                item['salary_unit'] = 'day'
                salary = salary.replace('a day', '')
            else:
                print('Error Text: ' + salary)
                raise ValueError('Cannot parse salary text.')

            salary = salary.replace(',', '').replace(' ', '').replace(
                '$', '').replace('++', '').replace('From',
                                                   '').replace('Upto', '')

            if '-' in salary:
                salary_all = salary.split('-')
                item['salary_from'] = salary_all[0]
                item['salary_to'] = salary_all[1]
            else:
                item['salary_from'] = salary
                item['salary_to'] = salary

            item['level'] = response.meta['level']
            item['state'] = response.meta['state']
            item['skill'] = response.meta['skill']

            yield item
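The chain of .replace() calls above strips currency symbols and qualifiers before splitting on '-'. An equivalent, slightly more defensive sketch (hypothetical helper, same result on the common cases):

import re

def parse_salary_range(text):
    # pull every number out of strings like '$55,000 - $70,000 a year'
    nums = [float(n.replace(',', '')) for n in re.findall(r'[\d,]+(?:\.\d+)?', text)]
    if not nums:
        return None
    return nums[0], nums[-1]

print(parse_salary_range('$55,000 - $70,000 a year'))  # (55000.0, 70000.0)
print(parse_salary_range('$18.50 an hour'))            # (18.5, 18.5)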