Пример #1
0
    def parse_job(self, response):
        """Build a ``LagouJobItem`` from a job detail response.

        Values are collected through a ``LagouJobItemLoader`` bound to
        ``response``; any cleaning of the raw values is left to the loader's
        processors, which are defined outside this method.

        :param response: response of the job detail page being parsed.
        :return: the item returned by the loader's ``load_item()``.
        """
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)

        item_loader.add_value("post_url", response.url)
        item_loader.add_css("title", "div .job-name::attr(title)")
        item_loader.add_css("company", "div .company::text")
        item_loader.add_css("min_salary", "span.salary::text")

        # Spans 2-5 of the ``job_request`` paragraph feed these four fields.
        # All of them must target the same class name: ``job_request`` with
        # an underscore (a hyphenated ``job-request`` never matches the
        # container the city is read from).
        request_fields = ("city", "min_work_years", "degree_req", "job_type")
        for span_no, field_name in enumerate(request_fields, start=2):
            item_loader.add_xpath(
                field_name,
                "//*[@class='job_request']/p/span[%d]/text()" % span_no)

        item_loader.add_css("tags", ".position-label li::text")
        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("advantage", ".job-advantage p::text")
        # Whole <div> markup is collected here; ``.job_bt p`` may be an
        # alternative selector worth trying.
        item_loader.add_css("description", ".job_bt div")
        item_loader.add_css("addr", ".work_addr")

        job_item = item_loader.load_item()
        return job_item
Пример #2
0
    def parse_job(self, response):
        """Parse a Lagou job posting into a ``LagouJobItem``.

        :param response: response of the job detail page.
        :return: the item produced by the loader's ``load_item()``.
        """
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        loader.add_css("title", ".job-name::attr(title)")
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_css("salary", ".job_request .salary::text")

        # job_city / work_years / degree_need / job_type are read from
        # spans 2 to 5 of the job_request paragraph, in that order.
        span_fields = ("job_city", "work_years", "degree_need", "job_type")
        for span_index, field_name in enumerate(span_fields, start=2):
            loader.add_xpath(
                field_name,
                "//*[@class='job_request']/p/span[%d]/text()" % span_index)

        loader.add_xpath("tags", '//li[@class="labels"]/text()')
        loader.add_css("publish_time", ".publish_time::text")
        loader.add_css("job_advantage", ".job-advantage p::text")
        loader.add_css("job_desc", ".job_bt div")
        loader.add_css("job_addr", ".work_addr")
        loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        loader.add_css("company_url", "#job_company dt a::attr(href)")
        loader.add_value("crawl_time", datetime.now())

        return loader.load_item()
Пример #3
0
    def parse_job(self, response):
        """Parse a Lagou job posting into a ``LagouJobItem``.

        :param response: response of the job detail page.
        :return: the item produced by the loader's ``load_item()``.
        """
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)
        item_loader.add_xpath("title", '//div/span[@class="name"]/text()')
        # The URL and its hash are literal values, not XPath expressions,
        # so they must go through add_value() rather than add_xpath().
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_xpath("salary", '//dd/p/span[@class="salary"]/text()')
        item_loader.add_xpath("job_city", '//dd/p/span[2]/text()')
        item_loader.add_xpath("work_years", '//dd/p/span[3]/text()')
        item_loader.add_xpath("degree_need", '//dd/p/span[4]/text()')
        item_loader.add_xpath("job_type", '//dd/p/span[5]/text()')
        item_loader.add_xpath("tags",
                              '//dd[@class="job_request"]/ul/li/text()')
        item_loader.add_xpath("publish_time",
                              '//dd/p[@class="publish_time"]/text()')
        item_loader.add_xpath("job_advantage", '//dl/dd/p/text()')
        item_loader.add_xpath("job_desc", '//dd/div/p/text()')
        # No text() step here: the whole work_addr element is collected.
        item_loader.add_xpath("job_addr", '//dd/div[@class="work_addr"]')
        item_loader.add_xpath("company_url", '//dl/dt/a/@href')
        item_loader.add_xpath("company_name", '//dl/dt/a/img/@alt')
        item_loader.add_value("crawl_time", datetime.now())

        job_item = item_loader.load_item()

        return job_item
Пример #4
0
 def parse_job(self, response):
     """Parse a Lagou job posting into a ``LagouJobItem``.

     Every field is given a real selector: an empty CSS selector string is
     not a parseable selector, so none of the ``add_css`` calls may be left
     with ``''``.

     :param response: response of the job detail page.
     :return: the item produced by the loader's ``load_item()``.
     """
     item_loader = LagouJobItemLoader(item=LagouJobItem(),response=response)
     item_loader.add_css('title','.job-name::attr(title)')
     # The URL is a literal value, not something to select from the page.
     item_loader.add_value('url', response.url)
     # NOTE(review): still an empty placeholder; presumably this should
     # hold a hash of response.url -- confirm which helper the project uses.
     item_loader.add_value('url_obj_id', '')
     item_loader.add_css('salary', '.job_request .salary::text')
     # Spans 2-5 of the job_request paragraph feed the next four fields.
     item_loader.add_xpath('job_city',
                           '//*[@class="job_request"]/p/span[2]/text()')
     item_loader.add_xpath('work_years',
                           '//*[@class="job_request"]/p/span[3]/text()')
     item_loader.add_xpath('degree_need',
                           '//*[@class="job_request"]/p/span[4]/text()')
     item_loader.add_xpath('job_type',
                           '//*[@class="job_request"]/p/span[5]/text()')
     item_loader.add_css('publish_time', '.publish_time::text')
     item_loader.add_css('job_advantage', '.job-advantage p::text')
     item_loader.add_css('job_desc', '.job_bt div')
     item_loader.add_css('job_addr', '.work_addr')
     item_loader.add_css('company_name', '#job_company dt a img::attr(alt)')
     item_loader.add_css('company_url', '#job_company dt a::attr(href)')
     item_loader.add_css('tags', '.position-label li::text')
     item_loader.add_value('crawl_time', datetime.now())
     job_item = item_loader.load_item()
     return job_item
Пример #5
0
 def parse_job(self, response):
     """Parse a Lagou job posting into a ``LagouJobItem``.

     :param response: response of the job detail page.
     :return: the item produced by the loader's ``load_item()``.
     """
     job_loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

     job_loader.add_css('title', '.job-name::attr(title)')
     job_loader.add_value('url', response.url)
     job_loader.add_value('url_object_id', get_md5(response.url))
     job_loader.add_css('salary', '.salary::text')

     # Spans 2-5 of the job_request paragraph, in field order.
     span_template = '//*[@class="job_request"]/p/span[{}]/text()'
     for offset, field in enumerate(
             ('job_city', 'work_years', 'degree_need', 'job_type')):
         job_loader.add_xpath(field, span_template.format(offset + 2))

     job_loader.add_css('tags', '.position-label li::text')
     job_loader.add_css('publish_time', '.publish_time::text')
     job_loader.add_css('job_advantage', '.job-advantage p::text')
     job_loader.add_css('job_desc', '.job_bt div')
     job_loader.add_css('job_addr', '.work_addr')
     job_loader.add_css('company_name', '#job_company dt a img::attr(alt)')
     job_loader.add_css('company_url', '#job_company dt a::attr(href)')
     job_loader.add_value('crawl_time', datetime.datetime.now())

     return job_loader.load_item()
Пример #6
0
    def parse_item(self, response):
        """Parse a Lagou job posting into a ``LagouJobItem``.

        When the response status is 302, the response URL is appended to
        ``self.redirect_url`` and the ``redirected_url`` stat is incremented
        before parsing continues.

        :param response: response of the job detail page.
        :return: the item produced by the loader's ``load_item()``.
        """
        # NOTE: an earlier attempt rebuilt the job URL from 'utrack/track'
        # redirect URLs and re-requested it to work around 302 redirects;
        # per the original author that approach did not work.
        redirected = response.status == 302
        if redirected:
            self.redirect_url.append(response.url)
            self.crawler.stats.inc_value("redirected_url")

        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)
        loader.add_css('title', 'span.name:nth-child(2)::text')
        loader.add_value('url', response.url)
        loader.add_value('url_object_id', get_md5(response.url))
        loader.add_css('salary', '.job_request .salary::text')

        # Spans 2-5 of the job_request paragraph, in field order.
        request_fields = ('job_city', 'work_years', 'degree_need', 'job_type')
        for span_no, field_name in zip(range(2, 6), request_fields):
            loader.add_xpath(
                field_name,
                '//*[@class="job_request"]/p/span[%d]/text()' % span_no)

        loader.add_css('tags', '.position-label li::text')
        loader.add_css('publish_time', '.publish_time::text')
        loader.add_css('job_advantage', '.job-advantage p::text')
        loader.add_css('job_desc', '.job_bt div')
        loader.add_css('job_addr', '.work_addr')
        loader.add_css('company_name', '#job_company dt a img::attr(alt)')
        loader.add_css('company_url', '#job_company dt a::attr(href)')
        loader.add_value('crawl_time', datetime.datetime.now())

        return loader.load_item()
Пример #7
0
 def parse_company(self, response):
     """Parse a Lagou company page into a ``LagouCompany`` item.

     Nothing is returned (``None``) when ``delete_ip(response)`` is falsy,
     when the response URL does not match the company-page pattern, or
     when ``check_table_url`` returns a falsy value for the trimmed URL.

     :param response: response of the company page.
     :return: the loaded company item, or ``None`` when the page is skipped.
     :raises IndexError: if fewer than four ``company_data`` counters are
         found on the page.
     """
     if not delete_ip(response):
         return None

     match = re.match(r'(https://www.lagou.com/gongsi/?\d+.html).*$',
                      response.url)
     if not match:
         return None

     # Keep only the part of the URL up to ".html"; anything after it is
     # discarded by the capture group.
     url = match.group(1)
     # presumably truthy when this URL still has to be stored -- verify
     # against check_table_url.
     if not check_table_url('lagou_company', url):
         return None

     response = return_new_company_response(response.request, response)
     loader = LagouJobItemLoader(item=LagouCompany(), response=response)
     loader.add_value('url', url)
     loader.add_value('url_object_id', get_md5(url))

     tags = response.xpath(
         "//div[@id='tags_container']//li//text()").extract()
     loader.add_value('tags', clear_str(','.join(tags)) if tags else '')

     company_name = clear_str(''.join(response.xpath(
         "//h1[@class='company_main_title']//text()").extract()))
     loader.add_value('company_name', company_name)

     # Each basic-info value is the first <span> following an <i> icon
     # whose class identifies the field.
     basic_info = (
         ('industry', 'type'),
         ('finance', 'process'),
         ('people_count', 'number'),
         ('city', 'address'),
     )
     for field_name, icon_class in basic_info:
         loader.add_value(
             field_name,
             response.xpath(
                 "//div[@id='basic_container']//li//i[@class='%s']"
                 "/following-sibling::span[1]//text()" % icon_class
             ).extract_first(""))

     loader.add_value(
         'score',
         response.xpath("//span[@class='score']//text()").extract_first("0"))

     business_texts = response.xpath(
         r"//div[@class='company_bussiness_info_container']//div[@class='content']//text()"
     ).extract()
     # The second text node is used, so at least two nodes must exist; a
     # single-node result falls back to the empty string instead of
     # raising IndexError.
     create_date = business_texts[1] if len(business_texts) > 1 else ''
     loader.add_value('create_date', create_date)

     company_desc = clear_str(''.join(response.xpath(
         "//div[@id='company_intro']/div[@class='item_content']/div[@class='company_intro_text']//text()"
     ).extract()))
     loader.add_value('company_desc', company_desc.strip())
     loader.add_value('crawl_time', get_now())

     company_data = response.xpath(
         "//div[@class='company_data']//li//strong//text()").extract()
     # NOTE(review): indexes 3 and 0 are taken unguarded; a page with fewer
     # counters raises IndexError here.
     loader.add_value('review_count', company_data[3].strip())
     loader.add_value('job_count', company_data[0].strip())

     return loader.load_item()
Пример #8
0
 def parse_job(self, response):
     """Parse a Lagou job page into a ``LagouJob`` item.

     Nothing is returned (``None``) when ``delete_ip(response)`` is falsy,
     when the response URL does not match the job-page pattern, or when
     ``check_table_url`` returns a falsy value for the trimmed URL.

     :param response: response of the job detail page.
     :return: the loaded job item, or ``None`` when the page is skipped.
     :raises IndexError: if the ``job_request`` block yields fewer than
         five text values.
     """
     # Per the original note, a non-200 response gets its proxy IP removed;
     # parsing only continues when delete_ip() returns a truthy value.
     if not delete_ip(response):
         return None

     match = re.match(r'(https://www.lagou.com/jobs/?\d+.html).*$',
                      response.url)
     if not match:
         return None

     # Keep only the part of the URL up to ".html".
     url = match.group(1)
     # Checks the database for this URL; presumably truthy when the URL
     # still has to be stored -- verify against check_table_url.
     if not check_table_url('lagou_job', url):
         return None

     loader = LagouJobItemLoader(item=LagouJob(), response=response)
     loader.add_xpath('title', "//div[@class='job-name']//h1/text()")
     loader.add_value('url', url)
     loader.add_value('url_object_id', get_md5(url))

     # Positional values: 0 salary, 1 city, 2 work years, 3 degree,
     # 4 job type.
     # NOTE(review): indexed unguarded; fewer than five values raises
     # IndexError.
     job_request = response.xpath(
         "//dd[@class='job_request']//span/text()").extract()
     salary = job_request[0].strip()
     loader.add_value('max_salary', get_max_min_salary(salary, True))
     loader.add_value('min_salary', get_max_min_salary(salary, False))
     loader.add_value('job_city', get_city(job_request[1]))
     loader.add_value('work_years', job_request[2])
     loader.add_value('degree_need', job_request[3])
     loader.add_value('job_type', job_request[4])

     loader.add_xpath('publish_time', "//p[@class='publish_time']/text()")
     loader.add_xpath('job_advantage',
                      "//dd[@class='job-advantage']//p/text()")
     loader.add_xpath('job_desc', "//div[@class='job-detail']//text()")

     job_addr = ''.join(
         response.xpath("//div[@class='work_addr']//text()").extract())
     loader.add_value('job_addr', clear_str(job_addr))

     # extract_first("") instead of extract()[0]: a missing company-name
     # node yields an empty string rather than raising IndexError.
     loader.add_value(
         "company_name",
         response.xpath(
             "//h3[@class='fl']/em/text()").extract_first("").strip())

     # Both fields are fed the same href value.
     company_href = "//dl[@class='job_company']//a/@href"
     loader.add_xpath("company_url", company_href)
     loader.add_xpath("company_url_id", company_href)

     tags = response.xpath(
         "//ul[@class='position-label clearfix']//li/text()").extract()
     loader.add_value('tags', clear_str(','.join(tags)) if tags else '')
     loader.add_value('crawl_time', get_now())

     return loader.load_item()
Пример #9
0
    def parse_job(self, response):
        """Parse a Lagou job posting into a ``LagouJobItem``.

        :param response: response of the job detail page.
        :return: the item produced by the loader's ``load_item()``.
        """
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        page_url = response.url
        loader.add_value("url", page_url)
        loader.add_value("url_object_id", get_md5(page_url))
        loader.add_css("title", "div.job-name::attr(title)")
        loader.add_css("salary", ".salary::text")

        # Spans 2-5 of the job_request paragraph, in field order.
        span_fields = ("job_city", "work_years", "degree_need", "job_type")
        for span_no, field_name in enumerate(span_fields, start=2):
            loader.add_xpath(
                field_name,
                "//*[@class='job_request']/p/span[{}]/text()".format(span_no))

        # NOTE(review): the key is spelled "pulish_time"; confirm it matches
        # the field declared on LagouJobItem.
        loader.add_css("pulish_time", ".publish_time::text")
        loader.add_xpath("tags",
                         "//*[@class='position-label clearfix']/li/text()")
        loader.add_xpath("job_advantage",
                         "//*[@class='job-advantage']/p/text()")
        loader.add_xpath("job_desc", "//*[@class='job_bt']/div")
        loader.add_xpath("job_addr", "//*[@class='work_addr']/a/text()")
        # The company URL is read from the link's title attribute.
        loader.add_xpath("company_url",
                         "//*[@class='c_feature']/li/a/@title")
        loader.add_css("company_name", ".job_company dt img::attr(alt)")

        # Two separate now() calls: each field gets its own timestamp.
        loader.add_value("crawl_time", datetime.datetime.now())
        loader.add_value("crawl_update_time", datetime.datetime.now())

        return loader.load_item()
Пример #10
0
    def parse_item(self, response):
        """Parse a Lagou job posting into a ``LagouJobItem``.

        :param response: response of the job detail page.
        :return: the item produced by the loader's ``load_item()``.
        """
        # LagouJobItem is the Lagou item; LagouJobItemLoader is the custom
        # loader written for it.
        job_item = LagouJobItem()
        loader = LagouJobItemLoader(item=job_item, response=response)

        loader.add_css("title", ".job-name::attr(title)")
        loader.add_value("url", response.url)
        loader.add_value("url_object_id", get_md5(response.url))
        loader.add_css("salary", ".job_request .salary::text")

        # Spans 2-5 anywhere below the job_request element, in field order.
        span_fields = ("job_city", "work_years", "degree_need", "job_type")
        for span_no, field_name in enumerate(span_fields, start=2):
            loader.add_xpath(
                field_name,
                "//*[@class='job_request']//span[%d]/text()" % span_no)

        loader.add_css("tags", '.position-label li::text')
        loader.add_css("publish_time", ".publish_time::text")
        loader.add_css("job_advantage", ".job-advantage p::text")
        loader.add_css("job_desc", ".job_bt div")
        # The address node contains embedded HTML, so ::text is not used
        # here; the unwanted markup is stripped inside the item loader.
        loader.add_css("job_addr", ".work_addr")
        loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        loader.add_css("company_url", "#job_company dt a::attr(href)")
        loader.add_value("crawl_time", datetime.now())

        return loader.load_item()
Пример #11
0
    def parse_job(self, response):
        """
        Parse a Lagou job posting.

        :param response: response of the job detail page.
        :return: the item produced by the loader's ``load_item()``.
        """
        loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

        loader.add_css("title", ".job-name span::text")
        loader.add_value("url", response.url)
        loader.add_css("salary", ".salary::text")

        # Spans 2-5 of the job_request paragraph, in field order.
        span_xpath = "//*[@class='job_request']/p/span[{}]/text()"
        span_fields = ("job_city", "work_years", "degree_need", "job_type")
        for span_no, field_name in zip(range(2, 6), span_fields):
            loader.add_xpath(field_name, span_xpath.format(span_no))

        loader.add_css("publish_time", ".publish_time::text")
        loader.add_css("job_advantage", ".job-advantage p::text")
        loader.add_css("job_desc", ".job_bt div")
        loader.add_css("job_addr", ".work_addr")

        loader.add_css("company_url", "#job_company dt a::attr(href)")
        loader.add_css("company_name", "#job_company dt a div h2::text")

        return loader.load_item()
Пример #12
0
    def parse_job(self, response):
        """Parse a Lagou job posting into a ``LagouJobItem``.

        :param response: response of the job detail page.
        :return: the item produced by the loader's ``load_item()``.
        """
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)
        # An empty string is not a parseable CSS selector, so the title
        # needs a real one.
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job_request .salary::text")
        # text() with parentheses selects text nodes; a bare "text" step
        # would look for child elements named <text> and match nothing.
        item_loader.add_xpath("job_city",
                              "//*[@class='job_request']/p/span[2]/text()")
        # CSS is used for this one field on purpose, as practice with CSS
        # selectors.
        item_loader.add_css("work_years",
                            ".job_request p span:nth-child(3)::text")
        item_loader.add_xpath("degree_need",
                              "//dd[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath("job_type",
                              "//dd[@class='job_request']/p/span[5]/text()")

        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("tags", ".position-label.clearfix li::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        item_loader.add_value("crawl_time", datetime.datetime.now())

        # Bound to a name before returning to ease debugging and reading,
        # rather than returning the call directly for brevity.
        job_item = item_loader.load_item()

        return job_item
Пример #13
0
 def parse_job(self, response):
     """Parse a Lagou job posting into a ``LagouJobItem``.

     :param response: response of the job detail page.
     :return: the item produced by the loader's ``load_item()``.
     """
     loader = LagouJobItemLoader(item=LagouJobItem(), response=response)

     loader.add_css("title", ".job-name::attr(title)")
     loader.add_value("url", response.url)
     loader.add_value("url_object_id", get_md5(response.url))
     loader.add_css("salary", ".job_request .salary::text")

     # Spans 2-5 of the job_request paragraph, in field order.
     span_no = 2
     for field_name in ("job_city", "work_years", "degree_need", "job_type"):
         loader.add_xpath(
             field_name,
             "//*[@class='job_request']/p/span[%d]/text()" % span_no)
         span_no += 1

     loader.add_css("tags", ".position-label li::text")
     loader.add_css("publish_time", ".publish_time::text")
     loader.add_css("job_advantage", ".job-advantage p::text")
     loader.add_css("job_desc", ".job_bt div")
     loader.add_css("job_addr", ".work_addr")
     loader.add_css("company_url", "#job_company dt a::attr(href)")
     loader.add_css("company_name", "#job_company dt a img::attr(alt)")
     loader.add_value("crawl_time", datetime.now())

     return loader.load_item()