Example #1
 def parse(self, response):
     try:
         for each in response.xpath("//tr[@class='even']|//tr[@class='odd']"):
             # initialize the item object
             item = TencentItem()
             # position name
             item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
             # detail link
             item['positionLink'] = 'https://hr.tencent.com/' + each.xpath("./td[1]/a/@href").extract()[0]
             # position category
             item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
             # number of openings
             item['positionNum'] = each.xpath("./td[3]/text()").extract()[0]
             # work location
             item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
             # publish time
             item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]
         
             yield item
     except IndexError:
         pass  # a row with missing cells aborts the rest of this page
        
     if self.pageNum < 1680:
         self.pageNum += 10
     else:
         return  # all pages crawled; stop issuing requests
     # after finishing one page, send a request for the next page
     yield scrapy.Request(self.url + str(self.pageNum), callback=self.parse)
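Note: every snippet on this page assumes a TencentItem declared in the project's items.py, which the examples do not show. A minimal sketch matching the field names used in Example #1 (an assumption; each spider's actual items.py must declare whichever keys it assigns):

    import scrapy

    class TencentItem(scrapy.Item):
        # fields assigned by Example #1; other examples use different keys
        positionName = scrapy.Field()
        positionLink = scrapy.Field()
        positionType = scrapy.Field()
        positionNum = scrapy.Field()
        workLocation = scrapy.Field()
        publishTime = scrapy.Field()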
Example #2
    def parse(self, response):

        node_list = response.xpath("//*[@class='even']|//*[@class='odd']")
        if not node_list:
            return

        for node in node_list:
            # instantiate inside the loop so every row yields its own item
            items = TencentItem()
            items["name"] = node.xpath(
                "./td[1]/a/text()").extract_first()  # extract_first() returns a single value
            items["detailLink"] = node.xpath("./td[1]/a/@href").extract_first()
            items["positionInfo"] = node.xpath(
                "./td[2]/text()").extract_first()
            items["peopleNumber"] = node.xpath(
                "./td[3]/text()").extract_first()
            items["workLocation"] = node.xpath(
                "./td[4]/text()").extract_first()
            items["publishTime"] = node.xpath("./td[5]/text()").extract_first()
            yield items

        url = response.xpath("//a[@id='next']/@href").extract_first()
        print("-" * 50, url)

        # on the last page the next link is missing or reads 'javascript:;'
        if url and not url.startswith("javascript"):
            yield scrapy.Request("https://hr.tencent.com/" + url,
                                 callback=self.parse)
Example #3
 def parse(self, response):
     position_lists = response.xpath(
         '//tr[@class="even"] | //tr[@class="odd"]')
     for position in position_lists:
         item = TencentItem()
         position_name = position.xpath("./td[1]/a/text()").get()
         position_link = position.xpath("./td[1]/a/@href").get()
         position_type = position.xpath("./td[2]/text()").get()
         people_num = position.xpath("./td[3]/text()").get()
         work_address = position.xpath("./td[4]/text()").get()
         publish_time = position.xpath("./td[5]/text()").get()
         item['position_name'] = position_name
         item['position_link'] = position_link
         item['position_type'] = position_type
         item['people_num'] = people_num
         item['work_address'] = work_address
         item['publish_time'] = publish_time
         yield item
     total_page = response.xpath(
         '//div[@class="left"]/span/text()').extract()[0]
     # only request the next page while the offset is below the total count
     if self.offset < int(total_page):
         self.offset += 10
         new_url = "https://hr.tencent.com/position.php?&start=" + str(
             self.offset)
         yield scrapy.Request(new_url, callback=self.parse)
Example #4
    def parse(self, response):
        for each in response.xpath("//tr[@class='even'] |//tr[@class='odd']"):
            # initialize the item object
            item = TencentItem()
            # position name
            item["positionname"] = each.xpath("./td[1]/a/text()").extract_first()
            # detail link
            item["positionlink"] = each.xpath("./td[1]/a/@href").extract_first()
            # position category
            item["positionType"] = each.xpath("./td[2]/text()").extract_first()
            # number of openings
            item["peopleNum"]    = each.xpath("./td[3]/text()").extract_first()
            # work location
            item["worlLocation"] = each.xpath("./td[4]/text()").extract_first()
            # publish time
            item["publishTime"]  = each.xpath("./td[5]/text()").extract_first()

            yield item

        if self.offset < 60:
            self.offset += 10

            # after each page is processed, increment the offset by 10,
            # build the next page's URL, and send a new request with
            # self.parse as the callback (the scheduler enqueues it and
            # hands it to the downloader)
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
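The offset-based examples also rely on spider attributes (name, url, offset) that only appear in the full spider files. A plausible skeleton, with attribute values assumed from the URLs used above:

    import scrapy

    class TencentSpider(scrapy.Spider):
        # assumed skeleton: the parse() methods above reference self.url and
        # self.offset, which must be initialized on the spider class
        name = "tencent"
        allowed_domains = ["hr.tencent.com"]
        url = "https://hr.tencent.com/position.php?&start="
        offset = 0
        start_urls = [url + str(offset)]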
Example #5
    def parse(self, response):
        """
        获取响应, 触发解析函数, 提取数据, 提取URL
        :param response: 下载==>中央引擎==>爬虫 的response对象
        :return: 数据 URL
        """
        response_dict = json.loads(response.text)
        response_data = response_dict["Data"]
        info_count = response_data["Count"]  # total number of postings
        for info in response_data["Posts"]:
            item = TencentItem()
            item["Name"] = info["RecruitPostName"]
            item["Location"] = info["CountryName"] + info["LocationName"]
            item["BG"] = info["BGName"]
            item["Category"] = info["CategoryName"]
            item["Responsibility"] = info["Responsibility"]
            item["Time"] = info["LastUpdateTime"]
            item["URL"] = info["PostURL"]
            yield item

        # extract the remaining pages and keep requesting
        num = ceil(int(info_count) / 20)  # ceil() rounds up to a whole page count
        for page in range(2, num + 1):  # range() excludes the stop value, so go to num + 1
            next_url = "https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1566111052462&pageSize=20&language=zh-cn&area=cn&pageIndex={}".format(
                page)

            # 1. wrap the URL in a Request object
            request = scrapy.Request(url=next_url)

            # 2. hand the request to the engine
            yield request
Example #6
    def parse_detail(self, response):
        """
        处理需要爬取页面的

        :param response: 内容页面返回信息
        :return: 待持久化item
        """
        tencent_item = TencentItem()
        soup = BeautifulSoup(response.body.decode("utf-8"), "html.parser")

        title = soup.select("h1")[0].text
        create_date = get_now_time()
        url = response.url
        url_object_id = get_md5(url)
        content = ""

        if len(soup.select(".videoPlayer")) == 0:
            if len(soup.select("p")) > 0:
                content_list = soup.select("p")
                for element in content_list:
                    content = content + remove_special_label(element.text)
                content_list = soup.select(".text")
                for element in content_list:
                    content = content + remove_special_label(element.text)

        tencent_item["title"] = title
        tencent_item["create_date"] = create_date
        tencent_item["url"] = url
        tencent_item["url_object_id"] = url_object_id
        tencent_item['content'] = content
        if len(title) > 0 and len(content) > 0:
            return tencent_item
        else:
            logging.log(logging.ERROR, url)
            return None
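Example #6 imports get_now_time, get_md5, and remove_special_label from elsewhere in its project. Minimal sketches of what they might look like (assumed implementations, not from the source):

    import hashlib
    import re
    from datetime import datetime

    def get_md5(url):
        # hash the URL so it can serve as a fixed-length key
        return hashlib.md5(url.encode("utf-8")).hexdigest()

    def get_now_time():
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def remove_special_label(text):
        # collapse the whitespace runs left over from the HTML
        return re.sub(r"\s+", " ", text).strip()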
Example #7
    def parse(self, response):
        """
        获取响应, 触发解析函数, 提取数据, 提取URL
        :param response: 下载==>中央引擎==>爬虫 的response对象
        :return: 数据 URL
        """
        response_dict = json.loads(response.text)
        response_data = response_dict["Data"]
        info_count = response_data["Count"]  # total number of postings
        for info in response_data["Posts"]:
            item = TencentItem()
            item["Name"] = info["RecruitPostName"]
            item["Location"] = info["CountryName"] + info["LocationName"]
            item["BG"] = info["BGName"]
            item["Category"] = info["CategoryName"]
            # item["Responsibility"] = info["Responsibility"].replace("\n", "").replace("\r", "").strip()
            item["Time"] = info["LastUpdateTime"]
            yield item  # hand the item to the engine

            detail_info = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?postId='
            post_id = info["PostId"]
            yield scrapy.Request(
                url=detail_info + post_id,
                # parse() cannot handle the detail responses sent from
                # here, so register parse_detail as the callback
                callback=self.parse_detail
            )
            break  # for testing: only follow the first posting
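Example #7 registers parse_detail for the ByPostId endpoint but does not include it. A minimal sketch, assuming the endpoint returns JSON shaped like {"Data": {...}} with a Responsibility field (both assumptions):

    def parse_detail(self, response):
        # assumed callback for the detail API wired up above
        detail = json.loads(response.text)["Data"]
        item = TencentItem()
        item["Responsibility"] = detail.get("Responsibility", "").strip()
        yield item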
Example #8
    def parse(self, response):
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")

        for node in node_list:
            item = TencentItem()
            item['positionName'] = node.xpath("./td[1]/a/text()").extract()[0]
            item['positionLink'] = 'http://hr.tencent.com/' + node.xpath(
                "./td[1]/a/@href").extract()[0]
            if len(node.xpath("./td[2]/text()")):
                item['positionType'] = node.xpath(
                    "./td[2]/text()").extract()[0]
            else:
                item['positionType'] = ''
            item['peopleNumber'] = node.xpath("./td[3]/text()").extract()[0]
            item['workLocation'] = node.xpath("./td[4]/text()").extract()[0]
            item['publishTime'] = node.xpath("./td[5]/text()").extract()[0]
            yield item

        total = int(
            response.xpath(
                "//span[@class='lightblue total']/text()").extract()[0])

        if self.offset < total:
            self.offset += len(node_list)
            url = self.baseUrl + str(self.offset)
            yield Request(url=url, callback=self.parse)
Example #9
 def parse(self, response):
     # XPath matching rules
     for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
         item = TencentItem()
         # position name
         item["positionname"] = each.xpath("./td[1]/a/text()").extract()[0]
         # detail link
         item["positionLink"] = each.xpath("./td[1]/a/@href").extract()[0]
         # position category (may be empty for some rows)
         try:
             item["positionType"] = each.xpath(
                 "./td[2]/text()").extract()[0]
         except IndexError:
             item["positionType"] = '空'
         # number of openings
         item["peopleNum"] = each.xpath("./td[3]/text()").extract()[0]
         # work location
         item["workLocation"] = each.xpath("./td[4]/text()").extract()[0]
         # publish time
         item["publishTime"] = each.xpath("./td[5]/text()").extract()[0]
         # hand the data to the pipelines
         yield item
     # advance to the next page's offset
     if self.offset < 2620:
         self.offset += 10
         # hand the new request to the engine
         yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Example #10
    def parse(self, response):
        for each in response.xpath('//tr[@class="even"]|//tr[@class="odd"]'):
            # initialize the item object
            item = TencentItem()

            # position name
            item['positionname'] = each.xpath('./td[1]/a/text()').extract()[0]
            # position link
            item['positionlink'] = each.xpath('./td[1]/a/@href').extract()[0]
            # position category (may be empty for some rows)
            positiontype = each.xpath('./td[2]/text()').extract()
            if positiontype:
                item['positiontype'] = positiontype[0]
            else:
                item['positiontype'] = '职位类别'

            # number of openings
            item['peoplenumber'] = each.xpath('./td[3]/text()').extract()[0]
            # work location
            item['worklocatiom'] = each.xpath('./td[4]/text()').extract()[0]
            # publish time
            item['publishtime'] = each.xpath('./td[5]/text()').extract()[0]

            yield item

        if self.offset < 2140:
            self.offset += 10

            # after each page is processed, request the next page
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Example #11
    def parse(self, response):
        for each in response.xpath('//tr[@class="even"]|//tr[@class="odd"]'):
            # initialize the item object
            item = TencentItem()
            # position name
            item['positionName'] = each.xpath('./td[1]/a/text()').extract()[0]
            # detail link
            item['positionLink'] = each.xpath('./td[1]/a/@href').extract()[0]
            # position category
            item['positionType'] = each.xpath('./td[2]/text()').extract()[0]
            # number of openings
            item['positionNum'] = each.xpath('./td[3]/text()').extract()[0]
            # work location
            item['workLocation'] = each.xpath('./td[4]/text()').extract()[0]
            # publish time
            item['publishTime'] = each.xpath('./td[5]/text()').extract()[0]

            yield item

        if self.offset < 3920:
            self.offset += 10

            # after each page, increment self.offset by 10, build the new
            # url, and let the callback self.parse handle the response
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Example #12
    def parse(self, response):
        jobs = []
        for each in response.xpath('//div[@class="recruit-list"]'):
            item = TencentItem()
            # position name
            item['position_name'] = each.xpath('./a/h4/text()').extract()[0]
            # position category (the path must be relative: a leading '/'
            # would search from the document root, not from this node)
            item['position_type'] = each.xpath(
                './a/p/span[3]/text()').extract()[0]
            # work location
            item['location'] = each.xpath('./a/p/span[2]/text()').extract()[0]
            # publish time
            item['pub_time'] = each.xpath('./a/p/span[4]/text()').extract()[0]

            print(item['position_name'])
            print(item['position_type'])
            print(item['location'])
            print(item['pub_time'])

            yield item
            # jobs.append(item)

        if self.offset < 10:
            self.offset += 1

            # return jobs
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Example #13
    def parse(self, response):
        for each in response.xpath('//tr[@class="even"] | //tr[@class="odd"]'):
            # initialize the item object
            item = TencentItem()
            # print(item)  # starts out as an empty dict {}

            # position name
            item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
            # detail link
            item['positionLink'] = each.xpath('./td[1]/a/@href').extract()[0]
            # position category (extract_first() returns None instead of
            # raising when the cell is empty)
            item['positionType'] = each.xpath("./td[2]/text()").extract_first()
            # number of openings
            item['peopleNum'] = each.xpath('./td[3]/text()').extract()[0]
            # work location
            item['workLocation'] = each.xpath('./td[4]/text()').extract()[0]
            # publish time
            item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]

            # yield matters: it bridges the spider and the item pipelines,
            # handing each item over as soon as it is filled in
            yield item
        if self.offset < 2623:
            self.offset += 10

            # after each page, increment self.offset by 10, build the new
            # url, and let the callback self.parse handle the response
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Example #14
    def parse(self, response):
        for each in response.xpath('//tr[@class="even"] | //tr[@class="odd"]'):
            # initialize the item object
            item = TencentItem()
            # position name
            item["sitionname"] = each.xpath("./td[1]/a/text()").extract()[0]
            # detail link
            item["positionlink"] = each.xpath("./td[1]/a/@href").extract()[0]
            # position category
            item["positiontype"] = each.xpath("./td[2]/text()").extract()[0]
            # number of openings
            item["perpleNum"] = each.xpath("./td[3]/text()").extract()[0]
            # work location
            item["workLocation"] = each.xpath("./td[4]/text()").extract()[0]
            # publish time
            item["publishTime"] = each.xpath("./td[5]/text()").extract()[0]

            # hand the data to the pipelines
            yield item

        if self.offset < 3320:
            self.offset += 10

            # after each page, request the next one; the scheduler
            # enqueues the request and hands it to the downloader
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Example #15
    def parse(self, response):

        for each in response.xpath("//tr[@class='even']|//tr[@class='odd']"):

            item = TencentItem()
            name = each.xpath('./td[1]/a/text()').extract()[0]
            detailLink = each.xpath('./td[1]/a/@href').extract()[0]
            positionInfo = each.xpath('./td[2]/text()').extract()[0]
            peopleNumber = each.xpath('./td[3]/text()').extract()[0]
            workLocation = each.xpath('./td[4]/text()').extract()[0]
            publishTime = each.xpath('./td[5]/text()').extract()[0]

            print(name,detailLink,positionInfo,peopleNumber,workLocation,publishTime)

            item['name'] = name
            item['detailLink'] = "http://hr.tencent.com/" + detailLink
            item['positionInfo'] = positionInfo
            item['peopleNumber'] = peopleNumber
            item['workLocation'] = workLocation
            item['publishTime'] = publishTime

            yield item

        # build the next page's URL once per response; raw strings keep
        # the regex escapes intact
        curpage = re.search(r'(\d+)', response.url).group(1)
        page = int(curpage) + 10
        url = re.sub(r'\d+', str(page), response.url)

        yield scrapy.Request(url, callback=self.parse)
Example #16
 def parse(self, response):
     # XPath matching rules
     # response wraps the fetched page, e.g. <200 https://hr.tencent.com/position.php?&start=50>
     for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
         item = TencentItem()
         # position name
         item["positionname"] = each.xpath("./td[1]/a/text()").extract()[0]
         # detail link
         item["positionLink"] = each.xpath("./td[1]/a/@href").extract()[0]
         # position category (may be empty for some rows)
         try:
             item["positionType"] = each.xpath(
                 "./td[2]/text()").extract()[0]
         except IndexError:
             item["positionType"] = '空'
         # number of openings
         item["peopleNum"] = each.xpath("./td[3]/text()").extract()[0]
         # work location
         item["workLocation"] = each.xpath("./td[4]/text()").extract()[0]
         # publish time
         item["publishTime"] = each.xpath("./td[5]/text()").extract()[0]
         # hand the data to the pipelines
         yield item
     # advance to the next page's offset
     if self.offset < 50:
         self.offset += 10
         # hand the new request to the engine
         yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Example #17
    def parse(self, response):
        all_list = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')

        for position in all_list:
            # create a fresh item per row instead of mutating one shared instance
            item = TencentItem()
            position_name = position.xpath("./td[1]/a/text()").get()
            position_link = position.xpath("./td[1]/a/@href").get()
            position_type = position.xpath("./td[2]/text()").get()
            people_num = position.xpath("./td[3]/text()").get()
            work_address = position.xpath("./td[4]/text()").get()
            publish_time = position.xpath("./td[5]/text()").get()
            item["position_name"] = position_name
            item["position_link"] = position_link
            item["position_type"] = position_type
            item["people_num"] = people_num
            item["work_address"] = work_address
            item["publish_time"] = publish_time
            yield item

        if self.page_nums < int(
                response.xpath('//tr[@class="f"]/td/div/span/text()').get()):
            self.page_nums += 10
            url = self.url + str(self.page_nums) + '#a'
            yield scrapy.Request(url, callback=self.parse)
Example #18
    def parse(self, response):
        positions = response.xpath('//tr[@class="even"]|//tr[@class="odd"]')

        for position in positions:
            item = TencentItem()

            item['name'] = position.xpath('./td[1]/a/text()').extract()[0]
            print(item['name'])
            item['link'] = position.xpath('./td[1]/a/@href').extract()[0]
            # print(item['link'])
            # some categories on hr.tencent.com are empty; guard against it
            position_type = position.xpath('./td[2]/text()')
            if len(position_type) > 0:
                position_type = position_type.extract()[0]
            else:
                position_type = ''
            item['type'] = position_type
            item['num'] = position.xpath('./td[3]/text()').extract()[0]
            item['location'] = position.xpath('./td[4]/text()').extract()[0]
            item['time'] = position.xpath('./td[5]/text()').extract()[0]
            # build the detail-page request
            request = scrapy.Request("https://hr.tencent.com/" + item['link'],
                                     callback=self.getInfo)
            request.meta['item'] = item  # pass the item along; getInfo reads response.meta
            yield request

        if self.offset < 1000:

            self.offset += 10
            yield scrapy.Request(self.url + str(self.offset),
                                 callback=self.parse)
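Example #18 carries the half-filled item to a getInfo callback through request.meta; the callback itself is not shown. A sketch under that assumption (the 'info' field and the XPath are illustrative, not from the source):

    def getInfo(self, response):
        # recover the item passed through meta, add a detail field, and
        # hand the completed item to the pipelines
        item = response.meta['item']
        item['info'] = "".join(
            response.xpath('//ul[@class="squareli"]/li/text()').extract())
        yield item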
Example #19
    def parse(self, response):
        pattern1 = re.compile(r'<tr class="(even|odd)">(.*?)</tr>', re.S)
        # pattern2 = re.compile(r'<td(.*?)>(.*?)</td>', re.S)
        positions = re.findall(pattern1, response.text)
        for position in positions:
            item = TencentItem()  # create a fresh item per row
            # infos = re.findall(pattern2,position[1])
            # for info in infos:
            #     print(info[1])
            try:

                c = etree.HTML(position[1])
                item['name'] = c.xpath("//td/a/text()")[0]
                item['positiontype'] = c.xpath("//td/text()")[0]
                item['num'] = c.xpath("//td/text()")[1]
                item['address'] = c.xpath("//td/text()")[2]
                item['positiontime'] = c.xpath("//td/text()")[3]
                yield item
            except Exception as e:
                print(e)

        if self.pageNum < 2840:
            self.pageNum += 10
            yield scrapy.Request(self.url + str(self.pageNum),
                                 callback=self.parse)
Example #20
    def parse(self, response):

        # locate the table rows that hold the job postings
        node_list = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')

        # grab the next-page link
        next_page = response.xpath('//a[@id="next"]/@href').extract_first()

        # walk the rows; a detail-page request fetches the remaining fields
        for node in node_list:
            # instantiate the item and fill it in
            item = TencentItem()

            item['position_name'] = node.xpath(
                './td[1]/a/text()').extract_first()
            item['position_link'] = node.xpath(
                './td[1]/a/@href').extract_first()
            item['position_type'] = node.xpath(
                './td[2]/text()').extract_first()
            item['wanted_number'] = node.xpath(
                './td[3]/text()').extract_first()
            item['work_location'] = node.xpath(
                './td[4]/text()').extract_first()
            item['publish_time'] = node.xpath('./td[5]/text()').extract_first()

            yield item
            yield scrapy.Request(url=self.base_url + item['position_link'],
                                 callback=self.details)

        # follow the next page if there is one ('javascript:;' marks the last page)
        if next_page and 'javascript' not in next_page:
            yield scrapy.Request(url=self.base_url + next_page,
                                 callback=self.parse)
Example #21
    def parse(self, response):
        zhiwei_list = response.xpath("//tr[@class='even']|//tr[@class='odd']")

        for zhiwei in zhiwei_list:
            item = TencentItem()

            name = zhiwei.xpath("./td[1]/a/text()").extract()[0]
            link = zhiwei.xpath("./td[1]/a/@href").extract()[0]

            if len(zhiwei.xpath("./td[2]/text()")) > 0:
                leibie = zhiwei.xpath("./td[2]/text()").extract()[0]
            else:
                leibie = "暂无"
            num = zhiwei.xpath("./td[3]/text()").extract()[0]
            addr = zhiwei.xpath("./td[4]/text()").extract()[0]
            date = zhiwei.xpath("./td[5]/text()").extract()[0]

            item['name'] = name
            item['link'] = link
            item['leibie'] = leibie
            item['num'] = num
            item['addr'] = addr
            item['date'] = date

            yield item
        if self.start_num < 2260:
            self.start_num += 10

            # the callback defaults to self.parse when omitted
            yield scrapy.Request("http://hr.tencent.com/position.php?&start=" + str(self.start_num))
Example #22
    def parse(self, response):
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            # initialize the item object
            item = TencentItem()
            # position name
            item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
            # detail link
            item['positionLink'] = each.xpath("./td[1]/a/@href").extract()[0]
            # position category (may be empty for some rows)
            try:
                item['positionType'] = each.xpath(
                    "./td[2]/text()").extract()[0]
            except IndexError:
                item['positionType'] = '空'
            # number of openings
            item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
            # work location
            item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
            # publish time
            item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]

            yield item
        if self.offset < 180:
            self.offset += 10

            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Example #23
 def parse(self, response):
     text = response.text
     data_list = json.loads(text)['Data']['Posts']
     for data_dict in data_list:
         item = TencentItem()
         item['Position_Name'] = data_dict['RecruitPostName']
         item['Position_Location'] = data_dict['CountryName'] + data_dict[
             'LocationName']
         item['Responsibility'] = data_dict['Responsibility']
         item['Last_Update_Time'] = data_dict['LastUpdateTime']
         item['Position_Src'] = data_dict['PostURL']
         # request the full posting details separately
         url = "https://careers.tencent.com/tencentcareer/api/post/ByPostId?postId=" + data_dict[
             'PostId']
         yield scrapy.Request(url,
                              callback=self.parse_detail,
                              meta={"item": item},
                              dont_filter=False)
     # pagination: request the next page while postings remain
     global page
     print("Finished scraping page {}".format(page))
     page = page + 1
     if (page * 10 - 9) <= int(json.loads(text)['Data']['Count']):
         next_url = "https://careers.tencent.com/tencentcareer/api/post/Query?countryId=1&parentCategoryId=40001&pageIndex={}&pageSize=10".format(
             page)
         yield scrapy.Request(next_url,
                              callback=self.parse,
                              dont_filter=False)
Example #24
    def parse(self, response):
        items = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
        for each in items:
            item = TencentItem()
            try:
                name = each.xpath('./td[1]/a/text()').extract()
                link = each.xpath('./td[1]/a/@href').extract()
                style = each.xpath('./td[2]/text()').extract()
                num = each.xpath('./td[3]/text()').extract()
                location = each.xpath('./td[4]/text()').extract()
                time = each.xpath('./td[5]/text()').extract()

                item['positionName'] = name[0].encode('utf-8')
                item['positionLink'] = link[0].encode('utf-8')
                item['positionStyle'] = style[0].encode('utf-8')
                item['positionNumber'] = num[0].encode('utf-8')
                item['positionLocation'] = location[0].encode('utf-8')
                item['positionTime'] = time[0].encode('utf-8')
            except IndexError:
                # a row with missing cells would leave the item incomplete
                continue

            yield item

        # build the next page's URL once per response; raw strings keep
        # the regex escapes intact
        count = re.search(r'\d+', response.url).group(0)
        page = int(count) + 10
        url = re.sub(r'\d+', str(page), response.url)

        yield scrapy.Request(url, callback=self.parse)
Example #25
    def parse(self, response):
        # extract the data from this response
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")

        for node in node_list:
            item = TencentItem()
            # extract each posting's fields
            item['position_name'] = node.xpath("./td[1]/a/text()").extract()[0]
            item['position_link'] = node.xpath("./td[1]/a/@href").extract()[0]

            if len(node.xpath("./td[2]/text()")):
                item['position_type'] = node.xpath(
                    "./td[2]/text()").extract()[0]
            else:
                item['position_type'] = '不定'

            item['people_num'] = node.xpath("./td[3]/text()").extract()[0]
            item['work_location'] = node.xpath("./td[4]/text()").extract()[0]
            item['publish_time'] = node.xpath("./td[5]/text()").extract()[0]

            # yield hands the item over and then resumes execution right here
            yield item

        # Option 1: concatenate the offset into the URL; use this when the
        # page has no clickable next link and the URL must be built by hand
        # if self.offset < 3330:
        #     self.offset += 10
        #     url = self.base_url + str(self.offset)
        #     yield scrapy.Request(url, callback=self.parse)

        # Option 2: read the next-page link straight from the response and
        # keep requesting until the pages run out
        if not len(response.xpath("//a[@class='noactive' and @id='next']")):
            url = response.xpath("//a[@id='next']/@href")[0].extract()
            yield scrapy.Request("http://hr.tencent.com/" + url,
                                 callback=self.parse)
Example #26
    def parse(self, response):

        for each in response.xpath(
                '//*[contains(@class,"odd") or contains(@class,"even")]'):
            item = TencentItem()
            name = each.xpath('./td[1]/a/text()').extract_first()
            # the href is the detail link; td[2] holds the position info
            detailLink = "https://hr.tencent.com/" + each.xpath(
                './td[1]/a/@href').extract_first()
            positionInfo = each.xpath('./td[2]/text()').extract_first()
            peopleNumber = each.xpath('./td[3]/text()').extract_first()
            workLocation = each.xpath('./td[4]/text()').extract_first()
            publishDate = each.xpath('./td[5]/text()').extract_first()

            item['name'] = name
            item['detailLink'] = detailLink
            item['positionInfo'] = positionInfo
            item['peopleNumber'] = peopleNumber
            item['workLocation'] = workLocation
            item['publishDate'] = publishDate

            yield item

        now_page = int(re.search(r'\d+', response.url).group())
        if now_page < 10:
            url = re.sub(r'\d+', str(now_page + 10), response.url)

            yield scrapy.Request(url, callback=self.parse)
Example #27
    def parse(self, response):
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            # initialize the item object
            item = TencentItem()

            # position name
            item['positionname'] = each.xpath("./td[1]/a/text()").extract()[0]
            # detail link
            item['positionlink'] = each.xpath("./td[1]/a/@href").extract()[0]
            # position category
            item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
            # number of openings
            item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
            # work location
            item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
            # publish time
            item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]

            yield item

        if self.offset < 1680:
            self.offset += 10

            # after each page, increment self.offset by 10, build the new
            # url, and let the callback self.parse handle the response
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Example #28
    def parse(self, response):
        # grab the list of posting rows
        node_list = response.xpath('//tr[@class="even"]|//tr[@class="odd"]')
        # print(len(node_list))

        # walk the rows and extract the fields from each one
        for node in node_list:
            # instantiate the item
            item = TencentItem()
            # extract the data
            item['name'] = node.xpath('./td[1]/a/text()').extract()[0]
            item['detail_link'] = 'https://hr.tencent.com/' + node.xpath('./td[1]/a/@href').extract()[0]
            # extract_first() returns the first match, or None when there is none
            item['category'] = node.xpath('./td[2]/text()').extract_first()
            item['number'] = node.xpath('./td[3]/text()').extract()[0]
            item['address'] = node.xpath('./td[4]/text()').extract()[0]
            item['pub_date'] = node.xpath('./td[5]/text()').extract()[0]

            # hand the data to the engine
            yield item

        # build the next-page URL and hand it to the engine as a request
        next_url = 'https://hr.tencent.com/' + response.xpath('//*[@id="next"]/@href').extract()[0]
        # check whether the last page has been reached
        if 'javascript:;' not in next_url:
            # not the last page yet: request it to simulate paging
            yield scrapy.Request(next_url, callback=self.parse)
Example #29
    def parse(self, response):
        """处理response"""
        for eachJob in response.xpath(
                '//tr[@class="even"] | //tr[@class="odd"]'):
            # initialize the item object
            item = TencentItem()
            # copy one row's fields into the item
            item['jobName'] = eachJob.xpath('./td[1]/a/text()').extract()[0]

            item['jobDetail'] = eachJob.xpath('./td[1]/a/@href').extract()[0]
            item['jobType'] = eachJob.xpath('./td[2]/text()').extract_first()
            item['jobNeed'] = eachJob.xpath('./td[3]/text()').extract()[0]
            item['jobLocation'] = eachJob.xpath('./td[4]/text()').extract()[0]
            item['jobTime'] = eachJob.xpath('./td[5]/text()').extract()[0]
            # hand the extracted data to the pipeline
            yield item

        # queue the next page; each page holds 10 postings
        if self.offset < 3300:
            self.offset += 10
            next_page = self.url + str(self.offset)

            # hand the request back to the scheduler: it is enqueued,
            # dequeued, and given to the downloader
            yield scrapy.Request(url=next_page, callback=self.parse)
        else:
            print('the work finished')
Example #30
	def parse(self, response):
		for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
			# initialize the item object
			item = TencentItem()

			# each xpath call returns a list of selectors
			# position name
			item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
			# detail link
			item['positionLink'] = each.xpath("./td[1]/a/@href").extract()[0]
			# position category
			item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
			# number of openings
			item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
			# work location
			item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
			# publish time
			item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]

			yield item

		if self.offset < 1680:
			self.offset += 10

			# after each page, increment self.offset by 10, build the new
			# url, and let the callback self.parse handle the response
			yield scrapy.Request(self.url + str(self.offset), callback=self.parse)