Example #1
    def parse(self, response):
        # NB: this XPath only matches the even rows; see Example #2 for
        # the even|odd union that covers every row.
        for each in response.xpath('//*[@class="even"]'):
            item = TencentItem()
            name = each.xpath('./td[1]/a/text()').extract()[0]
            detailLink = each.xpath('./td[1]/a/@href').extract()[0]
            positionInfo = each.xpath('./td[2]/text()').extract()[0]
            peopleNumber = each.xpath('./td[3]/text()').extract()[0]
            workLocation = each.xpath('./td[4]/text()').extract()[0]
            publishTime = each.xpath('./td[5]/text()').extract()[0]

            # .encode('utf-8') is a Python 2 idiom; on Python 3 it stores
            # bytes, so assign the extracted strings directly instead.
            item['name'] = name.encode('utf-8')
            item['detailLink'] = detailLink.encode('utf-8')
            item['positionInfo'] = positionInfo.encode('utf-8')
            item['peopleNumber'] = peopleNumber.encode('utf-8')
            item['workLocation'] = workLocation.encode('utf-8')
            item['publishTime'] = publishTime.encode('utf-8')

            # Hand the scraped item to the pipeline
            yield item

        # Paginate once per response (the original yielded a new request
        # inside the row loop): bump the offset embedded in the URL and
        # enqueue the next page with self.parse as the callback.
        curpage = re.search(r'(\d+)', response.url).group(1)
        page = int(curpage) + 10
        url = re.sub(r'\d+', str(page), response.url)
        yield scrapy.Request(url, callback=self.parse)
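
Each snippet on this page assumes a TencentItem declared in the project's items.py, plus module-level imports of scrapy, re (and, for Example #8, json and time). The items module itself is not shown in the source; a minimal sketch, with the field names inferred from the parse() above:

    # items.py -- minimal sketch; field names inferred from the snippets
    import scrapy

    class TencentItem(scrapy.Item):
        name = scrapy.Field()
        detailLink = scrapy.Field()
        positionInfo = scrapy.Field()
        peopleNumber = scrapy.Field()
        workLocation = scrapy.Field()
        publishTime = scrapy.Field()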
Example #2
    def parse(self, response):
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")

        for node in node_list:
            item = TencentItem()
            item['name'] = node.xpath('./td[1]/a/text()').extract()[0]
            item['detailLink'] = node.xpath("./td[1]/a/@href").extract()[0]
            # td[2] is missing on some rows, so guard before indexing
            if node.xpath("./td[2]/text()"):
                item['positionInfo'] = node.xpath(
                    "./td[2]/text()").extract()[0]
            else:
                item['positionInfo'] = ""
            item['peopleNumber'] = node.xpath("./td[3]/text()").extract()[0]
            item['workLocation'] = node.xpath("./td[4]/text()").extract()[0]
            item['publishTime'] = node.xpath("./td[5]/text()").extract()[0]

            yield item

        # Follow the "next" link until it is disabled (its href becomes
        # "javascript:;" on the last page).
        next_link = response.xpath("//div[@class='pagenav']/a[@id='next']")
        if next_link.xpath("./@href").extract_first() != "javascript:;":
            self.offset += 10
            url = self.base_url + str(self.offset)
            yield scrapy.Request(url, callback=self.parse)
        else:
            print("The crawl has finished")
Example #3
    def parse(self, response):

        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for node in node_list:

            item = TencentItem()

            item['position_name'] = node.xpath(
                "./td[1]/a/text()").extract_first()
            item['position_link'] = u"https://hr.tencent.com/" + node.xpath(
                "./td[1]/a/@href").extract_first()
            item['position_type'] = node.xpath(
                "./td[2]/text()").extract_first()
            item['people_number'] = node.xpath(
                "./td[3]/text()").extract_first()
            item['work_location'] = node.xpath(
                "./td[4]/text()").extract_first()
            item['publish_times'] = node.xpath(
                "./td[5]/text()").extract_first()
            # Version 1: same item, same file; pass the item object to the
            # detail callback via meta:
            # yield scrapy.Request(item['position_link'], meta={'tencent_item': item}, callback=self.parse_position)
            # Version 2: separate files, separate items, separate pipelines
            # (with a type check in the pipeline):
            yield scrapy.Request(item['position_link'],
                                 callback=self.parse_position)
            yield item
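
The commented-out "version 1" hands the item to the detail callback through meta. A sketch of the matching parse_position, based only on that comment; the real callback is not shown in the source, and the XPath for the duty text plus the position_duty field are assumptions:

    # Sketch of the meta-passing variant (version 1)
    def parse_position(self, response):
        item = response.meta['tencent_item']
        # hypothetical field and XPath, for illustration only
        item['position_duty'] = response.xpath(
            "//ul[@class='squareli']/li/text()").extract()
        yield item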
Example #4
    def parseContent(self, response):
        for each in response.xpath('//*[@class="even"]'):
            name = each.xpath('./td[1]/a/text()').extract()[0]
            detailLink = each.xpath('./td[1]/a/@href').extract()[0]
            positionInfo = each.xpath('./td[2]/text()').extract()[0]
            peopleNumber = each.xpath('./td[3]/text()').extract()[0]
            workLocation = each.xpath('./td[4]/text()').extract()[0]
            publishTime = each.xpath('./td[5]/text()').extract()[0]

            item = TencentItem()
            item['name'] = name.encode('utf-8')
            item['detailLink'] = detailLink.encode('utf-8')
            item['positionInfo'] = positionInfo.encode('utf-8')
            item['peopleNumber'] = peopleNumber.encode('utf-8')
            item['workLocation'] = workLocation.encode('utf-8')
            item['publishTime'] = publishTime.encode('utf-8')
            # Yield inside the loop; the original yielded once after the
            # loop, so only the last row ever reached the pipeline.
            yield item
Example #5
    def parse(self, response):
        # Slice off the header and footer rows of the job table
        # (the original XPath, //table[@class='tablelist'/tr], is invalid)
        tr_list = response.xpath("//table[@class='tablelist']/tr")[1:-1]
        for tr in tr_list:
            # Scrapy discourages plain dicts; prefer the Item class
            # declared in mySpider.items
            item = TencentItem()
            # first td under the current tr node
            item["title"] = tr.xpath("./td[1]/a/text()").extract_first()
            item["position"] = tr.xpath("./td[2]/text()").extract_first()
            item["publish_date"] = tr.xpath("./td[5]/text()").extract_first()
            yield item

        # Find the URL of the next page
        next_url = response.xpath("//a[@id='next']/@href").extract_first()
        if next_url and next_url != 'javascript:;':
            next_url = 'http://hr.tencent.com/' + next_url
            # Yielding a Request object hands it to the engine
            yield scrapy.Request(
                next_url,
                callback=self.parse  # callback that extracts the data
            )
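
On Scrapy 1.4+, response.follow resolves the relative href against response.url itself, so the manual URL join above can be dropped. A minimal sketch of the same pagination step:

    # Equivalent pagination with response.follow (Scrapy 1.4+), which
    # joins the relative href against response.url automatically.
    next_url = response.xpath("//a[@id='next']/@href").extract_first()
    if next_url and next_url != 'javascript:;':
        yield response.follow(next_url, callback=self.parse)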
Example #6
    def parseContent(self, response):
        for each in response.xpath('//tr[@class="even"]') + response.xpath(
                '//tr[@class="odd"]'):
            item = TencentItem()
            item['name'] = each.xpath('./td[1]/a/text()').extract()[0].encode(
                'utf-8')
            item['detailLink'] = each.xpath(
                './td[1]/a/@href').extract()[0].encode('utf-8')
            # td[2] is empty on some rows; catch the specific IndexError
            # rather than using a bare except
            try:
                item['category'] = each.xpath(
                    './td[2]/text()').extract()[0].encode('utf-8')
            except IndexError:
                item['category'] = ''
            item['peopleNumber'] = each.xpath(
                './td[3]/text()').extract()[0].encode('utf-8')
            item['workLocation'] = each.xpath(
                './td[4]/text()').extract()[0].encode('utf-8')
            item['publishTime'] = each.xpath(
                './td[5]/text()').extract()[0].encode('utf-8')
            yield item
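
Concatenating the two selector lists above is equivalent to the single XPath union that Examples #2, #3 and #7 use; one query instead of two:

    # Same row set in one query, via the XPath union operator
    for each in response.xpath('//tr[@class="even"] | //tr[@class="odd"]'):
        ...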
Example #7
    def parse(self, response):

        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for node in node_list:

            item = TencentItem()
            # extract_first() returns str (unicode) values, or None
            item['position_name'] = node.xpath(
                "./td[1]/a/text()").extract_first()
            item['position_link'] = u"https://hr.tencent.com/" + node.xpath(
                "./td[1]/a/@href").extract_first()
            item['position_type'] = node.xpath(
                "./td[2]/text()").extract_first()
            item['people_number'] = node.xpath(
                "./td[3]/text()").extract_first()
            item['work_location'] = node.xpath(
                "./td[4]/text()").extract_first()
            item['publish_times'] = node.xpath(
                "./td[5]/text()").extract_first()

            yield item
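
Since extract_first() returns None when a cell is missing, passing a default keeps the item fields uniformly typed. A one-line sketch using .get(), the modern alias of extract_first() (Scrapy 1.8+ / parsel):

    # default='' avoids None for optional cells such as td[2]
    item['position_type'] = node.xpath('./td[2]/text()').get(default='')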
Example #8
    def parse(self, response):
        time.sleep(2)
        jsonData = json.loads(response.text)['Data']
        PostsListData = jsonData['Posts']
        if PostsListData is not None:
            # Bump the page number in the URL: replace only the first run
            # of digits, then queue the next page with self.parse.
            curpage = re.search(r'(\d+)', response.url).group()
            page = int(curpage) + 1
            url = re.sub(r'\d+', str(page), response.url, 1)

            # Note: only the first post of each page is extracted here
            PostsDictData = PostsListData[0]

            item = TencentItem()
            item['name'] = PostsDictData['RecruitPostName']
            item['detailLink'] = PostsDictData['PostURL']
            item['positionInfo'] = PostsDictData['Responsibility']
            item['peopleNumber'] = '2'
            item['workLocation'] = (PostsDictData['CountryName'] + ',' +
                                    PostsDictData['LocationName'])
            item['publishTime'] = PostsDictData['LastUpdateTime']

            yield scrapy.Request(url, callback=self.parse)
            # Hand the scraped item to the pipeline; yielding inside the
            # if-branch avoids emitting an empty item on the last page
            yield item
        else:
            print(re.search(r'(\d+)', response.url).group(), response.url)
            print('All pages crawled ----')
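
The branch above extracts only Posts[0] from each JSON page. A sketch that yields one item per post instead, with the field names taken from the snippet (peopleNumber stays hard-coded, as in the original):

    # Sketch: walk every post on the page instead of only the first
    for post in PostsListData:
        item = TencentItem()
        item['name'] = post['RecruitPostName']
        item['detailLink'] = post['PostURL']
        item['positionInfo'] = post['Responsibility']
        item['peopleNumber'] = '2'
        item['workLocation'] = post['CountryName'] + ',' + post['LocationName']
        item['publishTime'] = post['LastUpdateTime']
        yield item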