def parse(self, response):
    """Parse one Tencent HR listing page: yield a TencentItem per job row,
    then follow the next-page link until it is disabled."""
    rows = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
    for row in rows:
        item = TencentItem()
        item['positionName'] = row.xpath("./td[1]/a/text()").extract()[0]
        item['positionLink'] = row.xpath("./td[1]/a/@href").extract()[0]
        # The category cell may be empty; fall back to ''.
        type_texts = row.xpath("./td[2]/text()").extract()
        item['positionType'] = type_texts[0] if type_texts else ''
        item['peopleNumber'] = row.xpath("./td[3]/text()").extract()[0]
        item['workLocation'] = row.xpath("./td[4]/text()").extract()[0]
        item['publishTime'] = row.xpath("./td[5]/text()").extract()[0]
        yield item
    # Paginate until the "next" anchor carries class 'noactive' (last page).
    if len(response.xpath("//a[@class='noactive' and @id='next']")) == 0:
        next_href = response.xpath("//a[@id='next']/@href").extract()[0]
        yield scrapy.Request("https://hr.tencent.com/" + next_href, callback=self.parse)
def parse(self, response):
    """Extract all job postings on the current page, then request the next
    offset page while the offset stays under the hard-coded cap."""
    for row in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        item = TencentItem()
        item['positionname'] = row.xpath("./td[1]/a/text()").extract()[0]
        item['positionlink'] = row.xpath("./td[1]/a/@href").extract()[0]
        # Empty category cell maps to the literal string "None".
        categories = row.xpath('./td[2]/text()').extract()
        item['positionType'] = categories[0] if categories else "None"
        item['peopleNum'] = row.xpath("./td[3]/text()").extract()[0]
        item['workLocation'] = row.xpath("./td[4]/text()").extract()[0]
        item['publishTime'] = row.xpath("./td[5]/text()").extract()[0]
        yield item
    if self.offset < 2000:
        # Advance one page (10 rows), splice the new URL, re-enter parse.
        self.offset += 10
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse, dont_filter=True)
def parse(self, response):
    """Parse job rows on a Tencent HR listing page and follow the next-page link.

    Bug fix: the next URL was built with ``str()`` of a SelectorList,
    producing a repr like "[<Selector ...>]" instead of the href text; it now
    uses ``extract_first()`` and skips the request when no link exists.
    """
    node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
    for each in node_list:
        # One item per table row.
        item = TencentItem()
        item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
        item['positionLink'] = each.xpath("./td[1]/a/@href").extract()[0]
        # Category cell may be empty.
        if len(each.xpath("./td[2]/text()")) != 0:
            item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
        else:
            item['positionType'] = ""
        item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
        # NOTE: 'workLoction' is the (misspelled) field name declared in items.py.
        item['workLoction'] = each.xpath("./td[4]/text()").extract()[0]
        item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]
        yield item
    # Follow pagination via the "next" anchor's href.
    next_href = response.xpath("//a[@id='next']/@href").extract_first()
    if next_href is not None:
        next_url = self.base_url + next_href
        yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    # Douban Top250 movie-list parser (despite the TencentItem name).
    # NOTE(review): Python 2 print statements throughout — this file is py2.
    print response.text
    print '打印response.text结束'
    # for each in response.xpath("//tr[@class='event']|tr[@class='odd']"):
    print '开始打印符合的xpath'
    # Iterate over each movie entry in the ranked list.
    for each in response.xpath(
            "//div[@class='article']//ol[@class='grid_view']/li"):
        print each
        # Item class comes from the project's items module.
        douban_item = TencentItem()
        # Detailed XPaths extracting the fields of one entry.
        douban_item['serial_number'] = each.xpath(
            ".//div[@class='item']//em/text()").extract_first()
        douban_item['movie_name'] = each.xpath(
            ".//div[@class='info']/div[@class='hd']/a/span[1]/text()"
        ).extract_first()
        print douban_item
        # Yield so the pipelines actually receive the data.
        yield douban_item
    # Pagination rule: take the next page's href.
    # NOTE(review): this selects a <link> element inside span.next — confirm
    # against the page markup (an <a> sibling usually carries the same href).
    next_link = response.xpath(
        "//span[@class='next']/link/@href").extract()
    if next_link:
        next_link = next_link[0]
        yield scrapy.Request("https://movie.douban.com/top250" + next_link,
                             callback=self.parse)
    '''
def parse(self, response):
    """Yield one TencentItem per job row (fields UTF-8 encoded), then queue
    the next listing page."""
    for row in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        item = TencentItem()
        item['positionName'] = row.xpath("./td[1]/a/text()").extract()[0].encode("utf-8")
        item['positionLink'] = row.xpath("./td[1]/a/@href").extract()[0].encode("utf-8")
        # Category cell may be empty; substitute the literal string "NULL".
        if len(row.xpath("./td[2]/text()")):
            item['positionType'] = row.xpath("./td[2]/text()").extract()[0].encode("utf-8")
        else:
            item['positionType'] = "NULL"
        item['peopleNumber'] = row.xpath("./td[3]/text()").extract()[0].encode("utf-8")
        item['workLocation'] = row.xpath("./td[4]/text()").extract()[0].encode("utf-8")
        item['publishTime'] = row.xpath("./td[5]/text()").extract()[0].encode("utf-8")
        yield item
    # Stop when the "next" anchor carries class 'noactive' (last page).
    if len(response.xpath("//a[@id='next' and @class='noactive']")) == 0:
        href = response.xpath("//a[@id='next']/@href").extract()[0]
        yield scrapy.Request('https://hr.tencent.com/' + href, callback=self.parse)
def parse(self, response):
    """Scrape every job row on this page, then chase the next-page anchor."""
    rows = response.xpath(
        "//tr[@class ='even'] | // tr[@ class ='odd']")
    for row in rows:
        item = TencentItem()
        item["position_name"] = row.xpath("./td[1]/a/text()").extract()[0]
        # The category cell can be empty; default to an empty string.
        kinds = row.xpath("./td[2]/text()")
        if len(kinds):
            item["position_type"] = kinds.extract()[0]
        else:
            item["position_type"] = ""
        item["people_number"] = row.xpath("./td[3]/text()").extract()[0]
        item["position_address"] = row.xpath("./td[4]/text()").extract()[0]
        item["release_time"] = row.xpath("./td[5]/text()").extract()[0]
        yield item
    # Method 2: follow the "next" link until it is inactive on the last page.
    if len(response.xpath("//a[@id='next' and @class='noactive']")) == 0:
        href = response.xpath("//a[@id='next']/@href").extract()[0]
        yield scrapy.Request("https://hr.tencent.com/" + href, callback=self.parse)
def parse(self, response): position_list = response.xpath('//tr[@class="even"] | //tr[@class="odd"]') for position in position_list: item = TencentItem() item['positionName'] = position.xpath('td[1]/a/text()').extract_first() item['positionLink'] = "http://hr.tencent.com/" + position.xpath('td[1]/a/@href').extract_first() positionType = position.xpath('td[2]/text()').extract() if len(positionType): item['positionType'] = positionType else: item['positionType'] = "" item['peopleNumber'] = position.xpath('td[3]/text()').extract_first() item['workLocation'] = position.xpath('td[4]/text()').extract_first() item['publishTime'] = position.xpath('td[5]/text()').extract_first() yield item self.offset += 10 print 'offset', self.offset next_url = self.base_url.format(self.offset) print 'next_url', next_url if position_list.extract_first() is not None: yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    """Collect job postings from the listing table and schedule the next
    offset page while self.temp stays under the cap."""
    for row in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        item = TencentItem()
        item['positionName'] = row.xpath("./td[1]/a/text()").extract()[0].encode("utf-8")
        item['positionLink'] = row.xpath("./td[1]/a/@href").extract()[0].encode("utf-8")
        # The type cell may be missing; guard to avoid an IndexError.
        if len(row.xpath("./td[2]/text()")):
            item['positionType'] = row.xpath("./td[2]/text()").extract()[0].encode("utf-8")
        else:
            item['positionType'] = ""
        item['positionNum'] = row.xpath("./td[3]/text()").extract()[0].encode("utf-8")
        item['positionAddress'] = row.xpath("./td[4]/text()").extract()[0].encode("utf-8")
        item['positionTime'] = row.xpath("./td[5]/text()").extract()[0].encode("utf-8")
        # Hand the item to the pipelines (generator-style return).
        yield item
    if self.temp < 3000:
        self.temp += 10
        # Build the follow-up listing URL and dispatch it back into parse.
        yield scrapy.Request(self.base_url + str(self.temp), callback=self.parse)
def parse(self, response):
    """Turn every even/odd table row into a TencentItem; paginate by offset."""
    for node in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        item = TencentItem()
        # Extracted unicode text is encoded to UTF-8 bytes before storage.
        item['positionName'] = node.xpath("./td[1]/a/text()").extract()[0].encode("utf-8")
        item['positionLink'] = node.xpath("./td[1]/a/@href").extract()[0].encode("utf-8")
        type_cell = node.xpath("./td[2]/text()")
        if len(type_cell):
            item['positionType'] = type_cell.extract()[0].encode("utf-8")
        else:
            item['positionType'] = ""
        item['peopleNumber'] = node.xpath("./td[3]/text()").extract()[0].encode("utf-8")
        item['workLocation'] = node.xpath("./td[4]/text()").extract()[0].encode("utf-8")
        item['publishTime'] = node.xpath("./td[5]/text()").extract()[0].encode("utf-8")
        yield item
    if self.offset < 3070:
        self.offset += 10
        yield scrapy.Request(self.baseURL + str(self.offset), callback=self.parse)
def parse(self, response):
    """Build a TencentItem from each row of the jobs table, then page forward."""
    for node in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        entry = TencentItem()
        entry['positionName'] = node.xpath("./td/a/text()").extract()[0]
        entry['positionLink'] = node.xpath("./td/a/@href").extract()[0]
        # Empty category cell -> None (other fields assume the cell exists).
        category = node.xpath("./td[2]/text()")
        if category:
            entry['positionType'] = category.extract()[0]
        else:
            entry['positionType'] = None
        entry['peopleNumber'] = node.xpath("./td[3]/text()").extract()[0]
        entry['workLocation'] = node.xpath("./td[4]/text()").extract()[0]
        entry['publishTime'] = node.xpath("./td[5]/text()").extract()[0]
        yield entry
    if self.offset < 3300 + 10:
        self.offset += 10
        yield scrapy.Request(self.baseUrl + str(self.offset), callback=self.parse)
def parse(self, response):
    """Emit one item per job row; follow the "next" link until it deactivates,
    which works no matter how many postings exist."""
    for node in response.xpath("//tr[@class='odd'] | //tr[@class='even']"):
        item = TencentItem()
        # Job title and absolute detail-page URL.
        item['positionName'] = node.xpath('./td[1]/a/text()').extract()[0]
        item['positionLink'] = "https://hr.tencent.com/" + node.xpath('./td[1]/a/@href').extract()[0]
        # Category cell may be blank. (Field names 'positonType'/'poblishTime'
        # are misspelled in items.py and must match.)
        kind = node.xpath('./td[2]/text()')
        item['positonType'] = kind.extract()[0] if len(kind) else ""
        item['peopleNumber'] = node.xpath('./td[3]/text()').extract()[0]
        item['workLocation'] = node.xpath('./td[4]/text()').extract()[0]
        item['poblishTime'] = node.xpath('./td[5]/text()').extract()[0]
        yield item
    # Next-page strategy: no hard-coded offset limit required.
    if len(response.xpath('//a[@class="noactive" and @id="next"]')) == 0:
        href = response.xpath('//a[@id="next"]/@href').extract()[0]
        yield scrapy.Request('https://hr.tencent.com/' + href, callback=self.parse)
def parse(self, response):
    """Parse the job table defensively (type/time cells may be empty) and
    paginate by offset splicing."""
    for node in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        item = TencentItem()
        item['position_url'] = node.xpath('./td[1]/a/@href').extract()[0]
        item['position_name'] = node.xpath('./td[1]/a/text()').extract()[0]
        # Guard: indexing an empty extract() list would raise IndexError.
        kinds = node.xpath('./td[2]/text()').extract()
        item['position_type'] = kinds[0] if kinds else None
        item['position_num'] = node.xpath('./td[3]/text()').extract()[0]
        item['position_address'] = node.xpath('./td[4]/text()').extract()[0]
        stamps = node.xpath('./td[5]/text()').extract()
        item['position_time'] = stamps[0] if stamps else None
        yield item
    if self.offset < 3040:
        self.offset += 10
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    # Parse one page of the job table; yields one TencentItem per row.
    node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
    for node in node_list:
        item = TencentItem()
        item['position_name'] = node.xpath(
            "./td[1]/a/text()").extract_first()
        # Detail links in the table are relative; prepend the site root.
        item['position_link'] = "https://hr.tencent.com/" + node.xpath(
            "./td[1]/a/@href").extract_first()
        item['position_type'] = node.xpath(
            "./td[2]/text()").extract_first()
        item['people_number'] = node.xpath(
            "./td[3]/text()").extract_first()
        item['work_location'] = node.xpath(
            "./td[4]/text()").extract_first()
        item['publish_times'] = node.xpath(
            "./td[5]/text()").extract_first()
        yield item
    # NOTE(review): the pagination logic below is disabled (kept inside a
    # string literal); re-enable by removing the quotes to crawl past page one.
    """
    # 判断当前页面是否到最后一页,如果没到最后一页,就继续发送下一页的请求
    if not response.xpath("//a[@class='noactive' and @id='next']").extract_first():
        next_link = "https://hr.tencent.com/" + response.xpath("//a[@id='next']/@href").extract_first()
        yield scrapy.Request(next_link, callback = self.parse)
    """
    """
def parse(self, response):
    """Parse the new-style careers page (div.recruit-list entries).

    Bug fix: the row-level XPaths started with "/" (document root), so they
    always matched nothing and ``extract()[0]`` raised IndexError; they are
    now relative to each row's <a> node ("./...").
    """
    contents = response.xpath("//div[@class ='recruit-list']/a")
    for each in contents:
        item = TencentItem()
        positionName = each.xpath("./h4/text()").extract()
        organization = each.xpath("./p/span[1]/text()").extract()
        positionLocation = each.xpath("./p/span[2]/text()").extract()
        positionType = each.xpath("./p/span[3]/text()").extract()
        releaseTime = each.xpath("./p/span[4]/text()").extract()
        positionBrief = each.xpath("./p[2]/text()").extract()
        item['positionName'] = positionName[0]
        item['organization'] = organization[0]
        item['positionLocation'] = positionLocation[0]
        item['positionType'] = positionType[0]
        item['releaseTime'] = releaseTime[0]
        item['positionBrief'] = positionBrief[0]
        yield item
    # URL-splicing pagination: the page offers no crawlable "next" link.
    if self.offset < 380:
        self.offset += 1
        yield scrapy.Request(self.baseURL + str(self.offset), callback=self.parse)
def parse(self, response):
    """For each job row, build a partially-filled item and request its detail
    page, passing the item along via the request meta dict."""
    rows = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
    for row in rows:
        # One item per row; detail parsing completes it later.
        item = TencentItem()
        item['position_name'] = row.xpath("./td[1]/a/text()").extract_first()
        item['position_link'] = "http://hr.tencent.com/" + row.xpath("./td[1]/a/@href").extract_first()
        item['position_type'] = row.xpath("./td[2]/text()").extract_first()
        item['people_number'] = row.xpath("./td[3]/text()").extract_first()
        item['work_location'] = row.xpath("./td[4]/text()").extract_first()
        item['publish_times'] = row.xpath("./td[5]/text()").extract_first()
        # meta travels with the response into parse_position.
        yield scrapy.Request(url=item["position_link"], meta={"item": item}, callback=self.parse_position)
def parse(self, response):
    """Scrape each job row and follow the "next" anchor. dont_filter=True
    bypasses the offsite/dupe filtering that dropped follow-up requests."""
    for node in response.xpath('//tr[@class="even"] | //tr[@class="odd"]'):
        item = TencentItem()
        item['name'] = node.xpath('./td[1]/a/text()').extract_first()
        item['pos_link'] = 'https://hr.tencent.com/' + node.xpath('./td[1]/a/@href').extract_first()
        # Empty/missing category becomes ''.
        kind = node.xpath('./td[2]/text()').extract_first()
        item['pos_Type'] = kind if kind else ''
        item['pos_nums'] = node.xpath('./td[3]/text()').extract_first()
        item['pos_loaction'] = node.xpath('./td[4]/text()').extract_first()
        item['pos_time'] = node.xpath('./td[5]/text()').extract_first()
        yield item
    # Stop once the "next" anchor gains class "noactive" (final page).
    if not len(response.xpath('//a[@class="noactive" and @id="next"]')):
        next_url = 'https://hr.tencent.com/' + response.xpath('//a[@id="next"]/@href').extract_first()
        yield scrapy.Request(next_url, callback=self.parse, dont_filter=True)
def parse(self, response):
    """Parse the rows of the current page, then request the next offset while
    self.st stays under the hard-coded limit."""
    for row in response.xpath('//tr[@class="even"]|//tr[@class="odd"]'):
        item = TencentItem()
        item["cot_name"] = row.xpath('./td[1]/a/text()').extract()[0]
        item["cot_link"] = row.xpath('./td[1]/a/@href').extract()[0]
        # .get() returns None instead of raising when the cell is empty.
        item["cot_type"] = row.xpath('./td[2]/text()').get()
        item["cot_num"] = row.xpath('./td[3]/text()').extract()[0]
        item["cot_add"] = row.xpath('./td[4]/text()').extract()[0]
        item["cot_time"] = row.xpath('./td[5]/text()').extract()[0]
        yield item
    if self.st < 100:
        self.st += 10
        # Request URL for the next page.
        yield scrapy.Request(self.url + str(self.st), callback=self.parse)
def parse(self, response):
    """Parse Alibaba campus-position rows (tr[@data-type='11']) into items."""
    rows = response.xpath("//div/div/table//tr[@data-type ='11']")
    # NOTE(review): a single item instance is reused across rows, so every
    # yielded reference points at the same mutated object — confirm intended.
    item = TencentItem()
    for row in rows:
        title = row.xpath('./th/a/text()').extract()
        # All <td> text nodes: index 0 is the technology track, index 1 the
        # graduation year (positional access; fragile if the layout changes).
        cells = row.xpath('./td/text()').extract()
        city = row.xpath('./td[@class="work-city"]/text()').extract()
        detail = row.xpath('./td[@class="position-detail"]/a/text()').extract()
        item['position'] = title[0]
        item['techlogy'] = cells[0]
        item['work_city'] = city[0]
        item['generation'] = cells[1]
        item['detail'] = detail[0]
        yield item
def parse(self, response):
    """Parse rows with class "even" and paginate by rewriting the numeric
    offset embedded in the current URL.

    Fixes: the throwaway ``item = []`` initialiser is removed, and the
    next-page request is issued once per page instead of once per row.
    NOTE(review): only even-class rows are scraped; sibling spiders also
    select //*[@class="odd"] — confirm whether odd rows were meant here too.
    """
    for each in response.xpath('//*[@class="even"]'):
        item = TencentItem()
        item['name'] = each.xpath('./td[1]/a/text()').extract()[0]
        item['detail_link'] = each.xpath('./td[1]/a/@href').extract()[0]
        item['job_info'] = each.xpath('./td[2]/text()').extract()[0]
        item['people_number'] = each.xpath('./td[3]/text()').extract()[0]
        item['work_city'] = each.xpath('./td[4]/text()').extract()[0]
        item['publish_date'] = each.xpath('./td[5]/text()').extract()[0]
        # Hand the completed item to the pipelines.
        yield item
    # Pagination: bump the first numeric offset in the URL by 10 and re-enter
    # parse with the rewritten URL.
    curpage = re.search(r'(\d+)', response.url).group(1)
    page = int(curpage) + 10
    url = re.sub(r'\d+', str(page), response.url)
    yield scrapy.Request(url, callback=self.parse)
def parse(self, response):
    """Parse job rows (encoding every field to UTF-8 bytes) and follow the
    "next" link instead of splicing offsets."""
    node_list = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
    for node in node_list:
        item = TencentItem()
        # XPath positions are 1-based.
        print(len(node_list))
        print("真在获取页面。。。。。。")
        item['positionName'] = node.xpath('./td[1]/a/text()').extract()[0].encode('utf-8')
        item['positionLink'] = node.xpath('./td[1]/a/@href').extract()[0].encode('utf-8')
        kind = node.xpath('./td[2]/text()')
        if len(kind):
            item['positionType'] = kind.extract()[0].encode('utf-8')
        else:
            item['positionType'] = "".encode('utf-8')
        item['peopleNumber'] = node.xpath('./td[3]/text()').extract()[0].encode('utf-8')
        item['workLocation'] = node.xpath('./td[4]/text()').extract()[0].encode('utf-8')
        item['publishTime'] = node.xpath('./td[5]/text()').extract()[0].encode('utf-8')
        print(node.xpath('./td[5]/text()').extract()[0].encode('utf-8'))
        yield item
    # Keep extracting the next-page link until it goes inactive.
    if len(response.xpath('//a[@class="noactive" and @id="next"]')) == 0:
        href = response.xpath('//a[@id="next"]/@href').extract()[0]
        yield scrapy.Request("http://hr.tencent.com/" + href, callback=self.parse)
def parse(self, response):
    """Parse each job row; the category cell may be empty.

    Fix: the bare ``except:`` (which would also swallow SystemExit /
    KeyboardInterrupt and hide unrelated errors) is narrowed to the
    IndexError actually expected when indexing an empty extract() list.
    """
    node_list = response.xpath(
        "//tr[@class ='even'] | //tr[@class ='odd']")
    for node in node_list:
        item = TencentItem()
        item['positionName'] = node.xpath("./td[1]//text()").extract()[0]
        item['positionLink'] = node.xpath("./td[1]/a/@href").extract()[0]
        try:
            item['positionType'] = node.xpath(
                "./td[2]//text()").extract()[0]
        except IndexError:
            # Empty category cell.
            item['positionType'] = ''
        item['peopleNumber'] = node.xpath("./td[3]//text()").extract()[0]
        item['workLocation'] = node.xpath("./td[4]//text()").extract()[0]
        item['publicTime'] = node.xpath("./td[5]//text()").extract()[0]
        yield item
    # Follow the "next" anchor until it is flagged inactive.
    if not len(response.xpath("//a[@class='noactive' and @id='next']")):
        url = response.xpath("//a[@id='next']/@href").extract()[0]
        yield scrapy.Request("https://hr.tencent.com/" + url,
                             callback=self.parse)
def parse(self, response):
    """Parse the listing table and keep requesting the next page until the
    "next" anchor turns inactive."""
    print("#" * 80)
    for node in response.xpath("//tr[@class='even'] | //tr[@class = 'odd']"):
        item = TencentItem()
        item["positionName"] = node.xpath("./td[1]/a/text()").extract()[0].encode("utf-8")
        item["positionLink"] = node.xpath("./td[1]/a/@href").extract()[0].encode("utf-8")
        # Category cell may be empty.
        kind = node.xpath("./td[2]/text()")
        item["positionType"] = kind.extract()[0].encode("utf-8") if kind else ''
        item["peopleNumber"] = node.xpath("./td[3]/text()").extract()[0].encode("utf-8")
        item["workLocation"] = node.xpath("./td[4]/text()").extract()[0].encode("utf-8")
        item["publishTime"] = node.xpath("./td[5]/text()").extract()[0].encode("utf-8")
        yield item
    if not response.xpath("//a[@class = 'noactive'and @id = 'next']"):
        url = "https://hr.tencent.com/" + response.xpath(
            "//a[@id = 'next']/@href").extract()[0]
        print(url)
        yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
def parse(self, response):
    """Parse job rows into items; chase the "next" link until disabled."""
    print(response.body)
    for node in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        item = TencentItem()
        item['position_name'] = node.xpath("./td[1]/a/text()").extract_first()
        item['position_link'] = node.xpath("./td[1]/a/@href").extract_first()
        # Empty category cell maps to ''.
        if len(node.xpath("./td[2]/text()")):
            item['position_type'] = node.xpath("./td[2]/text()").extract_first()
        else:
            item['position_type'] = ""
        item['position_number'] = node.xpath("./td[3]/text()").extract_first()
        item['work_location'] = node.xpath("./td[4]/text()").extract_first()
        item['publish_time'] = node.xpath("./td[5]/text()").extract_first()
        yield item
    if not len(response.xpath("//a[@class='noactive' and @id='next']")):
        tail = response.xpath("//a[@id='next']/@href").extract_first()
        yield scrapy.Request('https://hr.tencent.com/' + tail, callback=self.parse)
def parse(self, response):
    """One item per job row; paginate with fixed offsets (simple, but each
    page is requested serially rather than concurrently)."""
    for node in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        item = TencentItem()
        item["position_name"] = node.xpath(".//a/text()").extract_first()
        item["position_link"] = node.xpath(".//a/@href").extract_first()
        item["position_type"] = node.xpath("./td[2]/text()").extract_first()
        item["people_number"] = node.xpath("./td[3]/text()").extract_first()
        item["work_location"] = node.xpath("./td[4]/text()").extract_first()
        item["publish_times"] = node.xpath("./td[5]/text()").extract_first()
        yield item
    # Known page range; each response schedules exactly one follow-up.
    if self.offset <= 2690:
        self.offset += 10
        # callback: the handler for the response of this request.
        yield scrapy.Request(url=self.base_url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Parse rows (UTF-8-encoded fields) and follow the "next" anchor."""
    for node in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        item = TencentItem()
        item['name'] = node.xpath("./td[1]/a/text()").extract()[0].encode("utf-8")
        # Category cell may be empty.
        cell = node.xpath("./td[2]/text()")
        if len(cell) == 0:
            item['type'] = ""
        else:
            item['type'] = cell.extract()[0].encode("utf-8")
        item['number'] = node.xpath("./td[3]/text()").extract()[0].encode("utf-8")
        item['position'] = node.xpath("./td[4]/text()").extract()[0].encode("utf-8")
        item['time'] = node.xpath("./td[5]/text()").extract()[0].encode("utf-8")
        yield item
    # Keep following "next" until it is marked inactive.
    if len(response.xpath("//a[@id='next' and @class = 'noactive']")) == 0:
        url = response.xpath("//a[@id='next']/@href").extract()[0]
        print(url)
        yield scrapy.Request('http://hr.tencent.com/' + url, dont_filter=True, callback=self.parse)
def parse(self, response):
    """Parse job rows and follow the "next" button.

    Fixes: both pagination XPaths were missing a closing quote
    ("@id = 'next]"), and extract() was misspelled "extrace()" — the
    next-page branch could never run without raising. Also guards the
    optional category cell instead of indexing blindly.
    """
    for each in response.xpath(
            "//tr[@class = 'even'] | //tr[@class = 'odd']"):
        item = TencentItem()
        item['positionname'] = each.xpath("./td[1]/a/text()").extract()[0]
        item['positionlink'] = each.xpath("./td[1]/a/@href").extract()[0]
        # Category cell may be empty.
        kinds = each.xpath("./td[2]/text()").extract()
        item['positionType'] = kinds[0] if kinds else ""
        item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
        item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
        item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]
        yield item
    # Method 2: read the next-page button instead of splicing offsets.
    if len(response.xpath("//a[@class= 'noactive' and @id = 'next']")) == 0:
        url = response.xpath("//a[@id='next']/@href").extract()[0]
        yield scrapy.Request('http://careers.tencent.com/' + url,
                             callback=self.parse)
def parse(self, response):
    """Parse job rows (UTF-8 fields) and paginate by offset splicing.

    Fix: JobTitle used the absolute XPath
    "//tr[@class='even']/td[1]/a/text()", so every item received the first
    even row's title; it is now relative to the current row node.
    """
    node_list = response.xpath("//tr[@class='even']|//tr[@class='odd']")
    for node in node_list:
        item = TencentItem()
        # Extract and UTF-8-encode each field of the current row.
        item['JobTitle'] = node.xpath("./td[1]/a/text()").extract()[0].encode("utf-8")
        item['JobLink'] = node.xpath(
            "./td[1]/a/@href").extract()[0].encode("utf-8")
        if len(node.xpath("./td[2]/text()")):
            item['JobType'] = node.xpath(
                "./td[2]/text()").extract()[0].encode("utf-8")
        else:
            item['JobType'] = ""
        item['Numbers'] = node.xpath("./td[3]/text()").extract()[0].encode("utf-8")
        item['WorkPlace'] = node.xpath(
            "./td[4]/text()").extract()[0].encode("utf-8")
        item['ReleaseTime'] = node.xpath(
            "./td[5]/text()").extract()[0].encode('utf-8')
        # Yield the item, then continue with the remaining rows.
        yield item
    # Request subsequent pages up to the hard-coded upper bound.
    if self.NextNumber < 2190:
        self.NextNumber += 10
        Url = self.baseurl + str(self.NextNumber)
        yield scrapy.Request(Url, callback=self.parse)
def parse(self, response):
    """Parse job rows; the footer row's total count bounds pagination.

    Fix: position_type/people_num used ``extract()[0]``, which raises
    IndexError on an empty cell, while every other field used ``.get()``;
    all fields now consistently use ``.get()`` (None when missing).
    """
    position_list = response.xpath(
        '//tr[@class="even"] | //tr[@class="odd"]')
    for position in position_list:
        item = TencentItem()
        item["position_name"] = position.xpath("./td[1]/a/text()").get()
        item["position_link"] = position.xpath("./td[1]/a/@href").get()
        item["position_type"] = position.xpath("./td[2]/text()").get()
        item["people_num"] = position.xpath("./td[3]/text()").get()
        item["work_address"] = position.xpath("./td[4]/text()").get()
        item["publish_time"] = position.xpath("./td[5]/text()").get()
        yield item
    # Footer row (tr.f) holds the total number of postings.
    total = response.xpath('//tr[@class="f"]//span/text()').extract()[0]
    if self.offset < int(total):
        self.offset += 10
        new_url = 'https://hr.tencent.com/position.php?&start=' + str(
            self.offset)
        yield scrapy.Request(new_url, callback=self.parse)
def parse(self, response):
    """Parse job rows; a row with a missing cell logs a separator line but is
    still yielded (possibly only partially filled)."""
    for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        try:
            item = TencentItem()
            # Title, detail link, category, headcount, location, date.
            item['positionname'] = each.xpath("./td[1]/a/text()").extract()[0]
            item['positionlink'] = each.xpath("./td[1]/a/@href").extract()[0]
            item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
            item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
            item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
            item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]
        except IndexError:
            print(
                "================================================================"
            )
        yield item
    if self.offset < 3951:
        self.offset += 10
        # Splice the next offset into the base URL and re-enter parse.
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    """Yield each job item, request its detail page, and follow pagination."""
    for node in response.xpath('//tr[@class="odd"] | //tr[@class="even"]'):
        item = TencentItem()
        item['position_name'] = node.xpath("./td[1]/a/text()").extract_first()
        item['position_link'] = u"https://hr.tencent.com/" + node.xpath("./td[1]/a/@href").extract_first()
        item['position_type'] = node.xpath("./td[2]/text()").extract_first()
        item['people_number'] = node.xpath("./td[3]/text()").extract_first()
        item['work_location'] = node.xpath("./td[4]/text()").extract_first()
        item['publish_time'] = node.xpath("./td[5]/text()").extract_first()
        yield item
        # Also fetch the detail page for this position.
        yield scrapy.Request(item['position_link'], callback=self.parse_detail)
    # Follow the "next" anchor while it is still active.
    if not response.xpath("//a[@class='noactive' and @id='next']"):
        next_url = u"https://hr.tencent.com/" + response.xpath(".//a[@id='next']/@href").extract_first()
        yield scrapy.Request(next_url, callback=self.parse)