def parse(self, response):
    """Yield one ranking item (title and score) per entry on the page.

    Every ``div[@class='content']`` node is treated as one entry, and its
    fields are read with XPaths relative to that node (``.//``).  An XPath
    starting with ``//`` is evaluated from the document root even when it
    is called on a nested selector, so querying the whole list of entries
    with it returns the page-wide matches once per entry and pairs fields
    to entries purely by position.

    A field that is missing from an entry is stored as ``None`` instead of
    shifting the values of all following entries.

    NOTE(review): assumes the title and score nodes are descendants of
    their ``div.content`` node -- confirm against the page markup.
    """
    # Pack the extracted data into items, one per entry.
    for entry in response.xpath("//div[@class='content']"):
        item = BilibiliranklistspiderItem()
        item['title'] = entry.xpath(
            ".//a[@class='title']/text()").extract_first()
        item['pts'] = entry.xpath(
            ".//div[@class='pts']/div/text()").extract_first()
        yield item
def parse(self, response):
    """Yield one ranking item per entry on the page.

    Every ``div[@class='content']`` node is treated as one entry.  The
    fields ``title``, ``barrage``, ``play`` and ``pts`` are read with
    XPaths relative to that node (``.//``).  An XPath starting with ``//``
    is evaluated from the document root even when it is called on a nested
    selector, so querying the whole list of entries with it returns the
    page-wide matches once per entry and pairs fields to entries purely by
    position.

    A field that is missing from an entry is stored as ``None`` instead of
    shifting the values of all following entries.

    NOTE(review): assumes every field node is a descendant of its
    ``div.content`` node -- confirm against the page markup.
    """
    # Text of the element that directly contains the icon ``<i>`` of the
    # given kind; ``%s`` is the icon's second class name.
    icon_text = (
        ".//span[@class='data-box']//i[@class='b-icon %s']"
        "/parent::node()/text()"
    )

    # Pack the extracted data into items, one per entry.
    for entry in response.xpath("//div[@class='content']"):
        item = BilibiliranklistspiderItem()
        item['title'] = entry.xpath(
            ".//a[@class='title']/text()").extract_first()
        item['barrage'] = entry.xpath(icon_text % 'view').extract_first()
        item['play'] = entry.xpath(icon_text % 'play').extract_first()
        item['pts'] = entry.xpath(
            ".//div[@class='pts']/div/text()").extract_first()
        yield item
def parse(self, response):
    """Save the raw page, then request the detail page of every entry.

    The response body is written to ``./temp/<YYYY-MM-DD>``.  After that,
    every ``div[@class='content']`` node is treated as one entry: its
    fields are collected into an item and a ``Request`` for the entry's
    link is yielded with the item attached under ``meta['item']`` and
    ``self.detailParse`` as the callback.

    Fields are read with XPaths relative to the entry node (``.//``).  An
    XPath starting with ``//`` is evaluated from the document root even
    when it is called on a nested selector, so querying the whole list of
    entries with it returns the page-wide matches once per entry and pairs
    fields to entries purely by position.

    A field that is missing from an entry is stored as ``None``.  Entries
    without a link are skipped because no detail request can be built.

    NOTE(review): assumes every field node is a descendant of its
    ``div.content`` node -- confirm against the page markup.
    """
    # The file name is the current date.
    filename = time.strftime("%Y-%m-%d")
    with open('./temp/' + filename, 'wb') as f:
        f.write(response.body)

    # Text of the element that directly contains the icon ``<i>`` of the
    # given kind; ``%s`` is the icon's second class name.
    icon_text = (
        ".//span[@class='data-box']//i[@class='b-icon %s']"
        "/parent::node()/text()"
    )

    # Pack the extracted data into items, one per entry.
    for entry in response.xpath("//div[@class='content']"):
        href = entry.xpath(".//a[@class='title']/@href").extract_first()
        if not href:
            # Without a link there is no detail page to follow.
            continue

        item = BilibiliranklistspiderItem()
        item['title'] = entry.xpath(
            ".//a[@class='title']/text()").extract_first()
        item['author'] = entry.xpath(icon_text % 'author').extract_first()
        item['barrage'] = entry.xpath(icon_text % 'view').extract_first()
        item['play'] = entry.xpath(icon_text % 'play').extract_first()
        item['pts'] = entry.xpath(
            ".//div[@class='pts']/div/text()").extract_first()
        item['href'] = href

        # Category, follower count and similar data are only available on
        # the video's detail page, so each one has to be requested.
        # The first two characters of the link are dropped before
        # "https://" is prepended (presumably a protocol-relative "//").
        yield Request(
            "https://" + href[2:],
            meta={'item': item},
            callback=self.detailParse)