def parse_fontpage(self, response):
    """Walk the front-page zones and emit items or follow-up requests.

    Each zone's title image is mapped through ``self.font_table`` to a
    category label.  Thumbnails are collected both from the visible markup
    and from HTML stashed inside ``<textarea>`` elements.

    Depending on the category a thumbnail produces:

    * '剧集'  -> a request handled by ``parse_tv`` (meta: ``img``, ``series``)
    * '综艺'  -> a request handled by ``parse_show`` (meta: ``series``)
    * other   -> a populated ``YoukuItem``

    Thumbnails lacking the data their branch needs (video id, title, image
    or series name) are skipped instead of aborting the whole callback.

    NOTE(review): a zone title that is missing from ``self.font_table``
    still propagates the lookup error, as before.
    """
    # Where the series name lives, keyed by the raw zone title.
    series_queries = {
        '综艺': "./following-sibling::ul[@class='info-list']/li/span/text()",
        '自频道精选': "./preceding-sibling::div[@class='p-user']/@title",
    }

    zones = response.xpath(
        "//div[contains(@name,'m_pos')]/div[contains(@class,'mod-new')]"
    )
    for zone in zones:
        zone_titles = zone.xpath("div/h2/img/@title").extract()
        if not zone_titles:
            continue
        label_ = zone_titles[0]
        if label_ == '放剧场':
            # Titles in the '放剧场' zone are too unusual and would cause
            # conflicts, so the zone is ignored entirely.
            continue
        label = self.font_table[label_]

        zone_set = []
        zone_set += zone.xpath("div//div[@class='p-thumb']")
        # Part of the zone's markup is hidden inside <textarea> text; parse
        # it separately to pick up the thumbnails it contains.
        for hide_eles in zone.xpath("div//textarea/text()").extract():
            zone_set += Selector(text=hide_eles).xpath(
                "//div[@class='p-thumb']")

        for ele in zone_set:
            try:
                vid = re.findall(self.id_pattern,
                                 ele.xpath("./a/@href").extract()[0])[0]
            except IndexError:
                # No link, or the link does not contain a video id.
                continue

            # The image URL is either in @src or parked in @alt.
            img_c = ele.xpath(
                "./img[contains(@src,'ykimg')]/@src").extract()
            img_c += ele.xpath(
                "./img[contains(@alt,'ykimg')]/@alt").extract()
            img = img_c[0] if img_c else None

            titles = ele.xpath('./a/@title').extract()
            if not titles:
                continue
            title = titles[0]

            if label == '剧集':
                if img is None:
                    continue
                yield scrapy.Request(url=self.page.format(vid=vid),
                                     meta={
                                         'img': img,
                                         'series': title
                                     },
                                     callback=self.parse_tv)
            elif label == '综艺':
                series_query = series_queries.get(label_)
                if series_query is None:
                    # Unknown layout: there is no known place to read the
                    # series name from, so do not reuse a stale value.
                    continue
                series_found = ele.xpath(series_query).extract()
                if not series_found:
                    continue
                yield scrapy.Request(url=self.page.format(vid=vid),
                                     meta={'series': series_found[0]},
                                     callback=self.parse_show)
            else:
                if img is None:
                    continue
                video = YoukuItem()
                video['vid'] = vid
                video['img'] = img
                video['title'] = title
                video['update_time'] = datetime.datetime.now().strftime(
                    "%Y-%m-%d %H:%M:%S")
                video['category'] = label
                video['series'] = ''
                yield video
def get_tab1(self, response):
    """Return one ``YoukuItem`` per row of the ranking table.

    Every item is tagged with ``rank_category`` 'dianying'; the remaining
    fields hold the raw ``extract()`` lists of the matching table cells.
    """
    # (item field, XPath relative to the table row)
    cell_queries = (
        ('rank_name', './td[@class="key"]/a/@title'),
        ('rank_actor', './td[@class="intro"]/a/text()'),
        ('rank_index', './td[@class="status"]/span/a/text()'),
        ('rank_trend', './td[@class="trend"]/span/@class'),
    )

    collected = []
    for row in response.xpath('./div[@class="rank"]/table/tbody/tr'):
        entry = YoukuItem()
        entry['rank_category'] = 'dianying'
        for field_name, query in cell_queries:
            entry[field_name] = row.xpath(query).extract()
        collected.append(entry)
    return collected
def parse(self, response):
    """Yield one ``YoukuItem`` per listed movie, then follow the next page.

    ``name``, ``actor`` and ``playcounts`` hold the raw ``extract()`` lists
    for each panel entry.  When the page has no "next" link (the last page),
    pagination simply stops instead of raising ``IndexError``.
    """
    hxs = HtmlXPathSelector(response)

    for site in hxs.xpath('//*/ul[@class="panel"]/li[@class="yk-col4 mr1"]'):
        movie = YoukuItem()
        movie['name'] = site.xpath(
            './/li[@class="title"]/a/text()').extract()
        movie['actor'] = site.xpath(
            './/li[@class="actor"]/a/text()').extract()
        movie['playcounts'] = site.xpath('.//li[3]/text()').extract()
        yield movie

    next_links = hxs.xpath(
        '//ul[@class="yk-pages"]/li[@class="next"]/a/@href').extract()
    if next_links:
        # The href is prefixed with a scheme here, i.e. it is assumed to be
        # protocol-relative ("//host/path") -- TODO confirm.
        yield scrapy.Request("http:" + next_links[0], callback=self.parse)
def parse_tv(self, response):
    """Yield one ``YoukuItem`` per episode listed on a TV-series page.

    ``response.meta`` must carry ``series`` (prepended to each episode
    title and stored as the item's series) and ``img``.  Entries without an
    extractable video id or without a title attribute are skipped.
    """
    meta = response.meta
    episodes = response.xpath(
        "//div[@class='tvlists']//div[contains(@class,'items')]"
        "/div[contains(@name,'tvlist')]"
    )
    for item in episodes:
        try:
            vid = re.findall(self.id_pattern,
                             item.xpath('./@id').extract()[0])[0]
        except IndexError:
            # No @id attribute, or it does not match the id pattern.
            continue

        episode_titles = item.xpath('./@title').extract()
        if not episode_titles:
            continue

        video = YoukuItem()
        video['vid'] = vid
        video['title'] = meta['series'] + episode_titles[0]
        video['update_time'] = datetime.datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")
        video['category'] = '剧集'
        video['img'] = meta['img']
        video['series'] = meta['series']
        yield video
def parse_movie(self, response):
    """Yield one ``YoukuItem`` per movie thumbnail on a listing page.

    ``response.meta['rank']`` is multiplied by 30 and added to the running
    position on this page to form the item's ``rank``; presumably it is a
    zero-based page index with 30 entries per page -- TODO confirm.
    Thumbnails without an image, a title, a link or an extractable video id
    are skipped and do not consume a rank position.
    """
    position = 0  # counts only the items actually yielded
    for item in response.xpath("//div[contains(@class,'p-thumb')]"):
        img = item.xpath(".//img/@src").extract()
        if not img:
            continue

        titles = item.xpath(".//a/@title").extract()
        if not titles:
            continue

        try:
            vid = re.findall(self.id_pattern,
                             item.xpath(".//a/@href").extract()[0])[0]
        except IndexError:
            # No link, or the link does not contain a video id.
            continue

        video = YoukuItem()
        video['img'] = img[0]
        video['category'] = '电影'
        video['title'] = titles[0]
        video['vid'] = vid
        # Date-only timestamp, unlike the other callbacks in this file.
        video['update_time'] = datetime.datetime.now().strftime("%Y-%m-%d")
        video['rank'] = response.meta['rank'] * 30 + position
        position += 1
        yield video
def parse_rank(self, response):
    """Yield one ``YoukuItem`` per entry of a JSON ranking response.

    The body is decoded and parsed as JSON; entries are read from
    ``data['result']['data']``.  Entries whose ``homepageurl`` does not
    match ``self.id_pattern`` are skipped.  ``label`` is set only when
    ``kind`` is a non-empty string (used as is) or a non-empty list (joined
    with commas).  ``category`` comes from ``response.meta['category']``.
    """
    data = json.loads(response.body.decode())
    for video_info in data['result']['data']:
        vid = re.findall(self.id_pattern, video_info['homepageurl'])
        if not vid:
            # Check the id first so no item is built for unusable entries.
            continue

        video = YoukuItem()
        video['title'] = video_info['title']
        video['update_time'] = datetime.datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")
        video['vid'] = vid[0]
        video['img'] = video_info['avatar']
        video['rank'] = video_info['order']

        kind = video_info['kind']
        if kind:
            if isinstance(kind, str):
                video['label'] = kind
            elif isinstance(kind, list):
                video['label'] = ','.join(kind)

        video['category'] = response.meta['category']
        yield video
def parse_show(self, response):
    """Yield one ``YoukuItem`` per episode listed on a variety-show page.

    ``response.meta['series']`` is stored as each item's series.  Entries
    missing a video id, a title or a cover image are skipped instead of
    aborting the callback.
    """
    episodes = response.xpath(
        "//div[@class='showlists']//div[contains(@class,'items')]"
        "/div[contains(@id,'child')]"
    )
    for item in episodes:
        try:
            vid = re.findall(
                self.id_pattern,
                item.xpath(".//div[contains(@id,'item_')]/@id").extract()
                [0])[0]
        except IndexError:
            # No matching @id, or it does not contain a video id.
            continue

        titles = item.xpath(
            ".//div[contains(@id,'item_')]/@title").extract()
        covers = item.xpath(
            ".//div[contains(@class,'cover')]/img/@src").extract()
        if not titles or not covers:
            continue

        video = YoukuItem()
        video['vid'] = vid
        video['title'] = titles[0]
        video['update_time'] = datetime.datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")
        video['category'] = '综艺'
        video['img'] = covers[0]
        video['series'] = response.meta['series']
        yield video
def parse(self, response):
    """Return a list of ``YoukuItem`` objects for the page's playlist.

    Each thumbnail inside ``#playList`` contributes its video id (taken
    from the link), the hard-coded album id, the UTF-8 encoded title, the
    absolute URL and the image URL.  Thumbnails without a link, or whose
    link carries no ``id_<...>.`` segment, are skipped.
    """
    album_id = "49412420"  # hard-coded album; same for every entry
    items = []
    for play in response.xpath(
            '//div[@id="playList"]//div[@class="p-thumb"]'):
        # Example href:
        # "//v.youku.com/v_show/id_XMjc2NjE5NjU0OA==.html?f=49412420&o=1"
        href = play.xpath('a/@href').extract_first()
        if not href:
            continue

        # Non-greedy so the id stops at the first dot after "id_"; a greedy
        # match would swallow ".html?..." whenever the query contains a dot.
        id_match = re.search(r'id_(.*?)\.', href)
        if id_match is None:
            continue

        raw_title = play.xpath('a/@title').extract_first()

        item = YoukuItem()
        item['id'] = id_match.group(1)
        item['album_id'] = album_id
        item['title'] = (raw_title.encode('utf-8')
                         if raw_title is not None else None)
        item['url'] = "http:" + href
        item['img_url'] = play.xpath('img/@src').extract_first()
        items.append(item)
    return items