Пример #1
0
    def parse_episode_iqiyi(self, response):
        """Build an ``EpisodeItem`` from an iQiyi episode page.

        Reads ``pg_id``, ``cat_name``, ``site_id``, ``audit`` and ``priority``
        from ``response.request.meta`` and scrapes the title, tags and upload
        time from the page, trying several known page layouts in turn.

        Args:
            response: the downloaded episode page.

        Returns:
            A one-element list. When an ``albumId`` is found in the page it
            holds a ``Request`` for ``self.playlength_url + albumid`` (callback
            ``self.parse_playlength``, carrying the item and album id in
            ``meta``); otherwise it holds the item itself. Returns ``None``
            when any exception is raised; the traceback is logged at ERROR
            level instead of propagating.
        """

        def first_match(*queries):
            # Text nodes of the first XPath that yields anything, else [].
            for query in queries:
                found = response.xpath(query).extract()
                if found:
                    return found
            return []

        try:
            page_url = response.request.url
            logging.info("parse_episode_iqiyi:%s", page_url)

            meta = response.request.meta
            pg_id = meta['pg_id']
            cat_name = meta['cat_name']
            site_id = meta['site_id']
            audit = meta['audit']
            priority = meta['priority']

            show_id = Util.get_iqiyi_showid(page_url)
            # The page may contain "albumId:123" or "albumId: 123".
            albumid = response.selector.re(re.compile(r'albumId: ?(\d+)'))

            # Each field is tried against several page layouts, in order.
            title = first_match(
                '//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()',
                '//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()',
                # Class name restored: the previous literal contained "**"
                # and therefore could never match an element.
                '//div[@class="mod-play-tits"]/h1/descendant-or-self::*/text()',
                '//div[@class="play-tit play-tit-oneRow play-tit-long"]'
                '/h1/descendant-or-self::*/text()',
            )
            upload_time = first_match(
                '//div[@class="crumb_bar"]/span[3]/span/text()',
                '//div[@class="crumb_bar"]/span[2]/span/text()',
            )
            tag = first_match(
                '//span[@id="widget-videotag"]/descendant::*/text()',
                '//span[@class="mod-tags_item vl-block"]/descendant::*/text()',
                '//div[@class="crumb_bar"]/span[2]/a/text()',
            )

            ep_item = EpisodeItem()

            # Scraped fields are only set when something was found.
            if title:
                ep_item['title'] = "".join(part.strip() for part in title)
            if show_id:
                ep_item['show_id'] = show_id
            if tag:
                ep_item['tag'] = "|".join(part.strip() for part in tag)
            if upload_time:
                ep_item['upload_time'] = upload_time[0].strip()

            # The category is taken from the request meta rather than
            # scraped from the page breadcrumbs.
            ep_item['category'] = cat_name
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = site_id
            ep_item['pg_id'] = pg_id
            ep_item['audit'] = audit
            ep_item['url'] = page_url
            ep_item['format_id'] = self.format_id
            ep_item['priority'] = priority

            if albumid:
                # Defer emitting the item: the follow-up request receives it
                # through meta together with the album id.
                return [
                    Request(url=self.playlength_url + albumid[0],
                            callback=self.parse_playlength,
                            meta={
                                'item': ep_item,
                                'albumid': albumid[0]
                            })
                ]
            return [ep_item]
        except Exception:
            # Callback boundary: log the failure and yield nothing.
            logging.error(traceback.format_exc())
Пример #2
0
    def parse_episode(self, response):
        """Build an ``EpisodeItem`` from an iQiyi episode page.

        Reads ``cat_id`` and ``thumb`` from ``response.request.meta`` and
        scrapes the title, category, tags and upload time from the page,
        trying several known page layouts in turn.

        Args:
            response: the downloaded episode page.

        Returns:
            A one-element list. When an ``albumId`` is found in the page it
            holds a ``Request`` for ``self.playlength_url + albumid`` (callback
            ``self.parse_playlength``, carrying the item and album id in
            ``meta``); otherwise it holds the item itself. Returns ``None``
            when any exception is raised; the traceback is logged at ERROR
            level instead of propagating.
        """

        def first_match(*queries):
            # Text nodes of the first XPath that yields anything, else [].
            for query in queries:
                found = response.xpath(query).extract()
                if found:
                    return found
            return []

        try:
            page_url = response.request.url
            log.msg('parse_episode %s' % page_url)

            meta = response.request.meta
            cat_id = meta['cat_id']
            thumb_url = meta['thumb']

            show_id = Util.get_iqiyi_showid(page_url)
            # A space may or may not follow the colon:
            # "albumId:326754200" or "albumId: 326754200".
            albumid = response.selector.re(re.compile(r'albumId: ?(\d+)'))

            # Each field is tried against several page layouts, in order.
            title = first_match(
                '//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()',
                '//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()',
                # Class name restored: the previous literal contained "**"
                # and therefore could never match an element.
                '//div[@class="mod-play-tits"]/h1/descendant-or-self::*/text()',
                '//div[@class="play-tit play-tit-oneRow play-tit-long"]'
                '/h1/descendant-or-self::*/text()',
            )
            category = first_match(
                '//div[@class="crumb_bar"]/span[1]/span/a[2]/text()',
                '//div[@class="play-album-crumbs textOverflow"]/span[1]/a[2]/text()',
                '//div[@class="crumb_bar"]/span[1]/a[2]/text()',
                '//div[@class="mod-crumb_bar"]/span[1]/a[2]/text()',
            )
            upload_time = first_match(
                '//div[@class="crumb_bar"]/span[3]/span/text()',
                '//div[@class="crumb_bar"]/span[2]/span/text()',
            )
            tag = first_match(
                '//span[@id="widget-videotag"]/descendant::*/text()',
                '//span[@class="mod-tags_item vl-block"]/descendant::*/text()',
                '//div[@class="crumb_bar"]/span[2]/a/text()',
            )

            ep_item = EpisodeItem()

            # Optional fields are only set when a value is available.
            if title:
                ep_item['title'] = "".join(part.strip() for part in title)
            if show_id:
                ep_item['show_id'] = show_id
            if tag:
                ep_item['tag'] = "|".join(part.strip() for part in tag)
            if upload_time:
                ep_item['upload_time'] = upload_time[0].strip()
            if category:
                ep_item['category'] = category[0].strip()
            if thumb_url:
                # NOTE(review): indexing with [0] presumes meta['thumb'] is a
                # list of strings; if the caller passes a plain string this
                # keeps only its first character. Confirm against the caller.
                ep_item['thumb_url'] = thumb_url[0].strip()

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = page_url
            ep_item['cat_id'] = cat_id

            if albumid:
                # Defer emitting the item: the follow-up request receives it
                # through meta together with the album id.
                return [
                    Request(url=self.playlength_url + albumid[0],
                            callback=self.parse_playlength,
                            meta={'item': ep_item, 'albumid': albumid[0]})
                ]
            return [ep_item]
        except Exception:
            # Callback boundary: log the failure and yield nothing.
            log.msg(traceback.format_exc(), level=log.ERROR)