Пример #1
0
    def parse_episode(self, response):
        """Parse a video watch page into an episode item and follow-up requests.

        The request meta must carry ``thumb_url``, ``upload_time`` and
        ``category`` (a missing key raises ``KeyError``, which is logged like
        any other failure); ``kw_id`` is optional and defaults to 1. Owner,
        title, tags, description and view count are scraped from the page.

        Returns a list containing, when an owner link is found, a request for
        the owner's "/about" page (callback ``parse_about``), followed by a
        ``get_video_info`` request (callback ``parse_other_info``) that carries
        the partially filled ``EpisodeItem`` in ``meta['item']``. If anything
        raises, the traceback is logged and None is returned.
        """
        try:
            log.msg('%s' % response.request.url)
            meta = response.request.meta
            # thumb_url and upload_time are indexed with [0] below, so they are
            # presumably lists handed over by the caller -- TODO confirm.
            thumb_url = meta['thumb_url']
            upload_time = meta['upload_time']
            category = meta['category']
            kw_id = meta.get('kw_id', 1)
            items = []

            # Owner (channel) info.
            owner = response.xpath(
                '//div[@class="yt-user-info"]/a/@data-ytid').extract()
            owner_url = response.xpath(
                '//div[@class="yt-user-info"]/a/@href').extract()
            owner_show_id = None
            if owner:
                owner_show_id = owner[0]
                # The id and the href are extracted independently, so the href
                # list can be empty even when an id exists; only schedule the
                # about page when there is a link to follow.
                if owner_url:
                    items.append(
                        Request(url=self.url_prefix + owner_url[0] + "/about",
                                callback=self.parse_about))

            # Video info.
            title = response.xpath('//span[@id="eow-title"]/text()').extract()
            tag = response.xpath(
                './head/meta[@name="keywords"]/@content').extract()
            description = response.xpath(
                '//p[@id="eow-description"]/descendant-or-self::*/text()'
            ).extract()
            played = response.xpath(
                '//div[@class="watch-view-count"]/text()').extract()

            # "sts" value embedded in the page source; forwarded to the
            # get_video_info request below.
            sts = re.search(r'\"sts\": ?(\d+)', response.body)

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_youtube_showid(response.request.url)
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                ep_item['title'] = title[0].strip()
            if tag:
                ep_item['tag'] = tag[0].replace(', ', '|')
            if category:
                # Category comes from the request meta, not from the page.
                ep_item['category'] = category
            if upload_time:
                t = Util.get_youtube_upload_time(upload_time[0].strip())
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), t)
            if description:
                ep_item['description'] = "\n".join(description)
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0]
            if played:
                # Fall back to '0' when the count cannot be normalized.
                ep_item['played'] = Util.normalize_played(played[0]) or '0'

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = Util.normalize_youtube_url(response.request.url)
            ep_item['kw_id'] = kw_id

            query = Util.encode({
                'video_id': ep_item['show_id'],
                'eurl': 'https://youtube.googleapis.com/v/' + ep_item['show_id'],
                'sts': sts.group(1) if sts else '',
            })
            items.append(
                Request(url='http://www.youtube.com/get_video_info?' + query,
                        callback=self.parse_other_info,
                        meta={'item': ep_item}))

            return items
        except Exception:
            # Best-effort scraping: log the failure instead of propagating it.
            log.msg(traceback.format_exc(), level=log.ERROR)
Пример #2
0
    def video_parse(self, response):
        """Parse a video watch page into an episode item and follow-up requests.

        Optional ``kw_id``, ``pg_id``, ``cat_id`` and ``subject_id`` values are
        read from the request meta and copied onto the item when truthy. Owner,
        title, tags, description, view count, category, publish date and
        thumbnail are scraped from the page.

        Returns a list of requests: when an owner link is found, one for the
        owner's "/about" page (callback ``video_about_parse``), followed by a
        ``get_video_info`` request (callback ``video_other_info_parse``) that
        carries the partially filled ``EpisodeItem`` in ``meta['item']``. The
        list is empty when no show id can be derived from the URL. If anything
        raises, the traceback is logged and whatever was collected so far is
        returned.
        """
        items = []
        try:
            meta = response.request.meta
            kw_id = meta.get('kw_id')
            pg_id = meta.get('pg_id')
            cat_id = meta.get('cat_id')
            subject_id = meta.get('subject_id')

            show_id = Util.get_youtube_showid(response.request.url)
            if not show_id:
                return items

            # Owner (channel) info.
            owner = response.xpath(
                '//div[@class="yt-user-info"]/a/@data-ytid').extract()
            owner_url = response.xpath(
                '//div[@class="yt-user-info"]/a/@href').extract()
            owner_show_id = None
            if owner:
                owner_show_id = owner[0]
                # The id and the href are extracted independently, so the href
                # list can be empty even when an id exists; only schedule the
                # about page when there is a link to follow.
                if owner_url:
                    items.append(
                        Request(url=self.youtube_url_prefix + owner_url[0] +
                                "/about",
                                callback=self.video_about_parse))

            # Video info.
            title = response.xpath('//span[@id="eow-title"]/text()').extract()
            tag = response.xpath(
                './head/meta[@name="keywords"]/@content').extract()
            description = response.xpath(
                '//p[@id="eow-description"]/descendant-or-self::*/text()'
            ).extract()
            played = response.xpath(
                '//div[@class="watch-view-count"]/text()').extract()
            category = response.xpath(
                '//div[@id="watch-description"]//ul[@class="content watch-info-tag-list"]/li/a/text()'
            ).extract()
            upload = response.xpath(
                '//meta[@itemprop="datePublished"]/@content').extract()
            # Thumbnail URL as published in the page itself.
            thumb_url = response.xpath(
                '//link[@itemprop="thumbnailUrl"]/@href').extract()
            # "sts" value embedded in the page source; forwarded to the
            # get_video_info request below.
            sts = re.search(r'\"sts\": ?(\d+)', response.body)

            ep_item = EpisodeItem()
            ep_item['show_id'] = show_id
            # Alternative (not used): synthesize the thumbnail URL as
            # self.thumb_url_prefix + '/' + show_id + '/<variant>.jpg' with a
            # variant from ['default', 'mqdefault', 'hqdefault', 'sddefault',
            # 'maxresdefault'].
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                ep_item['title'] = title[0].strip()
            if tag:
                ep_item['tag'] = tag[0].replace(', ', '|')
            if description:
                ep_item['description'] = "\n".join(description)
            if played:
                # Fall back to '0' when the count cannot be normalized.
                ep_item['played'] = Util.normalize_played(played[0]) or '0'

            if kw_id:
                ep_item['kw_id'] = kw_id
            if pg_id:
                ep_item['pg_id'] = pg_id
            if cat_id:
                ep_item['cat_id'] = cat_id
            if subject_id:
                ep_item['subject_id'] = subject_id

            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0]
            if category:
                # See e.g. https://www.youtube.com/watch?v=lwy4qwaByVQ
                ep_item['category'] = category[0].strip().replace('&', '|')
            if upload:
                upload = upload[0].strip()
                struct_time = None
                # time.strptime raises ValueError on a mismatch (it never
                # returns a falsy value), so each format has to be attempted
                # in its own try block for the fallback to be reachable.
                for date_format in ('%b %d, %Y', '%Y年%m月%d日'):
                    try:
                        struct_time = time.strptime(upload, date_format)
                    except ValueError:
                        continue
                    break
                # An unparseable date just leaves upload_time unset instead of
                # discarding the whole item.
                if struct_time:
                    ep_item['upload_time'] = time.strftime(
                        '%Y-%m-%d %H:%M:%S', struct_time)

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = Util.normalize_youtube_url(response.request.url)

            query = Util.encode({
                'video_id': ep_item['show_id'],
                'eurl': 'https://youtube.googleapis.com/v/' + ep_item['show_id'],
                'sts': sts.group(1) if sts else '',
            })
            items.append(
                Request(url='http://www.youtube.com/get_video_info?' + query,
                        callback=self.video_other_info_parse,
                        meta={'item': ep_item}))
            # Hand the collected requests back to the caller.
            return items
        except Exception:
            # Best-effort scraping: log the failure and return what was
            # gathered before it.
            log.msg(traceback.format_exc(), level=log.ERROR)
            return items