Пример #1
0
    def __init__(self):
        """Set up DB access and the channel-to-category mapping tables."""
        self.__db_mgr = DbManager.instance()
        # Map a source channel name to the normalized category used downstream.
        self.__cdict = {
            u'电影': u'电影片花',
            u'电视剧': u'电视片花',
            u'综艺': u'综艺片花',
            u'动漫': u'动漫片花',
            u'自拍': u'其他',
            u'创意视频': u'其他',
            u'网剧': u'搞笑',
            u'拍客': u'搞笑',
            u'亲子': u'母婴',
            u'教育': u'公开课',
            u'原创': u'其他',
        }
        # Channels that are dropped entirely.
        self.__dlist = [u'资讯', u'微电影']
Пример #2
0
class iqiyi_search_video(Spider):
    """Keyword-search spider for iqiyi videos.

    Keywords come either from a JSON-encoded ``keywords`` argument
    (manual runs) or from the keyword table in the DB; each keyword is
    turned into one search-results request handled by ``self.parse``.
    """
    name = 'iqiyi_search_video'
    pipelines = ['MysqlStorePipeline']
    spider_id = '32768'
    site_id = '5'
    allowed_domain = ["so.iqiyi.com", "www.iqiyi.com"]
    url_prefix = 'http://so.iqiyi.com'
    playnum_url = 'http://cache.video.iqiyi.com/jp/pc/'
    playlength_url = "http://cache.video.iqiyi.com/a/"
    hottest_played_threshold = get_project_settings().get('HOTTEST_PLAYED_THRESHOLD')

    mgr = DbManager.instance()
    channel_exclude = mgr.get_channel_exclude()

    def __init__(self, cat_ids=None, keywords=None, *args, **kwargs):
        """Choose the keyword source and the per-run page limit.

        ``cat_ids`` is accepted for interface compatibility but unused here.
        ``keywords``, when given, is a JSON-encoded list (manual run);
        otherwise keywords are loaded from the DB.
        """
        super(iqiyi_search_video, self).__init__(*args, **kwargs)
        if keywords:
            keywords = json.loads(keywords)
            self.max_search_page = get_project_settings().get('MAX_MANUAL_SEARCH_PAGE')
        else:
            keywords = self.mgr.get_keywords(st='video', site_name='iqiyi')
            self.max_search_page = get_project_settings().get('MAX_SEARCH_PAGE')
        self._keywords = keywords if keywords else []

    def start_requests(self):
        """Build the first search-results request for every keyword."""
        try:
            items = []
            # Search-parameter vocabularies used to fill the URL template below;
            # only the 'default' / 'composite' entries are used here.
            run_time = {'10min': 2, '30min': 3, '60min': 4, 'plus': 5, 'default': 0}
            pub_time = {'day': 1, 'week': 2, 'month': 3, 'default': 0}
            quality = {'high': 3, '720P': 4, 'super': 6, '1080P': 7, 'default': ''}
            sort = {'composite': 1, 'new': 4, 'played': 11}
            for kw in self._keywords:
                url = "%s/so/q_%s_ctg__t_%s_page_%s_p_%s_qc_%s_rd_%s_site_%s_m_%s_bitrate_%s" % \
                    (self.url_prefix, urllib2.quote(kw['keyword'].encode('utf8')), run_time['default'], 1, 1, 0, pub_time['default'], "iqiyi", sort['composite'], quality['default'])
                items.append(Request(url=url, callback=self.parse, meta={'page': 1, 'kw_id': kw['id']}))
            return items
        except Exception as e:
            # was `except Exception, e` — modernized to the `as` form used
            # everywhere else in this codebase (valid on Python 2.6+ and 3).
            log.msg(traceback.format_exc(), level=log.ERROR)
Пример #3
0
class QqCatSpider(Spider):
    """Category spider for v.qq.com listing pages.

    For every configured category URL it requests the listing page,
    extracts per-video metadata from the tiles, and follows each video
    link to emit an EpisodeItem.
    """
    name = "qq_cat"
    pipelines = ['MysqlStorePipeline']
    spider_id = "4194304"
    site_id = "16"
    format_id = 2
    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        """Load category URLs from the JSON 'cats' kwarg or from the DB."""
        super(QqCatSpider, self).__init__(*args, **kwargs)
        cat_urls = kwargs.get('cats')
        if cat_urls:
            cat_urls = json.loads(cat_urls)
        else:
            cat_urls = self.mgr.get_cat_url('qq')
        self._cat_urls = cat_urls if cat_urls else []

    def start_requests(self):
        """Emit one listing-page request per category.

        parse_page reads the category dict from meta['cat'], so 'url' is
        popped off and the remaining fields passed through unchanged.
        (A second, redundant request per category that carried an
        incompatible meta layout — no 'cat' key — was removed; parse_page
        raised KeyError on it.)
        """
        try:
            items = []
            for cat in self._cat_urls:
                url = cat.pop('url')
                r = Request(url=url, callback=self.parse_page)
                r.meta.update({'cat': cat})
                items.append(r)
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_page(self, response):
        """Extract the video tiles on a listing page and follow each link."""
        try:
            logging.log(logging.INFO, 'page:%s' % response.request.url)
            cat = response.request.meta['cat']
            items = []

            qq_v = response.xpath('//div[@class="mod_cont"]/ul/li')
            for v in qq_v:
                urls = v.xpath('./h6/a/@href').extract()
                # was './h6/a/@text', which selects a nonexistent `text`
                # attribute and always came back empty; text() is the link text
                titles = v.xpath('./h6/a/text()').extract()
                thumb_urls = v.xpath('./a/img/@src').extract()
                durations = v.xpath(
                    './a/div/span[@class="mod_version"]/text()').extract()
                playeds = v.xpath('./p/span/text()').extract()

                title = titles[0] if titles else None
                thumb_url = thumb_urls[0] if thumb_urls else None
                duration = Util.get_qq_duration(
                    durations[0]) if durations else None
                played = Util.normalize_played(Util.normalize_vp(
                    playeds[0])) if playeds else None
                if urls:
                    r = Request(url=urls[0], callback=self.parse_episode)
                    d = {
                        'title': title,
                        'thumb_url': thumb_url,
                        'duration': duration,
                        'played': played
                    }
                    # merge the category fields into the per-video dict
                    # (was `d.update(order)` — a NameError, `order` is never
                    # defined in this method)
                    d.update(cat)
                    r.meta.update({'order': d})
                    items.append(r)
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_episode(self, response):
        """Build an EpisodeItem from the episode page plus listing metadata."""
        try:
            logging.log(logging.INFO, 'episode:%s' % response.request.url)
            order = response.request.meta['order']
            items = []

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_qq_showid(response.request.url)
            for k, v in order.items():
                if k == 'user':
                    # a listing-level 'user' field doubles as the category
                    ep_item['category'] = v
                elif k == 'show_id':
                    # the listing's show_id refers to the owner, not the episode
                    ep_item['owner_show_id'] = v
                else:
                    ep_item[k] = v

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['format_id'] = self.format_id
            items.append(ep_item)

            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
Пример #4
0
class tucao_cat(Spider):
    """Spider over tucao.cc category listing pages.

    For each category URL loaded from the DB it fetches the listing
    page, follows every video link, and emits one EpisodeItem per video.
    """
    name = "tucao_cat"
    pipelines = ['MysqlStorePipeline']
    spider_id = "7"
    site_id = "14"
    max_search_page = 1
    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        """Fetch the category URL list; fall back to empty on DB errors."""
        super(tucao_cat, self).__init__(*args, **kwargs)
        self._cat_urls = []
        try:
            self._cat_urls = self.mgr.get_cat_url('tucao')
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def start_requests(self):
        """One listing-page request per category, carrying category meta."""
        try:
            return [
                Request(url=cat['url'],
                        callback=self.parse_page,
                        meta={
                            'cat_name': cat['cat_name'],
                            'audit': cat['audit'],
                            'priority': cat['priority']
                        }) for cat in self._cat_urls
            ]
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        """Follow every video link found on a category listing page."""
        try:
            req_meta = response.request.meta
            cat_name = req_meta['cat_name']
            audit = req_meta['audit']
            priority = req_meta['priority']

            requests = []
            for node in response.xpath('//div[@class="list"]/ul/li'):
                thumb = node.xpath('./div/a[@class="pic"]/img/@src').extract()
                href = node.xpath('./div/a[@class="pic"]/@href').extract()
                if not href:
                    continue
                requests.append(
                    Request(url=href[0].strip(),
                            callback=self.parse_episode,
                            meta={
                                'cat_name': cat_name,
                                'thumb': thumb,
                                'audit': audit,
                                'priority': priority
                            }))
            return requests
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        """Build an EpisodeItem from a single tucao video page."""
        try:
            log.msg('parse_episode %s' % response.request.url)
            req_meta = response.request.meta
            cat_name = req_meta['cat_name']
            thumb_url = req_meta['thumb']
            audit = req_meta['audit']
            priority = req_meta['priority']

            show_id = Util.get_tucao_showid(response.request.url)
            title = response.xpath(
                '//h1[@class="show_title"]/text()').extract()
            tags = response.xpath(
                '//meta[@name="keywords"]/@content').extract()

            episode = EpisodeItem()
            if title:
                episode['title'] = title[0].strip()
            if show_id:
                episode['show_id'] = show_id
            if tags:
                episode['tag'] = tags[0].strip()
            if thumb_url:
                episode['thumb_url'] = thumb_url[0].strip()

            episode['spider_id'] = self.spider_id
            episode['site_id'] = self.site_id
            episode['url'] = response.request.url
            episode['category'] = cat_name
            episode['format_id'] = '2'
            episode['audit'] = audit
            episode['priority'] = priority

            return [episode]
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
Пример #5
0
class YoukuOrderSpider(Spider):
    """Spider over a fixed ("ordered") list of Youku user channels.

    For each ordered user it parses the profile page into a UserItem,
    then walks the user's video listing pages and each video's episode
    page, chaining extra requests to fill in play count and duration.
    """
    name = "youku_order_all"
    pipelines = ['CategoryPipeline', 'MysqlStorePipeline']
    spider_id = "256"
    site_id = "1"
    allowed_domains = ["i.youku.com", "www.youku.com", "v.youku.com"]
    url_prefix = 'http://i.youku.com'
    vpaction_url = "http://v.youku.com/v_vpactionInfo/id/"
    playlength_url = "http://v.youku.com/player/getPlayList/VideoIDS/"
    # NOTE(review): class-level set mutated in __init__, so it is shared
    # across instances — fine for a single spider process, surprising otherwise.
    forbidden_author_list = set()

    mgr = DbManager.instance()

    def __init__(self, orders=None, *args, **kwargs):
        """Seed the hard-coded order list and load the author blacklist.

        The `orders` parameter is currently unused; the list below is
        hard-coded. Each order carries a start URL plus `cust_para`
        overrides (category, priority, need_check) threaded through meta.
        """
        super(YoukuOrderSpider, self).__init__(*args, **kwargs)
        self._orders = [
            {
                'url': 'http://i.youku.com/u/UMjk3OTcyMTM2/',
                'cust_para': {
                    'category': u'音乐',
                    'priority': '3',
                    'need_check': '1'
                }
            },
        ]

        # One blacklisted keyword per line; used by content_is_forbidden().
        with open('./crawler/data/blacklist', 'r') as f:
            for line in f.readlines():
                self.forbidden_author_list.add(line.strip().decode('utf-8'))

    def start_requests(self):
        """Emit one profile-page request per configured order."""
        try:
            items = []

            for order in self._orders:
                cust_para = order['cust_para'] if 'cust_para' in order else {}
                items.append(
                    Request(url=order['url'],
                            callback=self.parse,
                            meta={'cust_para': cust_para}))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse(self, response):
        """Parse a user profile page into a UserItem and queue the videos page."""
        try:
            log.msg(response.request.url, level=log.INFO)
            cust_para = response.request.meta['cust_para']
            items = []

            user_item = UserItem()
            #owner id (scraped from inline page scripts)
            script = response.xpath('/html/head/script')
            owner_id = script.re('ownerId = \"(\d+)\"')
            show_id = script.re('ownerEncodeid = \'(.+)\'')
            if owner_id:
                user_item['owner_id'] = owner_id[0]
            if show_id:
                user_item['show_id'] = show_id[0]
            else:
                # without a show_id the user record is useless — give up early
                return

            #user profile: name, total plays, fan count
            up = response.xpath('//div[@class="profile"]')
            if up:
                user_name = up.xpath(
                    './div[@class="info"]/div[@class="username"]/a[1]/@title'
                ).extract()
                played = up.xpath(
                    './div[@class="state"]/ul/li[@class="vnum"]/em/text()'
                ).extract()
                fans = up.xpath(
                    './div[@class="state"]/ul/li[@class="snum"]/em/text()'
                ).extract()

                if user_name:
                    user_item['user_name'] = user_name[0]
                if played:
                    user_item['played'] = Util.normalize_vp(played[0])
                if fans:
                    user_item['fans'] = Util.normalize_vp(fans[0])

            #youku profile: free-text intro
            yp = response.xpath('//div[@class="YK-profile"]')
            if yp:
                intro = yp.xpath(
                    './div[@class="userintro"]/div[@class="desc"]/p[2]/text()'
                ).extract()

                if intro:
                    user_item['intro'] = ''.join(intro)
            #count of uploaded videos, e.g. "(123)" in the section title
            yh = response.xpath('//div[@class="YK-home"]')
            vcount = '0'
            if yh:
                video_count = yh.xpath(
                    'div[1]/div/div/div/div[@class="title"]/span/a/text()').re(
                        u'\((\d+)\)')

                if video_count:
                    vcount = video_count[0]

            user_item['vcount'] = vcount
            user_item['spider_id'] = self.spider_id
            user_item['site_id'] = self.site_id

            items.append(user_item)

            #videos: the listing lives at <profile-url>videos
            items.append(
                Request(url=response.request.url + "videos",
                        callback=self.parse_video_page,
                        meta={
                            'page': 1,
                            'cust_para': cust_para
                        }))

            return items

        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_video_page(self, response):
        """Parse one listing page: queue episode pages, sibling AJAX pages,
        and the next numbered page."""
        try:
            page = response.request.meta['page']
            cust_para = response.request.meta['cust_para']
            log.msg('%s: %s' % (response.request.url, page))

            items = []

            #get videos
            yk_v = response.xpath('//div[@class="yk-col4"]/div')
            for v in yk_v:
                url = v.xpath('./div[@class="v-link"]/a/@href').extract()
                if url:
                    items.append(
                        Request(url=url[0],
                                callback=self.parse_episode,
                                meta={'cust_para': cust_para}))

            #get last_str and ajax_url (embedded as JS literals in the page)
            last_str = response.selector.re(u'\'last_str\':\'([^\']*)\'')
            ajax_url = response.selector.re(u'\'ajax_url\':\'([^\']*)\'')

            #request sibling page
            if ajax_url:
                # presumably each numbered page is served as 3 ajax sub-pages;
                # 3*page-1 and 3*page are the two siblings of the one just
                # parsed — TODO confirm against the site's pager JS
                sibling_page = (3 * page - 1, 3 * page)
                for p in sibling_page:
                    s = last_str[0] if last_str else u''
                    para = {
                        "v_page": str(page),
                        "page_num": str(p),
                        "page_order": "1",
                        "last_str": s
                    }
                    items.append(
                        FormRequest(url=self.url_prefix + ajax_url[0] +
                                    "fun_ajaxload/",
                                    formdata=para,
                                    method='GET',
                                    callback=self.parse_video_page,
                                    meta={
                                        'page': page,
                                        'cust_para': cust_para
                                    }))

            #request next page
            next_page = response.xpath(
                '//ul[@class="YK-pages"]/li[@class="next"]/a/@href').extract()
            if next_page:
                items.append(
                    Request(url=self.url_prefix + next_page[0],
                            callback=self.parse_video_page,
                            meta={
                                'page': page + 1,
                                'cust_para': cust_para
                            }))

            return items

        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def content_is_forbidden(self, content):
        """Return True if any blacklisted keyword occurs in `content`."""
        for keyword in self.forbidden_author_list:
            if content.find(keyword) == -1:
                pass
            else:
                return True
        return False

    def parse_episode(self, response):
        """Extract episode metadata; chain to the vpaction page when a
        videoId is found, otherwise emit the item directly."""
        try:
            cust_para = response.request.meta['cust_para']
            log.msg('%s: %s' % (response.request.url, cust_para))
            items = []

            #owner (profile link of the uploader)
            owner = response.xpath(
                '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
            ).extract()
            owner_show_id = None
            if owner:
                owner_show_id = Util.get_owner(owner[0])

            #video info
            title = response.xpath(
                '//div[@class="base_info"]/h1/descendant-or-self::*/text()'
            ).extract()
            category = response.xpath(
                '//div[@class="base_info"]/div[@class="guide"]/div/a/text()'
            ).extract()
            scripts = response.xpath('//script[@type="text/javascript"]')
            video_id = scripts.re('videoId = \'(\d+)\'')
            tag = scripts.re('tags="(.+)"')
            upload = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
            ).extract()
            description = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
            ).extract()

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(response.request.url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                ep_item['title'] = Util.strip_title("".join(title))
                # drop the whole video when 'need_check' is requested and the
                # title matches the blacklist
                if 'need_check' in cust_para:
                    if self.content_is_forbidden(ep_item['title']):
                        log.msg('video [ %s ] is in blacklist!' %
                                ep_item['show_id'])
                        return items
                    else:
                        pass
                else:
                    pass

            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            # explicit category override wins over the scraped breadcrumb
            if 'category' in cust_para:
                ep_item['category'] = cust_para['category']
            elif category:
                ep_item['category'] = category[0].replace(u'频道', '')
            if upload:
                t = Util.get_upload_time(upload[0])
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), t)
            if description:
                ep_item['description'] = description[0]

            if 'priority' in cust_para:
                ep_item['priority'] = cust_para['priority']

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url

            if video_id:
                items.append(
                    Request(url=self.vpaction_url + video_id[0],
                            callback=self.parse_vpaction,
                            meta={'item': ep_item}))
            else:
                items.append(ep_item)

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_vpaction(self, response):
        """Read the total play count, then chain to the playlength endpoint."""
        try:
            #log.msg('%s' % response.request.url)
            item = response.request.meta['item']

            vp = response.xpath('//div[@id="videodetailInfo"]/ul/li').re(
                u'<label>总播放数:</label><span.*>(.+)</span>')
            if vp:
                item['played'] = Util.normalize_vp(vp[0])

            show_id = item['show_id']
            item = Request(url=self.playlength_url + show_id,
                           callback=self.parse_playlength,
                           meta={'item': item})

            return item

        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playlength(self, response):
        """Fill in duration (whole seconds) from the getPlayList JSON body
        and emit the finished item."""
        try:
            #log.msg('parse_playlength ,%s' % response.request.url)
            item = response.request.meta['item']
            showid = item["show_id"]

            msg = response.body
            jinfo = json.loads(msg)
            # "seconds" arrives as a float-like string; truncate to whole seconds
            plsylength = str(int(float(jinfo["data"][0]["seconds"])))
            if plsylength:
                item['duration'] = str(plsylength)

            return item
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
Пример #6
0
class iqiyi_xiaomi(Spider):
    """Spider over a fixed set of iqiyi list pages (news/hotspot channels).

    Walks each listing page up to max_search_page, follows every video to
    its episode page, and chains two more requests per album to fetch
    duration and play count before emitting the EpisodeItem.
    """
    name = "iqiyi_xiaomi"
    pipelines = ['MysqlStorePipeline']
    spider_id = "65536"
    site_id = "5"  #iqiyi
    allowed_domains = [
        "list.iqiyi.com", "www.iqiyi.com", "cache.video.iqiyi.com"
    ]
    url_prefix = 'http://list.iqiyi.com'
    playnum_url = 'http://cache.video.iqiyi.com/jp/pc/'
    playlength_url = "http://cache.video.iqiyi.com/a/"
    # only the first listing page of each category is crawled
    max_search_page = 1

    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        """Seed the hard-coded category listing URLs (all share cat id 10000)."""
        super(iqiyi_xiaomi, self).__init__(*args, **kwargs)
        self._cat_urls = [
            {
                'url':
                'http://list.iqiyi.com/www/25/20031-------------4-1-2-iqiyi-1-.html',
                'id': '10000',
                'name': u'热点'
            },
            {
                'url':
                'http://list.iqiyi.com/www/25/21314-------------4-1-2-iqiyi-1-.html',
                'id': '10000',
                'name': u'新闻'
            },
            {
                'url':
                'http://list.iqiyi.com/www/25/21739-------------4-1-2-iqiyi-1-.html',
                'id': '10000',
                'name': u'新闻'
            },
            {
                'url':
                'http://list.iqiyi.com/www/25/21740-------------4-1-2-iqiyi-1-.html',
                'id': '10000',
                'name': u'新闻'
            },
        ]

    def start_requests(self):
        """Emit the page-1 listing request for every configured category."""
        try:
            items = []

            for cat in self._cat_urls:
                items.extend([
                    Request(url=cat['url'],
                            callback=self.parse_page,
                            meta={
                                'page': 1,
                                'cat_id': cat['id'],
                                'cat_name': cat['name']
                            })
                ])

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        """Follow each video on a listing page; paginate up to max_search_page."""
        try:
            #log.msg('parse page %s: %s' % (response.request.url, response.request.meta['page']))
            page = response.request.meta['page']
            cat_id = response.request.meta['cat_id']
            cat_name = response.request.meta['cat_name']
            if int(page) > int(self.max_search_page):
                return

            items = []

            #video items
            qy_v = response.xpath(
                '//div[@class="wrapper-piclist"]/ul/li/div[1]')
            for v in qy_v:
                thumb = v.xpath('./a/img/@src').extract()
                url = v.xpath('./a/@href').extract()
                # NOTE(review): url[0] is indexed without an emptiness check;
                # a tile with no link raises IndexError (swallowed by the
                # except below, aborting the rest of this page)
                items.append(
                    Request(url=url[0].strip(),
                            callback=self.parse_episode,
                            meta={
                                'cat_id': cat_id,
                                'cat_name': cat_name,
                                'thumb': thumb
                            }))

            #pages: follow the "next page" pager link
            next_page = response.xpath(
                "//div[@class='mod-page']/a[text()='%s']/@href" %
                u'下一页').extract()
            if next_page:
                items.append(
                    Request(url=self.url_prefix + next_page[0],
                            callback=self.parse_page,
                            meta={
                                'page': page + 1,
                                'cat_id': cat_id,
                                'cat_name': cat_name
                            }))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        """Scrape episode metadata (several page layouts) and chain to the
        playlength request when an albumId is present."""
        try:
            log.msg('parse_episode %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            cat_name = response.request.meta['cat_name']
            thumb_url = response.request.meta['thumb']
            items = []

            #show_id
            show_id = Util.get_iqiyi_showid(response.request.url)

            #space maybe exist: "albumId:326754200" or "albumId: 326754200"
            albumid = response.selector.re(re.compile(r'albumId: ?(\d+)'))

            #video info — the selectors below are fallbacks for the several
            #page layouts iqiyi serves; first non-empty match wins
            title = response.xpath(
                '//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()'
            ).extract()
            if not title:
                title = response.xpath(
                    '//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()'
                ).extract()
            if not title:
                title = response.xpath(
                    '//div[@class="mod-play-t**s"]/h1/descendant-or-self::*/text()'
                ).extract()
            if not title:
                title = response.xpath(
                    '//div[@class="play-tit play-tit-oneRow play-tit-long"]/h1/descendant-or-self::*/text()'
                ).extract()

            category = response.xpath(
                '//div[@class="crumb_bar"]/span[1]/span/a[2]/text()').extract(
                )
            if not category:
                category = response.xpath(
                    '//div[@class="play-album-crumbs textOverflow"]/span[1]/a[2]/text()'
                ).extract()
            if not category:
                category = response.xpath(
                    '//div[@class="crumb_bar"]/span[1]/a[2]/text()').extract()
            if not category:
                category = response.xpath(
                    '//div[@class="mod-crumb_bar"]/span[1]/a[2]/text()'
                ).extract()

            upload_time = response.xpath(
                '//div[@class="crumb_bar"]/span[3]/span/text()').extract()
            if not upload_time:
                upload_time = response.xpath(
                    '//div[@class="crumb_bar"]/span[2]/span/text()').extract()

            tag = response.xpath(
                '//span[@id="widget-videotag"]/descendant::*/text()').extract(
                )
            if not tag:
                tag = response.xpath(
                    '//span[@class="mod-tags_item vl-block"]/descendant::*/text()'
                ).extract()
            if not tag:
                tag = response.xpath(
                    '//div[@class="crumb_bar"]/span[2]/a/text()').extract()

            ep_item = EpisodeItem()

            if title:
                ep_item['title'] = "".join([t.strip() for t in title])
            if show_id:
                ep_item['show_id'] = show_id
            if tag:
                ep_item['tag'] = "|".join([t.strip() for t in tag])
            if upload_time:
                ep_item['upload_time'] = upload_time[0].strip()
            #if category:
            #    ep_item['category'] = category[0].strip()
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['cat_id'] = cat_id
            # category comes from the configured listing, not the breadcrumb
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = '0'
            ep_item['priority'] = '8'

            if albumid:
                items.append(
                    Request(url=self.playlength_url + albumid[0],
                            callback=self.parse_playlength,
                            meta={
                                'item': ep_item,
                                'albumid': albumid[0]
                            }))
            else:
                # without an albumId the duration/playnum endpoints can't be
                # queried, so the item is emitted as-is
                items.append(ep_item)

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playlength(self, response):
        """Parse the AlbumInfo JSONP body for playLength; keep only clips
        shorter than 600 s and chain to the play-count request."""
        try:
            log.msg('parse_playlength ,%s' % response.request.url)
            item = response.request.meta['item']
            albumid = response.request.meta['albumid']

            items = []
            #sel = Selector(response)
            # the body is "...AlbumInfo=<json>" — slice off the prefix
            msg = response.body
            index = msg.find("AlbumInfo=") + len("AlbumInfo=")
            info = msg[index:]
            jinfo = json.loads(info)
            plsylength = jinfo["data"]["playLength"]
            if plsylength:
                # NOTE: videos of 600 s or longer are dropped entirely here
                if int(plsylength) < 600:
                    item['duration'] = str(plsylength)
                    items.append(
                        Request(url=self.playnum_url + albumid + "/?qyid=",
                                callback=self.parse_playnum,
                                meta={'item': item}))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playnum(self, response):
        """Extract the play count from the jp/pc response and emit the item."""
        try:
            #log.msg('parse_playnum ,%s' % response.request.url)
            item = response.request.meta['item']

            items = []
            #sel = Selector(response)
            # the first ":<digits>" in the body is the play count
            tplaynum = response.selector.re(re.compile(r':(\d+)'))
            #log.msg('play: %s, %s' % (tplaynum[0], response.request.url))
            if tplaynum:
                playnum = tplaynum[0]
                item['played'] = str(playnum)
                items.append(item)

            return items

        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
Пример #7
0
class ku6_search_video(Spider):
    """Keyword-search spider for ku6.com.

    Pipeline: search-result pages -> per-video page (show id) ->
    fetchVideo4Player JSON (metadata) -> episode page (category) ->
    play-count stats endpoint (played).
    """
    name = "ku6_search_video"
    pipelines = ['CategoryPipeline', 'MysqlStorePipeline']
    spider_id = "16384"  # ku6_search_video
    site_id = "6"  # ku6
    allowed_domains = ["so.ku6.com", "v.ku6.com", "v3.stat.ku6.com"]
    url_prefix = 'http://v.ku6.com/fetchVideo4Player/'
    url_playnum = 'http://v3.stat.ku6.com/dostatv.do?method=getVideoPlayCount&n=gotPlayCounts&v='

    mgr = DbManager.instance()

    def __init__(self, cat_ids=None, keywords=None, *args, **kwargs):
        """Load search keywords from the ``keywords`` JSON argument (manual
        run) or from the database (scheduled run); pick the matching
        page-limit setting."""
        super(ku6_search_video, self).__init__(*args, **kwargs)
        if keywords:
            keywords = json.loads(keywords)
            self.max_search_page = get_project_settings().get('MAX_MANUAL_SEARCH_PAGE')
        else:
            keywords = self.mgr.get_keywords(st='video', site_name='ku6')
            self.max_search_page = get_project_settings().get('MAX_SEARCH_PAGE')
        self._keywords = keywords if keywords else []

    def start_requests(self):
        """Issue three searches per keyword: default order, newest, most viewed."""
        try:
            items = []

            for kw in self._keywords:
                kw_id = kw['id']
                word = kw['keyword']
                cat_id = kw['ext_cat_id']

                turl = 'http://so.ku6.com/search?q=' + word + '&categoryid=' + str(cat_id)
                items.append(Request(url=turl, callback=self.parse_page,
                                     meta={'page': 1, 'kw_id': kw_id}))
                # Same search sorted by upload time and by view count.
                # (Concatenate directly: the old str(turl) call raised
                # UnicodeEncodeError for non-ASCII keywords.)
                items.append(Request(url=turl + u'&sort=uploadtime', callback=self.parse_page,
                                     meta={'page': 1, 'kw_id': kw_id}))
                items.append(Request(url=turl + u'&sort=viewcount', callback=self.parse_page,
                                     meta={'page': 1, 'kw_id': kw_id}))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        """Parse one search-result page: queue a request per video found plus
        the next result page (up to ``max_search_page``)."""
        try:
            log.msg('parse_page: %s' % response.request.url)
            page = response.request.meta['page']
            kw_id = response.request.meta['kw_id']
            if int(page) > int(self.max_search_page):
                return

            items = []

            # video items
            titems = response.xpath('//div[@id="search_list"]/div[2]/div[2]/ul[1]/li')
            for item in titems:
                turl = item.xpath('./h3[1]/a/@href').extract()
                # Skip entries without a link: the original indexed turl[0]
                # unconditionally, so one link-less cell raised IndexError and
                # aborted the whole page.
                if turl:
                    show_id = Util.get_ku6_showid(turl[0])
                    items.append(Request(url=turl[0].strip(), callback=self.parse,
                                         meta={'kw_id': kw_id, 'show_id': show_id}))

            # pagination: follow the "next page" link
            next_page = response.xpath("//div[@id='search_list']/div[2]/div[2]/div/a[text()='%s']/@href" % u'下一页').extract()
            if next_page:
                items.append(Request(url=next_page[0], callback=self.parse_page,
                                     meta={'page': page + 1, 'kw_id': kw_id}))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse(self, response):
        """From a video page, request the fetchVideo4Player JSON that carries
        the video's metadata."""
        try:
            kw_id = response.request.meta['kw_id']
            show_id = response.request.meta['show_id']

            url1 = self.url_prefix + str(show_id) + ".html"
            return [Request(url=url1, callback=self.parse_second,
                            meta={'show_id': show_id, 'kw_id': kw_id})]
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_second(self, response):
        """Build an EpisodeItem from the fetchVideo4Player JSON payload and
        chain a request to the episode page for the category."""
        try:
            kw_id = response.request.meta['kw_id']
            items = []

            jinfo = json.loads(response.body)
            title = jinfo['data']['t']
            show_id = response.request.meta['show_id']
            tags = jinfo['data']['tag']
            # Normalize the tag list to a '|'-separated string.
            tag = tags.replace(' ', '|').replace(',', '|').strip('|')

            upload_time = Util.timestamp2datetime(jinfo['data']['uploadtime'])

            description = jinfo['data']['desc']
            thumb_url = jinfo['data']['picpath']

            # 'vtime' may carry several comma-separated values; the first one
            # is the duration.
            duration = str(jinfo['data']['vtime']).split(',')[0]

            ep_item = EpisodeItem()
            if len(title) != 0:
                ep_item["title"] = title
            ep_item['show_id'] = show_id

            turl = "http://v.ku6.com/show/" + show_id + ".html"

            if len(tag) != 0:
                ep_item["tag"] = tag
            if len(upload_time) != 0:
                ep_item["upload_time"] = upload_time
            ep_item["url"] = turl
            # Truthiness check also guards against a null picpath/desc, which
            # made the old len() calls raise and drop the item.
            if thumb_url:
                ep_item['thumb_url'] = thumb_url
            if len(duration) != 0:
                ep_item["duration"] = duration
            if description:
                # The original parsed the description but never stored it.
                ep_item["description"] = description
            ep_item['kw_id'] = kw_id
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id

            items.append(Request(url=turl, callback=self.parse_episode,
                                 meta={'item': ep_item}))

            return items

        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        """Scrape the category from the episode page, then request the
        play-count endpoint."""
        try:
            log.msg('parse_episode %s' % response.request.url)
            item = response.request.meta['item']

            tcategory = response.xpath('//div[@class="ckl_conleftop"]/div[1]/span[1]/a[1]/text()').extract()
            item['category'] = tcategory[0].strip() if tcategory else ""

            turl = self.url_playnum + item['show_id']
            return [Request(url=turl, callback=self.parse_playnum,
                            meta={'item': item})]
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playnum(self, response):
        """Extract the play count from the stats endpoint response and emit
        the completed item; emit nothing when no count is present."""
        items = []
        try:
            log.msg('parse_playnum %s' % response.request.url)

            item = response.request.meta['item']
            # Require at least one digit after count:" -- the old pattern
            # ',count:"(\d+)?' made the group optional, which could produce a
            # bogus play count.
            m = re.search(r',count:"(\d+)', response.body)
            if m:
                item['played'] = str(m.group(1))
                items.append(item)

        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

        return items
# ---- Пример #8 (Example #8) ----
class YoutubeHottestSpider(Spider):
    """Scrape videos from a configured list of YouTube channels ("orders").

    Flow: channel URL -> /videos grid -> AJAX "load more" pages (up to
    ``max_search_page``) -> per-video watch page -> get_video_info (duration)
    and the uploader's /about page (UserItem).
    """
    name = "youtube_hottest"
    pipelines = ['MysqlStorePipeline']
    spider_id = "4"
    site_id = "2"
    allowed_domains = ["www.youtube.com"]
    url_prefix = 'https://www.youtube.com'
    # Thresholds read from project settings; currently only referenced by the
    # commented-out filter in parse_page_content.
    hottest_played_threshold = get_project_settings().get(
        'HOTTEST_PLAYED_THRESHOLD')
    hottest_time_threshold = get_project_settings().get(
        'HOTTEST_TIME_THRESHOLD')

    mgr = DbManager.instance()

    def __init__(self, orders=None, *args, **kwargs):
        """Load channel orders from the ``orders`` JSON argument (manual run)
        or from the database (scheduled run); pick the matching page limit."""
        super(YoutubeHottestSpider, self).__init__(*args, **kwargs)
        if orders:
            orders = json.loads(orders)
            self.max_search_page = get_project_settings().get(
                'MAX_MANUAL_SEARCH_PAGE')
        else:
            orders = self.mgr.get_ordered_url(site_name='youtube')
            self.max_search_page = get_project_settings().get(
                'MAX_SEARCH_PAGE')
        if orders:
            self._orders = orders
        else:
            self._orders = []

    def start_requests(self):
        """Emit one request per configured channel order."""
        try:
            items = []
            for i in self._orders:
                items.append(
                    Request(url=i['url'],
                            callback=self.parse,
                            meta={
                                'audit': i['audit'],
                                'priority': i['priority']
                            }))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse(self, response):
        """Map the channel URL back to its order's 'user' field (used as the
        item category) and request the channel's /videos listing."""
        try:
            log.msg(response.request.url, level=log.INFO)
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            # The category is the 'user' of the order whose URL produced this
            # response; fall back to 'other' when no order matches.
            category = [
                r['user'] for r in self._orders
                if r['url'] == response.request.url
            ]
            if not category:
                category = ['other']

            items = []
            items.append(
                Request(url=response.request.url + "/videos",
                        callback=self.parse_video,
                        meta={
                            'category': category[0],
                            'audit': audit,
                            'priority': priority
                        }))

            return items

        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_video(self, response):
        """Parse the channel's videos grid and queue the first AJAX
        "load more" page when the grid exposes one."""
        try:
            log.msg(response.request.url, level=log.INFO)
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            category = response.request.meta['category']
            items = []

            #content
            #content = response.xpath('//div[@id="video-page-content"]/ul/li')
            content = response.xpath(
                '//ul[@id="channels-browse-content-grid"]/li')
            self.parse_page_content(items, content, category, audit, priority)

            #next page
            #next_page = response.xpath('//div[@id="video-page-content"]/button/@data-uix-load-more-href').extract()
            next_page = response.xpath(
                '//button/@data-uix-load-more-href').extract()
            if next_page:
                items.append(
                    Request(url=self.url_prefix + next_page[0],
                            callback=self.parse_more_video,
                            meta={
                                'page': 2,
                                'category': category,
                                'audit': audit,
                                'priority': priority
                            }))

            return items

        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_more_video(self, response):
        """Parse one AJAX "load more" page (JSON carrying HTML fragments) and
        keep following until ``max_search_page`` is reached."""
        try:
            log.msg(response.request.url, level=log.INFO)
            page = response.request.meta['page']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            category = response.request.meta['category']
            if page > self.max_search_page:
                return

            items = []

            # The AJAX endpoint returns JSON whose 'content_html' field holds
            # the grid <li> fragments.
            body = json.loads(response.body)
            self.parse_page_content(
                items,
                Selector(text=body['content_html']).xpath('./body/li'),
                category, audit, priority)

            #next page
            next_page = Selector(text=body['load_more_widget_html']).xpath(
                '//button/@data-uix-load-more-href').extract()
            if next_page:
                items.append(
                    Request(url=self.url_prefix + next_page[0],
                            callback=self.parse_more_video,
                            meta={
                                'page': page + 1,
                                'category': category,
                                'audit': audit,
                                'priority': priority
                            }))

            return items

        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page_content(self, items, content, category, audit, priority):
        """Append one parse_episode Request to *items* for every video cell
        in *content* (mutates *items* in place; returns nothing)."""
        try:
            for s in content:
                url = s.xpath(
                    './div/div/div[@class="yt-lockup-thumbnail"]/span/a/@href'
                ).extract()
                thumb_url = s.xpath(
                    './div/div/div[@class="yt-lockup-thumbnail"]/span/a/span/span/span/img/@src'
                ).extract()
                views = s.xpath(
                    './div/div/div[@class="yt-lockup-content"]/div[@class="yt-lockup-meta"]/ul/li/text()'
                ).re('([\d|,]*) views')
                upload_time = s.xpath(
                    './div/div/div[@class="yt-lockup-content"]/div[@class="yt-lockup-meta"]/ul/li[@class="yt-lockup-deemphasized-text"]/text()'
                ).extract()
                '''
                if not views or int(Util.normalize_played(views[0])) < int(self.hottest_played_threshold):
                    #log.msg('discard played: %s' % url[0])
                    continue
                if not upload_time or Util.get_youtube_upload_time(upload_time[0].strip()) >= int(self.hottest_time_threshold):
                    #log.msg('discard upload_time: %s' % url[0])
                    continue
                '''

                # NOTE(review): 'views' is extracted but unused while the
                # threshold filter above stays commented out.
                if url:
                    items.append(
                        Request(url=self.url_prefix + url[0],
                                callback=self.parse_episode,
                                meta={
                                    'thumb_url': thumb_url,
                                    'upload_time': upload_time,
                                    'category': category,
                                    'audit': audit,
                                    'priority': priority
                                }))

        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        """Build an EpisodeItem from a watch page, queue the uploader's /about
        page, and chain get_video_info to fetch the duration."""
        try:
            log.msg('%s' % response.request.url)
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            thumb_url = response.request.meta['thumb_url']
            upload_time = response.request.meta['upload_time']
            category = response.request.meta['category']
            items = []

            #owner
            owner = response.xpath(
                '//div[@class="yt-user-info"]/a/@data-ytid').extract()
            owner_url = response.xpath(
                '//div[@class="yt-user-info"]/a/@href').extract()
            owner_show_id = None
            if owner:
                owner_show_id = owner[0]
                items.append(
                    Request(url=self.url_prefix + owner_url[0] + "/about",
                            callback=self.parse_about))

            #video info
            title = response.xpath('//span[@id="eow-title"]/text()').extract()
            #category = response.xpath('//p[@id="eow-category"]/a/text()').extract()
            tag = response.xpath(
                './head/meta[@name="keywords"]/@content').extract()
            #upload = response.xpath('//p[@id="watch-uploader-info"]/strong/text()').extract()
            description = response.xpath(
                '//p[@id="eow-description"]/descendant-or-self::*/text()'
            ).extract()
            played = response.xpath(
                '//div[@class="watch-view-count"]/text()').extract()

            # "sts" token from the page scripts, needed by get_video_info.
            sts = re.search(r'\"sts\": ?(\d+)', response.body)

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_youtube_showid(response.request.url)
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                ep_item['title'] = title[0].strip()
            if tag:
                ep_item['tag'] = tag[0].replace(', ', '|')
            if category:
                #ep_item['category'] = category[0].replace('&', '|')
                ep_item['category'] = category
            '''
            if upload:
                ptime = Util.get_youtube_publish(upload[0])
                if ptime:
                    ep_item['upload_time'] = ptime
            '''
            if upload_time:
                t = Util.get_youtube_upload_time(upload_time[0].strip())
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), t)
            if description:
                ep_item['description'] = "\n".join(description)
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0]
            if played:
                ep_item['played'] = Util.normalize_played(played[0])

            if audit:
                ep_item['audit'] = audit

            if priority:
                ep_item['priority'] = priority

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['format_id'] = 2
            ep_item['url'] = Util.normalize_youtube_url(response.request.url)

            query = Util.encode({'video_id': ep_item['show_id'], \
                                 'eurl': 'https://youtube.googleapis.com/v/' + ep_item['show_id'], \
                                 'sts': sts.groups()[0] if sts else ''})
            items.append(
                Request(url='http://www.youtube.com/get_video_info?' + query,
                        callback=self.parse_other_info,
                        meta={'item': ep_item}))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_other_info(self, response):
        """Read the duration from the get_video_info response and emit the
        completed item."""
        try:
            log.msg('%s' % response.request.url)
            item = response.request.meta['item']
            items = []

            #duration
            duration = re.search(r'length_seconds=(\d+)', response.body)

            if duration:
                item['duration'] = duration.groups()[0]

            items.append(item)

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_about(self, response):
        """Build a UserItem (name, fans, views, intro) from a channel's
        /about page."""
        try:
            log.msg(response.request.url, level=log.INFO)
            items = []

            show_id = response.xpath(
                '//meta[@itemprop="channelId"]/@content').extract()
            user_name = response.xpath(
                '//span[@class="qualified-channel-title-text"]/a/text()'
            ).extract()
            fans = response.xpath('//span[@class="about-stat"]').re(
                re.compile(r'<span.*>.*<b>([\d|,]*)</b>.*subscribers.*</span>',
                           re.S))
            played = response.xpath('//span[@class="about-stat"]').re(
                re.compile(r'<span.*>.*<b>([\d|,]*)</b>.*views.*</span>',
                           re.S))
            intro = response.xpath(
                '//div[@class="about-description branded-page-box-padding"]/descendant-or-self::*/text()'
            ).extract()

            if show_id:
                user_item = UserItem()
                user_item['show_id'] = show_id[0]

                if user_name:
                    user_item['user_name'] = user_name[0]
                if fans:
                    user_item['fans'] = Util.normalize_played(fans[0])
                if played:
                    user_item['played'] = Util.normalize_played(played[0])
                if intro:
                    user_item['intro'] = "".join(intro).strip()

                user_item['spider_id'] = self.spider_id
                user_item['site_id'] = self.site_id
                # Channel URL is the /about URL with the suffix stripped.
                user_item['url'] = response.request.url[:-len('/about')]
                items.append(user_item)

            return items

        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
# ---- Пример #9 (Example #9) ----
class YoukuOrderSpider(Spider):
    """Spider for explicitly ordered youku.com channels.

    Flow: channel URL -> /videos listing (plus two sibling AJAX pages) ->
    per-episode page -> total-play page -> play-length JSON endpoint.
    """
    name = "youku_order"
    pipelines = ['MysqlStorePipeline']
    spider_id = "256"
    site_id = "1"
    format_id = 2
    url_prefix = 'http://i.youku.com'
    playlength_url = "http://play.youku.com/play/get.json?ct=10&vid="
    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        """Load ordered channel URLs from the ``orders`` kwarg (JSON string)
        or from the database."""
        super(YoukuOrderSpider, self).__init__(*args, **kwargs)
        orders = kwargs.get('orders')
        if orders:
            orders = json.loads(orders)
        else:
            orders = self.mgr.get_ordered_url(site_name='youku')
        self._orders = orders if orders else []

    def start_requests(self):
        """Emit one request per ordered channel URL."""
        try:
            items = []
            # Was a bare debug ``print self._orders``; route it through the
            # logging channel used everywhere else in this spider.
            logging.log(logging.INFO, 'orders: %s' % self._orders)
            for order in self._orders:
                items.append(
                    Request(url=order['url'],
                            callback=self.parse,
                            meta={
                                'audit': order['audit'],
                                'cat_name': order['user'],
                                'show_id': order['show_id'],
                                'priority': order['priority']
                            }))
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse(self, response):
        """Normalize the channel URL to its /videos listing and request it."""
        try:
            logging.log(logging.INFO, "parse:%s" % response.request.url)
            audit = response.request.meta['audit']
            cat_name = response.request.meta['cat_name']
            show_id = response.request.meta['show_id']
            priority = response.request.meta['priority']

            items = []
            v_url = response.request.url
            if not v_url.endswith('/videos'):
                if v_url.endswith('/'):
                    v_url = v_url + "videos"
                else:
                    v_url = v_url + "/videos"
            items.append(
                Request(url=v_url,
                        callback=self.parse_video_page,
                        meta={
                            'audit': audit,
                            'cat_name': cat_name,
                            'show_id': show_id,
                            'priority': priority
                        }))
            return items

        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_video_page(self, response):
        """Parse one /videos listing page (default order: most recent): queue
        an episode request per video plus the two sibling AJAX pages."""
        try:
            logging.log(logging.INFO, 'page:%s' % response.request.url)
            audit = response.request.meta['audit']
            cat_name = response.request.meta['cat_name']
            show_id = response.request.meta['show_id']
            priority = response.request.meta['priority']

            page = 1
            items = []
            # Per-video cells: URL, thumbnail and play count.
            yk_v = response.xpath('//div[@class="yk-col4"]/div')
            for v in yk_v:
                url = v.xpath('./div[@class="v-link"]/a/@href').extract()
                thumb_urls = v.xpath(
                    './div/div[@class="v-thumb"]/img/@src').extract()
                if thumb_urls:
                    thumb_url = thumb_urls[0]
                    # Placeholder image means "no thumbnail".
                    if thumb_url == 'http://g1.ykimg.com/':
                        thumb_url = None
                else:
                    thumb_url = None

                pl = v.xpath(
                    './div[@class="v-meta va"]/div[@class="v-meta-entry"]/span[@class="v-num"]/text()'
                ).extract()
                if pl:
                    played = int(Util.normalize_played(pl[0]))
                else:
                    played = None
                if url:
                    items.append(
                        Request(url=url[0],
                                callback=self.parse_episode,
                                meta={
                                    'audit': audit,
                                    'thumb_url': thumb_url,
                                    'played': played,
                                    'cat_name': cat_name,
                                    'show_id': show_id,
                                    'priority': priority
                                }))

            # Pagination state embedded in the page scripts.
            last_str = response.selector.re(u'\'last_str\':\'([^\']*)\'')
            ajax_url = response.selector.re(u'\'ajax_url\':\'([^\']*)\'')

            # Request the two sibling AJAX fragments of this listing page.
            if ajax_url:
                sibling_page = (3 * page - 1, 3 * page)
                for p in sibling_page:
                    s = last_str[0] if last_str else u''
                    para = {
                        "v_page": str(page),
                        "page_num": str(p),
                        "page_order": "1",
                        "last_str": s
                    }
                    items.append(
                        FormRequest(url=self.url_prefix + ajax_url[0] +
                                    "fun_ajaxload/",
                                    formdata=para,
                                    method='GET',
                                    callback=self.parse_video_page,
                                    meta={
                                        'audit': audit,
                                        'cat_name': cat_name,
                                        'show_id': show_id,
                                        'priority': priority
                                    }))

            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_episode(self, response):
        """Build an EpisodeItem from an episode page; chain the total-play
        page when available, otherwise emit the item directly."""
        try:
            logging.log(logging.INFO, 'episode:%s' % response.request.url)
            audit = response.request.meta['audit']
            thumb_url = response.request.meta['thumb_url']
            played = response.request.meta['played']
            cat_name = response.request.meta['cat_name']
            owner_show_id = response.request.meta['show_id']
            priority = response.request.meta['priority']

            items = []

            # Owner link on the page overrides the show_id from the order.
            owner = response.xpath(
                '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
            ).extract()
            if owner:
                owner_show_id = Util.get_owner(owner[0])

            # Video info embedded in the page and its scripts.
            title = response.xpath(
                '//div[@class="base_info"]/h1/descendant-or-self::text()'
            ).extract()
            scripts = response.xpath('//script[@type="text/javascript"]')
            video_id = scripts.re(r'videoId = \'(\d+)\'')
            tag = scripts.re(r'tags="(.+)"')
            upload = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
            ).extract()
            description = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
            ).extract()
            vp_url = response.xpath(
                '//span[@id="videoTotalPV"]/../../@href').extract()

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(response.request.url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                t = "".join(title)
                t = t.strip("\n").strip()
                ep_item['title'] = Util.strip_title(t)
            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            # Category comes from the order's 'user' field, not the page.
            ep_item['category'] = cat_name
            if upload:
                t = Util.get_upload_time(upload[0])
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), t)
            if description:
                ep_item['description'] = description[0]

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['audit'] = audit
            ep_item['format_id'] = self.format_id
            ep_item['thumb_url'] = thumb_url
            ep_item['played'] = played
            ep_item['priority'] = priority

            if vp_url:
                items.append(
                    Request(url=vp_url[0],
                            callback=self.parse_vpaction,
                            meta={'item': ep_item}))
            else:
                items.append(ep_item)

            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_vpaction(self, response):
        """Read the accurate total-play count from the video action page, then
        chain the play-length JSON request."""
        try:
            logging.log(logging.INFO, 'vpaction:%s' % response.request.url)
            item = response.request.meta['item']

            vp = response.xpath(
                '//ul[@class="player_info"]/li[@class="sum"]/text()').extract(
                )
            if vp:
                item['played'] = Util.normalize_played(
                    Util.normalize_vp(vp[0].replace('总播放:', '')))

            show_id = item['show_id']
            return Request(url=self.playlength_url + show_id,
                           callback=self.parse_playlength,
                           meta={'item': item})
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_playlength(self, response):
        """Read the video duration (whole seconds) from the play/get.json
        endpoint and emit the completed item."""
        try:
            logging.log(logging.INFO, 'playlength:%s' % response.request.url)
            item = response.request.meta['item']

            jinfo = json.loads(response.body)
            playlength = str(int(float(jinfo["data"]["video"]["seconds"])))
            if playlength:
                item['duration'] = playlength
            return item
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_owner(self, response):
        """Build a UserItem from a channel owner's profile page.

        NOTE(review): only referenced from commented-out code in this class —
        confirm it is still wired up before relying on it.
        """
        try:
            logging.log(logging.INFO, "owner:%s" % response.request.url)
            show_id = response.request.meta['show_id']

            items = []
            user_item = UserItem()
            # Numeric owner id embedded in the page head scripts.
            script = response.xpath('/html/head/script')
            owner_id = script.re(r'ownerId = \"(\d+)\"')
            if owner_id:
                user_item['owner_id'] = owner_id[0]
            user_item['show_id'] = show_id

            # User profile: name, total plays, fans.
            up = response.xpath('//div[@class="profile"]')
            if up:
                user_name = up.xpath(
                    './div[@class="info"]/div[@class="username"]/a[1]/@title'
                ).extract()
                played = up.xpath(
                    './div[@class="state"]/ul/li[@class="vnum"]/em/text()'
                ).extract()
                fans = up.xpath(
                    './div[@class="state"]/ul/li[@class="snum"]/em/text()'
                ).extract()

                if user_name:
                    user_item['user_name'] = user_name[0]
                if played:
                    user_item['played'] = Util.normalize_played(
                        Util.normalize_vp(played[0]))
                if fans:
                    user_item['fans'] = Util.normalize_vp(fans[0])

            # Channel introduction text.
            yp = response.xpath('//div[@class="YK-profile"]')
            if yp:
                intro = yp.xpath(
                    './div[@class="userintro"]/div[@class="desc"]/p[2]/text()'
                ).extract()

                if intro:
                    user_item['intro'] = ''.join(intro)

            # Video count, shown as "(N)" next to the section title.
            yh = response.xpath('//div[@class="YK-home"]')
            vcount = None
            if yh:
                video_count = yh.xpath(
                    'div[1]/div/div/div/div[@class="title"]/span/a/text()').re(
                        u'\((\d+)\)')
                if video_count:
                    vcount = video_count[0]

            user_item['vcount'] = vcount
            user_item['spider_id'] = self.spider_id
            user_item['site_id'] = self.site_id
            user_item['url'] = response.request.url

            items.append(user_item)
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
# ---- Пример #10 (Example #10) ----
class letv_cat(Spider):
    """Spider for le.com (LeTV) category listing pages.

    For every configured category URL it crawls both the HTML listing
    page and the JSON paging API (first ``_max_page`` pages), follows
    each video link, and emits one EpisodeItem per video with title,
    tags, upload time, thumbnail and duration.
    """
    name = "letv_cat"
    pipelines = ['MysqlStorePipeline']
    spider_id = "524288"
    site_id = "15"
    max_search_page = 1

    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        """Load the category URL list for 'letv' from the database."""
        super(letv_cat, self).__init__(*args, **kwargs)
        self._cat_urls = []
        # JSON paging API; the c/d/md/o/p/t parameters are recovered
        # from the category URL by parse_info_from_url().
        self._page_api = "http://list.le.com/apin/chandata.json?c=%s&d=%s&md=%s&o=%s&p=%s&t=%s"
        # Canonical play-page URL, filled with a video id.
        self._le_url = "http://www.le.com/ptv/vplay/%s.html"
        self._max_page = 5
        try:
            self._cat_urls = self.mgr.get_cat_url('letv')
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def start_requests(self):
        """Seed one HTML listing request plus ``_max_page`` JSON API
        requests for every configured category.

        Category name, audit flag and priority ride along in request
        meta for the downstream parsers.
        """
        try:
            items = []
            for cat in self._cat_urls:
                meta = {
                    'cat_name': cat['cat_name'],
                    'audit': cat['audit'],
                    'priority': cat['priority']
                }
                items.append(
                    Request(url=cat['url'],
                            callback=self.parse_page,
                            meta=dict(meta)))
                ret = self.parse_info_from_url(cat['url'])
                for p in range(self._max_page):
                    url = self._page_api % (ret['c'], ret['d'], ret['md'],
                                            ret['o'], str(p + 1), ret['t'])
                    items.append(
                        Request(url=url,
                                callback=self.parse_page_json,
                                meta=dict(meta)))
            return items
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page_json(self, response):
        """Parse one page of the JSON listing API and queue each video's
        play page for parse_episode."""
        try:
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            items = []
            json_data = json.loads(response.body)
            for entry in json_data['data_list']:
                images = entry['images']
                # Only videos exposing the 180*135 thumbnail are kept.
                if '180*135' not in images:
                    continue
                items.append(
                    Request(url=self._le_url % (entry['vid']),
                            callback=self.parse_episode,
                            meta={
                                'cat_name': cat_name,
                                'thumb': images['180*135'],
                                'audit': audit,
                                'lens': entry['duration'],
                                'priority': priority
                            }))
            return items
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        """Parse an HTML category listing page and queue each video's
        play page for parse_episode."""
        try:
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']

            items = []
            # One <dl> per video entry on the listing page.
            for v in response.xpath('//div[@class="layout"]/dl'):
                try:
                    thumb = v.xpath('./dt/a/img/@src').extract()
                    url = v.xpath('./dt/a/@href').extract()
                    # BUG FIX: the duration cell used to be indexed with
                    # [0] *outside* this per-item try, so one malformed
                    # entry aborted the whole page.
                    lens_sel = v.xpath(
                        './dt/a/span[@class="number_bg"]/text()').extract()
                    lens = lens_sel[0] if lens_sel else ''
                    if not lens:
                        lens = 0
                    else:
                        # "mm:ss" -> seconds
                        minutes, seconds = lens.split(':')
                        lens = int(minutes) * 60 + int(seconds)

                    items.append(
                        Request(url=url[0].strip(),
                                callback=self.parse_episode,
                                meta={
                                    'cat_name': cat_name,
                                    'thumb': thumb,
                                    'audit': audit,
                                    'lens': lens,
                                    'priority': priority
                                }))
                except Exception:
                    # Skip malformed entries; keep processing the page.
                    continue
            return items
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        """Scrape a le.com play page into an EpisodeItem."""
        try:
            log.msg('parse_episode %s' % response.request.url)
            cat_name = response.request.meta['cat_name']
            thumb_url = response.request.meta['thumb']
            audit = response.request.meta['audit']
            lens = response.request.meta['lens']
            priority = response.request.meta['priority']
            items = []

            # Site-specific show id parsed from the play-page URL.
            show_id = Util.get_letv_showid(response.request.url)
            # NOTE: albumid is extracted but currently unused downstream.
            albumid = response.selector.re(re.compile(r'pid: ?(\d+)'))

            title = response.xpath(
                '//meta[@name="irTitle"]/@content').extract()
            upload_time = response.xpath(
                '//ul[@class="info_list"]//em[@id="video_time"]/text()'
            ).extract()
            tag_sel = response.xpath(
                '//meta[@name="keywords"]/@content').extract()

            ep_item = EpisodeItem()
            if title:
                ep_item['title'] = title[0]
            if show_id:
                ep_item['show_id'] = show_id
            # The keywords meta starts with "<title>,"; strip that prefix
            # and split the remainder on spaces and commas.
            # BUG FIX: also require a non-empty title -- the old code
            # indexed title[0] here even when the title list was empty.
            if tag_sel and title:
                tag_str = tag_sel[0][len(title[0]) + 1:]
                if tag_str:
                    tag_list = []
                    for chunk in tag_str.split(' '):
                        tag_list.extend(chunk.split(','))
                    ep_item['tag'] = "|".join([t.strip() for t in tag_list])
            if upload_time:
                ep_item['upload_time'] = upload_time[0].strip()
            # BUG FIX: the JSON listing passes the thumbnail as a plain
            # string while the HTML listing passes a list; the old code
            # always indexed [0], storing just the first character for
            # JSON-sourced items.
            if thumb_url:
                if isinstance(thumb_url, list):
                    ep_item['thumb_url'] = thumb_url[0].strip()
                else:
                    ep_item['thumb_url'] = thumb_url.strip()

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority
            ep_item['duration'] = lens
            items.append(ep_item)

            return items
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_info_from_url(self, url):
        """Recover the listing parameters (c/t/md/o/d/p) embedded in a
        category URL of the form ".../c<..>_t<..>_... .html".

        Returns a dict mapping each recognized key to the remainder of
        its underscore-separated token; keys absent from the URL are
        simply missing from the result.
        """
        result = {}
        keys = ['c', 't', 'md', 'o', 'd', 'p']
        info_str = url.split('/')[-1].split('.')[0]
        for token in info_str.split('_'):
            for key in keys:
                if token[0:len(key)] == key:
                    result[key] = token[len(key):]
        return result
Пример #11
0
class toutiao_video(Spider):
    """Spider scaffold for toutiao.com's video feed API.

    NOTE(review): large parts of this class look copied from an iqiyi
    spider and are not functional yet -- see the inline notes.  Only
    start_requests -> parse_first currently executes, and parse_first
    returns before doing any real work.
    """
    name = "toutiao_video"
    pipelines = ['MysqlStorePipeline']
    spider_id = "123456"
    site_id = "101"  # NOTE(review): original comment said "iqiyi" -- id/site mismatch suggests copy-paste; confirm
    #allowed_domains = ["list.iqiyi.com","www.iqiyi.com","cache.video.iqiyi.com"]
    #url_prefix = 'http://list.iqiyi.com'
    #playnum_url = 'http://cache.video.iqiyi.com/jp/pc/'
    #playlength_url = "http://cache.video.iqiyi.com/a/"
    # Feed endpoints; the %s placeholders take Unix timestamps.
    url_first = "http://toutiao.com/api/article/recent/?source=2&category=video&as=A165771AD802ED5&cp=57A8D2FE6D658E1&_=%s"
    url_second = "http://toutiao.com/api/article/recent/?source=2&count=20&category=video&max_behot_time=%s&utm_source=toutiao&offset=0&as=A1A5A75A8882EDC&cp=57A852EE9D5C6E1&max_create_time=%s&_=%s"
    max_search_page = 1000

    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        """Set up the (currently placeholder) category list."""
        super(toutiao_video, self).__init__(*args, **kwargs)
        self._cat_urls = []
        try:
            # Single empty placeholder; the DB lookup below is disabled.
            self._cat_urls = [""]
            #self._cat_urls = self.mgr.get_ordered_url(site_name='iqiyi')
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def start_requests(self):
        """Issue a single request for the first feed page."""
        try:
            items = []
            #for cat in self._cat_urls:
            url = self.url_first % int(time.time())
            items.append(Request(url=url, callback=self.parse_first))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_first(self, response):
        """Parse the first feed page.

        Currently only dumps the decoded JSON and returns an empty list.
        """
        try:
            items = []
            user_item = UserItem()
            data = json.loads(response.body)
            print data
            return items
            # NOTE(review): everything below is unreachable dead code
            # (after the early return) and references undefined names
            # (show_id, tag, upload_time, category, thumb_url, cat_name,
            # audit, priority) -- a half-finished copy from another
            # spider.  Rework before removing the early return.
            has_more = data.get("has_more")
            message = data.get("message")
            max_behot_time = data.get("max_behot_time")
            data = data.get("data")
            if data:
                for it in data:
                    ep_item = EpisodeItem()
                    ep_item['title'] = it["title"]
                ep_item['show_id'] = show_id
                ep_item['tag'] = "|".join([t.strip() for t in tag])
                ep_item['upload_time'] = upload_time[0].strip()
            if category:
                ep_item['category'] = category[0].strip()
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            #ep_item['cat_id'] = cat_id
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority

            print type(data)
            #items.append(Request(url=urls, callback=self.parse_page))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        """Parse a category listing page and queue each video's play page.

        NOTE(review): nothing currently requests this callback; the
        xpaths target iqiyi markup.
        """
        try:
            items = []
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            #video items
            qy_v = response.xpath(
                '//div[@class="wrap-customAuto-ht "]/ul/li/div[1]')
            for v in qy_v:
                thumb = v.xpath('./a/img/@src').extract()
                url = v.xpath('./a/@href').extract()
                items.append(
                    Request(url=url[0].strip(),
                            callback=self.parse_episode,
                            meta={
                                'thumb': thumb,
                                "cat_name": cat_name,
                                'audit': audit,
                                'priority': priority
                            }))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        """Scrape an (iqiyi-style) play page into an EpisodeItem.

        NOTE(review): nothing currently requests this callback.
        """
        try:
            log.msg('parse_episode %s' % response.request.url)
            thumb_url = response.request.meta['thumb']
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            items = []

            #show_id
            show_id = Util.get_iqiyi_showid(response.request.url)
            #print "show_id:    %s" % show_id
            #space maybe exist: "albumId:326754200" or "albumId: 326754200"
            albumid = response.selector.re(re.compile(r'albumId: ?(\d+)'))

            # Title: the page markup varies, so several xpaths are tried
            # in order until one matches.
            title = response.xpath(
                '//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()'
            ).extract()
            if not title:
                title = response.xpath(
                    '//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()'
                ).extract()
            if not title:
                title = response.xpath(
                    '//div[@class="mod-play-t**s"]/h1/descendant-or-self::*/text()'
                ).extract()
            if not title:
                title = response.xpath(
                    '//div[@class="play-tit play-tit-oneRow play-tit-long"]/h1/descendant-or-self::*/text()'
                ).extract()

            # Category breadcrumb: same multi-variant fallback approach.
            category = response.xpath(
                '//div[@class="crumb_bar"]/span[1]/span/a[2]/text()').extract(
                )
            if not category:
                category = response.xpath(
                    '//div[@class="play-album-crumbs textOverflow"]/span[1]/a[2]/text()'
                ).extract()
            if not category:
                category = response.xpath(
                    '//div[@class="crumb_bar"]/span[1]/a[2]/text()').extract()
            if not category:
                category = response.xpath(
                    '//div[@class="mod-crumb_bar"]/span[1]/a[2]/text()'
                ).extract()

            upload_time = response.xpath(
                '//div[@class="crumb_bar"]/span[3]/span/text()').extract()
            if not upload_time:
                upload_time = response.xpath(
                    '//div[@class="crumb_bar"]/span[2]/span/text()').extract()

            tag = response.xpath(
                '//span[@id="widget-videotag"]/descendant::*/text()').extract(
                )
            if not tag:
                tag = response.xpath(
                    '//span[@class="mod-tags_item vl-block"]/descendant::*/text()'
                ).extract()
            if not tag:
                tag = response.xpath(
                    '//div[@class="crumb_bar"]/span[2]/a/text()').extract()

            ep_item = EpisodeItem()

            if title:
                ep_item['title'] = "".join([t.strip() for t in title])
            if show_id:
                ep_item['show_id'] = show_id
            if tag:
                ep_item['tag'] = "|".join([t.strip() for t in tag])
            if upload_time:
                ep_item['upload_time'] = upload_time[0].strip()
            #if category:
            #    ep_item['category'] = category[0].strip()
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            #ep_item['cat_id'] = cat_id
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority

            if albumid:
                # NOTE(review): self.playlength_url is commented out in
                # the class attributes above, so this branch would raise
                # AttributeError if albumid ever matched -- confirm.
                items.append(
                    Request(url=self.playlength_url + albumid[0],
                            callback=self.parse_playlength,
                            meta={
                                'item': ep_item,
                                'albumid': albumid[0]
                            }))
            else:
                items.append(ep_item)

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playlength(self, response):
        """Extract the duration from the album-info JSONP payload, then
        chain to the play-count endpoint.

        NOTE(review): relies on self.playnum_url, which is commented out
        in the class attributes -- would raise AttributeError; confirm.
        """
        try:
            log.msg('parse_playlength ,%s' % response.request.url)
            item = response.request.meta['item']
            albumid = response.request.meta['albumid']

            items = []
            #sel = Selector(response)
            # The body is JSONP: skip everything up to "AlbumInfo=" and
            # decode the remainder as JSON.
            msg = response.body
            index = msg.find("AlbumInfo=") + len("AlbumInfo=")
            info = msg[index:]
            jinfo = json.loads(info)
            plsylength = jinfo["data"]["playLength"]
            #if plsylength:
            #if int(plsylength) < 600:
            #item['duration'] = str(plsylength)
            #items.append(Request(url=self.playnum_url+albumid+"/?qyid=", callback=self.parse_playnum, meta={'item':item}))
            item['duration'] = str(plsylength)
            items.append(
                Request(url=self.playnum_url + albumid + "/?qyid=",
                        callback=self.parse_playnum,
                        meta={'item': item}))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playnum(self, response):
        """Extract the play count from the stats response and finish the
        item."""
        try:
            #log.msg('parse_playnum ,%s' % response.request.url)
            item = response.request.meta['item']

            items = []
            #sel = Selector(response)
            # First ":<digits>" in the body is the play count.
            tplaynum = response.selector.re(re.compile(r':(\d+)'))
            #log.msg('play: %s, %s' % (tplaynum[0], response.request.url))
            if tplaynum:
                playnum = tplaynum[0]
                item['played'] = str(playnum)
                items.append(item)

            return items

        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
Пример #12
0
class YoukuOrderSpider(Spider):
    """Crawl ordered (subscribed) Youku channels.

    For each ordered channel URL: scrape the owner's profile into a
    UserItem, walk every "/videos" listing page, and emit one
    EpisodeItem per video enriched with play count and duration.  Each
    video's recommendation list is followed one level deep; recommended
    videos are additionally filtered by category.
    """
    name = "youku_order_history"
    pipelines = ['MysqlStorePipeline']
    spider_id = "256"
    site_id = "1"
    allowed_domains = [
        "i.youku.com", "www.youku.com", "v.youku.com", "ykrec.youku.com"
    ]
    url_prefix = 'http://i.youku.com'
    # Per-video play-count page.
    vpaction_url = "http://v.youku.com/v_vpactionInfo/id/"
    # Player JSON endpoint; returns the duration in seconds.
    playlength_url = "http://v.youku.com/player/getPlayList/VideoIDS/"
    # Recommendation-list JSON endpoint.
    ykrec_url = "http://ykrec.youku.com/video/packed/list.json?site=1&pg=1&module=2&pl=20&vid="

    mgr = DbManager.instance()
    channel_exclude = mgr.get_channel_exclude()
    cat_exclude = mgr.get_cat_exclude()

    def __init__(self, orders=None, *args, **kwargs):
        """orders: optional JSON-encoded list of {'url': ...} dicts;
        falls back to the ordered URLs stored in the DB."""
        super(YoukuOrderSpider, self).__init__(*args, **kwargs)
        if orders:
            orders = json.loads(orders)
        else:
            orders = self.mgr.get_ordered_url(site_name='youku')
        self._orders = orders if orders else []
        # BUG FIX: this used to be assigned to a local variable, so the
        # spider never received any start URLs.
        self.start_urls = [r['url'] for r in self._orders]

    def parse(self, response):
        """Parse a channel home page into a UserItem and queue its
        "/videos" listing."""
        try:
            log.msg(response.request.url, level=log.INFO)
            items = []

            user_item = UserItem()
            # Owner ids are embedded in inline scripts in the page head.
            script = response.xpath('/html/head/script')
            owner_id = script.re('ownerId = \"(\d+)\"')
            show_id = script.re('ownerEncodeid = \'(.+)\'')
            if owner_id:
                user_item['owner_id'] = owner_id[0]
            if show_id:
                user_item['show_id'] = show_id[0]
            else:
                # Without a show id the channel cannot be keyed; skip it.
                return

            # User profile block: name, total plays, subscriber count.
            up = response.xpath('//div[@class="profile"]')
            if up:
                user_name = up.xpath(
                    './div[@class="info"]/div[@class="username"]/a[1]/@title'
                ).extract()
                played = up.xpath(
                    './div[@class="state"]/ul/li[@class="vnum"]/em/text()'
                ).extract()
                fans = up.xpath(
                    './div[@class="state"]/ul/li[@class="snum"]/em/text()'
                ).extract()

                if user_name:
                    user_item['user_name'] = user_name[0]
                if played:
                    user_item['played'] = Util.normalize_vp(played[0])
                if fans:
                    user_item['fans'] = Util.normalize_vp(fans[0])

            # Channel self-introduction text.
            yp = response.xpath('//div[@class="YK-profile"]')
            if yp:
                intro = yp.xpath(
                    './div[@class="userintro"]/div[@class="desc"]/p[2]/text()'
                ).extract()
                if intro:
                    user_item['intro'] = ''.join(intro)

            # Total video count, shown as "(123)" in the section title.
            yh = response.xpath('//div[@class="YK-home"]')
            vcount = '0'
            if yh:
                video_count = yh.xpath(
                    'div[1]/div/div/div/div[@class="title"]/span/a/text()').re(
                        u'\((\d+)\)')
                if video_count:
                    vcount = video_count[0]

            user_item['vcount'] = vcount
            user_item['spider_id'] = self.spider_id
            user_item['site_id'] = self.site_id

            items.append(user_item)

            # Walk the channel's video listing pages.
            items.append(
                Request(url=response.request.url + "/videos",
                        callback=self.parse_video_page,
                        meta={'page': 1}))

            return items

        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_video_page(self, response):
        """Parse one listing page: queue every video, the two AJAX-loaded
        sibling sub-pages of the current page, and the next page."""
        try:
            page = response.request.meta['page']
            log.msg('%s: %s' % (response.request.url, page))

            items = []

            # Video cells on this page.
            for v in response.xpath('//div[@class="yk-col4"]/div'):
                url = v.xpath('./div[@class="v-link"]/a/@href').extract()
                if url:
                    # (A played-count threshold filter used to live here;
                    # it is intentionally disabled.)
                    items.append(
                        Request(url=url[0],
                                callback=self.parse_episode,
                                meta={'recommend': False}))

            # Pagination state embedded in inline script.
            last_str = response.selector.re(u'\'last_str\':\'([^\']*)\'')
            ajax_url = response.selector.re(u'\'ajax_url\':\'([^\']*)\'')

            # Each visible page N has AJAX sub-pages 3N-1 and 3N.
            if ajax_url:
                sibling_page = (3 * page - 1, 3 * page)
                for p in sibling_page:
                    s = last_str[0] if last_str else u''
                    para = {
                        "v_page": str(page),
                        "page_num": str(p),
                        "page_order": "1",
                        "last_str": s
                    }
                    items.append(
                        FormRequest(url=self.url_prefix + ajax_url[0] +
                                    "fun_ajaxload/",
                                    formdata=para,
                                    method='GET',
                                    callback=self.parse_video_page,
                                    meta={'page': page}))

            # Follow the "next" pager link, if any.
            next_page = response.xpath(
                '//ul[@class="YK-pages"]/li[@class="next"]/a/@href').extract()
            if next_page:
                items.append(
                    Request(url=self.url_prefix + next_page[0],
                            callback=self.parse_video_page,
                            meta={'page': page + 1}))

            return items

        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        """Scrape one video play page into an EpisodeItem; chain to the
        play-count/duration endpoints when a video id is present."""
        try:
            recommend = response.request.meta['recommend']
            log.msg('%s|recommend: %s' % (response.request.url, recommend))
            items = []

            # Skip videos whose owner is on the exclusion list.
            owner = response.xpath(
                '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
            ).extract()
            owner_show_id = None
            if owner:
                owner_show_id = Util.get_owner(owner[0])
                if owner_show_id in self.channel_exclude:
                    log.msg("video owner excluded: %s" % owner_show_id)
                    return

            # Recommended videos are additionally filtered by category.
            category = response.xpath(
                '//div[@class="base_info"]/div[@class="guide"]/div/a/text()'
            ).extract()
            cat = None
            if category:
                cat = category[0].replace(u'频道', '')
            if recommend and cat:
                if cat in self.cat_exclude:
                    log.msg("video category excluded: %s" % cat)
                    return

            # Core video info.
            title = response.xpath(
                '//div[@class="base_info"]/h1/descendant-or-self::*/text()'
            ).extract()
            scripts = response.xpath('//script[@type="text/javascript"]')
            video_id = scripts.re('videoId = \'(\d+)\'')
            tag = scripts.re('tags="(.+)"')
            upload = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
            ).extract()
            description = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
            ).extract()

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(response.request.url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                ep_item['title'] = Util.strip_title("".join(title))
            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            if cat:
                ep_item['category'] = cat
            if upload:
                # Relative "uploaded X ago" text -> absolute datetime.
                t = Util.get_upload_time(upload[0])
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), t)
            if description:
                ep_item['description'] = description[0]

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url

            if video_id:
                # Enrich via play-count page before emitting the item.
                items.append(
                    Request(url=self.vpaction_url + video_id[0],
                            callback=self.parse_vpaction,
                            meta={'item': ep_item}))
            else:
                items.append(ep_item)

            # Follow the recommendation list one level deep.
            # BUG FIX: guard on video_id -- the old code indexed
            # video_id[0] here even when no video id was found.
            if not recommend and video_id:
                items.append(
                    Request(url=self.ykrec_url + video_id[0],
                            callback=self.parse_recommendation))

            return items
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_vpaction(self, response):
        """Extract the total play count, then chain to the duration
        endpoint."""
        try:
            item = response.request.meta['item']

            vp = response.xpath('//div[@id="videodetailInfo"]/ul/li').re(
                u'<label>总播放数:</label><span.*>(.+)</span>')
            if vp:
                item['played'] = Util.normalize_vp(vp[0])

            return Request(url=self.playlength_url + item['show_id'],
                           callback=self.parse_playlength,
                           meta={'item': item})

        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playlength(self, response):
        """Read the video duration (seconds) from the player JSON and
        finish the item."""
        try:
            item = response.request.meta['item']

            jinfo = json.loads(response.body)
            # Seconds may arrive as a float string; store an int string.
            item['duration'] = str(int(float(jinfo["data"][0]["seconds"])))

            return item
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_recommendation(self, response):
        """Queue every recommended video for parsing, flagged so it is
        not recursed into again."""
        try:
            log.msg('%s' % response.request.url)
            items = []

            rec_data = json.loads(response.body)
            for v in rec_data['data']:
                items.append(
                    Request(url='http://v.youku.com/v_show/id_%s.html' %
                            v['codeId'],
                            callback=self.parse_episode,
                            meta={'recommend': True}))

            return items
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
Пример #13
0
class YoukuCatNewestSpider(Spider):
    """Scrapy spider crawling the "newest" listings of configured Youku
    category pages; emits EpisodeItem and UserItem records into the
    MySQL store pipeline."""
    name = "youku_cat_newest"
    pipelines = ['MysqlStorePipeline']
    spider_id = "1"  # internal spider identifier stamped on every item
    site_id = "1"  # internal site identifier for youku
    format_id = 2
    #allowed_domains = ["www.youku.com", "v.youku.com", "i.youku.com", "index.youku.com", "play.youku.com"]
    url_prefix = 'http://www.youku.com'
    # JSON service returning play metadata (incl. duration) for a video id
    playlength_url = "http://play.youku.com/play/get.json?ct=10&vid="
    mgr = DbManager.instance()  # shared DB-access singleton

    def __init__(self, *args, **kwargs):
        """Load category URLs to crawl, either from the 'cat_urls' kwarg
        (a JSON-encoded list) or, by default, from the database."""
        super(YoukuCatNewestSpider, self).__init__(*args, **kwargs)
        cat_urls = kwargs.get('cat_urls')
        if cat_urls:
            cat_urls = json.loads(cat_urls)
        else:
            cat_urls = self.mgr.get_cat_url("youku")
        if cat_urls:
            self._cat_urls = cat_urls
        else:
            self._cat_urls = []

    def start_requests(self):
        """Issue one listing-page request per configured category, passing
        the category's DB attributes along in the request meta."""
        try:
            items = []
            for cat in self._cat_urls:
                print cat
                items.append(
                    Request(url=cat['url'],
                            callback=self.parse_page,
                            meta={
                                'cat_id': cat['id'],
                                'cat_name': cat['cat_name'],
                                'audit': cat['audit'],
                                'priority': cat['priority']
                            }))
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_page(self, response):
        """Parse a category listing page and emit one parse_episode request
        per video tile found; two page layouts are handled."""
        try:
            logging.log(logging.INFO, 'page:%s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            audit = response.request.meta['audit']
            cat_name = response.request.meta['cat_name']
            priority = response.request.meta['priority']

            #page = response.request.meta['page']
            #if int(page) > int(self.max_search_page):
            #    return
            items = []
            #video items
            #yk_v = response.xpath('//div[@class="yk-col4"]')
            '''
            yk_v = response.xpath('//div[@id="getVideoList"]/div[@class="yk-row yk-v-90u"]/div[@class="yk-col4"]')
            for v in yk_v:
                url = v.xpath('./div/div[@class="v-link"]/a/@href').extract()
                thumb_urls = v.xpath('./div/div[@class="v-thumb"]/img/@src').extract()
            '''
            # Layout 1: games, lifestyle, travel, comedy sections
            yk_v = response.xpath(
                '//div[@class="vaule_main"]/div[@class="box-video"]/div[@class="yk-row"]/div'
            )
            for v in yk_v:
                url = v.xpath('./div/div/a/@href').extract()
                thumb_urls = v.xpath('./div/div/img/@src').extract()
                if thumb_urls:
                    thumb_url = thumb_urls[0]
                    # This bare host appears to be youku's placeholder image;
                    # treat it as "no thumbnail".
                    if thumb_url == 'http://g1.ykimg.com/':
                        thumb_url = None
                else:
                    thumb_url = None

                if url:
                    items.append(
                        Request(url=url[0],
                                callback=self.parse_episode,
                                meta={
                                    'cat_id': cat_id,
                                    'cat_name': cat_name,
                                    'audit': audit,
                                    'thumb_url': thumb_url,
                                    'priority': priority
                                }))
            # Layout 2: news, parenting, military sections
            yk_v2 = response.xpath(
                '//div[@class="yk-box"]/div[@class="yk-body"]/div[@class="yk-row"]/div[@class="yk-col4"]'
            )
            for v in yk_v2:
                url = v.xpath('./div/div[@class="v-link"]/a/@href').extract()
                thumb_urls = v.xpath(
                    './div/div[@class="v-thumb"]/img/@src').extract()
                if thumb_urls:
                    thumb_url = thumb_urls[0]
                    if thumb_url == 'http://g1.ykimg.com/':
                        thumb_url = None
                else:
                    thumb_url = None
                if url:
                    items.append(
                        Request(url=url[0],
                                callback=self.parse_episode,
                                meta={
                                    'cat_id': cat_id,
                                    'cat_name': cat_name,
                                    'audit': audit,
                                    'thumb_url': thumb_url,
                                    'priority': priority
                                }))
            '''
            #pages
            next_page = response.xpath('//div[@class="yk-pager"]/ul/li[@class="next"]/a/@href').extract()
            if next_page:
                items.append(Request(url=self.url_prefix+next_page[0], callback=self.parse_page, meta={'page': page+1, 'cat_id': cat_id}))
            '''
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_episode(self, response):
        """Parse one video watch page into an EpisodeItem; also queue an
        owner-profile request and, when available, a play-count page
        request that continues the item pipeline."""
        try:
            logging.log(logging.INFO, "episode:%s" % response.request.url)
            cat_id = response.request.meta['cat_id']
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            thumb_url = response.request.meta['thumb_url']
            priority = response.request.meta['priority']

            items = []

            #owner
            owner = response.xpath(
                '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
            ).extract()
            owner_show_id = None
            if owner:
                owner_show_id = Util.get_owner(owner[0])
                items.append(Request(url=owner[0], callback=self.parse_owner))

            #video info
            title = response.xpath(
                '//div[@class="base_info"]/h1/descendant-or-self::text()'
            ).extract()
            #category = response.xpath('//div[@class="base_info"]/div[@class="guide"]/div/a/text()').extract()
            scripts = response.xpath('//script[@type="text/javascript"]')
            # videoId and tags are embedded in inline JS, not in the DOM.
            video_id = scripts.re('videoId = \'(\d+)\'')
            tag = scripts.re('tags="(.+)"')
            upload = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
            ).extract()
            description = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
            ).extract()
            vp_url = response.xpath(
                '//span[@id="videoTotalPV"]/../../@href').extract()

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(response.request.url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                # Title text may be split over several nodes; join and trim.
                t = "".join(title)
                t = t.strip("\n").strip()
                ep_item['title'] = Util.strip_title(t)
            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            #if category:
            #    ep_item['category'] = category[0].replace(u'频道', '')
            ep_item['category'] = cat_name
            if upload:
                # The page shows a relative upload time; convert it to an
                # absolute datetime via a delta against "now".
                t = Util.get_upload_time(upload[0])
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), t)
            if description:
                ep_item['description'] = description[0]

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['cat_id'] = cat_id
            ep_item['audit'] = audit
            ep_item['format_id'] = self.format_id
            ep_item['thumb_url'] = thumb_url
            ep_item['priority'] = priority

            if vp_url:
                # Continue the chain: fetch the play-count page first.
                items.append(
                    Request(url=vp_url[0],
                            callback=self.parse_vpaction,
                            meta={'item': ep_item}))
            else:
                # No play-count page; emit the item directly (if it has an id).
                if ep_item['show_id']:
                    items.append(ep_item)

            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_vpaction(self, response):
        """Scrape the total-play count, then chain a request for the video
        duration via the play-length JSON service."""
        try:
            logging.log(logging.INFO, response.request.url)
            item = response.request.meta['item']
            vp = response.xpath(
                '//ul[@class="player_info"]/li[@class="sum"]/text()').extract(
                )
            if vp:
                item['played'] = Util.normalize_played(
                    Util.normalize_vp(vp[0].replace('总播放:', '')))
            show_id = item['show_id']
            item = Request(url=self.playlength_url + show_id,
                           callback=self.parse_playlength,
                           meta={'item': item})
            return item
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_playlength(self, response):
        """Store the video duration (seconds) from the JSON service and
        return the finished item."""
        try:
            logging.log(logging.INFO, response.request.url)
            item = response.request.meta['item']
            showid = item["show_id"]

            msg = response.body
            jinfo = json.loads(msg)
            # NOTE(review): this str() result is always truthy, so the guard
            # below never skips; also duration is stored as int here but as
            # str in sibling spiders — confirm which the pipeline expects.
            plsylength = str(int(float(jinfo["data"]["video"]["seconds"])))
            if plsylength:
                item['duration'] = int(plsylength)

            return item
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_owner(self, response):
        """Parse a user profile page into a UserItem (name, plays, fans,
        intro, video count); bails out when no show id is found."""
        try:
            logging.log(logging.INFO, response.request.url)
            items = []

            user_item = UserItem()
            #owner id
            script = response.xpath('/html/head/script')
            owner_id = script.re('ownerId = \"(\d+)\"')
            show_id = script.re('ownerEncodeid = \'(.+)\'')
            if owner_id:
                user_item['owner_id'] = owner_id[0]
            if show_id:
                user_item['show_id'] = show_id[0]
            else:
                # Without a show id the record is useless; drop it.
                return

            #user profile
            up = response.xpath('//div[@class="profile"]')
            if up:
                user_name = up.xpath(
                    './div[@class="info"]/div[@class="username"]/a[1]/@title'
                ).extract()
                played = up.xpath(
                    './div[@class="state"]/ul/li[@class="vnum"]/em/text()'
                ).extract()
                fans = up.xpath(
                    './div[@class="state"]/ul/li[@class="snum"]/em/text()'
                ).extract()

                if user_name:
                    user_item['user_name'] = user_name[0]
                if played:
                    #user_item['played'] = Util.normalize_vp(played[0])
                    user_item['played'] = Util.normalize_played(
                        Util.normalize_vp(played[0]))
                if fans:
                    user_item['fans'] = Util.normalize_vp(fans[0])

            #youku profile
            yp = response.xpath('//div[@class="YK-profile"]')
            if yp:
                intro = yp.xpath(
                    './div[@class="userintro"]/div[@class="desc"]/p[2]/text()'
                ).extract()

                if intro:
                    user_item['intro'] = ''.join(intro)

            #count
            yh = response.xpath('//div[@class="YK-home"]')
            vcount = None
            if yh:
                # Video count appears as "(123)" in the section title.
                video_count = yh.xpath(
                    'div[1]/div/div/div/div[@class="title"]/span/a/text()').re(
                        u'\((\d+)\)')

                if video_count:
                    vcount = video_count[0]

            user_item['vcount'] = vcount

            user_item['spider_id'] = self.spider_id
            user_item['site_id'] = self.site_id
            user_item['url'] = response.request.url

            items.append(user_item)
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
Пример #14
0
class GameFySpider(Spider):
    """Spider for gamefy.cn gaming videos: walks paginated category lists
    and emits one EpisodeItem per video page."""
    name = "gamefy_athletic_mobile"
    pipelines = ['CategoryPipeline', 'MysqlStorePipeline']
    mgr = DbManager.instance()  # shared DB-access singleton

    def __init__(self, *args, **kwargs):
        """Fixed site constants plus category URLs loaded from the DB."""
        super(GameFySpider, self).__init__(*args, **kwargs)
        self._host_name = "http://www.gamefy.cn/"
        self._category = "游戏"
        self._site_id = '11'
        self._spider_id = '131072'
        self._cat_urls = self.mgr.get_cat_url("gamefy")

    def start_requests(self):
        """Issue one list-page request per configured category.

        Note: the finally-return pattern deliberately yields whatever was
        collected before an error; the error itself is logged."""
        items = []
        try:
            for cat in self._cat_urls:
                items.append(
                    Request(url=cat['url'],
                            callback=self.parse_list,
                            meta={'cat_id': cat['id']}))

        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

        finally:
            return items

    def parse_list(self, response):
        """Parse one category list page: queue a parse_media request per
        video link, plus a request for the next page if present."""
        items = []
        try:
            cat_id = response.request.meta['cat_id']
            sels = response.xpath(
                '//div[@class="con"]//div[@class="area-col min"]//div[@class="area-block"]//a'
            )
            if sels:
                for sel in sels:
                    urls = sel.xpath('./@href').extract()
                    titles = sel.xpath('./@title').extract()
                    imgs = sel.xpath('.//img/@src').extract()

                    # NOTE(review): assumes every anchor has href, title and
                    # an <img>; a missing one raises IndexError and aborts
                    # the rest of this page (caught and logged below).
                    url = urls[0]
                    title = titles[0].encode("UTF-8")
                    img = imgs[0]
                    items.append(
                        Request(url=url,
                                callback=self.parse_media,
                                meta={
                                    'title': title,
                                    'img': img,
                                    'cat_id': cat_id
                                }))

            #get next page
            next_page_sel = response.xpath(
                '//div[@class="viciao"]/a[text()=">"]/@href').extract()
            if next_page_sel:
                next_page = next_page_sel[0]
                # Pagination links are relative; prepend the site host.
                next_page = self._host_name + next_page
                items.append(
                    Request(url=next_page,
                            callback=self.parse_list,
                            meta={'cat_id': cat_id}))

        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

        finally:
            return items

    def parse_media(self, response):
        """Parse a single video page into an EpisodeItem, using the title
        and thumbnail carried over from the list page via meta."""
        items = []
        try:
            cat_id = response.request.meta['cat_id']
            title = response.request.meta['title']
            thumb_url = response.request.meta['img']
            url = response.request.url
            # The video's show id is the 'id' query parameter of its URL.
            query = urlparse.urlparse(url).query
            query_dict = urlparse.parse_qs(query)
            show_id = query_dict['id'][0]

            #get tags
            sels = response.xpath('//span[@class="c_org1"]/a/text()').extract()
            tag = ''
            if sels:
                tag = "|".join(sels).encode("UTF-8")

            #get release time
            upload_time = ''
            sels = response.xpath(
                '//p[@class="c_gray0 lh3"]/span/text()').extract()
            if sels:
                # First 16 bytes of the span hold the 'YYYY-MM-DD HH:MM'
                # timestamp prefix.
                time_times = sels[0].encode("UTF-8")
                upload_time = time_times[0:16]

            #get play times
            played = 0
            sels = response.xpath(
                '//p[@class="c_gray0 lh3"]/span/a/text()').extract()
            if sels:
                played = sels[0].strip()

            ep_item = EpisodeItem()
            ep_item['title'] = title
            ep_item['show_id'] = show_id
            ep_item['tag'] = tag
            ep_item['upload_time'] = upload_time
            ep_item['category'] = self._category
            ep_item['thumb_url'] = thumb_url
            ep_item['spider_id'] = self._spider_id
            ep_item['site_id'] = self._site_id
            ep_item['url'] = url
            ep_item['played'] = played
            ep_item['cat_id'] = cat_id

            items.append(ep_item)

        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

        finally:
            return items
Пример #15
0
 def __init__(self):
     # Keep a reference to the process-wide DbManager singleton.
     self.__db_mgr = DbManager.instance()
Пример #16
0
class bilibili_cat(Spider):
    """Spider for bilibili category tag listings: resolves each configured
    category page to its tag-JSON endpoint and emits EpisodeItems from the
    JSON payload."""
    name = "bilibili_cat"
    pipelines = ['MysqlStorePipeline']
    spider_id = "5"  # internal spider identifier stamped on every item
    site_id = "13"  # internal site identifier for bilibili
    max_search_page = 1
    # JSON listing endpoint: (tag id, tag name) are interpolated below.
    url_prefix = "http://www.bilibili.com/index/tag/%s/default/1/%s.json"
    mgr = DbManager.instance()  # shared DB-access singleton

    def __init__(self, *args, **kwargs):
        """Load the bilibili category URLs from the database; on failure
        fall back to an empty list (error is logged)."""
        super(bilibili_cat, self).__init__(*args, **kwargs)
        self._cat_urls = []
        try:
            self._cat_urls = self.mgr.get_cat_url('bilibili')
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def start_requests(self):
        """Issue one category-page request per configured category,
        forwarding the category attributes in the request meta."""
        try:
            items = []

            for cat in self._cat_urls:
                items.extend([
                    Request(url=cat['url'],
                            callback=self.parse_page,
                            meta={
                                'cat_name': cat['cat_name'],
                                'audit': cat['audit'],
                                'priority': cat['priority']
                            })
                ])

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        """Read the currently selected tag id/name off the category page
        and queue the matching tag-JSON listing request."""
        try:
            #log.msg('parse page %s: %s' % (response.request.url, response.request.meta['page']))
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            items = []
            #video items
            id = response.xpath(
                '//div[@class="fcname"]/ul/li[@class="on"]/@tid').extract()
            tag = response.xpath(
                '//div[@class="fcname"]/ul/li[@class="on"]/a/text()').extract(
                )
            # The tag u'全部' means "All"; that layout exposes the tid in a
            # different menu structure, so re-read id/tag from there.
            if tag[0].strip() == u'全部':
                id = response.xpath(
                    '//div[@class="menu-wrapper"]/ul/li[@class="m-i  on"]/@data-tid'
                ).extract()
                tag = response.xpath(
                    '//div[@class="menu-wrapper"]/ul/li[@class="m-i  on"]/a/em/text()'
                ).extract()
            if id and tag:
                url = self.url_prefix % (id[0].strip(), tag[0].strip())
                print url
                items.append(
                    Request(url=url,
                            callback=self.parse_episode,
                            meta={
                                'cat_name': cat_name,
                                'audit': audit,
                                'priority': priority
                            }))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        """Convert the tag-JSON listing into EpisodeItems, one per entry
        in the payload's 'list' array."""
        try:
            log.msg('parse_episode %s' % response.request.url)
            #cat_id = response.request.meta['cat_id']
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            items = []
            data = json.loads(response.body)
            # NOTE(review): 'list' shadows the builtin; safe here but worth
            # renaming if this method is ever extended.
            list = data.get('list')
            for item in list:
                ep_item = EpisodeItem()
                ep_item['title'] = item.get('title')
                ep_item['show_id'] = item.get('aid')
                #ep_item['tag'] =  item.get()
                ep_item['thumb_url'] = item.get('pic')
                ep_item['spider_id'] = self.spider_id
                ep_item['site_id'] = self.site_id
                ep_item[
                    'url'] = "http://www.bilibili.com/video/av%s/" % item.get(
                        'aid')
                #ep_item['cat_id'] = cat_id
                ep_item['category'] = cat_name
                ep_item['description'] = item.get("description")
                ep_item['format_id'] = '2'
                ep_item['audit'] = audit
                ep_item['priority'] = priority
                ep_item['played'] = item.get('play')
                #ep_item['upload_time'] = item.get('create')
                # Duration arrives as "MM:SS"; convert to whole seconds.
                duration = item.get('duration')
                if duration:
                    a, b = duration.split(':')
                    duration = int(a) * 60 + int(b)
                else:
                    duration = 0
                ep_item['duration'] = duration
                items.append(ep_item)

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
Пример #17
0
class iqiyi_subject_history(Spider):
    """Spider for historical iqiyi subject listings: walks the year/month
    archive of each subject page, reads the cached JSONP listing service,
    and fills EpisodeItems from the individual watch pages."""
    name = "iqiyi_subject_history"
    pipelines = ['CategoryPipeline', 'MysqlStorePipeline']
    spider_id = "4096"  #iqiyi_order_history
    site_id = "5"  #iqiyi
    allowed_domains = [
        "list.iqiyi.com", "www.iqiyi.com", "cache.video.iqiyi.com",
        "cache.video.qiyi.com"
    ]
    url_prefix = 'http://list.iqiyi.com'
    playnum_url = 'http://cache.video.iqiyi.com/jp/pc/'
    playlength_url = "http://cache.video.iqiyi.com/a/"
    hottest_played_threshold = get_project_settings().get(
        'ORDERED_PLAYED_THRESHOLD')

    mgr = DbManager.instance()  # shared DB-access singleton

    def __init__(self, cat_urls=None, *args, **kwargs):
        """Load subject URLs either from the 'cat_urls' argument (JSON
        list, manual run with a higher page limit) or from the database
        (automatic run)."""
        super(iqiyi_subject_history, self).__init__(*args, **kwargs)
        if cat_urls:
            cat_urls = json.loads(cat_urls)
            self.max_search_page = get_project_settings().get(
                'MAX_MANUAL_SEARCH_PAGE')
        else:
            cat_urls = self.mgr.get_subjects("iqiyi")
            self.max_search_page = get_project_settings().get(
                'MAX_SEARCH_PAGE')
        if cat_urls:
            self._cat_urls = cat_urls
        else:
            self._cat_urls = []

    def start_requests(self):
        """Issue one subject-page request per configured subject."""
        try:
            items = []

            for cat in self._cat_urls:
                items.append(
                    Request(url=cat['url'],
                            callback=self.parse,
                            meta={'cat_id': cat['id']}))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    #start_urls = ["http://www.iqiyi.com/a_19rrgubpsd.html"]
    #start_urls = ["http://www.iqiyi.com/a_19rrgiavst.html#vfrm=2-3-0-1"]
    #start_urls = mgr.get_cat_url("iqiyi")

    #for each category parse all its sub-categories or types
    def parse(self, response):
        """Extract album/source/category ids from the subject page's inline
        JS, then queue one cached-listing request per archive year+month."""
        try:
            #log.msg('lev1: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            items = []
            sel = Selector(response)

            #category
            # Ids are only present in inline JavaScript, not the DOM.
            albumId = response.selector.re(re.compile(r'albumId: ?(\d+)'))[0]
            sourceid = response.selector.re(re.compile(r'sourceId: ?(\d+)'))[0]
            cid = response.selector.re(re.compile(r'cid: ?(\d+)'))[0]

            years = []
            subs = sel.xpath(
                '//div[@id="block-J"]/div[1]/div[1]/div[1]/div[2]/ul/li/a/@data-year'
            ).extract()
            i = 0
            for year in subs:
                # The months for year i live in the (i+2)-th sibling div.
                sxpath = '//div[@id="block-J"]/div[1]/div[' + str(
                    i + 2) + ']/a/@data-month'
                subs1 = sel.xpath(sxpath).extract()
                #subs1 = sel.xpath('//div[@id="block-J"]/div[1]/div[2]/a/@data-month').extract()
                for month in subs1:
                    y_month = str(year) + str(month)
                    # JSONP listing service; callback=window makes the body
                    # parseable by the string slicing in parse_second.
                    url1 = "http://cache.video.qiyi.com/jp/sdvlst/" + cid + "/" + sourceid + "/" + y_month + "/?categoryId=" + cid + "&sourceId=" + sourceid + "&tvYear=" + y_month + "&callback=window"
                    items.extend([
                        Request(url=url1,
                                callback=self.parse_second,
                                meta={'cat_id': cat_id})
                    ])
                i = i + 1

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_second(self, response):
        """Strip the JSONP wrapper from the cached listing response, build
        a partially-filled EpisodeItem per entry, and queue the watch page
        of each for final parsing."""
        try:
            #log.msg('lev2: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            items = []
            sel = Selector(response)

            #category
            # Body looks like: try{window(<json>);}catch(e){...} — slice
            # out the JSON argument between the two markers.
            begin = response.body.find("try{window(")
            begin += len("try{window(")
            end = response.body.find(");}catch(e)")
            msg = response.body[begin:end]
            jmsg = json.loads(msg)
            num = len(jmsg["data"])
            for i in range(num):
                title = jmsg["data"][i]["aName"]
                play_num = "0"
                play_num = str(jmsg["data"][i]["disCnt"])
                upload_time = jmsg["data"][i]["tvYear"]
                turl = jmsg["data"][i]["vUrl"]
                timelength = str(jmsg["data"][i]["timeLength"])

                ep_item = EpisodeItem()
                if len(title) != 0:
                    ep_item["title"] = title
                ep_item["played"] = play_num
                if len(upload_time) != 0:
                    ep_item["upload_time"] = upload_time
                if len(turl) != 0:
                    ep_item["url"] = turl
                if len(timelength) != 0:
                    ep_item["duration"] = timelength
                ep_item['subject_id'] = cat_id

                items.append(
                    Request(url=turl,
                            callback=self.parse_episode,
                            meta={'item': ep_item}))

            return items

        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        """Fill the item from the watch page (title, show id, category,
        upload time, tags) using several fallback layouts; items missing
        a title, show id, or category are dropped."""
        try:
            log.msg('parse_episode %s' % response.request.url)
            items = []

            item = response.request.meta['item']
            sel = Selector(response)

            #video info
            ttitle = sel.xpath(
                '//div[@class="play-tit-l"]/h2/span/text()').extract()
            title = ""
            if len(ttitle) > 0:
                title = ttitle[0]

            if len(title) == 0:
                # Fallback layout: title in an <h1> instead of <h2>/<span>.
                ttitle = sel.xpath(
                    '//div[@class="play-tit-l"]/h1/text()').extract()
                if len(ttitle) > 0:
                    title = ttitle[0]

            if len(title) == 0 and "title" in item:
                # Last resort: keep the title from the listing service.
                title = item["title"]

            # Drop the URL fragment before extracting the show id.
            index = response.request.url.rfind("#")
            surl = response.request.url[:index]

            show_id = ""
            # iqiyi show ids look like v_xxxx or w_xxxx in the URL.
            r = re.compile(r'[vw]_[0-9a-zA-Z]*')
            m = r.search(surl)
            if m:
                show_id = m.group()

            category = None
            tcategory = sel.xpath(
                '//div[@id="block-E"]/div[1]/div[1]/div[2]/div[1]/span[1]/a[2]/text()'
            ).extract()
            if len(tcategory) > 0:
                category = tcategory[0].strip()

            # Several breadcrumb layouts exist; try each until one matches.
            tcategory = sel.xpath(
                '//div[@class="crumb_bar"]/span[1]/span/a[2]/text()').extract(
                )  #"channel"
            if len(tcategory) > 0 and not category:
                category = tcategory[0].strip()
            else:
                tcategory = sel.xpath(
                    '//div[@class="play-album-crumbs textOverflow"]/span[1]/a[2]/text()'
                ).extract()
                if len(tcategory) > 0 and not category:
                    category = tcategory[0].strip()
                else:
                    tcategory = sel.xpath(
                        '//div[@class="crumb_bar"]/span[1]/a[2]/text()'
                    ).extract()  #"channel"
                    if len(tcategory) > 0 and not category:
                        category = tcategory[0].strip()
                    #else:
                    #    log.msg("not find category,url is %s" % response.request.url, level=log.ERROR)

            #get upload time
            upload_time = ""
            tag = ""

            tupload_time = sel.xpath(
                '//div[@class="crumb_bar"]/span[3]/span/text()').extract()
            if len(tupload_time) > 0:
                upload_time = tupload_time[0].strip()

            tupload_time = sel.xpath(
                '//div[@class="crumb_bar"]/span[2]/span/text()').extract()
            if len(tupload_time) > 0:
                upload_time = tupload_time[0].strip()

            if len(upload_time) == 0:
                # Fallback: parse a dotted/dashed date out of the movie
                # description block and normalize '.' to '-'.
                tupload_time = sel.xpath(
                    '//div[@class="movieMsg"]/div/p/text()').extract()
                if len(tupload_time) > 0:
                    r = re.compile(r'(\d+)[.-](\d+)[\d+].*')
                    m = r.search(tupload_time[0])
                    if m:
                        ttupload_time = m.group()
                        upload_time = ttupload_time.replace(".", "-")
                    #ttupload_time  = tupload_time.re(r'(\d+)[.-](\d+)[\d+].*')

            #get tags,two ways to get tags
            taglist = sel.xpath(
                '//div[@class="crumb_bar"]/span[2]/a/text()').extract()
            if len(taglist) > 0:
                tag = "|".join(taglist)

            if not tag or len(tag) == 0:
                taglist = sel.xpath(
                    '//div[@class="crumb_bar"]/span[1]/a/text()').extract()
                if len(taglist) > 0:
                    tag = "|".join(taglist)

            ep_item = response.request.meta['item']

            if title:
                ep_item['title'] = title
            if show_id:
                ep_item['show_id'] = show_id
            if tag:
                ep_item['tag'] = tag
            if upload_time:
                ep_item['upload_time'] = upload_time
            if category:
                ep_item['category'] = category

            # Items without the three mandatory fields are discarded.
            if not title or not show_id or not category:
                #log.msg("title ,show_id,category is null ,url is %s" % response.request.url, level=log.ERROR)
                return
            if len(title) == 0 or len(show_id) == 0 or len(category) == 0:
                #log.msg("title ,show_id,category is null ,url is %s" % response.request.url, level=log.ERROR)
                return

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url

            items.append(ep_item)

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
Пример #18
0
class youku_spider(Spider):
    """Base youku spider.

    Resolves the subscription configuration passed through spider kwargs
    (channel / category / page / subject / keyword subscriptions), stores
    the chosen seed url and id, then dispatches crawling to the matching
    parse callback via ``spider_parses``.
    """

    name = 'youku_spider'
    site_id = '1'

    # bitmask ids for the supported subscription types
    subscribe_ids = {
        'all': '1',
        'channel': '2',
        'keyword': '4',
        'page': '8',
        'cagtegory': '16',  # NOTE(review): misspelled key kept as-is; callers may rely on it
        'subject': '32',
        'manual': '64'
    }

    # prefix urls
    youku_url_prefix = "http://www.youku.com"
    soku_url_prefix = "http://www.soku.com"
    vpaction_url = "http://v.youku.com/v_vpactionInfo/id"
    playlength_url = "http://v.youku.com/player/getPlayList/VideoIDS"

    # shared helpers / thresholds, loaded once at class-definition time
    mgr = DbManager.instance()
    channel_exclude = mgr.get_channel_exclude()
    category_exclude = mgr.get_cat_exclude()
    ordered_played_threshold = get_project_settings().get(
        'ORDERED_PLAYED_THRESHOLD')
    hottest_played_threshold = get_project_settings().get(
        'HOTTEST_PLAYED_THRESHOLD')
    newest_time_threshold = get_project_settings().get('NEWEST_TIME_THRESHOLD')

    # default value
    max_search_page = "0"

    def __init__(self, *args, **kwargs):
        """Read subscription kwargs and resolve the spider type.

        Recognized kwargs (all optional, JSON-encoded where noted):
          type          -- automatic subscription type
          channel_urls  -- JSON list of channel subscription url dicts
          cat_urls      -- JSON list of category subscription url dicts
          page_urls     -- JSON list of page subscription url dicts
          subject_urls  -- JSON list of subject subscription url dicts
          keywords      -- JSON list of keyword subscriptions
          cat_ids       -- JSON list of category ids
        """
        super(youku_spider, self).__init__(*args, **kwargs)
        # dispatch table: spider type -> parse callback (defined on subclasses
        # or elsewhere in this file)
        self.spider_parses = {
            'channel': self.channel_parse,
            'video_set': self.video_set_parse,
            'search': self.search_parse,
            'video': self.video_parse,
            'page': self.page_parse,
            'user': self.user_parse,
            'category': self.category_parse,
        }

        # automatic subscription type
        self.subscribe_type = kwargs.get('type')
        # manual subscription urls, one family per kwarg
        subscribe_channel_urls = kwargs.get('channel_urls')
        subscribe_cat_urls = kwargs.get('cat_urls')
        subscribe_page_urls = kwargs.get('page_urls')
        subscribe_subject_urls = kwargs.get('subject_urls')
        # keyword subscriptions
        self.subscribe_keywords = kwargs.get('keywords')
        self.subscribe_cat_ids = kwargs.get('cat_ids', [])

        self.subscribe_url = None
        self.subscribe_id_key = None
        self.subscribe_id_value = None
        url = None
        key = None
        try:
            # NOTE: when several families are supplied the LAST one checked
            # wins, and only the first url of that family is used -- this
            # mirrors the original behaviour.
            if subscribe_channel_urls:
                log.msg("subscribe_channel_url:", level=log.DEBUG)
                subscribe_channel_urls = json.loads(subscribe_channel_urls)
                key = None
                url = subscribe_channel_urls[0]

            if subscribe_cat_urls:
                log.msg("subscribe_cat_url:", level=log.DEBUG)
                subscribe_cat_urls = json.loads(subscribe_cat_urls)
                key = 'cat_id'
                url = subscribe_cat_urls[0]

            if subscribe_page_urls:
                log.msg("subscribe_page_url:", level=log.DEBUG)
                subscribe_page_urls = json.loads(subscribe_page_urls)
                key = 'pg_id'
                url = subscribe_page_urls[0]

            if subscribe_subject_urls:
                log.msg("subscribe_subject_url:", level=log.DEBUG)
                subscribe_subject_urls = json.loads(subscribe_subject_urls)
                key = None
                url = subscribe_subject_urls[0]
        except Exception as e:
            # fixed from py2-only "except Exception, e" for consistency with
            # the rest of the file
            log.msg(traceback.format_exc(), level=log.ERROR)
            return

        if url:
            log.msg(url, level=log.DEBUG)
            self.subscribe_url = url.get('url')
            self.subscribe_id_value = url.get('id')
        if key:
            self.subscribe_id_key = key

        if self.subscribe_keywords:
            log.msg("subscribe_keywords:", level=log.DEBUG)
            self.subscribe_keywords = json.loads(self.subscribe_keywords)
            self.subscribe_id_key = 'kw_id'
            for url in self.subscribe_keywords:
                log.msg(url, level=log.DEBUG)

        if self.subscribe_cat_ids:
            log.msg("subscribe_cat_ids:", level=log.DEBUG)
            self.subscribe_cat_ids = json.loads(self.subscribe_cat_ids)
            for url in self.subscribe_cat_ids:
                log.msg(url, level=log.DEBUG)

        # resolve the final spider type from whatever was configured above
        self.spider_type_resolve()
Пример #19
0
class iqiyi_order(Spider):
    """Crawl ordered (subscribed) iqiyi user pages.

    For every subscribed user url loaded from the DB this spider scrapes
    the user profile (fans / total played) into a UserItem, then walks the
    user's video tab and emits one EpisodeItem per video, enriched with the
    album play-length and play-count fetched from iqiyi's cache service.
    """
    name = "iqiyi_order"
    pipelines = ['MysqlStorePipeline']
    spider_id = "131072"
    site_id = "5"  #iqiyi
    allowed_domains = [
        "list.iqiyi.com", "www.iqiyi.com", "cache.video.iqiyi.com"
    ]
    url_prefix = 'http://list.iqiyi.com'
    playnum_url = 'http://cache.video.iqiyi.com/jp/pc/'
    playlength_url = "http://cache.video.iqiyi.com/a/"
    max_search_page = 1

    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        """Load the ordered iqiyi urls from the DB; empty list on failure."""
        super(iqiyi_order, self).__init__(*args, **kwargs)
        self._cat_urls = []
        try:
            self._cat_urls = self.mgr.get_ordered_url(site_name='iqiyi')
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    @staticmethod
    def _parse_count(text):
        """Parse a count like u'1,234' or u'3.5万' into an int.

        u'万' means 10,000, so u'3.5万' -> 35000.  Raises ValueError on
        malformed input (handled by the caller's try/except).
        """
        text = text.replace(',', '')
        if u'万' in text:
            return int(float(text[:text.index(u'万')]) * 10000)
        return int(text)

    def start_requests(self):
        """Emit one profile-page request per subscribed user."""
        try:
            items = []
            for cat in self._cat_urls:
                items.append(
                    Request(url=cat['url'],
                            callback=self.parse_first,
                            meta={
                                'cat_name': cat['user'],
                                'audit': cat['audit'],
                                'show_id': cat['show_id'],
                                'priority': cat['priority']
                            }))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_first(self, response):
        """Scrape the user profile into a UserItem and queue the user's
        video tab for crawling."""
        try:
            items = []
            user_item = UserItem()
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            show_id = response.request.meta['show_id']
            priority = response.request.meta['priority']

            fans = response.xpath(
                '//div[@class="info_connect"]//em/a[@data-fans="fans"]/text()')
            played = response.xpath(
                '//div[@class="info_connect"]/span[@class="conn_type S_line1"]/em/a/text()'
            )

            user_item['show_id'] = show_id
            # BUGFIX: the old code tested "if text.find(u'万'):", which is
            # truthy when find() returns -1 (not found) -- plain numbers got
            # their last digit chopped and were multiplied by 10000.
            # _parse_count uses a proper membership test instead.
            if fans:
                user_item['fans'] = self._parse_count(
                    fans.extract()[0].strip())
            if played:
                user_item['played'] = self._parse_count(
                    played.extract()[0].strip())

            username = response.xpath(
                '//div[@class="pf_username"]/span/text()')
            userinfo = response.xpath('//div[@class="pf_intro"]/a/text()')
            if username:
                user_item['user_name'] = username.extract()[0].strip()
            if userinfo:
                user_item['intro'] = userinfo.extract()[0].strip()

            user_item['spider_id'] = self.spider_id
            user_item['site_id'] = self.site_id
            user_item['url'] = response.request.url
            items.append(user_item)

            # locate the user's "videos" tab; two page layouts exist
            title = u'视频'
            urls = ''
            u = response.xpath(
                '//div[@class="qiyiSet-nav"]/ul[@class="qiyiNav-normal"]/li/a[@title="%s"]/@href'
                % title)
            if u:
                urls = u.extract()[0]
            else:
                u = response.xpath(
                    '//div[@class="pc-nav-title pc-item-box"]/ul[@class="pc-user-nav pc-user-nav-4 clearfix"]/li[@data-ugcguide-target ="2"]/a/@href'
                )
                urls = u.extract()[0]
            items.append(
                Request(url=urls,
                        callback=self.parse_page,
                        meta={
                            'cat_name': cat_name,
                            'audit': audit,
                            'priority': priority
                        }))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        """Parse a user's video-list page into per-episode requests."""
        try:
            items = []
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            # video entries on the list page
            qy_v = response.xpath(
                '//div[@class="wrap-customAuto-ht "]/ul/li/div[1]')
            for v in qy_v:
                thumb = v.xpath('./a/img/@src').extract()
                url = v.xpath('./a/@href').extract()
                items.append(
                    Request(url=url[0].strip(),
                            callback=self.parse_episode,
                            meta={
                                'thumb': thumb,
                                "cat_name": cat_name,
                                'audit': audit,
                                'priority': priority
                            }))
            # NOTE: pagination is intentionally not followed here
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        """Scrape one video page into an EpisodeItem.

        If an album id is found the item is routed through the play-length
        and play-count services before being emitted.
        """
        try:
            log.msg('parse_episode %s' % response.request.url)
            thumb_url = response.request.meta['thumb']
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            items = []

            # show_id
            show_id = Util.get_iqiyi_showid(response.request.url)
            # space maybe exist: "albumId:326754200" or "albumId: 326754200"
            albumid = response.selector.re(re.compile(r'albumId: ?(\d+)'))

            # video title: several page layouts are tried in turn
            title = response.xpath(
                '//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()'
            ).extract()
            if not title:
                title = response.xpath(
                    '//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()'
                ).extract()
            if not title:
                title = response.xpath(
                    '//div[@class="mod-play-t**s"]/h1/descendant-or-self::*/text()'
                ).extract()
            if not title:
                title = response.xpath(
                    '//div[@class="play-tit play-tit-oneRow play-tit-long"]/h1/descendant-or-self::*/text()'
                ).extract()

            # breadcrumb category (several layouts); note the scraped value
            # is only used as a fallback -- cat_name wins below
            category = response.xpath(
                '//div[@class="crumb_bar"]/span[1]/span/a[2]/text()').extract(
                )
            if not category:
                category = response.xpath(
                    '//div[@class="play-album-crumbs textOverflow"]/span[1]/a[2]/text()'
                ).extract()
            if not category:
                category = response.xpath(
                    '//div[@class="crumb_bar"]/span[1]/a[2]/text()').extract()
            if not category:
                category = response.xpath(
                    '//div[@class="mod-crumb_bar"]/span[1]/a[2]/text()'
                ).extract()

            upload_time = response.xpath(
                '//div[@class="crumb_bar"]/span[3]/span/text()').extract()
            if not upload_time:
                upload_time = response.xpath(
                    '//div[@class="crumb_bar"]/span[2]/span/text()').extract()

            tag = response.xpath(
                '//span[@id="widget-videotag"]/descendant::*/text()').extract(
                )
            if not tag:
                tag = response.xpath(
                    '//span[@class="mod-tags_item vl-block"]/descendant::*/text()'
                ).extract()
            if not tag:
                tag = response.xpath(
                    '//div[@class="crumb_bar"]/span[2]/a/text()').extract()

            ep_item = EpisodeItem()

            if title:
                ep_item['title'] = "".join([t.strip() for t in title])
            if show_id:
                ep_item['show_id'] = show_id
            if tag:
                ep_item['tag'] = "|".join([t.strip() for t in tag])
            if upload_time:
                ep_item['upload_time'] = upload_time[0].strip()
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            # the subscribed user name is used as the category
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority

            if albumid:
                items.append(
                    Request(url=self.playlength_url + albumid[0],
                            callback=self.parse_playlength,
                            meta={
                                'item': ep_item,
                                'albumid': albumid[0]
                            }))
            else:
                items.append(ep_item)

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playlength(self, response):
        """Extract the album play length and chain to the play-count service."""
        try:
            log.msg('parse_playlength ,%s' % response.request.url)
            item = response.request.meta['item']
            albumid = response.request.meta['albumid']

            items = []
            # response body looks like "...AlbumInfo={json}"
            # NOTE(review): if "AlbumInfo=" is absent, find() returns -1 and
            # the slice below is wrong; json.loads then fails and is caught
            # by the outer except (item is dropped) -- confirm acceptable.
            msg = response.body
            index = msg.find("AlbumInfo=") + len("AlbumInfo=")
            info = msg[index:]
            jinfo = json.loads(info)
            plsylength = jinfo["data"]["playLength"]
            item['duration'] = str(plsylength)
            items.append(
                Request(url=self.playnum_url + albumid + "/?qyid=",
                        callback=self.parse_playnum,
                        meta={'item': item}))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playnum(self, response):
        """Extract the play count and emit the finished EpisodeItem."""
        try:
            item = response.request.meta['item']

            items = []
            # the cache service answers like "...:12345..."; take the first number
            tplaynum = response.selector.re(re.compile(r':(\d+)'))
            if tplaynum:
                item['played'] = str(tplaynum[0])
                items.append(item)

            return items

        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
Пример #20
0
class sohu_cat(Spider):
    """Crawl configured sohu category pages and emit one EpisodeItem per
    listed video, with the duration parsed from the thumbnail overlay."""
    name = "sohu_cat"
    pipelines = ['MysqlStorePipeline']
    spider_id = "1048576"
    site_id = "3"  #sohu (the original "#iqiyi" comment was a copy-paste error)
    max_search_page = 1

    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        """Load sohu category urls from the DB; empty list on failure."""
        super(sohu_cat, self).__init__(*args, **kwargs)
        self._cat_urls = []
        try:
            self._cat_urls = self.mgr.get_cat_url('sohu')
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    @staticmethod
    def _parse_duration(text):
        """Convert a clock string (u'mm:ss' or u'hh:mm:ss') to seconds.

        The old code unpacked exactly two ':'-separated fields, so any video
        longer than an hour raised and was silently skipped; folding the
        parts generalizes it.  Raises ValueError on malformed input
        (handled by the caller's try/except).
        """
        seconds = 0
        for part in text.split(':'):
            seconds = seconds * 60 + int(part)
        return seconds

    def start_requests(self):
        """Emit one list-page request per configured category."""
        try:
            items = []

            for cat in self._cat_urls:
                items.append(
                    Request(url=cat['url'],
                            callback=self.parse_page,
                            meta={
                                'page': 1,
                                'cat_name': cat['cat_name'],
                                'audit': cat['audit'],
                                'priority': cat['priority']
                            }))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        """Parse one category list page into per-episode requests."""
        try:
            log.msg('parse page %s: %s' %
                    (response.request.url, response.request.meta['page']))
            page = response.request.meta['page']
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']

            items = []
            # two list-page layouts exist
            qy_v = response.xpath('//div[@class="column-bd wemd cfix"]/ul/li')
            if not qy_v:
                qy_v = response.xpath('//div[@class="column-bd cfix"]/ul/li')
            # was a stray "print len(qy_v)" debug statement; log instead
            log.msg('parse page found %d videos' % len(qy_v), level=log.DEBUG)
            for v in qy_v:
                thumb = v.xpath('./div[@class="st-pic"]/a/img/@src').extract()
                url = v.xpath('./div[@class="st-pic"]/a/@href').extract()
                lens = v.xpath(
                    './div[@class="st-pic"]/span[@class="maskTx"]/text()'
                ).extract()
                if not lens:
                    lens = v.xpath(
                        './div[@class="st-pic"]/a/span[@class="maskTx"]/text()'
                    ).extract()
                try:
                    lens = lens[0].strip()
                    lens = self._parse_duration(lens) if lens else 0
                    items.append(
                        Request(url=url[0].strip(),
                                callback=self.parse_episode,
                                meta={
                                    'cat_name': cat_name,
                                    'thumb': thumb,
                                    'audit': audit,
                                    'lens': lens,
                                    'priority': priority
                                }))
                except Exception as e:
                    # malformed entry (missing url or duration): skip it
                    continue
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        """Scrape one video page into an EpisodeItem."""
        try:
            log.msg('parse_episode %s' % response.request.url)
            cat_name = response.request.meta['cat_name']
            thumb_url = response.request.meta['thumb']
            audit = response.request.meta['audit']
            lens = response.request.meta['lens']
            priority = response.request.meta['priority']
            items = []

            # show_id
            show_id = Util.get_sohu_showid(response.request.url)
            # tag
            tag = response.xpath('//meta[@name="keywords"]/@content').extract()
            # video title
            title = response.xpath(
                '//div[@id="crumbsBar"]/div/div[@class="left"]/h2/text()'
            ).extract()
            ep_item = EpisodeItem()

            if title:
                ep_item['title'] = title[0].strip()
            if show_id:
                ep_item['show_id'] = show_id
            if tag:
                ep_item['tag'] = tag[0].strip()
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority
            ep_item['duration'] = lens
            items.append(ep_item)

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
Пример #21
0
class QqOrderSpider(Spider):
    """Crawl ordered (subscribed) QQ video user pages.

    Each order is a dict holding at least a 'url'; the spider normalizes the
    url to the user's /videos tab, scrapes the video list, and emits one
    EpisodeItem per video with the order's fields folded in.
    """
    name = "qq_order"
    pipelines = ['MysqlStorePipeline']
    spider_id = "2097152"
    site_id = "16"
    format_id = 2
    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        """Take orders from the 'orders' kwarg (JSON) or fall back to the DB."""
        super(QqOrderSpider, self).__init__(*args, **kwargs)
        raw = kwargs.get('orders')
        if raw:
            raw = json.loads(raw)
        else:
            raw = self.mgr.get_ordered_url(site_name='qq')
        self._orders = raw if raw else []

    def start_requests(self):
        """Emit one list-page request per order, url normalized to .../videos."""
        try:
            requests = []
            for order in self._orders:
                url = order.pop('url')
                if not url.endswith('/videos'):
                    url += 'videos' if url.endswith('/') else '/videos'
                req = Request(url=url, callback=self.parse_page)
                req.meta.update({'order': order})
                requests.append(req)
            return requests
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_page(self, response):
        """Parse a user's video-list page into per-episode requests."""
        try:
            logging.log(logging.INFO, 'page:%s' % response.request.url)
            order = response.request.meta['order']
            requests = []

            entries = response.xpath(
                '//ul[@id="videolst_cont"]/li[@class="list_item"]')
            for entry in entries:
                hrefs = entry.xpath('./strong/a/@href').extract()
                if not hrefs:
                    continue
                names = entry.xpath('./strong/a/text()').extract()
                pics = entry.xpath('./a/img/@src').extract()
                clocks = entry.xpath('./a/span/em/text()').extract()
                plays = entry.xpath(
                    './div/span[@class="figure_info_play"]/span/text()'
                ).extract()
                stamps = entry.xpath(
                    './div/span[@class="figure_info_time"]/text()').extract()

                # scraped fields first; order fields override on collision
                info = {
                    'title': names[0] if names else None,
                    'thumb_url': pics[0] if pics else None,
                    'duration': Util.get_qq_duration(clocks[0])
                                if clocks else None,
                    'played': Util.normalize_played(
                        Util.normalize_vp(plays[0])) if plays else None,
                    'upload_time': Util.get_qq_upload_time(stamps[0])
                                   if stamps else None
                }
                info.update(order)
                req = Request(url=hrefs[0], callback=self.parse_episode)
                req.meta.update({'order': info})
                requests.append(req)
            return requests
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_episode(self, response):
        """Turn one video page plus its order metadata into an EpisodeItem."""
        try:
            logging.log(logging.INFO, 'episode:%s' % response.request.url)
            order = response.request.meta['order']

            episode = EpisodeItem()
            episode['show_id'] = Util.get_qq_showid(response.request.url)
            # map order keys onto item fields; 'user' and 'show_id' have
            # dedicated item fields, everything else passes straight through
            for key, value in order.items():
                if key == 'user':
                    episode['category'] = value
                elif key == 'show_id':
                    episode['owner_show_id'] = value
                else:
                    episode[key] = value

            episode['spider_id'] = self.spider_id
            episode['site_id'] = self.site_id
            episode['url'] = response.request.url
            episode['format_id'] = self.format_id

            return [episode]
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
Пример #22
0
class iqiyi_military_hottest(Spider):
    name = "iqiyi_military_hottest"
    pipelines = ['CategoryPipeline', 'MysqlStorePipeline']
    spider_id = "512" #iqiyi_military_hottest
    site_id = "5"   #iqiyi
    allowed_domains = ["list.iqiyi.com","www.iqiyi.com","cache.video.iqiyi.com"]
    url_prefix = 'http://list.iqiyi.com'
    playnum_url = 'http://cache.video.iqiyi.com/jp/pc/'
    playlength_url = "http://cache.video.iqiyi.com/a/"
    hottest_played_threshold = get_project_settings().get('ORDERED_PLAYED_THRESHOLD')

    mgr = DbManager.instance()

    def __init__(self, cat_urls=None, *args, **kwargs):
        super(iqiyi_military_hottest, self).__init__(*args, **kwargs)
        if cat_urls:
            cat_urls = json.loads(cat_urls)
            self.max_search_page = get_project_settings().get('MAX_MANUAL_SEARCH_PAGE')
        else:
            cat_urls = self.mgr.get_cat_url("iqiyi")
            self.max_search_page = get_project_settings().get('MAX_SEARCH_PAGE')
        if cat_urls:
            self._cat_urls = cat_urls 
        else:
            self._cat_urls = [] 

    def start_requests(self):
        try:
            items = []

            for cat in self._cat_urls:
                items.append(Request(url=cat['url'], callback=self.parse, meta={'cat_id': cat['id']}))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)


    #for each category parse all its sub-categories or types
    def parse(self, response):
        try:
            #log.msg('lev1: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            items = []

            #category
            subs = response.xpath('//div[@class="mod_sear_menu mt20 mb30"]/div[2]/ul/li/a/@href').extract()
            for turl in subs:
                if turl != "#":
                    url = self.url_prefix+turl
                    items.extend([Request(url=url, callback=self.parse_second, meta={'cat_id': cat_id})])
                else:
                    items.extend([Request(url=response.request.url, callback=self.parse_most_played, meta={'cat_id': cat_id})])

            inh_item = self.parse_second(response)
            if inh_item:
                items.extend(inh_item)

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_second(self,response):
        try:
            #log.msg('lev2: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            items = []

            #category
            subs = response.xpath('//div[@class="mod_sear_menu mt20 mb30"]/div[3]/ul/li/a/@href').extract()
            for turl in subs:
                if turl != "#":
                    url = self.url_prefix+turl
                    items.extend([Request(url=url, callback=self.parse_most_played, meta={'cat_id': cat_id})])
                else:
                    items.extend([Request(url=response.request.url, callback=self.parse_most_played, meta={'cat_id': cat_id})])

            inh_item = self.parse_most_played(response)
            if inh_item:
                items.extend(inh_item)

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    #for each sub-category we get the most played
    def parse_most_played(self, response):
        try:
            #log.msg('lev3: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            items = []

            url = response.request.url
            suburl = "------------"
            index = url.rfind(suburl)
            #combine all sort types
            if index > 0:
                headurl =  url[0:index]
                url11 = headurl + suburl + "10-1-2--1-.html"
                items.extend([Request(url=url11, callback=self.parse_page,meta={'page': 1, 'cat_id': cat_id})])
                url12 = headurl + suburl + "10-1-2--2-.html"
                items.extend([Request(url=url12, callback=self.parse_page,meta={'page': 1, 'cat_id': cat_id})])
                url21 = headurl + suburl + "4-1-2--1-.html"
                items.extend([Request(url=url21, callback=self.parse_page,meta={'page': 1, 'cat_id': cat_id})])
                url22 = headurl + suburl + "4-1-2--2-.html"
                items.extend([Request(url=url22, callback=self.parse_page,meta={'page': 1, 'cat_id': cat_id})])

            #donnot forget parse current reponse's page
            response.request.meta.update({'page': 1})
            items.extend(self.parse_page(response))
                        
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        try:
            #log.msg('parse page %s: %s' % (response.request.url, response.request.meta['page']))
            page = response.request.meta['page']
            cat_id = response.request.meta['cat_id']
            if int(page) > int(self.max_search_page):
                return

            items = []

            #video items
            qy_v = response.xpath('//div[@class="wrapper-piclist"]/ul/li/div[1]')
            for v in qy_v:
                thumb = v.xpath('./a/img/@src').extract()
                url = v.xpath('./a/@href').extract()
                items.append(Request(url=url[0].strip(), callback=self.parse_episode, meta={'cat_id': cat_id, 'thumb': thumb}))

            #pages
            next_page = response.xpath("//div[@class='mod-page']/a[text()='%s']/@href" % u'下一页').extract()
            if next_page:
                items.append(Request(url=self.url_prefix+next_page[0], callback=self.parse_page, meta={'page': page+1, 'cat_id': cat_id}))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        try:
            log.msg('parse_episode %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            thumb_url = response.request.meta['thumb']
            items = []

            #show_id
            show_id = Util.get_iqiyi_showid(response.request.url)

            #space maybe exist: "albumId:326754200" or "albumId: 326754200"
            albumid = response.selector.re(re.compile(r'albumId: ?(\d+)'))

            #video info
            title = response.xpath('//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()').extract()
            if not title:
                title = response.xpath('//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()').extract()
            if not title:
                title = response.xpath('//div[@class="mod-play-t**s"]/h1/descendant-or-self::*/text()').extract()
            if not title:
                title = response.xpath('//div[@class="play-tit play-tit-oneRow play-tit-long"]/h1/descendant-or-self::*/text()').extract()

            category = response.xpath('//div[@class="crumb_bar"]/span[1]/span/a[2]/text()').extract()
            if not category:
                category = response.xpath('//div[@class="play-album-crumbs textOverflow"]/span[1]/a[2]/text()').extract()
            if not category:
                category = response.xpath('//div[@class="crumb_bar"]/span[1]/a[2]/text()').extract()
            if not category:
                category = response.xpath('//div[@class="mod-crumb_bar"]/span[1]/a[2]/text()').extract()

            upload_time = response.xpath('//div[@class="crumb_bar"]/span[3]/span/text()').extract()
            if not upload_time:
                upload_time = response.xpath('//div[@class="crumb_bar"]/span[2]/span/text()').extract()
            
            tag = response.xpath('//span[@id="widget-videotag"]/descendant::*/text()').extract()
            if not tag:
                tag = response.xpath('//span[@class="mod-tags_item vl-block"]/descendant::*/text()').extract()
            if not tag:
                tag = response.xpath('//div[@class="crumb_bar"]/span[2]/a/text()').extract()

            ep_item = EpisodeItem()
            
            if title:
                ep_item['title'] = "".join([t.strip() for t in title])
            if show_id:
                ep_item['show_id'] = show_id
            if tag:
                ep_item['tag'] =  "|".join([t.strip() for t in tag])
            if upload_time:
                ep_item['upload_time'] = upload_time[0].strip()
            if category:
                ep_item['category'] = category[0].strip()
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['cat_id'] = cat_id

            if albumid:
                items.append(Request(url=self.playlength_url+albumid[0], callback=self.parse_playlength, meta={'item':ep_item,'albumid':albumid[0]}))
            else:
                items.append(ep_item)

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playlength(self,response):
        try:
            log.msg('parse_playlength ,%s' % response.request.url)
            item = response.request.meta['item']
            albumid = response.request.meta['albumid']

            items = []
            #sel = Selector(response)
            msg = response.body
            index = msg.find("AlbumInfo=") + len("AlbumInfo=")
            info = msg[index:]
            jinfo = json.loads(info)
            plsylength = jinfo["data"]["playLength"]
            if plsylength:
                item['duration'] = str(plsylength)
            
            items.append(Request(url=self.playnum_url+albumid+"/?qyid=", callback=self.parse_playnum, meta={'item':item}))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playnum(self, response):
        try:
            #log.msg('parse_playnum ,%s' % response.request.url)
            item = response.request.meta['item']

            items = []
            #sel = Selector(response)
            tplaynum = response.selector.re(re.compile(r':(\d+)'))
            #log.msg('play: %s, %s' % (tplaynum[0], response.request.url))
            if tplaynum:
                playnum = tplaynum[0]
                if int(playnum) > int(self.hottest_played_threshold):
                    item['played'] = str(playnum)
                    items.append(item)

            return items
                
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
# ===== Example #23 =====
class v1_cat(Spider):
    """Spider for v1.cn category listing pages.

    start_requests seeds one listing request per category configured in
    the DB; parse_page queues one request per video entry; parse_episode
    builds the EpisodeItem stored by MysqlStorePipeline.

    Fixes: the leftover Python-2 debug ``print`` statement in parse_page
    now goes through log.msg, and parse_episode no longer drops the
    whole item with an IndexError when a listing entry had no thumbnail.
    """
    name = "v1_cat"
    pipelines = ['MysqlStorePipeline']
    spider_id = "10"
    site_id = "17"
    max_search_page = 1
    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        super(v1_cat, self).__init__(*args, **kwargs)
        self._cat_urls = []
        try:
            # Each entry carries 'url', 'cat_name', 'audit', 'priority'.
            self._cat_urls = self.mgr.get_cat_url('v1')
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def start_requests(self):
        """Seed one listing-page request per configured category."""
        try:
            items = []

            for cat in self._cat_urls:
                items.append(
                    Request(url=cat['url'],
                            callback=self.parse_page,
                            meta={
                                'cat_name': cat['cat_name'],
                                'audit': cat['audit'],
                                'priority': cat['priority']
                            }))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        """Queue one parse_episode request per video on a listing page."""
        try:
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            items = []
            qy_v = response.xpath('//div[@id="addMore"]/ul/li')
            # Was a bare Python-2 debug ``print``; log properly instead.
            log.msg('parse_page %s: %d entries' % (response.request.url, len(qy_v)))
            for v in qy_v:
                thumb = v.xpath('./div[@class="lists"]/a/img/@src').extract()
                url = v.xpath('./div[@class="lists"]/a/@href').extract()
                if url:
                    items.append(
                        Request(url=url[0].strip(),
                                callback=self.parse_episode,
                                meta={
                                    'cat_name': cat_name,
                                    'audit': audit,
                                    'priority': priority,
                                    'thumb': thumb
                                }))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        """Build the EpisodeItem for one v1.cn video page."""
        try:
            log.msg('parse_episode %s' % response.request.url)
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            thumb = response.request.meta['thumb']
            items = []

            show_id = Util.get_v1_showid(response.request.url)
            title = response.xpath('//meta[@name="title"]/@content').extract()
            tags = response.xpath(
                '//meta[@name="keywords"]/@content').extract()
            ep_item = EpisodeItem()
            if title:
                ep_item['title'] = title[0].strip()
            if show_id:
                ep_item['show_id'] = show_id
            if tags:
                ep_item['tag'] = tags[0].strip()
            # Guard: some listing entries have no thumbnail; previously
            # thumb[0] raised IndexError here and the item was lost.
            if thumb:
                ep_item['thumb_url'] = thumb[0].strip()
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority
            items.append(ep_item)

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
# ===== Example #24 =====
class YoutubeSearchVideoSpider(Spider):
    """Search youtube.com for configured keywords and scrape the results.

    Flow: start_requests (one search-results request per keyword and
    page) -> parse (result list) -> parse_episode (watch page) ->
    parse_other_info (get_video_info endpoint, duration) and
    parse_about (channel "about" page -> UserItem).  Keywords come from
    the database unless passed manually as a JSON string.
    """
    name = "youtube_search_video"
    pipelines = ['CategoryPipeline', 'MysqlStorePipeline']
    spider_id = "64"
    site_id = "2"
    allowed_domains = ["www.youtube.com"]
    url_prefix = 'https://www.youtube.com'

    mgr = DbManager.instance()

    def __init__(self, keywords=None, *args, **kwargs):
        """keywords: optional JSON list of {'keyword', 'user', 'id'}
        dicts; manual runs use the tighter MAX_MANUAL_SEARCH_PAGE
        limit, automatic runs load keywords from the DB."""
        super(YoutubeSearchVideoSpider, self).__init__(*args, **kwargs)
        if keywords:
            keywords = json.loads(keywords)
            self.max_search_page = get_project_settings().get(
                'MAX_MANUAL_SEARCH_PAGE')
        else:
            keywords = self.mgr.get_keywords(st='video', site_name='youtube')
            self.max_search_page = get_project_settings().get(
                'MAX_SEARCH_PAGE')
        if keywords:
            self._keywords = keywords
        else:
            self._keywords = []

    def start_requests(self):
        """Seed a weekly, view-count-sorted search request for every
        keyword on every page up to max_search_page."""
        try:
            items = []
            for page in xrange(int(self.max_search_page)):
                items.extend([
                    Request(
                        url=
                        'https://www.youtube.com/results?filters=video%%2C+week&search_sort=video_view_count&search_query=%s&page=%s'
                        % (k['keyword'], page + 1),
                        callback=self.parse,
                        meta={
                            'category': k['user'],
                            'kw_id': k['id']
                        }) for k in self._keywords
                ])

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse(self, response):
        """Parse one search-results page and queue a watch-page request
        per video, forwarding thumbnail/upload-time hints via meta."""
        try:
            category = response.request.meta[
                'category'] if 'category' in response.request.meta else 'other'
            kw_id = response.request.meta[
                'kw_id'] if 'kw_id' in response.request.meta else 1
            log.msg('%s: %s' % (response.request.url, category),
                    level=log.INFO)

            items = []
            #videos
            videos = response.xpath('//ol[@class="item-section"]/li')
            for v in videos:
                url = v.xpath(
                    './div/div/div[@class="yt-lockup-thumbnail"]/a/@href'
                ).extract()
                thumb_url = v.xpath(
                    './div/div/div[@class="yt-lockup-thumbnail"]/a/div/img/@src'
                ).extract()
                views = v.xpath(
                    './div/div/div[@class="yt-lockup-content"]/div[@class="yt-lockup-meta"]/ul/li/text()'
                ).re('([\d|,]*) views')
                upload_time = v.xpath(
                    './div/div/div[@class="yt-lockup-content"]/div[@class="yt-lockup-meta"]/ul/li[2]/text()'
                ).extract()

                if url:
                    items.append(
                        Request(url=self.url_prefix + url[0],
                                callback=self.parse_episode,
                                meta={
                                    'thumb_url': thumb_url,
                                    'upload_time': upload_time,
                                    'category': category,
                                    'kw_id': kw_id
                                }))

            return items

        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        """Scrape one watch page into an EpisodeItem and chain a
        get_video_info request (parse_other_info) for the duration;
        also queues the channel's about page (parse_about)."""
        try:
            log.msg('%s' % response.request.url)
            thumb_url = response.request.meta['thumb_url']
            upload_time = response.request.meta['upload_time']
            category = response.request.meta['category']
            kw_id = response.request.meta[
                'kw_id'] if 'kw_id' in response.request.meta else 1
            items = []

            #owner
            owner = response.xpath(
                '//div[@class="yt-user-info"]/a/@data-ytid').extract()
            owner_url = response.xpath(
                '//div[@class="yt-user-info"]/a/@href').extract()
            owner_show_id = None
            if owner:
                owner_show_id = owner[0]
                items.append(
                    Request(url=self.url_prefix + owner_url[0] + "/about",
                            callback=self.parse_about))

            #video info
            title = response.xpath('//span[@id="eow-title"]/text()').extract()
            #category = response.xpath('//p[@id="eow-category"]/a/text()').extract()
            tag = response.xpath(
                './head/meta[@name="keywords"]/@content').extract()
            #upload = response.xpath('//p[@id="watch-uploader-info"]/strong/text()').extract()
            description = response.xpath(
                '//p[@id="eow-description"]/descendant-or-self::*/text()'
            ).extract()
            played = response.xpath(
                '//div[@class="watch-view-count"]/text()').extract()

            #other info: the 'sts' token is needed by get_video_info
            sts = re.search(r'\"sts\": ?(\d+)', response.body)

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_youtube_showid(response.request.url)
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                ep_item['title'] = title[0].strip()
            if tag:
                ep_item['tag'] = tag[0].replace(', ', '|')
            if category:
                #ep_item['category'] = category[0].replace('&', '|')
                ep_item['category'] = category
            '''
            if upload:
                ptime = Util.get_youtube_publish(upload[0])
                if ptime:
                    ep_item['upload_time'] = ptime
            '''
            if upload_time:
                # Relative text like "3 days ago" is converted to a date.
                t = Util.get_youtube_upload_time(upload_time[0].strip())
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), t)
            if description:
                ep_item['description'] = "\n".join(description)
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0]
            if played:
                pld = Util.normalize_played(played[0])
                if pld:
                    ep_item['played'] = Util.normalize_played(played[0])
                else:
                    ep_item['played'] = '0'

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = Util.normalize_youtube_url(response.request.url)
            ep_item['kw_id'] = kw_id

            query = Util.encode({'video_id': ep_item['show_id'], \
                                 'eurl': 'https://youtube.googleapis.com/v/' + ep_item['show_id'], \
                                 'sts': sts.groups()[0] if sts else ''})
            items.append(
                Request(url='http://www.youtube.com/get_video_info?' + query,
                        callback=self.parse_other_info,
                        meta={'item': ep_item}))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_other_info(self, response):
        """Pull the duration (length_seconds) out of the get_video_info
        response and emit the completed item."""
        try:
            log.msg('%s' % response.request.url)
            item = response.request.meta['item']
            items = []

            #duration
            duration = re.search(r'length_seconds=(\d+)', response.body)

            if duration:
                item['duration'] = duration.groups()[0]

            items.append(item)

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_about(self, response):
        """Scrape a channel's about page into a UserItem (subscriber
        count, total views, intro text)."""
        try:
            log.msg(response.request.url, level=log.INFO)
            items = []

            show_id = response.xpath(
                '//meta[@itemprop="channelId"]/@content').extract()
            user_name = response.xpath(
                '//span[@class="qualified-channel-title-text"]/a/text()'
            ).extract()
            fans = response.xpath('//ul[@class="about-stats"]/li').re(
                re.compile(r'<li.*>.*<b>([\d|,]*)</b>.*subscribers.*</li>',
                           re.S))
            played = response.xpath('//ul[@class="about-stats"]/li').re(
                re.compile(r'<li.*>.*<b>([\d|,]*)</b>.*views.*</li>', re.S))
            intro = response.xpath(
                '//div[@class="about-description branded-page-box-padding"]/descendant-or-self::*/text()'
            ).extract()

            if show_id:
                user_item = UserItem()
                user_item['show_id'] = show_id[0]

                if user_name:
                    user_item['user_name'] = user_name[0]
                if fans:
                    user_item['fans'] = Util.normalize_played(fans[0])
                if played:
                    user_item['played'] = Util.normalize_played(played[0])
                if intro:
                    user_item['intro'] = "".join(intro).strip()

                user_item['spider_id'] = self.spider_id
                user_item['site_id'] = self.site_id
                # Strip the '/about' suffix to recover the channel url.
                user_item['url'] = response.request.url[:-len('/about')]

                items.append(user_item)

            return items

        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
# ===== Example #25 =====
class YoukuCatHottestSpider(Spider):
    name = "youku_cat_hottest"
    pipelines = [
        'HottestItemPipeline', 'CategoryPipeline', 'MysqlStorePipeline'
    ]
    spider_id = "2"  #youku_cat_hottest
    site_id = "1"  #youku
    allowed_domains = [
        "www.youku.com", "v.youku.com", "i.youku.com", "index.youku.com",
        "play.youku.com"
    ]
    url_prefix = 'http://www.youku.com'
    vpaction_url = "http://v.youku.com/v_vpactionInfo/id/"
    # playlength_url = "http://v.youku.com/player/getPlayList/VideoIDS/"
    playlength_url = "http://play.youku.com/play/get.json?ct=10&vid="
    hottest_played_threshold = get_project_settings().get(
        'HOTTEST_PLAYED_THRESHOLD')

    mgr = DbManager.instance()
    channel_exclude = mgr.get_channel_exclude()

    def __init__(self, cat_urls=None, *args, **kwargs):
        """Accept an optional JSON list of category urls; otherwise load
        them from the database.  Manual runs get the tighter
        MAX_MANUAL_SEARCH_PAGE limit."""
        super(YoukuCatHottestSpider, self).__init__(*args, **kwargs)
        settings = get_project_settings()
        if cat_urls:
            cat_urls = json.loads(cat_urls)
            self.max_search_page = settings.get('MAX_MANUAL_SEARCH_PAGE')
        else:
            cat_urls = self.mgr.get_cat_url("youku")
            self.max_search_page = settings.get('MAX_SEARCH_PAGE')
        self._cat_urls = cat_urls if cat_urls else []

    def start_requests(self):
        """Seed one request per configured category url."""
        try:
            return [
                Request(url=cat['url'],
                        callback=self.parse,
                        meta={'cat_id': cat['id']})
                for cat in self._cat_urls
            ]
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    #for each category parse all its sub-categories or types
    def parse(self, response):
        """Category landing page: fan out to every sub-category filter
        link, and also scan the current page itself for hottest videos."""
        try:
            cat_id = response.request.meta['cat_id']
            sel = Selector(response)

            # Sub-category links in the filter panel.
            sub_links = sel.xpath(
                '//div[@class="yk-filter-panel"]/div[2]/ul/li/a/@href'
            ).extract()
            requests = [
                Request(url=link,
                        callback=self.parse_most_played,
                        meta={'cat_id': cat_id}) for link in sub_links
            ]

            # The current page counts as a sub-category page too.
            inherited = self.parse_most_played(response)
            if inherited:
                requests.extend(inherited)

            return requests
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    #for each sub-category we get the most played
    def parse_most_played(self, response):
        """Follow the weekly most-played sort link of a sub-category
        page, starting pagination at page 1."""
        try:
            cat_id = response.request.meta['cat_id']
            sel = Selector(response)

            sort_links = sel.xpath(
                "//div[@class='yk-sort']/div[3]/div/div[@class='panel']/ul/li/a[text()='%s']/@href"
                % u'本周').extract()
            return [
                Request(url=link,
                        callback=self.parse_page,
                        meta={
                            'page': 1,
                            'cat_id': cat_id
                        }) for link in sort_links
            ]
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        """One result page: queue episode requests for videos above the
        played threshold, then follow the pager (bounded by
        max_search_page)."""
        try:
            log.msg('%s: %s' %
                    (response.request.url, response.request.meta['page']))
            cat_id = response.request.meta['cat_id']
            page = response.request.meta['page']
            if int(page) > int(self.max_search_page):
                return

            sel = Selector(response)
            requests = []

            # Each card carries a video link and a played-count string.
            for card in sel.xpath('//div[@class="yk-col4"]'):
                link = card.xpath(
                    './div/div[@class="v-link"]/a/@href').extract()
                played = card.xpath(
                    './div/div[@class="v-meta va"]/div[@class="v-meta-entry"]/span/text()'
                ).extract()
                if not (link and played):
                    continue
                count = Util.normalize_played(played[0])
                if int(count) >= int(self.hottest_played_threshold):
                    requests.append(
                        Request(url=link[0],
                                callback=self.parse_episode,
                                meta={'cat_id': cat_id}))

            # Pager: "next" link, page counter bumped in meta.
            pager = sel.xpath(
                '//div[@class="yk-pager"]/ul/li[@class="next"]/a/@href'
            ).extract()
            if pager:
                requests.append(
                    Request(url=self.url_prefix + pager[0],
                            callback=self.parse_page,
                            meta={
                                'page': page + 1,
                                'cat_id': cat_id
                            }))

            return requests
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        """Scrape one youku video page into an EpisodeItem.

        Videos whose owner is in the channel exclusion list are skipped
        entirely.  When a vpaction link is present the item is routed
        through parse_vpaction to pick up the play count; otherwise it
        is emitted directly.
        """
        try:
            log.msg('%s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            items = []
            sel = Selector(response)

            #owner
            owner = sel.xpath(
                '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
            ).extract()
            owner_show_id = None
            if owner:
                owner_show_id = Util.get_owner(owner[0])
                if owner_show_id in self.channel_exclude:
                    log.msg("video owner excluded: %s" % owner_show_id)
                    return
                items.append(Request(url=owner[0], callback=self.parse_owner))

            #video info
            #title = sel.xpath('//div[@class="base_info"]/h1/descendant-or-self::*/text()').extract()
            title = sel.xpath(
                '//div[@class="base_info"]/h1/descendant-or-self::text()'
            ).extract()
            category = sel.xpath(
                '//div[@class="base_info"]/div[@class="guide"]/div/a/text()'
            ).extract()
            # videoId and the tag list live in inline javascript.
            scripts = sel.xpath('//script[@type="text/javascript"]')
            video_id = scripts.re('videoId = \'(\d+)\'')
            tag = scripts.re('tags="(.+)"')
            upload = sel.xpath(
                '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
            ).extract()
            description = sel.xpath(
                '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
            ).extract()
            vp_url = sel.xpath(
                '//span[@id="videoTotalPV"]/../../@href').extract()

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(response.request.url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                t = "".join(title)
                t = t.strip("\n").strip()
                #ep_item['title'] = Util.strip_title("".join(title))
                ep_item['title'] = Util.strip_title(t)
            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            if category:
                # Drop the trailing "channel" suffix from the category name.
                ep_item['category'] = category[0].replace(u'频道', '')
            if upload:
                # Relative text like "3 days ago" is turned into a date.
                t = Util.get_upload_time(upload[0])
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), t)
            if description:
                ep_item['description'] = description[0]

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['cat_id'] = cat_id

            #if video_id:
            #    items.append(Request(url=self.vpaction_url+video_id[0], callback=self.parse_vpaction, meta={'item':ep_item}))
            if vp_url:
                items.append(
                    Request(url=vp_url[0],
                            callback=self.parse_vpaction,
                            meta={'item': ep_item}))
            else:
                items.append(ep_item)

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_vpaction(self, response):
        """Read the total play count off the vpaction page, then chain a
        request to the play-length API for the same show.

        The partially-filled EpisodeItem travels in ``meta['item']``.
        """
        try:
            ep_item = response.request.meta['item']
            selector = Selector(response)

            play_counts = selector.xpath(
                '//ul[@class="player_info"]/li[@class="sum"]/text()').extract()
            if play_counts:
                raw_count = play_counts[0].replace('总播放:', '')
                ep_item['played'] = Util.normalize_played(
                    Util.normalize_vp(raw_count))

            # Follow up with the duration lookup; parse_playlength emits the item.
            return Request(url=self.playlength_url + ep_item['show_id'],
                           callback=self.parse_playlength,
                           meta={'item': ep_item})
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playlength(self, response):
        """Parse the play-length JSON API response into the pending item.

        Expects ``response.body`` to be a JSON document shaped like
        ``{"data": {"video": {"seconds": <number>}}}``.  Stores the whole
        seconds as a string in ``item['duration']`` and returns the item.
        Returns None (after logging) if parsing fails.
        """
        try:
            item = response.request.meta['item']
            jinfo = json.loads(response.body)
            # seconds may arrive as a float or numeric string; truncate to
            # whole seconds.  (The old always-true `if` guard and the unused
            # `showid` local were dropped.)
            item['duration'] = str(int(float(jinfo["data"]["video"]["seconds"])))
            return item
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_owner(self, response):
        """Scrape a Youku user/channel page into a UserItem.

        Bails out (returns None) when the page carries no encoded show id,
        since the record would be unusable downstream.
        """
        try:
            log.msg('%s' % response.request.url)
            sel = Selector(response)
            user_item = UserItem()

            # Both ids are embedded in the page's head scripts.
            head_scripts = sel.xpath('/html/head/script')
            owner_ids = head_scripts.re('ownerId = \"(\d+)\"')
            encoded_ids = head_scripts.re('ownerEncodeid = \'(.+)\'')
            if owner_ids:
                user_item['owner_id'] = owner_ids[0]
            if not encoded_ids:
                return
            user_item['show_id'] = encoded_ids[0]

            # User profile box: display name, play count, fan count.
            profile = sel.xpath('//div[@class="profile"]')
            if profile:
                names = profile.xpath(
                    './div[@class="info"]/div[@class="username"]/a[1]/@title'
                ).extract()
                play_counts = profile.xpath(
                    './div[@class="state"]/ul/li[@class="vnum"]/em/text()'
                ).extract()
                fan_counts = profile.xpath(
                    './div[@class="state"]/ul/li[@class="snum"]/em/text()'
                ).extract()

                if names:
                    user_item['user_name'] = names[0]
                if play_counts:
                    user_item['played'] = Util.normalize_played(
                        Util.normalize_vp(play_counts[0]))
                if fan_counts:
                    user_item['fans'] = Util.normalize_vp(fan_counts[0])

            # Free-text self introduction.
            intro_box = sel.xpath('//div[@class="YK-profile"]')
            if intro_box:
                intro = intro_box.xpath(
                    './div[@class="userintro"]/div[@class="desc"]/p[2]/text()'
                ).extract()
                if intro:
                    user_item['intro'] = ''.join(intro)

            # Uploaded-video count; defaults to '0' when absent.
            home_box = sel.xpath('//div[@class="YK-home"]')
            vcount = '0'
            if home_box:
                counts = home_box.xpath(
                    'div[1]/div/div/div/div[@class="title"]/span/a/text()').re(
                        u'\((\d+)\)')
                if counts:
                    vcount = counts[0]
            user_item['vcount'] = vcount

            user_item['spider_id'] = self.spider_id
            user_item['site_id'] = self.site_id
            user_item['url'] = response.request.url

            return [user_item]
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
Пример #26
0
class PageOrderSpider(CrawlSpider):
    """Crawl manually ordered pages on youku/iqiyi and follow video links.

    Seed pages come from the DB (or a JSON ``orders`` constructor argument);
    every followed video page is parsed into an EpisodeItem tagged with the
    originating order's id, category name, audit flag and priority, which are
    carried through request meta.
    """
    name = 'page_order'
    pipelines = ['MysqlStorePipeline']
    spider_id = "2048"
    format_id = 2
    allowed_domains = [
        "youku.com", "www.youku.com", "www.iqiyi.com", "cache.video.iqiyi.com",
        "www.soku.com", "index.youku.com", "play.youku.com"
    ]
    vpaction_url = "http://v.youku.com/v_vpactionInfo/id/"
    playnum_url = 'http://cache.video.iqiyi.com/jp/pc/'
    playlength_url = "http://cache.video.iqiyi.com/a/"
    youku_playlength_url = "http://play.youku.com/play/get.json?ct=10&vid="

    mgr = DbManager.instance()

    rules = (
        Rule(LinkExtractor(allow=r'http://v.youku.com/v_show/id_.+\.html.*'),
             callback='parse_episode_youku'),
        Rule(LinkExtractor(allow=r'http://www.iqiyi.com/[vw]_.+\.html'),
             callback='parse_episode_iqiyi'),
    )

    def __init__(self, orders=None, *args, **kwargs):
        """Accept orders as a JSON string, otherwise load them from the DB."""
        super(PageOrderSpider, self).__init__(*args, **kwargs)
        if orders:
            orders = json.loads(orders)
        else:
            orders = self.mgr.get_ordered_page(site_name=['iqiyi', 'youku'])
        if orders:
            self._orders = orders
        else:
            self._orders = []

    def _requests_to_follow(self, response):
        """Override CrawlSpider link following to propagate request meta.

        The stock implementation does not copy ``response.request.meta`` onto
        follow-up requests; here the order context (pg_id, cat_name, audit,
        priority, site_id) must survive into the per-video callbacks, so it is
        copied onto every generated request.
        """
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [
                l for l in rule.link_extractor.extract_links(response)
                if l not in seen
            ]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = Request(url=link.url, callback=self._response_downloaded)
                r.meta.update(response.request.meta)
                r.meta.update(rule=n, link_text=link.text)
                yield rule.process_request(r)

    def start_requests(self):
        """Seed one request per ordered page, carrying the order context."""
        try:
            items = []
            for page in self._orders:
                items.append(
                    Request(url=page['url'],
                            meta={
                                'pg_id': page['id'],
                                'cat_name': page['user'],
                                'site_id': page['site_id'],
                                'audit': page['audit'],
                                'priority': page['priority']
                            }))
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_episode_youku(self, response):
        """Parse a youku video page into an EpisodeItem.

        Chains a vpaction request for the play count when the page links one;
        otherwise emits the item directly.
        """
        try:
            logging.log(logging.INFO,
                        "episode_youku:%s" % response.request.url)
            pg_id = response.request.meta['pg_id']
            cat_name = response.request.meta['cat_name']
            site_id = response.request.meta['site_id']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']

            items = []
            #owner
            owner = response.xpath(
                '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
            ).extract()
            owner_show_id = None
            if owner:
                owner_show_id = Util.get_owner(owner[0])

            #video info
            title = response.xpath(
                '//div[@class="base_info"]/h1/descendant-or-self::text()'
            ).extract()
            scripts = response.xpath('//script[@type="text/javascript"]')
            video_id = scripts.re('videoId = \'(\d+)\'')
            tag = scripts.re('tags="(.+)"')
            upload = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
            ).extract()
            description = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
            ).extract()
            vp_url = response.xpath(
                '//span[@id="videoTotalPV"]/../../@href').extract()

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(response.request.url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                t = "".join(title)
                t = t.strip("\n").strip()
                ep_item['title'] = Util.strip_title(t)
            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            # Category comes from the order record, not the page.
            ep_item['category'] = cat_name
            if upload:
                t = Util.get_upload_time(upload[0])
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), t)
            if description:
                ep_item['description'] = description[0]

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = site_id
            ep_item['url'] = response.request.url
            ep_item['pg_id'] = pg_id
            ep_item['audit'] = audit
            ep_item['format_id'] = self.format_id
            ep_item['priority'] = priority

            if vp_url:
                items.append(
                    Request(url=vp_url[0],
                            callback=self.parse_vpaction,
                            meta={'item': ep_item}))
            else:
                items.append(ep_item)
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_vpaction(self, response):
        """Read the total play count, then chain the youku duration lookup."""
        try:
            logging.log(logging.INFO,
                        "parse_vpaction:%s" % response.request.url)
            item = response.request.meta['item']

            vp = response.xpath(
                '//ul[@class="player_info"]/li[@class="sum"]/text()').extract(
                )
            if vp:
                item['played'] = Util.normalize_played(
                    Util.normalize_vp(vp[0].replace('总播放:', '')))

            show_id = item['show_id']
            item = Request(url=self.youku_playlength_url + show_id,
                           callback=self.parse_youku_playlength,
                           meta={'item': item})
            return item
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_youku_playlength(self, response):
        """Parse the youku play JSON; store whole seconds in item['duration']."""
        try:
            logging.log(logging.INFO,
                        "parse_youku_playlength:%s" % response.request.url)
            item = response.request.meta['item']

            jinfo = json.loads(response.body)
            playlength = str(int(float(jinfo["data"]["video"]["seconds"])))
            if playlength:
                item['duration'] = str(playlength)
            return item
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_episode_iqiyi(self, response):
        """Parse an iqiyi video page into an EpisodeItem.

        Title/tag/upload-time markup varies across iqiyi templates, hence the
        cascade of fallback xpaths.  Chains the album play-length lookup when
        an album id is present on the page.
        """
        try:
            # Fixed copy-paste bug: this used to log "parse_youku_playlength".
            logging.log(logging.INFO,
                        "parse_episode_iqiyi:%s" % response.request.url)
            pg_id = response.request.meta['pg_id']
            cat_name = response.request.meta['cat_name']
            site_id = response.request.meta['site_id']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']

            items = []

            #show_id
            show_id = Util.get_iqiyi_showid(response.request.url)
            albumid = response.selector.re(re.compile(r'albumId: ?(\d+)'))

            #video info
            title = response.xpath(
                '//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()'
            ).extract()
            if not title:
                title = response.xpath(
                    '//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()'
                ).extract()
            if not title:
                title = response.xpath(
                    '//div[@class="mod-play-t**s"]/h1/descendant-or-self::*/text()'
                ).extract()
            if not title:
                title = response.xpath(
                    '//div[@class="play-tit play-tit-oneRow play-tit-long"]/h1/descendant-or-self::*/text()'
                ).extract()

            upload_time = response.xpath(
                '//div[@class="crumb_bar"]/span[3]/span/text()').extract()
            if not upload_time:
                upload_time = response.xpath(
                    '//div[@class="crumb_bar"]/span[2]/span/text()').extract()

            tag = response.xpath(
                '//span[@id="widget-videotag"]/descendant::*/text()').extract(
                )
            if not tag:
                tag = response.xpath(
                    '//span[@class="mod-tags_item vl-block"]/descendant::*/text()'
                ).extract()
            if not tag:
                tag = response.xpath(
                    '//div[@class="crumb_bar"]/span[2]/a/text()').extract()

            ep_item = EpisodeItem()

            if title:
                ep_item['title'] = "".join([t.strip() for t in title])
            if show_id:
                ep_item['show_id'] = show_id
            if tag:
                ep_item['tag'] = "|".join([t.strip() for t in tag])
            if upload_time:
                ep_item['upload_time'] = upload_time[0].strip()
            # Category comes from the order record, not the page.
            ep_item['category'] = cat_name
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = site_id
            ep_item['pg_id'] = pg_id
            ep_item['audit'] = audit
            ep_item['url'] = response.request.url
            ep_item['format_id'] = self.format_id
            ep_item['priority'] = priority

            if albumid:
                items.append(
                    Request(url=self.playlength_url + albumid[0],
                            callback=self.parse_playlength,
                            meta={
                                'item': ep_item,
                                'albumid': albumid[0]
                            }))
            else:
                items.append(ep_item)

            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_playlength(self, response):
        """Parse the iqiyi album info (JSONP-ish body) for the duration.

        The body is "...AlbumInfo=<json>"; everything after the marker is
        parsed as JSON.  Chains the play-count lookup for the same album.
        """
        try:
            logging.log(logging.INFO,
                        "parse_playlength:%s" % response.request.url)
            item = response.request.meta['item']
            albumid = response.request.meta['albumid']

            items = []
            msg = response.body
            index = msg.find("AlbumInfo=") + len("AlbumInfo=")
            info = msg[index:]
            jinfo = json.loads(info)
            playlength = jinfo["data"]["playLength"]
            if playlength:
                item['duration'] = str(playlength)

            items.append(
                Request(url=self.playnum_url + albumid + "/?qyid=",
                        callback=self.parse_playnum,
                        meta={'item': item}))
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_playnum(self, response):
        """Extract the play count from the cache API and emit the item."""
        try:
            logging.log(logging.INFO,
                        "parse_playnum:%s" % response.request.url)
            item = response.request.meta['item']

            items = []
            tplaynum = response.selector.re(re.compile(r':(\d+)'))
            if tplaynum:
                item['played'] = str(tplaynum[0])
                items.append(item)
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
Пример #27
0
class YoukuSearchVideoSpider(Spider):
    """Search youku via soku keyword result pages and scrape each video.

    Keyword seed URLs come from the DB (or a JSON string passed as the
    ``kwargs`` spider argument); audit/category/priority context travels
    through request meta into every emitted EpisodeItem.
    """
    name = "youku_search_video"
    pipelines = ['MysqlStorePipeline']
    spider_id = "1024"
    site_id = "1"
    format_id = 2
    url_prefix = 'http://www.soku.com'
    vpaction_url = "http://v.youku.com/v_vpactionInfo/id/"
    playlength_url = "http://play.youku.com/play/get.json?ct=10&vid="
    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        """Load keyword seeds from the 'kwargs' argument (JSON) or the DB."""
        super(YoukuSearchVideoSpider, self).__init__(*args, **kwargs)
        keywords = kwargs.get('kwargs')
        if keywords:
            keywords = json.loads(keywords)
        else:
            keywords = self.mgr.get_keyword_url(site_name='youku')
        if keywords:
            self._keywords = keywords
        else:
            self._keywords = []

    def start_requests(self):
        """Seed one search-result request per keyword."""
        try:
            items = []
            for kw in self._keywords:
                items.append(Request(url=kw['url'], callback=self.parse, meta={'audit': kw['audit'], 'cat_name': kw['user'], 'kw_id': kw['id'], 'priority': kw['priority']}))
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse(self, response):
        """Parse a soku result page; schedule one episode request per hit."""
        try:
            logging.log(logging.INFO, "parse:%s" % response.request.url)
            audit = response.request.meta['audit']
            cat_name = response.request.meta['cat_name']
            kw_id = response.request.meta['kw_id']
            priority = response.request.meta['priority']

            items = []

            #video items
            yk_v = response.xpath('//div[@class="sk-vlist clearfix"]/div[@class="v"]')
            for v in yk_v:
                url = v.xpath('./div[@class="v-meta va"]/div[@class="v-meta-title"]/a/@href').extract()
                thumb_urls = v.xpath('./div[@class="v-link"]/a/@href').extract()
                if thumb_urls:
                    thumb_url = thumb_urls[0]
                    # Placeholder host means soku had no real thumbnail.
                    if thumb_url == 'http://g1.ykimg.com/':
                        thumb_url = None
                else:
                    thumb_url = None
                pl = v.xpath('./div[@class="v-meta va"]/div[@class="v-meta-entry"]/div/label[text()="%s"]/../span/text()' % u'播放: ').extract()
                if pl:
                    pld = Util.normalize_played(pl[0])
                    played = int(pld)
                else:
                    played = None
                if url:
                    items.append(Request(url=url[0], callback=self.parse_episode, meta={'audit': audit, 'thumb_url': thumb_url, 'played': played, 'cat_name': cat_name, 'kw_id': kw_id, 'priority': priority}))
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_episode(self, response):
        """Parse a youku video page into an EpisodeItem.

        Also schedules a parse of the uploader's channel page, and chains a
        vpaction request for a more precise play count when available.
        """
        try:
            logging.log(logging.INFO, 'episode:%s' % response.request.url)
            audit = response.request.meta['audit']
            thumb_url = response.request.meta['thumb_url']
            played = response.request.meta['played']
            cat_name = response.request.meta['cat_name']
            kw_id = response.request.meta['kw_id']
            priority = response.request.meta['priority']

            items = []

            #owner
            owner = response.xpath('//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href').extract()
            owner_show_id = None
            if owner:
                owner_show_id = Util.get_owner(owner[0])
                items.append(Request(url=owner[0], callback=self.parse_owner))

            #video info
            title = response.xpath('//div[@class="base_info"]/h1/descendant-or-self::text()').extract()
            scripts = response.xpath('//script[@type="text/javascript"]')
            video_id = scripts.re('videoId = \'(\d+)\'')
            tag = scripts.re('tags="(.+)"')
            upload = response.xpath('//div[@class="yk-videoinfo"]/div[@class="time"]/text()').extract()
            description = response.xpath('//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()').extract()
            vp_url = response.xpath('//span[@id="videoTotalPV"]/../../@href').extract()

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(response.request.url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                t = "".join(title)
                t = t.strip("\n").strip()
                ep_item['title'] = Util.strip_title(t)
            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            # Category comes from the keyword record, not the page.
            ep_item['category'] = cat_name
            if upload:
                t = Util.get_upload_time(upload[0])
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(datetime.now(), t)
            if description:
                ep_item['description'] = description[0]

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['audit'] = audit
            ep_item['format_id'] = self.format_id
            ep_item['thumb_url'] = thumb_url
            ep_item['played'] = played
            ep_item['kw_id'] = kw_id
            ep_item['priority'] = priority

            if vp_url:
                items.append(Request(url=vp_url[0], callback=self.parse_vpaction, meta={'item':ep_item}))
            else:
                items.append(ep_item)

            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_vpaction(self, response):
        """Read the total play count, then chain the duration lookup."""
        try:
            logging.log(logging.INFO, 'vpaction:%s' % response.request.url)
            item = response.request.meta['item']

            vp = response.xpath('//ul[@class="player_info"]/li[@class="sum"]/text()').extract()
            if vp:
                item['played'] = Util.normalize_played(Util.normalize_vp(vp[0].replace('总播放:', '')))

            show_id = item['show_id']
            item = Request(url=self.playlength_url+show_id, callback=self.parse_playlength, meta={'item':item})
            return item
        except Exception as e:
            # Fixed: previously called log.msg(...) — the only place in this
            # class using the old scrapy log API instead of logging.
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_playlength(self, response):
        """Parse the play JSON; store whole seconds in item['duration']."""
        try:
            logging.log(logging.INFO, 'playlength:%s' % response.request.url)
            item = response.request.meta['item']

            # (Unused `showid` local removed.)
            jinfo = json.loads(response.body)
            plsylength = str(int(float(jinfo["data"]["video"]["seconds"])))
            if plsylength:
                item['duration'] = str(plsylength)
            return item
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_owner(self, response):
        """Scrape a youku uploader/channel page into a UserItem.

        Returns None when the page has no encoded show id.
        """
        try:
            logging.log(logging.INFO, "owner:%s" % response.request.url)
            items = []
            user_item = UserItem()
            #owner id
            script = response.xpath('/html/head/script')
            owner_id = script.re('ownerId = \"(\d+)\"')
            show_id = script.re('ownerEncodeid = \'(.+)\'')
            if owner_id:
                user_item['owner_id'] = owner_id[0]
            if show_id:
                user_item['show_id'] = show_id[0]
            else:
                return

            #user profile
            up = response.xpath('//div[@class="profile"]')
            if up:
                user_name = up.xpath('./div[@class="info"]/div[@class="username"]/a[1]/@title').extract()
                played = up.xpath('./div[@class="state"]/ul/li[@class="vnum"]/em/text()').extract()
                fans = up.xpath('./div[@class="state"]/ul/li[@class="snum"]/em/text()').extract()

                if user_name:
                    user_item['user_name'] = user_name[0]
                if played:
                    user_item['played'] = Util.normalize_played(Util.normalize_vp(played[0]))
                if fans:
                    user_item['fans'] = Util.normalize_vp(fans[0])

            #youku profile
            yp = response.xpath('//div[@class="YK-profile"]')
            if yp:
                intro = yp.xpath('./div[@class="userintro"]/div[@class="desc"]/p[2]/text()').extract()

                if intro:
                    user_item['intro'] = ''.join(intro)

            #count
            yh = response.xpath('//div[@class="YK-home"]')
            # NOTE(review): the iqiyi spider's parse_owner defaults vcount to
            # '0'; here it stays None — confirm downstream handles both.
            vcount = None
            if yh:
                video_count = yh.xpath('div[1]/div/div/div/div[@class="title"]/span/a/text()').re(u'\((\d+)\)')
                if video_count:
                    vcount = video_count[0]

            user_item['vcount'] = vcount
            user_item['spider_id'] = self.spider_id
            user_item['site_id'] = self.site_id
            user_item['url'] = response.request.url

            items.append(user_item)
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
Пример #28
0
class Acfun_cat(Spider):
    """Crawl configured acfun category listings into EpisodeItems.

    Durations are read off the listing page when present ("MM:SS"); otherwise
    the per-video getVideo API is queried.  Item assembly is shared between
    both paths via ``_build_item`` (the two code paths previously duplicated
    the whole EpisodeItem construction).
    """
    name = "acfun_cat"
    pipelines = ['MysqlStorePipeline']
    spider_id = "3"
    site_id = "12"
    max_search_page = 1
    request_url = "http://www.acfun.tv/dynamic/channel/1.aspx?channelId=%s&orderBy=0&pageSize=16"
    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        """Load the configured acfun category pages from the database."""
        super(Acfun_cat, self).__init__(*args, **kwargs)
        self._cat_urls = []
        try:
            self._cat_urls = self.mgr.get_cat_url('acfun')
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def start_requests(self):
        """Seed one listing request per configured category."""
        try:
            items = []

            for cat in self._cat_urls:
                url = self.request_url % Util.get_acfun_showid(cat['url'])
                items.append(Request(url=url, callback=self.parse_page, meta={'cat_name': cat['cat_name'], 'audit': cat['audit'], 'priority': cat['priority']}))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        """Parse a category listing; schedule one episode request per video."""
        try:
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']

            items = []
            #video items
            qy_v = response.xpath('//body/div')
            for v in qy_v:
                thumb = v.xpath('./a[@class="thumb"]/img/@src').extract()
                url = v.xpath('./a[@class="thumb"]/@href').extract()
                lens = v.xpath('./a[@class="thumb"]/p/text()').extract()
                # Listing shows the duration as "MM:SS"; anything unparsable
                # falls back to 0, which makes parse_episode query the API.
                if lens:
                    try:
                        a, b = lens[0].strip().split(':')
                        lens = int(a)*60 + int(b)
                    except Exception as e:
                        lens = 0
                else:
                    lens = 0
                if url:
                    items.append(Request(url=("http://www.acfun.tv%s" % url[0].strip()), callback=self.parse_episode, meta={'cat_name': cat_name, 'thumb': thumb, 'audit': audit, 'priority': priority, 'lens': lens}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def _build_item(self, title, show_id, tags, thumb_url, cat_name, audit,
                    priority, url, duration):
        """Assemble the EpisodeItem shared by parse_episode/parse_duration.

        title/show_id/tags/thumb_url are the raw extracted lists (possibly
        empty); duration is an int number of seconds.
        """
        ep_item = EpisodeItem()
        if title:
            ep_item['title'] = title[0].strip()
        if show_id:
            ep_item['show_id'] = show_id[0].strip()
        if tags:
            ep_item['tag'] = tags[0].strip()
        if thumb_url:
            ep_item['thumb_url'] = thumb_url[0].strip()
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = url
        ep_item['category'] = cat_name
        ep_item['format_id'] = '2'
        ep_item['audit'] = audit
        ep_item['priority'] = priority
        ep_item['duration'] = duration
        return ep_item

    def parse_episode(self, response):
        """Parse an episode page; query the video API when duration is unknown."""
        try:
            log.msg('parse_episode %s' % response.request.url)
            cat_name = response.request.meta['cat_name']
            thumb_url = response.request.meta['thumb']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            lens = response.request.meta['lens']
            items = []
            show_id = response.xpath('//div[@id="block-data-view"]/@data-aid').extract()
            title = response.xpath('//div[@id="block-data-view"]/@data-title').extract()
            tags = response.xpath('//div[@id="block-data-view"]/@data-tags').extract()
            if lens == 0:
                # No usable listing duration: defer item emission to
                # parse_duration via the getVideo API.
                data_sid = response.xpath('//div[@id="area-part-view"]/div/a/@data-sid').extract()
                if data_sid:
                    second_request = "http://www.acfun.tv/video/getVideo.aspx?id=" + data_sid[0].strip()
                    items.append(Request(url=second_request, callback=self.parse_duration, meta={'cat_name': cat_name, 'thumb': thumb_url, 'audit': audit, 'priority': priority, 'show_id': show_id, 'title': title, 'tags': tags, 'url': response.request.url}))
                return items
            items.append(self._build_item(title, show_id, tags, thumb_url,
                                          cat_name, audit, priority,
                                          response.request.url, lens))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_duration(self, response):
        """Parse the getVideo API JSON and emit the completed item.

        Emits nothing when the API reports failure or carries no duration.
        """
        try:
            items = []
            meta = response.request.meta
            data = json.loads(response.body)
            success = data.get('success')
            # API has been seen returning both a boolean and the string 'false'.
            if not success or success == 'false':
                return items
            duration = data.get('time')
            if not duration:
                return items
            items.append(self._build_item(meta['title'], meta['show_id'],
                                          meta['tags'], meta['thumb'],
                                          meta['cat_name'], meta['audit'],
                                          meta['priority'], meta['url'],
                                          int(duration)))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
Пример #29
0
class ifeng_cat(Spider):
    """Spider that crawls ifeng category listing pages for video episodes.

    Category URLs are loaded from the database (see __init__); each item
    is persisted through the MysqlStorePipeline.
    """
    name = "ifeng_cat"
    # Pipeline names consumed by the project's pipeline dispatcher.
    pipelines = ['MysqlStorePipeline']
    # Numeric identifiers stamped onto every emitted item.
    spider_id = "9"
    site_id = "4"
    max_search_page = 1
    # Shared DB access object; presumably a singleton — see DbManager.
    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        """Load the configured ifeng category URLs from the database."""
        super(ifeng_cat, self).__init__(*args, **kwargs)
        # Start with no categories; keep the empty list if the DB lookup
        # fails so the spider degrades to a no-op instead of crashing.
        self._cat_urls = []
        try:
            cat_urls = self.mgr.get_cat_url('ifeng')
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
        else:
            self._cat_urls = cat_urls

    def start_requests(self):
        """Build the initial requests, one per configured category page.

        Each request carries the category's name, audit flag and priority
        in its meta so downstream callbacks can attach them to the items.
        Returns a list of scrapy Requests (None if building them failed,
        in which case the error is logged).
        """
        try:
            # NOTE(review): dropped a leftover debug `print cat` statement
            # here — Python-2-only syntax and noise in production logs.
            return [
                Request(url=cat['url'],
                        callback=self.parse_page,
                        meta={
                            'cat_name': cat['cat_name'],
                            'audit': cat['audit'],
                            'priority': cat['priority']
                        })
                for cat in self._cat_urls
            ]
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        """Parse one ifeng category listing page into episode-page requests.

        For every video cell in the listing grid, extract the thumbnail,
        the episode URL and the on-page duration label, then schedule
        parse_episode with that data in the request meta.
        """
        try:
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']

            items = []
            # One <li> per video in the listing grid.
            qy_v = response.xpath('//div[@class="listwrap"]/div/ul/li')
            for v in qy_v:
                thumb = v.xpath('./div[@class="pic"]/a/img/@src').extract()
                url = v.xpath('./div[@class="pic"]/a/@href').extract()
                lens = v.xpath('./div[@class="pic"]/span[@class="sets"]/text()'
                               ).extract()
                duration = self._parse_duration_label(lens[0] if lens else '')
                if url:
                    items.append(
                        Request(url=url[0].strip(),
                                callback=self.parse_episode,
                                meta={
                                    'cat_name': cat_name,
                                    'thumb': thumb,
                                    'audit': audit,
                                    'priority': priority,
                                    'lens': duration
                                }))
            return items
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)

    @staticmethod
    def _parse_duration_label(label):
        """Convert a colon-separated duration label into seconds.

        Accepts 'mm:ss' as before, but also 'hh:mm:ss' (the original
        two-field unpack silently turned longer labels into 0).
        Returns 0 for empty or unparsable labels.
        """
        label = label.strip()
        if not label:
            return 0
        seconds = 0
        try:
            for part in label.split(':'):
                seconds = seconds * 60 + int(part)
        except ValueError:
            return 0
        return seconds

    def parse_episode(self, response):
        """Parse an ifeng episode page into a single EpisodeItem.

        Category name, thumbnail, audit flag, priority and duration arrive
        via the request meta set in parse_page; title/tags/upload time are
        scraped from the page itself.
        """
        try:
            log.msg('parse_episode %s' % response.request.url)
            meta = response.request.meta
            cat_name = meta['cat_name']
            thumb_url = meta['thumb']
            audit = meta['audit']
            priority = meta['priority']
            lens = meta['lens']

            show_id = Util.get_ifeng_showid(response.request.url)
            title = response.xpath(
                '//head/meta[@property="og:title"]/@content').extract()
            tags = response.xpath('//div[@class="protag"]/a/text()').extract()
            upload_time = response.xpath(
                '//div[@class="vTit_wrap"]/div/p/span[@class="data"]/text()'
            ).extract()

            ep_item = EpisodeItem()
            # Optional fields: only set when the page actually provided them.
            if title:
                ep_item['title'] = title[0].strip()
            if show_id:
                ep_item['show_id'] = show_id
            if tags:
                ep_item['tag'] = '|'.join(tags)
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()
            if upload_time:
                ep_item['upload_time'] = upload_time[0]

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority
            ep_item['duration'] = lens

            return [ep_item]
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)