Exemplo n.º 1
0
 def text_infos_resolve(labels, infos, mediaItem):
     """Map one scraped label/value pair onto mediaItem in place.

     labels: list whose first entry is the field label text (e.g. u'导演').
     infos: list of value strings belonging to that label.
     mediaItem: dict-like item to update; nothing happens when it is None.
     Returns None; any parsing error is logged and swallowed.
     """
     try:
         if mediaItem is None:  # was `== None`; identity is the idiom for None
             return
         if labels and infos:
             # Collapse the label to a single space-free string before matching.
             label = ''.join(str(labels[0]).splitlines()).replace(' ', '')
             if label.startswith(u'导演'):
                 mediaItem['director'] = Util.join_list_safely(infos)
             elif label.startswith(u'主演'):
                 mediaItem['actor'] = Util.join_list_safely(infos)
             elif label.startswith(u'类型'):
                 mediaItem['type'] = Util.join_list_safely(infos)
             elif label.startswith(u'地区'):
                 mediaItem['district'] = Util.join_list_safely(infos)
             elif label.startswith(u'上映'):
                 # Keep digits only, then parse the concatenation into a date.
                 release_dates = re.findall(r'[\d]+', ''.join(infos))
                 # Guard the empty-match case instead of feeding '' to str2date
                 # (consistent with the sibling text_infos_resolve parsers).
                 if release_dates:
                     mediaItem['release_date'] = Util.str2date(
                         ''.join(release_dates))
             elif label.startswith(u'片长'):
                 durations = re.findall(r'[\d]+', ''.join(infos))
                 if durations:
                     mediaItem['duration'] = ''.join(durations)
             elif label.startswith(u'人气'):
                 scores = re.findall(r'[\d]+', ''.join(infos))
                 if scores:
                     mediaItem['score'] = ''.join(scores)
     except Exception as e:  # py3-compatible except (was `except Exception, e`)
         logging.log(logging.ERROR, traceback.format_exc())
Exemplo n.º 2
0
    def api_media_info(self, mediaItem):
        """Fill mediaItem from the album JSON API.

        mediaItem: dict-like item; must carry 'cont_id' used to build the
        API url. Fields title/director/actor/type/district/release_date are
        written when present in the response. Errors are logged and swallowed.
        """
        try:
            api_url = self.other_album_api % (mediaItem['cont_id'], 1)
            result = Util.get_url_content(api_url)
            if not result:
                return
            json_result = json.loads(result)
            desc = json_result['body']['intro']['desc']
            mediaItem['title'] = desc['nameCn']
            if 'directory' in desc:
                director_list = desc['directory'].split(",")
                mediaItem['director'] = Util.join_list_safely(director_list)
            if 'starring' in desc:
                actor_list = desc['starring'].split(",")
                mediaItem['actor'] = Util.join_list_safely(actor_list)
            if 'subCategory' in desc:
                type_list = desc['subCategory'].split(",")
                # BUG FIX: this was written back into desc['type'] instead of
                # the item -- store on mediaItem like every sibling field.
                mediaItem['type'] = Util.join_list_safely(type_list)
            if 'area' in desc:
                district_list = desc['area'].split(",")
                mediaItem['district'] = Util.join_list_safely(district_list)
            if 'releaseDate' in desc:
                mediaItem['release_date'] = Util.str2date(
                    str(desc['releaseDate']))

        except Exception as e:  # py3-compatible except (was `except Exception, e`)
            logging.log(logging.ERROR, traceback.format_exc())
Exemplo n.º 3
0
    def api_episode_info(self, mvItem=None, playlistId='', cat_id=''):
        """Complete mvItem['media'] from the album API and attach its videos.

        mvItem and playlistId are expected to be non-empty; mvItem['media']
        should already carry mid or sid/untrack_id plus channel_id/site_id.
        Returns a list holding mvItem when its videos resolve and the url
        check passes, otherwise an empty list. Errors are logged and swallowed.
        """
        collected = []
        try:
            item = mvItem
            media = item["media"]

            api_url = self.album_api % (playlistId, 1)
            logging.log(logging.INFO, 'api_episode_info, info url %s' % api_url)
            raw = self.httpdownload.get_data(api_url)
            raw = raw.decode('gbk').encode('utf-8')
            album = json.loads(raw)

            actors = album.get("mainActors")
            directors = album.get("directors")
            categories = album.get("categories")
            # Only fill fields that are still missing on the media item.
            if actors and "actor" not in media:
                media["actor"] = Util.join_list_safely(actors)
            if directors and "director" not in media:
                media["director"] = Util.join_list_safely(directors)
            if categories and "type" not in media:
                media["type"] = Util.join_list_safely(categories)
            if "title" not in media:
                media["title"] = album.get("albumName")
            if "district" not in media:
                media["district"] = album.get("area")
            if "release_date" not in media and album.get("publishYear"):
                media["release_date"] = Util.str2date(
                    str(album.get("publishYear")))
            if "intro" not in media:
                media["intro"] = album.get("albumDesc")
            # Replace the poster when absent or blank after stripping.
            if "poster_url" not in media or not str(media["poster_url"]).strip():
                media["poster_url"] = album.get("pic240_330")
            if "cont_id" not in media:
                media["cont_id"] = playlistId

            videos = []
            if media['title']:
                item['media'] = media
                videos = self.parse_video_item(cat_id, playlistId)
            if videos:
                item['video'] = videos
                if "url" not in item["media"]:
                    item["media"]["url"] = videos[0]['url']
                item["media"]["info_id"] = Util.md5hash(
                    Util.summarize(item["media"]))
                Util.set_ext_id(item["media"], item["video"])
                if self.check_url(item):
                    collected.append(item)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return collected
Exemplo n.º 4
0
    def api_media_info(self, mediaVideoItem, vid, prefix_video_url):
        """Fill mediaVideoItem['media'] from the media-info API, then fetch
        the video list for it.

        vid: media id substituted into self.media_info_url.
        prefix_video_url: forwarded to api_video_list.
        Errors are logged (together with vid) and swallowed.
        """
        mediaItem = mediaVideoItem[
            'media'] if 'media' in mediaVideoItem else MediaItem()
        try:
            miu = self.media_info_url % vid
            jdata = self.httpdownload.get_data(miu)
            # was `if not jdata: pass else:` -- inverted for clarity
            if jdata:
                ddata = json.loads(jdata)
                # NOTE: asserts double as control flow here -- on failure the
                # AssertionError is caught and logged below. Beware that
                # asserts are stripped under `python -O`.
                assert int(ddata.get('code', 202)) == 200, "接口获取媒体信息失败"
                detail = ddata.get('data').get('detail')
                assert isinstance(detail, dict)  # was `type(detail) == dict`
                mediaItem['cont_id'] = str(detail.get('collectionId'))
                mediaItem['title'] = detail.get('collectionName')
                mediaItem['director'] = Util.join_list_safely(
                    detail.get('director').split('/'))
                mediaItem['actor'] = Util.join_list_safely(
                    detail.get('player').split('/'))
                mediaItem['release_date'] = Util.str2date(
                    detail.get('publishTime'))
                mediaItem['vcount'] = int(detail.get('totalvideocount'))
                latest = detail.get('lastseries')
                # raw string (was '\D*(\d+)\D*' -- a py3 invalid-escape warning)
                m = re.compile(r'\D*(\d+)\D*').match(latest)
                if m:
                    mediaItem['latest'] = m.group(1)
                if mediaItem['vcount'] == 1:
                    mediaItem['latest'] = 1
                mediaItem['paid'] = detail.get('isvip')
                mediaItem['intro'] = detail.get('desc')
                mediaItem['poster_url'] = detail.get('image')
                mediaItem['site_id'] = self.site_id
                mediaItem['channel_id'] = self.channels_name_id[
                    mediaItem['channel_id']]
                mediaItem['info_id'] = Util.md5hash(Util.summarize(mediaItem))

                # Fall back to a single video when the count is missing/0.
                vcount = int(mediaItem['vcount']) if mediaItem['vcount'] else 1
                video_list = self.api_video_list(vid, vcount, prefix_video_url,
                                                 mediaItem['channel_id'])
                if video_list:
                    Util.set_ext_id(mediaItem, video_list)
                mediaVideoItem['video'] = video_list
                mediaVideoItem['media'] = mediaItem
        except Exception as e:  # py3-compatible except (was `except Exception, e`)
            logging.log(logging.ERROR, traceback.format_exc())
            logging.log(logging.ERROR, vid)
Exemplo n.º 5
0
 def resolve_media_info(self, xpara, mediaItem, ismovie=False):
     """Populate mediaItem from an XML parameter element.

     xpara: ElementTree-like element with child tags title/catalog/director/
     act/area/year/duration/pay/content/imgurl/mark/vsTitle.
     ismovie: when True, also parse <duration>.
     Errors are logged and swallowed; mediaItem is updated in place.
     """
     try:
         title = xpara.find('title')
         if title is not None:
             mediaItem['title'] = title.text
         tag = xpara.find('catalog')
         if tag is not None and tag.text is not None:
             mediaItem['type'] = Util.join_list_safely(tag.text.split(','))
         director = xpara.find('director')
         if director is not None and director.text is not None:
             mediaItem['director'] = Util.join_list_safely(
                 director.text.split(','))
         actor = xpara.find('act')
         if actor is not None and actor.text is not None:
             mediaItem['actor'] = Util.join_list_safely(
                 actor.text.split(','))
         district = xpara.find('area')
         if district is not None and district.text is not None:
             mediaItem['district'] = district.text
         release_date = xpara.find('year')
         if release_date is not None and release_date.text is not None:
             # <year>0</year> can occur, which leaves release_date empty
             mediaItem['release_date'] = Util.str2date(release_date.text)
         if ismovie:
             duration = xpara.find('duration')
             if duration is not None and duration.text is not None:
                 mediaItem['duration'] = int(float(duration.text))
         paid = xpara.find('pay')
         if paid is not None and paid.text is not None:
             mediaItem['paid'] = int(float(paid.text))
         intro = xpara.find('content')
         if intro is not None and intro.text is not None:
             mediaItem['intro'] = intro.text
         poster_url = xpara.find('imgurl')
         if poster_url is not None and poster_url.text is not None:
             mediaItem['poster_url'] = poster_url.text
         score = xpara.find('mark')
         if score is not None and score.text is not None:
             mediaItem['score'] = float(score.text)
         latest = xpara.find('vsTitle')
         if latest is not None and latest.text is not None:
             # BUG FIX: was r'[\d+]' -- a character class matching a digit OR
             # a literal '+'. r'[\d]+' captures digit runs only, matching the
             # pattern used everywhere else in this file.
             l = re.findall(r'[\d]+', latest.text)
             if l:
                 mediaItem['latest'] = ''.join(l)
     except Exception as e:  # py3-compatible except (was `except Exception, e`)
         logging.log(logging.ERROR, traceback.format_exc())
Exemplo n.º 6
0
 def text_infos_resolve(labels, infos, mediaItem):
     """Map one scraped label/value pair onto mediaItem in place.

     labels: list whose first entry is the field label text (e.g. u'更新').
     infos: list of value strings belonging to that label.
     mediaItem: dict-like item to update; nothing happens when it is None.
     Returns None; any parsing error is logged and swallowed.
     """
     try:
         if mediaItem is None:  # was `== None`; identity is the idiom for None
             return
         if labels and infos:
             # Collapse the label to a single space-free string before matching.
             label = ''.join(str(labels[0]).splitlines()).replace(' ', '')
             if label.startswith(u'更新'):
                 latest = infos[0]
                 latests = latest.split('/')
                 # formats: "共N集/更新至N集" (take the updated-to part) or "共N集"
                 if len(latests) > 1:
                     latest = latests[1]
                 else:
                     latest = latests[0]
                 latests = re.findall(r'[\d]+', latest)
                 mediaItem['latest'] = ''.join(latests)
             elif label.startswith(u'导演'):
                 mediaItem['director'] = Util.join_list_safely(infos)
             elif label.startswith(u'作者') or label.startswith(u'编剧'):
                 mediaItem['writer'] = Util.join_list_safely(infos)
             elif label.startswith(u'主演') or label.startswith(
                     u'配音') or label.startswith(u'主持'):
                 mediaItem['actor'] = Util.join_list_safely(infos)
             elif label.startswith(u'地区'):
                 mediaItem['district'] = Util.join_list_safely(infos)
             elif label.startswith(u'类型'):
                 mediaItem['type'] = Util.join_list_safely(infos)
             elif label.startswith(u'年份') or label.startswith(u'上映'):
                 release_dates = re.findall(r'[\d]+', infos[0])
                 if release_dates:
                     release_date = ''.join(release_dates)
                     mediaItem['release_date'] = Util.str2date(release_date)
             elif label.startswith(u'片长'):
                 durations = re.findall(r'[\d]+', infos[0])
                 if durations:
                     mediaItem['duration'] = durations[0]
     except Exception as e:  # py3-compatible except (was `except Exception, e`)
         logging.log(logging.ERROR, traceback.format_exc())
Exemplo n.º 7
0
 def text_infos_resolve(label_sels, info_sels, mediaItem, ignore=False):
     """Resolve label/info selector pairs onto mediaItem in place.

     label_sels/info_sels: scrapy-style selectors; their text() nodes are
     extracted here before matching.
     ignore: when truthy, skip the director/writer/actor fields.
     Returns None; any parsing error is logged and swallowed.
     """
     try:
         if mediaItem is None:  # was `== None`; identity is the idiom for None
             return
         if not label_sels or not info_sels:
             return
         labels = label_sels.xpath('./text()').extract()
         infos = info_sels.xpath('./text()').extract()
         if labels and infos:
             # Collapse the label to a single space-free string before matching.
             label = ''.join(str(labels[0]).splitlines()).replace(' ', '')
             if label.startswith(u'导演') and not ignore:
                 mediaItem['director'] = Util.join_list_safely(infos)
             elif label.startswith(u'编剧') and not ignore:
                 mediaItem['writer'] = Util.join_list_safely(infos)
             elif label.startswith(u'主演') and not ignore:
                 mediaItem['actor'] = Util.join_list_safely(infos)
             elif label.startswith(u'类型'):
                 # type and district share one block on the page; split them
                 # apart by the shape of the link targets
                 type_infos = info_sels.xpath(
                     './@href[re:test(., "/mdb/film/list/mtype-[\d]+.*")]/../text()'
                 ).extract()
                 district_infos = info_sels.xpath(
                     './@href[re:test(., "/mdb/film/list/country-[\w]+.*")]/../text()'
                 ).extract()
                 mediaItem['type'] = Util.join_list_safely(type_infos)
                 mediaItem['district'] = Util.join_list_safely(
                     district_infos)
             elif label.startswith(u'上映'):
                 info = ''.join(infos)
                 release_dates = re.findall(r'[\d]+', info)
                 if release_dates:
                     release_date = ''.join(release_dates)
                     mediaItem['release_date'] = Util.str2date(release_date)
     except Exception as e:  # py3-compatible except (was `except Exception, e`)
         logging.log(logging.ERROR, traceback.format_exc())
Exemplo n.º 8
0
 def media_info_extract(response, mediaItem):
     """Extract iqiyi media metadata from a page/script response into mediaItem.

     Handles three sources seen in the markup: the data-qitancomment-tvyear
     attribute, an inline text/javascript block (vType/sourceId/albumId/cid/
     title/albumInfo), and the visible result_detail info panel.
     mediaItem is updated in place; a fresh MediaItem is created when None.
     """
     try:
         if mediaItem == None:
             mediaItem = MediaItem()
         # Ordinary media page: release year carried in a data attribute.
         release_dates = response.xpath('./@data-qitancomment-tvyear').extract()
         if release_dates:
             release_dates = re.findall(r'[\d]+', release_dates[0])
             if release_dates:
                 release_date = ''.join(release_dates)
                 release_date = Util.str2date(release_date)
                 mediaItem['release_date'] = release_date
         class_names = response.xpath('./@type').extract()
         if class_names and 'text/javascript' == class_names[0]:
             # video type -- video: feature film, trailer: clip
             regex_express = "vType[ ]?:[ ]?[']?(\w+)[']"
             match_result = response.re(regex_express)
             if match_result:
                vType  = match_result[0]
                if vType.strip() != 'video':
                     # not a feature -- skip the whole script block
                     return
             regex_express = 'sourceId[ ]?:[ ]?["]?(\d+)'
             # sourceId is used by default
             cont_id = '0'
             regex_express = 'sourceId[ ]?:[ ]?["]?(\d+)'
             match_result = response.re(regex_express)
             if match_result:
                 cont_id = match_result[0]
             if cont_id == '0':
                 # other pages use albumId instead
                 regex_express = 'albumId[ ]?:[ ]?["]?(\d+)'
                 match_result = response.re(regex_express)
                 if match_result:
                     cont_id = match_result[0]
                     mediaItem['cont_id'] = '%s|album_id' % (cont_id)
             else:
                 mediaItem['cont_id'] = '%s|source_id' % (cont_id)
             regex_express = 'cid[ ]?:[ ]?(\d+)'
             match_result = response.re(regex_express)
             if match_result:
                 cid = match_result[0]
                 mediaItem['channel_id'] = cid
             regex_express = 'title[ ]?:[ ]?\"(.*)\"'
             match_result = response.re(regex_express)
             if match_result:
                 title = match_result[0]
                 mediaItem['title'] = title
             # special series page, e.g. http://www.iqiyi.com/dianshiju/18jbj.html#vfrm=2-4-0-1
             regex_express = 'albumInfo[ ]?=[ ]?(\{.*\})'
             match_result = response.re(regex_express)
             if match_result:
                 json_content = match_result[0]
                 try:
                     json_data = json.loads(json_content)
                     cont_ids = '0'
                     cont_ids = json_data['sourceId']
                     if cont_ids != '0':
                         cont_ids = '%s|source_id' % (cont_ids)
                         mediaItem['cont_id'] = cont_ids
                     else:
                         cont_ids = json_data['albumId']
                         cont_ids = '%s|album_id' % (cont_ids)
                         mediaItem['cont_id'] = cont_ids
                     districts = json_data['areas']
                     types = json_data['types']
                     directors = json_data['directors']
                     actors = json_data['mainActors']
                     writers = json_data['writer']
                     titles = json_data['tvName']
                     poster_urls = json_data['tvPictureUrl']
                     vcounts = json_data['episodeCounts']
                     latests = json_data['currentMaxEpisode']
                     release_dates = json_data['issueTime']
                     intros = json_data['tvDesc']
                     # areas/types come as nested JSON strings -- decode first
                     if districts:
                         districts_json = json.loads(districts)
                         districts = districts_json.values()
                         mediaItem['district'] = Util.join_list_safely(districts)
                     if types:
                         types_json = json.loads(types)
                         types = types_json.values()
                         mediaItem['type'] = Util.join_list_safely(types)
                     mediaItem['director'] = Util.join_list_safely(directors)
                     mediaItem['actor'] = Util.join_list_safely(actors)
                     mediaItem['writer'] = Util.join_list_safely(writers)
                     mediaItem['title'] = titles
                     mediaItem['poster_url'] = poster_urls
                     mediaItem['vcount'] = vcounts
                     mediaItem['latest'] = latests
                     release_dates = str(release_dates)
                     release_date = Util.str2date(release_dates)
                     mediaItem['release_date'] = release_date
                     mediaItem['intro'] = intros
                 except Exception, e:
                     # log the offending payload for later inspection
                     logging.log(logging.ERROR, traceback.format_exc())
                     logging.log(logging.INFO, '=================json_content=================')
                     logging.log(logging.INFO, json_content)
         # ordinary media page - media info panel
         # (1) http://www.iqiyi.com/a_19rrgjaiqh.html#vfrm=2-4-0-1
         #   episode-count handling is complicated and deliberately skipped here
         sels = response.xpath('.//div[@class="result_pic pr"]')
         if sels:
             poster_urls = sels.xpath('.//a/img/@src').extract()
             if poster_urls:
                 mediaItem['poster_url'] = poster_urls[0]
         sels = response.xpath('.//div[@class="result_detail"]')
         if sels:
             titles = sels.xpath('.//h1[@class="main_title"]//a/text()').extract()
             scores = sels.xpath('.//div[@class="topic_item topic_item-rt"]//span[@class="score_font"]//span/text()').extract()
             scores = ''.join(scores)
             scores = re.findall(r'[\d.]+', scores)
             if titles:
                 mediaItem['title'] = titles[0]
             if scores:
                 try:
                     mediaItem['score'] = float(scores[0])
                 except Exception, e:
                     # non-numeric score text -- leave the field unset
                     pass
             msg_sels = sels.xpath('.//div[@class="topic_item clearfix"]')
             for msg_sel in msg_sels:
                 msg_more_sels = msg_sel.xpath('./div')
                 for sel in msg_more_sels:
                     labels = sel.xpath('.//em/text()').extract()
                     infos = sel.xpath('.//em/a/text()').extract()
                     iqiyi_extract.text_infos_resolve(labels, infos, mediaItem)
             intros = sels.xpath('.//div[@class="topic_item clearfix"]//span[@data-moreorless="moreinfo"]/span/text()').extract()
             if not intros:
                 intros = sels.xpath('.//div[@class="topic_item clearfix"]//span[@data-moreorless="lessinfo"]/span/text()').extract()
             if intros:
                 mediaItem['intro'] = intros[0]
     # NOTE(review): the outer `try:` opened at the top of this function has no
     # matching `except` in this snippet -- the definition appears truncated at
     # the extraction boundary; confirm against the original source file.
Exemplo n.º 9
0
    def parse_episode_info(self, response):
        """Parse a baofeng episode page into a MediaVideoItem.

        response: scrapy Response whose request.meta carries cat_id and
        poster_url (and optionally untrack_id/sid).
        Returns a list with at most one MediaVideoItem; any error is logged
        and an empty list is returned.
        """
        items = []
        try:
            logging.log(logging.INFO,
                        'parse_episode_info: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']
            # optional tracking keys (dict.get replaces the `in` checks)
            untrack_id = response.request.meta.get('untrack_id', "")
            sid = response.request.meta.get('sid', "")

            title_list = response.xpath(
                '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/h3/a/@title'
            ).extract()
            director_list = response.xpath(
                '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' %
                u'导演:').extract()
            performer_list = response.xpath(
                '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' %
                u'主演:').extract()
            type_list = response.xpath(
                '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' %
                u'类型:').extract()
            district_list = response.xpath(
                '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' %
                u'地区:').extract()
            year_info = response.xpath(
                '//div[@class="info clearfix"]/span[text()="%s"]/text()' %
                u'地区:').extract()
            year = None
            if len(year_info) >= 2:
                year = self.get_year(year_info[1])

            pers = Util.join_list_safely(performer_list)
            dirs = Util.join_list_safely(director_list)
            types = Util.join_list_safely(type_list)
            districts = Util.join_list_safely(district_list)

            # plot summary
            text = response.xpath(
                '//div[@class="juqing briefTab"]/div/text()').extract()
            # score (BUG FIX: the predicate was div[class="score-num"], which
            # tests for a child *element* named "class" and never matches --
            # an attribute test requires "@class")
            # NOTE(review): score is extracted but never stored on the item.
            score = response.xpath(
                '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/div[1]/div[@class="score"]/div[@class="score-num"]/strong/text()'
            ).extract()

            play_url = ""
            tplay_url = response.xpath(
                '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/div[@class="sourcePlay"]/a[@id="moviePlayButton"]/@href'
            ).extract()
            if tplay_url:
                play_url = self.url_prefix + tplay_url[0].strip()

            ep_item = MediaItem()
            if title_list:
                ep_item["title"] = title_list[0]
                if ep_item["title"].find(u'预:') >= 0:
                    # trailer page -- skip (was a py2 `print` statement)
                    logging.log(logging.INFO,
                                u'预告片,url %s' % response.request.url)
                    return items
            ep_item["actor"] = pers
            ep_item["director"] = dirs
            if types:
                ep_item["type"] = types
            if district_list:
                ep_item["district"] = districts
            if year:
                ep_item["release_date"] = Util.str2date(year)

            ep_item["site_id"] = self.site_id
            ep_item["channel_id"] = cat_id
            ep_item["poster_url"] = poster_url
            ep_item["url"] = Util.normalize_url(response.request.url,
                                                "baofeng")

            if text:
                ep_item["intro"] = text[0].strip()

            mvitem = MediaVideoItem()
            mvitem["media"] = ep_item
            mvitem["media"]["cont_id"] = self.getshowid(response.request.url)

            ttvitem = {}
            if title_list:
                ttvitem = self.parse_video_item(response, cat_id, play_url,
                                                title_list, None)
            if ttvitem and 'video' in ttvitem and len(ttvitem['video']) > 0:
                mvitem['video'] = ttvitem['video']
                mvitem["media"]["info_id"] = Util.md5hash(
                    Util.summarize(mvitem["media"]))
                Util.set_ext_id(mvitem["media"], mvitem["video"])
                if untrack_id and sid:
                    mvitem["untrack_id"] = untrack_id
                    mvitem["sid"] = sid
                if self.check_url(mvitem):
                    items.append(mvitem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items
Exemplo n.º 10
0
    def parse_episode_info(self, response):
        """Parse a tudou album page plus its item API into a MediaVideoItem.

        response: scrapy Response whose request.meta carries cat_id,
        poster_url, title, actor (and optionally untrack_id/sid).
        Returns a list with at most one MediaVideoItem; errors are logged and
        an empty list is returned.
        """
        # BUG FIX: `items` must exist before the try block -- it was assigned
        # after several fallible statements, so an early exception made the
        # final `return items` raise UnboundLocalError.
        items = []
        try:
            logging.log(logging.INFO, 'parse_episode_info: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']
            title = response.request.meta['title']
            actor = response.request.meta['actor']
            untrack_id = response.request.meta.get('untrack_id', "")
            sid = response.request.meta.get('sid', "")

            if not poster_url:
                poster_url_list = response.xpath('//div[@class="cover_img"]/div[@class="pack pack_album"]/div[@class="pic"]/img/@src').extract()
                if poster_url_list:
                    poster_url = poster_url_list[0]
            if not title:
                title_list = response.xpath('//div[@class="cover_info"]/h2/strong/@title').extract()
                if title_list:
                    title = title_list[0]
            if not actor:
                actor_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'  % u' 主演:').extract()
                if actor_list:
                    actor = Util.join_list_safely(actor_list)

            # performers
            pers = actor
            type_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'  % u'类型:\n').extract()
            district_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'  % u'地区:').extract()
            release_date_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'  % u'年代:').extract()
            types = Util.join_list_safely(type_list) if type_list else None

            # director: prefer u'编导', fall back to u'导演'
            director_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'  % u'编导:').extract()
            if not director_list:
                director_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'  % u'导演:').extract()
            dirs = Util.join_list_safely(director_list)

            # synopsis
            text = response.xpath('//div[@class="cover_info"]/div[@class="desc"]/p/text()').extract()

            sourceid = self.get_tudou_showid(response.request.url)
            ep_item = MediaItem()

            # ROBUSTNESS: title/pers come from request.meta and may be None;
            # truthiness tests replace len() calls that raised TypeError.
            if title:
                ep_item["title"] = title
            if pers:
                ep_item["actor"] = pers
            if dirs:
                ep_item["director"] = dirs
            if types:
                ep_item["type"] = types
            if district_list:
                ep_item["district"] = district_list[0].strip()
            if release_date_list:
                ep_item["release_date"] = Util.str2date(release_date_list[0])

            ep_item["cont_id"] = sourceid
            ep_item["site_id"] = self.site_id
            ep_item["url"] = response.request.url
            ep_item["channel_id"] = cat_id
            ep_item["poster_url"] = poster_url

            if text:
                ep_item["intro"] = text[0]

            mvitem = MediaVideoItem()
            mvitem["media"] = ep_item
            mvitem["video"] = []

            lurl = "http://www.tudou.com/crp/getAlbumvoInfo.action?charset=utf-8&areaCode=110000&acode=" + str(sourceid)
            info = self.httpdownload.get_data(lurl)
            jinfo = json.loads(info)
            for sitem in jinfo.get("items", []):
                # skip entries without a play url, and trailers
                if not sitem["itemPlayUrl"] or sitem['trailer']:
                    continue
                vitem = VideoItem()
                vitem["title"] = sitem["itemTitle"]
                vitem["vnum"] = sitem["episode"]
                turl = Util.normalize_url(sitem["itemPlayUrl"], "tudou")
                vitem["url"] = turl
                vitem["os_id"] = self.os_id
                vitem["site_id"] = self.site_id
                vitem["ext_id"] = Util.md5hash(turl)
                vitem["cont_id"] = self.get_tudou_showid(turl)
                mvitem["video"].append(vitem)

            if mvitem["video"]:
                Util.set_ext_id(mvitem["media"], mvitem["video"])
                mvitem["media"]["info_id"] = Util.md5hash(Util.summarize(mvitem["media"]))
                if untrack_id:
                    mvitem["untrack_id"] = untrack_id
                if sid:
                    mvitem["sid"] = sid
                if self.check_url(mvitem):
                    items.append(mvitem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items
Exemplo n.º 11
0
 # NOTE(review): this fragment relies on names bound above the visible snippet
 # (response, titles, release_dates, directors, mediaItem) and is cut off
 # mid-branch at the end -- it is a chunk of a larger extractor, not a
 # standalone block.
 # Pull schema.org-style itemprop metadata off the page.
 types = response.xpath('./meta[@itemprop="genre"]/@content').extract()
 actors = response.xpath('./item[@itemprop="actor"]//meta[@itemprop="name"]/@content').extract()
 writers = response.xpath('./meta[@itemprop="author"]/@content').extract()
 intros = response.xpath('./meta[@itemprop="description"]/@content').extract() 
 poster_urls = response.xpath('./item/meta[@itemprop="image"]/@content').extract()
 durations = response.xpath('./item/meta[@itemprop="duration"]/@content').extract()
 vcounts = response.xpath('./meta[@itemprop="numberOfEpisodes"]/@content').extract()
 latests = response.xpath('./meta[@itemprop="newestEpisode"]/@content').extract()
 districts = response.xpath('./item/meta[@itemprop="contentLocation"]/@content').extract()
 scores = response.xpath('./item/div[@itemprop="aggregateRating"]/meta[@itemprop="ratingValue"]/@content').extract()
 if titles:
     mediaItem['title'] = titles[0]
 if release_dates:
     # keep digits only, then parse the concatenation into a date
     release_dates = re.findall(r'[\d]+', release_dates[0]) 
     release_date = ''.join(release_dates)
     release_date = Util.str2date(release_date)
     mediaItem['release_date'] = release_date
 if directors:
     mediaItem['director'] = Util.join_list_safely(directors)
 if types:
     mediaItem['type'] = Util.join_list_safely(types)
 if actors:
     mediaItem['actor'] = Util.join_list_safely(actors)
 if writers:
     mediaItem['writer'] = Util.join_list_safely(writers)
 if intros:
     mediaItem['intro'] = intros[0]
 if poster_urls:
     mediaItem['poster_url'] = poster_urls[0]
 if durations:
     # NOTE(review): truncated here -- the digits are extracted but the
     # assignment to mediaItem['duration'] lies past the snippet boundary.
     durations = re.findall(r'[\d]+', durations[0])
Exemplo n.º 12
0
    def parse_episode_info(self,response):
        """Parse a Youku media page into MediaVideoItem objects.

        Expects ``cat_id`` and ``poster_url`` in the request meta, with
        optional ``untrack_id``/``sid``/``mid`` forwarded onto the item.

        Returns:
            list: at most one MediaVideoItem; empty on error or when the
            page id cannot be extracted (the original returned None here,
            which was inconsistent with the normal list return).
        """
        # Initialized before the try so the final return can never raise
        # NameError when an exception fires early (e.g. a missing meta key).
        items = []
        try:
            logging.log(logging.INFO, 'parse_episode_info: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']
            page_id = self.get_youku_pageid(response.request.url)
            if not page_id:
                log.error('miss content id: %s' % response.request.url)
                return items

            untrack_id = ""
            sid = ""
            mid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']
            if "mid" in response.request.meta:
                mid = response.request.meta['mid']

            title = self.parse_title(response, cat_id)
            performer_list = self.parse_actor(response)
            director_list = self.parse_director(response)
            district_list = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'  % u'地区:').extract()
            type_list = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'  % u'类型:').extract()
            play_date = self.parse_play_date(response)
            total_num = self.parse_total_num(response)

            pers = Util.join_list_safely(performer_list)
            dirs = Util.join_list_safely(director_list)
            types = Util.join_list_safely(type_list)

            # Media intro text.
            text = response.xpath('//div[@class="detail"]/span/text()').extract()

            ep_item = MediaItem()
            if title:
                ep_item["title"] = title[0].strip()
            if pers:
                ep_item["actor"] = pers
            # BUG FIX: was "if dirs > 0:" — a str/int comparison that is
            # always True in Python 2, so an empty director string got stored.
            if dirs:
                ep_item["director"] = dirs
            if types:
                ep_item["type"] = types
            if district_list:
                ep_item["district"] = district_list[0].strip()
            if play_date:
                ep_item["release_date"] = Util.str2date(play_date)
            if total_num:
                ep_item["vcount"] = total_num

            ep_item["site_id"] = self.site_id
            ep_item["channel_id"] = cat_id
            ep_item["poster_url"] = poster_url
            ep_item["url"] = Util.normalize_url(response.request.url,"youku")
            if text:
                ep_item["intro"] = text[0].strip()
            ep_item["cont_id"] = page_id
            ep_item["info_id"] = Util.md5hash(Util.summarize(ep_item))

            mvitem = MediaVideoItem()
            if mid:
                mvitem['mid'] = mid
            mvitem["media"] = ep_item
            if untrack_id:
                mvitem["untrack_id"] = untrack_id
            if sid:
                mvitem["sid"] = sid

            # KeyError here when no title was found is caught below, matching
            # the original behavior (the page is then dropped).
            video_list = self.parse_video_item(response, cat_id, ep_item["title"], page_id)
            mvitem['video'] = video_list
            Util.set_ext_id(mvitem["media"], mvitem["video"])
            items.append(mvitem)

        except Exception as e: 
            logging.log(logging.ERROR, traceback.format_exc())
        return items
Exemplo n.º 13
0
    def list_json_parse(self, response):
        """Parse one page of the Letv list JSON API.

        Emits one Request per listed video (towards ``video_parse``) plus a
        Request for the next page while results keep coming, bounded by
        ``self.max_update_page``.

        Returns:
            list: scrapy Request objects; empty on error or page overflow.
        """
        items = []
        # Bound these before the try so the except handler can log them
        # even when the failure happens on the very first statement.
        request_url = response.request.url
        origin_url = response.request.meta.get('url')
        try:
            logging.log(logging.INFO, 'json api url: %s' % request_url)
            page = response.request.meta.get('page', 1)
            if page > self.max_update_page:
                return items
            channel_id = response.request.meta.get('id')
            list_json_postfix_url = response.request.meta.get('postfix_url')
            json_datas = json.loads(response.body)
            videos = []
            if json_datas:
                videos = json_datas[
                    'data_list'] if 'data_list' in json_datas else []
            if videos:
                # A non-empty result set implies there may be a next page.
                video_url = 'http://www.letv.com/ptv/vplay/%s.html'
                for item in videos:
                    mediaVideoItem = MediaVideoItem()
                    mediaItem = MediaItem()
                    mediaItem['channel_id'] = channel_id
                    if 'rating' in item and item['rating']:
                        mediaItem['score'] = item['rating']
                    subCategoryName = item['subCategoryName']
                    mediaItem['type'] = subCategoryName.replace(',', ';')
                    mediaVideoItem['media'] = mediaItem
                    release_date = item['releaseDate']
                    if release_date:
                        # The API reports epoch milliseconds.
                        release_date = float(release_date)
                        if release_date > 0:
                            release_date = release_date / 1000
                            release_date = time.localtime(release_date)
                            release_date = '%s-%s-%s' % (release_date.tm_year,
                                                         release_date.tm_mon,
                                                         release_date.tm_mday)
                            mediaItem['release_date'] = Util.str2date(
                                release_date)
                    vid = ''
                    if 'vids' in item:
                        # Multiple vids: the first one is used.
                        vid = item['vids'].split(',')[0]
                    elif 'vid' in item:
                        vid = item['vid']
                    if vid:
                        items.append(
                            Request(url=video_url % vid,
                                    callback=self.video_parse,
                                    meta={'item': mediaVideoItem}))

                # Schedule the next page.
                page = page + 1
                url = self.list_json_prefix_url + list_json_postfix_url + 'p=%s' % page
                items.append(
                    Request(url=url,
                            callback=self.list_json_parse,
                            meta={
                                'page': page,
                                'id': channel_id,
                                'postfix_url': list_json_postfix_url,
                                'url': url
                            }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
            logging.log(logging.INFO, 'json api url: %s' % request_url)
            logging.log(logging.INFO, 'origin url: %s' % origin_url)
        # BUG FIX: the original fell off the end and returned None on the
        # success path, silently dropping every Request it had scheduled.
        return items
Exemplo n.º 14
0
    def parse_episode_info(self, response):
        """Parse a QQ video (v.qq.com) media page into MediaVideoItem objects.

        Expects ``cat_id`` and ``poster_url`` in the request meta, with
        optional ``untrack_id``/``sid`` forwarded onto the item.  Several
        page layouts are probed for each field, newest layout first.

        Returns:
            list: at most one MediaVideoItem; empty on error.
        """
        items = []
        try:
            logging.log(logging.INFO,
                        'parse_episode_info: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']
            untrack_id = ""
            sid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']

            # Title: try each known layout in order until one matches.
            title = response.xpath(
                '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_title"]/strong/a/text()'
            ).extract()
            if not title or not title[0]:
                title = response.xpath(
                    '//div[@class="mod_box mod_video_info"]/div[@class="mod_hd mod_hd_border"]/h1/strong/@title'
                ).extract()
                if not title or not title[0]:
                    title = response.xpath(
                        '//div[@class="mod_box mod_video_info"]/div[@class="mod_hd mod_hd_border"]/h2/strong/@title'
                    ).extract()
                    if not title or not title[0]:
                        title = response.xpath(
                            '//div[@class="mod_page_banner"]/div[@class="banner_pic"]/a/@title'
                        ).extract()
            # Performers.
            performer_list = response.xpath(
                '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line cf"]/div[@class="info_cast"]/a/span/text()'
            ).extract()
            if not performer_list:
                performer_list = response.xpath(
                    '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/span/text()'
                    % u'主演:').extract()
            # Directors.
            director_list = response.xpath(
                '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line cf"]/div[@class="info_director"]/a/span/text()'
            ).extract()
            if not director_list:
                director_list = response.xpath(
                    '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/span/text()'
                    % u'导演:').extract()
            # Intro text.
            text = response.xpath(
                '//div[@class="movie_info_wrap"]/div[1]/d1[1]/dd[3]/p[@class="detail_all"]/text()'
            ).extract()
            if not text:
                # BUG FIX: the original evaluated this fallback xpath but
                # never assigned the result, so the intro was always lost
                # when the first layout did not match.
                text = response.xpath(
                    '//div[@class="mod_video_focus"]/div[@class="info_desc"]/span[@class="desc"]/text()'
                ).extract()
            type_list = response.xpath(
                '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line info_line_tags cf"]/div[@class="info_tags"]/a/span/text()'
            ).extract()
            if not type_list:
                type_list = response.xpath(
                    '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/text()'
                    % u'类型:').extract()
            year_info = response.xpath(
                '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_title"]/span[@class="video_current_state"]/span[@class="current_state"]/text()'
            ).extract()
            if not year_info:
                year_info = response.xpath(
                    '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/text()'
                    % u'年份:').extract()
            play_date = None
            if year_info:
                play_date = self.get_year(year_info[0])

            dirs = Util.join_list_safely(director_list)
            types = Util.join_list_safely(type_list)
            pers = Util.join_list_safely(performer_list)

            # sourceid drives the playlist JSON API request built below.
            sourceid = ""
            sourceid_list = response.xpath(
                '//div[@class="mod_bd sourceCont"]/@sourceid').extract()
            if sourceid_list:
                sourceid = sourceid_list[0]

            videoitems = []

            ep_item = MediaItem()

            if len(title) > 0:
                ep_item["title"] = title[0]
            if len(pers) > 0:
                ep_item["actor"] = pers
            if len(dirs) > 0:
                ep_item["director"] = dirs
            if types:
                ep_item["type"] = types
            if play_date:
                ep_item["release_date"] = Util.str2date(play_date)

            ep_item["site_id"] = self.site_id
            ep_item["channel_id"] = cat_id
            ep_item["url"] = Util.normalize_url(response.request.url, "qq")
            ep_item["poster_url"] = poster_url

            if len(text) > 0:
                ep_item["intro"] = text[0]

            mvitem = MediaVideoItem()
            mvitem["media"] = ep_item
            mvitem["video"] = videoitems

            url_pre = "http://s.video.qq.com/loadplaylist?vkey="
            url_tail = "&vtype=2&otype=json&video_type=2&callback=jQuery191048201349820010364_1425370006500&low_login=1"

            videoid = self.get_qq_showid(response.request.url)
            mvitem["media"]["cont_id"] = videoid
            mvitem["media"]["info_id"] = Util.md5hash(
                Util.summarize(mvitem["media"]))
            vurl = url_pre + str(sourceid) + url_tail

            # The JSONP callback name must match the one baked into url_tail.
            tflag = "jQuery191048201349820010364_1425370006500"
            tpitem = self.parse_play_list(cat_id, vurl, tflag, response)
            # No sourceid (e.g. topic pages): fall back to the topic playlist.
            if not tpitem:
                tpitem = self.parse_topic_play_list(response)
                videoids = response.xpath(
                    '//div[@class="mod_episodes_info episodes_info"]/input[@name="cid"]/@value'
                ).extract()
                if videoids:
                    mvitem["media"]["cont_id"] = videoids[0]
            if tpitem:
                mvitem["video"] = tpitem
                Util.set_ext_id(mvitem["media"], mvitem["video"])
                if untrack_id:
                    mvitem["untrack_id"] = untrack_id
                if sid:
                    mvitem["sid"] = sid
                if self.check_url(mvitem):
                    items.append(mvitem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items
Exemplo n.º 15
0
 def media_info_extract(response, mediaItem):
     """Extract media metadata from a page fragment into ``mediaItem``.

     ``response`` is a (sub-)selector; all xpaths are relative to it.  The
     function recognizes several page fragments (an embedded
     ``text/javascript`` block, list items, the media pages of different
     channels) and fills whichever fields it finds, in place.  Fields that
     are not present are simply left unset.  Errors are logged and
     swallowed.
     """
     try:
         if mediaItem == None:
             mediaItem = MediaItem()
         # Play page: metadata embedded in a <script type="text/javascript">.
         class_names = response.xpath('./@type').extract()
         if class_names and 'text/javascript' == class_names[0]:
             # Media-page title inside the script block.
             titles = response.re('title[ ]?:[ ]?\"(.*)\"')
             if titles:
                 mediaItem['title'] = titles[0]
             # Play-page fields inside the script block.
             pids = response.re('pid[ ]?:[ ]?(\d+)')
             totals = response.re('totalcount[ ]?:[ ]?(\d+)')
             trylooks = response.re('trylook[ ]?:[ ]?(\d*)')
             pPics = response.re('pPic[ ]?:[ ]?\"(.*)\"')
             if pids:
                 mediaItem['cont_id'] = pids[0]
             if totals:
                 mediaItem['vcount'] = totals[0]
             """
             if trylooks:
                 trylook = str(trylooks[0])
                 if trylook == '0':
                     mediaItem['paid'] = 0
                 else:
                     mediaItem['paid'] = 1
             """
             if pPics:
                 poster_url = pPics[0]
                 mediaItem['poster_url'] = poster_url
             # NOTE(review): raises KeyError (caught by the outer except)
             # when 'channel_id' is not set on the item — confirm callers
             # always populate it first.
             if u'电影' == mediaItem['channel_id']:
                 # Movie channel: extract the duration; h:m:s strings are
                 # normalized to whole minutes, m:s strings keep the minute
                 # component.
                 durations = response.re('duration[ ]?:[ ]?\"(.*)\"')
                 if durations:
                     duration = durations[0]
                     durations = duration.split(':')
                     length = len(durations)
                     if length == 3:
                         duration = int(durations[0]) * 60 + int(
                             durations[1])
                         duration = str(duration)
                     elif length == 2:
                         duration = durations[0]
                     mediaItem['duration'] = duration
         # Media page: latest-episode marker (digits only are kept).
         results = response.xpath('.//span[@class="s-t"]/text()').extract()
         if results:
             latests = re.findall(r'[\d]+', results[0])
             if latests:
                 latest = ''.join(latests)
                 mediaItem['latest'] = latest
         # Media page layout for variety shows / anime / TV series.
         sels = response.xpath('.//dd[@data-statectn="n_textInfo"]')
         if sels:
             # p1 holds hosts for variety shows, directors otherwise.
             results = sels.xpath('.//p[@class="p1"]//a/text()').extract()
             if results:
                 if u'综艺' == mediaItem['channel_id']:
                     mediaItem['actor'] = Util.join_list_safely(results)
                 else:
                     mediaItem['director'] = Util.join_list_safely(results)
             results = sels.xpath('.//p[@class="p2"]//a/text()').extract()
             if results:
                 if u'综艺' != mediaItem['channel_id']:
                     mediaItem['actor'] = Util.join_list_safely(results)
             results = sels.xpath('.//p[@class="p3"]//a/text()').extract()
             if results:
                 mediaItem['district'] = Util.join_list_safely(results)
             # Release date only if not already supplied by the caller.
             if 'release_date' not in mediaItem:
                 results = sels.xpath(
                     './/p[@class="p4"]//a/text()').extract()
                 if results:
                     release_date = results[0]
                     release_date = Util.str2date(release_date)
                     mediaItem['release_date'] = release_date
             results = sels.xpath('.//p[@class="p5"]//a/text()').extract()
             if results:
                 mediaItem['type'] = Util.join_list_safely(results)
             results = sels.xpath('.//p[@class="p7"]/text()').extract()
             if results:
                 intro = results[0].strip()
                 mediaItem['intro'] = intro
         else:
             # Media page layout for movies.
             sels = response.xpath('.//dd[@data-statectn="n_w150_dd"]')
             if sels:
                 results = sels.xpath(
                     './/p[@class="p2"]//a/text()').extract()
                 if results:
                     mediaItem['director'] = Util.join_list_safely(results)
                 results = sels.xpath(
                     './/p[@class="p3"]//a/text()').extract()
                 if results:
                     mediaItem['actor'] = Util.join_list_safely(results)
                 results = sels.xpath(
                     './/span[@class="s4"]//a/text()').extract()
                 if results:
                     mediaItem['district'] = Util.join_list_safely(results)
                 results = sels.xpath(
                     './/span[@class="s5"]//a/text()').extract()
                 if results:
                     # s5 mixes the release date in with the genre tags:
                     # pull out the first date-like entry, the rest is type.
                     for item in results:
                         if re.match(r"^[\d-]+$", item):
                             if 'release_date' not in mediaItem:
                                 release_date = Util.str2date(item)
                                 mediaItem['release_date'] = release_date
                             results.remove(item)
                             break
                     mediaItem['type'] = Util.join_list_safely(results)
                 results = sels.xpath('.//p[@class="p6"]/text()').extract()
                 if results:
                     intro = results[0].strip()
                     mediaItem['intro'] = intro
     except Exception, e:
         logging.log(logging.ERROR, traceback.format_exc())
Exemplo n.º 16
0
    def parse_media_info(self, response):
        """Scrape a 360kan detail page into a plain dict of media fields.

        Returns:
            dict: whichever of title/poster_url/cont_id/actor/director/
            district/type/release_date/intro/vcount/latest/score could be
            extracted, or None on unexpected errors.
        """
        try:
            result = {}

            # Details container.
            details = response.xpath('//div[@class="v-details"]')
            title = details.xpath(
                './div[@class="v-title clearfix"]/span[@id="film_name"]/text()'
            ).extract()
            poster = details.xpath(
                './div[@class="v-poster"]/descendant-or-self::*/img/@src'
            ).extract()

            # Base info: cast plus labelled <i>key</i>value</p> pairs that
            # are easier to grab with regexes than with xpath.
            base_info = details.xpath('./div[@class="v-main-info clearfix"]')
            actor = base_info.xpath(
                './div[1]/p[@id="actors"]/a/text()').extract()
            director = base_info.re(
                re.compile(r'<i>%s</i>(.*?)</p>' % u'导演:', re.S))
            area = base_info.re(
                re.compile(r'<i>%s</i>(.*?)</p>' % u'地区:', re.S))
            category = base_info.re(
                re.compile(r'<i>%s</i>(.*?)</p>' % u'类型:', re.S))
            year = base_info.re(
                re.compile(r'<i>%s</i>(.*?)</p>' % u'年代:', re.S))

            # Intro: full text first, then progressively shorter variants.
            intro = base_info.xpath(
                './p[@class="intro"]/span[@id="full-intro"]/text()').extract()
            if not intro:
                intro = base_info.xpath(
                    './p[@class="intro"]/span[@id="part-intro"]/text()'
                ).extract()
            if not intro:
                intro = base_info.xpath(
                    './p[@class="intro"]/span[@class="text"]/text()').extract(
                    )
            episode_num = base_info.xpath(
                './p[@class="episode clearfix"]/text()').extract()
            # The score is split into integer and decimal spans on the page.
            score_int = base_info.xpath(
                './div[@class="aggregate-rating"]/div[1]/p/span/em/text()'
            ).extract()
            score_dec = base_info.xpath(
                './div[@class="aggregate-rating"]/div[1]/p/span/text()'
            ).extract()

            # (removed: an unused lookup of div[@id="left_info"] and dead
            # commented-out movieid extraction)

            if title:
                result['title'] = title[0]
            if poster:
                result['poster_url'] = poster[0]
            result['cont_id'] = get_cluster_id(response.request.url)
            if actor:
                result['actor'] = V360Formatter.join(actor)
            if director:
                result['director'] = V360Formatter.rejoin(director[0])
            if area:
                result['district'] = V360Formatter.rejoin(area[0])
            if category:
                result['type'] = V360Formatter.rejoin(category[0])
            if year:
                result['release_date'] = Util.str2date(year[0])
            if intro:
                result['intro'] = ''.join(intro)
            if episode_num:
                # Episode text may yield a total count and/or latest marker.
                vc = V360Formatter.episode_num(episode_num[0].strip())
                if 'vcount' in vc:
                    result['vcount'] = vc['vcount']
                if 'latest' in vc:
                    result['latest'] = re.sub(r"[^\d]", "", vc['latest'])
            if score_int and score_dec:
                result['score'] = V360Formatter.score(score_int[0],
                                                      score_dec[0])
            elif score_int:
                result['score'] = V360Formatter.score(score_int[0], None)

            return result

        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
Exemplo n.º 17
0
    def parse_media_info(self, response):
        """Scrape a 360kan film page into a dict of media fields.

        Also queries the 360kan mobile "coverpage" API for the genre list,
        best-effort: any failure there is ignored and the genre is skipped.

        Returns:
            dict: extracted fields, or None on unexpected errors.
        """
        try:
            result = {}
            # Main container.
            main = response.xpath('//div[@id="main"]')

            # Base info.
            base_info = main.xpath('./div[1]/div[2]')
            title = base_info.xpath(
                './div[1]/h1[@id="film_name"]/text()').extract()
            director = base_info.xpath(
                './div[1]/dl[@id="director"]/dd/a[not(@id)]/text()').extract()
            actor = base_info.xpath(
                './div[1]/dl[@id="actor"]/dd/a/text()').extract()
            # Genre comes from the API call below; renamed from "type" so we
            # no longer shadow the builtin.
            type_info = None
            area = base_info.xpath(
                './div[1]/div[@class="text"]/dl[@class="area"]/dd/text()'
            ).extract()
            year = base_info.xpath(
                './div[1]/div[@class="text"]/dl[@class="year"]/dd/text()'
            ).extract()
            duration = base_info.xpath(
                './div[1]/div[@class="text"]/dl[@class="duration"]/dd/text()'
            ).extract()
            # The score is split into integer and decimal spans on the page.
            score_int = base_info.xpath(
                './div[1]/div[@class="aggregate-rating"]/div[1]/p/span/em/text()'
            ).extract()
            score_dec = base_info.xpath(
                './div[1]/div[@class="aggregate-rating"]/div[1]/p/span/text()'
            ).extract()

            # Side info: poster image and pay marker.
            side = main.xpath('./div[1]/div[1]')
            poster = side.xpath(
                './div/descendant-or-self::*/img/@src').extract()
            pay = side.xpath('./div/a/em').extract()

            # Intro.
            desc = main.xpath('./div[2]/div[2]')
            intro = desc.xpath('//p[@class="more"]/text()').extract()
            if not intro:
                intro = desc.xpath('//p[@class="less"]/text()').extract()

            # Best-effort genre lookup via the mobile API; the payload has a
            # 32-byte prefix before the JSON body.
            try:
                contid = get_cluster_id(response.request.url)
                tag_url = "http://android.api.360kan.com/coverpage/?id=" + contid + "&cat=1&method=coverpage.data&refm=selffull&ss=4&token=2bf65a903d03167e48d38694f8aa4f1a&ver=71&ch=360sjzs"
                taginfo = self.httpdownload.get_data(tag_url)
                if taginfo and len(taginfo) > 32:
                    subtaginfo = taginfo[32:]
                    jtaginfo = json.loads(subtaginfo)
                    if "data" in jtaginfo and "data" in jtaginfo[
                            "data"] and "type" in jtaginfo["data"]["data"]:
                        type_info = jtaginfo["data"]["data"]["type"]
            except Exception as e:
                # Deliberate best-effort: genre is optional metadata.
                pass

            if title:
                result['title'] = title[0]
            if director:
                result['director'] = V360Formatter.join(director)
            if actor:
                result['actor'] = V360Formatter.join(actor)
            if type_info:
                result['type'] = V360Formatter.join(type_info)
            if area:
                result['district'] = V360Formatter.rejoin(area[0])
            if year:
                result['release_date'] = Util.str2date(year[0])
            if duration:
                result['duration'] = V360Formatter.duration(duration[0])
            if score_int and score_dec:
                result['score'] = V360Formatter.score(score_int[0],
                                                      score_dec[0])
            elif score_int:
                result['score'] = V360Formatter.score(score_int[0], None)
            if poster:
                result['poster_url'] = poster[0]
            result['cont_id'] = get_cluster_id(response.request.url)
            if pay:
                result['paid'] = 1
            if intro:
                result['intro'] = ''.join(intro)

            return result

        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
Exemplo n.º 18
0
    def parse_episode_info(self, response):
        """Parse a Wasu media page into MediaVideoItem objects.

        Expects ``cat_id`` and ``poster_url`` in the request meta, with
        optional ``untrack_id``/``sid``/``mid``.  Each field is probed in
        the "right_fl" layout first, then in the "tele_txts" fallback.

        Returns:
            list: at most one MediaVideoItem; empty on error.
        """
        items = []
        try:
            request_url = response.request.url
            logging.log(logging.INFO, 'parse_episode_info: %s' % request_url)
            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']
            untrack_id = ""
            sid = ""
            mid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']
            if "mid" in response.request.meta:
                mid = response.request.meta['mid']

            # NOTE: these attributes could also come from the vod API
            # (http://clientapi.wasu.cn/Phone/vodinfo/id/<id>); scraping is
            # kept here to avoid restructuring the original flow.
            title_list = response.xpath(
                '//div[@class="cloudotm1"]/p[1]/a/text()').extract()
            if not title_list:
                title_list = response.xpath(
                    '//div[@class="tele_txts"]/h4[1]/a/text()').extract()

            director_list = response.xpath(
                '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()' %
                u'导演').extract()
            if not director_list:
                director_list = response.xpath(
                    '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()'
                    % u'导演').extract()
            performer_list = response.xpath(
                '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()' %
                u'演员').extract()
            if not performer_list:
                performer_list = response.xpath(
                    '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()'
                    % u'演员').extract()
            area_list = response.xpath(
                '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()' %
                u'地区').extract()
            if not area_list:
                area_list = response.xpath(
                    '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()'
                    % u'地区').extract()
            # Tags: the page labels them either "标签" or "类型".
            tag_list = response.xpath(
                '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()' %
                u'标签').extract()
            if not tag_list:
                tag_list = response.xpath(
                    '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()'
                    % u'类型').extract()
            if not tag_list:
                tag_list = response.xpath(
                    '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()'
                    % u'标签').extract()
            if not tag_list:
                tag_list = response.xpath(
                    '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()'
                    % u'类型').extract()
            year_list = response.xpath(
                '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()' %
                u'年份').extract()
            if not year_list:
                year_list = response.xpath(
                    '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()'
                    % u'年份').extract()
            pers = Util.join_list_safely(performer_list)
            dirs = Util.join_list_safely(director_list)
            areas = Util.join_list_safely(area_list)
            tags = Util.join_list_safely(tag_list)

            # Intro text.
            text = response.xpath(
                '//div[@class="right_fl"]/p/span[@id="infoS"]/text()').extract(
                )
            if not text:
                # BUG FIX: the original condition was inverted ("if text:"),
                # which clobbered a found intro and never consulted this
                # fallback when the first xpath came up empty.
                text = response.xpath(
                    '//div[@class="tele_b_otm"]/p/span[@id="infoS"]/text()'
                ).extract()

            mvitem = self.compose_mvitem(response, title_list, pers, dirs,
                                         response.request.url, cat_id,
                                         poster_url, text)
            if mid:
                mvitem['mid'] = mid

            # Only keep items whose first video actually has a playable url.
            if mvitem and 'video' in mvitem and 'url' in mvitem['video'][
                    0] and mvitem['video'][0]['url']:
                mvitem['media']['type'] = tags
                mvitem['media']['district'] = areas
                if year_list:
                    mvitem['media']['release_date'] = Util.str2date(
                        year_list[0])
                tlen = len(mvitem['video'])
                logging.log(
                    logging.INFO, "++++url: %s video len: %d " %
                    (response.request.url, tlen))
                items.append(mvitem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items
Exemplo n.º 19
0
    def media_info_extract(response, mediaItem):
        """Extract kankan media metadata (score, poster, credits, intro,
        cont_id, release date, episode counters) into ``mediaItem``.

        Tries several page layouts in sequence: list pages, normal play
        pages, VIP play pages and media (album) pages.  Every section is
        best-effort — missing nodes are silently skipped, and any unexpected
        error is logged without propagating.

        :param response: scrapy Response/Selector for the page being parsed
        :param mediaItem: MediaItem to fill; a fresh one is created when None
        """
        try:
            # `is None` instead of `== None`: identity check is the correct
            # (and PEP 8) way to test for None.
            if mediaItem is None:
                mediaItem = MediaItem()
            # list page
            sels = response.xpath('.//p[@class="movielist_tt"]')
            if sels:
                scores = sels.xpath('.//em[@class="score"]/text()').extract()
                latests = sels.xpath('.//em[@class="update"]/text()').extract()
                if scores:
                    mediaItem['score'] = scores[0]
                if latests:
                    # keep only the digits of e.g. "更新至12集"
                    latests = re.findall(r'[\d]+', latests[0])
                    if latests:
                        mediaItem['latest'] = latests[0]
            sels = response.xpath('.//a[@class="pic"]')
            if sels:
                # lazy-loaded poster lives in the _src attribute
                poster_urls = sels.xpath('./img/@_src').extract()
                if poster_urls:
                    mediaItem['poster_url'] = poster_urls[0]

            # play page - normal layout
            sels = response.xpath('..//ul[@class="movieinfo"]/li')
            for sel in sels:
                labels = sel.xpath('./text()').extract()
                infos = sel.xpath('./*/text()').extract()
                kankan_extract.text_infos_resolve(labels, infos, mediaItem)
            intros = response.xpath(
                '..//p[@id="movie_info_intro_l"]/text()').extract()
            if intros:
                mediaItem['intro'] = intros[0]

            # play page - VIP layout
            # (1) http://vip.kankan.com/vod/88169.html?fref=kk_search_sort_01#7927921
            sels = response.xpath('..//div[@class="intro"]')
            if sels:
                url_sels = sels.xpath('.//dt/a/@href')
                if url_sels:
                    # canonical media page url embedded in the intro link
                    regex_express = '(http://movie\.kankan\.com/movie/[\d]+).*'
                    match_result = url_sels.re(regex_express)
                    if match_result:
                        mediaItem['url'] = match_result[0]
                sels = sels.xpath('.//dd')
                for sel in sels:
                    labels = sel.xpath('./text()').extract()
                    infos = sel.xpath('./a/text()').extract()
                    kankan_extract.text_infos_resolve(labels, infos, mediaItem)
                intros = sels.xpath('./dd[@class="intro_p"]/p').extract()
                mediaItem['intro'] = ''.join(intros)
            # (2) http://vip.kankan.com/vod/88365.html#7306075
            sels = response.xpath('..//div[@class="movie_info"]')
            if sels:
                url_sels = sels.xpath('.//dd')
                for sel in url_sels:
                    labels = sel.xpath('./span/text()').extract()
                    infos = sel.xpath('./span/a/text()').extract()
                    if not labels:
                        labels = sel.xpath('./text()').extract()
                        infos = sel.xpath('./a/text()').extract()
                    kankan_extract.text_infos_resolve(labels, infos, mediaItem)
                intros = sels.xpath('./dd/p[@class="intro_p"]').extract()
                mediaItem['intro'] = ''.join(intros)

            # media (album) page: metadata embedded in inline javascript
            sels = response.xpath('//head//script')
            if sels:
                regex_express = 'movieInfo\.movieid[ ]?=[ ]?(\d+)'
                match_result = sels.re(regex_express)
                if match_result:
                    mediaItem['cont_id'] = match_result[0]
                regex_express = 'movieInfo\.movie_title[ ]?=[ ]?\'(.*)\''
                match_result = sels.re(regex_express)
                if match_result:
                    mediaItem['title'] = match_result[0]
                regex_express = 'movieInfo\.poster[ ]?=[ ]?\'(.*)\''
                match_result = sels.re(regex_express)
                if match_result:
                    mediaItem['poster_url'] = match_result[0]
                regex_express = 'movieInfo\.movie_classify[ ]?=[ ]?(\{.*\})'
                match_result = sels.re(regex_express)
                if match_result:
                    content = match_result[0]
                    json_data = json.loads(content)
                    release_date = json_data[
                        'year'] if 'year' in json_data else ''
                    release_dates = re.findall(r'[\d]+', str(release_date))
                    release_date = ''.join(release_dates)
                    if release_date:
                        release_date = Util.str2date(release_date)
                        mediaItem['release_date'] = release_date
                regex_express = 'movieInfo\.episode[ ]?=[ ]?\'(.*)\''
                match_result = sels.re(regex_express)
                if match_result:
                    latests = match_result[0]
                    latests = latests.split('/')
                    # "total N episodes" / "updated to episode N" — the part
                    # after the slash (when present) carries the latest count
                    if len(latests) > 1:
                        latests = latests[1]
                    else:
                        latests = latests[0]
                    latests = re.findall(r'[\d]+', latests)
                    mediaItem['latest'] = ''.join(latests)
                regex_express = 'movieInfo\.total_number[ ]?=[ ]?(\d+)'
                match_result = sels.re(regex_express)
                if match_result:
                    if int(match_result[0]) > 0:
                        mediaItem['vcount'] = match_result[0]
            sels = response.xpath('..//div[@class="info_list"]//li')
            for sel in sels:
                labels = sel.xpath('./text()').extract()
                if not labels:
                    labels = sel.xpath('./em/text()').extract()
                infos = sel.xpath('./a/text()').extract()
                if not infos:
                    infos = sel.xpath('./span/text()').extract()
                kankan_extract.text_infos_resolve(labels, infos, mediaItem)
            sels = response.xpath('..//ul[@class="detail_ul"]//li')
            for sel in sels:
                labels = sel.xpath('./text()').extract()
                infos = sel.xpath('./*/text()').extract()
                kankan_extract.text_infos_resolve(labels, infos, mediaItem)
        # `except ... as` is valid on Python 2.6+ and required on Python 3;
        # sibling methods in this file already use this form.
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())
Exemplo n.º 20
0
    def parse_media_info(self, response):
        """Scrape title/credits/area/type/year/intro/poster from a 360
        movie info page.

        :returns: dict with whichever fields were found; None when an
                  unexpected error occurs (the traceback is logged).
        """
        try:
            result = {}
            base_info = response.xpath('//div[@id="info"]')

            def labelled(label):
                # the info box renders fields as <em>label</em> ... <span>value</span>
                pattern = re.compile(
                    r'<em.*?>%s</em>.*?<span.*?>(.*?)</span>' % label, re.S)
                return base_info.re(pattern)

            title = base_info.xpath(
                './div[1]/h1[@id="film_name"]/text()').extract()
            actor = labelled(u'主角:')
            director = labelled(u'导演:')
            area = labelled(u'地区:')
            category = labelled(u'类型:')
            year = labelled(u'年代:')
            episode_num = labelled(u'剧集:')

            # intro may live in one of three markup variants; first hit wins
            intro = []
            for span_path in ('span[@id="full-intro"]/span/text()',
                              'span[@id="part-intro"]/span/text()',
                              'span[@class="text"]/text()'):
                intro = base_info.xpath(
                    './div[2]/p/em[text()="%s"]/../%s' % (u'简介:', span_path)
                ).extract()
                if intro:
                    break

            poster = response.xpath(
                '//div[@id="left_info"]/div[@id="poster"]/descendant-or-self::*/img/@src'
            ).extract()

            if title:
                result['title'] = title[0]
            if actor:
                result['actor'] = V360Formatter.rejoin(actor[0])
            if director:
                result['director'] = V360Formatter.rejoin(director[0])
            if area:
                result['district'] = V360Formatter.rejoin(area[0])
            if category:
                result['type'] = V360Formatter.rejoin(category[0])
            if year:
                result['release_date'] = Util.str2date(year[0])
            if intro:
                result['intro'] = ''.join(intro)
            if episode_num:
                vc = V360Formatter.episode_num(episode_num[0].strip())
                if 'vcount' in vc:
                    result['vcount'] = vc['vcount']
                if 'latest' in vc:
                    # keep only the digits of e.g. "更新至12集"
                    result['latest'] = re.sub(r"[^\d]", "", vc['latest'])
            if poster:
                result['poster_url'] = poster[0]
            result['cont_id'] = get_cluster_id(response.request.url)

            return result

        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())
Exemplo n.º 21
0
    def parse_episode_info(self, response):
        """Parse a sohu episode/media page into MediaVideoItem items.

        Reads routing info (cat_id, poster_url and optional untrack_id /
        sid / mid) from ``response.request.meta``, resolves the page's
        playlist id from inline javascript, scrapes the media metadata and
        attaches the parsed video list.  Falls back to the API when page
        scraping yields no usable videos.

        :returns: list of populated MediaVideoItem (possibly empty)
        """
        items = []
        try:
            logging.log(logging.INFO,
                        'parse_episode_info: %s' % response.request.url)
            meta = response.request.meta
            cat_id = meta['cat_id']
            poster_url = meta['poster_url']
            # optional routing fields default to "" when absent
            untrack_id = meta.get('untrack_id', "")
            sid = meta.get('sid', "")
            mid = meta.get('mid', "")

            # The playlist id is embedded in inline javascript under several
            # historical variable names; try each pattern in turn.
            playlistId = ""
            for regex_text in (r'var playlistId.*?(\d+)',
                               r'var PLAYLIST_ID.*?(\d+)',
                               r'= playlistId.*?(\d+)'):
                playlistId_list = response.selector.re(re.compile(regex_text))
                if playlistId_list:
                    playlistId = playlistId_list[0]
                    break
            if not playlistId:
                logging.log(
                    logging.INFO,
                    "parse_episode_info error,not find playlistid,url:%s " %
                    response.request.url)
                return items

            title_list = self.parse_title(response, cat_id)
            performer_list = self.parse_actor(response)
            director_list = self.parse_director(response)
            district_list = self.parse_district(response)
            type_list = self.parse_type_list(response)
            year_list = self.parse_year(response)
            year = year_list[0] if year_list else None
            pers = Util.join_list_safely(performer_list)
            dirs = Util.join_list_safely(director_list)
            types = Util.join_list_safely(type_list)
            district = Util.join_list_safely(district_list)

            # intro text
            text = response.xpath(
                '//div[@class="movieCont mod"]/p[1]/span[@class="full_intro"]/text()'
            ).extract()

            ep_item = MediaItem()
            if title_list:
                ep_item["title"] = title_list[0]
            ep_item["actor"] = pers
            ep_item["director"] = dirs
            if types:
                ep_item["type"] = types
            if district:
                ep_item["district"] = district
            if year:
                ep_item["release_date"] = Util.str2date(year)

            ep_item["site_id"] = self.site_id
            ep_item["channel_id"] = cat_id
            ep_item["poster_url"] = poster_url
            ep_item["url"] = Util.normalize_url(response.request.url, "sohu")
            playlistId = str(playlistId)
            ep_item["cont_id"] = playlistId

            if text:
                ep_item["intro"] = text[0].strip()

            mvitem = MediaVideoItem()
            if mid:
                mvitem['mid'] = mid
            if untrack_id and sid:
                mvitem["untrack_id"] = untrack_id
                mvitem["sid"] = sid
            mvitem["media"] = ep_item

            ttvitem = []
            if title_list:
                ttvitem = self.parse_video_item(cat_id, playlistId)
            if ttvitem:
                mvitem['video'] = ttvitem
                mvitem["media"]["info_id"] = Util.md5hash(
                    Util.summarize(mvitem["media"]))
                Util.set_ext_id(mvitem["media"], mvitem["video"])
                if self.check_url(mvitem):
                    items.append(mvitem)
            # page scraping found nothing usable: fall back to the API
            if not items and playlistId:
                items += self.api_episode_info(mvitem,
                                               playlistId,
                                               cat_id=cat_id)
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())
        return items
Exemplo n.º 22
0
    def media_info_extract(response, mediaItem):
        """Extract media metadata (credits, intro, title, episode counters,
        cont_id, release date, poster) into ``mediaItem``.

        Handles album pages, inline-javascript metadata blocks, movie play
        pages and list pages in turn.  Every section is best-effort; any
        unexpected error is logged without propagating.

        :param response: scrapy Response/Selector for the page being parsed
        :param mediaItem: MediaItem to fill; a fresh one is created when None
        """
        try:
            # `is None` instead of `== None`: identity check is the correct
            # (and PEP 8) way to test for None.
            if mediaItem is None:
                mediaItem = MediaItem()

            intro_regex = re.compile('(%s.*%s.*)' % (u'简', u'介'))

            def assign_credit(label, infos, fields):
                # store infos under the first field whose label pattern
                # matches; order matters, first match wins
                for regex, field in fields:
                    if regex.search(label):
                        mediaItem[field] = Util.join_list_safely(infos)
                        return

            # label pattern -> MediaItem field (album pages also carry a
            # "host" label, mapped to actor)
            album_fields = [
                (re.compile('(%s.*%s.*)' % (u'导', u'演')), 'director'),
                (re.compile('(%s.*%s.*)' % (u'主', u'演')), 'actor'),
                (re.compile('(%s.*%s.*%s.*)' % (u'主', u'持', u'人')), 'actor'),
                (re.compile('(%s.*%s.*)' % (u'类', u'型')), 'type'),
                (re.compile('(%s.*%s.*)' % (u'地', u'区')), 'district'),
            ]
            play_fields = [
                (re.compile('(%s.*%s.*)' % (u'导', u'演')), 'director'),
                (re.compile('(%s.*%s.*)' % (u'主', u'演')), 'actor'),
                (re.compile('(%s.*%s.*)' % (u'类', u'型')), 'type'),
                (re.compile('(%s.*%s.*)' % (u'地', u'区')), 'district'),
            ]

            # media (album) page
            for sel in response.xpath('.//dt[@class="album-info"]//p'):
                labels = sel.xpath('./em[@class="label"]/text()').extract()
                if not labels:
                    continue
                label = labels[0]
                if intro_regex.search(label):
                    infos = sel.xpath('./span/text()').extract()
                    if infos:
                        mediaItem['intro'] = infos[0]
                else:
                    infos = sel.xpath('./a/text()').extract()
                    if infos:
                        assign_credit(label, infos, album_fields)

            sels = response.xpath('.//div[@class="mod-album-1-intro-til"]')
            if sels:
                titles = sels.xpath('.//span[@class="dd-pic-til"]/text()').extract()
                vcounts = sels.xpath('.//span[@class="update-info-series"]//em/text()').extract()
                if titles:
                    mediaItem['title'] = titles[0]
                if vcounts:
                    mediaItem['vcount'] = int(vcounts[0])

            # media (album) page: metadata embedded in inline javascript
            class_names = response.xpath('./@language').extract()
            if class_names and 'javascript' == class_names[0]:
                cids = response.re('cid[ ]?:[ ]?(\d+)')
                is_fulls = response.re('\"isfull\"[ ]?:[ ]?(\d+)')
                latest = mediaItem['latest'] if 'latest' in mediaItem else None
                if cids:
                    mediaItem['cont_id'] = cids[0]
                if is_fulls:
                    if is_fulls[0] == '0':
                        latest = '0'
                    else:
                        latests = response.re('\"lastseries\"[ ]?:[ ]?\"([\d-]+)\"')
                        if latests:
                            # keep only alphanumerics, e.g. "2014-05-01" -> "20140501"
                            # (equivalent to the py2 filter(str.isalnum, ...))
                            latest = ''.join(
                                ch for ch in str(latests[0]) if ch.isalnum())
                if latest is not None:
                    # guard: don't write a bare None into the item
                    mediaItem['latest'] = latest

            # play page (movie)
            for sel in response.xpath('..//div[@class="play-xxmes clearfix"]//p'):
                labels = sel.xpath('./span[@class="px-l"]/text()').extract()
                if not labels:
                    continue
                label = labels[0]
                if intro_regex.search(label):
                    infos = sel.xpath('./span[@class="px-r"]/text()').extract()
                    if infos:
                        mediaItem['intro'] = infos[0]
                else:
                    infos = sel.xpath('./span[@class="px-r"]/a/text()').extract()
                    if infos:
                        assign_credit(label, infos, play_fields)

            # play page: metadata embedded in inline javascript
            class_names = response.xpath('./@type').extract()
            if class_names and 'text/javascript' == class_names[0]:
                titles = response.re('title[ ]?:[ ]?\"(.*)\"')
                cids = response.re('cid[ ]?:[ ]?(\d+)')
                release_dates = response.re('release_date[ ]?:[ ]?\"([\d-]+)\"')
                if titles:
                    mediaItem['title'] = titles[0]
                if cids:
                    mediaItem['cont_id'] = cids[0]
                if release_dates:
                    mediaItem['release_date'] = Util.str2date(release_dates[0])

            # list page
            poster_urls = response.xpath('.//img[@class="lazy"]/@data-original').extract()
            members = response.xpath('.//span[@class="member-ico"]').extract()
            if poster_urls:
                mediaItem['poster_url'] = poster_urls[0]
            if members:
                # a member/VIP badge marks paid content
                mediaItem['paid'] = '1'
        # `except ... as` / bare form is valid on Python 2.6+ and required on
        # Python 3; sibling methods in this file already use this form.
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())