def api_parse(self, mediaVideoItem):
    """Enrich a MediaVideoItem via the album API and paginate its videos.

    Expects mediaVideoItem['media'] to carry 'cont_id'. Pages through
    self.other_album_api until an empty page is returned, then attaches
    the collected videos. Returns a list containing the enriched item,
    or an empty list on failure.
    """
    items = []
    try:
        mediaItem = mediaVideoItem['media']
        logging.log(logging.INFO, 'api parse pid: %s' % mediaItem['cont_id'])
        self.api_media_info(mediaItem)
        if 'title' in mediaItem:
            videoItems = []
            pagenum = 1
            # Fetch episode pages until the API returns an empty page.
            while True:
                videos_url = self.other_album_api % (mediaItem['cont_id'], pagenum)
                result = Util.get_url_content(videos_url)
                page_items = self.other_album_resolve(
                    text=result, meta={'url': videos_url, 'pagenum': pagenum})
                if not page_items:
                    break
                videoItems = videoItems + page_items
                pagenum = pagenum + 1
            if videoItems:
                # Fall back to the first episode url for the media page url.
                if 'url' not in mediaItem:
                    mediaItem['url'] = videoItems[0]['url']
                Util.set_ext_id(mediaItem, videoItems)
                self.set_media_info(mediaItem)
                mediaVideoItem['media'] = mediaItem
                mediaVideoItem['video'] = videoItems
                items.append(mediaVideoItem)
    # was `except Exception, e:` (Python 2-only syntax, `e` unused)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
    # BUG FIX: items was built but never returned.
    return items
def api_episode_info(self, mvItem=None, playlistId='', cat_id=''):
    """Fill missing media fields from the album API and attach videos.

    Caller must guarantee mvItem and playlistId are non-empty and that
    the media dict carries channel_id/site_id (plus mid, or sid with
    untrack_id). Returns a one-element list with the enriched item when
    validation succeeds, otherwise an empty list.
    """
    results = []
    try:
        wrapper = mvItem
        media = wrapper["media"]
        info_url = self.album_api % (playlistId, 1)
        logging.log(logging.INFO, 'api_episode_info, info url %s' % info_url)
        raw = self.httpdownload.get_data(info_url)
        raw = raw.decode('gbk').encode('utf-8')
        payload = json.loads(raw)
        # Only fill fields that the caller has not already supplied.
        actors = payload.get("mainActors")
        if "actor" not in media and actors:
            media["actor"] = Util.join_list_safely(actors)
        directors = payload.get("directors")
        if "director" not in media and directors:
            media["director"] = Util.join_list_safely(directors)
        categories = payload.get("categories")
        if "type" not in media and categories:
            media["type"] = Util.join_list_safely(categories)
        if "title" not in media:
            media["title"] = payload.get("albumName")
        if "district" not in media:
            media["district"] = payload.get("area")
        if "release_date" not in media and payload.get("publishYear"):
            media["release_date"] = Util.str2date(str(payload.get("publishYear")))
        if "intro" not in media:
            media["intro"] = payload.get("albumDesc")
        # Replace a missing or blank poster url.
        if "poster_url" not in media or not str.strip(str(media["poster_url"])):
            media["poster_url"] = payload.get("pic240_330")
        if "cont_id" not in media:
            media["cont_id"] = playlistId
        episode_list = []
        if media['title']:
            wrapper['media'] = media
            episode_list = self.parse_video_item(cat_id, playlistId)
        if episode_list:
            wrapper['video'] = episode_list
            if "url" not in wrapper["media"]:
                wrapper["media"]["url"] = episode_list[0]['url']
            wrapper["media"]["info_id"] = Util.md5hash(
                Util.summarize(wrapper["media"]))
            Util.set_ext_id(wrapper["media"], wrapper["video"])
            if self.check_url(wrapper):
                results.append(wrapper)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return results
def api_media_info(self, mediaVideoItem, vid, prefix_video_url):
    """Populate media fields from the detail API, then fetch the video list.

    Expects mediaVideoItem['media'] (if present) to already carry a
    site-side 'channel_id' name — TODO confirm against callers; it is
    read before being remapped through self.channels_name_id.
    Errors are logged (with vid) and swallowed.
    """
    mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
    try:
        miu = self.media_info_url % vid
        jdata = self.httpdownload.get_data(miu)
        # was `if not jdata: pass else:` — inverted for clarity
        if jdata:
            ddata = json.loads(jdata)
            # NOTE: assert feeds the except below (and is stripped under -O);
            # message kept byte-identical.
            assert int(ddata.get('code', 202)) == 200, "接口获取媒体信息失败"
            detail = ddata.get('data').get('detail')
            # was `type(detail) == dict`
            assert isinstance(detail, dict)
            mediaItem['cont_id'] = str(detail.get('collectionId'))
            mediaItem['title'] = detail.get('collectionName')
            mediaItem['director'] = Util.join_list_safely(
                detail.get('director').split('/'))
            mediaItem['actor'] = Util.join_list_safely(
                detail.get('player').split('/'))
            mediaItem['release_date'] = Util.str2date(detail.get('publishTime'))
            mediaItem['vcount'] = int(detail.get('totalvideocount'))
            latest = detail.get('lastseries')
            # raw string fixes the invalid-escape ambiguity of '\D*(\d+)\D*'
            m = re.compile(r'\D*(\d+)\D*').match(latest)
            if m:
                mediaItem['latest'] = m.group(1)
            if mediaItem['vcount'] == 1:
                mediaItem['latest'] = 1
            mediaItem['paid'] = detail.get('isvip')
            mediaItem['intro'] = detail.get('desc')
            mediaItem['poster_url'] = detail.get('image')
            mediaItem['site_id'] = self.site_id
            mediaItem['channel_id'] = self.channels_name_id[mediaItem['channel_id']]
            mediaItem['info_id'] = Util.md5hash(Util.summarize(mediaItem))
            vcount = mediaItem['vcount']
            if not vcount:
                vcount = 1
            else:
                vcount = int(vcount)
            video_list = self.api_video_list(
                vid, vcount, prefix_video_url, mediaItem['channel_id'])
            if video_list:
                Util.set_ext_id(mediaItem, video_list)
                mediaVideoItem['video'] = video_list
                mediaVideoItem['media'] = mediaItem
    # was `except Exception, e:` (Python 2-only syntax, `e` unused)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.ERROR, vid)
def compose_mvitem(self, response, title_list, pers, dirs, play_url, cat_id,
                   poster_url, text):
    """Build a MediaVideoItem from a parsed media page.

    cat_id/poster_url are re-read from response.request.meta (the
    positional arguments are shadowed — kept for interface
    compatibility). Returns the item, or None when check_url rejects
    it or an error occurs.
    """
    # BUG FIX: mvitem was unbound if an exception fired before its
    # assignment, making the final `return mvitem` raise NameError.
    mvitem = None
    try:
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        videoitems = []
        ep_item = MediaItem()
        if title_list:
            ep_item["title"] = title_list[0].strip()
        ep_item["actor"] = pers
        ep_item["director"] = dirs
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "wasu")
        if len(text) > 0:
            ep_item["intro"] = text[0].strip()
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        mid = self.getshowid(response.request.url)
        mvitem["media"]["cont_id"] = mid
        ttvitem = {}
        if title_list:
            ttvitem = self.parse_video_item(response, cat_id, play_url,
                                            title_list, None)
        if ttvitem:
            if 'video' in ttvitem and len(ttvitem['video']) > 0:
                mvitem['video'] = ttvitem['video']
                mvitem["media"]["info_id"] = Util.md5hash(
                    Util.summarize(mvitem["media"]))
                Util.set_ext_id(mvitem["media"], mvitem["video"])
                if untrack_id and sid:
                    mvitem["untrack_id"] = untrack_id
                    mvitem["sid"] = sid
                res = self.check_url(mvitem)
                if not res:
                    return None
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return mvitem
def album_json_parse(self, response):
    """Resolve per-year episode JSON for an album page.

    Extracts the year array embedded in the response body, fetches one
    json.<year>.js per year, and attaches the resolved videos. Returns
    a list with the enriched item (empty on redirect/failure).
    """
    items = []
    # BUG FIX: pre-bind so the trailing log line cannot raise NameError
    # when the exception fires before request_url is assigned.
    request_url = ''
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'json url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] \
            if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] \
            if 'media' in mediaVideoItem else None
        url = response.request.meta['url'] \
            if 'url' in response.request.meta else None
        if url != request_url:
            # Redirected — the album no longer exists.
            return items
        # raw string for the regex (was '(\[.*\])')
        year_regex = re.compile(r'(\[.*\])')
        match_results = year_regex.search(response.body)
        if match_results:
            videoItems = []
            year_content = match_results.groups()[0]
            years = json.loads(year_content)
            for year in years:
                video_url = mediaItem['url'] + '/s/json.%s.js' % year
                result = Util.get_url_content(video_url)
                videoItems = videoItems + self.album_tag_json_resolve(
                    text=result, meta={'url': video_url})
            if videoItems:
                Util.set_ext_id(mediaItem, videoItems)
                # Visit the media page for the remaining metadata.
                result = Util.get_url_content(mediaItem['url'])
                if result:
                    mediaItem = self.media_resolve(
                        text=result,
                        meta={'item': mediaItem, 'url': mediaItem['url']})
                self.set_media_info(mediaItem)
                mediaVideoItem['media'] = mediaItem
                mediaVideoItem['video'] = videoItems
                items.append(mediaVideoItem)
    # was `except Exception, e:` (Python 2-only syntax, `e` unused)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
    logging.log(logging.INFO, 'json url: %s' % request_url)
    # BUG FIX: the success path previously fell off the end returning None,
    # while the redirect path returned items.
    return items
def parse_video_item(self, response, cat_id, url, title, playlistId):
    """Collect episode VideoItems from an episode-list media page.

    Non-movie channels walk the episode anchors; movies wrap the single
    play url. Always returns a MediaVideoItem (its 'video' list may be
    empty on failure).
    """
    episodes = []
    media = MediaItem()
    result = MediaVideoItem()
    result["media"] = media
    result["video"] = episodes
    try:
        if int(cat_id) != int(self.movie_id):
            anchors = response.xpath('//div[@class="episodes clearfix "]/a')
            if not anchors:
                anchors = response.xpath(
                    '//div[@class="episodes clearfix enc-episodes-detail"]/a')
            for anchor in anchors:
                url = anchor.xpath('./@href').extract()
                ttitle = anchor.xpath('./@title').extract()
                snum = anchor.xpath('./text()').extract()
                if not snum:
                    continue
                play_num = self.get_play_num(snum[0])
                # Variety shows derive the episode number from the url.
                if int(cat_id) == int(self.variety_id):
                    play_num = self.getvnum(self.url_prefix + url[0])
                # NOTE(review): ttitle is normalized here but never passed
                # on — compose_vitem receives the page-level title instead;
                # preserved as-is.
                if not ttitle:
                    ttitle = [play_num]
                candidate = self.compose_vitem(
                    [self.url_prefix + url[0]], title, play_num)
                if 'url' in candidate:
                    episodes.append(candidate)
        else:
            # Movie: a single video built from the play url.
            if url:
                candidate = self.compose_vitem([url], title, 1)
                if 'url' in candidate:
                    episodes.append(candidate)
        if episodes:
            result["video"] = episodes
            result["media"]["url"] = response.request.url
            Util.set_ext_id(result["media"], result["video"])
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return result
def media_parse(self, response):
    """Parse a letv media page: extract media info, then the episode list.

    The episode source depends on the channel: movies fetch one API page,
    variety shows iterate year/month tabs, dramas/anime page through the
    album API. Returns a list with the enriched item (empty on failure).
    """
    items = []
    # BUG FIX: pre-bind so the trailing log cannot raise NameError.
    request_url = ''
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'media url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] \
            if 'item' in response.request.meta else None
        mediaItem = mediaVideoItem['media'] \
            if 'media' in mediaVideoItem else MediaItem()
        sels = response.xpath('//script[@type="text/javascript"]')
        letv_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//div[@class="play"]')
        letv_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//dl[@class="textInfo"]')
        if sels:
            # drama / variety / anime layout
            letv_extract.media_info_extract(sels, mediaItem)
        else:
            # movie layout
            sels = response.xpath('//div[@class="detail"]')
            letv_extract.media_info_extract(sels, mediaItem)
        # Fetch the feature-video urls per channel type.
        videoItems = []
        if u'电影' == mediaItem['channel_id']:
            pagenum = 1
            videos_url = self.other_album_api % (mediaItem['cont_id'], pagenum)
            result = Util.get_url_content(videos_url)
            page_items = self.other_album_resolve(
                text=result, meta={'url': videos_url, 'pagenum': pagenum})
            videoItems = page_items
        elif u'综艺' == mediaItem['channel_id']:
            sels = response.xpath(
                '//div[@class="listTab"]//div[@data-statectn="n_click"]')
            if sels:
                year_month_sels = sels.xpath('.//a')
                for year_month_sel in year_month_sels:
                    years = year_month_sel.xpath('./@list-year').extract()
                    months = year_month_sel.xpath('./@list-month').extract()
                    year = None
                    month = None
                    if years:
                        year = years[0]
                    if months:
                        month = months[0]
                    if year and month:
                        videos_url = self.zongyi_album_api % (
                            year, month, mediaItem['cont_id'])
                        result = Util.get_url_content(videos_url)
                        videoItems = videoItems + self.zongyi_album_resolve(
                            text=result,
                            meta={'url': videos_url, 'year': year,
                                  'month': month})
        elif mediaItem['channel_id'] in [u'电视剧', u'动漫']:
            pagenum = 1
            # Page until the API returns an empty page.
            while True:
                videos_url = self.other_album_api % (mediaItem['cont_id'],
                                                     pagenum)
                result = Util.get_url_content(videos_url)
                page_items = self.other_album_resolve(
                    text=result, meta={'url': videos_url, 'pagenum': pagenum})
                if not page_items:
                    break
                videoItems = videoItems + page_items
                pagenum = pagenum + 1
        if videoItems:
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    # was `except Exception, e:` (Python 2-only syntax, `e` unused)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
    logging.log(logging.INFO, 'media url: %s' % request_url)
    # BUG FIX: items was built but never returned.
    return items
def parse_episode_info(self, response):
    """Parse a youku media page into a MediaVideoItem.

    Reads cat_id/poster_url (and optional untrack_id/sid/mid) from the
    request meta. Returns a one-element list, or an empty list on error
    (bare return preserved when the page id is missing).
    """
    # BUG FIX: items was initialized inside try, so an early exception
    # (e.g. a missing meta key) made `return items` raise NameError.
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        page_id = self.get_youku_pageid(response.request.url)
        if not page_id:
            # was `log.error(...)` — `log` is not the module logger
            logging.log(logging.ERROR,
                        'miss content id: %s' % response.request.url)
            return
        untrack_id = ""
        sid = ""
        mid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        if "mid" in response.request.meta:
            mid = response.request.meta['mid']
        title = self.parse_title(response, cat_id)
        performer_list = self.parse_actor(response)
        director_list = self.parse_director(response)
        district_list = response.xpath(
            '//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'
            % u'地区:').extract()
        type_list = response.xpath(
            '//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'
            % u'类型:').extract()
        play_date = self.parse_play_date(response)
        total_num = self.parse_total_num(response)
        # NOTE(review): extracted but unused — kept for parity with the
        # original page scrape.
        year_list = response.xpath(
            '//div[@class="mod plot"]/ul[@class="filter"]'
            '/li[@class="v-year"]/a/em/text()').extract()
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        text = response.xpath(
            '//div[@class="detail"]/span/text()').extract()
        videoitems = []
        ep_item = MediaItem()
        if title:
            ep_item["title"] = title[0].strip()
        if pers:
            ep_item["actor"] = pers
        # BUG FIX: was `if dirs > 0:` — a Python 2 str/int comparison that
        # is always True, so an empty director string was stored.
        if dirs:
            ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if district_list:
            ep_item["district"] = district_list[0].strip()
        if play_date:
            ep_item["release_date"] = Util.str2date(play_date)
        if total_num:
            ep_item["vcount"] = total_num
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "youku")
        if text:
            ep_item["intro"] = text[0].strip()
        ep_item["cont_id"] = page_id
        ep_item["info_id"] = Util.md5hash(Util.summarize(ep_item))
        mvitem = MediaVideoItem()
        if mid:
            mvitem['mid'] = mid
        mvitem["media"] = ep_item
        if untrack_id:
            mvitem["untrack_id"] = untrack_id
        if sid:
            mvitem["sid"] = sid
        video_list = self.parse_video_item(response, cat_id,
                                           ep_item["title"], page_id)
        mvitem['video'] = video_list
        Util.set_ext_id(mvitem["media"], mvitem["video"])
        items.append(mvitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def parse_video_item(self, response, cat_id, url, title, playlistId):
    """Collect episode VideoItems from a wasu media or play page.

    Variety pages delegate to parse_variety; media pages walk one of two
    episode-list layouts (skipping trailers); a bare play page wraps the
    single url. Always returns a MediaVideoItem (its 'video' list may be
    empty on failure).
    """
    episodes = []
    media = MediaItem()
    result = MediaVideoItem()
    result["media"] = media
    result["video"] = episodes
    try:
        if int(cat_id) == int(self.variety_id):
            variety_items = self.parse_variety(response)
            if variety_items:
                for one in variety_items:
                    episodes.append(one)
        elif '/Play/show' not in url:
            # Layout 1, e.g. http://www.wasu.cn/Tele/index/id/6539647
            nodes = response.xpath(
                '//div[@class="teleplay_gather tab_box"]/div/ul/li')
            if nodes:
                for node in nodes:
                    # Skip trailers.
                    if node.xpath('.//i[@class="yugao"]').extract():
                        continue
                    url = node.xpath('./a/@href').extract()
                    ttitle = node.xpath('./a/@title').extract()
                    snum = node.xpath('./a/text()').extract()
                    play_num = ""
                    if snum:
                        play_num = self.get_play_num(snum[0])
                    if int(cat_id) == int(self.variety_id):
                        vnum_override = self.getvnum(self.url_prefix + url[0])
                        if vnum_override:
                            play_num = vnum_override
                    if not ttitle:
                        ttitle = [play_num]
                    if self.site_name == Util.guess_site(url[0]):
                        candidate = self.compose_vitem(
                            [url[0]], [title[0].strip()], play_num)
                    else:
                        candidate = self.compose_vitem(
                            [self.url_prefix + url[0]],
                            [title[0].strip()], play_num)
                    if 'url' in candidate:
                        episodes.append(candidate)
            if not nodes:
                # Layout 2, e.g. http://www.wasu.cn/Tele/index/id/6786984
                # NOTE(review): `ends-with` is XPath 2.0 — verify the
                # selector engine accepts it.
                nodes = response.xpath(
                    '//div[@class="tab_box"]//div[ends-with(@class, "col2")]')
                for node in nodes:
                    if node.xpath('.//i[@class="yugao"]').extract():
                        continue
                    url = node.xpath(
                        './div[@class="ws_des"]/p[1]/a/@href').extract()
                    ttitle = node.xpath(
                        './div[@class="ws_des"]/p[2]/span/text()').extract()
                    snum = node.xpath(
                        './div[@class="ws_des"]/p[1]/a/text()').extract()
                    play_num = ""
                    if snum:
                        play_num = self.get_play_num(snum[0])
                    if int(cat_id) == int(self.variety_id):
                        vnum_override = self.getvnum(self.url_prefix + url[0])
                        if vnum_override:
                            play_num = vnum_override
                    if not ttitle:
                        ttitle = [play_num]
                    if self.site_name == Util.guess_site(url[0]):
                        candidate = self.compose_vitem(
                            [url[0]], [title[0].strip()], play_num)
                    else:
                        candidate = self.compose_vitem(
                            [self.url_prefix + url[0]],
                            [title[0].strip()], play_num)
                    if 'url' in candidate:
                        episodes.append(candidate)
        else:
            # Play page with no media page: wrap the single url.
            if url:
                candidate = self.compose_vitem([url], title, 1)
                if 'url' in candidate:
                    episodes.append(candidate)
        if episodes:
            result["video"] = episodes
            result["media"]["url"] = response.request.url
            Util.set_ext_id(result["media"], result["video"])
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return result
def parse_episode_info(self, response):
    """Parse a qq (v.qq.com) media page into a MediaVideoItem.

    Tries several page layouts for each field, resolves the play list
    via the loadplaylist API (falling back to the topic-page list), and
    returns a one-element list, or an empty list on failure.
    """
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        # title: try the rich-intro layout, then three fallbacks.
        title = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]'
            '/div[@class="video_title"]/strong/a/text()').extract()
        if not title or not title[0]:
            title = response.xpath(
                '//div[@class="mod_box mod_video_info"]'
                '/div[@class="mod_hd mod_hd_border"]/h1/strong/@title'
            ).extract()
        if not title or not title[0]:
            title = response.xpath(
                '//div[@class="mod_box mod_video_info"]'
                '/div[@class="mod_hd mod_hd_border"]/h2/strong/@title'
            ).extract()
        if not title or not title[0]:
            title = response.xpath(
                '//div[@class="mod_page_banner"]'
                '/div[@class="banner_pic"]/a/@title').extract()
        # performers
        performer_list = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]'
            '/div[@class="video_info cf"]/div[@class="info_line cf"]'
            '/div[@class="info_cast"]/a/span/text()').extract()
        if not performer_list:
            performer_list = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]'
                '/p/span[text()="%s"]/../span[@class="content"]/a/span/text()'
                % u'主演:').extract()
        # director
        director_list = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]'
            '/div[@class="video_info cf"]/div[@class="info_line cf"]'
            '/div[@class="info_director"]/a/span/text()').extract()
        if not director_list:
            director_list = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]'
                '/p/span[text()="%s"]/../span[@class="content"]/a/span/text()'
                % u'导演:').extract()
        # intro text
        text = response.xpath(
            '//div[@class="movie_info_wrap"]/div[1]/d1[1]/dd[3]'
            '/p[@class="detail_all"]/text()').extract()
        if not text:
            # BUG FIX: the fallback xpath result was computed but never
            # assigned to `text`, so the fallback intro was always lost.
            text = response.xpath(
                '//div[@class="mod_video_focus"]/div[@class="info_desc"]'
                '/span[@class="desc"]/text()').extract()
        type_list = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]'
            '/div[@class="video_info cf"]'
            '/div[@class="info_line info_line_tags cf"]'
            '/div[@class="info_tags"]/a/span/text()').extract()
        if not type_list:
            type_list = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]'
                '/p/span[text()="%s"]/../span[@class="content"]/a/text()'
                % u'类型:').extract()
        year_info = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]'
            '/div[@class="video_title"]'
            '/span[@class="video_current_state"]'
            '/span[@class="current_state"]/text()').extract()
        if not year_info:
            year_info = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]'
                '/p/span[text()="%s"]/../span[@class="content"]/a/text()'
                % u'年份:').extract()
        play_date = None
        if year_info:
            play_date = self.get_year(year_info[0])
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        pers = Util.join_list_safely(performer_list)
        # sourceid drives the playlist API url below.
        sourceid = ""
        sourceid_list = response.xpath(
            '//div[@class="mod_bd sourceCont"]/@sourceid').extract()
        if sourceid_list:
            sourceid = sourceid_list[0]
        videoitems = []
        ep_item = MediaItem()
        if len(title) > 0:
            ep_item["title"] = title[0]
        if len(pers) > 0:
            ep_item["actor"] = pers
        if len(dirs) > 0:
            ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if play_date:
            ep_item["release_date"] = Util.str2date(play_date)
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["url"] = Util.normalize_url(response.request.url, "qq")
        ep_item["poster_url"] = poster_url
        if len(text) > 0:
            ep_item["intro"] = text[0]
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        mvitem["video"] = videoitems
        url_pre = "http://s.video.qq.com/loadplaylist?vkey="
        url_tail = ("&vtype=2&otype=json&video_type=2"
                    "&callback=jQuery191048201349820010364_1425370006500"
                    "&low_login=1")
        videoid = self.get_qq_showid(response.request.url)
        mvitem["media"]["cont_id"] = videoid
        mvitem["media"]["info_id"] = Util.md5hash(
            Util.summarize(mvitem["media"]))
        vurl = url_pre + str(sourceid) + url_tail
        tflag = "jQuery191048201349820010364_1425370006500"
        tpitem = self.parse_play_list(cat_id, vurl, tflag, response)
        if not tpitem:
            # No sourceid (e.g. a topic page) — scrape the list directly.
            tpitem = self.parse_topic_play_list(response)
            videoids = response.xpath(
                '//div[@class="mod_episodes_info episodes_info"]'
                '/input[@name="cid"]/@value').extract()
            if videoids:
                mvitem["media"]["cont_id"] = videoids[0]
        if tpitem:
            mvitem["video"] = tpitem
            Util.set_ext_id(mvitem["media"], mvitem["video"])
            if untrack_id:
                mvitem["untrack_id"] = untrack_id
            if sid:
                mvitem["sid"] = sid
            if self.check_url(mvitem):
                items.append(mvitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def media_parse(self, response):
    """Parse a kankan media page: skip unwanted play types, extract media
    info, then collect the episode list per channel.

    Variety pages iterate fenji tabs; movies take the "watch now" link;
    dramas/anime iterate ascending fenji lists. Returns a list with the
    enriched item (empty on skip/failure).
    """
    items = []
    # BUG FIX: pre-bind so the trailing log cannot raise NameError.
    request_url = ''
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'media url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] \
            if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] \
            if 'media' in mediaVideoItem else MediaItem()
        # Skip films whose play_type is in skip_types.
        sels = response.xpath('//head//script')
        if sels:
            regex_express = r'movieInfo\.play_type[ ]?=[ ]?\'(.*)\''
            match_result = sels.re(regex_express)
            if match_result:
                play_type = match_result[0]
                if play_type in self.skip_types:
                    return items
        # Some urls redirect (e.g. movie.kankan.com -> data.movie.kankan.com);
        # store the final url.
        mediaItem['url'] = request_url
        sels = response.xpath('//head')
        kankan_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//div[@class="info_list"]')
        if sels:
            kankan_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//ul[@class="detail_ul"]')
        if sels:
            kankan_extract.media_info_extract(sels, mediaItem)
        # Collect the media's episode list.
        videoItems = []
        if u'综艺' == mediaItem['channel_id']:
            # variety
            sels = response.xpath(
                r'//div[@id[re:test(., "fenji_[\d]+_[\d]+")]]')
            for sel in sels:
                video_sels = sel.xpath('.//li')
                for video_sel in video_sels:
                    videoItem = VideoItem()
                    videoItem['intro'] = mediaItem['channel_id']
                    kankan_extract.video_info_extract(video_sel, videoItem)
                    if 'url' in videoItem:
                        url = videoItem['url']
                        url = Util.get_absolute_url(url, prefix_url)
                        videoItem['url'] = url
                        self.set_video_info(videoItem,
                                            mediaItem['channel_id'])
                        videoItems.append(videoItem)
        elif u'电影' == mediaItem['channel_id']:
            # movie: take the url from the "watch now" block
            videoItem = VideoItem()
            Util.copy_media_to_video(mediaItem, videoItem)
            sels = response.xpath('//div[@class="section clearfix s2"]')
            if sels:
                urls = sels.xpath(
                    './/a[starts-with(@class, "foc")]/@href').extract()
                thumb_urls = sels.xpath(
                    './/a[@class="foc"]/img/@src').extract()
                if urls:
                    url = urls[0]
                    url = Util.get_absolute_url(url, prefix_url)
                    videoItem['url'] = url
                    if thumb_urls:
                        videoItem['thumb_url'] = thumb_urls[0]
                    self.set_video_info(videoItem, mediaItem['channel_id'])
                    videoItems.append(videoItem)
        else:
            # drama / anime
            sels = response.xpath(
                r'//div[@id[re:test(., "fenji_[\d]+_asc")]]')
            if not sels:
                sels = response.xpath(
                    r'//ul[@id[re:test(., "fenji_[\d]+_asc")]]')
            for sel in sels:
                video_sels = sel.xpath('.//li')
                for video_sel in video_sels:
                    videoItem = VideoItem()
                    videoItem['intro'] = mediaItem['channel_id']
                    kankan_extract.video_info_extract(video_sel, videoItem)
                    if 'url' in videoItem:
                        url = videoItem['url']
                        url = Util.get_absolute_url(url, prefix_url)
                        videoItem['url'] = url
                        self.set_video_info(videoItem,
                                            mediaItem['channel_id'])
                        videoItems.append(videoItem)
        if videoItems:
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
        else:
            logging.log(logging.INFO, '%s: no videos' % request_url)
    # was `except Exception, e:` (Python 2-only syntax, `e` unused)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
    logging.log(logging.INFO, 'media url: %s' % request_url)
    # BUG FIX: items was built but never returned.
    return items
def api_parse(self, response):
    """Parse a media page via the app/web APIs.

    Extracts pid/vid from inline scripts, resolves media info through
    app_api, then builds the video list (single synthesized item for
    movies, paged web_api otherwise). Returns a list with the enriched
    item (empty on failure).
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'api prase url: %s' % request_url)
        mediaVideoItem = response.request.meta['item'] \
            if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] \
            if 'media' in mediaVideoItem else MediaItem()
        mediaItem['url'] = request_url
        scripts = response.xpath('.//script[@type="text/javascript"]')
        pidl = scripts.re(r'"pid"\D?(\d+)')
        vidl = scripts.re(r'"id"\D?(\d+)')
        if pidl and vidl:
            pid = pidl[0]
            vid = vidl[0]
            ismovie = False
            isvariaty = False
            # Movies key the API on vid; everything else on pid.
            # (dead pre-branch app_api assignment removed — every branch
            # recomputed it)
            if u'电影' == mediaItem['channel_id']:
                ismovie = True
                app_api = self.app_api % (self.get_auth(), vid)
                mediaItem['cont_id'] = str(vid)
            elif u'综艺' == mediaItem['channel_id']:
                isvariaty = True
                app_api = self.app_api % (self.get_auth(), pid)
                mediaItem['cont_id'] = str(pid)
            else:
                app_api = self.app_api % (self.get_auth(), pid)
                mediaItem['cont_id'] = str(pid)
            xpara = self.get_xdata(url=app_api)
            mediaItem = self.resolve_media_info(xpara, mediaItem,
                                                ismovie=ismovie)
            mediaItem['url'] = Util.normalize_url(request_url, self.site_code)
            mediaItem['site_id'] = self.site_id
            mediaItem['channel_id'] = self.channels_name_id[
                mediaItem['channel_id']]
            mediaItem['info_id'] = Util.md5hash(Util.summarize(mediaItem))
            max_page = self.get_max_page(xpara)
            video_list = []
            if ismovie:
                # Movies: synthesize the single video from the media item.
                videoItem = VideoItem()
                videoItem['title'] = mediaItem['title'] \
                    if 'title' in mediaItem else None
                videoItem['thumb_url'] = mediaItem['poster_url'] \
                    if 'poster_url' in mediaItem else None
                videoItem['url'] = mediaItem['url'] \
                    if 'url' in mediaItem else None
                videoItem['os_id'] = self.os_id
                videoItem['site_id'] = self.site_id
                videoItem['ext_id'] = Util.md5hash(mediaItem['url']) \
                    if 'url' in mediaItem else None
                videoItem['vnum'] = mediaItem['vcount'] \
                    if 'vcount' in mediaItem else 1
                videoItem['cont_id'] = mediaItem['cont_id'] \
                    if 'cont_id' in mediaItem else None
                video_list.append(videoItem)
            else:
                for i in range(1, max_page):
                    web_api = self.web_api % (pid, i)
                    dpara = self.get_ddata(url=web_api)
                    video_list += self.resolve_video_item(
                        dpara, page_num=i, isvariaty=isvariaty)
                if isvariaty:
                    video_list = self.revise_video_item(video_list, xpara)
            if video_list:
                Util.set_ext_id(mediaItem, video_list)
                mediaVideoItem['media'] = mediaItem
                mediaVideoItem['video'] = video_list
                items.append(mediaVideoItem)
    # was `except Exception, e:` (Python 2-only syntax, `e` unused)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
    # BUG FIX: items was built but never returned.
    return items
def parse_episode_play(self, response):
    """Parse a qq play page into a MediaVideoItem.

    Non-movie pages walk the episode album list; movie pages wrap the
    single normalized play url. Returns the item, or None on failure.
    """
    mvitem = None
    try:
        logging.log(logging.INFO,
                    'parse_episode_play: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = ""
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        # title
        title_list = response.xpath(
            '//div[@class="movie_info"]/div[@class="title_wrap"]'
            '/h3/a/@title').extract()
        if not title_list:
            title_list = response.xpath(
                '//div[@class="intro_lt"]/div[@class="intro_title cf"]'
                '/p[@class="title_cn"]/text()').extract()
        # performers
        performer_list = response.xpath(
            '//div[@class="movie_info"]/div[@class="movie_detail"]'
            '/dl[@class="detail_list"]/dd[@class="actor"]/a/text()'
        ).extract()
        # director
        director_list = response.xpath(
            '//div[@class="movie_info"]/div[@class="movie_detail"]'
            '/dl[@class="detail_list"]/dd[@class="type"]'
            '/span[text()="%s"]/a/text()' % u'导演:').extract()
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        # intro
        text = response.xpath(
            '//div[@class="movie_info_wrap"]/div[1]/d1[1]/dd[3]'
            '/p[@class="detail_all"]/text()').extract()
        ep_item = MediaItem()
        videoitems = []
        if int(cat_id) != int(self.movie_id):
            # episode list
            video_list = response.xpath(
                '//div[@class="tabcont_warp tabcont_warp_yespadding"]'
                '/div[@class="tabcont_album"]'
                '/ul[@class="album_list cf"]/li')
            for tvideo in video_list:
                lurl = tvideo.xpath('./a/@href').extract()
                lnum = tvideo.xpath('./a/span/text()').extract()
                vitem = VideoItem()
                if lnum and lurl:
                    vitem["vnum"] = lnum[0]
                    surl = "http://film.qq.com" + lurl[0]
                    vitem["os_id"] = self.os_id
                    vitem["site_id"] = self.site_id
                    # BUG FIX: was `if tv: ... if cartoon: ... else: ...` —
                    # the second if/else always overwrote the TV url with
                    # the generic normalization.
                    if cat_id == self.tv_id:
                        turl = Util.normalize_url(surl, "qq", "tv")
                    elif cat_id == self.cartoon_id:
                        turl = Util.normalize_url(surl, "qq", "cartoon")
                    else:
                        turl = Util.normalize_url(surl, "qq")
                    if not turl:
                        continue
                    vitem["ext_id"] = Util.md5hash(turl)
                    vitem["url"] = turl
                    vitem["cont_id"] = self.get_qq_showid(vitem["url"])
                    videoitems.append(vitem)
        else:
            # movie: single video from the play url itself
            vitem = VideoItem()
            if title_list:
                vitem["title"] = title_list[0]
            vitem["vnum"] = "1"
            vitem["os_id"] = self.os_id
            vitem["site_id"] = self.site_id
            turl = Util.normalize_url(response.request.url, "qq")
            vitem["url"] = turl
            vitem["ext_id"] = Util.md5hash(turl)
            vitem["cont_id"] = self.get_qq_showid(vitem["url"])
            videoitems.append(vitem)
        if len(title_list) > 0:
            ep_item["title"] = title_list[0]
        if len(pers) > 0:
            ep_item["actor"] = pers
        if len(dirs) > 0:
            ep_item["director"] = dirs
        if len(text) > 0:
            ep_item["intro"] = text[0]
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["cont_id"] = self.get_qq_showid(response.request.url)
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        mvitem["video"] = videoitems
        mvitem["media"]["url"] = Util.normalize_url(
            response.request.url, "qq")
        if untrack_id:
            mvitem["untrack_id"] = untrack_id
        if sid:
            mvitem["sid"] = sid
        mvitem["media"]["info_id"] = Util.md5hash(
            Util.summarize(mvitem["media"]))
        # (duplicate discarded Util.md5hash(Util.summarize(...)) call removed)
        Util.set_ext_id(mvitem["media"], mvitem["video"])
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return mvitem
def media_parse(self, response):
    """Parse an iqiyi media (album) page.

    Extracts media attributes from several page layouts, then fetches the
    episode list through either the source-year API (variety shows) or the
    album API, selected by the type suffix carried in cont_id.
    Returns a list with one populated MediaVideoItem (empty on failure).
    """
    items = []
    # hoisted so the except-handler logging below cannot hit an
    # unbound name if an exception fires early
    request_url = response.request.url
    try:
        logging.log(logging.INFO, 'media url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] \
            if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] \
            if 'media' in mediaVideoItem else MediaItem()
        videoItems = []
        # regular media page layouts
        channel_id_fun = mediaItem['channel_id']
        sels = response.xpath('//div[@id="qitancommonarea"]')
        iqiyi_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//script[@type="text/javascript"]')
        iqiyi_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//div[@class="mod_search_topic mb20"]')
        if not sels:
            sels = response.xpath('.//div[@id="block-B"]')
        iqiyi_extract.media_info_extract(sels, mediaItem)
        # special-feature media page
        iqiyi_extract.media_info_extract(response, mediaItem)
        cont_id = mediaItem['cont_id'] if 'cont_id' in mediaItem else None
        title = mediaItem['title'] if 'title' in mediaItem else None
        if cont_id and title:
            # cont_id is "<id>|<type>", type is source_id or album_id
            cont_ids = cont_id.split('|')
            cont_id = cont_ids[0]
            cont_type = cont_ids[1]
            mediaItem['channel_id'] = channel_id_fun
            channel_id_site = iqiyi_extract.list_channels_id[channel_id_fun]
            if cont_type == 'source_id':
                # variety shows (快乐大本营 etc.) list episodes per year
                url = self.source_year_api % (channel_id_site, cont_id)
                result = Util.get_url_content(url)
                years = self.source_year_json_resolve(result, url)
                for year in years:
                    url = self.source_media_api % (
                        channel_id_site, cont_id, year,
                        channel_id_site, cont_id, year)
                    result = Util.get_url_content(url)
                    videoItems = videoItems + self.source_media_json_resolve(
                        result, mediaItem, url)
            elif cont_type == 'album_id':
                page = 1
                url = self.album_media_api % (cont_id, page, cont_id, page)
                result = Util.get_url_content(url)
                videoItems = videoItems + self.album_media_json_resolve(
                    result, mediaItem, url)
            # special programs (e.g. /yule/cjkgbj.html) yield no episodes
            # and are silently skipped
            if videoItems:
                Util.set_ext_id(mediaItem, videoItems)
                self.set_media_info(mediaItem)
                mediaVideoItem['media'] = mediaItem
                mediaVideoItem['video'] = videoItems
                items.append(mediaVideoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'media url: %s' % request_url)
    # BUGFIX: items was built but never returned
    return items
def video_parse(self, response):
    """Parse a pptv play page into MediaVideoItem list.

    Films yield a single VideoItem built from the page itself; other
    channels page through the album API using pid/cat_id/id values scraped
    from inline scripts.
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta[
            'item'] if 'item' in response.request.meta else MediaVideoItem(
            )
        mediaItem = mediaVideoItem[
            'media'] if 'media' in mediaVideoItem else MediaItem()
        # paid flag from the url prefix
        if prefix_url == self.vip_prefix_url:
            mediaItem['paid'] = '1'
        else:
            mediaItem['paid'] = '0'
        mediaItem['url'] = request_url
        pptv_extract.media_info_extract(response, mediaItem)
        videoItems = []
        if u'电影' == mediaItem['channel_id']:
            # film: one video, taken from the play page itself
            if 'cont_id' not in mediaItem or not mediaItem['cont_id']:
                return items
            videoItem = VideoItem()
            videoItem['url'] = mediaItem['url']
            videoItem['cont_id'] = mediaItem['cont_id']
            Util.copy_media_to_video(mediaItem, videoItem)
            self.set_video_info(videoItem)
            videoItems.append(videoItem)
        else:
            sel = response.xpath('//script[@type="text/javascript"]')
            # pull pid & cid from inline scripts, used to fetch episode
            # lists for tv series / variety shows / cartoons
            if sel:
                pids = sel.re('\"pid\"[ ]?:[ ]?(\d+)')
                cids = sel.re('\"cat_id\"[ ]?:[ ]?(\d+)')
                vids = sel.re('\"id\"[ ]?:[ ]?(\d+)')
                if pids and cids and vids:
                    pid = pids[0]
                    cid = cids[0]
                    vid = vids[0]
                    page = 1
                    # assign the media cont_id
                    mediaItem['cont_id'] = pid
                    # page through the album API until an empty page
                    while True:
                        meta = {
                            'pid': pid,
                            'cid': cid,
                            'vid': vid,
                            'page': page
                        }
                        url = self.album_api % (pid, cid, vid, page)
                        result = Util.get_url_content(url)
                        page_result = self.album_json_resolve(
                            result, mediaItem, meta)
                        if not page_result['items']:
                            # the auth API below is disabled for now: it
                            # cannot provide video urls
                            #for auth in self.auths:
                            #    url = self.auth_album_api % (pid, auth)
                            #    result = Util.get_url_content(url)
                            #    page_items = self.auth_album_xml_resolve(result, mediaItem, meta)
                            #    if page_items:
                            #        videoItems = page_items
                            #        break
                            break
                        else:
                            videoItems = videoItems + page_result['items']
                            page = page + 1
        if videoItems:
            # set ext_id
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception, e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'video url: %s' % request_url)
    # NOTE(review): items is built but the function ends without
    # `return items` - likely a missing return; confirm against callers.
def parse_episode_info(self, response):
    """Parse a baofeng media detail page into a list of MediaVideoItem.

    Reads cat_id/poster_url (and optional untrack_id/sid) from the request
    meta; trailer pages (title containing u'预:') are skipped.
    """
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        title_list = response.xpath(
            '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/h3/a/@title'
        ).extract()
        director_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()'
            % u'导演:').extract()
        performer_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()'
            % u'主演:').extract()
        type_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()'
            % u'类型:').extract()
        district_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()'
            % u'地区:').extract()
        year_info = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/text()'
            % u'地区:').extract()
        year = None
        if len(year_info) >= 2:
            year = self.get_year(year_info[1])
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        districts = Util.join_list_safely(district_list)
        # intro text
        text = response.xpath(
            '//div[@class="juqing briefTab"]/div/text()').extract()
        play_url = ""
        tplay_url = response.xpath(
            '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/div[@class="sourcePlay"]/a[@id="moviePlayButton"]/@href'
        ).extract()
        if tplay_url:
            play_url = self.url_prefix + tplay_url[0].strip()
        ep_item = MediaItem()
        if title_list:
            ep_item["title"] = title_list[0]
            if ep_item["title"].find(u'预:') >= 0:
                # trailer page - skip it.
                # BUGFIX: was a bare Py2 `print`; use logging like the
                # rest of the file
                logging.log(logging.INFO,
                            u'预告片,url %s' % response.request.url)
                return items
        ep_item["actor"] = pers
        ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if district_list:
            ep_item["district"] = districts
        if year:
            ep_item["release_date"] = Util.str2date(year)
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "baofeng")
        if len(text) > 0:
            ep_item["intro"] = text[0].strip()
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        videoid = self.getshowid(response.request.url)
        mvitem["media"]["cont_id"] = videoid
        ttvitem = {}
        if title_list:
            ttvitem = self.parse_video_item(response, cat_id, play_url,
                                            title_list, None)
        if ttvitem:
            if 'video' in ttvitem and len(ttvitem['video']) > 0:
                mvitem['video'] = ttvitem['video']
                mvitem["media"]["info_id"] = Util.md5hash(
                    Util.summarize(mvitem["media"]))
                Util.set_ext_id(mvitem["media"], mvitem["video"])
                if untrack_id and sid:
                    mvitem["untrack_id"] = untrack_id
                    mvitem["sid"] = sid
                res = self.check_url(mvitem)
                if res:
                    items.append(mvitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def parse_episode_info(self,response):
    """Parse a tudou album page into a list of MediaVideoItem.

    Media attributes come from the request meta when present, otherwise
    from the page; the episode list comes from the getAlbumvoInfo JSON API.
    """
    try:
        logging.log(logging.INFO, 'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        title = response.request.meta['title']
        actor = response.request.meta['actor']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        # NOTE(review): items is only bound here, inside the try; an
        # exception raised above it would make the final `return items`
        # fail with NameError - confirm intended.
        items = []
        # fill in anything the meta did not carry, from the page itself
        if not poster_url:
            poster_url_list = response.xpath('//div[@class="cover_img"]/div[@class="pack pack_album"]/div[@class="pic"]/img/@src').extract()
            if poster_url_list:
                poster_url = poster_url_list[0]
        if not title:
            title_list = response.xpath('//div[@class="cover_info"]/h2/strong/@title').extract()
            if title_list:
                title = title_list[0]
        if not actor:
            actor_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' % u' 主演:').extract()
            if actor_list:
                actor = Util.join_list_safely(actor_list)
        # performer
        pers = actor
        type_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' % u'类型:\n').extract()
        district_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' % u'地区:').extract()
        release_date_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' % u'年代:').extract()
        types = None
        if type_list:
            types = Util.join_list_safely(type_list)
        # director: u'编导:' first, fall back to u'导演:'
        director_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' % u'编导:').extract()
        if not director_list:
            director_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' % u'导演:').extract()
        dirs = Util.join_list_safely(director_list)
        # intro text
        text = response.xpath('//div[@class="cover_info"]/div[@class="desc"]/p/text()').extract()
        # sourceid, derived from the album url
        sourceid = self.get_tudou_showid(response.request.url)
        videoitems = []
        ep_item = MediaItem()
        if len(title) > 0:
            ep_item["title"] = title
        if len(pers) > 0:
            ep_item["actor"] = pers
        if len(dirs) > 0:
            ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if district_list:
            ep_item["district"] = district_list[0].strip()
        if release_date_list:
            ep_item["release_date"] = Util.str2date(release_date_list[0])
        ep_item["cont_id"] = sourceid
        ep_item["site_id"] = self.site_id
        ep_item["url"] = response.request.url
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        if len(text) > 0:
            ep_item["intro"] = text[0]
        mvitem = MediaVideoItem();
        mvitem["media"] = ep_item;
        mvitem["video"] = videoitems
        # episode list from the album JSON API
        lurl = "http://www.tudou.com/crp/getAlbumvoInfo.action?charset=utf-8&areaCode=110000&acode=" + str(sourceid)
        info = self.httpdownload.get_data(lurl)
        jinfo = json.loads(info)
        if "items" in jinfo:
            for sitem in jinfo["items"]:
                vitem = VideoItem()
                vitem["title"] = sitem["itemTitle"]
                vitem["vnum"] = sitem["episode"]
                vitem["os_id"] = self.os_id
                trailer = sitem['trailer']
                if not sitem["itemPlayUrl"]:
                    continue
                # skip trailers
                if trailer:
                    continue
                turl = Util.normalize_url(sitem["itemPlayUrl"],"tudou")
                vitem["url"] = turl
                vitem["os_id"] = self.os_id
                vitem["site_id"] = self.site_id
                vitem["ext_id"] = Util.md5hash(turl)
                vitem["cont_id"] = self.get_tudou_showid(turl)
                mvitem["video"].append(vitem)
        if len(mvitem["video"]) > 0:
            Util.set_ext_id(mvitem["media"],mvitem["video"])
            mvitem["media"]["info_id"] = Util.md5hash(Util.summarize(mvitem["media"]))
            if untrack_id:
                mvitem["untrack_id"] = untrack_id
            if sid:
                mvitem["sid"] = sid
            if self.check_url(mvitem):
                items.append(mvitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def parse_episode_play(self, response, untrack_id, sid):
    """Build a single-episode MediaVideoItem from a sohu play page.

    Returns the populated item, or None when no playlist id / vid can be
    scraped from the page scripts.
    """
    result_item = None
    try:
        page_url = response.request.url
        logging.log(logging.INFO, 'parse_episode_play: %s' % page_url)
        channel = response.request.meta['cat_id']
        # vip
        titles = response.xpath(
            '//div[@id="crumbsBar"]/div[@class="area cfix"]/div[@class="left"]/h2/@title'
        ).extract()
        directors = response.xpath(
            '//div[@class="info info-con"]/ul/li[text()="%s"]/a/text()'
            % u'导演:').extract()
        actors = response.xpath(
            '//div[@class="info info-con"]/ul/li[text()="%s"]/a/text()'
            % u'主演:').extract()
        intro_texts = response.xpath(
            '//div[@class="info info-con"]/p[@class="intro"]/text()'
        ).extract()
        actor_str = "|".join([a.strip() for a in actors])
        director_str = "|".join([d.strip() for d in directors])
        # the playlist id may be declared in any of several script variants
        playlist_id = ""
        for pattern in (r'var playlistId.*?(\d+)',
                        r'var PLAYLIST_ID.*?(\d+)',
                        r'= playlistId.*?(\d+)'):
            found = response.selector.re(re.compile(pattern))
            if found:
                playlist_id = found[0]
                break
        vid = ""
        vid_matches = response.selector.re(re.compile(r'var vid.*?(\d+)'))
        if vid_matches:
            vid = vid_matches[0]
        if not playlist_id or not vid:
            return None
        media = MediaItem()
        media["cont_id"] = playlist_id
        if titles:
            media["title"] = titles[0]
        media["actor"] = actor_str
        media["director"] = director_str
        media["site_id"] = self.site_id
        media["channel_id"] = channel
        media["url"] = Util.normalize_url(page_url, "sohu")
        if intro_texts:
            media["intro"] = intro_texts[0].strip()
        result_item = MediaVideoItem()
        result_item["media"] = media
        if untrack_id:
            result_item["untrack_id"] = untrack_id
        if sid:
            result_item["sid"] = sid
        # a play page always maps to exactly one episode
        video = VideoItem()
        video["title"] = media["title"] if 'title' in media else None
        video["url"] = media["url"]
        video["vnum"] = "1"
        video["os_id"] = self.os_id
        video["ext_id"] = Util.md5hash(media["url"])
        video["site_id"] = self.site_id
        video["cont_id"] = vid
        result_item["video"] = [video]
        result_item["media"]["info_id"] = Util.md5hash(
            Util.summarize(result_item["media"]))
        Util.set_ext_id(result_item["media"], result_item["video"])
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return result_item
def media_parse(self, response):
    """Parse a 1905.com (dy1905) film media page.

    Extracts the play url and media attributes, follows the plot / cast
    sub-pages for extra info, and returns a list with one populated
    MediaVideoItem (empty when the film has no play address or on failure).
    """
    items = []
    # hoisted so the except-handler logging cannot hit an unbound name
    request_url = response.request.url
    try:
        logging.log(logging.INFO, 'media url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] \
            if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] \
            if 'media' in mediaVideoItem else MediaItem()
        # play address
        videoItems = []
        videoItem = VideoItem()
        Util.copy_media_to_video(mediaItem, videoItem)
        pic_sels = response.xpath('//div[@class="laMovPIC fl pr22"]')
        dy1905_extract.video_info_extract(pic_sels, videoItem)
        if 'url' not in videoItem:
            # info-only page without a play address - drop it
            logging.log(logging.INFO, '该影片找不到播放地址: %s' % request_url)
            return items
        url = Util.get_absolute_url(videoItem['url'], prefix_url)
        videoItem['url'] = url
        self.set_video_info(videoItem)
        videoItems.append(videoItem)
        # media attributes: paid flag from the play url prefix
        video_prefix_url = Util.prefix_url_parse(url)
        if video_prefix_url in self.vip_prefix_urls:
            mediaItem['paid'] = '1'
        else:
            mediaItem['paid'] = '0'
        dy1905_extract.media_info_extract(pic_sels, mediaItem)
        sels = response.xpath('//div[@class="laMovMAIN fl"]')
        dy1905_extract.media_info_extract(sels, mediaItem)
        # plot and cast sub-pages
        nav_sels = response.xpath(
            '//ul[@class="navSMb"]//li[@class="mdbpLeft2"]//div[@class="nowDefLine DefBOttom"]//a'
        )
        if nav_sels:
            for sel in nav_sels:
                labels = sel.xpath('./text()').extract()
                urls = sel.xpath('./@href').extract()
                if labels and urls:
                    label = labels[0].strip()
                    # BUGFIX: the second prefix was a byte string
                    # ('演职人员'); comparing it against a unicode label
                    # fails under Python 2 - both prefixes must be unicode
                    if label.startswith(u'剧情') or label.startswith(u'演职人员'):
                        more_url = Util.get_absolute_url(urls[0], prefix_url)
                        result = Util.get_url_content(more_url)
                        dy1905_extract.media_more_info_resolve(
                            result, mediaItem)
        # make the media url absolute
        mediaItem['url'] = Util.get_absolute_url(mediaItem['url'], prefix_url)
        if videoItems:
            # set ext_id
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'media url: %s' % request_url)
    # BUGFIX: items was built but never returned
    return items
def parse_episode_info(self, response):
    """Parse a sohu media detail page into a list of MediaVideoItem.

    Scrapes the playlist id from inline scripts, fills the media item from
    the page, resolves episodes via parse_video_item, and falls back to the
    pure-API path (api_episode_info) when the page path produced nothing.
    """
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        mid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        if "mid" in response.request.meta:
            mid = response.request.meta['mid']
        year_list = []
        lyears = []
        # the playlist id may be declared in several script variants
        playlistId = ""
        playlistId_list = response.selector.re(
            re.compile(r'var playlistId.*?(\d+)'))
        if not playlistId_list:
            playlistId_list = response.selector.re(
                re.compile(r'var PLAYLIST_ID.*?(\d+)'))
        if not playlistId_list:
            playlistId_list = response.selector.re(
                re.compile(r'= playlistId.*?(\d+)'))
        if playlistId_list:
            playlistId = playlistId_list[0]
        if not playlistId:
            logging.log(
                logging.INFO,
                "parse_episode_info error,not find playlistid,url:%s " %
                response.request.url)
            return items
        title_list = self.parse_title(response, cat_id)
        performer_list = self.parse_actor(response)
        director_list = self.parse_director(response)
        district_list = self.parse_district(response)
        type_list = self.parse_type_list(response)
        year_list = self.parse_year(response)
        year = None
        if year_list:
            year = year_list[0]
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        district = Util.join_list_safely(district_list)
        # intro text
        text = response.xpath(
            '//div[@class="movieCont mod"]/p[1]/span[@class="full_intro"]/text()'
        ).extract()
        play_url = ""
        play_url = response.xpath(
            '//div[@class="cfix movie-info"]/div[2]/div[@class="cfix bot"]/a[@class="btn-playFea"]/@href'
        ).extract()
        videoitems = []
        ep_item = MediaItem()
        if title_list:
            ep_item["title"] = title_list[0]
        ep_item["actor"] = pers
        ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if district:
            ep_item["district"] = district
        if year:
            ep_item["release_date"] = Util.str2date(year)
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "sohu")
        playlistId = str(playlistId)
        ep_item["cont_id"] = playlistId
        if len(text) > 0:
            ep_item["intro"] = text[0].strip()
        mvitem = MediaVideoItem()
        if mid:
            mvitem['mid'] = mid
        if untrack_id and sid:
            mvitem["untrack_id"] = untrack_id
            mvitem["sid"] = sid
        mvitem["media"] = ep_item
        vurl = ""
        ttvitem = []
        if title_list:
            ttvitem = self.parse_video_item(cat_id, playlistId)
        if ttvitem:
            mvitem['video'] = ttvitem
            mvitem["media"]["info_id"] = Util.md5hash(
                Util.summarize(mvitem["media"]))
            Util.set_ext_id(mvitem["media"], mvitem["video"])
            if self.check_url(mvitem):
                items.append(mvitem)
        # fall back to the pure-API episode path when the page yielded
        # nothing usable
        if not items and playlistId:
            items += self.api_episode_info(mvitem, playlistId,
                                           cat_id=cat_id)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def album_parse(self, response):
    """Parse a hunantv album (episode-list) page.

    Collects episode VideoItems from the tag pages or the plain video
    list; when neither exists, falls back to the single play url carried
    in the request meta. Returns a list with one populated MediaVideoItem
    (empty on failure).
    """
    items = []
    # hoisted so the except-handler logging cannot hit an unbound name
    request_url = response.request.url
    try:
        logging.log(logging.INFO, 'album url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        video_url = response.request.meta['url'] \
            if 'url' in response.request.meta else None
        mediaVideoItem = response.request.meta['item'] \
            if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] \
            if 'media' in mediaVideoItem else MediaItem()
        videoItems = []
        sels = response.xpath(
            '//div[@class="page-videolist-tag-main"]//p[@class="pa1-nav"]')
        if sels:
            # tag pages exist, e.g. http://list.hunantv.com/album/56.html
            results = hunantv_extract.album_tag_extract(sels)
            for item in results:
                url = Util.get_absolute_url(item['url'], prefix_url)
                result = Util.get_url_content(url)
                videoItems = videoItems + self.album_tag_resolve(
                    text=result, meta={'url': url})
        else:
            # no tag pages, e.g. http://list.hunantv.com/album/2905.html
            video_sels = response.xpath(
                '//div[@class="page-videolist clearfix"]')
            if video_sels:
                result = video_sels.extract()[0]
                videoItems = videoItems + self.album_tag_resolve(
                    text=result, meta={'url': request_url})
            elif video_url:
                # no episode-list page at all (the album url on e.g.
                # http://www.hunantv.com/v/7/102831/f/1043648.html is
                # invalid); fall back to the single play url from meta
                videoItem = VideoItem()
                videoItem['url'] = video_url
                # BUGFIX: copy_media_to_video was called twice (before
                # and after the url assignment); the final call after the
                # assignment reproduces the original end state
                Util.copy_media_to_video(mediaItem, videoItem)
                # video id from the play url
                video_url_regex = re.compile(
                    'http://www\.hunantv\.com/v/[\d]+/[\d]+/[a-zA-Z]/([\d]+)\.html')
                match_results = video_url_regex.search(video_url)
                if match_results:
                    videoItem['cont_id'] = match_results.groups()[0]
                self.set_video_info(videoItem)
                videoItems.append(videoItem)
        if videoItems:
            # set ext_id
            Util.set_ext_id(mediaItem, videoItems)
            # fetch the media page for the remaining media attributes
            result = Util.get_url_content(mediaItem['url'])
            if result:
                mediaItem = self.media_resolve(text=result, meta={
                    'item': mediaItem,
                    'url': mediaItem['url']
                })
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'album url: %s' % request_url)
    # BUGFIX: items was built but never returned
    return items
def video_parse(self, response):
    """Parse an iqiyi play page.

    Either redirects to media_parse (when the breadcrumb points at a media
    page) or fetches the episode list directly through the source/album
    APIs. Returns a list of MediaVideoItem and/or follow-up Requests.
    """
    items = []
    # hoisted so the except-handler logging cannot hit an unbound name
    request_url = response.request.url
    try:
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] \
            if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] \
            if 'media' in mediaVideoItem else MediaItem()
        channel_id_fun = mediaItem['channel_id']
        sels = response.xpath('//script[@type="text/javascript"]')
        iqiyi_extract.media_info_extract(sels, mediaItem)
        mediaItem['channel_id'] = channel_id_fun
        sels = response.xpath(
            '//div[@itemtype="http://schema.org/ShowEpisode"]')
        iqiyi_extract.media_info_extract(sels, mediaItem)
        # play-page breadcrumb - used when entering from a play page
        sels = response.xpath(
            '//div[@class="crumb_bar" or @class="mod-crumb_bar"]')
        iqiyi_extract.media_info_extract(sels, mediaItem)
        url = mediaItem['url'] if 'url' in mediaItem else ''
        url_type = iqiyi_extract.url_type_resolve(url)
        if url_type == URL_TYPE_MEDIA:
            # hand the media page over to media_parse
            mediaVideoItem['media'] = mediaItem
            url = mediaItem['url']
            items.append(
                Request(url=url,
                        callback=self.media_parse,
                        meta={'item': mediaVideoItem}))
        else:
            cont_id = mediaItem[
                'cont_id'] if 'cont_id' in mediaItem else None
            title = mediaItem['title'] if 'title' in mediaItem else None
            if cont_id and title:
                # cont_id is "<id>|<type>", type is source_id or album_id
                cont_ids = cont_id.split('|')
                cont_id = cont_ids[0]
                cont_type = cont_ids[1]
                # BUGFIX: channel_id_site was referenced below but never
                # assigned in this method (NameError at runtime); derive
                # it the same way media_parse does
                channel_id_site = iqiyi_extract.list_channels_id[
                    channel_id_fun]
                videoItems = []
                if cont_type == 'source_id':
                    # variety shows list episodes per year
                    url = self.source_year_api % (channel_id_site, cont_id)
                    result = Util.get_url_content(url)
                    years = self.source_year_json_resolve(result, url)
                    for year in years:
                        url = self.source_media_api % (
                            channel_id_site, cont_id, year,
                            channel_id_site, cont_id, year)
                        result = Util.get_url_content(url)
                        videoItems = videoItems + \
                            self.source_media_json_resolve(
                                result, mediaItem, url)
                elif cont_type == 'album_id':
                    page = 1
                    url = self.album_media_api % (cont_id, page,
                                                  cont_id, page)
                    result = Util.get_url_content(url)
                    videoItems = videoItems + self.album_media_json_resolve(
                        result, mediaItem, url)
                # special programs (e.g. /yule/cjkgbj.html) yield no
                # episodes and are silently skipped
                if videoItems:
                    Util.set_ext_id(mediaItem, videoItems)
                    self.set_media_info(mediaItem)
                    mediaVideoItem['media'] = mediaItem
                    mediaVideoItem['video'] = videoItems
                    # NOTE: removed a stray debug `print mediaVideoItem`
                    items.append(mediaVideoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'video url: %s' % request_url)
    # BUGFIX: items was built but never returned
    return items