def video_info_extract(response, videoItem):
    """Extract play url / cont_id / thumbnail / title from a 1905.com media-page node.

    response: selector positioned at the media block.
    videoItem: VideoItem to fill; a new one is created when None.
    Returns videoItem -- BUG FIX: the original returned nothing, so a
    VideoItem created for a None argument was silently lost.
    """
    try:
        if videoItem is None:  # BUG FIX: was `== None`
            videoItem = VideoItem()
        # media page
        sels = response.xpath('.//dl[@class="imgBAyy db"]')
        urls = sels.xpath('./a/@href').extract()
        if urls:
            url = urls[0]
            # e.g. http://vip.1905.com/play/871039.shtml
            regex_express = 'http://[^/]*\.[m]?1905\.com/play/([\d]+)\.shtml.*'
            regex_pattern = re.compile(regex_express)
            match_result = regex_pattern.search(url)
            if not match_result:
                # e.g. http://www.1905.com/vod/info/404887.shtml
                #      http://www.1905.com/vod/play/361717.shtml
                regex_express = 'http://[^/]*\.[m]?1905\.com/vod/.*/([\d]+)\.shtml.*'
                regex_pattern = re.compile(regex_express)
                match_result = regex_pattern.search(url)
            if match_result:
                # the url belongs to 1905 itself, safe to record
                videoItem['url'] = url
                videoItem['cont_id'] = match_result.groups()[0]
                thumb_urls = sels.xpath('./a/img/@src').extract()
                titles = sels.xpath('./a/@title').extract()
                if thumb_urls:
                    videoItem['thumb_url'] = thumb_urls[0]
                if titles:
                    videoItem['title'] = titles[0]
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return videoItem
def album_tag_json_resolve(self, text, meta):
    """Resolve an album-tag json payload into a list of VideoItems.

    text: raw response body expected to embed a json array '[{...}]'.
    meta: dict carrying the request 'url' (used to absolutize video urls).
    Returns the list of parsed VideoItems (empty on error).
    """
    items = []
    request_url = meta['url'] if 'url' in meta else None
    try:
        logging.log(logging.INFO, 'json url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        video_express = '(\[\{.*\}\])'
        video_regex = re.compile(video_express)
        match_results = video_regex.search(text)
        if match_results:
            video_content = match_results.groups()[0]
            videos = json.loads(video_content)
            for video in videos:
                videoItem = VideoItem()
                ext_id = video['id']
                title = video['title']
                vnum = video['stitle']
                img = video['img']
                url = video['url']
                videoItem['cont_id'] = ext_id
                videoItem['title'] = title
                vnum = str(vnum)
                # keep only alphanumeric chars of the sub-title as vnum
                videoItem['vnum'] = filter(str.isalnum, vnum)
                videoItem['thumb_url'] = img
                videoItem['url'] = Util.get_absolute_url(url, prefix_url)
                self.set_video_info(videoItem)
                items.append(videoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'json url: %s' % request_url)
    # BUG FIX: the original built `items` but never returned it
    return items
def zongyi_album_resolve(self, text, meta):
    """Resolve letv variety-show (zongyi) album json into VideoItems.

    The json nests episodes as data[year][month] -> list. meta must carry
    'year' and 'month' to select the bucket; 'url' is used for logging.
    Returns the collected VideoItems (empty on error).
    """
    items = []
    request_url = meta['url'] if 'url' in meta else None
    try:
        if not text:
            return items
        logging.log(logging.INFO, 'json url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        year = meta['year'] if 'year' in meta else None
        month = meta['month'] if 'month' in meta else None
        if year and month:
            videos_json = json.loads(text)
            videos = videos_json['data']
            if year in videos:
                videos = videos[year]
                if month in videos:
                    videos = videos[month]
                    video_url = 'http://www.letv.com/ptv/vplay/%s.html'
                    for video in videos:
                        videoItem = VideoItem()
                        videoItem['cont_id'] = video['id']
                        videoItem['title'] = video['subTitle']
                        if video['issue']:
                            videoItem['vnum'] = video['issue']
                        videoItem['thumb_url'] = video['pic']
                        url = video_url % videoItem['cont_id']
                        videoItem['url'] = url
                        self.set_video_info(videoItem)
                        items.append(videoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'json url: %s' % request_url)
        logging.log(logging.INFO, '===================content===================')
        logging.log(logging.INFO, text)
    # BUG FIX: original never returned the collected items
    return items
def auth_album_xml_resolve(self, text, mediaItem, meta):
    """Resolve the auth-album xml api: fill mediaItem['latest'] and build
    the per-episode VideoItems.

    text: xml payload; meta must carry 'pid' (album id, used only to
    rebuild the request url for logging).
    Returns {'error': ..., 'items': [...]}.
    """
    result = {'error': '0', 'items': []}
    request_url = None
    try:
        pid = meta['pid']
        request_url = self.auth_album_api % (pid)
        logging.log(logging.INFO, 'auth album xml url: %s' % request_url)
        xml_content = etree.fromstring(text)
        # BUG FIX: lxml's xpath() already returns plain python strings for
        # text()/attribute queries; the original chained scrapy's
        # .extract() on the result, which lxml lists do not provide
        # (AttributeError).
        latests = xml_content.xpath("//video_list_count/text()")
        if latests:
            mediaItem['latest'] = latests[0]
        datas = xml_content.xpath("//video_list/video")
        items = []
        for data in datas:
            videoItem = VideoItem()
            ids = data.xpath('./@id')
            if ids:
                videoItem['cont_id'] = ids[0]
            vnums = data.xpath('./@title')
            if vnums:
                vnums = re.findall(r'[\d]+', vnums[0])
                if vnums:
                    videoItem['vnum'] = vnums[0]
                    videoItem['title'] = '第%s集' % vnums[0]
            thumb_urls = data.xpath('./@sloturl')
            if thumb_urls:
                videoItem['thumb_url'] = thumb_urls[0]
            self.set_video_info(videoItem)
            items.append(videoItem)
        result['items'] = items
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'auth album xml url: %s' % request_url)
        logging.log(logging.INFO, '================xml content=================')
        logging.log(logging.INFO, text)
    # BUG FIX: original fell off the end without returning the result dict
    return result
def video_info_extract(response, videoItem):
    """Fill thumb/url/vnum/title from a media-page list node.

    Handles two layouts: movie nodes (dt[@data-statectn="n_w150_dt"]) and
    tv/cartoon nodes (class "w120").
    Returns videoItem -- BUG FIX: the original returned nothing, so a
    VideoItem created for a None argument was silently lost.
    """
    try:
        if videoItem is None:  # BUG FIX: was `== None`
            videoItem = VideoItem()
        sels = response.xpath('.//dt[@data-statectn="n_w150_dt"]')
        if sels:
            # media page - movie
            results = sels.xpath('.//p[@class="p1"]//img/@src').extract()
            if results:
                videoItem['thumb_url'] = results[0]
        else:
            class_names = response.xpath('./@class').extract()
            if class_names and 'w120' == class_names[0]:
                # media page - tv series, cartoon
                urls = response.xpath('.//p[@class="p1"]/a/@href').extract()
                vnums = response.xpath('.//p[@class="p1"]/a/text()').extract()
                titles = response.xpath('./dt/a/img/@title').extract()
                thumb_urls = response.xpath('./dt/a/img/@src').extract()
                if urls:
                    videoItem['url'] = urls[0]
                if vnums:
                    vnums = re.findall(r'[\d]+', vnums[0])
                    vnum = ''.join(vnums)
                    if vnum:
                        videoItem['vnum'] = vnum
                if titles:
                    videoItem['title'] = titles[0]
                if thumb_urls:
                    videoItem['thumb_url'] = thumb_urls[0]
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return videoItem
def other_album_resolve(self, text, meta):
    """Resolve a letv (non-variety) album json into VideoItems.

    Episode number prefers 'episode', falling back to 'porder'; the first
    entry of picAll supplies the thumbnail. A bad individual episode is
    skipped, not fatal. Returns the VideoItems list.
    """
    items = []
    try:
        if not text:
            return items
        request_url = meta['url'] if 'url' in meta else None
        logging.log(logging.INFO, 'json url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        video_url = 'http://www.letv.com/ptv/vplay/%s.html'
        videos_json = json.loads(text)
        videos = videos_json['body']['videoList']['videoList']['videoInfo']
        for video in videos:
            try:
                videoItem = VideoItem()
                videoItem['cont_id'] = video['vid']
                if video['episode']:
                    try:
                        vnum = int(float(video['episode']))
                        videoItem['vnum'] = vnum
                    except Exception as e:
                        # malformed 'episode': fall back to play order
                        vnum = int(float(video['porder']))
                        videoItem['vnum'] = vnum
                videoItem['title'] = video['subTitle']
                if not videoItem['title']:
                    videoItem['title'] = '第%s集' % videoItem['vnum']
                # take the first available thumbnail
                for key in video['picAll']:
                    thumb_url = video['picAll'][key]
                    videoItem['thumb_url'] = thumb_url
                    break
                url = video_url % videoItem['cont_id']
                videoItem['url'] = url
                self.set_video_info(videoItem)
                items.append(videoItem)
            except Exception as e:
                # skip the single bad episode, keep the rest
                continue
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    # BUG FIX: original never returned the collected items
    return items
def parse_topic_play_list(self, response):
    # Parse a topic play-list page into VideoItems (skipping trailers).
    # Returns the list of VideoItems; empty on error.
    item = None  # NOTE(review): never used in this method
    videoitems = []
    try:
        subs = response.xpath(
            '//div[@class="mod_video_fragments"]/div[@class="mod_figures_1"]/ul/li'
        )
        for sub in subs:
            vitem = VideoItem()
            title = sub.xpath('./strong/a/text()').extract()
            vitem["os_id"] = self.os_id
            vitem["site_id"] = self.site_id
            turl = sub.xpath('./strong/a/@href').extract()
            # skip entries whose title contains u"预告" (trailer)
            if title and title[0].find(u"预告") < 0:
                # NOTE(review): python precedence makes this
                # (turl and no ".com" in url) or (turl and "qq.com" in url)
                # -- i.e. relative urls or qq.com urls pass; confirm intended.
                if turl and turl[0].find(".com") < 0 or (
                        turl and turl[0].find("qq.com") >= 0):
                    vitem["title"] = title[0].strip()
                    vitem["vnum"] = self.get_num(vitem["title"])
                    sturl = turl[0]
                    # relative link: prepend the site prefix
                    if turl[0].find("qq.com") < 0:
                        sturl = self.url_prefix + turl[0]
                    vitem["url"] = Util.normalize_url(sturl, "qq", "tv")
                    vitem["ext_id"] = Util.md5hash(vitem["url"])
                    vitem["cont_id"] = self.get_qq_showid(vitem["url"])
                    videoitems.append(vitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return videoitems
def video_extract(response):
    """Collect hunantv play-page urls from a list/feature page into
    VideoItems (url only).  Returns the list (empty on error)."""
    items = []
    try:
        # list pages and feature pages
        results = response.xpath('.//a/@href[re:test(., "http://www\.hunantv\.com/v/[\d]+/[\d]+/[a-zA-Z]/[\d]+\.html")]').extract()
        for item in results:
            videoItem = VideoItem()
            videoItem['url'] = item
            items.append(videoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    # BUG FIX: original did not return the collected items
    return items
def api_video_list(self, vid, vcount, prefix_video_url, channel):
    """Page through the video-list api (20 entries per page) and build
    VideoItems.

    vid: media id; vcount: total episode count (drives the page count);
    prefix_video_url: '%s'-style template for the play url;
    channel: channel id -- 2004 derives vnum from the publish date.
    Returns the list of VideoItems.
    """
    video_list = []
    try:
        max_page_num = vcount / 20 + 1
        for i in range(max_page_num):
            vlu = self.video_list_url % (vid, i)
            jdata = self.httpdownload.get_data(vlu)
            if not jdata:
                break
            ddata = json.loads(jdata)
            code = int(ddata.get('code', 202))
            if code != 200:
                break
            datal = ddata.get('data')
            if not datal:
                break
            for data in datal:
                if type(data) != dict:
                    continue
                videoItem = VideoItem()
                videoItem['title'] = data.get('desc')
                videoItem['thumb_url'] = data.get('image')
                videoItem['vnum'] = data.get('videoIndex')
                videoId = data.get('videoId')
                # fetch per-video detail (for the publish time)
                turl = self.media_info_url % (videoId)
                tjdata = self.httpdownload.get_data(turl)
                if not tjdata:
                    continue
                tdjdata = json.loads(tjdata)
                tcode = int(tdjdata.get('code', 202))
                # BUG FIX: the original tested the outer `code` here (always
                # 200 at this point) instead of the per-video `tcode`
                if tcode != 200:
                    continue
                tdatal = tdjdata.get('data')
                if not tdatal:
                    continue
                publish_time = tdatal.get('detail').get('publishTime')
                if publish_time and channel == 2004:
                    # variety channel: episode number comes from the date
                    videoItem['vnum'] = self.get_vnum(publish_time)
                videoItem['cont_id'] = data.get('videoId')
                videoItem['url'] = Util.normalize_url(
                    prefix_video_url % data.get('videoId'), self.site_code)
                videoItem['os_id'] = self.os_id
                videoItem['site_id'] = self.site_id
                videoItem['ext_id'] = Util.md5hash(videoItem['url'])
                video_list.append(videoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    # BUG FIX: original never returned the assembled list
    return video_list
def video_extract(response):
    """Collect kankan play urls (regular vod + vip) from a list page into
    VideoItems (url only).  Returns the list (empty on error)."""
    items = []
    try:
        # regular urls
        results = response.xpath(
            './/a/@href[re:test(., "http://vod\.kankan\.com/v/[\d]+/[\d]+\.shtml.*")]'
        ).extract()
        for item in results:
            videoItem = VideoItem()
            videoItem['url'] = item
            items.append(videoItem)
        # vip urls
        results = response.xpath(
            './/a/@href[re:test(., "http://vip\.kankan\.com/vod/[\d]+\.html.*")]'
        ).extract()
        for item in results:
            videoItem = VideoItem()
            videoItem['url'] = item
            items.append(videoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    # BUG FIX: original did not return the collected items
    return items
def video_info_extract(response, videoItem):
    """Fill url/title/cont_id/thumb/vnum/intro from a kankan episode node.

    videoItem['intro'] is expected to carry the channel name on entry (set
    by the caller); variety (u'综艺') nodes swap where the episode number
    and the intro text live, hence the mirrored branches.
    Returns videoItem -- BUG FIX: the original returned nothing, so a
    VideoItem created for a None argument was silently lost.
    """
    try:
        if videoItem is None:  # BUG FIX: was `== None`
            videoItem = VideoItem()
        channel_name = videoItem['intro'] if 'intro' in videoItem else ''
        videoItem['intro'] = ''
        # media page
        sels = response.xpath('./a[@class="pic" or @class="foc"]')
        if sels:
            urls = sels.xpath('./@href').extract()
            titles = sels.xpath('./@title').extract()
            cont_ids = sels.xpath('./@subid').extract()
            thumb_urls = sels.xpath('.//img/@_src').extract()
            if urls:
                url = urls[0]
                videoItem['url'] = url
                # variety pages are laid out opposite to the other channels
                if channel_name == u'综艺':
                    intros = response.xpath('./*/a[@href="%s"]/text()' % url).extract()
                    if intros:
                        videoItem['intro'] = intros[0]
                else:
                    vnums = response.xpath('./*/a[@href="%s"]/text()' % url).extract()
                    if vnums:
                        vnums = re.findall(r'[\d]+', vnums[0])
                        vnum = ''.join(vnums)
                        if vnum:
                            videoItem['vnum'] = vnum
            if titles:
                videoItem['title'] = titles[0]
            if cont_ids:
                videoItem['cont_id'] = cont_ids[0]
            if thumb_urls:
                videoItem['thumb_url'] = thumb_urls[0]
            # variety pages are laid out opposite to the other channels
            if channel_name == u'综艺':
                vnums = response.xpath('./h4/text()').extract()
                if vnums:
                    vnums = re.findall(r'[\d]+', vnums[0])
                    vnum = ''.join(vnums)
                    if vnum:
                        videoItem['vnum'] = vnum
            else:
                intros = response.xpath('./h4/text()').extract()
                if intros:
                    videoItem['intro'] = intros[0]
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return videoItem
def compose_vitem(self, url_list, title_list, vnum):
    """Assemble a VideoItem from extracted url/title lists and an episode
    number; returns an empty VideoItem when no url was extracted."""
    vitem = VideoItem()
    try:
        if not url_list:
            return vitem
        if title_list:
            vitem["title"] = title_list[0].strip()
        normalized = Util.normalize_url(url_list[0], "sohu")
        vitem["url"] = normalized
        vitem["vnum"] = str(vnum)
        vitem["ext_id"] = Util.md5hash(normalized)
        vitem["os_id"] = self.os_id
        vitem["site_id"] = self.site_id
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return vitem
def album_media_json_resolve(self, text, mediaItem, request_url):
    """Resolve an album-media jsonp: fill vcount/latest on mediaItem, build
    feature-film VideoItems, and follow pagination recursively.

    Returns the accumulated VideoItems for this page and all later pages.
    """
    items = []
    content = ''
    try:
        logging.log(logging.INFO, 'album media json url: %s' % request_url)
        regex_express = '=(\{.*\})'
        regex_pattern = re.compile(regex_express)
        match_results = regex_pattern.search(text)
        if match_results:
            content = match_results.groups()[0]
            json_content = json.loads(content)
            if json_content['code'] != self.api_success_code:
                return items
            mediaItem['vcount'] = json_content['data']['pm']
            mediaItem['latest'] = json_content['data']['ic']
            datas = json_content['data']['vlist']
            for data in datas:
                # type: feature film is 1, trailer is 0
                video_type = data['type']  # renamed: `type` shadowed the builtin
                if str(video_type) != '0':
                    videoItem = VideoItem()
                    videoItem['intro'] = data['vt']
                    videoItem['vnum'] = data['pd']
                    videoItem['thumb_url'] = data['vpic']
                    videoItem['title'] = data['vt']
                    if not videoItem['title']:
                        videoItem['title'] = data['vn']
                    videoItem['cont_id'] = data['id']
                    videoItem['url'] = data['vurl']
                    self.set_video_info(videoItem)
                    items.append(videoItem)
            # fetch the next page
            current_count = int(json_content['data']['pn'])
            page_count = int(json_content['data']['pp'])
            if current_count != 0 and current_count == page_count:
                cont_id = json_content['data']['aid']
                page = int(json_content['data']['pg']) + 1
                url = self.album_media_api % (cont_id, page, cont_id, page)
                result = Util.get_url_content(url)
                items = items + self.album_media_json_resolve(result, mediaItem, url)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'album media json url: %s' % request_url)
        logging.log(logging.INFO, '================json content=================')
        logging.log(logging.INFO, text)
    # BUG FIX: without this return the recursive call above would receive
    # None and raise TypeError on the list concatenation
    return items
def resolve_video_item(self, dpara, page_num=1, isvariaty=False):
    """Build VideoItems from one page of the episode-list api response.

    dpara: decoded api json; page_num: 1-based page index (each page holds
    up to 100 episodes, used to synthesize vnum when epTitle is unusable);
    isvariaty: variety shows number episodes by position, not epTitle.
    Returns the list of VideoItems.
    """
    videos = []
    page_num -= 1
    try:
        if dpara and int(
                dpara.get('err')
        ) == 0 and 'data' in dpara and 'list' in dpara['data']:
            lst = dpara['data'].get('list', [])
            sameV = 1
            for index, item in enumerate(lst):
                videoItem = VideoItem()
                videoItem['cont_id'] = item.get('id')
                videoItem['url'] = Util.normalize_url(
                    item.get('url'), self.site_code)
                videoItem['thumb_url'] = item.get('capture')
                videoItem['os_id'] = self.os_id
                videoItem['site_id'] = self.site_id
                videoItem['ext_id'] = Util.md5hash(videoItem['url'])
                oep = item.get('epTitle', '')
                # strip one trailing 上/中/下 marker; working on the reversed
                # string limits the removal to the last occurrence
                nep = oep[::-1]
                for i in [u'上', u'中', u'下']:
                    # BUG FIX: str.replace returns a new string; the original
                    # discarded the result, so the marker was never stripped
                    nep = nep.replace(i, '', 1)
                nep = nep[::-1]
                if isvariaty and nep and nep.isdigit() and len(nep) == 8:
                    videoItem['vnum'] = str(index + 1 + page_num * 100)
                    videoItem['title'] = item.get('title', '') + str(
                        videoItem['vnum'])
                elif isvariaty:
                    videoItem['title'] = oep if oep else item.get('title', '')
                    # empty date: use the position as the episode number
                    videoItem['vnum'] = str(index + 1 + page_num * 100)
                elif nep and nep.isdigit():
                    # '01' --> '1'
                    videoItem['vnum'] = str(int(float(nep)))
                    videoItem['title'] = item.get('title', '') + oep
                elif nep:
                    videoItem['vnum'] = str(index + 1 + page_num * 100)
                    videoItem['title'] = oep
                elif not nep:
                    videoItem['vnum'] = str(index + 1 + page_num * 100)
                    videoItem['title'] = item.get('title', '') + str(
                        videoItem['vnum']) + oep
                videos.append(videoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    # BUG FIX: original never returned the assembled list
    return videos
def album_json_resolve(self, text, mediaItem, meta):
    """Resolve an album-list json page: set mediaItem['latest'] and build
    VideoItems for non-trailer episodes.

    meta must carry pid/cid/vid/page (used to rebuild the request url for
    logging).  Returns {'error': '0'|'1', 'items': [...]}.
    """
    result = {'error': '0', 'items': []}
    request_url = None
    try:
        pid = meta['pid']
        cid = meta['cid']
        vid = meta['vid']
        page = meta['page']
        request_url = self.album_api % (pid, cid, vid, page)
        logging.log(logging.INFO, 'album json url: %s' % request_url)
        regex_express = '(\{.*\})'
        regex_pattern = re.compile(regex_express)
        match_results = regex_pattern.search(text)
        if match_results:
            content = match_results.groups()[0]
            json_content = json.loads(content)
            if json_content['err'] != 0:
                result['error'] = '1'
                return result
            mediaItem['latest'] = json_content['data']['total']
            datas = json_content['data']['list']
            items = []
            for data in datas:
                # skip trailers (isTrailer may be a bool or the string 'false')
                is_trailer = data['isTrailer']
                if is_trailer != False and str(is_trailer) != 'false':
                    continue
                videoItem = VideoItem()
                videoItem['title'] = data['title']
                videoItem['cont_id'] = data['id']
                videoItem['url'] = data['url']
                videoItem['thumb_url'] = data['capture']
                epTitle = data['epTitle']
                vnums = re.findall(r'[\d]+', epTitle)
                if vnums:
                    videoItem['vnum'] = vnums[0]
                self.set_video_info(videoItem)
                items.append(videoItem)
            result['items'] = items
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'album json url: %s' % request_url)
        logging.log(logging.INFO, '================json content=================')
        logging.log(logging.INFO, text)
    # BUG FIX: original fell off the end without returning the result dict
    return result
def video_info_extract(response, videoItem):
    """Fill vnum/title/thumb/url from a feature-page episode node.

    Returns videoItem -- BUG FIX: the original returned nothing, so a
    VideoItem created for a None argument was silently lost.
    """
    try:
        if videoItem is None:  # BUG FIX: was `== None`
            videoItem = VideoItem()
        # feature page
        vnums = response.xpath('.//span[@class="a-pic-t1"]/text()').extract()
        titles = response.xpath('.//span[@class="a-pic-t2"]/text()').extract()
        urls = response.xpath('.//a/@href').extract()
        thumb_urls = response.xpath('.//img[@class="lazy"]/@data-original').extract()
        if vnums:
            vnums = re.findall(r'[\d]+', vnums[0])
            if vnums:
                vnum = ''.join(vnums)
                videoItem['vnum'] = vnum
        if titles:
            videoItem['title'] = titles[0]
        if thumb_urls:
            videoItem['thumb_url'] = thumb_urls[0]
        if urls:
            videoItem['url'] = urls[0]
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return videoItem
def album_tag_resolve(self, text, meta):
    """Parse a hunantv album html page into VideoItems and recursively
    follow the "next page" link.

    meta carries the page 'url' (for absolutizing links).  Returns the
    accumulated VideoItems of this page plus all following pages.
    """
    items = []
    try:
        request_url = meta['url'] if 'url' in meta else None
        logging.log(logging.INFO, 'album tag url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        try:
            response = Selector(text=text)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
            logging.log(logging.INFO, 'text to be parsed is not xml or html')
            return items
        sels = response.xpath(
            '//div[@class="play-index-con-box"]//ul[@class="clearfix ullist-ele"]/li'
        )
        video_url_express = 'http://www\.hunantv\.com/v/[\d]+/[\d]+/[a-zA-Z]/([\d]+)\.html'
        video_url_regex = re.compile(video_url_express)
        for sel in sels:
            videoItem = VideoItem()
            hunantv_extract.video_info_extract(sel, videoItem)
            url = videoItem['url']
            url = Util.get_absolute_url(url, prefix_url)
            videoItem['url'] = url
            # extract the video id from the play url
            match_results = video_url_regex.search(url)
            if match_results:
                id = match_results.groups()[0]
                videoItem['cont_id'] = id
            self.set_video_info(videoItem)
            items.append(videoItem)
        # next page
        results = hunantv_extract.next_page_extract(response)
        if results:
            url = results[0]
            url = Util.get_absolute_url(url, prefix_url)
            result = Util.get_url_content(url)
            items = items + self.album_tag_resolve(text=result, meta={'url': url})
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    # BUG FIX: without this return the recursive call above would receive
    # None and raise TypeError on the list concatenation
    return items
def source_media_json_resolve(self, text, mediaItem, request_url):
    """Resolve a source-media jsonp payload into VideoItems.

    Title preference: shortTitle > tvSbtitle > videoName; vnum is derived
    from the digits of 'tvYear'.  Returns the VideoItems list.
    """
    items = []
    try:
        logging.log(logging.INFO, 'source media json url: %s' % request_url)
        regex_express = '=(\{.*\})'
        regex_pattern = re.compile(regex_express)
        match_results = regex_pattern.search(text)
        if match_results:
            content = match_results.groups()[0]
            json_content = json.loads(content)
            datas = json_content['data']
            for data in datas:
                videoItem = VideoItem()
                title = data['shortTitle'] if 'shortTitle' in data else None
                if not title:
                    title = data['tvSbtitle'] if 'tvSbtitle' in data else None
                if not title:
                    title = data['videoName']
                videoItem['title'] = title
                videoItem['intro'] = data['desc']
                videoItem['url'] = data['vUrl']
                videoItem['thumb_url'] = data['tvPicUrl']
                videoItem['cont_id'] = data['tvId']
                vnums = data['tvYear']
                vnums = re.findall(r'[\d]+', vnums)
                vnum = ''.join(vnums)
                videoItem['vnum'] = vnum
                self.set_video_info(videoItem)
                items.append(videoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'source media json url: %s' % request_url)
        logging.log(logging.INFO, '================json content=================')
        logging.log(logging.INFO, text)
    # BUG FIX: original did not return the collected items
    return items
def parse_video_item_media(self, code, pn):
    """Fetch one page of youku's showlistnew api and build VideoItems.

    code: show id; pn: page number.  Returns a list of VideoItems (empty
    on any failure).  The api returns `items` either as a list or as a
    {key: item} dict; the original duplicated the whole processing loop
    for each shape -- deduplicated here.
    """
    videoitems = []
    try:
        getlist_url = "http://v.youku.com/x_getAjaxData?md=showlistnew&vid=%s&pl=100&pn=%d" % (code, pn)
        urllist_info = self.httpdownload.get_data(getlist_url, ua=self.ua)
        if urllist_info:
            try:
                json_data = json.loads(urllist_info)
            except Exception as e:
                return videoitems
            if json_data and "showlistnew" in json_data and json_data["showlistnew"]:
                items = json_data["showlistnew"]["items"]
                # normalize list/dict payloads into one list of item dicts
                if type(items) == list:
                    item_list = items
                elif type(items) == dict:
                    item_list = [items[k] for k in items]
                else:
                    logging.log(logging.ERROR, getlist_url)
                    item_list = []
                if item_list:
                    # prefer whichever episode-number field is more
                    # discriminative across this page
                    videoseq = set()
                    videostage = set()
                    for item in item_list:
                        if "preview" in item:
                            continue
                        videoseq.add(item["show_videoseq"])
                        videostage.add(item["show_videostage"])
                    if len(videoseq) > len(videostage):
                        vnum_name = "show_videoseq"
                    else:
                        vnum_name = "show_videostage"
                    for item in item_list:
                        if "preview" in item:
                            continue
                        if "videoid" not in item:
                            continue
                        vitem = VideoItem()
                        vitem["url"] = "http://v.youku.com/v_show/id_%s.html" % item["videoid"]
                        vitem["vnum"] = item[vnum_name]
                        vitem["title"] = item["title"]
                        vitem["os_id"] = self.os_id
                        vitem["ext_id"] = Util.md5hash(vitem["url"])
                        vitem["site_id"] = self.site_id
                        vitem["cont_id"] = item["videoid"]
                        videoitems.append(vitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    finally:
        # NOTE: return-in-finally kept from the original; the except above
        # already swallows errors, so no exception is masked here
        return videoitems
def parse_episode_info(self, response):
    """Parse a tudou album cover page plus its getAlbumvoInfo api into
    MediaVideoItems.

    Media metadata (title/actor/director/type/district/release date) is
    taken from request.meta when present, otherwise scraped from the page.
    Returns a list with at most one MediaVideoItem.
    """
    items = []
    try:
        logging.log(logging.INFO, 'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        title = response.request.meta['title']
        actor = response.request.meta['actor']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        if not poster_url:
            poster_url_list = response.xpath('//div[@class="cover_img"]/div[@class="pack pack_album"]/div[@class="pic"]/img/@src').extract()
            if poster_url_list:
                poster_url = poster_url_list[0]
        if not title:
            title_list = response.xpath('//div[@class="cover_info"]/h2/strong/@title').extract()
            if title_list:
                title = title_list[0]
        if not actor:
            actor_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' % u' 主演:').extract()
            if actor_list:
                actor = Util.join_list_safely(actor_list)
        pers = actor
        type_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' % u'类型:\n').extract()
        district_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' % u'地区:').extract()
        release_date_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' % u'年代:').extract()
        types = None
        if type_list:
            types = Util.join_list_safely(type_list)
        # director: prefer "编导", fall back to "导演"
        director_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' % u'编导:').extract()
        if not director_list:
            director_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' % u'导演:').extract()
        dirs = Util.join_list_safely(director_list)
        # intro text
        text = response.xpath('//div[@class="cover_info"]/div[@class="desc"]/p/text()').extract()
        # source id
        sourceid = self.get_tudou_showid(response.request.url)
        videoitems = []
        ep_item = MediaItem()
        # BUG FIX: the original used `len(title) > 0` etc., which raises
        # TypeError when both the meta value and the page scrape are None
        if title:
            ep_item["title"] = title
        if pers:
            ep_item["actor"] = pers
        if dirs:
            ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if district_list:
            ep_item["district"] = district_list[0].strip()
        if release_date_list:
            ep_item["release_date"] = Util.str2date(release_date_list[0])
        ep_item["cont_id"] = sourceid
        ep_item["site_id"] = self.site_id
        ep_item["url"] = response.request.url
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        if text:
            ep_item["intro"] = text[0]
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        mvitem["video"] = videoitems
        # episode list comes from a separate json api
        lurl = "http://www.tudou.com/crp/getAlbumvoInfo.action?charset=utf-8&areaCode=110000&acode=" + str(sourceid)
        info = self.httpdownload.get_data(lurl)
        jinfo = json.loads(info)
        if "items" in jinfo:
            for sitem in jinfo["items"]:
                if not sitem["itemPlayUrl"]:
                    continue
                # skip trailers
                if sitem['trailer']:
                    continue
                vitem = VideoItem()
                vitem["title"] = sitem["itemTitle"]
                vitem["vnum"] = sitem["episode"]
                turl = Util.normalize_url(sitem["itemPlayUrl"], "tudou")
                vitem["url"] = turl
                vitem["os_id"] = self.os_id
                vitem["site_id"] = self.site_id
                vitem["ext_id"] = Util.md5hash(turl)
                vitem["cont_id"] = self.get_tudou_showid(turl)
                mvitem["video"].append(vitem)
        if len(mvitem["video"]) > 0:
            Util.set_ext_id(mvitem["media"], mvitem["video"])
            mvitem["media"]["info_id"] = Util.md5hash(Util.summarize(mvitem["media"]))
            if untrack_id:
                mvitem["untrack_id"] = untrack_id
            if sid:
                mvitem["sid"] = sid
            if self.check_url(mvitem):
                items.append(mvitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def media_parse(self, response):
    """Parse a 1905.com media page: extract the play url, media metadata,
    the paid flag, and the extra plot/cast pages.

    Returns a list with at most one MediaVideoItem; empty when the page
    has no playable video or on error.
    """
    items = []
    request_url = None
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'media url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta[
            'item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem[
            'media'] if 'media' in mediaVideoItem else MediaItem()
        # play url
        videoItems = []
        videoItem = VideoItem()
        Util.copy_media_to_video(mediaItem, videoItem)
        sels = response.xpath('//div[@class="laMovPIC fl pr22"]')
        dy1905_extract.video_info_extract(sels, videoItem)
        if 'url' not in videoItem:
            # only descriptive info, no play url: drop the page
            logging.log(logging.INFO, '该影片找不到播放地址: %s' % request_url)
            return items
        url = videoItem['url']
        url = Util.get_absolute_url(url, prefix_url)
        videoItem['url'] = url
        self.set_video_info(videoItem)
        videoItems.append(videoItem)
        # media attributes: paid flag derived from the play-url host
        video_prefix_url = Util.prefix_url_parse(url)
        if video_prefix_url in self.vip_prefix_urls:
            mediaItem['paid'] = '1'
        else:
            mediaItem['paid'] = '0'
        sels = response.xpath('//div[@class="laMovPIC fl pr22"]')
        dy1905_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//div[@class="laMovMAIN fl"]')
        dy1905_extract.media_info_extract(sels, mediaItem)
        # plot and cast/crew sub-pages
        nav_sels = response.xpath(
            '//ul[@class="navSMb"]//li[@class="mdbpLeft2"]//div[@class="nowDefLine DefBOttom"]//a'
        )
        if nav_sels:
            for sel in nav_sels:
                labels = sel.xpath('./text()').extract()
                urls = sel.xpath('./@href').extract()
                if labels and urls:
                    label = labels[0].strip()
                    if label.startswith(u'剧情') or label.startswith('演职人员'):
                        url = urls[0]
                        url = Util.get_absolute_url(url, prefix_url)
                        result = Util.get_url_content(url)
                        dy1905_extract.media_more_info_resolve(result, mediaItem)
        # absolutize the media url
        url = mediaItem['url']
        url = Util.get_absolute_url(url, prefix_url)
        mediaItem['url'] = url
        if videoItems:
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'media url: %s' % request_url)
    # BUG FIX: original fell off the end without returning items
    return items
def media_parse(self, response):
    """Parse a kankan media page into one MediaVideoItem with its episode
    list.

    Skips media whose play_type is in self.skip_types.  The episode-list
    layout differs per channel (variety / movie / series-cartoon).
    Returns a list with at most one MediaVideoItem.
    """
    items = []
    request_url = None
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'media url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta[
            'item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem[
            'media'] if 'media' in mediaVideoItem else MediaItem()
        # drop media whose play_type is blacklisted
        sels = response.xpath('//head//script')
        if sels:
            regex_express = 'movieInfo\.play_type[ ]?=[ ]?\'(.*)\''
            match_result = sels.re(regex_express)
            if match_result:
                play_type = match_result[0]
                if play_type in self.skip_types:
                    return items
        # some urls redirect (e.g. movie.kankan.com -> data.movie.kankan.com),
        # so record the actual url
        mediaItem['url'] = request_url
        sels = response.xpath('//head')
        kankan_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//div[@class="info_list"]')
        if sels:
            kankan_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//ul[@class="detail_ul"]')
        if sels:
            kankan_extract.media_info_extract(sels, mediaItem)
        # episode list: layout depends on the channel
        videoItems = []
        if u'综艺' == mediaItem['channel_id']:
            # variety
            sels = response.xpath('//div[@id[re:test(., "fenji_[\d]+_[\d]+")]]')
            for sel in sels:
                video_sels = sel.xpath('.//li')
                for video_sel in video_sels:
                    videoItem = VideoItem()
                    videoItem['intro'] = mediaItem['channel_id']
                    kankan_extract.video_info_extract(video_sel, videoItem)
                    if 'url' in videoItem:
                        url = videoItem['url']
                        url = Util.get_absolute_url(url, prefix_url)
                        videoItem['url'] = url
                        self.set_video_info(videoItem, mediaItem['channel_id'])
                        videoItems.append(videoItem)
        elif u'电影' == mediaItem['channel_id']:
            # movie: take the "watch now" link
            videoItem = VideoItem()
            Util.copy_media_to_video(mediaItem, videoItem)
            sels = response.xpath('//div[@class="section clearfix s2"]')
            if sels:
                urls = sels.xpath('.//a[starts-with(@class, "foc")]/@href').extract()
                thumb_urls = sels.xpath('.//a[@class="foc"]/img/@src').extract()
                if urls:
                    url = urls[0]
                    url = Util.get_absolute_url(url, prefix_url)
                    videoItem['url'] = url
                    if thumb_urls:
                        videoItem['thumb_url'] = thumb_urls[0]
                    self.set_video_info(videoItem, mediaItem['channel_id'])
                    videoItems.append(videoItem)
        else:
            # tv series / cartoon
            sels = response.xpath('//div[@id[re:test(., "fenji_[\d]+_asc")]]')
            if not sels:
                sels = response.xpath('//ul[@id[re:test(., "fenji_[\d]+_asc")]]')
            for sel in sels:
                video_sels = sel.xpath('.//li')
                for video_sel in video_sels:
                    videoItem = VideoItem()
                    videoItem['intro'] = mediaItem['channel_id']
                    kankan_extract.video_info_extract(video_sel, videoItem)
                    if 'url' in videoItem:
                        url = videoItem['url']
                        url = Util.get_absolute_url(url, prefix_url)
                        videoItem['url'] = url
                        self.set_video_info(videoItem, mediaItem['channel_id'])
                        videoItems.append(videoItem)
        if videoItems:
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
        else:
            logging.log(logging.INFO, '%s: no videos' % request_url)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'media url: %s' % request_url)
    # BUG FIX: original fell off the end without returning items
    return items
def parse_play_list(self, cat_id, url, flag, response):
    """Download a qq play-list jsonp (payload delimited by `flag`) and
    build VideoItems for numeric, non-trailer episodes.

    cat_id decides tv/cartoon-style url normalization.  Returns the list
    of VideoItems (empty on any failure).

    Cleanup: the original also assembled an unused MediaVideoItem
    (`item`/`ep_item`) that was never returned -- removed.
    """
    videoitems = []
    try:
        info = None
        try:
            info = self.httpdownload.get_data(url)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
            return videoitems
        if not info or len(info) < 2:
            return videoitems
        # strip the jsonp wrapper: keep the payload between `flag` and the
        # trailing character
        msg = info
        bodylen = len(msg) - 1
        index = msg.find(flag) + len(flag) + 1
        info = msg[index:bodylen]
        jinfo = json.loads(info)
        if "video_play_list" not in jinfo:
            return videoitems
        itemlist = jinfo["video_play_list"]["playlist"]
        for titem in itemlist:
            if "episode_number" not in titem:
                continue
            info = titem["episode_number"]
            if info and titem["title"].find(u"预告") < 0 and url.find("qq.com") >= 0:
                vitem = VideoItem()
                vitem["title"] = titem["title"]
                tvnum = string.replace(info, "-", "")
                # episode "numbers" can be arbitrary strings, e.g.
                # http://v.qq.com/detail/x/xk98t8hntls72f4.html --
                # only keep purely numeric ones
                tvnum_list = re.findall(r'[\D]+', tvnum)
                if not tvnum_list:
                    vitem["vnum"] = tvnum
                else:
                    continue
                vitem["os_id"] = self.os_id
                vitem["site_id"] = self.site_id
                if int(cat_id) == int(self.tv_id) or int(cat_id) == int(self.cartoon_id):
                    turl = Util.normalize_url(titem["url"], "qq", "tv")
                else:
                    turl = Util.normalize_url(titem["url"], "qq")
                if turl:
                    vitem["ext_id"] = Util.md5hash(turl)
                    vitem["url"] = turl
                    vitem["cont_id"] = self.get_qq_showid(vitem["url"])
                else:
                    continue
                videoitems.append(vitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return videoitems
def parse_episode_play(self, response):
    """Parse a QQ episode/play page into a MediaVideoItem.

    Extracts media metadata (title, actors, director, intro) plus either a
    full episode list (non-movie channels) or one synthetic episode
    (movies). Returns the assembled MediaVideoItem, or None on failure.

    Fixes vs. previous revision:
      * the tv/cartoon/default url-normalization chain is now if/elif/else --
        before, the `else` of the cartoon test clobbered the tv-branch result;
      * cat_id comparisons use int() on both sides, consistent with the
        movie_id check (meta values may arrive as strings);
      * a duplicated, discarded Util.md5hash(Util.summarize(...)) call and
        the unused loop counter were removed.
    """
    mvitem = None
    try:
        logging.log(logging.INFO,
                    'parse_episode_play: %s' % response.request.url)
        meta = response.request.meta
        cat_id = meta['cat_id']
        poster_url = ""
        untrack_id = meta['untrack_id'] if "untrack_id" in meta else ""
        sid = meta['sid'] if "sid" in meta else ""
        # Title: new-style page first, then the older intro layout.
        title_list = response.xpath(
            '//div[@class="movie_info"]/div[@class="title_wrap"]/h3/a/@title'
        ).extract()
        if not title_list:
            title_list = response.xpath(
                '//div[@class="intro_lt"]/div[@class="intro_title cf"]/p[@class="title_cn"]/text()'
            ).extract()
        performer_list = response.xpath(
            '//div[@class="movie_info"]/div[@class="movie_detail"]/dl[@class="detail_list"]/dd[@class="actor"]/a/text()'
        ).extract()
        director_list = response.xpath(
            '//div[@class="movie_info"]/div[@class="movie_detail"]/dl[@class="detail_list"]/dd[@class="type"]/span[text()="%s"]/a/text()'
            % u'导演:').extract()
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        # NOTE(review): 'd1[1]' below looks like a typo for 'dl[1]' -- kept
        # as-is to preserve behavior; confirm against the live page markup.
        text = response.xpath(
            '//div[@class="movie_info_wrap"]/div[1]/d1[1]/dd[3]/p[@class="detail_all"]/text()'
        ).extract()
        ep_item = MediaItem()
        videoitems = []
        if int(cat_id) != int(self.movie_id):
            # Episodic channels: walk the album list items.
            video_list = response.xpath(
                '//div[@class="tabcont_warp tabcont_warp_yespadding"]/div[@class="tabcont_album"]/ul[@class="album_list cf"]/li'
            )
            for tvideo in video_list:
                lurl = tvideo.xpath('./a/@href').extract()
                lnum = tvideo.xpath('./a/span/text()').extract()
                if not (lnum and lurl):
                    continue
                vitem = VideoItem()
                vitem["vnum"] = lnum[0]
                vitem["os_id"] = self.os_id
                vitem["site_id"] = self.site_id
                surl = "http://film.qq.com" + lurl[0]
                # if/elif/else so the tv branch is not overwritten below.
                if int(cat_id) == int(self.tv_id):
                    turl = Util.normalize_url(surl, "qq", "tv")
                elif int(cat_id) == int(self.cartoon_id):
                    turl = Util.normalize_url(surl, "qq", "cartoon")
                else:
                    turl = Util.normalize_url(surl, "qq")
                if not turl:
                    continue
                vitem["ext_id"] = Util.md5hash(turl)
                vitem["url"] = turl
                vitem["cont_id"] = self.get_qq_showid(turl)
                videoitems.append(vitem)
        else:
            # Movies: synthesize a single episode from the page itself.
            vitem = VideoItem()
            if title_list:
                vitem["title"] = title_list[0]
            vitem["vnum"] = "1"
            vitem["os_id"] = self.os_id
            vitem["site_id"] = self.site_id
            turl = Util.normalize_url(response.request.url, "qq")
            vitem["url"] = turl
            vitem["ext_id"] = Util.md5hash(turl)
            vitem["cont_id"] = self.get_qq_showid(turl)
            videoitems.append(vitem)
        if len(title_list) > 0:
            ep_item["title"] = title_list[0]
        if len(pers) > 0:
            ep_item["actor"] = pers
        if len(dirs) > 0:
            ep_item["director"] = dirs
        if len(text) > 0:
            ep_item["intro"] = text[0]
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["cont_id"] = self.get_qq_showid(response.request.url)
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        mvitem["video"] = videoitems
        mvitem["media"]["url"] = Util.normalize_url(response.request.url, "qq")
        if untrack_id:
            mvitem["untrack_id"] = untrack_id
        if sid:
            mvitem["sid"] = sid
        mvitem["media"]["info_id"] = Util.md5hash(
            Util.summarize(mvitem["media"]))
        Util.set_ext_id(mvitem["media"], mvitem["video"])
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
    return mvitem
def parse_episode_play(self, response, untrack_id, sid):
    """Parse a Sohu (VIP) play page into a MediaVideoItem.

    Scrapes title/director/performer/intro from the page markup, digs the
    playlist id and video id out of inline JavaScript, and builds a
    MediaVideoItem carrying one VideoItem (vnum "1").

    response   -- scrapy response; meta must carry 'cat_id'
    untrack_id -- tracking id propagated onto the item when non-empty
    sid        -- session/source id propagated onto the item when non-empty

    Returns the MediaVideoItem, or None when either the playlist id or the
    video id cannot be found (or on any logged exception).
    """
    mvitem = None
    try:
        logging.log(logging.INFO,
                    'parse_episode_play: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        # VIP page layout.
        title_list = response.xpath(
            '//div[@id="crumbsBar"]/div[@class="area cfix"]/div[@class="left"]/h2/@title'
        ).extract()
        director_list = response.xpath(
            '//div[@class="info info-con"]/ul/li[text()="%s"]/a/text()' %
            u'导演:').extract()
        performer_list = response.xpath(
            '//div[@class="info info-con"]/ul/li[text()="%s"]/a/text()' %
            u'主演:').extract()
        text = response.xpath(
            '//div[@class="info info-con"]/p[@class="intro"]/text()'
        ).extract()
        # Pipe-join the stripped name lists ("a|b|c").
        pers = "|".join([t.strip() for t in performer_list])
        dirs = "|".join([t.strip() for t in director_list])
        # The playlist id lives in inline JS under several historical
        # variable spellings -- try each pattern in turn.
        playlistId = ""
        playlistId_list = response.selector.re(
            re.compile(r'var playlistId.*?(\d+)'))
        if not playlistId_list:
            playlistId_list = response.selector.re(
                re.compile(r'var PLAYLIST_ID.*?(\d+)'))
        if not playlistId_list:
            playlistId_list = response.selector.re(
                re.compile(r'= playlistId.*?(\d+)'))
        if playlistId_list:
            playlistId = playlistId_list[0]
        vid = ""
        vid_list = response.selector.re(re.compile(r'var vid.*?(\d+)'))
        if vid_list:
            vid = vid_list[0]
        # Both ids are mandatory; give up (returns None) if either is missing.
        if not playlistId or not vid:
            return mvitem
        ep_item = MediaItem()
        ep_item["cont_id"] = playlistId
        if title_list:
            ep_item["title"] = title_list[0]
        ep_item["actor"] = pers
        ep_item["director"] = dirs
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["url"] = Util.normalize_url(response.request.url, "sohu")
        if text:
            ep_item["intro"] = text[0].strip()
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        if untrack_id:
            mvitem["untrack_id"] = untrack_id
        if sid:
            mvitem["sid"] = sid
        # Single synthetic video entry mirroring the media record.
        vitem = VideoItem()
        vitem["title"] = ep_item["title"] if 'title' in ep_item else None
        vitem["url"] = ep_item["url"]
        vitem["vnum"] = "1"
        vitem["os_id"] = self.os_id
        vitem["ext_id"] = Util.md5hash(ep_item["url"])
        vitem["site_id"] = self.site_id
        vitem["cont_id"] = vid
        videoitems = []
        videoitems.append(vitem)
        mvitem["video"] = videoitems
        mvitem["media"]["info_id"] = Util.md5hash(
            Util.summarize(mvitem["media"]))
        Util.set_ext_id(mvitem["media"], mvitem["video"])
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return mvitem
def video_parse(self, response):
    """Parse a PPTV media page and collect its videos.

    For movie pages a single VideoItem is copied out of the media record;
    for episodic channels the pid/cat_id/id triple is scraped from inline
    JavaScript and the album API is paged through until an empty page.

    NOTE(review): `items` is built but never returned from this function --
    confirm whether a `return items` was lost or callers read it elsewhere.
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta[
            'item'] if 'item' in response.request.meta else MediaVideoItem(
            )
        mediaItem = mediaVideoItem[
            'media'] if 'media' in mediaVideoItem else MediaItem()
        # Pages under the VIP url prefix are paid content.
        if prefix_url == self.vip_prefix_url:
            mediaItem['paid'] = '1'
        else:
            mediaItem['paid'] = '0'
        mediaItem['url'] = request_url
        pptv_extract.media_info_extract(response, mediaItem)
        videoItems = []
        if u'电影' == mediaItem['channel_id']:
            # Movie channel: one video, copied from the media record.
            if 'cont_id' not in mediaItem or not mediaItem['cont_id']:
                return items
            videoItem = VideoItem()
            videoItem['url'] = mediaItem['url']
            videoItem['cont_id'] = mediaItem['cont_id']
            Util.copy_media_to_video(mediaItem, videoItem)
            self.set_video_info(videoItem)
            videoItems.append(videoItem)
        else:
            sel = response.xpath('//script[@type="text/javascript"]')
            # Scrape pid & cat_id from inline JS; they drive the episode-list
            # API for TV series / variety / cartoon channels.
            if sel:
                pids = sel.re('\"pid\"[ ]?:[ ]?(\d+)')
                cids = sel.re('\"cat_id\"[ ]?:[ ]?(\d+)')
                vids = sel.re('\"id\"[ ]?:[ ]?(\d+)')
                if pids and cids and vids:
                    pid = pids[0]
                    cid = cids[0]
                    vid = vids[0]
                    page = 1
                    # The pid doubles as the media's cont_id.
                    mediaItem['cont_id'] = pid
                    # Page through the album API until an empty page.
                    while True:
                        meta = {
                            'pid': pid,
                            'cid': cid,
                            'vid': vid,
                            'page': page
                        }
                        url = self.album_api % (pid, cid, vid, page)
                        result = Util.get_url_content(url)
                        page_result = self.album_json_resolve(
                            result, mediaItem, meta)
                        if not page_result['items']:
                            # Fallback auth API disabled: it cannot supply
                            # video urls at the moment.
                            #for auth in self.auths:
                            #    url = self.auth_album_api % (pid, auth)
                            #    result = Util.get_url_content(url)
                            #    page_items = self.auth_album_xml_resolve(result, mediaItem, meta)
                            #    if page_items:
                            #        videoItems = page_items
                            #        break
                            break
                        else:
                            videoItems = videoItems + page_result['items']
                            page = page + 1
        if videoItems:
            # Assign ext_id, finalize media info, and emit the pair.
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception, e:
        logging.log(logging.ERROR, traceback.format_exc())
    logging.log(logging.INFO, 'video url: %s' % request_url)
def api_parse(self, response):
    """Parse a media page via the site's app/web APIs.

    Scrapes pid and vid from inline JavaScript, queries the app API for
    media metadata, then either builds a single movie VideoItem or pages
    the web API for the episode list (with a variety-show revision pass).

    NOTE(review): `items` is built but never returned -- confirm whether a
    `return items` was lost or callers read it elsewhere. Also
    `range(1, max_page)` stops at max_page - 1; confirm get_max_page()
    already returns last_page + 1.
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'api prase url: %s' % request_url)
        mediaVideoItem = response.request.meta[
            'item'] if 'item' in response.request.meta else MediaVideoItem(
            )
        mediaItem = mediaVideoItem[
            'media'] if 'media' in mediaVideoItem else MediaItem()
        mediaItem['url'] = request_url
        sel = response.xpath('.//script[@type="text/javascript"]')
        # pid (album id) and vid (video id) live in inline JS.
        pidl = response.xpath('.//script[@type="text/javascript"]').re(
            '\"pid\"\D?(\d+)')
        vidl = response.xpath('.//script[@type="text/javascript"]').re(
            '\"id\"\D?(\d+)')
        if pidl and vidl:
            pid = pidl[0]
            vid = vidl[0]
            app_api = self.app_api % (self.get_auth(), pid)
            ismovie = False
            isvariaty = False
            # Movies key the API by vid; variety and everything else by pid.
            if u'电影' == mediaItem['channel_id']:
                ismovie = True
                app_api = self.app_api % (self.get_auth(), vid)
                mediaItem['cont_id'] = str(vid)
            elif u'综艺' == mediaItem['channel_id']:
                isvariaty = True
                app_api = self.app_api % (self.get_auth(), pid)
                mediaItem['cont_id'] = str(pid)
            else:
                app_api = self.app_api % (self.get_auth(), pid)
                mediaItem['cont_id'] = str(pid)
            xpara = self.get_xdata(url=app_api)
            mediaItem = self.resolve_media_info(xpara,
                                                mediaItem,
                                                ismovie=ismovie)
            mediaItem['url'] = Util.normalize_url(request_url, self.site_code)
            mediaItem['site_id'] = self.site_id
            # Map the human-readable channel name to its numeric id.
            mediaItem['channel_id'] = self.channels_name_id[
                mediaItem['channel_id']]
            mediaItem['info_id'] = Util.md5hash(Util.summarize(mediaItem))
            max_page = self.get_max_page(xpara)
            video_list = []
            if ismovie:
                # Movies: one video mirroring the media record; every field
                # is guarded since resolve_media_info may omit keys.
                videoItem = VideoItem()
                videoItem['title'] = mediaItem[
                    'title'] if 'title' in mediaItem else None
                videoItem['thumb_url'] = mediaItem[
                    'poster_url'] if 'poster_url' in mediaItem else None
                videoItem['url'] = mediaItem[
                    'url'] if 'url' in mediaItem else None
                videoItem['os_id'] = self.os_id
                videoItem['site_id'] = self.site_id
                videoItem['ext_id'] = Util.md5hash(
                    mediaItem['url']) if 'url' in mediaItem else None
                videoItem['vnum'] = mediaItem[
                    'vcount'] if 'vcount' in mediaItem else 1
                videoItem['cont_id'] = mediaItem[
                    'cont_id'] if 'cont_id' in mediaItem else None
                video_list.append(videoItem)
            else:
                # Episodic: page through the web API.
                for i in range(1, max_page):
                    web_api = self.web_api % (pid, i)
                    dpara = self.get_ddata(url=web_api)
                    video_list += self.resolve_video_item(
                        dpara, page_num=i, isvariaty=isvariaty)
                if isvariaty:
                    # Variety shows need a post-pass over the collected list.
                    video_list = self.revise_video_item(video_list, xpara)
            if video_list:
                Util.set_ext_id(mediaItem, video_list)
                mediaVideoItem['media'] = mediaItem
                mediaVideoItem['video'] = video_list
                items.append(mediaVideoItem)
    except Exception, e:
        logging.log(logging.ERROR, traceback.format_exc())
def video_info_extract(response, videoItem):
    """Extract per-video info from `response` into `videoItem` (stub).

    Currently only guards against a missing item: when `videoItem` is None
    a fresh VideoItem is created. NOTE(review): that rebinding is local --
    the caller never sees the new item and no fields are extracted yet, so
    this function is effectively a placeholder.
    """
    try:
        # Identity test with `is None` instead of `== None` (PEP 8).
        if videoItem is None:
            videoItem = VideoItem()
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
def album_parse(self, response):
    """Parse a hunantv album page into MediaVideoItem(s).

    Three page layouts are handled:
      1. a tag page exists (e.g. http://list.hunantv.com/album/56.html):
         follow every tag url and resolve its video list;
      2. a plain video-list block (e.g. http://list.hunantv.com/album/2905.html);
      3. neither: fall back to the single play url from meta['url']
         (e.g. http://www.hunantv.com/v/7/102831/f/1043648.html, where the
         album link exists but is dead).

    Fixes vs. previous revision: the duplicated copy_media_to_video() call
    was dropped (the url assignment now simply precedes the single copy),
    the local `id` no longer shadows the builtin, and the collected items
    are now returned to the caller.
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'album url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        meta = response.request.meta
        video_url = meta['url'] if 'url' in meta else None
        mediaVideoItem = meta['item'] if 'item' in meta else MediaVideoItem()
        mediaItem = mediaVideoItem[
            'media'] if 'media' in mediaVideoItem else MediaItem()
        videoItems = []
        sels = response.xpath(
            '//div[@class="page-videolist-tag-main"]//p[@class="pa1-nav"]')
        if sels:
            # Layout 1: tag page -- resolve each tag's own video list.
            for item in hunantv_extract.album_tag_extract(sels):
                url = Util.get_absolute_url(item['url'], prefix_url)
                result = Util.get_url_content(url)
                videoItems = videoItems + self.album_tag_resolve(
                    text=result, meta={'url': url})
        else:
            video_sels = response.xpath(
                '//div[@class="page-videolist clearfix"]')
            if video_sels:
                # Layout 2: inline video list on the album page itself.
                result = video_sels.extract()[0]
                videoItems = videoItems + self.album_tag_resolve(
                    text=result, meta={'url': request_url})
            elif video_url:
                # Layout 3: no episode page -- build one video from meta['url'].
                videoItem = VideoItem()
                videoItem['url'] = video_url
                Util.copy_media_to_video(mediaItem, videoItem)
                # Pull the numeric video id out of the play url.
                video_url_regex = re.compile(
                    r'http://www\.hunantv\.com/v/[\d]+/[\d]+/[a-zA-Z]/([\d]+)\.html')
                match_results = video_url_regex.search(video_url)
                if match_results:
                    videoItem['cont_id'] = match_results.groups()[0]
                self.set_video_info(videoItem)
                videoItems.append(videoItem)
        if videoItems:
            Util.set_ext_id(mediaItem, videoItems)
            # Revisit the media page for full metadata before emitting.
            result = Util.get_url_content(mediaItem['url'])
            if result:
                mediaItem = self.media_resolve(text=result,
                                               meta={
                                                   'item': mediaItem,
                                                   'url': mediaItem['url']
                                               })
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
    logging.log(logging.INFO, 'album url: %s' % request_url)
    return items