def parse_video(self, response):
    # For each alternative source site of this variety-show cluster, fetch the
    # switched listing and keep the first normalized video URL.
    try:
        ext_video = []
        cluster_id = get_cluster_id(response.request.url)
        cluster_site = response.xpath('//ul[@id="supplies"]/li/@site').extract()
        cluster_site_popup = response.xpath('//div[@id="supplies-popup"]/div/@site').extract()
        cluster_site_single = response.xpath('//div[@id="listing"]/div/div[@class="content"]/@site').extract()
        cluster_src = set(cluster_site + cluster_site_popup + cluster_site_single)
        for site in cluster_src:
            url = 'http://www.360kan.com/cover/zongyilist?id=%s&do=switchsite&site=%s' % (cluster_id, site)
            downloader = HTTPDownload()
            content = downloader.get_data(url)
            json_data = json.loads(content)
            sel = Selector(text=json_data['data'], type="html")
            video = sel.xpath('//dl/dt/a/@href').extract()
            if video:
                ext_video.append(Util.normalize_url(Util.convert_url(video[0]), channel='variaty'))
        return ext_video
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def album_tag_json_resolve(self, text, meta):
    items = []
    try:
        request_url = meta['url'] if 'url' in meta else None
        logging.log(logging.INFO, 'json url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        # The playlist is embedded in the js payload as a JSON array of objects.
        video_express = '(\[\{.*\}\])'
        video_regex = re.compile(video_express)
        match_results = video_regex.search(text)
        if match_results:
            video_content = match_results.groups()[0]
            videos = json.loads(video_content)
            for video in videos:
                videoItem = VideoItem()
                ext_id = video['id']
                title = video['title']
                vnum = video['stitle']
                img = video['img']
                url = video['url']
                videoItem['cont_id'] = ext_id
                videoItem['title'] = title
                vnum = str(vnum)
                # Keep only the alphanumeric part of the episode label.
                videoItem['vnum'] = filter(str.isalnum, vnum)
                videoItem['thumb_url'] = img
                videoItem['url'] = Util.get_absolute_url(url, prefix_url)
                self.set_video_info(videoItem)
                items.append(videoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'json url: %s' % request_url)
    return items
def pack_media_info(self, response, mediaItem, title_class="", div_class=""):
    # Pull labelled metadata (director, actor, district, type, intro) out of the info block.
    v_meta = '//div[@class="%s"]/p/em[re:test(text(), ".*%s.*")]/../%s/text()'
    director_list = response.xpath(v_meta % (div_class, u'导演', 'a')).extract()
    actor_list = response.xpath(v_meta % (div_class, u'主演', 'a')).extract()
    district_list = response.xpath(v_meta % (div_class, u'地区', 'a')).extract()
    tag_list = response.xpath(v_meta % (div_class, u'类型', 'a')).extract()
    intro_list = response.xpath(v_meta % (div_class, u'简介', 'span')).extract()
    director = Util.join_list_safely(director_list)
    actor = Util.join_list_safely(actor_list)
    district = Util.join_list_safely(district_list)
    tag = Util.join_list_safely(tag_list)
    intro = Util.join_list_safely(intro_list)
    if director:
        mediaItem['director'] = director
    if actor:
        mediaItem['actor'] = actor
    if district:
        mediaItem['district'] = district
    if tag:
        mediaItem['type'] = tag
    if intro:
        mediaItem['intro'] = intro
    return mediaItem
def text_infos_resolve(labels, infos, mediaItem):
    # Map a Chinese field label (director/actor/type/district/release date/duration/popularity)
    # onto the corresponding mediaItem field.
    try:
        if mediaItem is None:
            return
        if labels and infos:
            labels = str(labels[0]).splitlines()
            label = ''.join(labels)
            label = label.replace(' ', '')
            if label.startswith(u'导演'):
                mediaItem['director'] = Util.join_list_safely(infos)
            elif label.startswith(u'主演'):
                mediaItem['actor'] = Util.join_list_safely(infos)
            elif label.startswith(u'类型'):
                mediaItem['type'] = Util.join_list_safely(infos)
            elif label.startswith(u'地区'):
                mediaItem['district'] = Util.join_list_safely(infos)
            elif label.startswith(u'上映'):
                release_date = ''.join(infos)
                release_dates = re.findall(r'[\d]+', release_date)
                release_date = ''.join(release_dates)
                release_date = Util.str2date(release_date)
                mediaItem['release_date'] = release_date
            elif label.startswith(u'片长'):
                duration = ''.join(infos)
                durations = re.findall(r'[\d]+', duration)
                duration = ''.join(durations)
                mediaItem['duration'] = duration
            elif label.startswith(u'人气'):
                score = ''.join(infos)
                scores = re.findall(r'[\d]+', score)
                score = ''.join(scores)
                mediaItem['score'] = score
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def parse_topic_play_list(self, response):
    item = None
    videoitems = []
    try:
        subs = response.xpath('//div[@class="mod_video_fragments"]/div[@class="mod_figures_1"]/ul/li')
        for sub in subs:
            vitem = VideoItem()
            title = sub.xpath('./strong/a/text()').extract()
            vitem["os_id"] = self.os_id
            vitem["site_id"] = self.site_id
            turl = sub.xpath('./strong/a/@href').extract()
            # Skip trailers; accept relative links and absolute qq.com links.
            if title and title[0].find(u"预告") < 0:
                if (turl and turl[0].find(".com") < 0) or (turl and turl[0].find("qq.com") >= 0):
                    vitem["title"] = title[0].strip()
                    vitem["vnum"] = self.get_num(vitem["title"])
                    sturl = turl[0]
                    if turl[0].find("qq.com") < 0:
                        sturl = self.url_prefix + turl[0]
                    vitem["url"] = Util.normalize_url(sturl, "qq", "tv")
                    vitem["ext_id"] = Util.md5hash(vitem["url"])
                    vitem["cont_id"] = self.get_qq_showid(vitem["url"])
                    videoitems.append(vitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return videoitems
def api_parse(self, mediaVideoItem):
    items = []
    try:
        mediaItem = mediaVideoItem['media']
        logging.log(logging.INFO, 'api parse pid: %s' % mediaItem['cont_id'])
        self.api_media_info(mediaItem)
        if 'title' in mediaItem:
            # Page through the album API until an empty page comes back.
            videoItems = []
            pagenum = 1
            while True:
                videos_url = self.other_album_api % (mediaItem['cont_id'], pagenum)
                result = Util.get_url_content(videos_url)
                page_items = self.other_album_resolve(text=result,
                                                      meta={'url': videos_url, 'pagenum': pagenum})
                if not page_items:
                    break
                videoItems = videoItems + page_items
                pagenum = pagenum + 1
            if videoItems:
                if 'url' not in mediaItem:
                    mediaItem['url'] = videoItems[0]['url']
                Util.set_ext_id(mediaItem, videoItems)
                self.set_media_info(mediaItem)
                mediaVideoItem['media'] = mediaItem
                mediaVideoItem['video'] = videoItems
                items.append(mediaVideoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
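# Illustration (not part of the spider): a minimal, self-contained sketch of the
# "page until an empty page" loop that api_parse uses above. fetch_page is a
# hypothetical stub standing in for Util.get_url_content + other_album_resolve.
def fetch_page(pagenum):
    data = {1: ['a', 'b'], 2: ['c']}  # fake pages, for illustration only
    return data.get(pagenum, [])

collected, pagenum = [], 1
while True:
    page_items = fetch_page(pagenum)
    if not page_items:
        break
    collected = collected + page_items
    pagenum = pagenum + 1
# collected == ['a', 'b', 'c']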
def set_media_info(self, mediaItem):
    mediaItem['site_id'] = self.site_id
    # channel_id previously held the Chinese channel name; convert it to the real channel_id.
    channel_name = mediaItem['channel_id']
    mediaItem['channel_id'] = self.channels_name_id[channel_name]
    # Derive info_id from the summarized media fields.
    mediaItem['info_id'] = Util.md5hash(Util.summarize(mediaItem))
def set_video_info(self, videoItem):
    videoItem['os_id'] = self.os_id
    videoItem['site_id'] = self.site_id
    url = videoItem['url']
    url = Util.normalize_url(url, self.site_code)
    videoItem['url'] = url
    videoItem['ext_id'] = Util.md5hash(url)
def parse_single_episode(self, response):
    items = []
    try:
        logging.log(logging.INFO, 'parse_single_episode: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        untrack_id = response.request.meta['untrack_id']
        sid = response.request.meta['sid']
        poster_url = response.request.meta['poster_url']
        urls = response.xpath('//div[@class="play-nav-l-new"]/h1/a/@href').extract()
        if urls:
            for iurl in urls:
                turl = self.url_prefix + iurl
                surl = Util.normalize_url(turl, "baofeng")
                if surl and self.site_name == Util.guess_site(surl):
                    #if turl and self.site_name == Util.guess_site(turl):
                    items.append(Request(url=surl,
                                         callback=self.parse_episode_info,
                                         meta={'cat_id': cat_id,
                                               'poster_url': poster_url,
                                               'page': 1,
                                               "untrack_id": untrack_id,
                                               "sid": sid}))
        else:
            # Paid movie: cannot jump to the media page.
            pass
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def video_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else None
        # Pull detailed media info from the play page.
        sels = response.xpath('//script[@type="text/javascript"]')
        hunantv_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//div[@class="play-xxmes clearfix"]')
        hunantv_extract.media_info_extract(sels, mediaItem)
        # Derive the media-page URL from the play-page URL.
        url_express = '(http://www\.hunantv\.com/v/[\d]+/[\d]+)/[a-zA-Z]/[\d]+\.html'
        url_regex = re.compile(url_express)
        match_results = url_regex.search(request_url)
        if match_results:
            url_content = match_results.groups()[0]
            mediaItem['url'] = url_content
        # Find the link to the full-episode (feature) listing.
        url_exist = False
        sels = response.xpath('//div[@class="play-index-con-til clearfix"]//*[@class="mppl-til"]')
        for sel in sels:
            results = hunantv_extract.album_extract(sel)
            if results:
                item = results[0]
                url = item['url']
                url = Util.get_absolute_url(url, prefix_url)
                mediaVideoItem['media'] = mediaItem
                items.append(Request(url=url,
                                     callback=self.album_parse,
                                     meta={'url': request_url, 'item': mediaVideoItem}))
                url_exist = True
                break
        # No "now playing" link exists (e.g. "芒果捞星闻"); fall back to the yearly JSON API.
        if 'url' in mediaItem and not url_exist:
            year_api = mediaItem['url'] + '/s/json.year.js'
            mediaVideoItem['media'] = mediaItem
            items.append(Request(url=year_api,
                                 callback=self.album_json_parse,
                                 meta={'item': mediaVideoItem, 'url': year_api}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'video url: %s' % request_url)
    return items
def play_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'play url: %s' % request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        route_url_list = response.xpath('//div[@class="play-content"]//div[@class="v-panel-route"]/a/@href').extract()
        media_url = ''
        if route_url_list:
            media_url = route_url_list[-1]
        if media_url:
            # A media page exists; crawl the media info there.
            items.append(Request(url=media_url,
                                 callback=self.media_parse,
                                 meta={'url': request_url, 'item': mediaVideoItem}))
        else:
            # Movies have no media page; take the media info from the play page itself.
            mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
            title_class = "v-info v-info-film e-follow"
            div_class = "v-meta v-meta-film"
            v_title = '//div[@class="%s"]//h1[@class="title"]/text()'
            title_list = response.xpath(v_title % title_class).extract()
            title = Util.join_list_safely(title_list)
            if title:
                mediaItem['title'] = title
            mediaItem = self.pack_media_info(response, mediaItem, title_class, div_class)
            # Without a media page, the play URL doubles as the media URL.
            mediaItem['url'] = Util.normalize_url(request_url, self.site_code)
            mediaVideoItem['media'] = mediaItem
            r = re.compile('.*/(\d+).html')
            m = r.match(mediaItem['url'])
            if m:
                vid = m.group(1)
                prefix_video_url = re.sub(vid, '%s', mediaItem['url'])
                items.append(self.api_media_info(mediaVideoItem, vid, prefix_video_url))
            else:
                items.append(mediaVideoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'play url: %s' % request_url)
    return items
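# Illustration (not part of the spider): how play_parse derives vid and
# prefix_video_url from a play URL. The URL below is a made-up example,
# not a real record.
import re

play_url = 'http://www.example.com/play/123456.html'  # hypothetical play URL
m = re.compile('.*/(\d+).html').match(play_url)
if m:
    vid = m.group(1)                                # '123456'
    prefix_video_url = re.sub(vid, '%s', play_url)  # 'http://www.example.com/play/%s.html'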
def set_media_info(self, mediaItem):
    # Keep only the id portion of cont_id (the part before the first '|').
    cont_id = mediaItem['cont_id']
    cont_ids = cont_id.split('|')
    cont_id = cont_ids[0]
    mediaItem['cont_id'] = cont_id
    mediaItem['site_id'] = self.site_id
    # channel_id previously held the Chinese channel name; convert it to the real channel_id.
    channel_name = mediaItem['channel_id']
    mediaItem['channel_id'] = self.channels_name_id[channel_name]
    # Derive info_id from the summarized media fields.
    mediaItem['info_id'] = Util.md5hash(Util.summarize(mediaItem))
def video_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        sels = response.xpath('//script[@type="text/javascript"]')
        letv_extract.media_info_extract(sels, mediaItem)
        # Try the page layouts in turn until one yields a media-page link.
        # Detail layout: TV series, variety shows, cartoons.
        sels = response.xpath('//div[@data-statectn="play_info"]//ul[@class="intro_box"]')
        if not sels:
            # Info layout: ordinary films, cartoons.
            sels = response.xpath('//div[@data-statectn="newplay_info"]//ul[@class="info_list"]')
        if not sels:
            # Paid films.
            sels = response.xpath('//div[@class="Player"]//span[@class="video_info"]')
        if sels:
            results = letv_extract.media_extract(sels)
            if results:
                item = results[0]
                url = Util.get_absolute_url(item['url'], prefix_url)
                mediaItem['url'] = url
                mediaVideoItem['media'] = mediaItem
                items.append(Request(url=url,
                                     callback=self.media_parse,
                                     meta={'item': mediaVideoItem}))
        if not items:
            # No media-page link on the play page; fall back to the API if a cont_id is available.
            if 'cont_id' in mediaItem:
                items.extend(self.api_parse(mediaVideoItem))
            else:
                logging.log(logging.INFO, '该视频播放页找不到媒体页地址,也无法直接采用接口: %s' % request_url)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'video url: %s' % request_url)
    return items
def api_video_list(self, vid, vcount, prefix_video_url, channel):
    video_list = []
    try:
        max_page_num = vcount / 20 + 1
        for i in range(max_page_num):
            vlu = self.video_list_url % (vid, i)
            jdata = self.httpdownload.get_data(vlu)
            if not jdata:
                break
            ddata = json.loads(jdata)
            code = int(ddata.get('code', 202))
            if code != 200:
                break
            datal = ddata.get('data')
            if not datal:
                break
            for data in datal:
                videoItem = VideoItem()
                if type(data) != dict:
                    continue
                #videoItem['title'] = data.get('name')
                videoItem['title'] = data.get('desc')
                videoItem['thumb_url'] = data.get('image')
                videoItem['vnum'] = data.get('videoIndex')
                videoId = data.get('videoId')
                #if int(videoItem['vnum']) == 0:
                #    videoItem['vnum'] = self.get_vnum(data.get('name'))
                turl = self.media_info_url % (videoId)
                tjdata = self.httpdownload.get_data(turl)
                if not tjdata:
                    continue
                tdjdata = json.loads(tjdata)
                tcode = int(tdjdata.get('code', 202))
                if tcode != 200:
                    continue
                tdatal = tdjdata.get('data')
                if not tdatal:
                    continue
                # For channel 2004 the episode number comes from the publish date.
                publish_time = tdatal.get('detail').get('publishTime')
                if publish_time and channel == 2004:
                    videoItem['vnum'] = self.get_vnum(publish_time)
                videoItem['cont_id'] = data.get('videoId')
                videoItem['url'] = Util.normalize_url(prefix_video_url % data.get('videoId'), self.site_code)
                videoItem['os_id'] = self.os_id
                videoItem['site_id'] = self.site_id
                videoItem['ext_id'] = Util.md5hash(videoItem['url'])
                video_list.append(videoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return video_list
def parse_video(self, response):
    try:
        supplies = response.xpath(
            '//div[@id="listing"]/div/div[@class="content"]/div/div[@class="part"][1]/a[1]/@href'
        ).extract()
        return [Util.normalize_url(Util.convert_url(u), channel='cartoon') for u in supplies]
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def compose_mvitem(self, response, title_list, pers, dirs, play_url, cat_id, poster_url, text):
    mvitem = None
    try:
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        videoitems = []
        ep_item = MediaItem()
        if title_list:
            ep_item["title"] = title_list[0].strip()
        ep_item["actor"] = pers
        ep_item["director"] = dirs
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "wasu")
        if len(text) > 0:
            ep_item["intro"] = text[0].strip()
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        mid = self.getshowid(response.request.url)
        mvitem["media"]["cont_id"] = mid
        ttvitem = {}
        if title_list:
            ttvitem = self.parse_video_item(response, cat_id, play_url, title_list, None)
        if ttvitem:
            if 'video' in ttvitem and len(ttvitem['video']) > 0:
                mvitem['video'] = ttvitem['video']
                mvitem["media"]["info_id"] = Util.md5hash(Util.summarize(mvitem["media"]))
                Util.set_ext_id(mvitem["media"], mvitem["video"])
                if untrack_id and sid:
                    mvitem["untrack_id"] = untrack_id
                    mvitem["sid"] = sid
                res = self.check_url(mvitem)
                if not res:
                    return None
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return mvitem
def parse_single_episode(self, response):
    items = []
    try:
        logging.log(logging.INFO, 'parse_single_episode: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        untrack_id = response.request.meta['untrack_id']
        sid = response.request.meta['sid']
        mid = response.request.meta['mid'] if 'mid' in response.request.meta else ""
        poster_url = response.request.meta['poster_url']
        # Resolve the media-page URL from the play page.
        urls = response.xpath('//div[@class="play_site mb10"]/div[1]/h3/a/@href').extract()
        if not urls:
            # The title does not link to the media page; go through the category navigation instead.
            turls = response.xpath('//div[@class="play_site mb10"]/div[1]/div[@class="play_seat"]/a/@href').extract()
            for turl in turls:
                tiurl = self.get_episode_url(turl)
                if tiurl:
                    urls.append(tiurl)
        if urls:
            for iurl in urls:
                if not Util.guess_site(iurl):
                    iurl = self.url_prefix + iurl
                surl = Util.normalize_url(iurl, "wasu")
                if surl and self.site_name == Util.guess_site(surl):
                    items.append(Request(url=surl,
                                         callback=self.parse_episode_info,
                                         meta={'cat_id': cat_id,
                                               'poster_url': poster_url,
                                               'page': 1,
                                               "untrack_id": untrack_id,
                                               "sid": sid,
                                               "mid": mid}))
        else:
            # Films (including cartoon films) have no media page, only a play page.
            titems = self.parse_play_page(response)
            for item in titems:
                if mid:
                    item['mid'] = mid
                items.append(item)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def set_media_info(self, mediaItem):
    mediaItem['site_id'] = self.site_id
    # channel_id previously held the Chinese channel name; convert it to the real channel_id.
    channel_name = mediaItem['channel_id']
    mediaItem['channel_id'] = self.channels_name_id[channel_name]
    # Extract the numeric film id from the media URL as cont_id.
    url = mediaItem['url']
    media_url_express = 'http://www.1905.com/mdb/film/([\d]+).*'
    media_url_regex = re.compile(media_url_express)
    match_results = media_url_regex.search(url)
    if match_results:
        id = match_results.groups()[0]
        mediaItem['cont_id'] = id
    # Derive info_id from the summarized media fields.
    mediaItem['info_id'] = Util.md5hash(Util.summarize(mediaItem))
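# Illustration (not part of the spider): the cont_id extraction used in
# set_media_info above, run against a made-up 1905.com media URL.
import re

media_url_regex = re.compile('http://www.1905.com/mdb/film/([\d]+).*')
url = 'http://www.1905.com/mdb/film/2245612/'  # hypothetical film id, for illustration only
m = media_url_regex.search(url)
if m:
    cont_id = m.groups()[0]  # '2245612'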
def parse_video(self, response):
    try:
        sup = response.xpath('//ul[@id="supplies"]/li/a/@href').extract()
        sup_more = response.xpath('//div[@class="menu"]//ul/li/a/@href').extract()
        supplies = sup + sup_more
        return [Util.normalize_url(Util.convert_url(u), channel='movie') for u in supplies]
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def album_json_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'json url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else None
        url = response.request.meta['url'] if 'url' in response.request.meta else None
        if url != request_url:
            # Redirected, so the yearly JSON does not exist.
            return items
        year_express = '(\[.*\])'
        year_regex = re.compile(year_express)
        match_results = year_regex.search(response.body)
        if match_results:
            videoItems = []
            year_content = match_results.groups()[0]
            years = json.loads(year_content)
            for year in years:
                video_url = mediaItem['url'] + '/s/json.%s.js' % year
                result = Util.get_url_content(video_url)
                videoItems = videoItems + self.album_tag_json_resolve(text=result, meta={'url': video_url})
            if videoItems:
                Util.set_ext_id(mediaItem, videoItems)
                # Visit the media page to complete the media info.
                result = Util.get_url_content(mediaItem['url'])
                if result:
                    mediaItem = self.media_resolve(text=result,
                                                   meta={'item': mediaItem, 'url': mediaItem['url']})
                self.set_media_info(mediaItem)
                mediaVideoItem['media'] = mediaItem
                mediaVideoItem['video'] = videoItems
                items.append(mediaVideoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'json url: %s' % request_url)
    return items
def parse_single_episode(self, response):
    items = []
    try:
        logging.log(logging.INFO, 'parse_single_episode: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        untrack_id = ""
        sid = ""
        mid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        if "mid" in response.request.meta:
            mid = response.request.meta['mid']
        urls = response.xpath('//div[@class="base_info"]/h1[@class="title"]/a/@href').extract()
        if urls:
            for iurl in urls:
                surl = Util.normalize_url(iurl, "youku")
                if surl:
                    items.append(Request(url=surl,
                                         callback=self.parse_episode_info,
                                         meta={'cat_id': cat_id,
                                               'poster_url': '',
                                               'page': 1,
                                               "untrack_id": untrack_id,
                                               "sid": sid,
                                               "mid": mid}))
        else:
            logging.log(logging.INFO, 'miss media page: %s' % response.request.url)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def parse(self, response):
    items = []
    try:
        logging.log(logging.INFO, 'parse: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        #poster_url = response.request.meta['poster_url']
        play_url = ""
        jinfo = json.loads(response.body)
        for tmedia in jinfo["items"]:
            title = tmedia["title"]
            actor_list = []
            for tactor in tmedia["actors"]:
                actor_list.append(tactor["name"])
            actor = Util.join_list_safely(actor_list)
            #actor = "|".join([t.strip() for t in actor_list])
            poster_url = tmedia["picUrl_200x300"]
            play_url = tmedia["playUrl"]
            # Skip entries that are still trailers.
            if "updateInfo" in tmedia and tmedia["updateInfo"].find("预告") >= 0:
                continue
            else:
                items.append(Request(url=play_url,
                                     callback=self.parse_episode_play,
                                     meta={'cat_id': cat_id,
                                           'poster_url': poster_url,
                                           'title': title,
                                           'actor': actor}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def insert_ext_video(self, item, commit=True):
    try:
        media_dao = self._daos['media']
        video_dao = self._daos['video']
        site_id = item['site_id']
        channel_id = item['channel_id']
        media_ext_id = item['media_ext_id']
        sid = media_dao.get_media(media_ext_id, None)
        for v in item['urls']:
            if not v:
                continue
            video_ext_id = Util.md5hash(v)
            self.insert_untrack_ss({'url': v,
                                    'md5': video_ext_id,
                                    'channel_id': channel_id,
                                    'sid': sid})
        '''
        video_ext_id = Util.md5hash(v)
        video_mid = video_dao.get_video_mid(video_ext_id)
        if video_mid:
            self.insert_sync_rel({'sid': sid, 'mid': video_mid, 'site_id': site_id})
            self.insert_untrack({'url': v, 'md5': video_ext_id, 'channel_id': channel_id, 'sid': sid, 'stat': 1})
        else:
            self.insert_untrack({'url': v, 'md5': video_ext_id, 'channel_id': channel_id, 'sid': sid})
        '''
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def compose_vitem(self, url_list, title_list, vnum):
    vitem = VideoItem()
    try:
        if not url_list:
            return vitem
        if title_list:
            vitem["title"] = title_list[0].strip()
        turl = Util.normalize_url(url_list[0], "sohu")
        vitem["url"] = turl
        vitem["vnum"] = str(vnum)
        vitem["os_id"] = self.os_id
        vitem["ext_id"] = Util.md5hash(turl)
        vitem["site_id"] = self.site_id
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return vitem
def insert_untrack(self, item, commit=True):
    try:
        dao = self._daos['untrack']
        value_dict = {}
        sid = item['sid']
        md5 = item['md5']
        url = item['url']
        site_code = Util.guess_site(url)
        value_dict['sid'] = sid
        value_dict['md5'] = md5
        value_dict['url'] = url
        value_dict['site_code'] = site_code
        if 'channel_id' in item:
            value_dict['channel_id'] = item['channel_id']
        if 'stat' in item:
            value_dict['stat'] = item['stat']
        # Insert a new untrack row, or update the existing one keyed by md5.
        res = dao.get_untrack(md5)
        if not res:
            dao.insert_untrack(value_dict)
        else:
            dao.update_untrack(res, value_dict)
        if commit:
            self.commit_transaction()
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def other_album_resolve(self, text, meta):
    items = []
    try:
        if not text:
            return items
        request_url = meta['url'] if 'url' in meta else None
        logging.log(logging.INFO, 'json url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        video_url = 'http://www.letv.com/ptv/vplay/%s.html'
        videos_json = json.loads(text)
        videos = videos_json['body']['videoList']['videoList']['videoInfo']
        for video in videos:
            try:
                videoItem = VideoItem()
                videoItem['cont_id'] = video['vid']
                if video['episode']:
                    try:
                        vnum = int(float(video['episode']))
                        videoItem['vnum'] = vnum
                    except Exception as e:
                        vnum = int(float(video['porder']))
                        videoItem['vnum'] = vnum
                videoItem['title'] = video['subTitle']
                if not videoItem['title']:
                    videoItem['title'] = '第%s集' % videoItem['vnum']
                # Take the first available thumbnail.
                for key in video['picAll']:
                    thumb_url = video['picAll'][key]
                    videoItem['thumb_url'] = thumb_url
                    break
                url = video_url % videoItem['cont_id']
                videoItem['url'] = url
                self.set_video_info(videoItem)
                items.append(videoItem)
            except Exception as e:
                # Skip malformed entries.
                continue
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def zongyi_album_resolve(self, text, meta):
    items = []
    try:
        if not text:
            return items
        request_url = meta['url'] if 'url' in meta else None
        logging.log(logging.INFO, 'json url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        year = meta['year'] if 'year' in meta else None
        month = meta['month'] if 'month' in meta else None
        if year and month:
            videos_json = json.loads(text)
            videos = videos_json['data']
            if year in videos:
                videos = videos[year]
                if month in videos:
                    videos = videos[month]
                    video_url = 'http://www.letv.com/ptv/vplay/%s.html'
                    for video in videos:
                        videoItem = VideoItem()
                        videoItem['cont_id'] = video['id']
                        videoItem['title'] = video['subTitle']
                        if video['issue']:
                            videoItem['vnum'] = video['issue']
                        videoItem['thumb_url'] = video['pic']
                        url = video_url % videoItem['cont_id']
                        videoItem['url'] = url
                        self.set_video_info(videoItem)
                        items.append(videoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'json url: %s' % request_url)
        logging.log(logging.INFO, '===================content===================')
        logging.log(logging.INFO, text)
    return items
def list_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        level = response.request.meta['level'] if 'level' in response.request.meta else -1
        channel_id = response.request.meta['id'] if 'id' in response.request.meta else None
        sels = response.xpath('//div[@class="retrieval"]//dl[@class="retrieval-dl"]')
        if self.max_mark_depth > 0:
            size = self.max_mark_depth if self.max_mark_depth < len(sels) else len(sels)
        else:
            size = len(sels)
        if level <= size:
            sel = sels[level - 1]
            level = level + 1
            urls = sel.xpath('.//ul[@class="retrieval-list"]//a/@href').extract()
            for url in urls:
                url = Util.get_absolute_url(url, prefix_url)
                items.append(Request(url=url,
                                     callback=self.list_parse,
                                     meta={'level': level, 'id': channel_id}))
        # Collect all list data at the current level,
        # splitting once more by each sort order.
        urls = response.xpath('//div[@class="filter"]//ul[@class="tab-sya"]//li/a/@href').extract()
        for url in urls:
            url = Util.get_absolute_url(url, prefix_url)
            items.append(Request(url=url,
                                 callback=self.list_html_parse,
                                 meta={'page': 1, 'id': channel_id}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'url: %s' % request_url)
    return items
def set_video_info(self, videoItem, channel_name):
    videoItem['os_id'] = self.os_id
    videoItem['site_id'] = self.site_id
    url = videoItem['url']
    if u'电影' == channel_name:
        channel_name = kankan_extract.list_channels_pinyin[channel_name]
        url = Util.normalize_url(url, self.site_code, channel_name)
    else:
        url = Util.normalize_url(url, self.site_code)
    videoItem['url'] = url
    videoItem['ext_id'] = Util.md5hash(url)
    # Extract the numeric video id from the play URL as cont_id.
    video_url_express = 'http://[^/]*.kankan.com.+?/([\d]+).[s]?html'
    video_url_regex = re.compile(video_url_express)
    match_results = video_url_regex.search(url)
    if match_results:
        id = match_results.groups()[0]
        videoItem['cont_id'] = id
def resolve_video_item(self, dpara, page_num=1, isvariaty=False):
    videos = []
    page_num -= 1
    try:
        if dpara and int(dpara.get('err')) == 0 and 'data' in dpara and 'list' in dpara['data']:
            lst = dpara['data'].get('list', [])
            sameV = 1
            for index, item in enumerate(lst):
                videoItem = VideoItem()
                videoItem['cont_id'] = item.get('id')
                videoItem['url'] = Util.normalize_url(item.get('url'), self.site_code)
                videoItem['thumb_url'] = item.get('capture')
                videoItem['os_id'] = self.os_id
                videoItem['site_id'] = self.site_id
                videoItem['ext_id'] = Util.md5hash(videoItem['url'])
                # Strip one trailing 上/中/下 part marker from the episode title before
                # deciding whether the remainder is a usable episode number.
                oep = item.get('epTitle', '')
                nep = oep[::-1]
                for i in [u'上', u'中', u'下']:
                    nep = nep.replace(i, '', 1)
                nep = nep[::-1]
                if isvariaty and nep and nep.isdigit() and len(nep) == 8:
                    videoItem['vnum'] = str(index + 1 + page_num * 100)
                    videoItem['title'] = item.get('title', '') + str(videoItem['vnum'])
                elif isvariaty:
                    videoItem['title'] = oep if oep else item.get('title', '')
                    # When the date is empty, fall back to the index as the episode number.
                    videoItem['vnum'] = str(index + 1 + page_num * 100)
                elif nep and nep.isdigit():
                    # '01' --> '1'
                    videoItem['vnum'] = str(int(float(nep)))
                    videoItem['title'] = item.get('title', '') + oep
                elif nep:
                    videoItem['vnum'] = str(index + 1 + page_num * 100)
                    videoItem['title'] = oep
                elif not nep:
                    videoItem['vnum'] = str(index + 1 + page_num * 100)
                    videoItem['title'] = item.get('title', '') + str(videoItem['vnum']) + oep
                videos.append(videoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return videos
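# Illustration (not part of the spider): how resolve_video_item strips a trailing
# 上/中/下 part marker before testing whether the remainder is an 8-digit date.
# The epTitle value below is a made-up example.
oep = u'20150620下'          # hypothetical epTitle
nep = oep[::-1]              # reverse so the marker sits at the front
for i in [u'上', u'中', u'下']:
    nep = nep.replace(i, '', 1)
nep = nep[::-1]              # u'20150620' -> 8 digits, so the date branch applies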