def parse_topic_play_list(self, response):
    """Parse the episode list of a qq topic page into VideoItems.

    Skips entries whose title contains u"预告" ("trailer") and accepts
    either relative links (no ".com") or absolute qq.com links.
    Returns a list of VideoItem (empty on error).
    """
    item = None
    videoitems = []
    try:
        subs = response.xpath(
            '//div[@class="mod_video_fragments"]/div[@class="mod_figures_1"]/ul/li'
        )
        for sub in subs:
            vitem = VideoItem()
            title = sub.xpath('./strong/a/text()').extract()
            vitem["os_id"] = self.os_id
            vitem["site_id"] = self.site_id
            turl = sub.xpath('./strong/a/@href').extract()
            # skip trailers (title contains u"预告")
            if title and title[0].find(u"预告") < 0:
                # `and` binds tighter than `or`: keep relative urls,
                # or absolute urls that point at qq.com
                if turl and turl[0].find(".com") < 0 or (
                        turl and turl[0].find("qq.com") >= 0):
                    vitem["title"] = title[0].strip()
                    vitem["vnum"] = self.get_num(vitem["title"])
                    sturl = turl[0]
                    if turl[0].find("qq.com") < 0:
                        # relative link: prepend the site's base url
                        sturl = self.url_prefix + turl[0]
                    vitem["url"] = Util.normalize_url(sturl, "qq", "tv")
                    vitem["ext_id"] = Util.md5hash(vitem["url"])
                    vitem["cont_id"] = self.get_qq_showid(vitem["url"])
                    videoitems.append(vitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return videoitems
def parse_single_episode(self, response):
    """youku: from a play page, follow the title link to the media page.

    Emits one Request per media-page url, forwarding tracking ids
    (untrack_id / sid / mid) through meta to parse_episode_info.
    Returns the Request list, or None if an exception was caught.
    """
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_single_episode: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        untrack_id = ""
        sid = ""
        mid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        if "mid" in response.request.meta:
            mid = response.request.meta['mid']
        urls = response.xpath(
            '//div[@class="base_info"]/h1[@class="title"]/a/@href').extract()
        if urls:
            for iurl in urls:
                surl = Util.normalize_url(iurl, "youku")
                if surl:
                    items.append(
                        Request(url=surl,
                                callback=self.parse_episode_info,
                                meta={'cat_id': cat_id,
                                      'poster_url': '',
                                      'page': 1,
                                      "untrack_id": untrack_id,
                                      "sid": sid,
                                      "mid": mid}))
        else:
            # no link to a media page on this play page
            logging.log(logging.INFO,
                        'miss media page: %s' % response.request.url)
        return items
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def parse_video(self, response):
    """Collect one normalized play url per provider site on a 360kan
    variety cover page.

    Gathers every provider site id visible on the page, calls the
    zongyilist switch-site API for each, and keeps the first episode
    link from each API response. Returns the url list, or None if an
    exception was caught.
    """
    try:
        cover_id = get_cluster_id(response.request.url)
        # provider ids can appear in three different page layouts
        provider_sites = set(
            response.xpath('//ul[@id="supplies"]/li/@site').extract()
            + response.xpath('//div[@id="supplies-popup"]/div/@site').extract()
            + response.xpath(
                '//div[@id="listing"]/div/div[@class="content"]/@site'
            ).extract()
        )
        play_urls = []
        for site in provider_sites:
            api_url = 'http://www.360kan.com/cover/zongyilist?id=%s&do=switchsite&site=%s' % (
                cover_id, site)
            body = HTTPDownload().get_data(api_url)
            payload = json.loads(body)
            # the API returns an HTML fragment under 'data'
            links = Selector(text=payload['data'],
                             type="html").xpath('//dl/dt/a/@href').extract()
            if links:
                play_urls.append(
                    Util.normalize_url(Util.convert_url(links[0]),
                                       channel='variaty'))
        return play_urls
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def set_video_info(self, videoItem):
    """Stamp site identifiers, a normalized url and its ext_id hash
    onto *videoItem* (mutated in place)."""
    normalized = Util.normalize_url(videoItem['url'], self.site_code)
    videoItem['os_id'] = self.os_id
    videoItem['site_id'] = self.site_id
    videoItem['url'] = normalized
    videoItem['ext_id'] = Util.md5hash(normalized)
def parse_single_episode(self, response):
    """baofeng: from a play page, follow the nav link to the media page.

    Emits a Request to parse_episode_info for each link that
    normalizes back to this site; paid movies have no media page and
    yield nothing.
    """
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_single_episode: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        untrack_id = response.request.meta['untrack_id']
        sid = response.request.meta['sid']
        poster_url = response.request.meta['poster_url']
        urls = response.xpath(
            '//div[@class="play-nav-l-new"]/h1/a/@href').extract()
        if urls:
            for iurl in urls:
                turl = self.url_prefix + iurl
                surl = Util.normalize_url(turl, "baofeng")
                # only follow links that still belong to this site
                if surl and self.site_name == Util.guess_site(surl):
                    #if turl and self.site_name == Util.guess_site(turl):
                    items.append(
                        Request(url=surl,
                                callback=self.parse_episode_info,
                                meta={
                                    'cat_id': cat_id,
                                    'poster_url': poster_url,
                                    'page': 1,
                                    "untrack_id": untrack_id,
                                    "sid": sid
                                }))
        # paid movie: cannot jump to a media page
        else:
            pass
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def set_video_info(self, videoItem, channel_name):
    """Fill *videoItem* (mutated in place) with site ids, a normalized
    url, its ext_id hash, and the numeric cont_id parsed from the url.

    Movie urls (channel u'电影') are normalized with the channel's
    pinyin name; all other channels use the plain site code.
    """
    videoItem['os_id'] = self.os_id
    videoItem['site_id'] = self.site_id
    if channel_name == u'电影':
        channel_name = kankan_extract.list_channels_pinyin[channel_name]
        normalized = Util.normalize_url(videoItem['url'], self.site_code,
                                        channel_name)
    else:
        normalized = Util.normalize_url(videoItem['url'], self.site_code)
    videoItem['url'] = normalized
    videoItem['ext_id'] = Util.md5hash(normalized)
    # numeric id at the end of a kankan play url, e.g. .../12345.shtml
    match = re.search('http://[^/]*.kankan.com.+?/([\d]+).[s]?html',
                      normalized)
    if match:
        videoItem['cont_id'] = match.group(1)
def play_parse(self, response):
    """Parse a play page: either schedule the media page, or (movies,
    which have no media page) extract media info right here.

    Returns a list of Requests / MediaVideoItems.

    FIX: the original built `items` but never returned it, so scrapy
    dropped everything this callback produced; also `request_url` could
    be unbound in the except-block log if the very first statement
    raised.
    """
    items = []
    request_url = response.request.url
    try:
        logging.log(logging.INFO, 'play url: %s' % request_url)
        mediaVideoItem = response.request.meta[
            'item'] if 'item' in response.request.meta else MediaVideoItem()
        route_url_list = response.xpath(
            '//div[@class="play-content"]//div[@class="v-panel-route"]/a/@href'
        ).extract()
        media_url = ''
        if route_url_list:
            media_url = route_url_list[-1]
        if media_url:
            # a media page exists: crawl the media info there
            items.append(
                Request(url=media_url,
                        callback=self.media_parse,
                        meta={'url': request_url, 'item': mediaVideoItem}))
        else:
            # movies have no media page: scrape media info from the play page
            mediaItem = mediaVideoItem[
                'media'] if 'media' in mediaVideoItem else MediaItem()
            title_class = "v-info v-info-film e-follow"
            div_class = "v-meta v-meta-film"
            v_title = '//div[@class="%s"]//h1[@class="title"]/text()'
            title_list = response.xpath(v_title % title_class).extract()
            title = Util.join_list_safely(title_list)
            if title:
                mediaItem['title'] = title
            mediaItem = self.pack_media_info(response, mediaItem,
                                             title_class, div_class)
            # no media page: the play url doubles as the media url
            mediaItem['url'] = Util.normalize_url(request_url, self.site_code)
            mediaVideoItem['media'] = mediaItem
            r = re.compile('.*/(\d+).html')
            m = r.match(mediaItem['url'])
            if m:
                vid = m.group(1)
                # NOTE(review): re.sub replaces the first occurrence of the
                # digits anywhere in the url, not only the trailing id
                prefix_video_url = re.sub(vid, '%s', mediaItem['url'])
                items.append(
                    self.api_media_info(mediaVideoItem, vid,
                                        prefix_video_url))
            else:
                items.append(mediaVideoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'play url: %s' % request_url)
    # FIX: return the collected requests/items (original returned None)
    return items
def api_video_list(self, vid, vcount, prefix_video_url, channel):
    """Fetch up to *vcount* videos of media *vid* from the paged list
    API (20 entries per page) and build one VideoItem per entry.

    For channel 2004 the episode number is derived from the per-video
    publish time instead of the list index.
    Returns the list of VideoItems (empty on error).

    FIXES vs original: the per-video status check tested the outer page
    `code` instead of `tcode` (so bad per-video responses slipped
    through); a dead duplicate `tcode = int(ddata...)` assignment was
    removed; and the function never returned `video_list`.
    """
    video_list = []
    try:
        # 20 entries per API page; // keeps int semantics on py2 and py3
        max_page_num = vcount // 20 + 1
        for i in range(max_page_num):
            vlu = self.video_list_url % (vid, i)
            jdata = self.httpdownload.get_data(vlu)
            if not jdata:
                break
            ddata = json.loads(jdata)
            code = int(ddata.get('code', 202))
            if code != 200:
                break
            datal = ddata.get('data')
            if not datal:
                break
            for data in datal:
                if type(data) != dict:
                    continue
                videoItem = VideoItem()
                #videoItem['title'] = data.get('name')
                videoItem['title'] = data.get('desc')
                videoItem['thumb_url'] = data.get('image')
                videoItem['vnum'] = data.get('videoIndex')
                videoId = data.get('videoId')
                # per-video detail request (publish time lives here)
                turl = self.media_info_url % (videoId)
                tjdata = self.httpdownload.get_data(turl)
                if not tjdata:
                    continue
                tdjdata = json.loads(tjdata)
                tcode = int(tdjdata.get('code', 202))
                if tcode != 200:  # FIX: was `code != 200`
                    continue
                tdatal = tdjdata.get('data')
                if not tdatal:
                    continue
                publish_time = tdatal.get('detail').get('publishTime')
                # variety channel (2004): episode number comes from date
                if publish_time and channel == 2004:
                    videoItem['vnum'] = self.get_vnum(publish_time)
                videoItem['cont_id'] = data.get('videoId')
                videoItem['url'] = Util.normalize_url(
                    prefix_video_url % data.get('videoId'), self.site_code)
                videoItem['os_id'] = self.os_id
                videoItem['site_id'] = self.site_id
                videoItem['ext_id'] = Util.md5hash(videoItem['url'])
                video_list.append(videoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    # FIX: the original never returned the list it built
    return video_list
def parse_video(self, response):
    """Extract the first play link of each provider block on a cartoon
    listing page and normalize it.

    Returns a list of normalized urls, or None if an exception was
    caught.
    """
    try:
        hrefs = response.xpath(
            '//div[@id="listing"]/div/div[@class="content"]/div/div[@class="part"][1]/a[1]/@href'
        ).extract()
        normalized = []
        for href in hrefs:
            normalized.append(
                Util.normalize_url(Util.convert_url(href),
                                   channel='cartoon'))
        return normalized
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def compose_mvitem(self, response, title_list, pers, dirs, play_url, cat_id, poster_url, text):
    """wasu: assemble a MediaVideoItem from pre-extracted page fields.

    cat_id / poster_url are re-read from request meta (shadowing the
    parameters); videos come from parse_video_item.
    Returns the MediaVideoItem, or None when check_url rejects it.

    NOTE(review): if an exception fires before `mvitem` is assigned,
    the final `return mvitem` raises NameError — confirm intended.
    """
    try:
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        videoitems = []
        ep_item = MediaItem()
        if title_list:
            ep_item["title"] = title_list[0].strip()
        ep_item["actor"] = pers
        ep_item["director"] = dirs
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "wasu")
        if len(text) > 0:
            ep_item["intro"] = text[0].strip()
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        mid = self.getshowid(response.request.url)
        mvitem["media"]["cont_id"] = mid
        ttvitem = {}
        if title_list:
            ttvitem = self.parse_video_item(response, cat_id, play_url,
                                            title_list, None)
        if ttvitem:
            if 'video' in ttvitem and len(ttvitem['video']) > 0:
                mvitem['video'] = ttvitem['video']
                mvitem["media"]["info_id"] = Util.md5hash(
                    Util.summarize(mvitem["media"]))
                Util.set_ext_id(mvitem["media"], mvitem["video"])
                if untrack_id and sid:
                    mvitem["untrack_id"] = untrack_id
                    mvitem["sid"] = sid
                res = self.check_url(mvitem)
                if not res:
                    return None
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return mvitem
def parse_single_episode(self, response):
    """wasu: from a play page, find the media page and schedule it.

    Falls back to the category breadcrumb when the title link is
    missing; movies (incl. cartoon movies) have no media page at all
    and are parsed straight from the play page.
    """
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_single_episode: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        untrack_id = response.request.meta['untrack_id']
        sid = response.request.meta['sid']
        mid = response.request.meta[
            'mid'] if 'mid' in response.request.meta else ""
        poster_url = response.request.meta['poster_url']
        # locate the media page from the title link
        urls = response.xpath(
            '//div[@class="play_site mb10"]/div[1]/h3/a/@href').extract()
        if not urls:
            # the title does not reach the media page; go through the
            # category breadcrumb instead
            turls = response.xpath(
                '//div[@class="play_site mb10"]/div[1]/div[@class="play_seat"]/a/@href'
            ).extract()
            for turl in turls:
                tiurl = self.get_episode_url(turl)
                if tiurl:
                    urls.append(tiurl)
        if urls:
            for iurl in urls:
                if not Util.guess_site(iurl):
                    # relative url: make it absolute first
                    iurl = self.url_prefix + iurl
                surl = Util.normalize_url(iurl, "wasu")
                if surl and self.site_name == Util.guess_site(surl):
                    items.append(
                        Request(url=surl,
                                callback=self.parse_episode_info,
                                meta={
                                    'cat_id': cat_id,
                                    'poster_url': poster_url,
                                    'page': 1,
                                    "untrack_id": untrack_id,
                                    "sid": sid,
                                    "mid": mid
                                }))
        else:
            # movies / cartoon movies: no media page, only a play page
            titems = self.parse_play_page(response)
            for item in titems:
                if mid:
                    item['mid'] = mid
                items.append(item)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def media_parse(self, response):
    """Parse a media (album) page and hand off to the episode-list API.

    The originating play url travels in meta['url']; its numeric id is
    templated into a prefix url for building per-episode links.
    Returns a list of items/requests produced by api_media_info.

    FIX: the original built `items` but never returned it; it also
    logged `request_url` in the except block, which is unbound when the
    exception fires before that variable is assigned.
    """
    items = []
    media_url = response.request.url
    try:
        logging.log(logging.INFO, 'media url: %s' % media_url)
        mediaVideoItem = response.request.meta[
            'item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem[
            'media'] if 'media' in mediaVideoItem else MediaItem()
        # scrape media info from the album page
        title_class = "v-info v-info-album "
        div_class = "v-meta v-meta-album"
        v_title = '//div[@class="%s"]//h1[@class="title"]/span/text()'
        title_list = response.xpath(v_title % title_class).extract()
        title = Util.join_list_safely(title_list)
        if title:
            mediaItem['title'] = title
        mediaItem = self.pack_media_info(response, mediaItem, title_class,
                                         div_class)
        mediaItem['url'] = Util.normalize_url(media_url, self.site_code)
        # the play page that led here; its numeric id keys the video API
        request_url = response.meta['url']
        request_url = Util.normalize_url(request_url, self.site_code)
        r = re.compile('.*/(\d+).html')
        m = r.match(request_url)
        if m:
            vid = m.group(1)
            prefix_video_url = re.sub(vid, '%s', request_url)
            items.append(
                self.api_media_info(mediaVideoItem, vid, prefix_video_url))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'media url: %s' % media_url)
    # FIX: return the collected items (original returned None)
    return items
def parse_video(self, response):
    """Gather provider links (main list + popup menu) from a movie
    page and return them normalized.

    Returns a list of urls, or None if an exception was caught.
    """
    try:
        links = (
            response.xpath('//ul[@id="supplies"]/li/a/@href').extract()
            + response.xpath('//div[@class="menu"]//ul/li/a/@href').extract()
        )
        normalized = []
        for link in links:
            normalized.append(
                Util.normalize_url(Util.convert_url(link), channel='movie'))
        return normalized
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def compose_vitem(self, url_list, title_list, vnum):
    """Build a VideoItem for a sohu play url.

    Returns an empty VideoItem when url_list is empty or an exception
    is caught mid-way.
    """
    item = VideoItem()
    try:
        if url_list:
            if title_list:
                item["title"] = title_list[0].strip()
            normalized = Util.normalize_url(url_list[0], "sohu")
            item["url"] = normalized
            item["vnum"] = str(vnum)
            item["os_id"] = self.os_id
            item["ext_id"] = Util.md5hash(normalized)
            item["site_id"] = self.site_id
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return item
def resolve_video_item(self, dpara, page_num=1, isvariaty=False):
    """Turn one page of the video-list API response into VideoItems.

    *dpara* is the decoded API dict; *page_num* is the 1-based page
    (100 entries assumed per page when synthesizing episode numbers);
    *isvariaty* switches to date-style episode titles.
    Returns the list of VideoItems (empty on error).

    FIXES vs original: `nep.replace(i, '', 1)` discarded its result
    (str.replace returns a new string), so the 上/中/下 part markers
    were never stripped; and the function never returned `videos`.
    """
    videos = []
    page_num -= 1  # 0-based page offset for numbering
    try:
        if dpara and int(
                dpara.get('err')
        ) == 0 and 'data' in dpara and 'list' in dpara['data']:
            lst = dpara['data'].get('list', [])
            sameV = 1
            for index, item in enumerate(lst):
                videoItem = VideoItem()
                videoItem['cont_id'] = item.get('id')
                videoItem['url'] = Util.normalize_url(
                    item.get('url'), self.site_code)
                videoItem['thumb_url'] = item.get('capture')
                videoItem['os_id'] = self.os_id
                videoItem['site_id'] = self.site_id
                videoItem['ext_id'] = Util.md5hash(videoItem['url'])
                oep = item.get('epTitle', '')
                # strip one trailing 上/中/下 ("part 1/2/3") marker:
                # reverse, remove first occurrence, reverse back
                nep = oep[::-1]
                for marker in [u'上', u'中', u'下']:
                    nep = nep.replace(marker, '', 1)  # FIX: keep the result
                nep = nep[::-1]
                if isvariaty and nep and nep.isdigit() and len(nep) == 8:
                    # variety episode titled by 8-digit date: index-based vnum
                    videoItem['vnum'] = str(index + 1 + page_num * 100)
                    videoItem['title'] = item.get('title', '') + str(
                        videoItem['vnum'])
                elif isvariaty:
                    videoItem['title'] = oep if oep else item.get(
                        'title', '')
                    # empty date: fall back to the list index as episode no.
                    videoItem['vnum'] = str(index + 1 + page_num * 100)
                elif nep and nep.isdigit():
                    # '01' --> '1'
                    videoItem['vnum'] = str(int(float(nep)))
                    videoItem['title'] = item.get('title', '') + oep
                elif nep:
                    videoItem['vnum'] = str(index + 1 + page_num * 100)
                    videoItem['title'] = oep
                elif not nep:
                    videoItem['vnum'] = str(index + 1 + page_num * 100)
                    videoItem['title'] = item.get('title', '') + str(
                        videoItem['vnum']) + oep
                videos.append(videoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    # FIX: the original never returned the list it built
    return videos
def parse_video_item(self, response, cat_id, title, media_page_id):
    """Page through parse_video_item_media for a youku show until an
    empty batch is returned, accumulating all video items.

    Always returns the accumulated list (the `return` in `finally`
    also swallows any in-flight exception, as in the original design).
    """
    collected = []
    try:
        play_url = self.parse_play_url(response)
        if play_url:
            normalized = Util.normalize_url(play_url[0], "youku")
            show_id = self.get_youku_showid(normalized)
            page = 1
            while True:
                batch = self.parse_video_item_media(show_id, page)
                if not batch:
                    break
                collected = collected + batch
                page += 1
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    finally:
        return collected
def start_requests(self):
    """youku entry point.

    Full crawl: start from the variety channel list page. Targeted
    crawl (self.cat_urls injected): go straight to each page as a
    single episode, forwarding the tracking ids.
    """
    try:
        items = []
        if not self.cat_urls:
            cat_urls = [{'url': 'http://www.youku.com/v_olist/c_85',
                         'id': self.channel_map['variaty']}]
            '''
            cat_urls = [{'url':'http://www.youku.com/v_olist/c_96', 'id': self.channel_map['movie']},
                        {'url':'http://www.youku.com/v_olist/c_97', 'id': self.channel_map['tv']},
                        {'url':'http://www.youku.com/v_olist/c_85', 'id': self.channel_map['variaty']},
                        {'url':'http://www.youku.com/v_olist/c_100', 'id':self.channel_map['cartoon']}]
            '''
            for cat in cat_urls:
                items.append(Request(url=cat['url'],
                                     callback=self.parse_list,
                                     meta={'cat_id': cat['id'], 'page': 1}))
        else:
            for cat in self.cat_urls:
                turl = Util.normalize_url(cat['url'], "youku")
                items.append(Request(url=turl,
                                     callback=self.parse_single_episode,
                                     meta={'cat_id': cat["id"],
                                           'page': 1,
                                           "untrack_id": cat["untrack_id"],
                                           "sid": cat["sid"],
                                           "mid": cat["mid"]}))
        return items
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def start_requests(self):
    """baofeng entry point.

    Resolves channel ids, then: test mode crawls a single page; full
    crawl starts from the four channel list pages; targeted crawl
    (self._cat_urls injected) goes straight to each single episode.
    """
    try:
        items = []
        # resolve channel ids from the manager once, up front
        self.movie_id = str(self.mgr.get_channel('电影')["channel_id"])
        self.tv_id = str(self.mgr.get_channel('电视剧')["channel_id"])
        self.variety_id = str(self.mgr.get_channel('综艺')["channel_id"])
        self.cartoon_id = str(self.mgr.get_channel('动漫')["channel_id"])
        self.channel_info = {
            self.movie_id: u"电影",
            self.tv_id: u"电视剧",
            self.variety_id: u"综艺",
            self.cartoon_id: u"动漫"
        }
        if self.test_page_url:
            # NOTE(review): turl is computed but the raw test_page_url is
            # requested — confirm whether the normalized url was intended
            turl = Util.normalize_url(self.test_page_url, "baofeng")
            items.append(
                Request(url=self.test_page_url,
                        callback=self.parse_page,
                        meta={
                            'cat_id': self.test_channel_id,
                            'page': 1
                        }))
            return items
        if not self._cat_urls:
            if self.global_spider:
                cat_urls = [{
                    'url': 'http://www.baofeng.com/movie/682/list-sid-1-p-1.shtml',
                    'id': self.movie_id
                }, {
                    'url': 'http://www.baofeng.com/tv/914/list-type-2-ishot-1-sid-1-p-1.shtml',
                    'id': self.tv_id
                }, {
                    'url': 'http://www.baofeng.com/enc/444/list-type-4-ishot-1-sid-1-p-1.shtml',
                    'id': self.variety_id
                }, {
                    'url': 'http://www.baofeng.com/comic/924/list-type-3-ishot-1-sid-1-p-1.shtml',
                    'id': self.cartoon_id
                }]
                #cat_urls = [{'url':'http://www.baofeng.com/enc/444/list-type-4-ishot-1-sid-1-p-1.shtml','id':self.variety_id}]
                for cat in cat_urls:
                    items.append(
                        Request(url=cat['url'],
                                callback=self.parse_area,
                                meta={
                                    'cat_id': cat['id'],
                                    'page': 1
                                }))
                    #items.append(Request(url=cat['url'], callback=self.parse_page, meta={'cat_id': cat['id'],'page':1}))
        else:
            for cat in self._cat_urls:
                turl = Util.normalize_url(cat['url'], "baofeng")
                items.append(
                    Request(url=turl,
                            callback=self.parse_single_episode,
                            meta={
                                'cat_id': cat["id"],
                                'page': 1,
                                "poster_url": "",
                                "untrack_id": cat["untrack_id"],
                                "sid": cat["sid"]
                            }))
        return items
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def parse_episode_info(self, response):
    """baofeng media page -> MediaVideoItem.

    Extracts title/director/actor/type/district/year/intro, skips
    trailer pages (title containing u'预:'), then collects the episode
    list via parse_video_item and emits the composed item when
    check_url accepts it.
    """
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        year_list = []
        lyears = []
        title_list = response.xpath(
            '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/h3/a/@title'
        ).extract()
        director_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' %
            u'导演:').extract()
        performer_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' %
            u'主演:').extract()
        type_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' %
            u'类型:').extract()
        district_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' %
            u'地区:').extract()
        # the district span also carries the year as trailing text
        year_info = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/text()' %
            u'地区:').extract()
        year = None
        if len(year_info) >= 2:
            year = self.get_year(year_info[1])
        #year_list = response.xpath('//div[@class="mod plot"]/ul[@class="filter"]/li[@class="v-year"]/a/em/text()').extract()
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        districts = Util.join_list_safely(district_list)
        # intro text
        text = response.xpath(
            '//div[@class="juqing briefTab"]/div/text()').extract()
        # score (extracted but not stored on the item)
        score = response.xpath(
            '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/div[1]/div[@class="score"]/div[class="score-num"]/strong/text()'
        ).extract()
        play_url = ""
        tplay_url = response.xpath(
            '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/div[@class="sourcePlay"]/a[@id="moviePlayButton"]/@href'
        ).extract()
        if tplay_url:
            play_url = self.url_prefix + tplay_url[0].strip()
        videoitems = []
        ep_item = MediaItem()
        if title_list:
            ep_item["title"] = title_list[0]
            # u'预:' marks a trailer page: nothing to collect
            if ep_item["title"].find(u'预:') >= 0:
                print "预告片,url", response.request.url
                return items
        ep_item["actor"] = pers
        ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        # NOTE(review): presence test uses district_list but the value
        # stored is the joined `districts` string
        if district_list:
            ep_item["district"] = districts
        if year:
            ep_item["release_date"] = Util.str2date(year)
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "baofeng")
        if len(text) > 0:
            ep_item["intro"] = text[0].strip()
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        vurl = ""
        videoid = self.getshowid(response.request.url)
        mvitem["media"]["cont_id"] = videoid
        ttvitem = {}
        if title_list:
            ttvitem = self.parse_video_item(response, cat_id, play_url,
                                            title_list, None)
        if ttvitem:
            if 'video' in ttvitem and len(ttvitem['video']) > 0:
                mvitem['video'] = ttvitem['video']
                mvitem["media"]["info_id"] = Util.md5hash(
                    Util.summarize(mvitem["media"]))
                Util.set_ext_id(mvitem["media"], mvitem["video"])
                if untrack_id and sid:
                    mvitem["untrack_id"] = untrack_id
                    mvitem["sid"] = sid
                res = self.check_url(mvitem)
                #if self.check_url(mvitem):
                if res:
                    items.append(mvitem)
                pass
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def parse_episode_info(self, response):
    """tudou album page -> MediaVideoItem.

    Media fields fall back from request meta to page content; the
    episode list comes from the getAlbumvoInfo JSON API, skipping
    unplayable entries and trailers.

    NOTE(review): `items` is assigned inside try after an early block;
    if an exception fires before that point, the trailing
    `return items` raises NameError — confirm intended.
    """
    try:
        logging.log(logging.INFO,
                    'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        title = response.request.meta['title']
        actor = response.request.meta['actor']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        items = []
        # fall back to page content for any fields missing from meta
        if not poster_url:
            poster_url_list = response.xpath(
                '//div[@class="cover_img"]/div[@class="pack pack_album"]/div[@class="pic"]/img/@src'
            ).extract()
            if poster_url_list:
                poster_url = poster_url_list[0]
        if not title:
            title_list = response.xpath(
                '//div[@class="cover_info"]/h2/strong/@title').extract()
            if title_list:
                title = title_list[0]
        if not actor:
            #actor_list = response.xpath('//div[@class="cover_keys"]/span/a/text()').extract()
            actor_list = response.xpath(
                '//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'
                % u' 主演:').extract()
            if actor_list:
                actor = Util.join_list_safely(actor_list)
                #actor = "|".join([t.strip() for t in actor_list])
        # performers
        pers = actor
        type_list = response.xpath(
            '//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' %
            u'类型:\n').extract()
        district_list = response.xpath(
            '//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' %
            u'地区:').extract()
        release_date_list = response.xpath(
            '//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' %
            u'年代:').extract()
        types = None
        if type_list:
            types = Util.join_list_safely(type_list)
        # director: u'编导:' first, then plain u'导演:'
        director_list = response.xpath(
            '//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' %
            u'编导:').extract()
        if not director_list:
            director_list = response.xpath(
                '//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'
                % u'导演:').extract()
        dirs = Util.join_list_safely(director_list)
        #dirs = "|".join([t.strip() for t in director_list])
        # intro text
        text = response.xpath(
            '//div[@class="cover_info"]/div[@class="desc"]/p/text()'
        ).extract()
        # source id parsed from the album url
        sourceid = self.get_tudou_showid(response.request.url)
        videoitems = []
        ep_item = MediaItem()
        if len(title) > 0:
            ep_item["title"] = title
        if len(pers) > 0:
            ep_item["actor"] = pers
        if len(dirs) > 0:
            ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if district_list:
            ep_item["district"] = district_list[0].strip()
        if release_date_list:
            ep_item["release_date"] = Util.str2date(release_date_list[0])
        #ep_item["info_id"] = Util.md5hash(tinfo)
        ep_item["cont_id"] = sourceid
        ep_item["site_id"] = self.site_id
        ep_item["url"] = response.request.url
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        if len(text) > 0:
            ep_item["intro"] = text[0]
        mvitem = MediaVideoItem();
        mvitem["media"] = ep_item;
        mvitem["video"] = videoitems
        # album episode list from the JSON API
        lurl = "http://www.tudou.com/crp/getAlbumvoInfo.action?charset=utf-8&areaCode=110000&acode=" + str(sourceid)
        info = self.httpdownload.get_data(lurl)
        jinfo = json.loads(info)
        if "items" in jinfo:
            for sitem in jinfo["items"]:
                vitem = VideoItem()
                vitem["title"] = sitem["itemTitle"]
                vitem["vnum"] = sitem["episode"]
                vitem["os_id"] = self.os_id
                trailer = sitem['trailer']
                # unplayable entry
                if not sitem["itemPlayUrl"]:
                    continue
                # trailer: skip
                if trailer:
                    continue
                turl = Util.normalize_url(sitem["itemPlayUrl"], "tudou")
                vitem["url"] = turl
                vitem["os_id"] = self.os_id
                vitem["site_id"] = self.site_id
                vitem["ext_id"] = Util.md5hash(turl)
                vitem["cont_id"] = self.get_tudou_showid(turl)
                #if "ext_id" not in mvitem["media"]:
                #    mvitem["media"]["ext_id"] = vitem["ext_id"]
                #vitem["media_ext_id"] = vitem["ext_id"]
                mvitem["video"].append(vitem)
        if len(mvitem["video"]) > 0:
            Util.set_ext_id(mvitem["media"], mvitem["video"])
            mvitem["media"]["info_id"] = Util.md5hash(
                Util.summarize(mvitem["media"]))
            if untrack_id:
                mvitem["untrack_id"] = untrack_id
            if sid:
                mvitem["sid"] = sid
            if self.check_url(mvitem):
                items.append(mvitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def start_requests(self):
    """sohu entry point.

    Resolves channel ids; then one of four modes: single test page,
    single command-line url, full crawl over the four channel list
    pages, or targeted crawl over injected _cat_urls.
    """
    try:
        items = []
        self.movie_id = str(self.mgr.get_channel('电影')["channel_id"])
        self.tv_id = str(self.mgr.get_channel('电视剧')["channel_id"])
        self.variety_id = str(self.mgr.get_channel('综艺')["channel_id"])
        self.cartoon_id = str(self.mgr.get_channel('动漫')["channel_id"])
        self.channel_info = {
            self.movie_id: u"电影",
            self.tv_id: u"电视剧",
            self.variety_id: u"综艺",
            self.cartoon_id: u"动漫"
        }
        if self.test_page_url:
            # NOTE(review): turl is computed but the raw test_page_url is
            # requested — confirm whether the normalized url was intended
            turl = Util.normalize_url(self.test_page_url, "sohu")
            items.append(
                Request(url=self.test_page_url,
                        callback=self.parse_page,
                        meta={
                            'cat_id': self.test_channel_id,
                            'page': 1
                        }))
            return items
        if self.cmd_json:
            items.append(
                Request(url=self.cmd_json['url'],
                        callback=self.parse_episode_info,
                        meta={
                            'cat_id': self.cmd_json["id"],
                            'poster_url': ''
                        }))
            return items
        if not self._cat_urls:
            #cat_urls = [{'url':'http://so.tv.sohu.com/list_p1106_p2_p3_p4_p5_p6_p73_p8_p9_p10_p11_p12_p13.html','id':self.variety_id}]
            cat_urls = [{
                'url': 'http://so.tv.sohu.com/list_p1100_p2_p3_p4_p5_p6_p73_p80_p9_2d1_p10_p11_p12_p13.html',
                'id': self.movie_id
            }, {
                'url': 'http://so.tv.sohu.com/list_p1101_p2_p3_p4_p5_p6_p73_p8_p9_p10_p11_p12_p13.html',
                'id': self.tv_id
            }, {
                'url': 'http://so.tv.sohu.com/list_p1106_p2_p3_p4_p5_p6_p73_p8_p9_p10_p11_p12_p13.html',
                'id': self.variety_id
            }, {
                'url': 'http://so.tv.sohu.com/list_p1115_p2_p3_p4_p5_p6_p73_p8_p9_p10_p11_p12_p13.html',
                'id': self.cartoon_id
            }]
            #cat_urls = [{'url':'http://so.tv.sohu.com/list_p1100_p2_p3_p4_p5_p6_p73_p80_p9_2d1_p10_p11_p12_p13.html','id':self.movie_id}]
            for cat in cat_urls:
                items.append(
                    Request(url=cat['url'],
                            callback=self.parse_type,
                            meta={
                                'cat_id': cat['id'],
                                'page': 1
                            }))
        else:
            for cat in self._cat_urls:
                items.append(
                    Request(url=cat['url'],
                            callback=self.parse_single_episode,
                            meta={
                                'cat_id': cat["id"],
                                'page': 1,
                                "untrack_id": cat["untrack_id"],
                                "sid": cat["sid"],
                                "mid": cat["mid"]
                            }))
        return items
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def parse_single_episode(self, response):
    """sohu: from a play page, reach the media page via the breadcrumb.

    When the breadcrumb link is unusable (paid movies / special page
    types), the media info is parsed from the play page itself; as a
    last resort the playlist id is scraped from inline JS and the API
    is queried directly.
    """
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_single_episode: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        untrack_id = response.request.meta['untrack_id']
        sid = response.request.meta['sid']
        mid = response.request.meta[
            'mid'] if 'mid' in response.request.meta else ""
        playtype_list = response.selector.re(
            re.compile(r'var pagetype = .*?(\D+)'))
        # new page type seen: http://tv.sohu.com/20100804/n273985736.shtml
        # http://my.tv.sohu.com/us/49390690/29200993.shtml cannot be
        # crawled with the existing logic
        urls = response.xpath(
            '//div[@id="crumbsBar"]/div[@class="area cfix"]/div[@class="left"]/div[@class="crumbs"]/a[last()]'
        )
        attributes = urls.xpath('./@*').extract()
        size = len(attributes)
        urls = urls.xpath('./@href').extract()
        # a single-attribute anchor with a href and no special pagetype
        # means a normal media-page link
        if size == 1 and urls and not playtype_list:
            for iurl in urls:
                surl = Util.normalize_url(iurl, "sohu")
                if surl and "http" in surl:
                    items.append(
                        Request(url=surl,
                                callback=self.parse_episode_info,
                                meta={
                                    'cat_id': cat_id,
                                    'poster_url': '',
                                    'page': 1,
                                    "untrack_id": untrack_id,
                                    "sid": sid,
                                    "mid": mid
                                }))
        # paid movie: cannot jump to the media page
        else:
            mvitem = self.parse_episode_play(response, untrack_id, sid)
            if mid:
                mvitem['mid'] = mid
            if mvitem and "media" in mvitem and "url" in mvitem[
                    "media"] and "ext_id" in mvitem["media"]:
                if self.check_url(mvitem):
                    items.append(mvitem)
        if not items:
            # final fallback: scrape the playlist id from inline JS and
            # query the episode API directly
            mvitem = MediaVideoItem()
            if mid:
                mvitem['mid'] = mid
            if untrack_id and sid:
                mvitem["untrack_id"] = untrack_id
                mvitem["sid"] = sid
            ep_item = MediaItem()
            ep_item["site_id"] = self.site_id
            ep_item["channel_id"] = cat_id
            mvitem["media"] = ep_item
            playlistId = ""
            playlistId_list = response.selector.re(
                re.compile(r'var playlistId.*?(\d+)'))
            if not playlistId_list:
                playlistId_list = response.selector.re(
                    re.compile(r'var PLAYLIST_ID.*?(\d+)'))
            if not playlistId_list:
                playlistId_list = response.selector.re(
                    re.compile(r'= playlistId.*?(\d+)'))
            if playlistId_list:
                playlistId = playlistId_list[0]
            items += self.api_episode_info(mvItem=mvitem,
                                           playlistId=playlistId,
                                           cat_id=cat_id)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def parse_episode_info(self, response):
    """youku media page -> MediaVideoItem with media info and episodes.

    Extracts title/actor/director/district/type/date/count from the
    page, builds the MediaItem, then collects the episode list via
    parse_video_item. Returns a one-element item list (empty on error).

    FIXES vs original: `if dirs > 0:` compared a string to an int
    (always True on py2, TypeError on py3) — intended `if dirs:` to
    match the sibling `if pers:`; `items` is now initialized before the
    try so the final `return items` cannot raise NameError on an early
    exception.
    """
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        page_id = self.get_youku_pageid(response.request.url)
        if not page_id:
            log.error('miss content id: %s' % response.request.url)
            return
        untrack_id = response.request.meta.get('untrack_id', "")
        sid = response.request.meta.get('sid', "")
        mid = response.request.meta.get('mid', "")
        title = self.parse_title(response, cat_id)
        performer_list = self.parse_actor(response)
        director_list = self.parse_director(response)
        district_list = response.xpath(
            '//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'
            % u'地区:').extract()
        type_list = response.xpath(
            '//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'
            % u'类型:').extract()
        play_date = self.parse_play_date(response)
        total_num = self.parse_total_num(response)
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        # intro text
        text = response.xpath('//div[@class="detail"]/span/text()').extract()
        ep_item = MediaItem()
        if title:
            ep_item["title"] = title[0].strip()
        if pers:
            ep_item["actor"] = pers
        if dirs:  # FIX: was `if dirs > 0:` (str-vs-int comparison)
            ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if district_list:
            ep_item["district"] = district_list[0].strip()
        if play_date:
            ep_item["release_date"] = Util.str2date(play_date)
        if total_num:
            ep_item["vcount"] = total_num
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "youku")
        if text:
            ep_item["intro"] = text[0].strip()
        ep_item["cont_id"] = page_id
        ep_item["info_id"] = Util.md5hash(Util.summarize(ep_item))
        mvitem = MediaVideoItem()
        if mid:
            mvitem['mid'] = mid
        mvitem["media"] = ep_item
        if untrack_id:
            mvitem["untrack_id"] = untrack_id
        if sid:
            mvitem["sid"] = sid
        video_list = self.parse_video_item(response, cat_id,
                                           ep_item["title"], page_id)
        mvitem['video'] = video_list
        Util.set_ext_id(mvitem["media"], mvitem["video"])
        items.append(mvitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def start_requests(self):
    """qq entry point.

    Resolves channel ids; then: single test page, full crawl over the
    four channel list pages, or targeted crawl over injected _cat_urls
    (each going straight to parse_single_episode).
    """
    items = []
    try:
        cat_urls = []
        self.movie_id = self.mgr.get_channel('电影')["channel_id"]
        self.tv_id = self.mgr.get_channel('电视剧')["channel_id"]
        self.variety_id = self.mgr.get_channel('综艺')["channel_id"]
        self.cartoon_id = self.mgr.get_channel('动漫')["channel_id"]
        self.channel_info = {
            self.movie_id: u"电影",
            self.tv_id: u"电视剧",
            self.variety_id: u"综艺",
            self.cartoon_id: u"动漫"
        }
        if self.test_page_url:
            # NOTE(review): turl is computed but the raw test_page_url is
            # requested — confirm whether the normalized url was intended
            turl = Util.normalize_url(self.test_page_url, "qq")
            items.append(
                Request(url=self.test_page_url,
                        callback=self.parse_single_episode,
                        meta={
                            'cat_id': self.test_channel_id,
                            'page': 1
                        }))
            return items
        if not self._cat_urls:
            #cat_urls = [{'url':'http://v.qq.com/list/2_-1_-1_-1_0_1_1_10_-1_-1_0.html','id':self.tv_id}]
            cat_urls = [{
                'url': 'http://v.qq.com/movielist/10001/0/0/1/0/10/1/0.html',
                'id': self.movie_id
            }, {
                'url': 'http://v.qq.com/list/2_-1_-1_-1_0_1_1_10_-1_-1_0.html',
                'id': self.tv_id
            }, {
                'url': 'http://v.qq.com/variety/type/list_-1_0_0.html',
                'id': self.variety_id
            }, {
                'url': 'http://v.qq.com/cartlist/0/3_-1_-1_-1_-1_1_0_1_10.html',
                'id': self.cartoon_id
            }]
            for cat in cat_urls:
                items.append(
                    Request(url=cat['url'],
                            callback=self.parse_type,
                            meta={
                                'cat_id': cat['id'],
                                'page': 1
                            }))
        else:
            for cat in self._cat_urls:
                channel_id = str(cat["id"])
                items.append(
                    Request(url=cat['url'],
                            callback=self.parse_single_episode,
                            meta={
                                'cat_id': channel_id,
                                'page': 1,
                                "untrack_id": cat["untrack_id"],
                                "sid": cat["sid"]
                            }))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def parse_episode_info(self, response):
    """Parse a sohu media detail page into a MediaVideoItem.

    Extracts the playlist id from inline JavaScript, scrapes the media
    metadata (title/actor/director/district/type/year/intro) via helper
    parsers, attaches the per-episode video list, and falls back to the
    API-based path (``api_episode_info``) when page scraping yields nothing.

    Args:
        response: Scrapy response for a media detail page; its meta must
            carry ``cat_id`` and ``poster_url`` and may carry
            ``untrack_id``/``sid``/``mid``.

    Returns:
        list: zero or more MediaVideoItem instances (errors are logged).
    """
    items = []
    try:
        logging.log(logging.INFO, 'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        mid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        if "mid" in response.request.meta:
            mid = response.request.meta['mid']
        # The playlist id lives in inline JS; the page markup has (at least)
        # three historical spellings, tried in order.
        playlistId = ""
        playlistId_list = response.selector.re(
            re.compile(r'var playlistId.*?(\d+)'))
        if not playlistId_list:
            playlistId_list = response.selector.re(
                re.compile(r'var PLAYLIST_ID.*?(\d+)'))
        if not playlistId_list:
            playlistId_list = response.selector.re(
                re.compile(r'= playlistId.*?(\d+)'))
        if playlistId_list:
            playlistId = playlistId_list[0]
        if not playlistId:
            # Without a playlist id we can neither key the media nor fetch
            # its episode list — give up on this page.
            logging.log(
                logging.INFO,
                "parse_episode_info error,not find playlistid,url:%s " %
                response.request.url)
            return items
        title_list = self.parse_title(response, cat_id)
        performer_list = self.parse_actor(response)
        director_list = self.parse_director(response)
        district_list = self.parse_district(response)
        type_list = self.parse_type_list(response)
        year_list = self.parse_year(response)
        year = None
        if year_list:
            year = year_list[0]
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        district = Util.join_list_safely(district_list)
        # Intro/synopsis text.
        text = response.xpath(
            '//div[@class="movieCont mod"]/p[1]/span[@class="full_intro"]/text()'
        ).extract()
        # NOTE(review): the original also extracted an unused ``play_url``
        # and initialized unused ``lyears``/``videoitems``/``vurl`` locals;
        # they were dead and have been removed.
        ep_item = MediaItem()
        if title_list:
            ep_item["title"] = title_list[0]
        ep_item["actor"] = pers
        ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if district:
            ep_item["district"] = district
        if year:
            ep_item["release_date"] = Util.str2date(year)
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "sohu")
        playlistId = str(playlistId)
        ep_item["cont_id"] = playlistId
        if len(text) > 0:
            ep_item["intro"] = text[0].strip()
        mvitem = MediaVideoItem()
        if mid:
            mvitem['mid'] = mid
        if untrack_id and sid:
            mvitem["untrack_id"] = untrack_id
            mvitem["sid"] = sid
        mvitem["media"] = ep_item
        ttvitem = []
        if title_list:
            ttvitem = self.parse_video_item(cat_id, playlistId)
        if ttvitem:
            mvitem['video'] = ttvitem
            mvitem["media"]["info_id"] = Util.md5hash(
                Util.summarize(mvitem["media"]))
            Util.set_ext_id(mvitem["media"], mvitem["video"])
            if self.check_url(mvitem):
                items.append(mvitem)
        # Page scraping produced nothing usable — fall back to the API path.
        if not items and playlistId:
            items += self.api_episode_info(mvitem, playlistId, cat_id=cat_id)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def parse_episode_play(self, response, untrack_id, sid):
    """Parse a sohu play page (single video) into one MediaVideoItem.

    Unlike the detail-page parser this reads metadata straight from the
    player page (title from the crumbs bar, director/actor/intro from the
    info panel) and requires both a playlist id and a video id from the
    inline JavaScript; returns None if either is missing or on error.

    Args:
        response: Scrapy response for a play page; meta must carry ``cat_id``.
        untrack_id: tracking id propagated into the result when non-empty.
        sid: source id propagated into the result when non-empty.

    Returns:
        MediaVideoItem or None.
    """
    mvitem = None
    try:
        logging.log(logging.INFO, 'parse_episode_play: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        #vip
        title_list = response.xpath(
            '//div[@id="crumbsBar"]/div[@class="area cfix"]/div[@class="left"]/h2/@title'
        ).extract()
        director_list = response.xpath(
            '//div[@class="info info-con"]/ul/li[text()="%s"]/a/text()' %
            u'导演:').extract()
        performer_list = response.xpath(
            '//div[@class="info info-con"]/ul/li[text()="%s"]/a/text()' %
            u'主演:').extract()
        text = response.xpath(
            '//div[@class="info info-con"]/p[@class="intro"]/text()'
        ).extract()
        pers = "|".join([t.strip() for t in performer_list])
        dirs = "|".join([t.strip() for t in director_list])
        # Playlist id from inline JS; three historical spellings tried in order.
        playlistId = ""
        playlistId_list = response.selector.re(
            re.compile(r'var playlistId.*?(\d+)'))
        if not playlistId_list:
            playlistId_list = response.selector.re(
                re.compile(r'var PLAYLIST_ID.*?(\d+)'))
        if not playlistId_list:
            playlistId_list = response.selector.re(
                re.compile(r'= playlistId.*?(\d+)'))
        if playlistId_list:
            playlistId = playlistId_list[0]
        # The concrete video id is a separate inline JS variable.
        vid = ""
        vid_list = response.selector.re(re.compile(r'var vid.*?(\d+)'))
        if vid_list:
            vid = vid_list[0]
        if not playlistId or not vid:
            # Both ids are required to key the media and the video.
            return mvitem
        ep_item = MediaItem()
        ep_item["cont_id"] = playlistId
        if title_list:
            ep_item["title"] = title_list[0]
        ep_item["actor"] = pers
        ep_item["director"] = dirs
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["url"] = Util.normalize_url(response.request.url, "sohu")
        if text:
            ep_item["intro"] = text[0].strip()
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        if untrack_id:
            mvitem["untrack_id"] = untrack_id
        if sid:
            mvitem["sid"] = sid
        # A play page yields exactly one video entry, numbered "1".
        vitem = VideoItem()
        vitem["title"] = ep_item["title"] if 'title' in ep_item else None
        vitem["url"] = ep_item["url"]
        vitem["vnum"] = "1"
        vitem["os_id"] = self.os_id
        vitem["ext_id"] = Util.md5hash(ep_item["url"])
        vitem["site_id"] = self.site_id
        vitem["cont_id"] = vid
        videoitems = []
        videoitems.append(vitem)
        mvitem["video"] = videoitems
        mvitem["media"]["info_id"] = Util.md5hash(
            Util.summarize(mvitem["media"]))
        Util.set_ext_id(mvitem["media"], mvitem["video"])
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return mvitem
def parse_play_list(self, cat_id, url, flag, response):
    """Fetch a qq JSONP playlist and convert it to VideoItem entries.

    Downloads ``url`` (a JSONP endpoint), strips the ``flag`` callback
    wrapper to recover the JSON payload, and builds one VideoItem per
    playlist entry whose episode number is purely numeric, skipping
    trailers (titles containing 预告).

    Args:
        cat_id: channel id; tv/cartoon channels get the "tv" URL
            normalization, everything else the default.
        url: JSONP playlist URL (must contain "qq.com" for items to be kept).
        flag: JSONP callback name wrapping the payload.
        response: unused; kept for interface compatibility with callers.

    Returns:
        list[VideoItem]: parsed episodes (empty on any failure).
    """
    videoitems = []
    try:
        # NOTE(review): the original also built an unused
        # MediaVideoItem/MediaItem pair here; it was dead code (only
        # ``videoitems`` is ever returned) and has been removed.
        info = None
        try:
            info = self.httpdownload.get_data(url)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
            return videoitems
        if not info or len(info) < 2:
            return videoitems
        # Strip the JSONP wrapper: keep what lies between "<flag>(" and
        # the trailing ")".
        msg = info
        bodylen = len(msg) - 1
        index = msg.find(flag) + len(flag) + 1
        info = msg[index:bodylen]
        jinfo = json.loads(info)
        if "video_play_list" not in jinfo:
            return videoitems
        itemlist = jinfo["video_play_list"]["playlist"]
        for titem in itemlist:
            if "episode_number" not in titem:
                continue
            info = titem["episode_number"]
            if info and titem["title"].find(u"预告") < 0 and url.find(
                    "qq.com") >= 0:
                vitem = VideoItem()
                vitem["title"] = titem["title"]
                tvnum = string.replace(info, "-", "")
                # Episode "number" can be a non-numeric string, e.g.
                # http://v.qq.com/detail/x/xk98t8hntls72f4.html — skip those.
                tvnum_list = re.findall(r'[\D]+', tvnum)
                if not tvnum_list:
                    vitem["vnum"] = tvnum
                else:
                    continue
                vitem["os_id"] = self.os_id
                vitem["site_id"] = self.site_id
                turl = ""
                if int(cat_id) == int(self.tv_id) or int(cat_id) == int(
                        self.cartoon_id):
                    turl = Util.normalize_url(titem["url"], "qq", "tv")
                else:
                    turl = Util.normalize_url(titem["url"], "qq")
                if turl:
                    vitem["ext_id"] = Util.md5hash(turl)
                    vitem["url"] = turl
                    vitem["cont_id"] = self.get_qq_showid(vitem["url"])
                else:
                    continue
                videoitems.append(vitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return videoitems
def parse_episode_info(self, response):
    """Parse a qq media detail page into a MediaVideoItem.

    Scrapes title/performers/directors/intro/type/year from several
    alternative page layouts (each field tries a primary XPath, then a
    fallback), then loads the episode list via the ``loadplaylist`` JSONP
    endpoint keyed by the page's ``sourceid``; topic pages without a
    sourceid fall back to ``parse_topic_play_list``.

    Args:
        response: Scrapy response whose meta carries ``cat_id`` and
            ``poster_url``, optionally ``untrack_id``/``sid``.

    Returns:
        list: zero or more MediaVideoItem instances (errors are logged).
    """
    items = []
    try:
        logging.log(logging.INFO, 'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        # Title: four layout variants, tried in order.
        title = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_title"]/strong/a/text()'
        ).extract()
        if not title or not title[0]:
            title = response.xpath(
                '//div[@class="mod_box mod_video_info"]/div[@class="mod_hd mod_hd_border"]/h1/strong/@title'
            ).extract()
        if not title or not title[0]:
            title = response.xpath(
                '//div[@class="mod_box mod_video_info"]/div[@class="mod_hd mod_hd_border"]/h2/strong/@title'
            ).extract()
        if not title or not title[0]:
            title = response.xpath(
                '//div[@class="mod_page_banner"]/div[@class="banner_pic"]/a/@title'
            ).extract()
        # Performers: rich-intro layout first, labeled info line as fallback.
        performer_list = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line cf"]/div[@class="info_cast"]/a/span/text()'
        ).extract()
        if not performer_list:
            performer_list = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/span/text()'
                % u'主演:').extract()
        # Directors: same two-layout strategy.
        director_list = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line cf"]/div[@class="info_director"]/a/span/text()'
        ).extract()
        if not director_list:
            director_list = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/span/text()'
                % u'导演:').extract()
        # Intro text.
        text = response.xpath(
            '//div[@class="movie_info_wrap"]/div[1]/d1[1]/dd[3]/p[@class="detail_all"]/text()'
        ).extract()
        if not text:
            # FIX: the original extracted this fallback but discarded the
            # result instead of assigning it to ``text``, so the fallback
            # intro was never used.
            text = response.xpath(
                '//div[@class="mod_video_focus"]/div[@class="info_desc"]/span[@class="desc"]/text()'
            ).extract()
        type_list = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line info_line_tags cf"]/div[@class="info_tags"]/a/span/text()'
        ).extract()
        if not type_list:
            type_list = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/text()'
                % u'类型:').extract()
        year_info = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_title"]/span[@class="video_current_state"]/span[@class="current_state"]/text()'
        ).extract()
        if not year_info:
            year_info = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/text()'
                % u'年份:').extract()
        play_date = None
        if year_info:
            play_date = self.get_year(year_info[0])
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        pers = Util.join_list_safely(performer_list)
        # sourceid keys the loadplaylist JSONP endpoint below.
        sourceid = ""
        sourceid_list = response.xpath(
            '//div[@class="mod_bd sourceCont"]/@sourceid').extract()
        if sourceid_list:
            sourceid = sourceid_list[0]
        videoitems = []
        ep_item = MediaItem()
        if len(title) > 0:
            ep_item["title"] = title[0]
        if len(pers) > 0:
            ep_item["actor"] = pers
        if len(dirs) > 0:
            ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if play_date:
            ep_item["release_date"] = Util.str2date(play_date)
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["url"] = Util.normalize_url(response.request.url, "qq")
        ep_item["poster_url"] = poster_url
        if len(text) > 0:
            ep_item["intro"] = text[0]
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        mvitem["video"] = videoitems
        url_pre = "http://s.video.qq.com/loadplaylist?vkey="
        url_tail = "&vtype=2&otype=json&video_type=2&callback=jQuery191048201349820010364_1425370006500&low_login=1"
        videoid = self.get_qq_showid(response.request.url)
        mvitem["media"]["cont_id"] = videoid
        mvitem["media"]["info_id"] = Util.md5hash(
            Util.summarize(mvitem["media"]))
        vurl = url_pre + str(sourceid) + url_tail
        tflag = "jQuery191048201349820010364_1425370006500"
        tpitem = self.parse_play_list(cat_id, vurl, tflag, response)
        # No sourceid (e.g. topic pages): scrape the episode list from the
        # page itself and take the cid input as the content id.
        if not tpitem:
            tpitem = self.parse_topic_play_list(response)
            videoids = response.xpath(
                '//div[@class="mod_episodes_info episodes_info"]/input[@name="cid"]/@value'
            ).extract()
            if videoids:
                mvitem["media"]["cont_id"] = videoids[0]
        if tpitem:
            mvitem["video"] = tpitem
            Util.set_ext_id(mvitem["media"], mvitem["video"])
            if untrack_id:
                mvitem["untrack_id"] = untrack_id
            if sid:
                mvitem["sid"] = sid
            if self.check_url(mvitem):
                items.append(mvitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def parse_episode_play(self, response):
    """Parse a qq play page into a MediaVideoItem.

    For non-movie channels the episode list is scraped from the album
    tab (one VideoItem per episode); for movies a single VideoItem with
    vnum "1" is built from the page itself.

    Args:
        response: Scrapy response whose meta carries ``cat_id`` and
            optionally ``untrack_id``/``sid``.

    Returns:
        MediaVideoItem or None (errors are logged).
    """
    mvitem = None
    try:
        logging.log(logging.INFO, 'parse_episode_play: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = ""
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        # Title: movie layout first, intro panel as fallback.
        title_list = response.xpath(
            '//div[@class="movie_info"]/div[@class="title_wrap"]/h3/a/@title'
        ).extract()
        if not title_list:
            title_list = response.xpath(
                '//div[@class="intro_lt"]/div[@class="intro_title cf"]/p[@class="title_cn"]/text()'
            ).extract()
        performer_list = response.xpath(
            '//div[@class="movie_info"]/div[@class="movie_detail"]/dl[@class="detail_list"]/dd[@class="actor"]/a/text()'
        ).extract()
        director_list = response.xpath(
            '//div[@class="movie_info"]/div[@class="movie_detail"]/dl[@class="detail_list"]/dd[@class="type"]/span[text()="%s"]/a/text()'
            % u'导演:').extract()
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        text = response.xpath(
            '//div[@class="movie_info_wrap"]/div[1]/d1[1]/dd[3]/p[@class="detail_all"]/text()'
        ).extract()
        ep_item = MediaItem()
        videoitems = []
        if int(cat_id) != int(self.movie_id):
            # Series/variety/cartoon: one VideoItem per album-list entry.
            video_list = response.xpath(
                '//div[@class="tabcont_warp tabcont_warp_yespadding"]/div[@class="tabcont_album"]/ul[@class="album_list cf"]/li'
            )
            for tvideo in video_list:
                lurl = tvideo.xpath('./a/@href').extract()
                surl = ""
                lnum = tvideo.xpath('./a/span/text()').extract()
                vitem = VideoItem()
                if lnum and lurl:
                    vitem["vnum"] = lnum[0]
                    surl = "http://film.qq.com" + lurl[0]
                vitem["os_id"] = self.os_id
                vitem["site_id"] = self.site_id
                turl = ""
                # FIX: the original used two separate ``if`` statements with
                # the ``else`` attached to the second, so the tv branch was
                # always clobbered by the default normalization; also compare
                # as ints to match the int(cat_id) checks used above (meta
                # values may arrive as strings).
                if int(cat_id) == int(self.tv_id):
                    turl = Util.normalize_url(surl, "qq", "tv")
                elif int(cat_id) == int(self.cartoon_id):
                    turl = Util.normalize_url(surl, "qq", "cartoon")
                else:
                    turl = Util.normalize_url(surl, "qq")
                if turl:
                    vitem["ext_id"] = Util.md5hash(turl)
                    vitem["url"] = turl
                    vitem["cont_id"] = self.get_qq_showid(vitem["url"])
                else:
                    continue
                videoitems.append(vitem)
        else:
            # Movie: the play page itself is the single "episode".
            vitem = VideoItem()
            if title_list:
                vitem["title"] = title_list[0]
            vitem["vnum"] = "1"
            vitem["os_id"] = self.os_id
            vitem["site_id"] = self.site_id
            turl = Util.normalize_url(response.request.url, "qq")
            vitem["url"] = turl
            vitem["ext_id"] = Util.md5hash(turl)
            vitem["cont_id"] = self.get_qq_showid(vitem["url"])
            videoitems.append(vitem)
        if len(title_list) > 0:
            ep_item["title"] = title_list[0]
        if len(pers) > 0:
            ep_item["actor"] = pers
        if len(dirs) > 0:
            ep_item["director"] = dirs
        if len(text) > 0:
            ep_item["intro"] = text[0]
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        videoid = self.get_qq_showid(response.request.url)
        ep_item["cont_id"] = videoid
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        mvitem["video"] = videoitems
        mvitem["media"]["url"] = Util.normalize_url(
            response.request.url, "qq")
        if untrack_id:
            mvitem["untrack_id"] = untrack_id
        if sid:
            mvitem["sid"] = sid
        # (A duplicated, discarded md5hash(summarize(...)) call was removed.)
        mvitem["media"]["info_id"] = Util.md5hash(
            Util.summarize(mvitem["media"]))
        Util.set_ext_id(mvitem["media"], mvitem["video"])
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return mvitem
def api_parse(self, response): items = [] try: request_url = response.request.url logging.log(logging.INFO, 'api prase url: %s' % request_url) mediaVideoItem = response.request.meta[ 'item'] if 'item' in response.request.meta else MediaVideoItem( ) mediaItem = mediaVideoItem[ 'media'] if 'media' in mediaVideoItem else MediaItem() mediaItem['url'] = request_url sel = response.xpath('.//script[@type="text/javascript"]') pidl = response.xpath('.//script[@type="text/javascript"]').re( '\"pid\"\D?(\d+)') vidl = response.xpath('.//script[@type="text/javascript"]').re( '\"id\"\D?(\d+)') if pidl and vidl: pid = pidl[0] vid = vidl[0] app_api = self.app_api % (self.get_auth(), pid) ismovie = False isvariaty = False if u'电影' == mediaItem['channel_id']: ismovie = True app_api = self.app_api % (self.get_auth(), vid) mediaItem['cont_id'] = str(vid) elif u'综艺' == mediaItem['channel_id']: isvariaty = True app_api = self.app_api % (self.get_auth(), pid) mediaItem['cont_id'] = str(pid) else: app_api = self.app_api % (self.get_auth(), pid) mediaItem['cont_id'] = str(pid) xpara = self.get_xdata(url=app_api) mediaItem = self.resolve_media_info(xpara, mediaItem, ismovie=ismovie) mediaItem['url'] = Util.normalize_url(request_url, self.site_code) mediaItem['site_id'] = self.site_id mediaItem['channel_id'] = self.channels_name_id[ mediaItem['channel_id']] mediaItem['info_id'] = Util.md5hash(Util.summarize(mediaItem)) max_page = self.get_max_page(xpara) video_list = [] if ismovie: videoItem = VideoItem() videoItem['title'] = mediaItem[ 'title'] if 'title' in mediaItem else None videoItem['thumb_url'] = mediaItem[ 'poster_url'] if 'poster_url' in mediaItem else None videoItem['url'] = mediaItem[ 'url'] if 'url' in mediaItem else None videoItem['os_id'] = self.os_id videoItem['site_id'] = self.site_id videoItem['ext_id'] = Util.md5hash( mediaItem['url']) if 'url' in mediaItem else None videoItem['vnum'] = mediaItem[ 'vcount'] if 'vcount' in mediaItem else 1 videoItem['cont_id'] = mediaItem[ 
'cont_id'] if 'cont_id' in mediaItem else None video_list.append(videoItem) else: for i in range(1, max_page): web_api = self.web_api % (pid, i) dpara = self.get_ddata(url=web_api) video_list += self.resolve_video_item( dpara, page_num=i, isvariaty=isvariaty) if isvariaty: video_list = self.revise_video_item(video_list, xpara) if video_list: Util.set_ext_id(mediaItem, video_list) mediaVideoItem['media'] = mediaItem mediaVideoItem['video'] = video_list items.append(mediaVideoItem) except Exception, e: logging.log(logging.ERROR, traceback.format_exc())