def __init__(self, *args, **kwargs):
    super(V360Spider, self).__init__(*args, **kwargs)
    self.mgr = DbManager.instance()
    self.parser = {'movie': V360ParserMovie(),
                   'tv': V360ParserTv(),
                   'variaty': V360ParserVariaty(),
                   'cartoon': V360ParserCartoon()}
    self.poster_filter_md5 = self.mgr.get_poster_filter_md5()
    '''
    if 'json_data' in kwargs:
        data = json.loads(kwargs['json_data'])
        task = []
        if data['cmd'] == 'trig':
            stat = data['stat'] if 'stat' in data else None
            task = self.mgr.get_untrack_url('360kan', stat)
        elif data['cmd'] == 'assign':
            task = data['task']
        self.start = [{'channel': t['code'], 'url': t['url'], 'type': URL_TYPE_PLAY} for t in task]
    else:
    '''
    self.start = [{'channel': 'movie', 'url': 'http://www.360kan.com/dianying/list.php', 'type': URL_TYPE_MAIN},
                  {'channel': 'tv', 'url': 'http://www.360kan.com/dianshi/list.php', 'type': URL_TYPE_MAIN},
                  {'channel': 'variaty', 'url': 'http://www.360kan.com/zongyi/list.php', 'type': URL_TYPE_MAIN},
                  {'channel': 'cartoon', 'url': 'http://www.360kan.com/dongman/list.php', 'type': URL_TYPE_MAIN},
                  #{'channel': 'variaty', 'url': 'http://www.360kan.com/va/Zsgoa6dv7JM8ED.html', 'type': URL_TYPE_MEDIA},
                  #{'channel': 'movie', 'url': 'http://www.360kan.com/m/f6bkZkUqcHr4TR.html', 'type': URL_TYPE_MEDIA}
                  ]
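# Illustrative only, not part of the spider: the json_data payload shapes that the
# commented-out task-assignment branch above expects (the spider would receive them
# as a JSON string).  Field names follow that block; the 'stat' value is a made-up
# placeholder and the sample URL is taken from the commented-out start entry.
example_trig_payload = {'cmd': 'trig', 'stat': 0}
example_assign_payload = {
    'cmd': 'assign',
    'task': [{'code': 'movie', 'url': 'http://www.360kan.com/m/f6bkZkUqcHr4TR.html'}],
}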
class pps_spider(Spider):
    '''
    pps crawl flow:
    (1) list page -> play page -> media page (if present)
    (2) play page -> media page (if present)
    '''
    site_code = 'pps'
    name = site_code
    mgr = DbManager.instance()
    max_mark_depth = 3
    max_number = 100000
    # parameters passed in via JSON
    json_data = None

    def __init__(self, json_data=None, *args, **kwargs):
        super(pps_spider, self).__init__(*args, **kwargs)
        if json_data:
            self.json_data = json.loads(json_data)

    def start_requests(self):
        items = []
        try:
            logging.log(logging.INFO, 'The pps site is highly unstable, so this spider has been disabled')
            '''
            for list_channel in pps_extract.list_channels_url:
                url = pps_extract.list_channels_url[list_channel]
                items.append(Request(url=url, callback=self.list_parse, meta={'level': 1, 'id': list_channel}))
            '''
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        finally:
def __init__(self, *args, **kwargs):
    super(H360Spider, self).__init__(*args, **kwargs)
    self.mgr = DbManager.instance()
    self.parser = {'movie': V360ParserMovie(),
                   'tv': V360ParserTv(),
                   'variaty': V360ParserVariaty(),
                   'cartoon': V360ParserCartoon()}
    self.site_id = self.mgr.get_site(site_code=self.site_name)['site_id']
    self.os_id = self.mgr.get_os(os_name='web')
    self.channel_map = self.mgr.get_channel_map()
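# Illustrative only: the shape channel_map is assumed to take (site channel code ->
# internal channel_id), inferred from lookups such as self.channel_map[code] in the
# spiders below.  The ids here are made-up placeholders.
example_channel_map = {'movie': 1, 'tv': 2, 'variaty': 3, 'cartoon': 4}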
class kankan_spider(Spider):
    '''
    kankan crawl flow:
    (1) list page -> play page -> media page
    (2) play page -> media page
    The "all" filter on kankan's list page already covers everything,
    so there is no need to crawl every individual tag.
    '''
    site_code = 'kankan'
    name = site_code
    mgr = DbManager.instance()
    max_number = '100000'
    vip_prefix_url = 'http://vip.kankan.com'
    # parameters passed in via JSON
    json_data = None
    # for statistics
    #count = 0
    # types to skip: trailers
    skip_types = {'pre': u'预告片'}

    def __init__(self, json_data=None, *args, **kwargs):
        super(kankan_spider, self).__init__(*args, **kwargs)
        if json_data:
            self.json_data = json.loads(json_data)

    def start_requests(self):
        items = []
        try:
            self.load_member_variable()
            if self.json_data:
                items = items + self.load_video_urls()
            else:
                list_prefix_url = 'http://movie.kankan.com/type/%s/'
                for list_channel in kankan_extract.list_channels:
                    list_channel_pinyin = kankan_extract.list_channels_pinyin[list_channel]
                    url = list_prefix_url % list_channel_pinyin
                    items.append(Request(url=url,
                                         callback=self.list_parse,
                                         meta={'first': True, 'id': list_channel}))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        finally:
class pptv_spider(Spider):
    '''
    pptv crawl flow:
    (1) list page -> play page
    (2) play page
    '''
    site_code = 'pptv'
    name = site_code
    mgr = DbManager.instance()
    max_mark_depth = 6
    max_number = 100000
    list_prefix_url = 'http://list.pptv.com/channel_list.html'
    vip_prefix_url = 'http://ddp.vip.pptv.com'
    # old API, no longer used
    #album_api = 'http://v.pptv.com/show/videoList?&cb=videoList&pid=%s&cid=%s&page=%s'
    # this endpoint is frequently unstable
    album_api = 'http://v.pptv.com/show/videoList?&cb=videoList&pid=%s&cat_id=%s&highlight=%s&page=%s'
    # when album_api is unstable, fall back to the next endpoint; it requires auth, a device-bound parameter
    auths = ["d410fafad87e7bbf6c6dd62434345818"]
    auth_album_api = "http://epg.api.pptv.com/detail.api?vid=%s&auth=%s"
    # parameters passed in via JSON
    json_data = None
    httpcli = HTTPDownload()
    app_api = "http://epg.api.pptv.com/detail.api?auth=%s&vid=%s"
    web_api = "http://v.pptv.com/show/videoList?&cb=videoList&pid=%s&page=%s"

    def __init__(self, json_data=None, *args, **kwargs):
        super(pptv_spider, self).__init__(*args, **kwargs)
        if json_data:
            self.json_data = json.loads(json_data)

    def start_requests(self):
        items = []
        try:
            self.load_member_variable()
            if self.json_data:
                items = items + self.load_video_urls()
            else:
                url = 'http://list.pptv.com'
                items.append(Request(url=url, callback=self.list_parse, meta={'level': 0}))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        finally:
class dy1905_spider(Spider):
    '''
    dy1905 crawl flow:
    (1) list page -> media page (no need to go through the play page)
    (2) play page -> media page
    The "all" filter on dy1905's list page already covers everything,
    so there is no need to crawl every individual tag.
    '''
    site_code = '1905'
    name = site_code
    mgr = DbManager.instance()
    max_number = 100000
    vip_prefix_urls = ['http://vip.1905.com', 'http://vip.m1905.com']
    max_mark_depth = 10
    # parameters passed in via JSON
    json_data = None
    # for statistics
    #count = 0

    def __init__(self, json_data=None, *args, **kwargs):
        super(dy1905_spider, self).__init__(*args, **kwargs)
        if json_data:
            self.json_data = json.loads(json_data)

    def start_requests(self):
        items = []
        try:
            self.load_member_variable()
            if self.json_data:
                items = items + self.load_video_urls()
            else:
                for list_channel in dy1905_extract.list_channels:
                    if list_channel == u'电影':
                        url = 'http://www.1905.com/mdb/film/list'
                        items.append(Request(url=url,
                                             callback=self.list_parse,
                                             meta={'level': 0, 'id': list_channel}))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        finally:
class letv_spider(Spider):
    '''
    letv crawl flow:
    (1) list page -> play page -> media page
    (2) play page -> media page
    Note: letv has to be crawled tag by tag.
    '''
    site_code = 'letv'
    name = site_code
    mgr = DbManager.instance()
    max_mark_depth = 5
    max_number = 100000
    list_json_prefix_url = 'http://list.letv.com/apin/chandata.json'
    zongyi_album_api = 'http://api.letv.com/mms/out/albumInfo/getVideoListByIdAndDate?&year=%s&month=%s&id=%s'
    # other_album_api = 'http://api.mob.app.letv.com/play/vlist?pid=%s&pagenum=%s'
    other_album_api = 'http://api.mob.app.letv.com/play/cards?pid=%s&version=6.2.2&pagenum=%s'
    # parameters passed in via JSON
    json_data = None

    def __init__(self, json_data=None, *args, **kwargs):
        super(letv_spider, self).__init__(*args, **kwargs)
        if json_data:
            self.json_data = json.loads(json_data)

    def start_requests(self):
        items = []
        try:
            self.load_member_variable()
            if self.json_data:
                items = items + self.load_video_urls()
            else:
                url = 'http://list.letv.com'
                items.append(Request(url=url, callback=self.list_parse, meta={'level': 0}))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        finally:
class hunantv_spider(Spider):
    '''
    hunantv crawl flow:
    (1) list page -> play page -> feature page [ -> media page]
    (2) play page -> feature page
    The "all" filter on hunantv's list page already covers everything,
    so there is no need to crawl every individual tag.
    '''
    site_code = 'hunantv'
    name = site_code
    mgr = DbManager.instance()
    max_number = 100000
    max_mark_depth = 10
    # parameters passed in via JSON
    json_data = None
    httpdownload = HTTPDownload()
    media_info_url = "http://m.api.hunantv.com/video/getbyid?videoId=%s"
    video_list_url = "http://m.api.hunantv.com/video/getList?videoId=%s&pageNum=%s"

    def __init__(self, json_data=None, *args, **kwargs):
        super(hunantv_spider, self).__init__(*args, **kwargs)
        if json_data:
            self.json_data = json.loads(json_data)

    def start_requests(self):
        items = []
        try:
            self.load_member_variable()
            if self.json_data:
                items = items + self.load_video_urls()
            else:
                url = 'http://list.hunantv.com'
                items.append(Request(url=url, callback=self.list_parse, meta={'level': 0}))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        finally:
class baofeng_spider(Spider): name = "baofeng" pipelines = ['CategoryPipeline', 'MysqlStorePipeline'] site_code = "baofeng" site_id = "" #baofeng allowed_domains = ["www.baofeng.com", "g.hd.baofeng.com"] url_prefix = 'http://www.baofeng.com' site_name = Util.guess_site(url_prefix) mgr = DbManager.instance() os_id = mgr.get_os('web')["os_id"] site_id = str(mgr.get_site(site_code)["site_id"]) channel_map = {} channel_map = mgr.get_channel_map() max_update_page = get_project_settings().get('MAX_UPDATE_PAGE') global_spider = True httpdownload = HTTPDownload() channel_info = {} test_page_url = None test_channel_id = None def __init__(self, json_data=None, *args, **kwargs): super(baofeng_spider, self).__init__(*args, **kwargs) cat_urls = [] tasks = None if json_data: data = json.loads(json_data) if "type" in data: spider_type = data["type"] if spider_type != "global": self.global_spider = False tasks = [] ttask = {} if "id" in data and "url" in data: ttask["id"] = data["id"] ttask["url"] = data["url"] ttask["sid"] = "" ttask["untrack_id"] = "" cat_urls.append(ttask) cmd = data["cmd"] if cmd == "assign": tasks = data["task"] elif cmd == "trig": stat = data['stat'] if 'stat' in data else None tasks = self.mgr.get_untrack_url(self.site_code, stat) elif cmd == "test" and 'id' in data and 'url' in data: self.test_page_url = data["url"] self.test_channel_id = data["id"] if tasks: for task in tasks: ttask = {} ttask["url"] = task["url"] code = task["code"] ttask["id"] = self.channel_map[code] ttask["untrack_id"] = task["untrack_id"] ttask["sid"] = task["sid"] cat_urls.append(ttask) self._cat_urls = [] if cat_urls: self._cat_urls = cat_urls def start_requests(self): try: items = [] self.movie_id = str(self.mgr.get_channel('电影')["channel_id"]) self.tv_id = str(self.mgr.get_channel('电视剧')["channel_id"]) self.variety_id = str(self.mgr.get_channel('综艺')["channel_id"]) self.cartoon_id = str(self.mgr.get_channel('动漫')["channel_id"]) self.channel_info = { self.movie_id: u"电影", self.tv_id: u"电视剧", self.variety_id: u"综艺", self.cartoon_id: u"动漫" } if self.test_page_url: turl = Util.normalize_url(self.test_page_url, "baofeng") items.append( Request(url=self.test_page_url, callback=self.parse_page, meta={ 'cat_id': self.test_channel_id, 'page': 1 })) return items if not self._cat_urls: if self.global_spider: cat_urls = [{ 'url': 'http://www.baofeng.com/movie/682/list-sid-1-p-1.shtml', 'id': self.movie_id }, { 'url': 'http://www.baofeng.com/tv/914/list-type-2-ishot-1-sid-1-p-1.shtml', 'id': self.tv_id }, { 'url': 'http://www.baofeng.com/enc/444/list-type-4-ishot-1-sid-1-p-1.shtml', 'id': self.variety_id }, { 'url': 'http://www.baofeng.com/comic/924/list-type-3-ishot-1-sid-1-p-1.shtml', 'id': self.cartoon_id }] #cat_urls = [{'url':'http://www.baofeng.com/enc/444/list-type-4-ishot-1-sid-1-p-1.shtml','id':self.variety_id}] for cat in cat_urls: items.append( Request(url=cat['url'], callback=self.parse_area, meta={ 'cat_id': cat['id'], 'page': 1 })) #items.append(Request(url=cat['url'], callback=self.parse_page, meta={'cat_id': cat['id'],'page':1})) else: for cat in self._cat_urls: turl = Util.normalize_url(cat['url'], "baofeng") items.append( Request(url=turl, callback=self.parse_single_episode, meta={ 'cat_id': cat["id"], 'page': 1, "poster_url": "", "untrack_id": cat["untrack_id"], "sid": cat["sid"] })) return items except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) def parse_area(self, response): items = [] try: #logging.log(logging.INFO, 'parse_area: %s' % response.request.url) cat_id = 
response.request.meta['cat_id'] subs = response.xpath( '//div[@class="selecter"]/div[1]/div[@class="clearfix rp"]/a/@href' ).extract() for sub in subs: items.append( Request(url=self.url_prefix + sub, callback=self.parse_type, meta={ 'cat_id': cat_id, 'page': 1 })) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_type(self, response): items = [] try: #logging.log(logging.INFO, 'parse_type: %s' % response.request.url) cat_id = response.request.meta['cat_id'] subs = response.xpath( '//div[@class="selecter"]/div[2]/div[@class="clearfix rp"]/a/@href' ).extract() for sub in subs: items.append( Request(url=self.url_prefix + sub, callback=self.parse_time, meta={ 'cat_id': cat_id, 'page': 1 })) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_time(self, response): items = [] try: #logging.log(logging.INFO, 'parse_time: %s' % response.request.url) cat_id = response.request.meta['cat_id'] subs = response.xpath( '//div[@class="selecter"]/div[3]/div[@class="clearfix rp"]/a/@href' ).extract() for sub in subs: items.append( Request(url=self.url_prefix + sub, callback=self.parse_page, meta={ 'cat_id': cat_id, 'page': 1 })) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_page(self, response): items = [] try: cat_id = response.request.meta['cat_id'] page = response.request.meta['page'] logging.log(logging.INFO, 'parse_page: %s,%s' % (response.request.url, page)) #if int(page) > int(self.max_update_page) and not self.global_spider: # return items = [] play_url = "" subs = response.xpath( '//div[@class="sort-list-r-mod02"]/ul[@class="sort-list-r-poster clearfix"]/li' ) for sub in subs: play_url = sub.xpath('./div[1]/p[1]/a/@href').extract() pic_urls = sub.xpath('./div[1]/p[1]/a/img/@src').extract() #pic_urls = sub.xpath('./div[@class="hot-pic-like js-collect shadow-cut"]/p[1]/a/img/@src').extract() pic_url = "" if pic_urls: pic_url = pic_urls[0] if play_url: rplay_url = play_url[0].strip() items.append( Request(url=self.url_prefix + rplay_url, callback=self.parse_single_episode, meta={ 'cat_id': cat_id, 'poster_url': pic_url, 'untrack_id': '', 'sid': '' })) next_page = response.xpath( '//div[@class="sort-list-r-mod02"]/div[@class="pages"]/ul[@class="clearfix"]/li/a[text()="%s"]/@href' % u'下一页').extract() if next_page: snext_page = next_page[0].strip() if snext_page.find(self.url_prefix) < 0: snext_page = self.url_prefix + snext_page items.append( Request(url=snext_page, callback=self.parse_page, meta={ 'page': page + 1, 'cat_id': cat_id })) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_single_episode(self, response): items = [] try: logging.log(logging.INFO, 'parse_single_episode: %s' % response.request.url) cat_id = response.request.meta['cat_id'] untrack_id = response.request.meta['untrack_id'] sid = response.request.meta['sid'] poster_url = response.request.meta['poster_url'] urls = response.xpath( '//div[@class="play-nav-l-new"]/h1/a/@href').extract() if urls: for iurl in urls: turl = self.url_prefix + iurl surl = Util.normalize_url(turl, "baofeng") if surl and self.site_name == Util.guess_site(surl): #if turl and self.site_name == Util.guess_site(turl): items.append( Request(url=surl, callback=self.parse_episode_info, meta={ 'cat_id': cat_id, 'poster_url': poster_url, 'page': 1, "untrack_id": untrack_id, "sid": sid })) #付费电影,不能跳转到媒体页 else: pass except Exception as e: logging.log(logging.ERROR, 
traceback.format_exc()) return items def parse_episode_info(self, response): items = [] try: logging.log(logging.INFO, 'parse_episode_info: %s' % response.request.url) cat_id = response.request.meta['cat_id'] poster_url = response.request.meta['poster_url'] untrack_id = "" sid = "" if "untrack_id" in response.request.meta: untrack_id = response.request.meta['untrack_id'] if "sid" in response.request.meta: sid = response.request.meta['sid'] year_list = [] lyears = [] title_list = response.xpath( '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/h3/a/@title' ).extract() director_list = response.xpath( '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' % u'导演:').extract() performer_list = response.xpath( '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' % u'主演:').extract() type_list = response.xpath( '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' % u'类型:').extract() district_list = response.xpath( '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' % u'地区:').extract() year_info = response.xpath( '//div[@class="info clearfix"]/span[text()="%s"]/text()' % u'地区:').extract() year = None if len(year_info) >= 2: year = self.get_year(year_info[1]) #year_list = response.xpath('//div[@class="mod plot"]/ul[@class="filter"]/li[@class="v-year"]/a/em/text()').extract() pers = Util.join_list_safely(performer_list) dirs = Util.join_list_safely(director_list) types = Util.join_list_safely(type_list) districts = Util.join_list_safely(district_list) #text text = response.xpath( '//div[@class="juqing briefTab"]/div/text()').extract() #score score = response.xpath( '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/div[1]/div[@class="score"]/div[class="score-num"]/strong/text()' ).extract() play_url = "" tplay_url = response.xpath( '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/div[@class="sourcePlay"]/a[@id="moviePlayButton"]/@href' ).extract() if tplay_url: play_url = self.url_prefix + tplay_url[0].strip() videoitems = [] ep_item = MediaItem() if title_list: ep_item["title"] = title_list[0] if ep_item["title"].find(u'预:') >= 0: print "预告片,url", response.request.url return items ep_item["actor"] = pers ep_item["director"] = dirs if types: ep_item["type"] = types if district_list: ep_item["district"] = districts if year: ep_item["release_date"] = Util.str2date(year) ep_item["site_id"] = self.site_id ep_item["channel_id"] = cat_id ep_item["poster_url"] = poster_url ep_item["url"] = Util.normalize_url(response.request.url, "baofeng") if len(text) > 0: ep_item["intro"] = text[0].strip() mvitem = MediaVideoItem() mvitem["media"] = ep_item vurl = "" videoid = self.getshowid(response.request.url) mvitem["media"]["cont_id"] = videoid ttvitem = {} if title_list: ttvitem = self.parse_video_item(response, cat_id, play_url, title_list, None) if ttvitem: if 'video' in ttvitem and len(ttvitem['video']) > 0: mvitem['video'] = ttvitem['video'] mvitem["media"]["info_id"] = Util.md5hash( Util.summarize(mvitem["media"])) Util.set_ext_id(mvitem["media"], mvitem["video"]) if untrack_id and sid: mvitem["untrack_id"] = untrack_id mvitem["sid"] = sid res = self.check_url(mvitem) #if self.check_url(mvitem): if res: items.append(mvitem) pass except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_video_item(self, response, cat_id, url, title, playlistId): #logging.log(logging.INFO, 'parse_video_item , info url %s,paly_url: %s,cat id %s,title %s' % (response.request.url,url,cat_id,title)) videoitems = [] ep_item = MediaItem() item = 
MediaVideoItem() item["media"] = ep_item item["video"] = videoitems try: if int(cat_id) != int(self.movie_id): ul_list = response.xpath( '//div[@class="episodes clearfix "]/a') if not ul_list: ul_list = response.xpath( '//div[@class="episodes clearfix enc-episodes-detail"]/a' ) for li in ul_list: url = li.xpath('./@href').extract() ttitle = li.xpath('./@title').extract() snum = li.xpath('./text()').extract() if snum: play_num = self.get_play_num(snum[0]) if int(cat_id) == int(self.variety_id): play_num = self.getvnum(self.url_prefix + url[0]) if not ttitle: ttitle = [play_num] vitem = self.compose_vitem([self.url_prefix + url[0]], title, play_num) if 'url' in vitem: videoitems.append(vitem) elif int(cat_id) == int(self.movie_id): if url: vitem = self.compose_vitem([url], title, 1) if 'url' in vitem: videoitems.append(vitem) if videoitems: item["video"] = videoitems item["media"]["url"] = response.request.url Util.set_ext_id(item["media"], item["video"]) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return item def compose_vitem(self, url_list, title_list, vnum): vitem = VideoItem() try: if not url_list: return vitem if title_list: vitem["title"] = title_list[0].strip() turl = Util.normalize_url(url_list[0], "baofeng") vitem["url"] = turl vitem["vnum"] = str(vnum) vitem["os_id"] = self.os_id vitem["ext_id"] = Util.md5hash(turl) vitem["site_id"] = self.site_id vitem["cont_id"] = self.getshowid(turl) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return vitem def get_play_num(self, title): num = "" try: num_list = re.findall('([\d]+)', title) if num_list: num_size = len(num_list) num = num_list[num_size - 1] except Exception as e: pass return num def check_url(self, mvitem): res = True try: if 'video' in mvitem: for video in mvitem['video']: if 'url' in video: if Util.guess_site(video['url']) != self.site_name: res = False break except Exception as e: pass return res def is_same_site(self, url): res = True try: tsite = Util.guess_site(url) if tsite != self.site_name: res = False except Exception as e: pass res = False return res def getshowid(self, url): id = "" try: #http://www.baofeng.com/play/497/play-786997.html #r = re.compile(r'http://.+/id_([^_]+).*\.html') #r = re.compile(r'http://.*[]-([\d]+).html') #r = re.compile(r'http://.*[play|detail]-([\d]+).*html') r = re.compile(r'http://.*/\w+-(\d+).*') m = r.match(url) if m: return m.group(1) except Exception as e: pass return id def getvnum(self, url): id = "" try: #http://www.baofeng.com/play/363/play-786863-drama-10.html r = re.compile(r'http://.*-drama-(\d+).*') m = r.match(url) if m: return m.group(1) except Exception as e: pass return id def get_year(self, info): year = None try: r = re.compile(ur'.*(\d+).*') m = r.search(info) if m: return m.group(1) except Exception as e: pass return year
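# Illustrative, self-contained sketch (not part of baofeng_spider): the same regexes
# used by getshowid() and getvnum() above, applied to the sample URLs quoted in those
# methods' comments, to show what the extracted ids look like.  The function name is
# only for this demo.
import re

def _demo_baofeng_id_extraction():
    play_url = 'http://www.baofeng.com/play/497/play-786997.html'
    drama_url = 'http://www.baofeng.com/play/363/play-786863-drama-10.html'
    show_id = re.match(r'http://.*/\w+-(\d+).*', play_url).group(1)         # -> '786997'
    episode_num = re.match(r'http://.*-drama-(\d+).*', drama_url).group(1)  # -> '10'
    return show_id, episode_num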
class youku_spider(Spider): name = "youku" pipelines = ['CategoryPipeline', 'MysqlStorePipeline'] site_code = "youku" allowed_domains = ["youku.com","v.youku.com"] url_prefix = 'http://www.youku.com' ua='Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5' mgr = DbManager.instance() os_id = mgr.get_os('web')["os_id"] site_id = str(mgr.get_site(site_code)["site_id"]) channel_map = mgr.get_channel_map() #code -> id channel_map_rev = dict([[str(v), k] for k, v in channel_map.items()]) #id -> code max_update_page = get_project_settings().get('MAX_UPDATE_PAGE') httpdownload = HTTPDownload() cat_urls = [] def __init__(self, json_data=None, *args, **kwargs): super(youku_spider, self).__init__(*args, **kwargs) if json_data: data = json.loads(json_data) tasks=[] cmd = data["cmd"] if cmd == "assign": #task from command tasks = data["task"] elif cmd == "trig": #task from untrack stat = data['stat'] if 'stat' in data else None tasks = self.mgr.get_untrack_url(self.site_code, stat) elif cmd == 'carpet': tasks = self.mgr.get_video_url(self.site_code) elif cmd == "test" and 'id' in data and 'url' in data: #assign task by channel_id and url self.cat_urls.append({'id': data["id"], 'url': data["url"], 'sid': '', 'untrack_id': ''}) for task in tasks: ttask={} ttask["url"] = task["url"] code = task["code"] ttask["id"] = self.channel_map[code] ttask["untrack_id"] = task["untrack_id"] if 'untrack_id' in task else None ttask["sid"] = task["sid"] if 'sid' in task else None ttask['mid'] = task['mid'] if 'mid' in task else None self.cat_urls.append(ttask) def start_requests(self): try: items = [] if not self.cat_urls: cat_urls = [{'url':'http://www.youku.com/v_olist/c_85', 'id': self.channel_map['variaty']}] ''' cat_urls = [{'url':'http://www.youku.com/v_olist/c_96', 'id': self.channel_map['movie']}, {'url':'http://www.youku.com/v_olist/c_97', 'id': self.channel_map['tv']}, {'url':'http://www.youku.com/v_olist/c_85', 'id': self.channel_map['variaty']}, {'url':'http://www.youku.com/v_olist/c_100', 'id':self.channel_map['cartoon']}] ''' for cat in cat_urls: items.append(Request(url=cat['url'], callback=self.parse_list, meta={'cat_id': cat['id'],'page':1})) else: for cat in self.cat_urls: turl = Util.normalize_url(cat['url'],"youku") items.append(Request(url=turl, callback=self.parse_single_episode, meta={'cat_id': cat["id"],'page':1,"untrack_id":cat["untrack_id"],"sid":cat["sid"],"mid":cat["mid"]})) return items except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) def parse_single_episode(self,response): items = [] try: logging.log(logging.INFO, 'parse_single_episode: %s' % response.request.url) cat_id = response.request.meta['cat_id'] untrack_id = "" sid = "" mid = "" if "untrack_id" in response.request.meta: untrack_id = response.request.meta['untrack_id'] if "sid" in response.request.meta: sid = response.request.meta['sid'] if "mid" in response.request.meta: mid = response.request.meta['mid'] urls = response.xpath('//div[@class="base_info"]/h1[@class="title"]/a/@href').extract() if urls: for iurl in urls: surl = Util.normalize_url(iurl,"youku") if surl: items.append(Request(url=surl, callback=self.parse_episode_info, meta={'cat_id': cat_id,'poster_url':'','page':1,"untrack_id":untrack_id,"sid":sid,"mid":mid})) else: logging.log(logging.INFO, 'miss media page: %s' % response.request.url) return items except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) def 
parse_list(self,response): items = [] try: logging.log(logging.INFO, 'parse_list: %s' % response.request.url) cat_id = response.request.meta['cat_id'] area_list = response.xpath('//div[@class="yk-filter-panel"]/div/label[text()="%s"]/../ul/li/a/text() ' % u"地区").extract() type_list = response.xpath('//div[@class="yk-filter-panel"]/div/label[text()="%s"]/../ul/li/a/text() ' % u"类型").extract() year_list = response.xpath('//div[@class="yk-filter-panel"]/div/label[text()="%s"]/../ul/li/a/text() ' % u"时间").extract() s_list = ['1','2','4','5','6'] d_list = ['1','2','4'] for area in area_list: for type in type_list: for s_sub in s_list: url_pref = response.request.url + "_a_" + area + "_g_" + type + "_u_1" + "_s_" + s_sub +"_d_1" + ".html" items.append(Request(url=url_pref, callback=self.parse_page, meta={'cat_id': cat_id,'page':1})) titem = self.parse_page(response) if titem: items.extend(titem) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_page(self,response): items = [] try: logging.log(logging.INFO, 'parse_page: %s' % response.request.url) page = response.request.meta['page'] logging.log(logging.INFO, 'parse_page: %s,%s' % (str(page),response.request.url)) #if int(page) > int(self.max_update_page) and self.global_spider: # logging.log(logging.INFO, 'parse_page: %s,%s' % (str(page),response.request.url)) # return cat_id = response.request.meta['cat_id'] page = response.request.meta['page'] items = [] subs = response.xpath('//div[@class="yk-row yk-v-80"]/div') for sub in subs: pic_urls = sub.xpath('./div[@class="p p-small"]/div[@class="p-thumb"]/img/@src').extract() play_url = sub.xpath('./div[@class="p p-small"]/div[@class="p-link"]/a/@href').extract() pic_url = "" if pic_urls: pic_url = pic_urls[0] if play_url: items.append(Request(url=play_url[0].strip(),callback=self.parse_episode_info,meta={'cat_id': cat_id,'poster_url':pic_url})) next_page = response.xpath("//div[@class='yk-pager']/ul[@class='yk-pages']/li[@title='%s']/a/@href" % u'下一页').extract() if next_page: snext_page = next_page[0].strip() if snext_page.find(self.url_prefix) < 0: snext_page = self.url_prefix + snext_page items.append(Request(url=snext_page, callback=self.parse_page, meta={'page': page+1, 'cat_id': cat_id})) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_episode_info(self,response): try: logging.log(logging.INFO, 'parse_episode_info: %s' % response.request.url) cat_id = response.request.meta['cat_id'] poster_url = response.request.meta['poster_url'] page_id = self.get_youku_pageid(response.request.url) if not page_id: log.error('miss content id: %s' % response.request.url) return untrack_id = "" sid = "" mid = "" if "untrack_id" in response.request.meta: untrack_id = response.request.meta['untrack_id'] if "sid" in response.request.meta: sid = response.request.meta['sid'] if "mid" in response.request.meta: mid = response.request.meta['mid'] items = [] year_list = [] title = self.parse_title(response,cat_id) performer_list = self.parse_actor(response) director_list = self.parse_director(response) district_list = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()' % u'地区:').extract() type_list = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()' % u'类型:').extract() play_date = self.parse_play_date(response) total_num = self.parse_total_num(response) year_list = response.xpath('//div[@class="mod plot"]/ul[@class="filter"]/li[@class="v-year"]/a/em/text()').extract() 
pers = Util.join_list_safely(performer_list) dirs = Util.join_list_safely(director_list) types = Util.join_list_safely(type_list) #text text = response.xpath('//div[@class="detail"]/span/text()').extract() videoitems = [] ep_item = MediaItem() if title: ep_item["title"] = title[0].strip() if pers: ep_item["actor"] = pers if dirs > 0: ep_item["director"] = dirs if types: ep_item["type"] = types if district_list: ep_item["district"] = district_list[0].strip() if play_date: ep_item["release_date"] = Util.str2date(play_date) if total_num: ep_item["vcount"] = total_num ep_item["site_id"] = self.site_id ep_item["channel_id"] = cat_id ep_item["poster_url"] = poster_url ep_item["url"] = Util.normalize_url(response.request.url,"youku") if text: ep_item["intro"] = text[0].strip() ep_item["cont_id"] = page_id ep_item["info_id"] = Util.md5hash(Util.summarize(ep_item)) mvitem = MediaVideoItem(); if mid: mvitem['mid'] = mid mvitem["media"] = ep_item; if untrack_id: mvitem["untrack_id"] = untrack_id if sid: mvitem["sid"] = sid video_list = self.parse_video_item(response, cat_id, ep_item["title"], page_id) mvitem['video'] = video_list Util.set_ext_id(mvitem["media"], mvitem["video"]) items.append(mvitem) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_video_item_media(self,code,pn): videoitems = [] try: getlist_url = "http://v.youku.com/x_getAjaxData?md=showlistnew&vid=%s&pl=100&pn=%d" % (code,pn) urllist_info = self.httpdownload.get_data(getlist_url,ua=self.ua) if urllist_info: try: json_data = json.loads(urllist_info) except Exception as e: return videoitems if json_data and "showlistnew" in json_data: if json_data["showlistnew"]: items = json_data["showlistnew"]["items"] vnum_name = "" if type(items)==list: videoseq = set() videostage = set() for item in items: if "preview" in item: continue videoseq.add(item["show_videoseq"]) videostage.add(item["show_videostage"]) if len(videoseq)>len(videostage): vnum_name = "show_videoseq" else: vnum_name = "show_videostage" for item in items: if "preview" in item: continue if "videoid" not in item: continue vitem = VideoItem() vitem["url"] = "http://v.youku.com/v_show/id_%s.html" % item["videoid"] vitem["vnum"] = item[vnum_name] vitem["title"] = item["title"] vitem["os_id"] = self.os_id vitem["ext_id"] = Util.md5hash(vitem["url"]) vitem["site_id"] = self.site_id vitem["cont_id"] = item["videoid"] videoitems.append(vitem) elif type(items)==dict: videoseq = set() videostage = set() for k in items: item = items[k] if "preview" in item: continue videoseq.add(item["show_videoseq"]) videostage.add(item["show_videostage"]) if len(videoseq)>len(videostage): vnum_name = "show_videoseq" else: vnum_name = "show_videostage" for k in items: item = items[k] if "preview" in item: continue if "videoid" not in item: continue vitem = VideoItem() vitem["url"] = "http://v.youku.com/v_show/id_%s.html" % item["videoid"] vitem["vnum"] = item[vnum_name] vitem["title"] = item["title"] vitem["os_id"] = self.os_id vitem["ext_id"] = Util.md5hash(vitem["url"]) vitem["site_id"] = self.site_id vitem["cont_id"] = item["videoid"] videoitems.append(vitem) else: logging.log(logging.ERROR, getlist_url) pass except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) finally: return videoitems def parse_video_item(self, response, cat_id, title, media_page_id): videoitems = [] try: play_url = self.parse_play_url(response) if play_url: url = Util.normalize_url(play_url[0], "youku") cont_id = self.get_youku_showid(url) i=1 while True: item = 
self.parse_video_item_media(cont_id,i) if item: videoitems = videoitems + item i = i+1 else: break except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) finally: return videoitems def parse_title(self,response,cat_id): title = [] try: #title = response.xpath('//div[@id="title_wrap"]/div[@id="title"]/h1/span[@class="name"]/text()').extract() title = response.xpath('//div[@id="title_wrap"]/div[@id="title"]/div[@class="base"]/h1/span[@class="name"]/text()').extract() except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return title def parse_actor(self,response): performer_list = [] try: performer_list = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()' % u'主演:').extract() if not performer_list: performer_list = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()' % u'主持人:').extract() except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return performer_list def parse_director(self,response): director_list = [] try: director_list = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()' % u'导演:').extract() except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return director_list def parse_play_url(self,response): play_list = [] try: play_list = response.xpath("//div[@class='showInfo poster_w yk-interact']/ul[@class='baseaction']/li[@class='action']/a/em[text()='%s']/../@href" % u"播放正片").extract() if not play_list: play_list = response.xpath("//div[@class='showInfo poster_w yk-interact']/ul[@class='baseaction']/li[@class='action']/a/em[text()='%s']/../@href" % u"播放").extract() if not play_list: play_list = response.xpath("//div[@class='showInfo poster_w yk-interact']/ul[@class='baseaction']/li[@class='action']/a/em[text()='%s']/../@href" % u"免费试看").extract() except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return play_list def get_youku_pageid(self,url): id = "" try: #http://www.youku.com/show_page/id_zed6b4c7497b811e4b522.html r = re.compile(r'http://www.youku.com/show_page/id_([^_]+).*\.html') m = r.match(url) if m: return m.group(1) except Exception as e: pass return id def get_youku_showid(self,url): #http://v.youku.com/v_show/id_XNzUyMDUwOTAw.html id = "" try: r = re.compile(r'http://v.youku.com/v_show/id_([^/]+).*\.html') m = r.match(url) if m: return m.group(1) except Exception as e: pass return id def parse_play_date(self,response): res = [] strdate = None try: res = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../text()' % u'优酷上映:').extract() if not res: res = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../text()' % u'优酷开播:').extract() if not res: res = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../text()' % u'上映:').extract() if res: strdate = res[0] except Exception as e: pass return strdate def parse_total_num(self,response): res = None try: info_list = response.xpath('//div[@class="basenotice"]/text()').extract() for info in info_list: r = re.compile(ur'.*%s(\d+)%s.*' % (u'共',u'集')) m = r.search(info) if m: return m.group(1) except Exception as e: pass return res
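# Illustrative, self-contained sketch (not part of youku_spider): the same regexes
# used by get_youku_pageid() and get_youku_showid() above, applied to the sample URLs
# quoted in those methods' comments.  The function name is only for this demo.
import re

def _demo_youku_id_extraction():
    page_url = 'http://www.youku.com/show_page/id_zed6b4c7497b811e4b522.html'
    show_url = 'http://v.youku.com/v_show/id_XNzUyMDUwOTAw.html'
    page_id = re.match(r'http://www.youku.com/show_page/id_([^_]+).*\.html', page_url).group(1)  # -> 'zed6b4c7497b811e4b522'
    show_id = re.match(r'http://v.youku.com/v_show/id_([^/]+).*\.html', show_url).group(1)       # -> 'XNzUyMDUwOTAw'
    return page_id, show_id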
class qq_spider(Spider): name = "qq" pipelines = ['CategoryPipeline', 'MysqlStorePipeline'] site_code = "qq" site_id = "" #qq allowed_domains = ["v.qq.com", "film.qq.com", "s.video.qq.com"] url_prefix = 'http://v.qq.com' #used for guess_site site_name = Util.guess_site(url_prefix) mgr = DbManager.instance() os_id = mgr.get_os('web')["os_id"] site_id = str(mgr.get_site(site_code)["site_id"]) #site_code = str(mgr.get_site(site_name)["site_code"]) channel_map = {} channel_map = mgr.get_channel_map() max_update_page = get_project_settings().get('MAX_UPDATE_PAGE') global_spider = True httpdownload = HTTPDownload() channel_info = {} movie_id = "" tv_id = "" variety_id = "" cartoon_id = "" test_page_url = None test_channel_id = None def __init__(self, json_data=None, *args, **kwargs): super(qq_spider, self).__init__(*args, **kwargs) cat_urls = [] tasks = None if json_data: data = json.loads(json_data) if "type" in data: spider_type = data["type"] if spider_type != "global": self.global_spider = False tasks = [] if "id" in data and "url" in data: ttask = {} ttask["id"] = data["id"] ttask["url"] = data["url"] ttask["sid"] = "" ttask["untrack_id"] = "" cat_urls.append(ttask) cmd = data["cmd"] if cmd == "assign": tasks = data["task"] elif cmd == "trig": stat = data['stat'] if 'stat' in data else None tasks = self.mgr.get_untrack_url(self.site_code, stat) elif cmd == "test" and 'id' in data and 'url' in data: self.test_page_url = data["url"] self.test_channel_id = data["id"] if tasks: for task in tasks: ttask = {} ttask["url"] = task["url"] code = task["code"] ttask["id"] = self.channel_map[code] ttask["untrack_id"] = task["untrack_id"] ttask["sid"] = task["sid"] cat_urls.append(ttask) self._cat_urls = [] if cat_urls: self._cat_urls = cat_urls def start_requests(self): items = [] try: cat_urls = [] self.movie_id = self.mgr.get_channel('电影')["channel_id"] self.tv_id = self.mgr.get_channel('电视剧')["channel_id"] self.variety_id = self.mgr.get_channel('综艺')["channel_id"] self.cartoon_id = self.mgr.get_channel('动漫')["channel_id"] self.channel_info = { self.movie_id: u"电影", self.tv_id: u"电视剧", self.variety_id: u"综艺", self.cartoon_id: u"动漫" } if self.test_page_url: turl = Util.normalize_url(self.test_page_url, "qq") items.append( Request(url=self.test_page_url, callback=self.parse_single_episode, meta={ 'cat_id': self.test_channel_id, 'page': 1 })) return items if not self._cat_urls: #cat_urls = [{'url':'http://v.qq.com/list/2_-1_-1_-1_0_1_1_10_-1_-1_0.html','id':self.tv_id}] cat_urls = [{ 'url': 'http://v.qq.com/movielist/10001/0/0/1/0/10/1/0.html', 'id': self.movie_id }, { 'url': 'http://v.qq.com/list/2_-1_-1_-1_0_1_1_10_-1_-1_0.html', 'id': self.tv_id }, { 'url': 'http://v.qq.com/variety/type/list_-1_0_0.html', 'id': self.variety_id }, { 'url': 'http://v.qq.com/cartlist/0/3_-1_-1_-1_-1_1_0_1_10.html', 'id': self.cartoon_id }] for cat in cat_urls: items.append( Request(url=cat['url'], callback=self.parse_type, meta={ 'cat_id': cat['id'], 'page': 1 })) else: for cat in self._cat_urls: channel_id = str(cat["id"]) items.append( Request(url=cat['url'], callback=self.parse_single_episode, meta={ 'cat_id': channel_id, 'page': 1, "untrack_id": cat["untrack_id"], "sid": cat["sid"] })) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_single_episode(self, response): items = [] try: logging.log(logging.INFO, 'parse_single_episode: %s' % response.request.url) cat_id = response.request.meta['cat_id'] untrack_id = "" sid = "" if "untrack_id" in response.request.meta: 
untrack_id = response.request.meta['untrack_id'] if "sid" in response.request.meta: sid = response.request.meta['sid'] urls = response.xpath( '//div[@class="breadcrumb"]/a[@class="breadcrumb_item"]/@href' ).extract() #carton is different if not urls: turls = response.xpath( '//div[@class="mod_player_head cf"]/div[1]/div[1]/a/@href' ).extract() if turls: tlen = len(turls) urls = [turls[tlen - 1]] if urls: turl = self.url_prefix + urls[0] #print "turl",turl #turl = "http://v.qq.com/p/tv/detail/hqg/index.html" items.append( Request(url=turl, callback=self.parse_episode_info, meta={ 'cat_id': cat_id, 'poster_url': '', 'page': 1, "untrack_id": untrack_id, "sid": sid })) else: ttitem = self.parse_episode_play(response) if ttitem and self.check_url(ttitem): items.append(ttitem) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_type(self, response): items = [] try: cat_id = response.request.meta['cat_id'] subs = response.xpath( '//div[@class="mod_indexs bor"]/div[@class="mod_cont"]/ul[1]/li/a/@href' ).extract() for sub in subs: items.append( Request(url=sub, callback=self.parse_area, meta={ 'cat_id': cat_id, 'page': 1 })) titem = self.parse(response) if titem: items.extend(titem) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_area(self, response): items = [] try: logging.log(logging.INFO, 'parse_area: %s' % response.request.url) cat_id = response.request.meta['cat_id'] subs = response.xpath( '//div[@class="mod_indexs bor"]/div[@class="mod_cont"]/ul[2]/li/a/@href' ).extract() for sub in subs: items.append( Request(url=sub, callback=self.parse_year, meta={ 'cat_id': cat_id, 'page': 1 })) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_year(self, response): items = [] try: logging.log(logging.INFO, 'parse_year: %s' % response.request.url) cat_id = response.request.meta['cat_id'] subs = response.xpath( '//div[@class="mod_indexs bor"]/div[@class="mod_cont"]/ul[3]/li/a/@href' ).extract() for sub in subs: items.append( Request(url=sub, callback=self.parse_sort, meta={ 'cat_id': cat_id, 'page': 1 })) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_sort(self, response): items = [] try: logging.log(logging.INFO, 'parse_sort: %s' % response.request.url) cat_id = response.request.meta['cat_id'] subs = response.xpath( '//div[@class="mod_tab_sort"]/ul/li/a/@href').extract() for sub in subs: items.append( Request(url=sub, callback=self.parse, meta={ 'cat_id': cat_id, 'page': 1 })) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items #for each category parse all its sub-categories or types,called by parse_sort def parse(self, response): items = [] try: page = response.request.meta['page'] logging.log(logging.INFO, 'lev1: %s,%s' % (str(page), response.request.url)) #if int(page) > int(self.max_update_page) and not self.global_spider: # return cat_id = response.request.meta['cat_id'] page = response.request.meta['page'] play_url = "" subs = response.xpath( '//div[@class="grid_18"]/div[2]/div[@class="mod_cont"]/div[@class="mod_item"]' ) # 综艺页面不统一 if not subs: subs = response.xpath( '//div[@class="grid_18"]/div[2]/div[@class="mod_cont"]/div[@class="mod_item pic_160"]' ) for sub in subs: play_url = sub.xpath( './div[@class="mod_txt"]/div[@class="mod_operate"]/a/@href' ).extract() if not play_url: play_url = sub.xpath( './div[@class="mod_txt"]/div[@class="mod_item_tit"]/h6/a/@href' ).extract() 
pic_urls = sub.xpath( './div[@class="mod_pic"]/a/img/@src').extract() pic_url = "" if pic_urls: pic_url = pic_urls[0] items.append( Request(url=play_url[0].strip(), callback=self.parse_episode, meta={ 'cat_id': cat_id, 'poster_url': pic_url })) next_page = response.xpath( "//div[@class='mod_pagenav']/p/a[@title='%s']/@href" % u'下一页').extract() if next_page: snext_page = next_page[0].strip() if snext_page.find("v.qq.com") < 0: snext_page = "http://v.qq.com" + snext_page items.append( Request(url=snext_page, callback=self.parse, meta={ 'page': page + 1, 'cat_id': cat_id })) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_episode_play(self, response): mvitem = None try: logging.log(logging.INFO, 'parse_episode_play: %s' % response.request.url) cat_id = response.request.meta['cat_id'] poster_url = "" untrack_id = "" sid = "" if "untrack_id" in response.request.meta: untrack_id = response.request.meta['untrack_id'] if "sid" in response.request.meta: sid = response.request.meta['sid'] #items = [] #title title_list = response.xpath( '//div[@class="movie_info"]/div[@class="title_wrap"]/h3/a/@title' ).extract() if not title_list: title_list = response.xpath( '//div[@class="intro_lt"]/div[@class="intro_title cf"]/p[@class="title_cn"]/text()' ).extract() #performer performer_list = response.xpath( '//div[@class="movie_info"]/div[@class="movie_detail"]/dl[@class="detail_list"]/dd[@class="actor"]/a/text()' ).extract() #director director_list = response.xpath( '//div[@class="movie_info"]/div[@class="movie_detail"]/dl[@class="detail_list"]/dd[@class="type"]/span[text()="%s"]/a/text()' % u'导演:').extract() #type_list = response.xpath('//div[@class="movie_info"]/div[@class="movie_detail"]/dl[@class="detail_list"]/dd[@class="type"]/span[text()="%s"]/a/text()' % u'导演:').extract() pers = Util.join_list_safely(performer_list) dirs = Util.join_list_safely(director_list) #text text = response.xpath( '//div[@class="movie_info_wrap"]/div[1]/d1[1]/dd[3]/p[@class="detail_all"]/text()' ).extract() ep_item = MediaItem() videoitems = [] #not film if int(cat_id) != int(self.movie_id): #video list #video_list = response.xpath('//div[@class="mod_player_side_inner"]/div[2]/div[1]/div[1]/div[1]/div[1]/ul[1]/li') video_list = response.xpath( '//div[@class="tabcont_warp tabcont_warp_yespadding"]/div[@class="tabcont_album"]/ul[@class="album_list cf"]/li' ) i = 0 for tvideo in video_list: lurl = tvideo.xpath('./a/@href').extract() surl = "" #lnum = tvideo.xpath('./a/@title').extract() lnum = tvideo.xpath('./a/span/text()').extract() vitem = VideoItem() if lnum and lurl: vitem["vnum"] = lnum[0] surl = "http://film.qq.com" + lurl[0] vitem["os_id"] = self.os_id vitem["site_id"] = self.site_id #vitem["cont_id"] = self.get_vid(response.body,surl) turl = "" if cat_id == self.tv_id: turl = Util.normalize_url(surl, "qq", "tv") if cat_id == self.cartoon_id: turl = Util.normalize_url(surl, "qq", "cartoon") else: turl = Util.normalize_url(surl, "qq") if turl: vitem["ext_id"] = Util.md5hash(turl) vitem["url"] = turl vitem["cont_id"] = self.get_qq_showid(vitem["url"]) else: continue videoitems.append(vitem) else: vitem = VideoItem() if title_list: vitem["title"] = title_list[0] vitem["vnum"] = "1" vitem["os_id"] = self.os_id vitem["site_id"] = self.site_id #vitem["cont_id"] = self.get_vid(response.body,response.request.url) turl = Util.normalize_url(response.request.url, "qq") vitem["url"] = turl vitem["ext_id"] = Util.md5hash(turl) vitem["cont_id"] = self.get_qq_showid(vitem["url"]) 
videoitems.append(vitem) if len(title_list) > 0: ep_item["title"] = title_list[0] if len(pers) > 0: ep_item["actor"] = pers if len(dirs) > 0: ep_item["director"] = dirs if len(text) > 0: ep_item["intro"] = text[0] ep_item["site_id"] = self.site_id ep_item["channel_id"] = cat_id ep_item["poster_url"] = poster_url videoid = self.get_qq_showid(response.request.url) #videoid = self.get_vid(response.body,response.request.url) ep_item["cont_id"] = videoid mvitem = MediaVideoItem() mvitem["media"] = ep_item mvitem["video"] = videoitems #mvitem["media"]["url"] = response.request.url mvitem["media"]["url"] = Util.normalize_url( response.request.url, "qq") #mvitem["ext_id"] = Util.md5hash(mvitem["media"]["url"]) if untrack_id: mvitem["untrack_id"] = untrack_id if sid: mvitem["sid"] = sid mvitem["media"]["info_id"] = Util.md5hash( Util.summarize(mvitem["media"])) Util.md5hash(Util.summarize(mvitem["media"])) Util.set_ext_id(mvitem["media"], mvitem["video"]) #items.append(mvitem) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return mvitem #先进入播放页,再进入媒体页,判断是否能进入媒体页,如果不能进入,就直接解析播放页信息 def parse_episode(self, response): items = [] try: logging.log(logging.INFO, 'lev2: %s' % response.request.url) cat_id = response.request.meta['cat_id'] poster_url = response.request.meta['poster_url'] urls = response.xpath( '//div[@class="breadcrumb"]/a[@class="breadcrumb_item"]/@href' ).extract() #carton is different if not urls: turls = response.xpath( '//div[@class="mod_player_head cf"]/div[1]/div[1]/a/@href' ).extract() if turls: tlen = len(turls) urls = [turls[tlen - 1]] if urls: turl = self.url_prefix + urls[0] items.append( Request(url=turl, callback=self.parse_episode_info, meta={ 'cat_id': cat_id, 'poster_url': poster_url })) #不就跳转到媒体页 else: print "2-----------------------not jump to episode ,", response.request.url titem = self.parse_episode_play(response) if titem and self.check_url(titem): items.append(titem) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_episode_info(self, response): items = [] try: logging.log(logging.INFO, 'parse_episode_info: %s' % response.request.url) cat_id = response.request.meta['cat_id'] poster_url = response.request.meta['poster_url'] untrack_id = "" sid = "" if "untrack_id" in response.request.meta: untrack_id = response.request.meta['untrack_id'] if "sid" in response.request.meta: sid = response.request.meta['sid'] #title title = response.xpath( '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_title"]/strong/a/text()' ).extract() if not title or not title[0]: title = response.xpath( '//div[@class="mod_box mod_video_info"]/div[@class="mod_hd mod_hd_border"]/h1/strong/@title' ).extract() if not title or not title[0]: title = response.xpath( '//div[@class="mod_box mod_video_info"]/div[@class="mod_hd mod_hd_border"]/h2/strong/@title' ).extract() if not title or not title[0]: title = response.xpath( '//div[@class="mod_page_banner"]/div[@class="banner_pic"]/a/@title' ).extract() #performer #performer_list = response.xpath('//div[@class="mod_video_intro mod_video_intro_rich"]/div[2]/div[2]/div[1]/a/span/text()').extract() performer_list = response.xpath( '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line cf"]/div[@class="info_cast"]/a/span/text()' ).extract() if not performer_list: performer_list = response.xpath( '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/span/text()' % 
u'主演:').extract() #director #director_list=response.xpath('//div[@class="mod_video_intro mod_video_intro_rich"]/div[2]/div[3]/div[1]/a/span/text()').extract() director_list = response.xpath( '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line cf"]/div[@class="info_director"]/a/span/text()' ).extract() if not director_list: director_list = response.xpath( '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/span/text()' % u'导演:').extract() #text text = response.xpath( '//div[@class="movie_info_wrap"]/div[1]/d1[1]/dd[3]/p[@class="detail_all"]/text()' ).extract() if not text: response.xpath( '//div[@class="mod_video_focus"]/div[@class="info_desc"]/span[@class="desc"]/text()' ).extract() type_list = response.xpath( '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line info_line_tags cf"]/div[@class="info_tags"]/a/span/text()' ).extract() if not type_list: type_list = response.xpath( '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/text()' % u'类型:').extract() year_info = response.xpath( '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_title"]/span[@class="video_current_state"]/span[@class="current_state"]/text()' ).extract() if not year_info: year_info = response.xpath( '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/text()' % u'年份:').extract() play_date = None if year_info: play_date = self.get_year(year_info[0]) # dirs = Util.join_list_safely(director_list) types = Util.join_list_safely(type_list) pers = Util.join_list_safely(performer_list) #sourceid sourceid = "" sourceid_list = response.xpath( '//div[@class="mod_bd sourceCont"]/@sourceid').extract() if sourceid_list: sourceid = sourceid_list[0] videoitems = [] ep_item = MediaItem() if len(title) > 0: ep_item["title"] = title[0] if len(pers) > 0: ep_item["actor"] = pers if len(dirs) > 0: ep_item["director"] = dirs if types: ep_item["type"] = types if play_date: ep_item["release_date"] = Util.str2date(play_date) ep_item["site_id"] = self.site_id ep_item["channel_id"] = cat_id ep_item["url"] = Util.normalize_url(response.request.url, "qq") ep_item["poster_url"] = poster_url if len(text) > 0: ep_item["intro"] = text[0] mvitem = MediaVideoItem() mvitem["media"] = ep_item mvitem["video"] = videoitems vurl = "" url_pre = "http://s.video.qq.com/loadplaylist?vkey=" url_tail = "&vtype=2&otype=json&video_type=2&callback=jQuery191048201349820010364_1425370006500&low_login=1" videoid = self.get_qq_showid(response.request.url) #videoid = self.get_vid(response.body,response.request.url) mvitem["media"]["cont_id"] = videoid mvitem["media"]["info_id"] = Util.md5hash( Util.summarize(mvitem["media"])) vurl = url_pre + str(sourceid) + url_tail tflag = "jQuery191048201349820010364_1425370006500" tpitem = self.parse_play_list(cat_id, vurl, tflag, response) #没有sourceid,比如专题页面 if not tpitem: tpitem = self.parse_topic_play_list(response) videoids = response.xpath( '//div[@class="mod_episodes_info episodes_info"]/input[@name="cid"]/@value' ).extract() if videoids: mvitem["media"]["cont_id"] = videoids[0] if tpitem: mvitem["video"] = tpitem Util.set_ext_id(mvitem["media"], mvitem["video"]) if untrack_id: mvitem["untrack_id"] = untrack_id if sid: mvitem["sid"] = sid if self.check_url(mvitem): items.append(mvitem) except Exception as e: logging.log(logging.ERROR, 
traceback.format_exc()) return items def parse_play_list(self, cat_id, url, flag, response): item = None videoitems = [] try: ep_item = MediaItem() item = MediaVideoItem() item["media"] = ep_item item['video'] = videoitems info = None try: info = self.httpdownload.get_data(url) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return videoitems if not info or len(info) < 2: return videoitems msg = info bodylen = len(msg) - 1 index = msg.find(flag) + len(flag) + 1 info = msg[index:bodylen] jinfo = json.loads(info) if "video_play_list" not in jinfo: return videoitems itemlist = jinfo["video_play_list"]["playlist"] for titem in itemlist: if "episode_number" not in titem: continue info = titem["episode_number"] if info and titem["title"].find(u"预告") < 0 and url.find( "qq.com") >= 0: vitem = VideoItem() vitem["title"] = titem["title"] tvnum = string.replace(info, "-", "") #集数不是数字,是字符串,http://v.qq.com/detail/x/xk98t8hntls72f4.html tvnum_list = re.findall(r'[\D]+', tvnum) if not tvnum_list: vitem["vnum"] = string.replace(info, "-", "") else: continue vitem["os_id"] = self.os_id vitem["site_id"] = self.site_id turl = "" if int(cat_id) == int(self.tv_id) or int(cat_id) == int( self.cartoon_id): turl = Util.normalize_url(titem["url"], "qq", "tv") else: turl = Util.normalize_url(titem["url"], "qq") if turl: vitem["ext_id"] = Util.md5hash(turl) #vitem["cont_id"] = self.get_vid(response.body,turl) vitem["url"] = turl vitem["cont_id"] = self.get_qq_showid(vitem["url"]) else: continue videoitems.append(vitem) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return videoitems def parse_topic_play_list(self, response): item = None videoitems = [] try: subs = response.xpath( '//div[@class="mod_video_fragments"]/div[@class="mod_figures_1"]/ul/li' ) for sub in subs: vitem = VideoItem() title = sub.xpath('./strong/a/text()').extract() vitem["os_id"] = self.os_id vitem["site_id"] = self.site_id turl = sub.xpath('./strong/a/@href').extract() if title and title[0].find(u"预告") < 0: if turl and turl[0].find(".com") < 0 or ( turl and turl[0].find("qq.com") >= 0): vitem["title"] = title[0].strip() vitem["vnum"] = self.get_num(vitem["title"]) sturl = turl[0] if turl[0].find("qq.com") < 0: sturl = self.url_prefix + turl[0] vitem["url"] = Util.normalize_url(sturl, "qq", "tv") vitem["ext_id"] = Util.md5hash(vitem["url"]) vitem["cont_id"] = self.get_qq_showid(vitem["url"]) videoitems.append(vitem) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return videoitems def get_qq_showid(self, url): id = "" try: #http://v.qq.com/detail/j/jlw8mddv9wkv1a3.html #http://film.qq.com/cover/y/yuq5nnt2wwlwfle.html #r = re.compile(r'http://.+/id_([^_]+).*\.html') #r = re.compile(r'http://.+/.+/[0-9a-zA-Z]/([^_]+).*\.html') r = re.compile(r'http://[^/]*.qq.com/cover/.+?/([^/]*).html') m = r.match(url) if m: return m.group(1) else: r = re.compile(r'http://[^/]*.qq.com/[^/]*/.+?/([^/]*).html') m = r.match(url) if m: return m.group(1) except Exception as e: pass return id def get_vid(self, content, url): id = "" try: #url=http://v.qq.com/cover/k/krl2051za26trxu.html?vid=r0016fx050p" if url and url.find("vid") != -1: r = re.compile(r'.*[?&]vid=([^&]+)') m = r.search(url) if m: id = m.group(1) if not id and len(content) > 0: #vid:"f0016l11uqt" #r = re.compile(r'vid:.([^"])"') r = re.compile(r'vid:.(.*)".*') m = r.search(content) if m: id = m.group(1) if not id: #r = re.compile(r".*vid.:.(.*)'.*") r = re.compile(r".*vid.:.'(.*)'.*") m = r.search(content) if m: id = 
m.group(1) if not id: id = self.get_qq_showid(url) except Exception as e: pass return id def convert_url(self, url): res = url try: pass except Exception as e: pass return res def check_all(self, mvitem): res = True try: if 'video' not in mvitem: res = False if 'video' in mvitem: if len(mvitem['video']) == 0: res = False if res: res = self.check_url(mvitem) except Exception as e: pass return res def check_url(self, mvitem): res = True try: if 'video' in mvitem: for video in mvitem['video']: if 'url' in video: tres = self.is_same_site(video['url']) if not tres: res = False break except Exception as e: pass return res def is_same_site(self, url): res = True try: tsite = Util.guess_site(url) if tsite != self.site_name: res = False except Exception as e: pass res = False return res def get_year(self, data): year = None try: #r = re.compile(r'.*([\d]+).*') #m = r.match(data) #m = r.search(data) #if m: # print "get year",data,m.group(1) # return m.group(1) tyear = re.findall(r'[\d]+', data) if tyear: return tyear[0] except Exception as e: pass return year def get_num(self, data): num = None try: #r = re.compile(r'.*(\d+).*') #m = r.search(data) #if m: # return m.group(1) num = re.findall(r'[\d]+', data) if num: return num[0] except Exception as e: pass return num
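# Illustrative, self-contained sketch (not part of qq_spider): the same regexes used
# by get_qq_showid() above, applied to the sample URLs quoted in that method's comments
# (a film.qq.com cover page and a v.qq.com detail page).  The function name is only
# for this demo.
import re

def _demo_qq_showid_extraction():
    cover_url = 'http://film.qq.com/cover/y/yuq5nnt2wwlwfle.html'
    detail_url = 'http://v.qq.com/detail/j/jlw8mddv9wkv1a3.html'
    cover_id = re.match(r'http://[^/]*.qq.com/cover/.+?/([^/]*).html', cover_url).group(1)    # -> 'yuq5nnt2wwlwfle'
    detail_id = re.match(r'http://[^/]*.qq.com/[^/]*/.+?/([^/]*).html', detail_url).group(1)  # -> 'jlw8mddv9wkv1a3'
    return cover_id, detail_id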
class sohu_spider(Spider): name = "sohu" pipelines = ['CategoryPipeline', 'MysqlStorePipeline'] site_code = "sohu" #sohu site_id = "" #sohu allowed_domains = ["so.tv.sohu.com", "tv.sohu.com"] url_prefix = 'http://so.tv.sohu.com' #used for guess_site site_name = Util.guess_site(url_prefix) mgr = DbManager.instance() os_id = mgr.get_os('web')["os_id"] site_id = str(mgr.get_site(site_code)["site_id"]) channel_map = {} channel_map = mgr.get_channel_map() max_update_page = get_project_settings().get('MAX_UPDATE_PAGE') global_spider = True httpdownload = HTTPDownload() channel_info = {} movie_id = None tv_id = None variety_id = None cartoon_id = None test_page_url = None test_channel_id = None cmd_json = {} album_api = 'http://pl.hd.sohu.com/videolist?playlistid=%s&pagenum=%s' def __init__(self, json_data=None, *args, **kwargs): super(sohu_spider, self).__init__(*args, **kwargs) self._cat_urls = [] tcat_urls = [] if json_data: data = json.loads(json_data) if "type" in data: spider_type = data["type"] if spider_type != "global": self.global_spider = False tasks = [] if "id" in data and "url" in data: ttask = {} ttask["id"] = data["id"] ttask["url"] = data["url"] ttask["sid"] = "" ttask["untrack_id"] = "" self._cat_urls.append(ttask) cmd = data["cmd"] if cmd == "assign": tasks = data["task"] elif cmd == "trig": stat = data['stat'] if 'stat' in data else None tasks = self.mgr.get_untrack_url(self.site_code, stat) elif cmd == 'carpet': tasks = self.mgr.get_video_url(self.site_code) elif cmd == "test" and 'id' in data and 'url' in data: self.test_page_url = data["url"] self.test_channel_id = data["id"] elif cmd == "episode" and 'id' in data and 'url' in data: self.cmd_json = data elif cmd == "debug": #tasks = [{"mid":"503669", "url":"http://tv.sohu.com/20151204/n429762764.shtml", "name":"综艺", "code":"variaty"}] #tasks = [{"mid":"510798", "url":"http://tv.sohu.com/20090824/n266189779.shtml", "name":"综艺", "code":"variaty"}] tasks = [{ "mid": "502525", "url": "http://tv.sohu.com/20110617/n310505202.shtml", "name": "综艺", "code": "variaty" }] for task in tasks: ttask = {} ttask["url"] = task["url"] code = task["code"] ttask["id"] = self.channel_map[code] ttask["untrack_id"] = task[ "untrack_id"] if 'untrack_id' in task else None ttask["sid"] = task["sid"] if 'sid' in task else None ttask['mid'] = task['mid'] if 'mid' in task else None self._cat_urls.append(ttask) def start_requests(self): try: items = [] self.movie_id = str(self.mgr.get_channel('电影')["channel_id"]) self.tv_id = str(self.mgr.get_channel('电视剧')["channel_id"]) self.variety_id = str(self.mgr.get_channel('综艺')["channel_id"]) self.cartoon_id = str(self.mgr.get_channel('动漫')["channel_id"]) self.channel_info = { self.movie_id: u"电影", self.tv_id: u"电视剧", self.variety_id: u"综艺", self.cartoon_id: u"动漫" } if self.test_page_url: turl = Util.normalize_url(self.test_page_url, "sohu") items.append( Request(url=self.test_page_url, callback=self.parse_page, meta={ 'cat_id': self.test_channel_id, 'page': 1 })) return items if self.cmd_json: items.append( Request(url=self.cmd_json['url'], callback=self.parse_episode_info, meta={ 'cat_id': self.cmd_json["id"], 'poster_url': '' })) return items if not self._cat_urls: #cat_urls = [{'url':'http://so.tv.sohu.com/list_p1106_p2_p3_p4_p5_p6_p73_p8_p9_p10_p11_p12_p13.html','id':self.variety_id}] cat_urls = [{ 'url': 'http://so.tv.sohu.com/list_p1100_p2_p3_p4_p5_p6_p73_p80_p9_2d1_p10_p11_p12_p13.html', 'id': self.movie_id }, { 'url': 'http://so.tv.sohu.com/list_p1101_p2_p3_p4_p5_p6_p73_p8_p9_p10_p11_p12_p13.html', 
'id': self.tv_id }, { 'url': 'http://so.tv.sohu.com/list_p1106_p2_p3_p4_p5_p6_p73_p8_p9_p10_p11_p12_p13.html', 'id': self.variety_id }, { 'url': 'http://so.tv.sohu.com/list_p1115_p2_p3_p4_p5_p6_p73_p8_p9_p10_p11_p12_p13.html', 'id': self.cartoon_id }] #cat_urls = [{'url':'http://so.tv.sohu.com/list_p1100_p2_p3_p4_p5_p6_p73_p80_p9_2d1_p10_p11_p12_p13.html','id':self.movie_id}] for cat in cat_urls: items.append( Request(url=cat['url'], callback=self.parse_type, meta={ 'cat_id': cat['id'], 'page': 1 })) else: for cat in self._cat_urls: items.append( Request(url=cat['url'], callback=self.parse_single_episode, meta={ 'cat_id': cat["id"], 'page': 1, "untrack_id": cat["untrack_id"], "sid": cat["sid"], "mid": cat["mid"] })) return items except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) def parse_single_episode(self, response): items = [] try: logging.log(logging.INFO, 'parse_single_episode: %s' % response.request.url) cat_id = response.request.meta['cat_id'] untrack_id = response.request.meta['untrack_id'] sid = response.request.meta['sid'] mid = response.request.meta[ 'mid'] if 'mid' in response.request.meta else "" playtype_list = response.selector.re( re.compile(r'var pagetype = .*?(\D+)')) #发现新的类型页面,http://tv.sohu.com/20100804/n273985736.shtml #http://my.tv.sohu.com/us/49390690/29200993.shtml 该URL利用现有的逻辑无法爬取到 urls = response.xpath( '//div[@id="crumbsBar"]/div[@class="area cfix"]/div[@class="left"]/div[@class="crumbs"]/a[last()]' ) attributes = urls.xpath('./@*').extract() size = len(attributes) urls = urls.xpath('./@href').extract() if size == 1 and urls and not playtype_list: for iurl in urls: surl = Util.normalize_url(iurl, "sohu") if surl and "http" in surl: items.append( Request(url=surl, callback=self.parse_episode_info, meta={ 'cat_id': cat_id, 'poster_url': '', 'page': 1, "untrack_id": untrack_id, "sid": sid, "mid": mid })) #付费电影,不能跳转到媒体页 else: mvitem = self.parse_episode_play(response, untrack_id, sid) if mid: mvitem['mid'] = mid if mvitem and "media" in mvitem and "url" in mvitem[ "media"] and "ext_id" in mvitem["media"]: if self.check_url(mvitem): items.append(mvitem) if not items: mvitem = MediaVideoItem() if mid: mvitem['mid'] = mid if untrack_id and sid: mvitem["untrack_id"] = untrack_id mvitem["sid"] = sid ep_item = MediaItem() ep_item["site_id"] = self.site_id ep_item["channel_id"] = cat_id mvitem["media"] = ep_item playlistId = "" playlistId_list = response.selector.re( re.compile(r'var playlistId.*?(\d+)')) if not playlistId_list: playlistId_list = response.selector.re( re.compile(r'var PLAYLIST_ID.*?(\d+)')) if not playlistId_list: playlistId_list = response.selector.re( re.compile(r'= playlistId.*?(\d+)')) if playlistId_list: playlistId = playlistId_list[0] items += self.api_episode_info(mvItem=mvitem, playlistId=playlistId, cat_id=cat_id) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_type(self, response): items = [] try: #logging.log(logging.INFO, 'parse_typ: %s' % response.request.url) cat_id = response.request.meta['cat_id'] subs = response.xpath( '//div[@class="sort-type"]/dl[1]/dd[@class="sort-tag"]/a/@href' ).extract() for sub in subs: items.append( Request(url=self.url_prefix + sub, callback=self.parse_area, meta={ 'cat_id': cat_id, 'page': 1 })) titem = self.parse_page(response) if titem: items.extend(titem) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_area(self, response): items = [] try: #logging.log(logging.INFO, 'parse_area: %s' % 
response.request.url) cat_id = response.request.meta['cat_id'] subs = response.xpath( '//div[@class="sort-type"]/dl[2]/dd[@class="sort-tag"]/a/@href' ).extract() for sub in subs: items.append( Request(url=self.url_prefix + sub, callback=self.parse_sort, meta={ 'cat_id': cat_id, 'page': 1 })) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_sort(self, response): items = [] try: #logging.log(logging.INFO, 'parse_sort: %s' % response.request.url) cat_id = response.request.meta['cat_id'] subs = response.xpath( '//div[@class="sort-column area"]/div[@class="column-hd"]/p[@class="st-link"]/a/@href' ).extract() for sub in subs: items.append( Request(url=self.url_prefix + sub, callback=self.parse_page, meta={ 'cat_id': cat_id, 'page': 1 })) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_page(self, response): try: cat_id = response.request.meta['cat_id'] page = response.request.meta['page'] #logging.log(logging.INFO, 'parse_page: %s,%s' % (response.request.url,page)) #if int(page) > int(self.max_update_page) and not self.global_spider: # return items = [] play_url = "" subs = response.xpath('//div[@class="column-bd cfix"]/ul[1]/li') for sub in subs: play_url = sub.xpath( './div[@class="st-pic"]/a/@href').extract() pic_urls = sub.xpath( './div[@class="st-pic"]/a/img/@src').extract() pic_url = "" if pic_urls: pic_url = pic_urls[0] if play_url: items.append( Request(url=play_url[0].strip(), callback=self.parse_episode_info, meta={ 'cat_id': cat_id, 'poster_url': pic_url })) next_page = response.xpath( "//div[@class='column-bd cfix']/div[1]/a[@title='%s']/@href" % u'下一页').extract() if next_page: snext_page = next_page[0].strip() if snext_page.find(self.url_prefix) < 0: snext_page = self.url_prefix + snext_page items.append( Request(url=snext_page, callback=self.parse_page, meta={ 'page': page + 1, 'cat_id': cat_id })) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_episode_info(self, response): items = [] try: logging.log(logging.INFO, 'parse_episode_info: %s' % response.request.url) cat_id = response.request.meta['cat_id'] poster_url = response.request.meta['poster_url'] untrack_id = "" sid = "" mid = "" if "untrack_id" in response.request.meta: untrack_id = response.request.meta['untrack_id'] if "sid" in response.request.meta: sid = response.request.meta['sid'] if "mid" in response.request.meta: mid = response.request.meta['mid'] year_list = [] lyears = [] playlistId = "" playlistId_list = response.selector.re( re.compile(r'var playlistId.*?(\d+)')) if not playlistId_list: playlistId_list = response.selector.re( re.compile(r'var PLAYLIST_ID.*?(\d+)')) if not playlistId_list: playlistId_list = response.selector.re( re.compile(r'= playlistId.*?(\d+)')) if playlistId_list: playlistId = playlistId_list[0] if not playlistId: logging.log( logging.INFO, "parse_episode_info error,not find playlistid,url:%s " % response.request.url) return items title_list = self.parse_title(response, cat_id) performer_list = self.parse_actor(response) director_list = self.parse_director(response) district_list = self.parse_district(response) type_list = self.parse_type_list(response) #year_list = response.xpath('//div[@class="mod plot"]/ul[@class="filter"]/li[@class="v-year"]/a/em/text()').extract() year_list = self.parse_year(response) year = None if year_list: year = year_list[0] #pers = "|".join([t.strip() for t in performer_list]) #dirs = "|".join([t.strip() for t in 
director_list]) pers = Util.join_list_safely(performer_list) dirs = Util.join_list_safely(director_list) types = Util.join_list_safely(type_list) district = Util.join_list_safely(district_list) #text text = response.xpath( '//div[@class="movieCont mod"]/p[1]/span[@class="full_intro"]/text()' ).extract() play_url = "" play_url = response.xpath( '//div[@class="cfix movie-info"]/div[2]/div[@class="cfix bot"]/a[@class="btn-playFea"]/@href' ).extract() videoitems = [] ep_item = MediaItem() if title_list: ep_item["title"] = title_list[0] ep_item["actor"] = pers ep_item["director"] = dirs if types: ep_item["type"] = types if district: ep_item["district"] = district if year: ep_item["release_date"] = Util.str2date(year) ep_item["site_id"] = self.site_id ep_item["channel_id"] = cat_id ep_item["poster_url"] = poster_url ep_item["url"] = Util.normalize_url(response.request.url, "sohu") playlistId = str(playlistId) ep_item["cont_id"] = playlistId if len(text) > 0: ep_item["intro"] = text[0].strip() mvitem = MediaVideoItem() if mid: mvitem['mid'] = mid if untrack_id and sid: mvitem["untrack_id"] = untrack_id mvitem["sid"] = sid mvitem["media"] = ep_item vurl = "" ttvitem = [] if title_list: ttvitem = self.parse_video_item(cat_id, playlistId) if ttvitem: mvitem['video'] = ttvitem mvitem["media"]["info_id"] = Util.md5hash( Util.summarize(mvitem["media"])) Util.set_ext_id(mvitem["media"], mvitem["video"]) if self.check_url(mvitem): items.append(mvitem) if not items and playlistId: items += self.api_episode_info(mvitem, playlistId, cat_id=cat_id) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def api_episode_info(self, mvItem=None, playlistId='', cat_id=''): # 应该保证mvItem,playlistId不为空,且包含mid或者sid、untrack_id,包含channel_id、site_id items = [] try: mvitem = mvItem ep_item = mvitem["media"] url = self.album_api % (playlistId, 1) logging.log(logging.INFO, 'api_episode_info, info url %s' % url) info = self.httpdownload.get_data(url) info = info.decode('gbk').encode('utf-8') info_json = json.loads(info) actor_list = info_json.get("mainActors") director_list = info_json.get("directors") type_list = info_json.get("categories") if "actor" not in ep_item and actor_list: ep_item["actor"] = Util.join_list_safely(actor_list) if "director" not in ep_item and director_list: ep_item["director"] = Util.join_list_safely(director_list) if "type" not in ep_item and type_list: ep_item["type"] = Util.join_list_safely(type_list) if "title" not in ep_item: ep_item["title"] = info_json.get("albumName") if "district" not in ep_item: ep_item["district"] = info_json.get("area") if "release_date" not in ep_item and info_json.get("publishYear"): ep_item["release_date"] = Util.str2date( str(info_json.get("publishYear"))) if "intro" not in ep_item: ep_item["intro"] = info_json.get("albumDesc") if "poster_url" not in ep_item or not str.strip( str(ep_item["poster_url"])): ep_item["poster_url"] = info_json.get("pic240_330") if "cont_id" not in ep_item: ep_item["cont_id"] = playlistId ttvitem = [] if ep_item['title']: mvitem['media'] = ep_item ttvitem = self.parse_video_item(cat_id, playlistId) if ttvitem: mvitem['video'] = ttvitem if "url" not in mvitem["media"]: mvitem["media"]["url"] = ttvitem[0]['url'] mvitem["media"]["info_id"] = Util.md5hash( Util.summarize(mvitem["media"])) Util.set_ext_id(mvitem["media"], mvitem["video"]) if self.check_url(mvitem): items.append(mvitem) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_episode_play(self, response, 
untrack_id, sid): mvitem = None try: logging.log(logging.INFO, 'parse_episode_play: %s' % response.request.url) cat_id = response.request.meta['cat_id'] #vip title_list = response.xpath( '//div[@id="crumbsBar"]/div[@class="area cfix"]/div[@class="left"]/h2/@title' ).extract() director_list = response.xpath( '//div[@class="info info-con"]/ul/li[text()="%s"]/a/text()' % u'导演:').extract() performer_list = response.xpath( '//div[@class="info info-con"]/ul/li[text()="%s"]/a/text()' % u'主演:').extract() text = response.xpath( '//div[@class="info info-con"]/p[@class="intro"]/text()' ).extract() pers = "|".join([t.strip() for t in performer_list]) dirs = "|".join([t.strip() for t in director_list]) playlistId = "" playlistId_list = response.selector.re( re.compile(r'var playlistId.*?(\d+)')) if not playlistId_list: playlistId_list = response.selector.re( re.compile(r'var PLAYLIST_ID.*?(\d+)')) if not playlistId_list: playlistId_list = response.selector.re( re.compile(r'= playlistId.*?(\d+)')) if playlistId_list: playlistId = playlistId_list[0] vid = "" vid_list = response.selector.re(re.compile(r'var vid.*?(\d+)')) if vid_list: vid = vid_list[0] if not playlistId or not vid: return mvitem ep_item = MediaItem() ep_item["cont_id"] = playlistId if title_list: ep_item["title"] = title_list[0] ep_item["actor"] = pers ep_item["director"] = dirs ep_item["site_id"] = self.site_id ep_item["channel_id"] = cat_id ep_item["url"] = Util.normalize_url(response.request.url, "sohu") if text: ep_item["intro"] = text[0].strip() mvitem = MediaVideoItem() mvitem["media"] = ep_item if untrack_id: mvitem["untrack_id"] = untrack_id if sid: mvitem["sid"] = sid vitem = VideoItem() vitem["title"] = ep_item["title"] if 'title' in ep_item else None vitem["url"] = ep_item["url"] vitem["vnum"] = "1" vitem["os_id"] = self.os_id vitem["ext_id"] = Util.md5hash(ep_item["url"]) vitem["site_id"] = self.site_id vitem["cont_id"] = vid videoitems = [] videoitems.append(vitem) mvitem["video"] = videoitems mvitem["media"]["info_id"] = Util.md5hash( Util.summarize(mvitem["media"])) Util.set_ext_id(mvitem["media"], mvitem["video"]) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return mvitem def parse_title(self, response, cat_id): gtitle = [] title = [] try: title = response.xpath( '//div[@class="wrapper"]/div[1]/h2/text()').extract() gtitle = self.strip_title(cat_id, title) if not gtitle: title = response.xpath( '//div[@class="wrapper"]/div[1]/h2/text()').extract() gtitle = self.strip_title(cat_id, title) if not gtitle: title = response.xpath( '//div[@id="contentA"]/div[@class="right"]/div[@class="blockRA bord clear"]/h2/span/text()' ).extract() gtitle = self.strip_title(cat_id, title) if not gtitle: title = response.xpath( '//div[@class="wrapper"]/div[1]/h2/span/text()').extract() gtitle = self.strip_title(cat_id, title) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return gtitle def strip_title(self, cat_id, title): gtitle = [] try: if len(title): ttitle = title[0].strip() index = ttitle.find(self.channel_info[str(cat_id)]) len1 = 0 if index >= 0: len1 = len(self.channel_info[str(cat_id)]) + 1 else: index = 0 tinfo = ttitle[index + len1:] if len(tinfo) > 0: gtitle.append(tinfo) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return gtitle def parse_actor(self, response): performer_list = [] try: performer_list = response.xpath( '//div[@class="movie-infoR"]/ul[@class="cfix mB20"]/li/span[text()="%s"]/../a/text()' % u'主演:').extract() if not performer_list: 
performer_list = response.xpath( '//div[@class="infoR"]/ul/li/span[text()="%s"]/../a/text()' % u'主持人:').extract() if not performer_list: performer_list = response.xpath( '//div[@id="contentA"]/div[@class="right"]/div[@class="blockRA bord clear"]/div[@class="cont"]/p[text()="%s"]/a/text()' % u'配音:').extract() if not performer_list: performer_list = response.xpath( '//div[@id="contentA"]/div[@class="right"]/div[@class="blockRA bord clear"]/div[@class="cont"]/p[text()="%s"]/a/text()' % u'声优:').extract() if not performer_list: performer_list = response.xpath( '//div[@id="contentA"]/div[@class="right"]/div[@class="blockRA bord clear"]/div[@class="cont"]/p[text()="%s"]/a/text()' % u'主演:').extract() if not performer_list: performer_list = response.xpath( '//div[@class="drama-infoR"]/ul[@class="cfix"]/li/span[text()="%s"]/../a/text()' % u'主演:').extract() except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return performer_list def parse_type_list(self, response): type_list = [] try: type_list = response.xpath( '//div[@class="movie-infoR"]/ul[@class="cfix mB20"]/li/span[text()="%s"]/../a/text()' % u'类型:').extract() if not type_list: type_list = response.xpath( '//div[@id="contentA"]/div[@class="right"]/div[@class="blockRA bord clear"]/div[@class="cont"]/p[text()="%s"]/a/text()' % u'类型:').extract() if not type_list: type_list = performer_list = response.xpath( '//div[@class="drama-infoR"]/ul[@class="cfix"]/li/span[text()="%s"]/../a/text()' % u'类型:').extract() except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return type_list def parse_district(self, response): type_list = [] try: type_list = response.xpath( '//div[@class="movie-infoR"]/ul[@class="cfix mB20"]/li/span[text()="%s"]/../a/text()' % u'地区:').extract() if not type_list: type_list = response.xpath( '//div[@id="contentA"]/div[@class="right"]/div[@class="blockRA bord clear"]/div[@class="cont"]/p[text()="%s"]/a/text()' % u'地区:').extract() if not type_list: type_list = performer_list = response.xpath( '//div[@class="drama-infoR"]/ul[@class="cfix"]/li/span[text()="%s"]/../a/text()' % u'地区:').extract() except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return type_list def parse_year(self, response): type_list = [] try: type_list = response.xpath( '//div[@class="movie-infoR"]/ul[@class="cfix mB20"]/li/span[text()="%s"]/../a/text()' % u'上映时间:').extract() if not type_list: type_list = response.xpath( '//div[@id="contentA"]/div[@class="right"]/div[@class="blockRA bord clear"]/div[@class="cont"]/p[text()="%s"]/a/text()' % u'上映时间:').extract() if not type_list: type_list = response.xpath( '//div[@class="drama-infoR"]/ul[@class="cfix"]/li/span[text()="%s"]/../text()' % u'上映时间:').extract() except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return type_list def parse_director(self, response): director_list = [] try: director_list = response.xpath( '//div[@class="movie-infoR"]/ul[@class="cfix mB20"]/li/span[text()="%s"]/../a/text()' % u'导演:').extract() if not director_list: director_list = response.xpath( '//div[@id="contentA"]/div[@class="right"]/div[@class="blockRA bord clear"]/div[@class="cont"]/p[text()="%s"]/a/text()' % u'导演:').extract() if not director_list: director_list = response.xpath( '//div[@id="contentA"]/div[@class="right"]/div[@class="blockRA bord clear"]/div[@class="cont"]/p[text()="%s"]/a/text()' % u'监督:').extract() if not director_list: director_list = response.xpath( '//div[@class="drama-infoR"]/ul[@class="cfix"]/li/span[text()="%s"]/../a/text()' % 
u'导演:').extract() except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return director_list def parse_video_item(self, cat_id, playlistId): logging.log(logging.INFO, 'parse_video_item , playlistId %s' % playlistId) videoitems = [] try: #新接口代码 page = 1 while True: page_items = self.parse_videos_info(cat_id, playlistId, page) if not page_items: break videoitems = videoitems + page_items page = page + 1 except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return videoitems def parse_videos_info(self, cat_id, playlistId, page): videoitems = [] try: url = self.album_api % (playlistId, page) logging.log(logging.INFO, 'parse_videos_info, info url %s' % url) info = self.httpdownload.get_data(url) info = info.decode('gbk').encode('utf-8') info_json = json.loads(info) videos = info_json['videos'] if int(cat_id) == int(self.variety_id): for video in videos: tvSType = str( video['tvSType']) if 'tvSType' in video else '-1' if tvSType != '1' and tvSType != '36': continue #综艺采用日期 play_num = self.get_play_num(video['showDate']) if not play_num: play_num = self.get_play_num_date(video['publishTime']) vitem = self.compose_vitem([video['pageUrl']], [video['name']], play_num) vitem['cont_id'] = video['vid'] vitem['thumb_url'] = video['smallPicUrl'] videoitems.append(vitem) else: for video in videos: tvSType = str( video['tvSType']) if 'tvSType' in video else '-1' if tvSType != '1' and tvSType != '36': continue #非综艺采用order play_num = self.get_play_num(video['order']) vitem = self.compose_vitem([video['pageUrl']], [video['name']], play_num) vitem['cont_id'] = video['vid'] vitem['thumb_url'] = video['smallPicUrl'] videoitems.append(vitem) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return videoitems def parse_variety_info(self, playlistId, response): logging.log(logging.INFO, 'parse_variety_info, info url %s' % response.request.url) videoitems = [] try: year_list = response.xpath( '//div[@class="mod plot"]/ul[@class="filter"]/li[@class="v-year"]/a/em/text()' ).extract() if not year_list: year_list = ["2015", "2014", "2013", "2012", "2011", "2010"] for year in year_list: turl1 = "http://tv.sohu.com/item/VideoServlet?source=sohu&id=" + str( playlistId) + "&year=" + year + "&month=0&page=1" info = self.httpdownload.get_data(turl1) videolist = self.parse_play_list(info) if videolist: for titem in videolist: videoitems.append(titem) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return videoitems def parse_play_list(self, info): videoitems = [] try: if not info or len(info) < len("{pageTotal: 1,videos:[]"): return None jinfo = {} try: jinfo = json.loads(info) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) if "videos" not in jinfo: return videoitems itemlist = jinfo["videos"] for titem in itemlist: vitem = self.compose_vitem([titem["url"]], [titem["title"]], titem["showDate"]) vitem['cont_id'] = titem['id'] videoitems.append(vitem) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return videoitems def get_carton_list(self, response): videoitems = [] try: ul_list = response.xpath( '//div[@id="blockA"]/div[@id="allist"]/div[@id="list_asc"]/div[@class="pp similarLists"]/ul' ) for ul in ul_list: li_list = ul.xpath('./li') for li in li_list: url = li.xpath('./a/@href').extract() ttitle = li.xpath('./span/strong/a/text()').extract() play_num = self.get_play_num(ttitle[0]) vitem = self.compose_vitem(url, ttitle, play_num) if 'url' in vitem: videoitems.append(vitem) except 
Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return videoitems def compose_vitem(self, url_list, title_list, vnum): vitem = VideoItem() try: if not url_list: return vitem if title_list: vitem["title"] = title_list[0].strip() turl = Util.normalize_url(url_list[0], "sohu") vitem["url"] = turl vitem["vnum"] = str(vnum) vitem["os_id"] = self.os_id vitem["ext_id"] = Util.md5hash(turl) vitem["site_id"] = self.site_id except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return vitem def get_sohu_showid(self, url): id = "" try: #http://tv.sohu.com/item/MTE4NTk2MA==.html #http://tv.sohu.com/item/MTE0NjQwNg==.html #r = re.compile(r'http://tv.sohu.com.+?/[^/]*./([^/]*)\.html') r = re.compile(r'http://tv.sohu.com/[^/].*/([^/].*)\.[s]?html') m = r.match(url) if m: return m.group(1) except Exception as e: pass return id def get_play_num(self, title): num = "" try: num_list = re.findall('([\d]+)', title) if num_list: num_size = len(num_list) num = num_list[num_size - 1] except Exception as e: pass return num def get_play_num_date(self, title): num = "" try: num_list = re.findall('([\d]+)', title) if num_list: num = "".join(num_list) except Exception as e: pass return num def check_url(self, mvitem): res = True try: if 'video' in mvitem: for video in mvitem['video']: if 'url' in video: tres = self.is_same_site(video['url']) if not tres: res = False break except Exception as e: pass return res def is_same_site(self, url): res = True try: tsite = Util.guess_site(url) if tsite != self.site_name: res = False except Exception as e: pass res = False return res def get_year(self, info): year = None try: r = re.compile(ur'.*%s.*(\d+).*' % (u'上映时间')) m = r.search(info) if m: return m.group(1) except Exception as e: pass return year
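parse_video_item and parse_videos_info above walk the Sohu playlist API one page at a time and stop at the first page that returns no videos. A minimal sketch of that loop under the same API format; urllib2 and the hard page cap are illustrative stand-ins for the spider's HTTPDownload helper:

# -*- coding: utf-8 -*-
import json
import urllib2

ALBUM_API = 'http://pl.hd.sohu.com/videolist?playlistid=%s&pagenum=%s'

def fetch_album_videos(playlist_id, max_pages=50):
    """Collect episode entries by paging until the API returns an empty 'videos' list."""
    videos = []
    page = 1
    while page <= max_pages:
        data = urllib2.urlopen(ALBUM_API % (playlist_id, page), timeout=10).read()
        # the endpoint answers in GBK, hence the decode('gbk') in parse_videos_info
        page_json = json.loads(data.decode('gbk'))
        page_videos = page_json.get('videos') or []
        if not page_videos:
            break
        videos.extend(page_videos)
        page += 1
    return videos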
def __init__(self):
    self.__db_mgr = DbManager.instance()
class tudou_spider(Spider): name = "tudou" pipelines = ['CategoryPipeline', 'MysqlStorePipeline'] site_code = "tudou" site_id = "" #tudou allowed_domains = ["www.tudou.com"] pre_url = "http://www.tudou.com/s3portal/service/pianku/data.action?pageSize=90&app=mainsitepc&deviceType=1&tags=&tagType=3&firstTagId=" tail_url = "&areaCode=110000&initials=&hotSingerId=&sortDesc=pubTime&pageNo=" #used for guess_site site_name = Util.guess_site("http://www.tudou.com") mgr = DbManager.instance() os_id = mgr.get_os('web')["os_id"] site_id = str(mgr.get_site(site_code)["site_id"]) channel_map = {} channel_map = mgr.get_channel_map() max_update_page = get_project_settings().get('MAX_UPDATE_PAGE') id_map = {} httpdownload = HTTPDownload() cmd = None def __init__(self, json_data=None, *args, **kwargs): super(tudou_spider, self).__init__(*args, **kwargs) cat_urls = [] tasks = [] if json_data: data = json.loads(json_data) self.cmd = data["cmd"] if self.cmd == "assign": tasks = data["task"] elif self.cmd == "trig": stat = data['stat'] if 'stat' in data else None tasks = self.mgr.get_untrack_url(self.site_code, stat) ttask={} if "id" in data and "url" in data: ttask["id"] = data["id"] ttask["url"] = data["url"] ttask["sid"] = "" ttask["untrack_id"] = "" cat_urls.append(ttask) if tasks: for task in tasks: ttask={} ttask["url"] = task["url"] code = task["code"] ttask["id"] = self.channel_map[code] ttask["untrack_id"] = task["untrack_id"] ttask["sid"] = task["sid"] cat_urls.append(ttask) #cat_urls = data["cat_urls"] self._cat_urls = [] if cat_urls: self._cat_urls = cat_urls def start_requests(self): try: items = [] cat_urls = [] movie_id = self.mgr.get_channel('电影')["channel_id"] tv_id = self.mgr.get_channel('电视剧')["channel_id"] variety_id = self.mgr.get_channel('综艺')["channel_id"] cartoon_id = self.mgr.get_channel('动漫')["channel_id"] self.id_map = {str(movie_id):"5",str(tv_id):"3",str(variety_id):"6",str(cartoon_id):"4"} #不需要url字段,通过土豆网不同频道的id来拼出url if not self._cat_urls and not self.cmd: #cat_urls = [{'url':'','id':tv_id}] cat_urls = [{'url':'','id':movie_id}, {'url':'','id':tv_id}, {'url':'','id':variety_id}, {'url':'','id':cartoon_id}] for cat in cat_urls: url = "" type_id = "" if cat['id'] == movie_id: type_id = self.id_map[str(movie_id)] elif cat['id'] == tv_id: type_id = self.id_map[str(tv_id)] elif cat['id'] == variety_id: type_id = self.id_map[str(variety_id)] elif cat['id'] == cartoon_id: type_id = self.id_map[str(cartoon_id)] url = self.pre_url + type_id + self.tail_url page_num = int(self.get_page_num(url+ "10000"))/90 + 1 #page_num = 4 for i in range(page_num): surl = self.pre_url + type_id + self.tail_url + str(i+1) items.append(Request(url=surl, callback=self.parse, meta={'cat_id': cat['id'],'page':1})) else: for cat in self._cat_urls: channel_id = str(cat["id"]) items.append(Request(url=cat['url'], callback=self.parse_single_episode, meta={'cat_id': channel_id,'page':1,"untrack_id":cat["untrack_id"],"sid":cat["sid"]})) return items except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) def get_page_num(self,url): num = None try: info = self.httpdownload.get_data(url) jinfo = json.loads(info) num = jinfo["total"] except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return num def parse_single_episode(self,response): items = [] try: logging.log(logging.INFO, 'parse_single_episode: %s' % response.request.url) cat_id = response.request.meta['cat_id'] untrack_id = "" sid = "" if "untrack_id" in response.request.meta: untrack_id = 
response.request.meta['untrack_id'] if "sid" in response.request.meta: sid = response.request.meta['sid'] urls = response.xpath('//div[@class="breadcrumb"]/a[@class="breadcrumb_item"]/@href').extract() #carton is different if not urls: urls = response.xpath('//div[@class="mod_player_head cf"]/div[1]/div[1]/a[3]/@href').extract() if urls: turl = self.url_prefix + urls[0] items.append(Request(url=turl, callback=self.parse_episode_info, meta={'cat_id': cat_id,'poster_url':'','page':1,"untrack_id":untrack_id,"sid":sid})) else: poster_url = "" title = "" actor = "" info_url = response.xpath('//div[@class="summary_main"]/div[@class="fix"]/h1[@class="kw"]/a/@href').extract() if info_url: items.append(Request(url=info_url[0], callback=self.parse_episode_info, meta={'cat_id': cat_id,'poster_url':poster_url,'title':title,"actor":actor,"untrack_id":untrack_id,"sid":sid})) #items.append(Request(url=response.request.url, callback=self.parse_episode_play, meta={'cat_id': cat_id,'poster_url':'','page':1})) #response.request.meta['poster_url'] = '' #self.parse_episode_play(response) return items except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) def parse(self,response): try: logging.log(logging.INFO, 'parse: %s' % response.request.url) cat_id = response.request.meta['cat_id'] #poster_url = response.request.meta['poster_url'] items = [] play_url = "" jinfo = json.loads(response.body) for tmedia in jinfo["items"]: title = tmedia["title"] actor_list = [] for tactor in tmedia["actors"]: actor_list.append(tactor["name"]) actor = Util.join_list_safely(actor_list) #actor = "|".join([t.strip() for t in actor_list]) poster_url = tmedia["picUrl_200x300"] play_url = tmedia["playUrl"] if "updateInfo" in tmedia and tmedia["updateInfo"].find("预告") >= 0: continue else: items.append(Request(url=play_url, callback=self.parse_episode_play, meta={'cat_id': cat_id,'poster_url':poster_url,'title':title,'actor':actor})) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_episode_play(self,response): try: logging.log(logging.INFO, 'parse_episode_play: %s' % response.request.url) cat_id = response.request.meta['cat_id'] poster_url = response.request.meta['poster_url'] title = response.request.meta['title'] actor = response.request.meta['actor'] items = [] info_url = response.xpath('//div[@class="summary_main"]/div[@class="fix"]/h1[@class="kw"]/a/@href').extract() if info_url: items.append(Request(url=info_url[0], callback=self.parse_episode_info, meta={'cat_id': cat_id,'poster_url':poster_url,'title':title,"actor":actor})) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_episode_info(self,response): try: logging.log(logging.INFO, 'parse_episode_info: %s' % response.request.url) cat_id = response.request.meta['cat_id'] poster_url = response.request.meta['poster_url'] title = response.request.meta['title'] actor = response.request.meta['actor'] untrack_id = "" sid = "" if "untrack_id" in response.request.meta: untrack_id = response.request.meta['untrack_id'] if "sid" in response.request.meta: sid = response.request.meta['sid'] items = [] if not poster_url: poster_url_list = response.xpath('//div[@class="cover_img"]/div[@class="pack pack_album"]/div[@class="pic"]/img/@src').extract() if poster_url_list: poster_url = poster_url_list[0] if not title: title_list = response.xpath('//div[@class="cover_info"]/h2/strong/@title').extract() if title_list: title = title_list[0] if not actor: #actor_list = 
response.xpath('//div[@class="cover_keys"]/span/a/text()').extract() actor_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' % u' 主演:').extract() if actor_list: actor = Util.join_list_safely(actor_list) #actor = "|".join([t.strip() for t in actor_list]) #performer pers = actor type_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' % u'类型:\n').extract() district_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' % u'地区:').extract() release_date_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' % u'年代:').extract() types = None if type_list: types = Util.join_list_safely(type_list) #director director_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' % u'编导:').extract() if not director_list: director_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()' % u'导演:').extract() dirs = Util.join_list_safely(director_list) #dirs = "|".join([t.strip() for t in director_list]) #text text = response.xpath('//div[@class="cover_info"]/div[@class="desc"]/p/text()').extract() #sourceid sourceid = self.get_tudou_showid(response.request.url) videoitems = [] ep_item = MediaItem() if len(title) > 0: ep_item["title"] = title if len(pers) > 0: ep_item["actor"] = pers if len(dirs) > 0: ep_item["director"] = dirs if types: ep_item["type"] = types if district_list: ep_item["district"] = district_list[0].strip() if release_date_list: ep_item["release_date"] = Util.str2date(release_date_list[0]) #ep_item["info_id"] = Util.md5hash(tinfo) ep_item["cont_id"] = sourceid ep_item["site_id"] = self.site_id ep_item["url"] = response.request.url ep_item["channel_id"] = cat_id ep_item["poster_url"] = poster_url if len(text) > 0: ep_item["intro"] = text[0] mvitem = MediaVideoItem(); mvitem["media"] = ep_item; mvitem["video"] = videoitems lurl = "http://www.tudou.com/crp/getAlbumvoInfo.action?charset=utf-8&areaCode=110000&acode=" + str(sourceid) info = self.httpdownload.get_data(lurl) jinfo = json.loads(info) if "items" in jinfo: for sitem in jinfo["items"]: vitem = VideoItem() vitem["title"] = sitem["itemTitle"] vitem["vnum"] = sitem["episode"] vitem["os_id"] = self.os_id trailer = sitem['trailer'] if not sitem["itemPlayUrl"]: continue #预告片 if trailer: continue turl = Util.normalize_url(sitem["itemPlayUrl"],"tudou") vitem["url"] = turl vitem["os_id"] = self.os_id vitem["site_id"] = self.site_id vitem["ext_id"] = Util.md5hash(turl) vitem["cont_id"] = self.get_tudou_showid(turl) #if "ext_id" not in mvitem["media"]: # mvitem["media"]["ext_id"] = vitem["ext_id"] #vitem["media_ext_id"] = vitem["ext_id"] mvitem["video"].append(vitem) if len(mvitem["video"]) > 0: Util.set_ext_id(mvitem["media"],mvitem["video"]) mvitem["media"]["info_id"] = Util.md5hash(Util.summarize(mvitem["media"])) if untrack_id: mvitem["untrack_id"] = untrack_id if sid: mvitem["sid"] = sid if self.check_url(mvitem): items.append(mvitem) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def get_tudou_showid(self,url): id = "" try: #http://www.tudou.com/albumcover/ZPUPBy0CC6c.html r = re.compile(r'http://.+/.*/([^/].*).html') m = r.match(url) if m: return m.group(1) except Exception as e: pass return id def check_url(self,mvitem): res = True try: if 'video' in mvitem: for video in mvitem['video']: if 'url' in video: tres = self.is_same_site(video['url']) if not tres: res = False break except Exception as e: pass return res 
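The getAlbumvoInfo loop in parse_episode_info above keeps an episode only if it has a play URL and is not flagged as a trailer. A small sketch of that filter, assuming the JSON field names used above; any payload fed to it here would be hypothetical:

# -*- coding: utf-8 -*-
ALBUM_INFO_API = ("http://www.tudou.com/crp/getAlbumvoInfo.action"
                  "?charset=utf-8&areaCode=110000&acode=%s")

def usable_episodes(album_json):
    """Drop trailers and items without a play URL, keeping title/episode/url as above."""
    episodes = []
    for item in album_json.get("items", []):
        if not item.get("itemPlayUrl") or item.get("trailer"):
            continue
        episodes.append({
            "title": item.get("itemTitle"),
            "vnum": item.get("episode"),
            "url": item.get("itemPlayUrl"),
        })
    return episodes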
def is_same_site(self, url):
    res = True
    try:
        tsite = Util.guess_site(url)
        if tsite != self.site_name:
            res = False
    except Exception as e:
        res = False
    return res
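start_requests above sizes each Tudou channel crawl by asking the pianku API for its total and splitting it into 90-item pages. A sketch of that arithmetic and the page URLs it produces; the channel id and total below are made-up examples:

# -*- coding: utf-8 -*-
PRE_URL = ("http://www.tudou.com/s3portal/service/pianku/data.action"
           "?pageSize=90&app=mainsitepc&deviceType=1&tags=&tagType=3&firstTagId=")
TAIL_URL = "&areaCode=110000&initials=&hotSingerId=&sortDesc=pubTime&pageNo="

def list_page_urls(type_id, total):
    """One list URL per 90-item page, mirroring the total/90 + 1 computation above."""
    page_num = int(total) / 90 + 1        # Python 2 integer division, as in start_requests
    return [PRE_URL + type_id + TAIL_URL + str(i + 1) for i in range(page_num)]

# e.g. a hypothetical total of 200 items for channel id "5" gives 3 page URLs
urls = list_page_urls("5", 200)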
class wasu_spider(Spider): name = "wasu" pipelines = ['CategoryPipeline', 'MysqlStorePipeline'] site_code = "wasu" site_id = "" #wasu allowed_domains = ["www.wasu.cn", "all.wasu.cn"] url_prefix = 'http://www.wasu.cn' site_name = Util.guess_site(url_prefix) mgr = DbManager.instance() os_id = mgr.get_os('web')["os_id"] site_id = str(mgr.get_site(site_code)["site_id"]) channel_map = {} channel_map = mgr.get_channel_map() max_update_page = get_project_settings().get('MAX_UPDATE_PAGE') global_spider = True httpdownload = HTTPDownload() channel_info = {} test_page_url = None test_channel_id = None album_api = 'http://www.wasu.cn/Column/ajax_list?uid=%s&y=%s&mon=%s' def __init__(self, json_data=None, *args, **kwargs): super(wasu_spider, self).__init__(*args, **kwargs) cat_urls = [] tasks = None if json_data: data = json.loads(json_data) if "type" in data: spider_type = data["type"] if spider_type != "global": self.global_spider = False tasks = [] ttask = {} if "id" in data and "url" in data: ttask["id"] = data["id"] ttask["url"] = data["url"] ttask["sid"] = "" ttask["untrack_id"] = "" cat_urls.append(ttask) cmd = data["cmd"] if cmd == "assign": tasks = data["task"] elif cmd == "trig": stat = data['stat'] if 'stat' in data else None tasks = self.mgr.get_untrack_url(self.site_code, stat) elif cmd == 'carpet': tasks = self.mgr.get_video_url(self.site_code) elif cmd == "test" and 'id' in data and 'url' in data: self.test_page_url = data["url"] self.test_channel_id = data["id"] if tasks: for task in tasks: ttask = {} ttask["url"] = task["url"] code = task["code"] ttask["id"] = self.channel_map[code] ttask["untrack_id"] = task[ "untrack_id"] if 'untrack_id' in task else None ttask["sid"] = task["sid"] if 'sid' in task else None ttask['mid'] = task['mid'] if 'mid' in task else None cat_urls.append(ttask) self._cat_urls = [] if cat_urls: self._cat_urls = cat_urls def start_requests(self): try: items = [] self.movie_id = str(self.mgr.get_channel('电影')["channel_id"]) self.tv_id = str(self.mgr.get_channel('电视剧')["channel_id"]) self.variety_id = str(self.mgr.get_channel('综艺')["channel_id"]) self.cartoon_id = str(self.mgr.get_channel('动漫')["channel_id"]) self.channel_info = { self.movie_id: u"电影", self.tv_id: u"电视剧", self.variety_id: u"综艺", self.cartoon_id: u"动漫" } if self.test_page_url: turl = Util.normalize_url(self.test_page_url, "wasu") items.append( Request(url=self.test_page_url, callback=self.parse_page, meta={ 'cat_id': self.test_channel_id, 'page': 1 })) return items if not self._cat_urls: if self.global_spider: cat_urls = [{ 'url': 'http://all.wasu.cn/index/cid/1', 'id': self.movie_id }, { 'url': 'http://all.wasu.cn/index/cid/11', 'id': self.tv_id }, { 'url': 'http://all.wasu.cn/index/cid/37', 'id': self.variety_id }, { 'url': 'http://all.wasu.cn/index/cid/19', 'id': self.cartoon_id }] for cat in cat_urls: items.append( Request(url=cat['url'], callback=self.parse_type, meta={ 'cat_id': cat['id'], 'page': 1 })) else: for cat in self._cat_urls: turl = Util.normalize_url(cat['url'], "wasu") items.append( Request(url=turl, callback=self.parse_single_episode, meta={ 'cat_id': cat["id"], 'page': 1, "poster_url": "", "untrack_id": cat["untrack_id"], "sid": cat["sid"], "mid": cat["mid"] })) return items except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) def parse_type(self, response): items = [] try: #logging.log(logging.INFO, 'parse_type: %s' % response.request.url) cat_id = response.request.meta['cat_id'] subs = response.xpath( 
'//div[@class="ws_all_span"]/ul/li[1]/a/@href').extract() for sub in subs: items.append( Request(url=sub, callback=self.parse_tag, meta={ 'cat_id': cat_id, 'page': 1 })) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_tag(self, response): items = [] try: logging.log(logging.INFO, 'parse_tag: %s' % response.request.url) cat_id = response.request.meta['cat_id'] subs = response.xpath( '//div[@class="ws_all_span"]/ul/li[2]/a/@href').extract() for sub in subs: items.append( Request(url=sub, callback=self.parse_area, meta={ 'cat_id': cat_id, 'page': 1 })) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_area(self, response): items = [] try: logging.log(logging.INFO, 'parse_area: %s' % response.request.url) cat_id = response.request.meta['cat_id'] subs = response.xpath( '//div[@class="ws_all_span"]/ul/li[3]/a/@href').extract() for sub in subs: items.append( Request(url=sub, callback=self.parse_time, meta={ 'cat_id': cat_id, 'page': 1 })) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_time(self, response): items = [] try: logging.log(logging.INFO, 'parse_time: %s' % response.request.url) cat_id = response.request.meta['cat_id'] subs = response.xpath( '//div[@class="ws_all_span"]/ul/li[4]/a/@href').extract() for sub in subs: items.append( Request(url=sub, callback=self.parse_sort, meta={ 'cat_id': cat_id, 'page': 1 })) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_sort(self, response): items = [] # 默认最近更新 time_url = response.request.url try: logging.log(logging.INFO, 'parse_sort: %s' % response.request.url) cat_id = response.request.meta['cat_id'] subs = response.xpath( '//div[@class="pxfs"]/div[@class="l"]/ul/li/a/@href').extract( ) # 优先爬取最近更新 subs.insert(0, time_url) for sub in subs: items.append( Request(url=sub, callback=self.parse_page, meta={ 'cat_id': cat_id, 'page': 1 })) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_page(self, response): items = [] try: cat_id = response.request.meta['cat_id'] page = response.request.meta['page'] logging.log(logging.INFO, 'parse_page: %s,%s' % (response.request.url, page)) #if int(page) > int(self.max_update_page) and not self.global_spider: # return items = [] play_url = "" subs = response.xpath('//div[@class="ws_row mb25"]/div') #if not subs: # subs = response.xpath('./div/div[@class="ws_row mb25"]/div[@class=" col2 mb20"]/div[@class="hezhip]') for sub in subs: play_urls = sub.xpath( './div/div[@class="v mb5"]/div[@class="v_link"]/a/@href' ).extract() pic_urls = sub.xpath( './div/div[@class="v mb5"]/div[@class="v_img"]/img/@src' ).extract() if not play_urls: play_urls = sub.xpath( './div/div[@class="v mb5"]/div[@class="p_link"]/a/@href' ).extract() if not pic_urls: pic_urls = sub.xpath( './div/div[@class="v mb5"]/div[@class="p_img"]/img/@src' ).extract() pic_url = "" if pic_urls: pic_url = pic_urls[0] if play_urls: rplay_url = play_urls[0].strip() if '/Play/show' in rplay_url: #if int(cat_id) == int(self.movie_id): items.append( Request(url=rplay_url, callback=self.parse_single_episode, meta={ 'cat_id': cat_id, 'poster_url': pic_url, 'untrack_id': '', 'sid': '' })) else: items.append( Request(url=rplay_url, callback=self.parse_episode_info, meta={ 'cat_id': cat_id, 'poster_url': pic_url, 'untrack_id': '', 'sid': '' })) next_page = response.xpath( '//div[@class="item_page"]/a[text()="%s"]/@href' % 
u'下一页').extract() page_prefix = "http://all.wasu.cn" if next_page: snext_page = next_page[0].strip() if snext_page.find(page_prefix) < 0: snext_page = page_prefix + snext_page items.append( Request(url=snext_page, callback=self.parse_page, meta={ 'page': page + 1, 'cat_id': cat_id })) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_single_episode(self, response): items = [] try: logging.log(logging.INFO, 'parse_single_episode: %s' % response.request.url) cat_id = response.request.meta['cat_id'] untrack_id = response.request.meta['untrack_id'] sid = response.request.meta['sid'] mid = response.request.meta[ 'mid'] if 'mid' in response.request.meta else "" poster_url = response.request.meta['poster_url'] #解析媒体页信息 urls = response.xpath( '//div[@class="play_site mb10"]/div[1]/h3/a/@href').extract() if not urls: #通过标题不能进入媒体页,要通过分级目录 turls = response.xpath( '//div[@class="play_site mb10"]/div[1]/div[@class="play_seat"]/a/@href' ).extract() for turl in turls: tiurl = self.get_episode_url(turl) if tiurl: urls.append(tiurl) if urls: for iurl in urls: if not Util.guess_site(iurl): iurl = self.url_prefix + iurl surl = Util.normalize_url(iurl, "wasu") if surl and self.site_name == Util.guess_site(surl): items.append( Request(url=surl, callback=self.parse_episode_info, meta={ 'cat_id': cat_id, 'poster_url': poster_url, 'page': 1, "untrack_id": untrack_id, "sid": sid, "mid": mid })) else: #电影视频,没有媒体页,只有播放页 #动漫电影,没有媒体页,只有播放页 titems = self.parse_play_page(response) for item in titems: if mid: item['mid'] = mid items.append(item) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_episode_info(self, response): items = [] try: request_url = response.request.url logging.log(logging.INFO, 'parse_episode_info: %s' % request_url) cat_id = response.request.meta['cat_id'] poster_url = response.request.meta['poster_url'] untrack_id = "" sid = "" mid = "" if "untrack_id" in response.request.meta: untrack_id = response.request.meta['untrack_id'] if "sid" in response.request.meta: sid = response.request.meta['sid'] if "mid" in response.request.meta: mid = response.request.meta['mid'] #此处因考虑不想过多改变原来的程序结构,其实这些属性可以通过接口获得 #http://clientapi.wasu.cn/Phone/vodinfo/id/6786984 title_list = response.xpath( '//div[@class="cloudotm1"]/p[1]/a/text()').extract() if not title_list: title_list = response.xpath( '//div[@class="tele_txts"]/h4[1]/a/text()').extract() director_list = response.xpath( '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()' % u'导演').extract() if not director_list: director_list = response.xpath( '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()' % u'导演').extract() performer_list = response.xpath( '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()' % u'演员').extract() if not performer_list: performer_list = response.xpath( '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()' % u'演员').extract() area_list = response.xpath( '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()' % u'地区').extract() if not area_list: area_list = response.xpath( '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()' % u'地区').extract() tag_list = response.xpath( '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()' % u'标签').extract() if not tag_list: tag_list = response.xpath( '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()' % u'类型').extract() if not tag_list: tag_list = response.xpath( '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()' % u'标签').extract() 
if not tag_list: tag_list = response.xpath( '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()' % u'类型').extract() year_list = response.xpath( '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()' % u'年份').extract() if not year_list: year_list = response.xpath( '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()' % u'年份').extract() pers = Util.join_list_safely(performer_list) dirs = Util.join_list_safely(director_list) areas = Util.join_list_safely(area_list) tags = Util.join_list_safely(tag_list) #text text = response.xpath( '//div[@class="right_fl"]/p/span[@id="infoS"]/text()').extract( ) if text: text = response.xpath( '//div[@class="tele_b_otm"]/p/span[@id="infoS"]/text()' ).extract() play_url = "" mvitem = self.compose_mvitem(response, title_list, pers, dirs, response.request.url, cat_id, poster_url, text) if mid: mvitem['mid'] = mid if mvitem and 'video' in mvitem and 'url' in mvitem['video'][ 0] and mvitem['video'][0]['url']: mvitem['media']['type'] = tags mvitem['media']['district'] = areas if year_list: mvitem['media']['release_date'] = Util.str2date( year_list[0]) tlen = len(mvitem['video']) logging.log( logging.INFO, "++++url: %s video len: %d " % (response.request.url, tlen)) items.append(mvitem) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_play_page(self, response): items = [] try: cat_id = response.request.meta['cat_id'] poster_url = response.request.meta['poster_url'] untrack_id = "" sid = "" if "untrack_id" in response.request.meta: untrack_id = response.request.meta['untrack_id'] if "sid" in response.request.meta: sid = response.request.meta['sid'] title_list = response.xpath( '//div[@class="play_site mb10"]/div/h3/text()').extract() director_list = response.xpath( '//div[@class="play_information play_intro"]/div[@class="play_information_t"]/div[@class="r"]/div/span[text()="%s"]/../a/text()' % u'导演:').extract() performer_list = response.xpath( '//div[@class="play_information play_intro"]/div[@class="play_information_t"]/div[@class="r"]/div/div/span[text()="%s"]/../../div[@class="r"]/a/text()' % u'主演:').extract() tag_list = response.xpath( '//div[@class="play_information play_intro"]/div[@class="play_information_t"]/div[@class="r"]/div/span[text()="%s"]/../a/text()' % u'类型:').extract() area_list = response.xpath( '//div[@class="play_information play_intro"]/div[@class="play_information_t"]/div[@class="r"]/div/span[text()="%s"]/../a/text()' % u'地区:').extract() pers = Util.join_list_safely(performer_list) dirs = Util.join_list_safely(director_list) areas = Util.join_list_safely(area_list) tags = Util.join_list_safely(tag_list) text = response.xpath( '//div[@class="play_information play_intro"]/div[@class="play_information_b intro_down"]/div[@class="one"]/b/text()' ).extract() mvitem = self.compose_mvitem(response, title_list, pers, dirs, response.request.url, cat_id, poster_url, text) if mvitem: mvitem['media']['type'] = tags mvitem['media']['district'] = areas items.append(mvitem) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return items def parse_video_item(self, response, cat_id, url, title, playlistId): #logging.log(logging.INFO, 'parse_video_item , info url %s,paly_url: %s,cat id %s,title %s' % (response.request.url,url,cat_id,title)) videoitems = [] ep_item = MediaItem() item = MediaVideoItem() item["media"] = ep_item item["video"] = videoitems try: if int(cat_id) == int(self.variety_id): tvideoitems = self.parse_variety(response) if tvideoitems: for titem in 
tvideoitems: videoitems.append(titem) elif '/Play/show' not in url: #if int(cat_id) != int(self.movie_id): #ul_list = response.xpath('//div[@class="teleplay_gather tab_box"]/div[@class="list_tabs_cont"]/ul/li') ul_list = response.xpath( '//div[@class="teleplay_gather tab_box"]/div/ul/li') if ul_list: #http://www.wasu.cn/Tele/index/id/6539647 for li in ul_list: yugaopian = li.xpath('.//i[@class="yugao"]').extract() if yugaopian: continue url = li.xpath('./a/@href').extract() ttitle = li.xpath('./a/@title').extract() snum = li.xpath('./a/text()').extract() play_num = "" if snum: play_num = self.get_play_num(snum[0]) if int(cat_id) == int(self.variety_id): play_num1 = self.getvnum(self.url_prefix + url[0]) if play_num1: play_num = play_num1 if not ttitle: ttitle = [play_num] vitem = None if self.site_name == Util.guess_site(url[0]): vitem = self.compose_vitem([url[0]], [title[0].strip()], play_num) else: vitem = self.compose_vitem( [self.url_prefix + url[0]], [title[0].strip()], play_num) if 'url' in vitem: videoitems.append(vitem) if not ul_list: #http://www.wasu.cn/Tele/index/id/6786984 ul_list = response.xpath( '//div[@class="tab_box"]//div[ends-with(@class, "col2")]' ) for li in ul_list: yugaopian = li.xpath('.//i[@class="yugao"]').extract() if yugaopian: continue url = li.xpath( './div[@class="ws_des"]/p[1]/a/@href').extract() ttitle = li.xpath( './div[@class="ws_des"]/p[2]/span/text()').extract( ) snum = li.xpath( './div[@class="ws_des"]/p[1]/a/text()').extract() play_num = "" if snum: play_num = self.get_play_num(snum[0]) if int(cat_id) == int(self.variety_id): play_num1 = self.getvnum(self.url_prefix + url[0]) if play_num1: play_num = play_num1 if not ttitle: ttitle = [play_num] vitem = None if self.site_name == Util.guess_site(url[0]): vitem = self.compose_vitem([url[0]], [title[0].strip()], play_num) else: vitem = self.compose_vitem( [self.url_prefix + url[0]], [title[0].strip()], play_num) if 'url' in vitem: videoitems.append(vitem) else: #elif int(cat_id) == int(self.movie_id): #无媒体页的播放页 if url: vitem = self.compose_vitem([url], title, 1) if 'url' in vitem: videoitems.append(vitem) if videoitems: item["video"] = videoitems item["media"]["url"] = response.request.url Util.set_ext_id(item["media"], item["video"]) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return item def compose_mvitem(self, response, title_list, pers, dirs, play_url, cat_id, poster_url, text): try: cat_id = response.request.meta['cat_id'] poster_url = response.request.meta['poster_url'] untrack_id = "" sid = "" if "untrack_id" in response.request.meta: untrack_id = response.request.meta['untrack_id'] if "sid" in response.request.meta: sid = response.request.meta['sid'] videoitems = [] ep_item = MediaItem() if title_list: ep_item["title"] = title_list[0].strip() ep_item["actor"] = pers ep_item["director"] = dirs ep_item["site_id"] = self.site_id ep_item["channel_id"] = cat_id ep_item["poster_url"] = poster_url ep_item["url"] = Util.normalize_url(response.request.url, "wasu") if len(text) > 0: ep_item["intro"] = text[0].strip() mvitem = MediaVideoItem() mvitem["media"] = ep_item mid = self.getshowid(response.request.url) mvitem["media"]["cont_id"] = mid ttvitem = {} if title_list: ttvitem = self.parse_video_item(response, cat_id, play_url, title_list, None) if ttvitem: if 'video' in ttvitem and len(ttvitem['video']) > 0: mvitem['video'] = ttvitem['video'] mvitem["media"]["info_id"] = Util.md5hash( Util.summarize(mvitem["media"])) Util.set_ext_id(mvitem["media"], mvitem["video"]) if 
untrack_id and sid: mvitem["untrack_id"] = untrack_id mvitem["sid"] = sid res = self.check_url(mvitem) if not res: return None except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return mvitem def compose_vitem(self, url_list, title_list, vnum): vitem = VideoItem() try: if not url_list: return vitem if title_list: vitem["title"] = title_list[0].strip() turl = Util.normalize_url(url_list[0], "wasu") vitem["url"] = turl vitem["vnum"] = str(vnum) vitem["os_id"] = self.os_id vitem["ext_id"] = Util.md5hash(turl) vitem["site_id"] = self.site_id vitem["cont_id"] = self.getshowid(turl) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return vitem #解析娱乐频道的videos def parse_variety(self, response): videoitems = [] try: #year list year_list = response.xpath( '//div[@id="play_year"]/div[@id="divselect"]/div[@class="play_sel"]/p/a/text()' ).extract() uid = self.getuid(response.request.url) cid = None month_list = [ "12", "11", "10", "9", "8", "7", "6", "5", "4", "3", "2", "1" ] cid_url_list = response.xpath( '//div[@class="head1 mb10"]/a/@href').extract() for cid_url in cid_url_list: cid = self.getcid(cid_url) if cid: break #http://www.wasu.cn/Column/ajax_list?uid=252&y=2015&mon=7&cid=39 for year in year_list: for month in month_list: if uid and year and month: turl = 'http://www.wasu.cn/Column/ajax_list?uid=%s&y=%s&mon=%s&cid=%s' % ( uid, year, month, cid) info = self.httpdownload.get_data(turl) if not info: continue jinfo = json.loads(info) if "con" in jinfo and jinfo["con"]: tinfo = jinfo["con"].replace("\/", "/") tsel = Selector(text=tinfo).xpath( '//div[@id="itemContainer"]/div[@class="col2 play_love"]' ) for isel in tsel: title = isel.xpath( './div[@class="v"]/div[@class="v_link"]/a/@title' ).extract() url = isel.xpath( './div[@class="v"]/div[@class="v_link"]/a/@href' ).extract() vnum = isel.xpath( './div[@class="v"]/div[@class="v_meta"]/div[@class="meta_tr"]/text()' ).extract() tvnum = vnum[0].strip() svnum = tvnum.replace("-", "") titem = self.compose_vitem( [self.url_prefix + url[0]], title, svnum) if titem: videoitems.append(titem) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return videoitems def get_play_num(self, title): num = "" try: num_list = re.findall('([\d]+)', title) if num_list: num_size = len(num_list) num = num_list[num_size - 1] except Exception as e: pass return num def check_url(self, mvitem): res = True try: if 'video' in mvitem: for video in mvitem['video']: if 'url' in video: if Util.guess_site(video['url']) != self.site_name: res = False break except Exception as e: pass return res def is_same_site(self, url): res = True try: tsite = Util.guess_site(url) if tsite != self.site_name: res = False except Exception as e: pass res = False return res def getshowid(self, url): id = "" try: #http://www.wasu.cn/Play/show/id/5871821 #http://www.wasu.cn/Tele/index/id/6786984 #http://www.wasu.cn/Column/show/column/252 r = re.compile(r'http://.*/id/(\d+)[\?]?.*') m = r.match(url) if m: return m.group(1) else: r = re.compile(r'http://.*/show/.*/(\d+)[\?]?.*') m = r.match(url) if m: return m.group(1) except Exception as e: pass return id def getvnum(self, url): id = "" try: r = re.compile(r'http://.*-drama-(\d+).*') m = r.match(url) if m: return m.group(1) except Exception as e: pass return id def getuid(self, url): uid = "" try: #http://www.wasu.cn/Column/show/column/252 r = re.compile(r'.*/column/([\d]+)') m = r.match(url) if m: return m.group(1) except Exception as e: pass return uid def getcid(self, url): cid = 
"" try: #http://all.wasu.cn/index/cid/39 r = re.compile(r'.*/cid/([\d]+)*') m = r.match(url) if m: return m.group(1) except Exception as e: pass return cid def getareaid(self, url): cid = "" try: #http://all.wasu.cn/index/cid/39 r = re.compile(r'.*/area/([\d]+)*') m = r.match(url) if m: return m.group(1) except Exception as e: pass return cid def getyearid(self, url): cid = "" try: #http://all.wasu.cn/index/cid/39 r = re.compile(r'.*/year/([\d]+)*') m = r.match(url) if m: return m.group(1) except Exception as e: pass return cid def get_episode_url(self, url): rurl = "" try: #http://www.wasu.cn/Play/show/id/5871821 #http://www.wasu.cn/Column/show/column/252 r = re.compile(r'(.*/show/.*/\d+)') m = r.match(url) if m: return m.group(1) except Exception as e: pass return rurl
class iqiyi_spider(Spider): ''' iqiyi浏览流程: (1)从list进入 电视剧, 综艺, :list列表页 -> 媒体页 电影:list列表页 -> 播放页 动漫: (1)电影版:list列表页 -> 播放页 (2)普通版:list列表页 -> 媒体页 (2)从播放页进入 (1)播放页 -> 媒体页 (2)播放页 iqiyi爬虫流程: (1)list列表页进入 -> (判断URL类型,确定媒体页还是播放页)获取本页的信息,结束 (2)播放页进入 -> 获取播放页信息,判断是否存在媒体页 -> 媒体页 由于iqiyi在list表页的最多只能浏览到30页,所以采用如下策略爬取 (1)按一级一级类别细分成各个分支 (2)当细分的页小于30时,该分支停止细分 (2)当分支细分结束,页数仍大于30,则再利用不同的排序,再遍历,以尽量减少因无法访问30页之后所带来的内容缺失 ps.一直采用细分到叶子,感觉整个流程比较深,所以采用截枝的方式 ''' site_code = 'iqiyi' name = site_code mgr = DbManager.instance() max_number = 100000 #因为在类别细分生成树,本爬虫为了提高效率,采用将当前分支的页面数小于max_broswe_page,就截枝(不在细分)的方法 max_broswe_page = '30' list_prefix_url = 'http://list.iqiyi.com' #http://cache.video.qiyi.com/jp/sdlst/6/1300000156/ source_year_api = 'http://cache.video.qiyi.com/jp/sdlst/%s/%s/' #http://cache.video.qiyi.com/jp/sdvlst/6/1300001662/2014/?categoryId=6&sourceId=1300001662&tvYear=2014 source_media_api = 'http://cache.video.qiyi.com/jp/sdvlst/%s/%s/%s/?categoryId=%s&sourceId=%s&tvYear=%s' #http://cache.video.qiyi.com/jp/avlist/202321801/1/?albumId=202321801&pageNo=1 album_media_api = 'http://cache.video.qiyi.com/jp/avlist/%s/%s/?albumId=%s&pageNo=%s' vip_api = 'http://serv.vip.iqiyi.com/pay/movieBuy.action?aid=%s' api_success_code = u'A00000' max_mark_depth = 10 #通过json传递的参数 json_data = None #统计数据用 #count = 0 def __init__(self, json_data=None, *args, **kwargs): super(iqiyi_spider, self).__init__(*args, **kwargs) if json_data: self.json_data = json.loads(json_data) def start_requests(self): items = [] try: self.load_member_variable() if self.json_data: items = items + self.load_video_urls() else: url = self.list_prefix_url items.append( Request(url=url, callback=self.list_parse, meta={'level': 0})) except Exception, e: logging.log(logging.ERROR, traceback.format_exc()) finally: