def media_parse(self, response):
    """Parse a kankan media page into a single MediaVideoItem.

    The media item is taken from ``response.request.meta['item']`` when
    present (otherwise a fresh one is created), filled from the page via
    ``kankan_extract`` and paired with the videos found on the page. Which
    part of the page is scanned for videos depends on
    ``mediaItem['channel_id']``.

    Args:
        response: page response; ``request.url``, ``request.meta`` and
            ``xpath()`` are used.

    Returns:
        list: ``[mediaVideoItem]`` when at least one video was collected,
        otherwise an empty list (play type in ``self.skip_types``, no
        videos found, or an exception that was logged).
    """
    items = []
    # Bound before the try so the except handler can always log it.
    request_url = None
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'media url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        meta = response.request.meta
        mediaVideoItem = meta['item'] if 'item' in meta else MediaVideoItem()
        mediaItem = (mediaVideoItem['media']
                     if 'media' in mediaVideoItem else MediaItem())

        # Drop media whose play type is listed in skip_types.
        script_sels = response.xpath('//head//script')
        if script_sels:
            play_types = script_sels.re(
                r"movieInfo\.play_type[ ]?=[ ]?'(.*)'")
            if play_types and play_types[0] in self.skip_types:
                return items

        # Some URLs redirect, so store the URL that was actually requested,
        # e.g. http://movie.kankan.com/movie/88365
        #   -> http://data.movie.kankan.com/movie/88365
        mediaItem['url'] = request_url
        kankan_extract.media_info_extract(response.xpath('//head'), mediaItem)
        for info_xpath in ('//div[@class="info_list"]',
                           '//ul[@class="detail_ul"]'):
            info_sels = response.xpath(info_xpath)
            if info_sels:
                kankan_extract.media_info_extract(info_sels, mediaItem)

        # Collect the videos (episodes) that belong to this media.
        videoItems = []
        channel_id = mediaItem['channel_id']

        def collect_episodes(list_sels):
            # One VideoItem per <li> below each episode-list container.
            # NOTE(review): entries without a URL are not emitted; the
            # flattened source is ambiguous about this nesting - confirm.
            for list_sel in list_sels:
                for video_sel in list_sel.xpath('.//li'):
                    videoItem = VideoItem()
                    videoItem['intro'] = channel_id
                    kankan_extract.video_info_extract(video_sel, videoItem)
                    if 'url' not in videoItem:
                        continue
                    videoItem['url'] = Util.get_absolute_url(
                        videoItem['url'], prefix_url)
                    self.set_video_info(videoItem, channel_id)
                    videoItems.append(videoItem)

        if u'综艺' == channel_id:
            # Variety shows.
            collect_episodes(response.xpath(
                r'//div[@id[re:test(., "fenji_[\d]+_[\d]+")]]'))
        elif u'电影' == channel_id:
            # Movies: take the single video from the "watch now" section.
            videoItem = VideoItem()
            Util.copy_media_to_video(mediaItem, videoItem)
            watch_sels = response.xpath('//div[@class="section clearfix s2"]')
            if watch_sels:
                urls = watch_sels.xpath(
                    './/a[starts-with(@class, "foc")]/@href').extract()
                thumb_urls = watch_sels.xpath(
                    './/a[@class="foc"]/img/@src').extract()
                # NOTE(review): as above, a movie without a play URL is not
                # emitted - confirm against the original indentation.
                if urls:
                    videoItem['url'] = Util.get_absolute_url(
                        urls[0], prefix_url)
                    if thumb_urls:
                        videoItem['thumb_url'] = thumb_urls[0]
                    self.set_video_info(videoItem, channel_id)
                    videoItems.append(videoItem)
        else:
            # TV series first; anime / TV series use a <ul> container.
            episode_sels = response.xpath(
                r'//div[@id[re:test(., "fenji_[\d]+_asc")]]')
            if not episode_sels:
                episode_sels = response.xpath(
                    r'//ul[@id[re:test(., "fenji_[\d]+_asc")]]')
            collect_episodes(episode_sels)

        if videoItems:
            # Set ext_id on the media and its videos.
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
        else:
            logging.log(logging.INFO, '%s: no videos' % request_url)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'media url: %s' % request_url)
    # Always hand the (possibly empty) result list back to the caller.
    return items
def album_parse(self, response):
    """Parse a hunantv album page into a single MediaVideoItem.

    Videos are gathered from one of three places, in this order:
    the album's tag pages (each fetched with ``Util.get_url_content``), the
    video list embedded in the album page itself, or - when neither exists -
    the single video URL passed in ``response.request.meta['url']``.
    When videos were found, the media page at ``mediaItem['url']`` is fetched
    and resolved to complete the media item.

    Args:
        response: album page response; ``request.url``, ``request.meta`` and
            ``xpath()`` are used.

    Returns:
        list: ``[mediaVideoItem]`` when at least one video was collected,
        otherwise an empty list (including when an exception was logged).
    """
    items = []
    # Bound before the try so the except handler can always log it.
    request_url = None
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'album url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        meta = response.request.meta
        video_url = meta['url'] if 'url' in meta else None
        mediaVideoItem = meta['item'] if 'item' in meta else MediaVideoItem()
        mediaItem = (mediaVideoItem['media']
                     if 'media' in mediaVideoItem else MediaItem())

        videoItems = []
        tag_sels = response.xpath(
            '//div[@class="page-videolist-tag-main"]//p[@class="pa1-nav"]')
        if tag_sels:
            # Tag pages exist, e.g. http://list.hunantv.com/album/56.html
            for tag in hunantv_extract.album_tag_extract(tag_sels):
                tag_url = Util.get_absolute_url(tag['url'], prefix_url)
                tag_text = Util.get_url_content(tag_url)
                videoItems = videoItems + self.album_tag_resolve(
                    text=tag_text, meta={'url': tag_url})
        else:
            # No tag pages, e.g. http://list.hunantv.com/album/2905.html
            video_sels = response.xpath(
                '//div[@class="page-videolist clearfix"]')
            if video_sels:
                videoItems = videoItems + self.album_tag_resolve(
                    text=video_sels.extract()[0], meta={'url': request_url})
            elif video_url:
                # No usable album listing, e.g.
                # http://www.hunantv.com/v/7/102831/f/1043648.html links to
                # an album URL that is invalid; fall back to the one video.
                videoItem = VideoItem()
                # Copy media fields once, BEFORE setting the explicit URL.
                # The original repeated this call after the assignment; it
                # is dropped so it cannot overwrite the URL set below.
                Util.copy_media_to_video(mediaItem, videoItem)
                videoItem['url'] = video_url
                # Extract the video id from the URL.
                id_match = re.search(
                    r'http://www\.hunantv\.com/v/[\d]+/[\d]+/[a-zA-Z]/([\d]+)\.html',
                    video_url)
                if id_match:
                    videoItem['cont_id'] = id_match.group(1)
                self.set_video_info(videoItem)
                videoItems.append(videoItem)

        if videoItems:
            # Set ext_id on the media and its videos.
            Util.set_ext_id(mediaItem, videoItems)
            # Visit the media page to fill in the media details.
            media_text = Util.get_url_content(mediaItem['url'])
            if media_text:
                mediaItem = self.media_resolve(
                    text=media_text,
                    meta={'item': mediaItem, 'url': mediaItem['url']})
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'album url: %s' % request_url)
    # Always hand the (possibly empty) result list back to the caller.
    return items
def video_parse(self, response):
    """Parse a pptv video page into a single MediaVideoItem.

    The media item is filled from the page via ``pptv_extract``. For movies
    (``channel_id == u'电影'``) the page itself is the only video. For every
    other channel the pid / cat_id / id values found in the page's scripts
    are used to page through ``self.album_api`` until a page yields no items.

    Args:
        response: page response; ``request.url``, ``request.meta`` and
            ``xpath()`` are used.

    Returns:
        list: ``[mediaVideoItem]`` when at least one video was collected,
        otherwise an empty list (movie without ``cont_id``, no videos found,
        or an exception that was logged).
    """
    items = []
    # Bound before the try so the except handler can always log it.
    request_url = None
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        meta = response.request.meta
        mediaVideoItem = meta['item'] if 'item' in meta else MediaVideoItem()
        mediaItem = (mediaVideoItem['media']
                     if 'media' in mediaVideoItem else MediaItem())

        # Pages served from the VIP prefix are flagged as paid.
        mediaItem['paid'] = '1' if prefix_url == self.vip_prefix_url else '0'
        mediaItem['url'] = request_url
        pptv_extract.media_info_extract(response, mediaItem)

        videoItems = []
        if u'电影' == mediaItem['channel_id']:
            # Movies: the page itself is the one video; it needs a cont_id.
            if 'cont_id' not in mediaItem or not mediaItem['cont_id']:
                return items
            videoItem = VideoItem()
            videoItem['url'] = mediaItem['url']
            videoItem['cont_id'] = mediaItem['cont_id']
            Util.copy_media_to_video(mediaItem, videoItem)
            self.set_video_info(videoItem)
            videoItems.append(videoItem)
        else:
            # pid & cid (and vid) are needed to fetch the episode list of
            # TV series, variety shows and anime.
            script_sels = response.xpath('//script[@type="text/javascript"]')
            if script_sels:
                pids = script_sels.re(r'"pid"[ ]?:[ ]?(\d+)')
                cids = script_sels.re(r'"cat_id"[ ]?:[ ]?(\d+)')
                vids = script_sels.re(r'"id"[ ]?:[ ]?(\d+)')
                if pids and cids and vids:
                    pid, cid, vid = pids[0], cids[0], vids[0]
                    # The media's cont_id is the pid.
                    mediaItem['cont_id'] = pid
                    page = 1
                    while True:
                        page_meta = {
                            'pid': pid,
                            'cid': cid,
                            'vid': vid,
                            'page': page,
                        }
                        api_url = self.album_api % (pid, cid, vid, page)
                        result = Util.get_url_content(api_url)
                        page_result = self.album_json_resolve(
                            result, mediaItem, page_meta)
                        if not page_result['items']:
                            # An empty page ends the listing. The auth album
                            # API is not used as a fallback because it does
                            # not provide video URLs for now.
                            break
                        videoItems = videoItems + page_result['items']
                        page = page + 1

        if videoItems:
            # Set ext_id on the media and its videos.
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'video url: %s' % request_url)
    # Always hand the (possibly empty) result list back to the caller.
    return items
def media_parse(self, response):
    """Parse a dy1905 media page into a single MediaVideoItem.

    The play address is extracted first; a page without one is discarded.
    The media item is then filled from the page via ``dy1905_extract`` and
    from the linked plot / cast pages, which are fetched with
    ``Util.get_url_content``.

    Args:
        response: page response; ``request.url``, ``request.meta`` and
            ``xpath()`` are used.

    Returns:
        list: ``[mediaVideoItem]`` on success, otherwise an empty list
        (no play address found, or an exception that was logged).
    """
    items = []
    # Bound before the try so the except handler can always log it.
    request_url = None
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'media url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        meta = response.request.meta
        mediaVideoItem = meta['item'] if 'item' in meta else MediaVideoItem()
        mediaItem = (mediaVideoItem['media']
                     if 'media' in mediaVideoItem else MediaItem())

        # Get the play address.
        videoItems = []
        videoItem = VideoItem()
        Util.copy_media_to_video(mediaItem, videoItem)
        # This selection feeds both the video and the media extraction.
        poster_sels = response.xpath('//div[@class="laMovPIC fl pr22"]')
        dy1905_extract.video_info_extract(poster_sels, videoItem)
        if 'url' not in videoItem:
            # No play address means the page only carries film metadata;
            # discard it.
            logging.log(logging.INFO, '该影片找不到播放地址: %s' % request_url)
            return items
        video_url = Util.get_absolute_url(videoItem['url'], prefix_url)
        videoItem['url'] = video_url
        self.set_video_info(videoItem)
        videoItems.append(videoItem)

        # Media attributes: flag as paid when the video is served from a
        # VIP prefix.
        video_prefix_url = Util.prefix_url_parse(video_url)
        mediaItem['paid'] = (
            '1' if video_prefix_url in self.vip_prefix_urls else '0')
        dy1905_extract.media_info_extract(poster_sels, mediaItem)
        main_sels = response.xpath('//div[@class="laMovMAIN fl"]')
        dy1905_extract.media_info_extract(main_sels, mediaItem)

        # Plot and cast/crew pages linked from the navigation bar.
        nav_sels = response.xpath(
            '//ul[@class="navSMb"]//li[@class="mdbpLeft2"]//div[@class="nowDefLine DefBOttom"]//a'
        )
        for nav_sel in nav_sels:
            labels = nav_sel.xpath('./text()').extract()
            hrefs = nav_sel.xpath('./@href').extract()
            if not (labels and hrefs):
                continue
            label = labels[0].strip()
            # Both prefixes must be unicode literals: on Python 2, testing a
            # unicode label against a non-ASCII byte string raises
            # UnicodeDecodeError.
            if label.startswith((u'剧情', u'演职人员')):
                more_url = Util.get_absolute_url(hrefs[0], prefix_url)
                more_text = Util.get_url_content(more_url)
                dy1905_extract.media_more_info_resolve(more_text, mediaItem)

        # Store the media URL as an absolute path.
        mediaItem['url'] = Util.get_absolute_url(mediaItem['url'], prefix_url)

        if videoItems:
            # Set ext_id on the media and its videos.
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'media url: %s' % request_url)
    # Always hand the (possibly empty) result list back to the caller.
    return items