def bilibili_download_playlist_by_url(url, **kwargs):
    """Download every part/episode reachable from a Bilibili URL.

    Dispatches on the resolved URL: live rooms go to the generic site
    handler, bangumi pages are expanded into per-episode URLs, and normal
    videos are expanded into their numbered parts.
    """
    url = url_locations([url])[0]
    # a bangumi here? possible?
    if 'live.bilibili' in url:
        site.download_by_url(url)
    elif 'bangumi.bilibili' in url:
        season_id = re.search(r'(\d+)', url).group(1)
        season_info = get_bangumi_info(season_id)
        episode_ids = collect_bangumi_epids(season_info)
        root = url.split('#')[0]
        for episode_id in episode_ids:
            # episode selection is carried in the URL fragment
            Bilibili().download_by_url(
                '#'.join([root, episode_id]), **kwargs)
    else:
        aid = re.search(r'av(\d+)', url).group(1)
        pages = json.loads(get_content(
            'https://www.bilibili.com/widget/getPageList?aid={}'.format(aid)
        ))
        for index, page in enumerate(pages, 1):
            part_url = (
                'https://www.bilibili.com/video/av{}/index_{}.html'.format(
                    aid, index))
            # A fresh Bilibili instance per part: self.streams is filled
            # only once, so a shared instance would keep serving the first
            # part's best-quality URL (see parse_bili_xml L109).
            Bilibili().download_by_url(
                part_url, subtitle=page['pagename'], **kwargs)
def prepare(self, **kwargs):
    """Normalise ``self.url``, fetch the page, extract the title and
    dispatch to the matching entry point (live / vc / normal-or-bangumi).

    Keyword args:
        subtitle: optional part name appended to the extracted title.
    """
    if socket.getdefaulttimeout() == 600:    # no timeout specified
        socket.setdefaulttimeout(2)          # fail fast, very speedy!
    # handle 'watchlater' URLs by rewriting them to the plain video URL
    if '/watchlater/' in self.url:
        aid = match1(self.url, r'av(\d+)')
        self.url = 'https://www.bilibili.com/video/av{}/'.format(aid)
    self.ua = FAKE_HEADERS['User-Agent']
    if 'bangumi' not in self.url:
        # bangumi redirect will miss fragment argument here
        # http://bangumi.bilibili.com/anime/21542/play#173286 ->
        # https://www.bilibili.com/bangumi/play/ss21542
        # It should be https://www.bilibili.com/bangumi/play/ss21542#173286
        self.url = url_locations([self.url])[0]
    frag = urllib.parse.urlparse(self.url).fragment
    # http://www.bilibili.com/video/av3141144/index_2.html#page=3
    if frag:
        # a "#page=N" fragment selects part N of a multi-part video
        page = match1(frag, r'page=(\d+)')
        if page:
            aid = match1(self.url, r'av(\d+)')
            self.url = (
                'https://www.bilibili.com/video/av{}/index_{}.html'.format(
                    aid, page))
    # handle bangumi url like this
    # http://bangumi.bilibili.com/anime/21542/play#173286
    # https://www.bilibili.com/bangumi/play/ss21542#173286
    # https://www.bilibili.com/bangumi/play/ep173286
    bangumi_ep_id = match1(self.url, r'/anime/\d+/play#(\d+)') or \
        match1(self.url, r'/bangumi/play/ss\d+#(\d+)')
    if bangumi_ep_id:
        # canonicalise to the episode-style bangumi URL
        self.url = 'https://www.bilibili.com/bangumi/play/ep{}'.format(
            bangumi_ep_id)

    self.referer = self.url
    self.page = get_content(self.url)
    self.parser = get_parser(self.page)
    if self.parser.h1:
        self.title = self.parser.h1.text.strip()
    else:
        # Some movie page got no h1 tag
        self.title = self.parser.find(
            'meta', property='og:title')['content']
    if 'subtitle' in kwargs:
        subtitle = kwargs['subtitle']
        self.title = '{} {}'.format(self.title, subtitle)

    if 'live.bilibili.com' in self.url:
        self.live_entry(**kwargs)
    elif 'vc.bilibili.com' in self.url:
        self.vc_entry(**kwargs)
    else:
        # bangumi, movie use this entry too
        self.entry(**kwargs)
def letv_download(url, info_only=False, **kwargs):
    """Resolve a LeTV/Le.com URL and download the video it points at.

    Cloud-hosted videos (yuntv.letv.com) are delegated to
    ``letvcloud_download``; everything else is resolved to a vid and
    handed to ``letv_download_by_vid``.
    """
    url = url_locations([url])[0]
    if re.match(r'http://yuntv.letv.com/', url):
        letvcloud_download(url, info_only=info_only, **kwargs)
        return

    page = get_content(url)
    if 'sports.le.com' in url:
        video_id = match1(url, r'video/(\d+)\.html')
        name = match1(page, r'<h2 class="title">([^<]+)</h2>')
    else:
        # vid may be in the URL (old/new domains) or embedded in the page
        video_id = (
            match1(url, r'http://www.letv.com/ptv/vplay/(\d+).html')
            or match1(url, r'http://www.le.com/ptv/vplay/(\d+).html')
            or match1(page, r'vid="(\d+)"')
        )
        name = match1(page, r'name="irTitle" content="(.*?)"')
    letv_download_by_vid(
        video_id, title=name, info_only=info_only, **kwargs)
def bilibili_download_playlist_by_url(url, **kwargs):
    """Download a whole Bilibili playlist identified by *url*.

    Handles live rooms, legacy bangumi pages, multi-part videos (av
    playlists), and TV-style bangumi (ep) playlists.
    """
    url = url_locations([url])[0]
    if 'live.bilibili' in url:
        # live rooms go through the generic site dispatcher
        site.download_by_url(url)
        return
    if 'bangumi.bilibili' in url:
        # legacy bangumi page: expand to one URL per episode
        season_id = match1(url, r'(\d+)')
        season_data = get_bangumi_info(season_id)
        for episode_url in collect_bangumi_urls(season_data):
            Bilibili().download_by_url(episode_url, **kwargs)
        return

    aid = match1(url, r'av(\d+)')
    if aid:
        # normal playlist
        # https://www.bilibili.com/video/av16907446/
        pages = json.loads(get_content(
            'https://www.bilibili.com/widget/getPageList?aid={}'.
            format(aid)))
        for index, part in enumerate(pages, 1):
            part_url = (
                'https://www.bilibili.com/video/av{}/index_{}.html'.format(
                    aid, index))
            # A fresh Bilibili instance per part: self.streams is filled
            # only once, so a shared instance would keep serving the first
            # part's best-quality URL (see parse_bili_xml L107).
            Bilibili().download_by_url(
                part_url, subtitle=part['pagename'], **kwargs)
    else:
        # tv playlist
        # https://www.bilibili.com/bangumi/play/ep196751/
        page = get_content(url)
        ep_data = json.loads(
            match1(page, r'window.__INITIAL_STATE__=(.+?);'))
        for ep in ep_data['epList']:
            ep_url = 'https://www.bilibili.com/bangumi/play/ep{}'.format(
                ep['ep_id'])
            Bilibili().download_by_url(ep_url, **kwargs)
def prepare(self, **kwargs):
    """Normalise ``self.url``, fetch the page, set ``self.title`` and
    dispatch to the matching entry point (movie / bangumi / live / vc /
    normal video).

    Keyword args:
        subtitle: optional part name appended to the extracted title.
    """
    if socket.getdefaulttimeout() == 600:    # no timeout specified
        socket.setdefaulttimeout(2)          # fail fast, very speedy!
    # handle "watchlater" URLs by rewriting them to the plain video URL
    if '/watchlater/' in self.url:
        aid = re.search(r'av(\d+)', self.url).group(1)
        self.url = 'http://www.bilibili.com/video/av{}/'.format(aid)
    self.ua = FAKE_HEADERS['User-Agent']
    # follow redirects to the canonical location
    self.url = url_locations([self.url])[0]
    frag = urllib.parse.urlparse(self.url).fragment
    # http://www.bilibili.com/video/av3141144/index_2.html#page=3
    if frag:
        # a "#page=N" fragment selects part N of a multi-part video
        hit = re.search(r'page=(\d+)', frag)
        if hit is not None:
            page = hit.group(1)
            aid = re.search(r'av(\d+)', self.url).group(1)
            self.url = (
                'http://www.bilibili.com/video/av{}/index_{}.html'.format(
                    aid, page
                )
            )
    self.referer = self.url
    self.page = get_content(self.url)
    self.parser = get_parser(self.page)
    self.title = self.parser.h1.text.strip()
    if 'subtitle' in kwargs:
        subtitle = kwargs['subtitle']
        self.title = '{} {}'.format(self.title, subtitle)

    # dispatch — order matters: movie URLs also match the plain
    # bangumi.bilibili.com test, so they must be checked first
    if 'bangumi.bilibili.com/movie' in self.url:
        self.movie_entry(**kwargs)
    elif 'bangumi.bilibili.com' in self.url:
        self.bangumi_entry(**kwargs)
    elif 'live.bilibili.com' in self.url:
        self.live_entry(**kwargs)
    elif 'vc.bilibili.com' in self.url:
        self.vc_entry(**kwargs)
    else:
        self.entry(**kwargs)
def qq_download(url, output_dir='.', merge=True, info_only=False, **kwargs):
    """Dispatch a Tencent/QQ URL to the right downloader.

    Special-cases egame, karaoke (kg.qq.com), qie live, and WeChat
    article embeds; everything else is resolved to a vid/title pair and
    passed to ``qq_download_by_vid``.

    Keyword args:
        caption: required for kg.qq.com URLs — NOTE(review): a missing
            'caption' key raises KeyError here; confirm callers always
            supply it.
    """
    if re.match(r'https?://egame.qq.com/live\?anchorid=(\d+)', url):
        from . import qq_egame
        qq_egame.qq_egame_download(
            url, output_dir=output_dir, merge=merge, info_only=info_only,
            **kwargs)
        return
    if 'kg.qq.com' in url or 'kg2.qq.com' in url:
        # karaoke share link; the share id is the trailing "?s=" value
        shareid = url.split('?s=')[-1]
        caption = kwargs['caption']
        kg_qq_download_by_shareid(
            shareid, output_dir=output_dir, info_only=info_only,
            caption=caption)
        return
    if 'live.qq.com' in url:
        if 'live.qq.com/video/v' in url:
            # recorded qie video
            qie_video_download(
                url, output_dir=output_dir, merge=merge,
                info_only=info_only, **kwargs)
        else:
            # live qie stream
            qieDownload(
                url, output_dir=output_dir, merge=merge,
                info_only=info_only)
        return
    if 'mp.weixin.qq.com/s?' in url:
        # WeChat article: download every embedded video it references
        content = get_content(url)
        vids = matchall(content, [r'\?vid=(\w+)'])
        for vid in vids:
            qq_download_by_vid(vid, vid, output_dir, merge, info_only)
        return

    # do redirect
    if 'v.qq.com/page' in url:
        # for URLs like this:
        # http://v.qq.com/page/k/9/7/k0194pwgw97.html
        new_url = url_locations([url])[0]
        if url == new_url:
            # redirect in js?
            content = get_content(url)
            url = match1(content, r'window\.location\.href="(.*?)"')
        else:
            url = new_url

    if 'kuaibao.qq.com' in url or re.match(
            r'http://daxue.qq.com/content/content/id/\d+', url):
        content = get_content(url)
        vid = match1(content, r'vid\s*=\s*"\s*([^"]+)"')
        title = match1(content, r'title">([^"]+)</p>')
        title = title.strip() if title else vid
    elif 'iframe/player.html' in url:
        vid = match1(url, r'\bvid=(\w+)')
        # for embedded URLs; don't know what the title is
        title = vid
    else:
        content = get_content(url)
        # vid = parse_qs(urlparse(url).query).get('vid')
        # for links specified vid like
        # http://v.qq.com/cover/p/ps6mnfqyrfo7es3.html?vid=q0181hpdvo5
        rurl = match1(
            content,
            r'<link.*?rel\s*=\s*"canonical".*?href\s*="(.+?)".*?>'
        )  # https://v.qq.com/x/cover/9hpjiv5fhiyn86u/t0522x58xma.html
        vid = ''
        if rurl:
            # last path segment of the canonical URL is usually the vid
            vid = rurl.split('/')[-1].split('.')[0]
            if vid == 'undefined':
                vid = ''
        # https://v.qq.com/x/cover/ps6mnfqyrfo7es3/q0181hpdvo5.html?
        vid = vid if vid else url.split('/')[-1].split('.')[0]
        # general fallback
        vid = vid if vid else match1(content, r'vid"*\s*:\s*"\s*([^"]+)"')
        if not vid:
            vid = match1(content, r'id"*\s*:\s*"(.+?)"')
        # title fallbacks, tried in order of specificity
        title = match1(
            content,
            r'<a.*?id\s*=\s*"{}".*?title\s*=\s*"(.+?)".*?>'.format(vid))
        title = match1(
            content, r'title">([^"]+)</p>') if not title else title
        title = match1(
            content, r'"title":"([^"]+)"') if not title else title
        title = vid if not title else title  # general fallback

    qq_download_by_vid(vid, title, output_dir, merge, info_only)