def _get_streams(self):
    res = http.get(self.url)
    data = self._data_re.search(res.text)
    if data:
        self.logger.debug("Found _data_re")
        data = self.js_to_json_regex(data.group(1))
        res = http.post(self.api_url, data=data)
        m = self._hls_re.search(res.text)
        if m:
            self.logger.debug("Found _hls_re")
            hls_url = m.group("url")
            hls_url = update_scheme("http://", hls_url)
            self.logger.debug("URL={0}".format(hls_url))
            streams = HLSStream.parse_variant_playlist(self.session, hls_url)
            if not streams:
                return {"live": HLSStream(self.session, hls_url)}
            else:
                return streams

    iframe = self._iframe_re.search(res.text)
    if iframe:
        self.logger.debug("Found _iframe_re")
        iframe_url = iframe.group("url")
        iframe_url = update_scheme("http://", iframe_url)
        self.logger.debug("URL={0}".format(iframe_url))
        return self.session.streams(iframe_url)
def test_update_scheme(self):
    self.assertEqual(
        "https://example.com/foo",  # becomes https
        update_scheme("https://other.com/bar", "//example.com/foo"))
    self.assertEqual(
        "http://example.com/foo",  # becomes http
        update_scheme("http://other.com/bar", "//example.com/foo"))
    self.assertEqual(
        "http://example.com/foo",  # remains unchanged
        update_scheme("https://other.com/bar", "http://example.com/foo"))
    self.assertEqual(
        "https://example.com/foo",  # becomes https
        update_scheme("https://other.com/bar", "example.com/foo"))
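# A minimal sketch of update_scheme, assuming only the behaviour pinned down
# by the test above (the shipped implementation lives in the project's utils
# module and may differ in detail): the scheme of `current` is applied to
# `target` when `target` is scheme-relative or has no scheme at all, and an
# explicit scheme on `target` is left untouched.
try:
    from urllib.parse import urlparse  # Python 3
except ImportError:
    from urlparse import urlparse  # Python 2

def update_scheme_sketch(current, target):
    target_parsed = urlparse(target)
    if target_parsed.scheme:
        # target already carries a scheme: keep it as-is
        return target
    if target_parsed.netloc:
        # scheme-relative URL, e.g. "//example.com/foo"
        return "{0}:{1}".format(urlparse(current).scheme, target)
    # bare host/path, e.g. "example.com/foo"
    return "{0}://{1}".format(urlparse(current).scheme, target)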
def _get_streams(self):
    url, params = parse_url_params(self.url)
    urlnoproto = self._url_re.match(url).group(1)
    urlnoproto = update_scheme("http://", urlnoproto)

    self.logger.debug("URL={0}; params={1}", urlnoproto, params)
    return {"live": AkamaiHDStream(self.session, urlnoproto, **params)}
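# For context, a hedged sketch of the contract assumed for parse_url_params
# here and in the HDS/HLS variants below (the real helper lives in the utils
# module): trailing whitespace-separated key=value pairs are split off the
# pluggable URL and handed to the stream constructor as keyword arguments:
#
#   parse_url_params("akamaihd://example.com/stream live=1")
#   # -> ("akamaihd://example.com/stream", {"live": 1})
#
# (whether values are literal-evaluated or kept as strings is an assumption)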
def _get_live_streams(self):
    # Get channel id
    match = self._url_re.match(self.url)
    channel = match.group('channel')

    # Retrieve live player URL
    res = http.get(self.PLAYER_URL)
    match = self._live_player_re.search(res.text)
    if match is None:
        return []
    live_player_url = update_scheme(self.url, match.group('live_player_url'))

    # Extract streams from the live player page
    res = http.get(live_player_url)
    stream_datas = re.findall(
        r'{0}(?:_MINI)?:({{.+?}}]}}]}})'.format(self.CHANNEL_MAP[channel]),
        res.text)
    streams = []
    for s in stream_datas:
        for u in self._live_streams_schema.validate(s):
            if u not in streams:
                streams.append(u)

    return streams
def _get_streams(self): """ Find the streams for web.tv :return: """ headers = {} res = http.get(self.url, headers=headers) headers["Referer"] = self.url sources = self._sources_re.findall(res.text) if len(sources): sdata = parse_json(sources[0], schema=self._sources_schema) for source in sdata: self.logger.debug("Found stream of type: {}", source[u'type']) if source[u'type'] == u"application/vnd.apple.mpegurl": url = update_scheme(self.url, source[u"src"]) try: # try to parse the stream as a variant playlist variant = HLSStream.parse_variant_playlist( self.session, url, headers=headers) if variant: for q, s in variant.items(): yield q, s else: # and if that fails, try it as a plain HLS stream yield 'live', HLSStream(self.session, url, headers=headers) except IOError: self.logger.warning( "Could not open the stream, perhaps the channel is offline" )
def _get_streams(self): url, params = parse_url_params(self.url) urlnoproto = self._url_re.match(url).group(2) urlnoproto = update_scheme("http://", urlnoproto) return HDSStream.parse_manifest(self.session, urlnoproto, **params)
def _get_streams(self): """Tries to find streams. Returns: Playable video from self._resolve_res or New self.url for livecli Raises: NoPluginError: if no video was found. """ self.url = self.url.replace("resolve://", "") self._cache_self_url() self.url = update_scheme("http://", self.url) """ GET website content """ o_res = self._res_text(self.url) """ HLS or HDS stream """ x = self._resolve_res(o_res) if x: return x """ iframe url """ x = self._iframe_src(o_res) if not x: """ script window.location.href """ x = self._window_location(o_res) if x: return self.session.streams(self.url) raise NoPluginError
def _get_streams(self):
    data = http.get(self.url, schema=self.config_schema)

    for info in data["files"].values():
        stream_url = update_scheme(self.url, info["url"])
        # pick the smaller of the two dimensions, for landscape v. portrait videos
        res = min(info["width"], info["height"])
        yield "{0}p".format(res), HTTPStream(self.session, stream_url)
def _get_streams(self):
    match = _url_re.match(self.url)
    channel = match.group("channel")
    http.headers.update({"User-Agent": useragents.IPAD})
    # Some problem with SSL on huya.com now, do not use https
    hls_url = http.get(HUYA_URL % channel, schema=_hls_schema)
    yield "live", HLSStream(self.session, update_scheme("http://", hls_url))
def _get_streams(self):
    res = http.get(self.url)
    m = self.embed_url_re.search(res.text)
    platform_url = m and m.group("url")

    if platform_url:
        url = update_scheme(self.url, platform_url)
        # hand off to ThePlatform plugin
        p = ThePlatform(url)
        p.bind(self.session, "plugin.nbcsports")
        return p.streams()
def get_iframe_url(self):
    self.logger.debug('search for an iframe')
    res = http.get(self.url)
    m = self._iframe_re.search(res.text)
    if not m:
        raise PluginError('No iframe found.')

    iframe_url = m.group('url')
    iframe_url = update_scheme('http://', iframe_url)
    self.logger.debug('IFRAME URL={0}'.format(iframe_url))
    return iframe_url
def _get_streams(self): url, params = parse_url_params(self.url) urlnoproto = self._url_re.match(url).group(2) urlnoproto = update_scheme("http://", urlnoproto) self.logger.debug("URL={0}; params={1}", urlnoproto, params) streams = HLSStream.parse_variant_playlist(self.session, urlnoproto, **params) if not streams: return {"live": HLSStream(self.session, urlnoproto, **params)} else: return streams
def _get_streams(self):
    headers = {'User-Agent': useragents.IPAD}
    channel = self._url_re.match(self.url).group('channel')
    res = http.get('https://m.huya.com/{0}'.format(channel), headers=headers)
    m = self._hls_re.search(res.text)
    if not m:
        self.logger.debug('No m3u8 url found.')
        return

    hls_url = update_scheme('https://', m.group('url'))
    self.logger.debug('URL={0}'.format(hls_url))
    return {'live': HLSStream(self.session, hls_url, headers=headers)}
def _get_streams(self):
    http.headers = {"User-Agent": useragents.CHROME}
    res = http.get(self.url)
    iframe_url = self.find_iframe(res)

    if iframe_url:
        self.logger.debug("Found iframe: {0}", iframe_url)
        res = http.get(iframe_url, headers={"Referer": self.url})
        stream_url = update_scheme(self.url, self.stream_schema.validate(res.text))
        return HLSStream.parse_variant_playlist(self.session,
                                                stream_url,
                                                headers={"User-Agent": useragents.CHROME})
def _get_streams(self):
    res = http.get(self.url)
    mobile_url_m = self.mobile_url_re.search(res.text)

    mobile_url = mobile_url_m and update_scheme(self.url, mobile_url_m.group("url"))
    token = mobile_url_m and mobile_url_m.group("token")
    if not token:
        # if no token is in the url, try to find it elsewhere in the page
        token_m = self.token_re.search(res.text)
        token = token_m and token_m.group("token")

    return HLSStream.parse_variant_playlist(self.session,
                                            mobile_url + token,
                                            headers={"Referer": self.url})
def merge_path_list(self, static, user):
    """merge the static list with a user list

    Args:
        static (list): static list from this plugin
        user (list): list from a user command

    Returns:
        A new valid list
    """
    for _path_url in user:
        if not _path_url.startswith(("http", "//")):
            _path_url = update_scheme("http://", _path_url)
        _parsed_path_url = urlparse(_path_url)
        if _parsed_path_url.netloc and _parsed_path_url.path:
            static += [(_parsed_path_url.netloc, _parsed_path_url.path)]
    return static
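# A hypothetical call tracing the code above: bare host/path entries get an
# "http://" scheme prefixed before being split into (netloc, path) tuples.
#
#   merge_path_list([("facebook.com", "/plugins")],
#                   ["example.com/ads", "//cdn.example.com/banner"])
#   # -> [("facebook.com", "/plugins"),
#   #     ("example.com", "/ads"),
#   #     ("cdn.example.com", "/banner")]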
def _get_streams(self):
    http.headers.update({'User-Agent': useragents.CHROME,
                         'Referer': 'http://www.abweb.com/BIS-TV-Online/bistvo-tele-universal.aspx'})

    login_username = self.get_option('username')
    login_password = self.get_option('password')

    if self.options.get('purge_credentials'):
        self._session_attributes.set('ASP.NET_SessionId', None, expires=0)
        self._session_attributes.set('.abportail1', None, expires=0)
        self._authed = False
        self.logger.info('All credentials were successfully removed.')

    if not self._authed and not (login_username and login_password):
        self.logger.error('A login for ABweb is required, '
                          'use --abweb-username USERNAME --abweb-password PASSWORD')
        return

    if self._authed:
        if self._expires < time.time():
            # login after 24h
            self.logger.debug('get new cached cookies')
            self.set_expires_time_cache()
            self._authed = False
        else:
            self.logger.info('Attempting to authenticate using cached cookies')
            http.cookies.set('ASP.NET_SessionId', self._session_attributes.get('ASP.NET_SessionId'))
            http.cookies.set('.abportail1', self._session_attributes.get('.abportail1'))

    if not self._authed and not self._login(login_username, login_password):
        return

    iframe_url = self.get_iframe_url()
    http.headers.update({'Referer': iframe_url})

    hls_url = self.get_hls_url(iframe_url)
    hls_url = update_scheme(self.url, hls_url)

    self.logger.debug('URL={0}'.format(hls_url))
    variant = HLSStream.parse_variant_playlist(self.session, hls_url)
    if variant:
        for q, s in variant.items():
            yield q, s
    else:
        yield 'live', HLSStream(self.session, hls_url)
def resolve_url(self, url, follow_redirect=True):
    """Attempts to find a plugin that can use this URL.

    The default protocol (http) will be prefixed to the URL if
    not specified.

    Raises :exc:`NoPluginError` on failure.

    :param url: a URL to match against loaded plugins
    :param follow_redirect: follow redirects
    """
    url = update_scheme("http://", url)

    available_plugins = []
    for name, plugin in self.plugins.items():
        if plugin.can_handle_url(url):
            available_plugins.append(plugin)

    available_plugins.sort(key=lambda x: x.priority(url), reverse=True)
    if available_plugins:
        return available_plugins[0](url)

    if follow_redirect:
        # Attempt to handle a redirect URL
        try:
            res = self.http.head(url, allow_redirects=True, acceptable_status=[501])

            # Fall back to GET request if server doesn't handle HEAD.
            if res.status_code == 501:
                res = self.http.get(url, stream=True)

            if res.url != url:
                return self.resolve_url(res.url, follow_redirect=follow_redirect)
        except PluginError:
            pass

    raise NoPluginError
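# A usage sketch under assumptions: the session class name and the top-level
# exception import are taken from the livecli package layout, not from this
# file. resolve_url returns a bound plugin instance, so streams come from a
# second call.
from livecli import Livecli, NoPluginError

session = Livecli()
try:
    # "http://" is prefixed automatically, then plugins are matched by priority
    plugin = session.resolve_url("example.com/live")
    streams = plugin.streams()
except NoPluginError:
    streams = {}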
def _get_streams(self):
    res = http.get(self.url)
    m = self.cam_name_re.search(res.text)
    cam_name = m and m.group("name")

    json_base = self.cam_data_schema.validate(res.text)
    cam_data = json_base["cam"][cam_name]

    self.logger.debug("Found cam for {0} - {1}", cam_data["group"], cam_data["title"])

    is_live = (cam_data["liveon"] == "true" and cam_data["defaulttab"] == "live")

    # HLS data
    hls_domain = cam_data["html5_streamingdomain"]
    hls_playpath = cam_data["html5_streampath"]

    # RTMP data
    rtmp_playpath = ""
    if is_live:
        n = "live"
        rtmp_domain = cam_data["streamingdomain"]
        rtmp_path = cam_data["livestreamingpath"]
        rtmp_live = cam_data["liveon"]

        if rtmp_path:
            match = self.playpath_re.search(rtmp_path)
            rtmp_playpath = match.group("file")
            rtmp_url = rtmp_domain + match.group("folder")
    else:
        n = "vod"
        rtmp_domain = cam_data["archivedomain"]
        rtmp_path = cam_data["archivepath"]
        rtmp_live = cam_data["archiveon"]

        if rtmp_path:
            rtmp_playpath = rtmp_path
            rtmp_url = rtmp_domain

    # RTMP stream
    if rtmp_playpath:
        self.logger.debug("RTMP URL: {0}{1}", rtmp_url, rtmp_playpath)

        params = {
            "rtmp": rtmp_url,
            "playpath": rtmp_playpath,
            "pageUrl": self.url,
            "swfUrl": self.swf_url,
            "live": rtmp_live
        }

        yield n, RTMPStream(self.session, params)

    # HLS stream
    if hls_playpath and is_live:
        hls_url = hls_domain + hls_playpath
        hls_url = update_scheme(self.url, hls_url)

        self.logger.debug("HLS URL: {0}", hls_url)

        for s in HLSStream.parse_variant_playlist(self.session, hls_url).items():
            yield s

    if not (rtmp_playpath or hls_playpath):
        self.logger.error("This cam stream appears to be in offline or "
                          "snapshot mode and no live stream can be played.")
        return
class IDF1(Plugin):
    DACAST_API_URL = 'https://json.dacast.com/b/{}/{}/{}'
    DACAST_TOKEN_URL = 'https://services.dacast.com/token/i/b/{}/{}/{}'

    _url_re = re.compile(r'http://www\.idf1\.fr/(videos/[^/]+/[^/]+\.html|live\b)')
    _video_id_re = re.compile(
        r"dacast\('(?P<broadcaster_id>\d+)_(?P<video_type>[a-z]+)_(?P<video_id>\d+)', 'replay_content', data\);")
    _video_id_alt_re = re.compile(
        r'<script src="//player.dacast.com/js/player.js" id="(?P<broadcaster_id>\d+)_(?P<video_type>[cf])_(?P<video_id>\d+)"')
    _player_url = 'http://ssl.p.jwpcdn.com/player/v/7.12.6/jwplayer.flash.swf'

    _api_schema = validate.Schema(
        validate.transform(parse_json),
        {
            validate.optional('html5'): validate.all(
                [{'src': validate.url()}],
            ),
            'hls': validate.url(),
            'hds': validate.url()
        },
        validate.transform(
            lambda x: [update_scheme(IDF1.DACAST_API_URL, x['hls']), x['hds']]
                      + [y['src'] for y in x.get('html5', [])])
    )
    _token_schema = validate.Schema(
        validate.transform(parse_json),
        {'token': validate.text},
        validate.get('token')
    )
    _user_agent = useragents.IE_11

    @classmethod
    def can_handle_url(cls, url):
        return IDF1._url_re.match(url)

    def _get_streams(self):
        res = http.get(self.url)
        match = self._video_id_re.search(res.text) or self._video_id_alt_re.search(res.text)
        if match is None:
            return

        broadcaster_id = match.group('broadcaster_id')
        video_type = match.group('video_type')
        video_id = match.group('video_id')

        videos = http.get(self.DACAST_API_URL.format(broadcaster_id, video_type, video_id),
                          schema=self._api_schema)
        token = http.get(self.DACAST_TOKEN_URL.format(broadcaster_id, video_type, video_id),
                         schema=self._token_schema)
        parsed = []

        for video_url in videos:
            video_url += token

            # Ignore duplicate video URLs
            if video_url in parsed:
                continue
            parsed.append(video_url)

            # Ignore HDS streams (broken)
            if '.m3u8' in video_url:
                for s in HLSStream.parse_variant_playlist(self.session, video_url).items():
                    yield s
def _get_streams(self):
    # Retrieve geolocation data
    res = http.get(self.GEO_URL)
    geo = http.json(res, schema=self._geo_schema)
    country_code = geo['reponse']['geo_info']['country_code']

    # Retrieve URL page and search for video ID
    res = http.get(self.url)
    if 'france.tv' in self.url:
        match = self._pluzz_video_id_re.search(res.text)
    elif 'ludo.fr' in self.url or 'zouzous.fr' in self.url:
        match = self._jeunesse_video_id_re.search(res.text)
    elif 'france3-regions.francetvinfo.fr' in self.url:
        match = self._f3_regions_video_id_re.search(res.text)
    elif 'sport.francetvinfo.fr' in self.url:
        match = self._sport_video_id_re.search(res.text)
    if match is None:
        return
    video_id = match.group('video_id')

    # Retrieve SWF player URL
    swf_url = None
    res = http.get(self.PLAYER_GENERATOR_URL)
    player_url = update_scheme(self.url,
                               http.json(res, schema=self._player_schema)['result'])
    res = http.get(player_url)
    match = self._swf_re.search(res.text)
    if match is not None:
        swf_url = update_scheme(self.url, match.group(0))

    res = http.get(self.API_URL.format(video_id))
    videos = http.json(res, schema=self._api_schema)
    now = time.time()

    offline = False
    geolocked = False
    drm = False
    expired = False

    streams = []
    for video in videos['videos']:
        video_url = video['url']

        # Check whether video format is available
        if video['statut'] != 'ONLINE':
            offline = True
            continue

        # Check whether video format is geo-locked
        if video['geoblocage'] is not None and country_code not in video['geoblocage']:
            geolocked = True
            continue

        # Check whether video is DRM-protected
        if video['drm']:
            drm = True
            continue

        # Check whether video format is expired
        available = False
        for interval in video['plages_ouverture']:
            available = (interval['debut'] or 0) <= now <= (interval['fin'] or sys.maxsize)
            if available:
                break
        if not available:
            expired = True
            continue

        # TODO: add DASH streams once supported
        if '.mpd' in video_url:
            continue

        if '.f4m' in video_url or 'france.tv' in self.url:
            res = http.get(self.TOKEN_URL.format(video_url))
            video_url = res.text

        if '.f4m' in video_url and swf_url is not None:
            for bitrate, stream in HDSStream.parse_manifest(self.session,
                                                            video_url,
                                                            is_akamai=True,
                                                            pvswf=swf_url).items():
                # HDS videos with data in their manifest fragment token
                # don't seem to be supported by HDSStream. Ignore such
                # streams (but HDS streams having only the hdntl parameter
                # in their manifest token will be provided).
                pvtoken = stream.request_params['params'].get('pvtoken', '')
                match = self._hds_pv_data_re.search(pvtoken)
                if match is None:
                    streams.append((bitrate, stream))
        elif '.m3u8' in video_url:
            for stream in HLSStream.parse_variant_playlist(self.session, video_url).items():
                streams.append(stream)
        # HBB TV streams are not provided anymore by France Televisions
        elif '.mp4' in video_url and '/hbbtv/' not in video_url:
            match = self._mp4_bitrate_re.match(video_url)
            if match is not None:
                bitrate = match.group('bitrate')
            else:
                # Fallback bitrate (all France Televisions MP4 videos
                # seem to have this bitrate)
                bitrate = '1500k'
            streams.append((bitrate, HTTPStream(self.session, video_url)))

    if self.get_option("mux_subtitles") and videos['subtitles'] != []:
        substreams = {}
        for subtitle in videos['subtitles']:
            # TTML subtitles are available but not supported by FFmpeg
            if subtitle['format'] == 'ttml':
                continue
            substreams[subtitle['type']] = HTTPStream(self.session, subtitle['url'])

        for quality, stream in streams:
            yield quality, MuxedStream(self.session, stream, subtitles=substreams)
    else:
        for stream in streams:
            yield stream

    if offline:
        self.logger.error('Failed to access stream, may be due to offline content')
    if geolocked:
        self.logger.error('Failed to access stream, may be due to geo-restricted content')
    if drm:
        self.logger.error('Failed to access stream, may be due to DRM-protected content')
    if expired:
        self.logger.error('Failed to access stream, may be due to expired content')
def _get_streams(self): """Try to find streams on every website. Returns: Playable video or New session url Raises: NoPluginError: if no video was found. """ new_session_url = False self.url = update_scheme("http://", self.url) self.logger.debug("resolve.py - {0}".format(self.url)) # GET website content o_res = self._res_text(self.url) # rtmp search, will only print the url. m_rtmp = _rtmp_re.search(o_res) if m_rtmp: self.logger.info("Found RTMP: {0}".format(m_rtmp.group("url"))) # Playlist URL playlist_all = _playlist_re.findall(o_res) if playlist_all: # m_base is used for .f4m files that doesn't have a base_url m_base = self._stream_base_re.search(o_res) if m_base: stream_base = m_base.group("base") else: stream_base = "" playlist_list = self._make_url_list(playlist_all, self.url, url_type="playlist", stream_base=stream_base) if playlist_list: self.logger.debug("Found URL: {0}".format(", ".join(playlist_list))) return self._resolve_playlist(playlist_list) # iFrame URL iframe_list = [] for _iframe_list in (_iframe_re.findall(o_res), self._iframe_unescape(o_res)): if not _iframe_list: continue iframe_list += _iframe_list if iframe_list: # repair and filter iframe url list new_iframe_list = self._make_url_list(iframe_list, self.url, url_type="iframe") if new_iframe_list: self.logger.info("Found iframes: {0}".format(", ".join(new_iframe_list))) new_session_url = new_iframe_list[0] if not new_session_url: # search for window.location.href new_session_url = self._window_location(o_res) if new_session_url: return self.session.streams(new_session_url) raise NoPluginError
def set_option(self, key, value):
    """Sets general options used by plugins and streams originating
    from this session object.

    :param key: key of the option
    :param value: value to set the option to

    **Available options**:

    ======================== =========================================
    hds-live-edge            (float) Specify the time live HDS
                             streams will start from the edge of
                             stream, default: ``10.0``
    hds-segment-attempts     (int) How many attempts should be done
                             to download each HDS segment,
                             default: ``3``
    hds-segment-threads      (int) The size of the thread pool used
                             to download segments, default: ``1``
    hds-segment-timeout      (float) HDS segment connect and read
                             timeout, default: ``10.0``
    hds-timeout              (float) Timeout for reading data from
                             HDS streams, default: ``60.0``
    hls-live-edge            (int) How many segments from the end
                             to start live streams on, default: ``3``
    hls-segment-attempts     (int) How many attempts should be done
                             to download each HLS segment,
                             default: ``3``
    hls-segment-threads      (int) The size of the thread pool used
                             to download segments, default: ``1``
    hls-segment-timeout      (float) HLS segment connect and read
                             timeout, default: ``10.0``
    hls-timeout              (float) Timeout for reading data from
                             HLS streams, default: ``60.0``
    http-proxy               (str) Specify a HTTP proxy to use for
                             all HTTP requests
    https-proxy              (str) Specify a HTTPS proxy to use for
                             all HTTPS requests
    http-cookies             (dict or str) A dict or a semicolon (;)
                             delimited str of cookies to add to each
                             HTTP request, e.g. ``foo=bar;baz=qux``
    http-headers             (dict or str) A dict or a semicolon (;)
                             delimited str of headers to add to each
                             HTTP request, e.g. ``foo=bar;baz=qux``
    http-query-params        (dict or str) A dict or an ampersand (&)
                             delimited string of query parameters to
                             add to each HTTP request,
                             e.g. ``foo=bar&baz=qux``
    http-trust-env           (bool) Trust HTTP settings set in the
                             environment, such as environment
                             variables (HTTP_PROXY, etc) and
                             ~/.netrc authentication
    http-ssl-verify          (bool) Verify SSL certificates,
                             default: ``True``
    http-ssl-cert            (str or tuple) SSL certificate to use,
                             can be either a .pem file (str) or a
                             .crt/.key pair (tuple)
    http-timeout             (float) General timeout used by all
                             HTTP requests except the ones covered
                             by other options, default: ``20.0``
    http-stream-timeout      (float) Timeout for reading data from
                             HTTP streams, default: ``60.0``
    subprocess-errorlog      (bool) Log errors from subprocesses to
                             a file located in the temp directory
    subprocess-errorlog-path (str) Log errors from subprocesses to
                             a specific file
    ringbuffer-size          (int) The size of the internal ring
                             buffer used by most stream types,
                             default: ``16777216`` (16MB)
    rtmp-proxy               (str) Specify a proxy (SOCKS) that
                             RTMP streams will use
    rtmp-rtmpdump            (str) Specify the location of the
                             rtmpdump executable used by RTMP
                             streams,
                             e.g. ``/usr/local/bin/rtmpdump``
    rtmp-timeout             (float) Timeout for reading data from
                             RTMP streams, default: ``60.0``
    ffmpeg-ffmpeg            (str) Specify the location of the
                             ffmpeg executable used by muxing
                             streams,
                             e.g. ``/usr/local/bin/ffmpeg``
    ffmpeg-verbose           (bool) Log stderr from ffmpeg to the
                             console
    ffmpeg-verbose-path      (str) Specify the location of the
                             ffmpeg stderr log file
    ffmpeg-video-transcode   (str) The codec to use if transcoding
                             video when muxing with ffmpeg,
                             e.g. ``h264``
    ffmpeg-audio-transcode   (str) The codec to use if transcoding
                             audio when muxing with ffmpeg,
                             e.g. ``aac``
    stream-segment-attempts  (int) How many attempts should be done
                             to download each segment, default: ``3``.
                             General option used by streams not
                             covered by other options.
    stream-segment-threads   (int) The size of the thread pool used
                             to download segments, default: ``1``.
                             General option used by streams not
                             covered by other options.
    stream-segment-timeout   (float) Segment connect and read
                             timeout, default: ``10.0``. General
                             option used by streams not covered by
                             other options.
    stream-timeout           (float) Timeout for reading data from
                             stream, default: ``60.0``. General
                             option used by streams not covered by
                             other options.
    locale                   (str) Locale setting, in the RFC 1766
                             format, e.g. en_US or es_ES, default:
                             ``system locale``.
    ======================== =========================================
    """
    # Backwards compatibility
    if key == "rtmpdump":
        key = "rtmp-rtmpdump"
    elif key == "rtmpdump-proxy":
        key = "rtmp-proxy"
    elif key == "errorlog":
        key = "subprocess-errorlog"
    elif key == "errorlog-path":
        key = "subprocess-errorlog-path"

    if key == "http-proxy":
        self.http.proxies["http"] = update_scheme("http://", value)
    elif key == "https-proxy":
        self.http.proxies["https"] = update_scheme("https://", value)
    elif key == "http-cookies":
        if isinstance(value, dict):
            self.http.cookies.update(value)
        else:
            self.http.parse_cookies(value)
    elif key == "http-headers":
        if isinstance(value, dict):
            self.http.headers.update(value)
        else:
            self.http.parse_headers(value)
    elif key == "http-query-params":
        if isinstance(value, dict):
            self.http.params.update(value)
        else:
            self.http.parse_query_params(value)
    elif key == "http-trust-env":
        self.http.trust_env = value
    elif key == "http-ssl-verify":
        self.http.verify = value
    elif key == "http-disable-dh":
        if value:
            requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ':!DH'
            try:
                requests.packages.urllib3.contrib.pyopenssl.DEFAULT_SSL_CIPHER_LIST = \
                    requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS.encode("ascii")
            except AttributeError:
                # no ssl to disable the cipher on
                pass
    elif key == "http-ssl-cert":
        self.http.cert = value
    elif key == "http-timeout":
        self.http.timeout = value
    else:
        self.options.set(key, value)
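# A usage sketch (session class name assumed, as above): the two proxy keys
# run their values through update_scheme, and legacy keys are remapped first.
from livecli import Livecli

session = Livecli()
session.set_option("http-proxy", "127.0.0.1:8080")  # stored as "http://127.0.0.1:8080"
session.set_option("hls-live-edge", 1)
session.set_option("rtmpdump", "/usr/local/bin/rtmpdump")  # remapped to "rtmp-rtmpdump"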
def _get_streams(self):
    res = http.get(self.url)
    match = self._player_js.search(res.text)
    if match:
        player_js = match.group(0)
        self.logger.info("Found player js {0}", player_js)
    else:
        self.logger.info("Didn't find player js. Probably this page doesn't contain a video")
        return

    res = http.get(player_js)
    m = self._data_re.search(res.text)
    if not m:
        self.logger.info("Couldn't extract json metadata from player.js: {0}", player_js)
        return

    stream_metadata = json.loads(m.group("data"))
    is_video = stream_metadata["mediaType"] in ["live", "vod"]
    is_audio = stream_metadata["mediaType"] == "aod"
    media_version = tuple(
        int(d) for d in stream_metadata["mediaVersion"].split("-")[0].split("."))

    if is_video or is_audio:
        media_url = stream_metadata["mediaResource"]["dflt"]["videoURL" if is_video else "audioURL"]
        media_url_alt = stream_metadata["mediaResource"]["alt"]["videoURL" if is_video else "audioURL"]
        media_name = "audio" if is_audio else "vod"

        if media_version >= (1, 2, 0):
            media_format = stream_metadata["mediaResource"]["dflt"]["mediaFormat"]
            media_format_alt = stream_metadata["mediaResource"]["alt"]["mediaFormat"]
        else:
            media_format = stream_metadata["mediaFormat"]
            media_format_alt = media_url_alt[-4:]

        stream_url = {"url": media_url, "format": media_format, "name": media_name}
        stream_url_alt = {"url": media_url_alt, "format": media_format_alt, "name": media_name}

        for stream in [stream_url, stream_url_alt]:
            url = update_scheme("http://", stream["url"])
            try:
                if stream["format"] in ["hds", ".f4m"]:
                    for s in HDSStream.parse_manifest(self.session, url, is_akamai=True).items():
                        yield s
                elif stream["format"] in ["hls", "m3u8"]:
                    streams = HLSStream.parse_variant_playlist(self.session, url).items()
                    if not streams:
                        yield "live", HLSStream(self.session, url)
                    for s in streams:
                        yield s
                elif stream["format"] in ["mp3", "mp4", ".mp3", ".mp4"]:
                    yield stream["name"], HTTPStream(self.session, url)
            except IOError as err:
                self.logger.error("Failed to extract {0} streams: {1}",
                                  stream["format"], err)
def _make_url_list(self, old_list, base_url, stream_base=""): """Creates a list of validate urls from a list of broken urls and removes every blacklisted url Args: old_list: List of broken urls base_url: url that will get used for scheme and netloc stream_base: basically same as base_url, but used for .f4m files. Returns: List of validate urls """ blacklist_netloc_user = self.get_option("blacklist_netloc") blacklist_netloc = ( "about:blank", "adfox.ru", "googletagmanager.com", "javascript:false", ) blacklist_path = [ ("facebook.com", "/plugins"), ("vesti.ru", "/native_widget.html"), ] # Add --resolve-blacklist-path to blacklist_path blacklist_path_user = self.get_option("blacklist_path") if blacklist_path_user is not None: for _path_url in blacklist_path_user: if not _path_url.startswith(("http", "//")): _path_url = update_scheme("http://", _path_url) _parsed_path_url = urlparse(_path_url) if _parsed_path_url.netloc and _parsed_path_url.path: blacklist_path += [(_parsed_path_url.netloc, _parsed_path_url.path)] new_list = [] for url in old_list: # Don't add the same url as self.url to the list. if url == self.url: continue # Repair the scheme new_url = url.replace("\\", "") if new_url.startswith("http://"): new_url = "http:" + new_url[9:] elif new_url.startswith("https://"): new_url = "https:" + new_url[10:] # Repair the domain if stream_base and new_url[1] is not "/": if new_url[0] is "/": new_url = new_url[1:] new_url = urljoin(stream_base, new_url) else: new_url = urljoin(base_url, new_url) # Parse the url and remove not wanted urls parse_new_url = urlparse(new_url) REMOVE = False # Removes blacklisted domains if REMOVE is False and parse_new_url.netloc.endswith( blacklist_netloc): REMOVE = True # Removes blacklisted domains from --resolve-blacklist-netloc if REMOVE is False and blacklist_netloc_user is not None and parse_new_url.netloc.endswith( tuple(blacklist_netloc_user)): REMOVE = True # Removes blacklisted paths from a domain if REMOVE is False: for netloc, path in blacklist_path: if parse_new_url.netloc.endswith( netloc) and parse_new_url.path.startswith(path): REMOVE = True continue # Removes images and chatrooms if REMOVE is False and parse_new_url.path.endswith( (".jpg", ".png", ".svg", "/chat")): REMOVE = True # Remove obviously ad urls if REMOVE is False and self._ads_path.match(parse_new_url.path): REMOVE = True if REMOVE is True: self.logger.debug("Removed url: {0}".format(new_url)) continue # Add url to the list new_list += [new_url] # Remove duplicates new_list = list(set(new_list)) return new_list
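# A comment-only trace of the repair stages above, on made-up input:
#
#   raw = "https&#58;\\/\\/example.com\\/stream.m3u8"
#   u = raw.replace("\\", "")  # -> "https&#58;//example.com/stream.m3u8"
#   "https:" + u[10:]          # -> "https://example.com/stream.m3u8"
#
# i.e. JSON-escaped slashes are stripped first, then the HTML-entity scheme
# ("&#58;" is ":") is rewritten to a plain one before urljoin normalisation.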