def is_age_restricted(watch_html):
    """Tell whether the watch page flags the content as age restricted.

    :param str watch_html:
        The html contents of the watch page.
    :rtype: bool
    :returns:
        ``True`` when the ``og:restrictions:age`` marker is found in the
        html, ``False`` when the search raises ``RegexMatchError``.

    """
    marker = r'og:restrictions:age'
    try:
        # Only the success or failure of the search matters; the matched
        # text itself is discarded.
        regex_search(marker, watch_html, group=0)
    except RegexMatchError:
        restricted = False
    else:
        restricted = True
    return restricted
def get_initial_function_name(js):
    """Extract the name of the function responsible for computing the signature.

    A list of candidate regular expressions is handed to ``regex_search``
    together with ``js``; each expression captures the function name in a
    group called ``sig``.

    :param str js:
        The contents of the base.js asset file.
    :returns:
        Whatever ``regex_search`` yields for ``group=1``.

    """
    # Example of the JavaScript being targeted:
    #   c&&d.set("signature", EE(c));
    pattern = [
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>['
        r'a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\('
        r'\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\('
        r'\s*""\s*\)',
        # NOTE(review): the first capturing group of this pattern is the
        # quote character, not ``sig``; with ``group=1`` below it would not
        # yield the function name - confirm how ``regex_search`` resolves
        # the group.
        r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
        # This pattern previously ended in an unterminated ``(?P<si`` group,
        # which cannot be compiled; it now closes the ``sig`` group the same
        # way every other entry does.
        r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,'
        r'\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>['
        r'a-zA-Z0-9$]+)\(',
        # NOTE(review): identical to the previous entry; kept so the list
        # handed to ``regex_search`` is unchanged apart from the fix above.
        r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>['
        r'a-zA-Z0-9$]+)\(',
    ]

    logger.debug('finding initial function name')
    return regex_search(pattern, js, group=1)
def parse_function(js_func):
    """Split a JavaScript transform call into its name and argument.

    The call is matched against a pattern with two capturing groups: the
    method name that follows the dot, and the run of digits passed as the
    second argument. For ``DE.AJ(a,15)`` those pieces are ``AJ`` and ``15``.

    :param str js_func:
        The JavaScript version of the transform function.
    :rtype: tuple
    :returns:
        Whatever ``regex_search`` yields for ``groups=True``. No integer
        conversion of the argument happens here.

    """
    logger.debug('parsing transform function')
    call_pattern = r'\w+\.(\w+)\(\w,(\d+)\)'
    return regex_search(call_pattern, js_func, groups=True)
def video_id(url):
    """Pull the ``video_id`` out of a YouTube url.

    The id is taken to be 11 characters from ``[0-9A-Za-z_-]`` that follow
    either ``v=`` or a ``/``. This covers the patterns:

    - :samp:`https://youtube.com/watch?v={video_id}`
    - :samp:`https://youtube.com/embed/{video_id}`
    - :samp:`https://youtu.be/{video_id}`

    :param str url:
        A YouTube url containing a video id.
    :rtype: str
    :returns:
        YouTube video id.

    """
    id_pattern = r'(?:v=|\/)([0-9A-Za-z_-]{11}).*'
    return regex_search(id_pattern, url, group=1)
def video_info_url(
    video_id, watch_url, watch_html, embed_html, age_restricted,
):
    """Build the ``get_video_info`` url for a video.

    :param str video_id:
        A YouTube video identifier.
    :param str watch_url:
        A YouTube watch url.
    :param str watch_html:
        The html contents of the watch page. Not read by this function.
    :param str embed_html:
        The html contents of the embed page (for age restricted videos);
        the ``sts`` value is extracted from it.
    :param bool age_restricted:
        Is video age restricted.
    :rtype: str
    :returns:
        :samp:`https://youtube.com/get_video_info` with necessary GET
        parameters.

    """
    base_url = 'https://youtube.com/get_video_info?'

    # Parameters live in an ``OrderedDict`` so the query string comes out in
    # the same order regardless of the Python version in use.
    if not age_restricted:
        # NOTE(review): ``watch_url`` is quoted here and then encoded again
        # by ``urlencode`` below - confirm the double encoding is intended.
        query = OrderedDict([
            ('video_id', video_id),
            ('el', '$el'),
            ('ps', 'default'),
            ('eurl', quote(watch_url)),
            ('hl', 'en_US'),
        ])
        return base_url + urlencode(query)

    sts = regex_search(r'"sts"\s*:\s*(\d+)', embed_html, group=1)
    query = OrderedDict([
        ('video_id', video_id),
        ('eurl', eurl(video_id)),
        ('sts', sts),
    ])
    return base_url + urlencode(query)
def get_transform_object(js, var):
    """Extract the "transform object".

    The "transform object" holds the function definitions referenced by the
    "transform plan". ``var`` is the obfuscated variable name containing
    those functions; for the call ``DE.AJ(a,15)`` returned by the transform
    plan, the var would be "DE".

    :param str js:
        The contents of the base.js asset file.
    :param str var:
        The obfuscated variable name that stores an object with all
        functions that descramble the signature.
    :returns:
        The text between ``var <var>={`` and ``};`` with newlines replaced
        by spaces, split on ``', '``.

    **Example**:

    >>> get_transform_object(js, 'DE')
    ['AJ:function(a){a.reverse()}',
    'VR:function(a,b){a.splice(0,b)}',
    'kT:function(a,b){var c=a[0];a[0]=a[b%a.length];a[b]=c}']

    """
    escaped_var = re.escape(var)
    pattern = r'var %s={(.*?)};' % escaped_var
    logger.debug('getting transform object')
    # DOTALL lets the lazy ``.*?`` run across line breaks inside the object.
    object_body = regex_search(pattern, js, group=1, flags=re.DOTALL)
    single_line = object_body.replace('\n', ' ')
    return single_line.split(', ')
def get_transform_plan(js):
    """Extract the "transform plan".

    The "transform plan" is the sequence of function calls the ciphered
    signature is passed through to obtain the actual signature. It is read
    from the body of the function whose name ``get_initial_function_name``
    reports for ``js``.

    :param str js:
        The contents of the base.js asset file.
    :returns:
        The captured statements, split on ``;``.

    **Example**:

    >>> get_transform_plan(js)
    ['DE.AJ(a,15)',
    'DE.VR(a,3)',
    'DE.AJ(a,51)',
    'DE.VR(a,3)',
    'DE.kT(a,51)',
    'DE.kT(a,8)',
    'DE.VR(a,3)',
    'DE.kT(a,21)']

    """
    initial_name = get_initial_function_name(js)
    escaped_name = re.escape(initial_name)
    # The capturing group spans the statements between the first and the
    # last ``;``-terminated statement of the function body.
    pattern = r'%s=function\(\w\){[a-z=\.\(\"\)]*;(.*);(?:.+)}' % escaped_name
    logger.debug('getting transform plan')
    statements = regex_search(pattern, js, group=1)
    return statements.split(';')
def get_ytplayer_config(html, age_restricted=False):
    """Get the YouTube player configuration data from the given html.

    Locates the player config, which is json data embedded within the html,
    and decodes it. A different pattern is used to find the config when the
    video is age restricted.

    :param str html:
        The html contents to search.
    :param bool age_restricted:
        Is video age restricted.
    :returns:
        The result of ``json.loads`` applied to the captured config text.

    """
    pattern = (
        r";yt\.setConfig\(\{'PLAYER_CONFIG':\s*({.*})(,'EXPERIMENT_FLAGS'|;)"  # noqa: E501
        if age_restricted
        else r';ytplayer\.config\s*=\s*({.*?});'
    )
    return json.loads(regex_search(pattern, html, group=1))
def mime_type_codec(mime_type_codec):
    """Separate a manifest ``type`` value into mime type and codecs.

    The ``type`` key of the manifest serializes the mime type and the codecs
    together; this splits them into separate elements.

    **Example**:

    >>> mime_type_codec('audio/webm; codecs="opus"')
    ('audio/webm', ['opus'])

    :param str mime_type_codec:
        String containing mime type and codecs.
    :rtype: tuple
    :returns:
        The mime type and a list of codecs, each stripped of surrounding
        whitespace.

    """
    type_pattern = r'(\w+\/\w+)\;\scodecs=\"([a-zA-Z-0-9.,\s]*)\"'
    media_type, codec_csv = regex_search(
        type_pattern, mime_type_codec, groups=True,
    )
    return media_type, [name.strip() for name in codec_csv.split(',')]