def fetch_cached(url, callback, encoding=None, dbg_ref="", file_prefix=""):
    """ Fetch url - from tmpdir if already retrieved.

    The text is cached in a "pafy" directory under the system temporary
    directory, in a file named file_prefix followed by the md5 hex digest
    of the url.

    :param url: address to retrieve
    :param callback: optional callable given a status message after a
        network fetch; skipped when falsy
    :param encoding: encoding used to decode the response and to read and
        write the cache file; "utf8" when not given
    :param dbg_ref: short label for the resource, used in debug and status
        messages
    :param file_prefix: prefix of the cache file name, also handed to
        prune_files
    :returns: the text of the resource
    """
    # Local import keeps this block self-contained; io.open accepts an
    # encoding argument on both Python 2 and Python 3.
    import io

    enc = encoding or "utf8"
    tmpdir = os.path.join(tempfile.gettempdir(), "pafy")

    # Create first and tolerate "already exists": checking for the directory
    # beforehand can race with another process creating it.
    try:
        os.makedirs(tmpdir)
    except OSError:
        if not os.path.isdir(tmpdir):
            raise

    url_md5 = hashlib.md5(url.encode("utf8")).hexdigest()
    cached_filename = os.path.join(tmpdir, file_prefix + url_md5)

    if os.path.exists(cached_filename):
        dbg("fetched %s from cache", dbg_ref)

        # Read with the same encoding the file is written with below, rather
        # than whatever the platform default happens to be.
        with io.open(cached_filename, encoding=enc) as f:
            return f.read()

    data = fetch_decode(url, enc)  # unicode
    dbg("Fetched %s", dbg_ref)

    if callback:
        callback("Fetched %s" % dbg_ref)

    with io.open(cached_filename, "w", encoding=enc) as f:
        f.write(data)

    # prune files after write
    prune_files(tmpdir, file_prefix)
    return data
def _extract_dash(dashurl):
    """ Fetch the DASH manifest at dashurl and describe its streams.

    Returns a list with one dict per Representation element found in the
    manifest, holding the keys bitrate, dash, itag, width, height, url and
    size.
    """
    # pylint: disable = R0914
    dbg("Fetching dash page")
    manifest = fetch_decode(dashurl)
    dbg("DASH list fetched")

    mpd_ns = "{urn:mpeg:DASH:schema:MPD:2011}"
    yt_ns = "{http://youtube.com/yt/2012/10/10}"
    root = ElementTree.fromstring(manifest)

    def _describe(rep):
        """ Build the stream dict for one Representation element. """
        base = rep.find("%sBaseURL" % mpd_ns)
        # Read the BaseURL fields before anything else, as the url and size
        # both come from that child element.
        location = base.text
        length = base.get("%scontentLength" % yt_ns)
        return {
            "bitrate": rep.get("bandwidth"),
            "dash": True,
            "itag": uni(rep.get("id")),
            "width": uni(rep.get("width")),
            "height": uni(rep.get("height")),
            "url": location,
            "size": length,
        }

    representations = root.findall(".//%sRepresentation" % mpd_ns)
    return [_describe(rep) for rep in representations]
def _get_mainfunc_from_js(js):
    """ Return main signature decryption function from javascript.

    The function name is taken from the call that follows ".sig||" in the
    javascript text, then handed to JSInterpreter for extraction.

    :param js: text of the player javascript
    :returns: the result of JSInterpreter.extract_function for that name
    :raises IOError: if no such call can be found in js
    """
    dbg("Scanning js for main function.")
    m = re.search(r'\.sig\|\|([a-zA-Z0-9$]+)\(', js)

    if m is None:
        # re.search returns None when nothing matches; report that plainly
        # instead of failing with an AttributeError on None.
        raise IOError("Could not find signature function in javascript")

    funcname = m.group(1)
    dbg("Found main function: %s", funcname)
    jsi = JSInterpreter(js)
    return jsi.extract_function(funcname)
def _decodesig(sig, js_url, callback):
    """ Decrypt sig using the main function stored in funcmap for js_url.

    callback, when truthy, receives a status message before and after the
    decryption. Returns the decrypted signature.
    """
    def _notify(message):
        """ Pass message on to callback if one was supplied. """
        if callback:
            callback(message)

    # funcmap is keyed by javascript url
    decrypt = funcmap[js_url]

    _notify("Decrypting signature")
    # the signature is passed as the function's single-item argument list
    plain = decrypt([sig])
    dbg("Decrypted sig = %s...", plain[:30])
    _notify("Decrypted signature")
    return plain
def get_video_info(video_id, callback, newurl=None):
    """ Return info for video_id.  Returns dict.

    :param video_id: id of the video to look up
    :param callback: accepted for interface compatibility; not used here
    :param newurl: if given, fetched in place of the standard video info url
    :returns: the parseqs result for the response; its values are read
        with [0] in this function
    :raises IOError: if the reported status is "fail", or if the sts value
        cannot be found in the embed page
    """
    if newurl:
        # The embed page only supplies the sts value for the standard url,
        # so there is nothing to gain from requesting it here.
        url = newurl

    else:
        # TODO: see if there is a way to avoid retrieving the embed page
        #       just for this, or to use it for more. This was copied from
        #       youtube-dl.
        embed_webpage = fetch_decode(g.urls['embed'])
        sts_match = re.search(r'sts"\s*:\s*(\d+)', embed_webpage)

        if sts_match is None:
            # re.search returns None when nothing matches
            raise IOError("Could not find sts in embed page [%s]" % video_id)

        url = g.urls['vidinfo'] % (video_id, video_id, sts_match.group(1))

    info = fetch_decode(url)  # bytes
    info = parseqs(info)  # unicode dict
    dbg("Fetched video info%s", " (age ver)" if newurl else "")

    if info['status'][0] == "fail":
        # A failure response without a reason still gets the fallback text
        # instead of raising KeyError.
        reason = info.get('reason', [None])[0] or "Bad video argument"
        raise IOError("Youtube says: %s [%s]" % (reason, video_id))

    return info
def get_js_sm(watchinfo, callback):
    """ Extract stream maps and the signature function from a watch page.

    This function is needed by videos with encrypted signatures. The player
    json is located in watchinfo with g.jsplayer. If the js url it refers to
    is not a key in funcmap, the javascript is fetched through the cache and
    its main function extracted; otherwise the stored function is reused.

    Returns a tuple of (the g.UEFSM and g.AF stream maps as a pair, js url,
    main function).
    """
    player = json.loads(re.search(g.jsplayer, watchinfo).group(1))
    args = player['args']
    stream_maps = (_extract_smap(g.UEFSM, args, False),
                   _extract_smap(g.AF, args, False))

    js_url = player['assets']['js']

    # protocol-relative urls are fetched over https
    if js_url.startswith("//"):
        js_url = "https:" + js_url

    mainfunc = funcmap.get(js_url)

    if mainfunc:
        dbg("Using functions in memory extracted from %s", js_url)
        dbg("Mem contains %s js func sets", len(funcmap))
        return stream_maps, js_url, mainfunc

    dbg("Fetching javascript")

    if callback:
        callback("Fetching javascript")

    javascript = fetch_cached(js_url, callback, encoding="utf8",
                              dbg_ref="javascript", file_prefix="js-")
    return stream_maps, js_url, _get_mainfunc_from_js(javascript)
def _fetch_basic(self):
    """ Fetch basic data and streams.

    Does nothing if the basic data has already been fetched. Otherwise the
    video info is retrieved and used to fill in the title, author, rating,
    length, view count, thumbnails, formats, keywords and stream maps. The
    watch page is then fetched to detect the age gate and, for videos with
    encrypted signatures, to obtain the stream maps and the function needed
    to decrypt the dash url signature. Finally the dash manifest is read (if
    a dash url is known), the streams are processed and the expiry time is
    set.

    self.callback, when truthy, is given status messages along the way.
    """
    if self._have_basic:
        return

    allinfo = get_video_info(self.videoid, self.callback)

    if self.callback:
        self.callback("Fetched video info")

    def _get_lst(key, default="unknown", dic=allinfo):
        """ Dict get function, returns first index. """
        retval = dic.get(key, default)
        return retval[0] if retval != default else default

    self._title = _get_lst('title')
    self._dashurl = _get_lst('dashmpd')
    self._author = _get_lst('author')
    self._rating = float(_get_lst('avg_rating', 0.0))
    self._length = int(_get_lst('length_seconds', 0))
    # 0 is the fallback for a missing count. Passing it to int() as a second
    # argument would make it the number base and leave "unknown" as the
    # fallback, which int() cannot parse.
    self._viewcount = int(_get_lst('view_count', 0))
    self._thumb = unquote_plus(_get_lst('thumbnail_url', ""))
    self._formats = [x.split("/") for x in _get_lst('fmt_list').split(",")]
    self._keywords = _get_lst('keywords', "").split(',')
    self._bigthumb = _get_lst('iurlsd', "")
    self._bigthumbhd = _get_lst('iurlsdmaxres', "")
    self.ciphertag = _get_lst("use_cipher_signature") == "True"
    self.sm = _extract_smap(g.UEFSM, allinfo, True)
    self.asm = _extract_smap(g.AF, allinfo, True)
    dbg("extracted stream maps")

    # The first stream map entry overrides the flag reported in the info.
    sm_ciphertag = "s" in self.sm[0]

    if self.ciphertag != sm_ciphertag:
        dbg("ciphertag mismatch")
        self.ciphertag = not self.ciphertag

    watch_url = g.urls['watchv'] % self.videoid

    if self.callback:
        self.callback("Fetching watch page")

    watchinfo = fetch_decode(watch_url)  # unicode
    dbg("Fetched watch page")

    if self.callback:
        self.callback("Fetched watch page")

    self.age_ver = re.search(r'player-age-gate-content">',
                             watchinfo) is not None

    if self.ciphertag:
        dbg("Encrypted signature detected.")

        # The dash url may have no /s/ component (it is "unknown" when the
        # info has no dashmpd entry); in that case it is left untouched
        # rather than failing on a missing match.
        dash_match = re.search(r"/s/([\w\.]+)", self._dashurl)

        if not self.age_ver:
            smaps, js_url, mainfunc = get_js_sm(watchinfo, self.callback)
            funcmap[js_url] = mainfunc
            self.sm, self.asm = smaps
            self.js_url = js_url

            if dash_match:
                dashsig = dash_match.group(1)
                dbg("decrypting dash sig")
                goodsig = _decodesig(dashsig, js_url, self.callback)
                self._dashurl = re.sub(r"/s/[\w\.]+",
                                       "/signature/%s" % goodsig,
                                       self._dashurl)

        elif dash_match:
            # Age-gated videos: the signature characters are rearranged
            # with fixed offsets instead of using the player javascript.
            s = dash_match.group(1)
            s = s[2:63] + s[82] + s[64:82] + s[63]
            self._dashurl = re.sub(r"/s/[\w\.]+",
                                   "/signature/%s" % s, self._dashurl)

    if self._dashurl != 'unknown':
        self.dash = _extract_dash(self._dashurl)

    self._have_basic = 1
    self._process_streams()
    self.expiry = time.time() + g.lifespan