Пример #1
0
def fetch_cached(url, callback, encoding=None, dbg_ref="", file_prefix=""):
    """ Fetch url - from tmpdir if already retrieved. """
    tmpdir = os.path.join(tempfile.gettempdir(), "pafy")

    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

    url_md5 = hashlib.md5(url.encode("utf8")).hexdigest()
    cached_filename = os.path.join(tmpdir, file_prefix + url_md5)

    if os.path.exists(cached_filename):
        dbg("fetched %s from cache", dbg_ref)

        with open(cached_filename) as f:
            retval = f.read()

        return retval

    else:
        data = fetch_decode(url, "utf8")  # unicode
        dbg("Fetched %s", dbg_ref)
        if callback:
            callback("Fetched %s" % dbg_ref)

        with open(cached_filename, "w") as f:
            f.write(data)

        # prune files after write
        prune_files(tmpdir, file_prefix)
        return data
Пример #2
0
def _extract_dash(dashurl):
    """ Download dash url and extract some data. """
    # pylint: disable = R0914
    dbg("Fetching dash page")
    dashdata = fetch_decode(dashurl)
    dbg("DASH list fetched")
    ns = "{urn:mpeg:DASH:schema:MPD:2011}"
    ytns = "{http://youtube.com/yt/2012/10/10}"
    tree = ElementTree.fromstring(dashdata)
    tlist = tree.findall(".//%sRepresentation" % ns)
    dashmap = []

    for x in tlist:
        baseurl = x.find("%sBaseURL" % ns)
        url = baseurl.text
        size = baseurl.get("%scontentLength" % ytns)
        bitrate = x.get("bandwidth")
        itag = uni(x.get("id"))
        width = uni(x.get("width"))
        height = uni(x.get("height"))
        dashmap.append(
            dict(bitrate=bitrate,
                 dash=True,
                 itag=itag,
                 width=width,
                 height=height,
                 url=url,
                 size=size))
    return dashmap
Пример #3
0
def _get_mainfunc_from_js(js):
    """ Return main signature decryption function from javascript as dict. """
    dbg("Scanning js for main function.")
    m = re.search(r'\.sig\|\|([a-zA-Z0-9$]+)\(', js)
    funcname = m.group(1)
    dbg("Found main function: %s", funcname)
    jsi = JSInterpreter(js)
    return jsi.extract_function(funcname)
Пример #4
0
def _decodesig(sig, js_url, callback):
    """  Return decrypted sig given an encrypted sig and js_url key. """
    # lookup main function in funcmap dict
    mainfunction = funcmap[js_url]

    # fill in function argument with signature
    if callback:
        callback("Decrypting signature")
    solved = mainfunction([sig])
    dbg("Decrypted sig = %s...", solved[:30])
    if callback:
        callback("Decrypted signature")
    return solved
Пример #5
0
def get_video_info(video_id, callback, newurl=None):
    """ Return info for video_id.  Returns dict. """
    # TODO: see if there is a way to avoid retrieving the embed page
    #       just for this, or to use it for more. This was coppied from
    #       youtube-dl.
    embed_webpage = fetch_decode(g.urls['embed'])
    sts = re.search(r'sts"\s*:\s*(\d+)', embed_webpage).group(1)

    url = g.urls['vidinfo'] % (video_id, video_id, sts)
    url = newurl if newurl else url
    info = fetch_decode(url)  # bytes
    info = parseqs(info)  # unicode dict
    dbg("Fetched video info%s", " (age ver)" if newurl else "")

    if info['status'][0] == "fail":
        reason = info['reason'][0] or "Bad video argument"
        raise IOError("Youtube says: %s [%s]" % (reason, video_id))

    return info
Пример #6
0
def get_js_sm(watchinfo, callback):
    """ Fetch watchinfo page and extract stream map and js funcs if not known.
    This function is needed by videos with encrypted signatures.
    If the js url referred to in the watchv page is not a key in funcmap,
    the javascript is fetched and functions extracted.
    Returns streammap (list of dicts), js url (str)  and funcs (dict)
    """
    m = re.search(g.jsplayer, watchinfo)
    myjson = json.loads(m.group(1))
    stream_info = myjson['args']
    sm = _extract_smap(g.UEFSM, stream_info, False)
    asm = _extract_smap(g.AF, stream_info, False)
    js_url = myjson['assets']['js']
    js_url = "https:" + js_url if js_url.startswith("//") else js_url
    mainfunc = funcmap.get(js_url)

    if not mainfunc:
        dbg("Fetching javascript")
        if callback:
            callback("Fetching javascript")
        javascript = fetch_cached(js_url,
                                  callback,
                                  encoding="utf8",
                                  dbg_ref="javascript",
                                  file_prefix="js-")
        mainfunc = _get_mainfunc_from_js(javascript)

    elif mainfunc:
        dbg("Using functions in memory extracted from %s", js_url)
        dbg("Mem contains %s js func sets", len(funcmap))

    return (sm, asm), js_url, mainfunc
Пример #7
0
    def _fetch_basic(self):
        """ Fetch basic data and streams. """
        if self._have_basic:
            return

        allinfo = get_video_info(self.videoid, self.callback)

        if self.callback:
            self.callback("Fetched video info")

        def _get_lst(key, default="unknown", dic=allinfo):
            """ Dict get function, returns first index. """
            retval = dic.get(key, default)
            return retval[0] if retval != default else default

        self._title = _get_lst('title')
        self._dashurl = _get_lst('dashmpd')
        self._author = _get_lst('author')
        self._rating = float(_get_lst('avg_rating', 0.0))
        self._length = int(_get_lst('length_seconds', 0))
        self._viewcount = int(_get_lst('view_count'), 0)
        self._thumb = unquote_plus(_get_lst('thumbnail_url', ""))
        self._formats = [x.split("/") for x in _get_lst('fmt_list').split(",")]
        self._keywords = _get_lst('keywords', "").split(',')
        self._bigthumb = _get_lst('iurlsd', "")
        self._bigthumbhd = _get_lst('iurlsdmaxres', "")
        self.ciphertag = _get_lst("use_cipher_signature") == "True"
        self.sm = _extract_smap(g.UEFSM, allinfo, True)
        self.asm = _extract_smap(g.AF, allinfo, True)
        dbg("extracted stream maps")

        sm_ciphertag = "s" in self.sm[0]

        if self.ciphertag != sm_ciphertag:
            dbg("ciphertag mismatch")
            self.ciphertag = not self.ciphertag

        watch_url = g.urls['watchv'] % self.videoid
        if self.callback:
            self.callback("Fetching watch page")
        watchinfo = fetch_decode(watch_url)  # unicode
        dbg("Fetched watch page")
        if self.callback:
            self.callback("Fetched watch page")
        self.age_ver = re.search(r'player-age-gate-content">',
                                 watchinfo) is not None

        if self.ciphertag:
            dbg("Encrypted signature detected.")

            if not self.age_ver:
                smaps, js_url, mainfunc = get_js_sm(watchinfo, self.callback)
                funcmap[js_url] = mainfunc
                self.sm, self.asm = smaps
                self.js_url = js_url
                dashsig = re.search(r"/s/([\w\.]+)", self._dashurl).group(1)
                dbg("decrypting dash sig")
                goodsig = _decodesig(dashsig, js_url, self.callback)
                self._dashurl = re.sub(r"/s/[\w\.]+",
                                       "/signature/%s" % goodsig,
                                       self._dashurl)

            else:
                s = re.search(r"/s/([\w\.]+)", self._dashurl).group(1)
                s = s[2:63] + s[82] + s[64:82] + s[63]
                self._dashurl = re.sub(r"/s/[\w\.]+", "/signature/%s" % s,
                                       self._dashurl)

        if self._dashurl != 'unknown':
            self.dash = _extract_dash(self._dashurl)
        self._have_basic = 1
        self._process_streams()
        self.expiry = time.time() + g.lifespan