Example No. 1
    def fileinfo_from_list(url):
        # get page
        try:
            resp = requests.get(url, headers = Downloader.header, cookies = Downloader.cookies)
        except Exception as e:
            raise BaseDownloaderException("Fail to get url: %s, %s" % (url, str(e)))

        try:
            uk       = re.compile(r'FileUtils\.sysUK="(\d+)"'   ).search(resp.text).group(1)
            share_id = re.compile(r'FileUtils\.share_id="(\d+)"').search(resp.text).group(1)
        except Exception as e:
            raise BaseDownloaderException("Url: %s has no file. %s" % (url, str(e)))

        if not uk or not share_id:
            raise BaseDownloaderException("Url: %s has no file." % (url))

        baseurl = "http://pan.baidu.com/share/list?uk=%s&shareid=%s" % (uk, share_id)

        # merge cookies
        cookies = '; '.join("%s=%s" % (k,v) for (k, v) in (list(Downloader.cookies.items()) + list(resp.cookies.items())))

        resp_json = []
        dir_name = re.compile(r'dir/path=([^#]+)').search(url)
        resq_url = ""
        if dir_name:
            dir_name = dir_name.group(1)
            # extract uk and share_id
            resp_json = Downloader.get_json_for_dir(baseurl, dir_name)
        else:
            # file list is hidden in JSON embedded in the page
            resp_json = Downloader.extract_filejson(resp.text)
            if not resp_json:
                raise BaseDownloaderException("Url: %s has no file." % (url))
        yield from Downloader.filelist_json_gen(baseurl, resp_json, cookies)
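The cookie line above joins Downloader.cookies and the cookies returned with the page into a single Cookie header string. A minimal sketch of the same merge as a standalone helper, assuming plain dict inputs (unlike the one-liner above, later values override duplicate keys here):

    def merge_cookie_header(stored, fresh):
        # merge two cookie mappings and render them as one Cookie header value
        merged = dict(stored)
        merged.update(fresh)
        return "; ".join("%s=%s" % (k, v) for k, v in merged.items())

    # e.g. merge_cookie_header({"BDUSS": "xxx"}, {"STOKEN": "yyy"})
    # -> "BDUSS=xxx; STOKEN=yyy"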
Example No. 2
    def filelist_from_dir(page, url):
        # short URL
        try:
            domain, shorturl = re.compile(
                r'(?:http://)?((?:\w+\.)?l\d+\.yunpan.cn)/lk/(.+)$').search(
                    url).group(1, 2)
        except Exception as e:
            raise BaseDownloaderException(
                "Malformed URL: %s, no shorturl or domain. %s" % (url, str(e)))

        rootFileList = extract_filelist_json(page)
        if not rootFileList:
            raise BaseDownloaderException("Url %s has no file in it" % url)

        for fileinfo in rootFileList:
            if fileinfo["fhash"] != "":
                try:
                    yield Downloader.post_for_link(url, domain, shorturl,
                                                   fileinfo)
                except Exception as e:
                    print("Failed to get file %s:\n%s" %
                          (fileinfo["path"], str(e)))
                    continue
            else:
                try:
                    yield from Downloader.filelist_from_subdir(
                        url, fileinfo["nid"], shorturl, domain)
                except Exception as e:
                    print("Failed to get file from subdir %s:\n%s" %
                          (fileinfo["path"], str(e)))
                    continue
Example No. 3
    def download_info(self, url):
        # parse url
        try:
            user_id, type_seg, node = self.parser_re.match(url).groups()
        except Exception as e:
            raise BaseDownloaderException("Cannot parse %s as %s, malformed URL" % (url, self.brand))
        type_seg = type_seg[0:-1]

        # type?
        url_type = None
        if type_seg == "file":
            url_type = self.TYPE_FILE
        elif type_seg == "f" or type_seg == "folder":
            url_type = self.TYPE_FOLDER
        else:
            raise BaseDownloaderException("Cannot parse %s as %s, url malformated" % (url, self.brand))

        # make requests and get url
        num_of_file = 0

        request_url = ""
        if url_type == self.TYPE_FOLDER:
            # get folder content from url
            # http://svr.f.xunlei.com/file/getUserFileList?userId=$user_id&node=$node&needAudit=1
            request_url = "http://svr.f.xunlei.com/file/getUserFileList?userId=%s&node=%d&needAudit=1&callback=" % (user_id, node)
        else:
            # single file
            # http://svr.f.xunlei.com/file/getUserFileList?includingNode=$user_id%3A$node&userId=$user_id
            request_url = "http://svr.f.xunlei.com/file/getUserFileList?callback=&includingNode=%s%%3A%s&userId=%s&onlyFile=1" % (user_id, node, user_id)

        try:
            resp = requests.get(request_url, cookies = cookie, headers = self.header)
        except Exception as e:
            raise BaseDownloaderException("get file list in %s failed, %s" % (url, repr(e)))

        # the response body is wrapped in (), which needs to be removed.
        # actually, the returned JSON is wrapped as xxx(...), where xxx is the callback
        # name passed as a CGI argument; since we pass callback=, only the parentheses remain.
        resp_json = json.load(StringIO(resp.text[1:-1]))

        # did the server return an error?
        if resp_json["rtn"] != 0:
            raise BaseDownloaderException("Request for file list failed, server returned %d: %s" % (resp_json["rtn"], resp_json["data"]["msg"]))

        if url_type == self.TYPE_FOLDER:
            # read file list
            num_of_file = int(resp_json["data"]["nodesTotalNum"])
        else:
            num_of_file = 1

        last_cookies = "Cookie: " + "; ".join("%s=%s" % (k, v) for k, v in list(resp.cookies.items()))

        return (Task(
                 filename = n["name"],
                 url      = [n["url"]],
                 opts     = {"header": [last_cookies] + ["%s: %s" % (k, v) for k, v in list(self.header.items())] + ["Referer: " + url]})
                for n in resp_json["data"]["nodes"]
                if url_type == self.TYPE_FOLDER or n["nodeId"] == node)
Example No. 4
    def get_json_for_dir(baseurl, dir_name):
        # form url
        url = baseurl + '&dir=%s' % (dir_name)
        try:
            resp = requests.get(url, headers = Downloader.header, cookies = Downloader.cookies)
        except Exception as e:
            raise BaseDownloaderException("Url: %s has no file. %s" % (url, str(e)))
        ret_json = json.load(StringIO(resp.text))
        if ret_json["errno"] != 0:
            print("resp: %r" % ret_json)
            print("url: %s"  % url)
            raise BaseDownloaderException("Server returns error: %d" % ret_json["errno"])
        return ret_json["list"]
Example No. 5
    def post_for_link(referer, domain, surl, fileinfo):
        post_url = "http://%s/share/downloadfile/" % domain
        post_data = format_payload(surl, fileinfo["nid"])
        post_resp = None
        try:
            header = dict(
                list(Downloader.header.items()) + [
                    ("Referer", referer),
                    ("Content-Type",
                     "application/x-www-form-urlencoded UTF-8; charset=UTF-8"),
                ])

            post_resp = requests.post(post_url,
                                      data=post_data,
                                      headers=header,
                                      cookies={})
            resp_json = json.load(StringIO(post_resp.text))
        except Exception as e:
            # post_resp may still be None if the POST itself failed
            detail = [
                "  POST url: %s, data: %s" % (post_url, post_data),
                "  Error: %s" % str(e),
            ]
            if post_resp is not None:
                detail += [
                    "  Request: %s %s" %
                    (post_resp.request.headers, post_resp.request.body),
                    "  Server responded: %d %s" %
                    (post_resp.status_code, post_resp.text),
                ]
            raise BaseDownloaderException("\n".join(detail))

        if resp_json["errno"] != 0:
            raise BaseDownloaderException("\n".join([
                "  POST url: %s, data: %s" % (post_url, post_data),
                "  Request: %s %s" %
                (post_resp.request.headers, post_resp.request.body),
                "  Server respond: %d %s" %
                (post_resp.status_code, post_resp.text),
            ]))

        return (Task(
            filename=fileinfo["path"],
            url=[resp_json["data"]["downloadurl"]],
            opts={
                "header":
                ["%s: %s" % (k, v) for (k, v) in Downloader.header.items()] + [
                    "Referer: %s" % referer,
                    "Cookie: %s" % '; '.join(
                        "%s=%s" % (k, v)
                        for (k, v) in post_resp.cookies.get_dict().items())
                ]
            }))
Example No. 6
    def fileinfo_from_home(url):
        # get UK
        uk = re.compile(r'uk=(\d+)').search(url).group(1)

        try:
            resp = requests.get(url, headers = Downloader.header, cookies = Downloader.cookies)
        except Exception as e:
            raise BaseDownloaderException( "Fail to get file list from %s, %s" % (url, str(e)));

        # get num of files
        num_of_file = int(re.compile(r'FileUtils\.pubshare_count="(\d+)"').search(resp.text).group(1))

        start = 0
        # We can only request fewer than 100 items at a time, or the server returns an error
        while num_of_file > 0:
            limit = min(num_of_file, 60)
            # http://pan.baidu.com/pcloud/feed/getsharelist?auth_type=1&request_location=share_home&start=[start]&limit=[limit]&query_uk=[uk]
            resq_url = "http://pan.baidu.com/pcloud/feed/getsharelist?auth_type=1&request_location=share_home&start=%d&limit=%d&query_uk=%s" % (start, limit, uk)

            try:
                resp = requests.get(resq_url, headers = Downloader.header, cookies = Downloader.cookies)
            except Exception as e:
                raise BaseDownloaderException( "Fail to get file list from %s, %s" % (url, str(e)));

            resp_json = json.load(StringIO(resp.text))

            if resp_json["errno"] != 0:
                print("resp: %r" % resp_json)
                print("url: %s"  % resq_url)
                raise BaseDownloaderException("Server returns error: %d" % resp_json["errno"])

            num_of_file -= limit
            start += limit

            # merge cookies
            cookies = '; '.join("%s=%s" % (k,v) for (k, v) in (list(Downloader.cookies.items()) + list(resp.cookies.items())))

            for f in resp_json["records"]:
                fileinfo = f["filelist"][0]
                baseurl = fileinfo["isdir"] == 1 and "http://pan.baidu.com/share/list?uk=%s&shareid=%s" % (uk, f["shareid"])
                try:
                    yield from Downloader.filelist_json_gen(baseurl, [fileinfo], cookies)
                except Exception as e:
                    print("Failed: %s" % f)
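fileinfo_from_home pages through getsharelist because the server rejects requests for too many items at once. The start/limit bookkeeping it uses generalizes to the sketch below, where fetch_page is a hypothetical callable standing in for the HTTP request:

    def paged_records(fetch_page, total, page_size=60):
        # walk a start/limit API in fixed-size chunks until `total` records are seen
        start = 0
        while start < total:
            limit = min(total - start, page_size)
            for record in fetch_page(start, limit):
                yield record
            start += limit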
Example No. 7
    def download_info(self, url):
        # home or filelist
        url_type = Downloader.try_parse(url)

        if url_type == "s":
            yield from Downloader.fileinfo_from_list(url)
        elif url_type == "home":
            yield from Downloader.fileinfo_from_home(url)
        else:
            raise BaseDownloaderException("URL Malform: %s, this should not happen" % (url))
Example No. 8
    def download_info(self, url):
        # extract filelist json from page
        try:
            page = requests.get(url, headers=Downloader.header)
        except Exception as e:
            raise BaseDownloaderException("Fail to get page %s\n%s" %
                                          (e, str(e)))

        if page.text.find("rootFileList") > 0:
            yield from Downloader.filelist_from_dir(page.text, url)
        else:
            yield Downloader.filelist_form_singlefile(page.text, url)
Example No. 9
    def filelist_form_singlefile(page, url):
        try:
            domain = re.compile(
                r'(?:http://)?((?:\w+\.)?l\d+\.yunpan.cn)/lk/.+$').search(
                    url).group(1)
        except Exception as e:
            raise BaseDownloaderException(
                "Malformed URL: %s, no shorturl or domain. %s" % (url, str(e)))

        try:
            surl = re.compile(r"surl\s*:\s*'(\w+)',?").search(page).group(1)
            nid = re.compile(r"nid\s*:\s*'(\d+)',?").search(page).group(1)
            name = re.compile(r"name\s*:\s*'((?:[^']|\\')+)',?").search(
                page).group(1)
        except Exception as e:
            raise BaseDownloaderException("Url: %s has no file.\n%s" %
                                          (url, str(e)))

        return Downloader.post_for_link(url, domain, surl, {
            "nid": nid,
            "path": name
        })
Example No. 10
    def download_info(self, url):
        try:
            resp = requests.get(url, headers = self.header)
        except Exception as e:
            traceback.print_exc()
            raise BaseDownloaderException("Cannot read from Url: %s, %s" % (url, str(e)))

        # get number of pages (10 tasks per page, rounded up)
        try:
            num_of_pages = (int(self.ttasks_regex.search(resp.text).group(1)) + 9) // 10
        except Exception:
            num_of_pages = 1

        base_url = resp.url

        for pg in range(1, num_of_pages + 1):
            if pg > 1:
                try:
                    resq_url = base_url + "?p_index=%s" % pg
                    resp = requests.get(resq_url, headers = self.header)
                except Exception as e:
                    continue

            # kuai.xunlei.com sets the encoding in the response header to
            # ISO-8859-1, but the response body is actually UTF-8, so we need
            # to set it manually
            resp.encoding = "utf-8"

            for l in self.file_regex.findall(resp.text):
                fn   = None
                durl = None
                for (k, v) in [s.split('="', 1) for s in re.split('"(?: |>)', l[3:]) if s != '']:
                    if k == "title":
                        fn = v
                    elif k == "href":
                        durl = v

                if not durl:
                    # no href was extracted from this entry
                    continue

                if durl.startswith("http://kuai.xunlei.com/"):
                    # this is a directory; recurse into it and yield its files
                    yield from self.download_info(durl)
                else:
                    if durl.startswith("#"):
                        continue

                    yield (Task(filename = fn, url = [durl],
                           opts = {"header": ["%s: %s" % (k, v) for k, v in list(self.header.items())]}))
Example No. 11
    def filelist_from_subdir(url, nid, surl, domain):
        header = dict(
            list(Downloader.header.items()) +
            [("Referer", url),
             ("Content-Type",
              "application/x-www-form-urlencoded UTF-8; charset=UTF-8")])
        post_url = "http://%s/share/listsharedir/" % domain
        try:
            post_data = format_payload(surl, nid)
            post_resp = requests.post(post_url, data=post_data, headers=header)
            resp_json = json.load(
                StringIO(
                    re.compile(r'data:(\[\{.*\}\])\}').search(
                        post_resp.text).group(1)))
        except Exception as e:
            raise BaseDownloaderException("\n".join([
                "  POST url: %s, data: %s" % (post_url, post_data),
                "  Request: %s %s" %
                (post_resp.request.headers, post_resp.request.body),
                "  Server respond: %d %s" %
                (post_resp.status_code, post_resp.text),
            ]))

        for fileinfo in resp_json:
            if fileinfo["fhash"] != "":
                try:
                    yield Downloader.post_for_link(url, domain, surl, fileinfo)
                except Exception as e:
                    print("Failed to get file %s:\n%s" %
                          (fileinfo["path"], str(e)))
                    continue
            else:
                try:
                    yield from Downloader.filelist_from_subdir(
                        url, fileinfo["nid"], surl, domain)
                except Exception as e:
                    print("Failed to get file from subdir [%s]:\n%s" %
                          (fileinfo["path"], str(e)))
                    continue