Exemplo n.º 1
0
def rewrite_urls(origin_url, urls):
    """Yield rewritten versions of the links in ``urls`` found on ``origin_url``.

    For every usable entry the function:

    * strips embedded newlines and tabs,
    * passes the scheme through ``rewrite_scheme``,
    * rebuilds the netloc with ``assemble_netloc`` (origin username and
      password plus the entry's host and port) when the origin URL carries a
      username and the entry points at the same hostname,
    * resolves entries that have neither scheme nor netloc against
      ``origin_url``,
    * drops the fragment and replaces spaces with ``%20``.

    Args:
        origin_url: URL of the document the links were taken from.
        urls: Iterable of raw link strings. ``None`` and empty entries
            (including entries that consist only of newlines/tabs) are
            skipped.

    Yields:
        The rewritten URL strings; results that end up empty are not yielded.
    """
    origin_pack = urlparse.urlsplit(origin_url)
    for u in urls:
        # kill breaks
        if u:
            u = re.sub(r"[\n\t]", "", u)

        # Skip unusable entries up front: handing None to urlsplit() raises,
        # and an empty link carries nothing to rewrite.
        if not u:
            continue

        pack = urlparse.urlsplit(u)
        netloc, path, query = pack.netloc, pack.path, pack.query

        # try to rewrite scheme
        scheme = rewrite_scheme(pack.scheme)

        # rewrite netloc to include credentials
        if origin_pack.username and pack.hostname == origin_pack.hostname:
            netloc = assemble_netloc(origin_pack.username,
                                     origin_pack.password,
                                     pack.hostname, pack.port)

        if not scheme and not netloc and (path or query):
            # no scheme or netloc, it's a path on-site: resolve it against
            # the origin url (fragment intentionally left out)
            path_query = urlparse.urlunsplit(('', '', path, query, ''))
            new_u = urlparse.urljoin(origin_url, path_query)
        else:
            # reassemble into url, without the fragment
            new_u = urlparse.urlunsplit((scheme, netloc, path, query, None))

        # quote spaces
        new_u = new_u.replace(" ", "%20")
        if new_u:
            yield new_u
Exemplo n.º 2
0
def url_to_filename(url):
    """Derive a local filename from ``url``.

    When the environment variable ``ORIG_FILENAMES`` equals ``"1"`` and the
    URL path has a non-empty basename, that basename is returned unchanged.
    Otherwise the scheme, netloc, extension-less path and query are joined
    with underscores, every non-alphanumeric character becomes ``_``, runs of
    underscores are collapsed, a trailing underscore is removed, and the
    path's original extension is appended.
    """
    parts = urlparse.urlsplit(url)
    url_path = parts[2]

    basename = os.path.basename(url_path)
    if basename and os.environ.get("ORIG_FILENAMES") == "1":
        return basename

    stem, extension = os.path.splitext(url_path)
    pieces = [piece for piece in (parts[0], parts[1], stem, parts[3]) if piece]
    name = "_".join(pieces)

    # Applied in order: sanitize, collapse repeats, trim the trailing one.
    substitutions = (
        ("[^a-zA-Z0-9]", "_"),
        ("_{2,}", "_"),
        ("_$", ""),
    )
    for pattern, replacement in substitutions:
        name = re.sub(pattern, replacement, name)

    return name + extension
Exemplo n.º 3
0
def get_scheme(url):
    """Return the scheme component of ``url`` as parsed by ``urlsplit``."""
    return urlparse.urlsplit(url).scheme
Exemplo n.º 4
0
def get_referer(url):
    """Return ``url`` reduced to scheme, netloc and the directory of its path.

    The query string and fragment are discarded, and the last path segment
    is removed with ``os.path.dirname``.
    """
    parts = urlparse.urlsplit(url)
    parent_dir = os.path.dirname(parts.path)
    components = (parts.scheme, parts.netloc, parent_dir, None, None)
    return urlparse.urlunsplit(components)
Exemplo n.º 5
0
def get_hostname(url):
    """Return the ``hostname`` attribute of ``url`` as parsed by ``urlsplit``."""
    return urlparse.urlsplit(url).hostname