def normalize(self): parsed = urllib_parse.urlsplit(self.url) if parsed.scheme == 'file': path = urllib_request.url2pathname(parsed.path) path = normalize_path(path) path = urllib_request.pathname2url(path) self.url = urllib_parse.urlunsplit( (parsed.scheme, parsed.netloc, path, parsed.query, parsed.fragment))
def _get_html_page(link, session=None): # type: (Link, Optional[PipSession]) -> Optional[HTMLPage] if session is None: raise TypeError( "_get_html_page() missing 1 required keyword argument: 'session'" ) url = link.url.split('#', 1)[0] # Check for VCS schemes that do not support lookup as web pages. vcs_scheme = _match_vcs_scheme(url) if vcs_scheme: logger.debug('Cannot look at %s URL %s', vcs_scheme, link) return None # Tack index.html onto file:// URLs that point to directories scheme, _, path, _, _, _ = urllib_parse.urlparse(url) if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))): # add trailing slash if not present so urljoin doesn't trim # final segment if not url.endswith('/'): url += '/' url = urllib_parse.urljoin(url, 'index.html') logger.debug(' file: URL is directory, getting %s', url) try: resp = _get_html_response(url, session=session) except _NotHTTP as exc: logger.debug( 'Skipping page %s because it looks like an archive, and cannot ' 'be checked by HEAD.', link, ) except _NotHTML as exc: logger.debug( 'Skipping page %s because the %s request got Content-Type: %s', link, exc.request_desc, exc.content_type, ) except requests.HTTPError as exc: _handle_get_page_fail(link, exc) except RetryError as exc: _handle_get_page_fail(link, exc) except SSLError as exc: reason = "There was a problem confirming the ssl certificate: " reason += str(exc) _handle_get_page_fail(link, reason, meth=logger.info) except requests.ConnectionError as exc: _handle_get_page_fail(link, "connection error: %s" % exc) except requests.Timeout: _handle_get_page_fail(link, "timed out") else: return HTMLPage(resp.content, resp.url, resp.headers) return None
def url_to_path(url): """ Convert a file: URL to a path. """ assert url.startswith("file:"), "You can only turn file: urls into filenames (not %r)" % url _, netloc, path, _, _ = urllib_parse.urlsplit(url) # if we have a UNC path, prepend UNC share notation if netloc: netloc = "\\\\" + netloc path = urllib_request.url2pathname(netloc + path) return path
def url_to_path(url): # type: (str) -> str """ Convert a file: URL to a path. """ assert url.startswith('file:'), ( "You can only turn file: urls into filenames (not %r)" % url) _, netloc, path, _, _ = urllib_parse.urlsplit(url) # if we have a UNC path, prepend UNC share notation if netloc: netloc = '\\\\' + netloc path = urllib_request.url2pathname(netloc + path) return path
def __init__(self, url=None, *args, **kwargs): # Works around an apparent Git bug # (see https://article.gmane.org/gmane.comp.version-control.git/146500) if url: scheme, netloc, path, query, fragment = urlsplit(url) if scheme.endswith('file'): initial_slashes = path[:-len(path.lstrip('/'))] newpath = (initial_slashes + urllib_request.url2pathname(path).replace( '\\', '/').lstrip('/')) url = urlunsplit((scheme, netloc, newpath, query, fragment)) after_plus = scheme.find('+') + 1 url = scheme[:after_plus] + urlunsplit( (scheme[after_plus:], netloc, newpath, query, fragment), ) super(Git, self).__init__(url, *args, **kwargs)
def get_url_rev_and_auth(cls, url): # type: (str) -> Tuple[str, Optional[str], AuthInfo] """ Prefixes stub URLs like 'user@hostname:user/repo.git' with 'ssh://'. That's required because although they use SSH they sometimes don't work with a ssh:// scheme (e.g. GitHub). But we need a scheme for parsing. Hence we remove it again afterwards and return it as a stub. """ # Works around an apparent Git bug # (see https://article.gmane.org/gmane.comp.version-control.git/146500) scheme, netloc, path, query, fragment = urlsplit(url) if scheme.endswith('file'): initial_slashes = path[:-len(path.lstrip('/'))] newpath = ( initial_slashes + urllib_request.url2pathname(path) .replace('\\', '/').lstrip('/') )
def __init__(self, url=None, *args, **kwargs): # Works around an apparent Git bug # (see http://article.gmane.org/gmane.comp.version-control.git/146500) if url: scheme, netloc, path, query, fragment = urlsplit(url) if scheme.endswith('file'): initial_slashes = path[:-len(path.lstrip('/'))] newpath = ( initial_slashes + urllib_request.url2pathname(path) .replace('\\', '/').lstrip('/') ) url = urlunsplit((scheme, netloc, newpath, query, fragment)) after_plus = scheme.find('+') + 1 url = scheme[:after_plus] + urlunsplit( (scheme[after_plus:], netloc, newpath, query, fragment), ) super(Git, self).__init__(url, *args, **kwargs)
def _clean_link(url): # type: (str) -> str """Makes sure a link is fully encoded. That is, if a ' ' shows up in the link, it will be rewritten to %20 (while not over-quoting % or other characters).""" # Split the URL into parts according to the general structure # `scheme://netloc/path;parameters?query#fragment`. Note that the # `netloc` can be empty and the URI will then refer to a local # filesystem path. result = urllib_parse.urlparse(url) # In both cases below we unquote prior to quoting to make sure # nothing is double quoted. if result.netloc == "": # On Windows the path part might contain a drive letter which # should not be quoted. On Linux where drive letters do not # exist, the colon should be quoted. We rely on urllib.request # to do the right thing here. path = urllib_request.pathname2url( urllib_request.url2pathname(result.path)) else: path = urllib_parse.quote(urllib_parse.unquote(result.path)) return urllib_parse.urlunparse(result._replace(path=path))
def url_to_path(url): # type: (str) -> str """ Convert a file: URL to a path. """ assert url.startswith("file:"), ( "You can only turn file: urls into filenames (not %r)" % url) _, netloc, path, _, _ = urllib_parse.urlsplit(url) if not netloc or netloc == "localhost": # According to RFC 8089, same as empty authority. netloc = "" elif sys.platform == "win32": # If we have a UNC path, prepend UNC share notation. netloc = "\\\\" + netloc else: raise ValueError( "non-local file URIs are not supported on this platform: %r" % url) path = urllib_request.url2pathname(netloc + path) return path
def url_to_path(url): # type: (str) -> str """ Convert a file: URL to a path. """ assert url.startswith('file:'), ( "You can only turn file: urls into filenames (not {url!r})".format( **locals())) _, netloc, path, _, _ = urllib_parse.urlsplit(url) if not netloc or netloc == 'localhost': # According to RFC 8089, same as empty authority. netloc = '' elif sys.platform == 'win32': # If we have a UNC path, prepend UNC share notation. netloc = '\\\\' + netloc else: raise ValueError( 'non-local file URIs are not supported on this platform: {url!r}'. format(**locals())) path = urllib_request.url2pathname(netloc + path) return path
def url_to_path(url): # type: (str) -> str """ Convert a file: URL to a path. """ assert url.startswith('file:'), ( "You can only turn file: urls into filenames (not %r)" % url) _, netloc, path, _, _ = urllib_parse.urlsplit(url) if not netloc or netloc == 'localhost': # According to RFC 8089, same as empty authority. netloc = '' elif sys.platform == 'win32': # If we have a UNC path, prepend UNC share notation. netloc = '\\\\' + netloc else: raise ValueError( 'non-local file URIs are not supported on this platform: %r' % url ) path = urllib_request.url2pathname(netloc + path) return path
def get_page(cls, link, skip_archives=True, session=None): if session is None: raise TypeError( "get_page() missing 1 required keyword argument: 'session'") url = link.url url = url.split('#', 1)[0] # Check for VCS schemes that do not support lookup as web pages. from pip.vcs import VcsSupport for scheme in VcsSupport.schemes: if url.lower().startswith(scheme) and url[len(scheme)] in '+:': logger.debug('Cannot look at %s URL %s', scheme, link) return None try: if skip_archives: filename = link.filename for bad_ext in ARCHIVE_EXTENSIONS: if filename.endswith(bad_ext): content_type = cls._get_content_type( url, session=session, ) if content_type.lower().startswith('text/html'): break else: logger.debug( 'Skipping page %s because of Content-Type: %s', link, content_type, ) return logger.debug('Getting page %s', url) # Tack index.html onto file:// URLs that point to directories (scheme, netloc, path, params, query, fragment) = \ urllib_parse.urlparse(url) if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))): # add trailing slash if not present so urljoin doesn't trim # final segment if not url.endswith('/'): url += '/' url = urllib_parse.urljoin(url, 'index.html') logger.debug(' file: URL is directory, getting %s', url) resp = session.get( url, headers={ "Accept": "text/html", "Cache-Control": "max-age=600", }, ) resp.raise_for_status() # The check for archives above only works if the url ends with # something that looks like an archive. However that is not a # requirement of an url. Unless we issue a HEAD request on every # url we cannot know ahead of time for sure if something is HTML # or not. However we can check after we've downloaded it. content_type = resp.headers.get('Content-Type', 'unknown') if not content_type.lower().startswith("text/html"): logger.debug( 'Skipping page %s because of Content-Type: %s', link, content_type, ) return inst = cls(resp.content, resp.url, resp.headers) except requests.HTTPError as exc: cls._handle_fail(link, exc, url) except SSLError as exc: reason = ("There was a problem confirming the ssl certificate: " "%s" % exc) cls._handle_fail(link, reason, url, meth=logger.info) except requests.ConnectionError as exc: cls._handle_fail(link, "connection error: %s" % exc, url) except requests.Timeout: cls._handle_fail(link, "timed out", url) else: return inst
url = link.url.split('#', 1)[0] # Check for VCS schemes that do not support lookup as web pages. vcs_scheme = _match_vcs_scheme(url) if vcs_scheme: <<<<<<< HEAD logger.warning('Cannot look at %s URL %s because it does not support ' 'lookup as web pages.', vcs_scheme, link) ======= logger.debug('Cannot look at %s URL %s', vcs_scheme, link) >>>>>>> b66a76afa15ab74019740676a52a071b85ed8f71 return None # Tack index.html onto file:// URLs that point to directories scheme, _, path, _, _, _ = urllib_parse.urlparse(url) if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))): # add trailing slash if not present so urljoin doesn't trim # final segment if not url.endswith('/'): url += '/' url = urllib_parse.urljoin(url, 'index.html') logger.debug(' file: URL is directory, getting %s', url) try: resp = _get_html_response(url, session=session) except _NotHTTP: <<<<<<< HEAD logger.warning( 'Skipping page %s because it looks like an archive, and cannot ' 'be checked by a HTTP HEAD request.', link, )
def _get_html_page(link, session=None): # type: (Link, Optional[PipSession]) -> Optional[HTMLPage] if session is None: raise TypeError( "_get_html_page() missing 1 required keyword argument: 'session'") url = link.url.split("#", 1)[0] # Check for VCS schemes that do not support lookup as web pages. vcs_scheme = _match_vcs_scheme(url) if vcs_scheme: logger.warning( "Cannot look at %s URL %s because it does not support " "lookup as web pages.", vcs_scheme, link, ) return None # Tack index.html onto file:// URLs that point to directories scheme, _, path, _, _, _ = urllib_parse.urlparse(url) if scheme == "file" and os.path.isdir(urllib_request.url2pathname(path)): # add trailing slash if not present so urljoin doesn't trim # final segment if not url.endswith("/"): url += "/" url = urllib_parse.urljoin(url, "index.html") logger.debug(" file: URL is directory, getting %s", url) try: resp = _get_html_response(url, session=session) except _NotHTTP: logger.warning( "Skipping page %s because it looks like an archive, and cannot " "be checked by a HTTP HEAD request.", link, ) except _NotHTML as exc: logger.warning( "Skipping page %s because the %s request got Content-Type: %s." "The only supported Content-Type is text/html", link, exc.request_desc, exc.content_type, ) except NetworkConnectionError as exc: _handle_get_page_fail(link, exc) except RetryError as exc: _handle_get_page_fail(link, exc) except SSLError as exc: reason = "There was a problem confirming the ssl certificate: " reason += str(exc) _handle_get_page_fail(link, reason, meth=logger.info) except requests.ConnectionError as exc: _handle_get_page_fail(link, "connection error: {}".format(exc)) except requests.Timeout: _handle_get_page_fail(link, "timed out") else: return _make_html_page(resp, cache_link_parsing=link.cache_link_parsing) return None
def get_page(cls, link, skip_archives=True, session=None): if session is None: raise TypeError( "get_page() missing 1 required keyword argument: 'session'" ) url = link.url url = url.split('#', 1)[0] # Check for VCS schemes that do not support lookup as web pages. from pip._internal.vcs import VcsSupport for scheme in VcsSupport.schemes: if url.lower().startswith(scheme) and url[len(scheme)] in '+:': logger.debug('Cannot look at %s URL %s', scheme, link) return None try: if skip_archives: filename = link.filename for bad_ext in ARCHIVE_EXTENSIONS: if filename.endswith(bad_ext): content_type = cls._get_content_type( url, session=session, ) if content_type.lower().startswith('text/html'): break else: logger.debug( 'Skipping page %s because of Content-Type: %s', link, content_type, ) return logger.debug('Getting page %s', url) # Tack index.html onto file:// URLs that point to directories (scheme, netloc, path, params, query, fragment) = \ urllib_parse.urlparse(url) if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))): # add trailing slash if not present so urljoin doesn't trim # final segment if not url.endswith('/'): url += '/' url = urllib_parse.urljoin(url, 'index.html') logger.debug(' file: URL is directory, getting %s', url) resp = session.get( url, headers={ "Accept": "text/html", "Cache-Control": "max-age=600", }, ) resp.raise_for_status() # The check for archives above only works if the url ends with # something that looks like an archive. However that is not a # requirement of an url. Unless we issue a HEAD request on every # url we cannot know ahead of time for sure if something is HTML # or not. However we can check after we've downloaded it. content_type = resp.headers.get('Content-Type', 'unknown') if not content_type.lower().startswith("text/html"): logger.debug( 'Skipping page %s because of Content-Type: %s', link, content_type, ) return inst = cls(resp.content, resp.url, resp.headers) except requests.HTTPError as exc: cls._handle_fail(link, exc, url) except SSLError as exc: reason = "There was a problem confirming the ssl certificate: " reason += str(exc) cls._handle_fail(link, reason, url, meth=logger.info) except requests.ConnectionError as exc: cls._handle_fail(link, "connection error: %s" % exc, url) except requests.Timeout: cls._handle_fail(link, "timed out", url) else: return inst
def _get_html_page(link, session=None): if session is None: raise TypeError( "_get_html_page() missing 1 required keyword argument: 'session'") url = link.url url = url.split('#', 1)[0] # Check for VCS schemes that do not support lookup as web pages. from pip._internal.vcs import VcsSupport for scheme in VcsSupport.schemes: if url.lower().startswith(scheme) and url[len(scheme)] in '+:': logger.debug('Cannot look at %s URL %s', scheme, link) return None try: filename = link.filename for bad_ext in ARCHIVE_EXTENSIONS: if filename.endswith(bad_ext): content_type = _get_content_type(url, session=session) if content_type.lower().startswith('text/html'): break else: logger.debug( 'Skipping page %s because of Content-Type: %s', link, content_type, ) return logger.debug('Getting page %s', url) # Tack blank.html onto file:// URLs that point to directories (scheme, netloc, path, params, query, fragment) = \ urllib_parse.urlparse(url) if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))): # add trailing slash if not present so urljoin doesn't trim # final segment if not url.endswith('/'): url += '/' url = urllib_parse.urljoin(url, 'blank.html') logger.debug(' file: URL is directory, getting %s', url) resp = session.get( url, headers={ "Accept": "text/html", # We don't want to blindly returned cached data for # /simple/, because authors generally expecting that # twine upload && pip install will function, but if # they've done a pip install in the last ~10 minutes # it won't. Thus by setting this to zero we will not # blindly use any cached data, however the benefit of # using max-age=0 instead of no-cache, is that we will # still support conditional requests, so we will still # minimize traffic sent in cases where the page hasn't # changed at all, we will just always incur the round # trip for the conditional GET now instead of only # once per 10 minutes. # For more information, please see pypa/pip#5670. "Cache-Control": "max-age=0", }, ) resp.raise_for_status() # The check for archives above only works if the url ends with # something that looks like an archive. However that is not a # requirement of an url. Unless we issue a HEAD request on every # url we cannot know ahead of time for sure if something is HTML # or not. However we can check after we've downloaded it. content_type = resp.headers.get('Content-Type', 'unknown') if not content_type.lower().startswith("text/html"): logger.debug( 'Skipping page %s because of Content-Type: %s', link, content_type, ) return inst = HTMLPage(resp.content, resp.url, resp.headers) except HTTPError as exc: _handle_get_page_fail(link, exc, url) except SSLError as exc: reason = "There was a problem confirming the ssl certificate: " reason += str(exc) _handle_get_page_fail(link, reason, url, meth=logger.info) except requests.ConnectionError as exc: _handle_get_page_fail(link, "connection error: %s" % exc, url) except requests.Timeout: _handle_get_page_fail(link, "timed out", url) else: return inst
def get_page(cls, link, skip_archives=True, session=None): if session is None: raise TypeError("get_page() missing 1 required keyword argument: 'session'") url = link.url url = url.split("#", 1)[0] # Check for VCS schemes that do not support lookup as web pages. from pip.vcs import VcsSupport for scheme in VcsSupport.schemes: if url.lower().startswith(scheme) and url[len(scheme)] in "+:": logger.debug("Cannot look at %s URL %s", scheme, link) return None try: if skip_archives: filename = link.filename for bad_ext in [".tar", ".tar.gz", ".tar.bz2", ".tgz", ".zip"]: if filename.endswith(bad_ext): content_type = cls._get_content_type(url, session=session) if content_type.lower().startswith("text/html"): break else: logger.debug("Skipping page %s because of Content-Type: %s", link, content_type) return logger.debug("Getting page %s", url) # Tack index.html onto file:// URLs that point to directories (scheme, netloc, path, params, query, fragment) = urllib_parse.urlparse(url) if scheme == "file" and os.path.isdir(urllib_request.url2pathname(path)): # add trailing slash if not present so urljoin doesn't trim # final segment if not url.endswith("/"): url += "/" url = urllib_parse.urljoin(url, "index.html") logger.debug(" file: URL is directory, getting %s", url) resp = session.get(url, headers={"Accept": "text/html", "Cache-Control": "max-age=600"}) resp.raise_for_status() # The check for archives above only works if the url ends with # something that looks like an archive. However that is not a # requirement of an url. Unless we issue a HEAD request on every # url we cannot know ahead of time for sure if something is HTML # or not. However we can check after we've downloaded it. content_type = resp.headers.get("Content-Type", "unknown") if not content_type.lower().startswith("text/html"): logger.debug("Skipping page %s because of Content-Type: %s", link, content_type) return inst = cls(resp.content, resp.url, resp.headers, trusted=link.trusted) except requests.HTTPError as exc: level = 2 if exc.response.status_code == 404 else 1 cls._handle_fail(link, exc, url, level=level) except requests.ConnectionError as exc: cls._handle_fail(link, "connection error: %s" % exc, url) except requests.Timeout: cls._handle_fail(link, "timed out", url) except SSLError as exc: reason = "There was a problem confirming the ssl certificate: " "%s" % exc cls._handle_fail(link, reason, url, level=2, meth=logger.info) else: return inst
def _get_html_page(link, session=None): if session is None: raise TypeError( "_get_html_page() missing 1 required keyword argument: 'session'" ) url = link.url url = url.split('#', 1)[0] # Check for VCS schemes that do not support lookup as web pages. from pip._internal.vcs import VcsSupport for scheme in VcsSupport.schemes: if url.lower().startswith(scheme) and url[len(scheme)] in '+:': logger.debug('Cannot look at %s URL %s', scheme, link) return None try: filename = link.filename for bad_ext in ARCHIVE_EXTENSIONS: if filename.endswith(bad_ext): content_type = _get_content_type(url, session=session) if content_type.lower().startswith('text/html'): break else: logger.debug( 'Skipping page %s because of Content-Type: %s', link, content_type, ) return logger.debug('Getting page %s', url) # Tack index.html onto file:// URLs that point to directories (scheme, netloc, path, params, query, fragment) = \ urllib_parse.urlparse(url) if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))): # add trailing slash if not present so urljoin doesn't trim # final segment if not url.endswith('/'): url += '/' url = urllib_parse.urljoin(url, 'index.html') logger.debug(' file: URL is directory, getting %s', url) resp = session.get( url, headers={ "Accept": "text/html", # We don't want to blindly returned cached data for # /simple/, because authors generally expecting that # twine upload && pip install will function, but if # they've done a pip install in the last ~10 minutes # it won't. Thus by setting this to zero we will not # blindly use any cached data, however the benefit of # using max-age=0 instead of no-cache, is that we will # still support conditional requests, so we will still # minimize traffic sent in cases where the page hasn't # changed at all, we will just always incur the round # trip for the conditional GET now instead of only # once per 10 minutes. # For more information, please see pypa/pip#5670. "Cache-Control": "max-age=0", }, ) resp.raise_for_status() # The check for archives above only works if the url ends with # something that looks like an archive. However that is not a # requirement of an url. Unless we issue a HEAD request on every # url we cannot know ahead of time for sure if something is HTML # or not. However we can check after we've downloaded it. content_type = resp.headers.get('Content-Type', 'unknown') if not content_type.lower().startswith("text/html"): logger.debug( 'Skipping page %s because of Content-Type: %s', link, content_type, ) return inst = HTMLPage(resp.content, resp.url, resp.headers) except requests.HTTPError as exc: _handle_get_page_fail(link, exc, url) except SSLError as exc: reason = "There was a problem confirming the ssl certificate: " reason += str(exc) _handle_get_page_fail(link, reason, url, meth=logger.info) except requests.ConnectionError as exc: _handle_get_page_fail(link, "connection error: %s" % exc, url) except requests.Timeout: _handle_get_page_fail(link, "timed out", url) else: return inst
def url_to_path(url): """ Convert a file: URL to a path. """ assert url.startswith('file:'), ( "You can only turn file: urls into filenames (not %r)" % url) _, netloc, path, _, _ = urllib_parse.urlsplit(url) # if we have a UNC path, prepend UNC share notation if netloc: netloc = '\\\\' + netloc path = urllib_request.url2pathname(netloc + path) return path def path_to_url(path): """ Convert a path to a file: URL. The path will be made absolute and have quoted path parts. """ path = os.path.normpath(os.path.abspath(path)) url = urllib_parse.urljoin('file:', urllib_request.pathname2url(path)) return url def is_archive_file(name): """Return True if `name` is a considered as an archive file."""