def _create_link_from_element( anchor, # type: HTMLElement page_url, # type: str base_url, # type: str ): # type: (...) -> Optional[Link] """ Convert an anchor element in a simple repository page to a Link. """ href = anchor.get("href") if not href: return None url = _clean_link(urllib_parse.urljoin(base_url, href)) pyrequire = anchor.get('data-requires-python') pyrequire = unescape(pyrequire) if pyrequire else None yanked_reason = anchor.get('data-yanked') if yanked_reason: # This is a unicode string in Python 2 (and 3). yanked_reason = unescape(yanked_reason) link = Link( url, comes_from=page_url, requires_python=pyrequire, yanked_reason=yanked_reason, ) return link
def _parse_and_recurse(self, filename, constraint): # type: (str, bool) -> Iterator[ParsedLine] for line in self._parse_file(filename, constraint): if (not line.args and not line.opts.editables and (line.opts.requirements or line.opts.constraints)): # parse a nested requirements file if line.opts.requirements: req_path = line.opts.requirements[0] nested_constraint = False else: req_path = line.opts.constraints[0] nested_constraint = True # original file is over http if SCHEME_RE.search(filename): # do a url join so relative paths work req_path = urllib_parse.urljoin(filename, req_path) # original file and nested file are paths elif not SCHEME_RE.search(req_path): # do a join so relative paths work req_path = os.path.join( os.path.dirname(filename), req_path, ) for inner_line in self._parse_and_recurse( req_path, nested_constraint, ): yield inner_line else: yield line
def path_to_url(path): """ Convert a path to a file: URL. The path will be made absolute and have quoted path parts. """ path = os.path.normpath(os.path.abspath(path)) url = urllib_parse.urljoin('file:', urllib_request.pathname2url(path)) return url
def links(self): """Yields all links in the page""" for anchor in self.parsed.findall(".//a"): if anchor.get("href"): href = anchor.get("href") url = self.clean_link(urllib_parse.urljoin( self.base_url, href)) pyrequire = anchor.get('data-requires-python') pyrequire = unescape(pyrequire) if pyrequire else None yield Link(url, self, requires_python=pyrequire)
def links(self): """Yields all links in the page""" for anchor in self.parsed.findall(".//a"): if anchor.get("href"): href = anchor.get("href") url = self.clean_link( urllib_parse.urljoin(self.base_url, href) ) pyrequire = anchor.get('data-requires-python') pyrequire = unescape(pyrequire) if pyrequire else None yield Link(url, self, requires_python=pyrequire)
def _get_html_page(link, session=None): # type: (Link, Optional[PipSession]) -> Optional[HTMLPage] if session is None: raise TypeError( "_get_html_page() missing 1 required keyword argument: 'session'") url = link.url.split('#', 1)[0] # Check for VCS schemes that do not support lookup as web pages. vcs_scheme = _match_vcs_scheme(url) if vcs_scheme: logger.debug('Cannot look at %s URL %s', vcs_scheme, link) return None # Tack index.html onto file:// URLs that point to directories scheme, _, path, _, _, _ = urllib_parse.urlparse(url) if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))): # add trailing slash if not present so urljoin doesn't trim # final segment if not url.endswith('/'): url += '/' url = urllib_parse.urljoin(url, 'index.html') logger.debug(' file: URL is directory, getting %s', url) try: resp = _get_html_response(url, session=session) except _NotHTTP: logger.debug( 'Skipping page %s because it looks like an archive, and cannot ' 'be checked by HEAD.', link, ) except _NotHTML as exc: logger.debug( 'Skipping page %s because the %s request got Content-Type: %s', link, exc.request_desc, exc.content_type, ) except HTTPError as exc: _handle_get_page_fail(link, exc) except RetryError as exc: _handle_get_page_fail(link, exc) except SSLError as exc: reason = "There was a problem confirming the ssl certificate: " reason += str(exc) _handle_get_page_fail(link, reason, meth=logger.info) except requests.ConnectionError as exc: _handle_get_page_fail(link, "connection error: %s" % exc) except requests.Timeout: _handle_get_page_fail(link, "timed out") else: return _make_html_page(resp) return None
def iter_links(self): """Yields all links in the page""" document = html5lib.parse( self.content, transport_encoding=_get_encoding_from_headers(self.headers), namespaceHTMLElements=False, ) base_url = _determine_base_url(document, self.url) for anchor in document.findall(".//a"): if anchor.get("href"): href = anchor.get("href") url = _clean_link(urllib_parse.urljoin(base_url, href)) pyrequire = anchor.get('data-requires-python') pyrequire = unescape(pyrequire) if pyrequire else None yield Link(url, self.url, requires_python=pyrequire)
def process_line(line, filename, line_number, finder=None, comes_from=None, options=None, session=None, wheel_cache=None, constraint=False): """Process a single requirements line; This can result in creating/yielding requirements, or updating the finder. For lines that contain requirements, the only options that have an effect are from SUPPORTED_OPTIONS_REQ, and they are scoped to the requirement. Other options from SUPPORTED_OPTIONS may be present, but are ignored. For lines that do not contain requirements, the only options that have an effect are from SUPPORTED_OPTIONS. Options from SUPPORTED_OPTIONS_REQ may be present, but are ignored. These lines may contain multiple options (although our docs imply only one is supported), and all our parsed and affect the finder. :param constraint: If True, parsing a constraints file. :param options: OptionParser options that we may update """ parser = build_parser(line) defaults = parser.get_default_values() defaults.index_url = None if finder: # `finder.format_control` will be updated during parsing defaults.format_control = finder.format_control args_str, options_str = break_args_options(line) if sys.version_info < (2, 7, 3): # Prior to 2.7.3, shlex cannot deal with unicode entries options_str = options_str.encode('utf8') opts, _ = parser.parse_args(shlex.split(options_str), defaults) # preserve for the nested code path line_comes_from = '%s %s (line %s)' % ( '-c' if constraint else '-r', filename, line_number, ) # yield a line requirement if args_str: isolated = options.isolated_mode if options else False if options: cmdoptions.check_install_build_global(options, opts) # get the options that apply to requirements req_options = {} for dest in SUPPORTED_OPTIONS_REQ_DEST: if dest in opts.__dict__ and opts.__dict__[dest]: req_options[dest] = opts.__dict__[dest] yield InstallRequirement.from_line(args_str, line_comes_from, constraint=constraint, isolated=isolated, options=req_options, wheel_cache=wheel_cache) # yield an editable requirement elif opts.editables: isolated = options.isolated_mode if options else False yield InstallRequirement.from_editable(opts.editables[0], comes_from=line_comes_from, constraint=constraint, isolated=isolated, wheel_cache=wheel_cache) # parse a nested requirements file elif opts.requirements or opts.constraints: if opts.requirements: req_path = opts.requirements[0] nested_constraint = False else: req_path = opts.constraints[0] nested_constraint = True # original file is over http if SCHEME_RE.search(filename): # do a url join so relative paths work req_path = urllib_parse.urljoin(filename, req_path) # original file and nested file are paths elif not SCHEME_RE.search(req_path): # do a join so relative paths work req_path = os.path.join(os.path.dirname(filename), req_path) # TODO: Why not use `comes_from='-r {} (line {})'` here as well? parser = parse_requirements(req_path, finder, comes_from, options, session, constraint=nested_constraint, wheel_cache=wheel_cache) for req in parser: yield req # percolate hash-checking option upward elif opts.require_hashes: options.require_hashes = opts.require_hashes # set finder options elif finder: if opts.index_url: finder.index_urls = [opts.index_url] if opts.no_index is True: finder.index_urls = [] if opts.extra_index_urls: finder.index_urls.extend(opts.extra_index_urls) if opts.find_links: # FIXME: it would be nice to keep track of the source # of the find_links: support a find-links local path # relative to a requirements file. value = opts.find_links[0] req_dir = os.path.dirname(os.path.abspath(filename)) relative_to_reqs_file = os.path.join(req_dir, value) if os.path.exists(relative_to_reqs_file): value = relative_to_reqs_file finder.find_links.append(value) if opts.pre: finder.allow_all_prereleases = True if opts.process_dependency_links: finder.process_dependency_links = True if opts.trusted_hosts: finder.secure_origins.extend( ("*", host, "*") for host in opts.trusted_hosts)
def _url_for_path(self, path): return urllib_parse.urljoin(self.url, path)
def get_page(cls, link, skip_archives=True, session=None): if session is None: raise TypeError( "get_page() missing 1 required keyword argument: 'session'" ) url = link.url url = url.split('#', 1)[0] # Check for VCS schemes that do not support lookup as web pages. from pipenv.patched.notpip._internal.vcs import VcsSupport for scheme in VcsSupport.schemes: if url.lower().startswith(scheme) and url[len(scheme)] in '+:': logger.debug('Cannot look at %s URL %s', scheme, link) return None try: if skip_archives: filename = link.filename for bad_ext in ARCHIVE_EXTENSIONS: if filename.endswith(bad_ext): content_type = cls._get_content_type( url, session=session, ) if content_type.lower().startswith('text/html'): break else: logger.debug( 'Skipping page %s because of Content-Type: %s', link, content_type, ) return logger.debug('Getting page %s', url) # Tack index.html onto file:// URLs that point to directories (scheme, netloc, path, params, query, fragment) = \ urllib_parse.urlparse(url) if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))): # add trailing slash if not present so urljoin doesn't trim # final segment if not url.endswith('/'): url += '/' url = urllib_parse.urljoin(url, 'index.html') logger.debug(' file: URL is directory, getting %s', url) resp = session.get( url, headers={ "Accept": "text/html", "Cache-Control": "max-age=600", }, ) resp.raise_for_status() # The check for archives above only works if the url ends with # something that looks like an archive. However that is not a # requirement of an url. Unless we issue a HEAD request on every # url we cannot know ahead of time for sure if something is HTML # or not. However we can check after we've downloaded it. content_type = resp.headers.get('Content-Type', 'unknown') if not content_type.lower().startswith("text/html"): logger.debug( 'Skipping page %s because of Content-Type: %s', link, content_type, ) return inst = cls(resp.content, resp.url, resp.headers) except requests.HTTPError as exc: cls._handle_fail(link, exc, url) except SSLError as exc: reason = "There was a problem confirming the ssl certificate: " reason += str(exc) cls._handle_fail(link, reason, url, meth=logger.info) except requests.ConnectionError as exc: cls._handle_fail(link, "connection error: %s" % exc, url) except requests.Timeout: cls._handle_fail(link, "timed out", url) else: return inst
def get_page(cls, link, skip_archives=True, session=None): if session is None: raise TypeError( "get_page() missing 1 required keyword argument: 'session'") url = link.url url = url.split('#', 1)[0] # Check for VCS schemes that do not support lookup as web pages. from pipenv.patched.notpip._internal.vcs import VcsSupport for scheme in VcsSupport.schemes: if url.lower().startswith(scheme) and url[len(scheme)] in '+:': logger.debug('Cannot look at %s URL %s', scheme, link) return None try: if skip_archives: filename = link.filename for bad_ext in ARCHIVE_EXTENSIONS: if filename.endswith(bad_ext): content_type = cls._get_content_type( url, session=session, ) if content_type.lower().startswith('text/html'): break else: logger.debug( 'Skipping page %s because of Content-Type: %s', link, content_type, ) return logger.debug('Getting page %s', url) # Tack index.html onto file:// URLs that point to directories (scheme, netloc, path, params, query, fragment) = \ urllib_parse.urlparse(url) if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))): # add trailing slash if not present so urljoin doesn't trim # final segment if not url.endswith('/'): url += '/' url = urllib_parse.urljoin(url, 'index.html') logger.debug(' file: URL is directory, getting %s', url) resp = session.get( url, headers={ "Accept": "text/html", "Cache-Control": "max-age=600", }, ) resp.raise_for_status() # The check for archives above only works if the url ends with # something that looks like an archive. However that is not a # requirement of an url. Unless we issue a HEAD request on every # url we cannot know ahead of time for sure if something is HTML # or not. However we can check after we've downloaded it. content_type = resp.headers.get('Content-Type', 'unknown') if not content_type.lower().startswith("text/html"): logger.debug( 'Skipping page %s because of Content-Type: %s', link, content_type, ) return inst = cls(resp.content, resp.url, resp.headers) except requests.HTTPError as exc: cls._handle_fail(link, exc, url) except SSLError as exc: reason = "There was a problem confirming the ssl certificate: " reason += str(exc) cls._handle_fail(link, reason, url, meth=logger.info) except requests.ConnectionError as exc: cls._handle_fail(link, "connection error: %s" % exc, url) except requests.Timeout: cls._handle_fail(link, "timed out", url) else: return inst
def _url_for_path(self, path): # type: (str) -> str return urllib_parse.urljoin(self.url, path)
def url_to_path(self, path): return urllib_parse.urljoin(self.url, path)
def process_line(line, filename, line_number, finder=None, comes_from=None, options=None, session=None, wheel_cache=None, constraint=False): """Process a single requirements line; This can result in creating/yielding requirements, or updating the finder. For lines that contain requirements, the only options that have an effect are from SUPPORTED_OPTIONS_REQ, and they are scoped to the requirement. Other options from SUPPORTED_OPTIONS may be present, but are ignored. For lines that do not contain requirements, the only options that have an effect are from SUPPORTED_OPTIONS. Options from SUPPORTED_OPTIONS_REQ may be present, but are ignored. These lines may contain multiple options (although our docs imply only one is supported), and all our parsed and affect the finder. :param constraint: If True, parsing a constraints file. :param options: OptionParser options that we may update """ parser = build_parser(line) defaults = parser.get_default_values() defaults.index_url = None if finder: # `finder.format_control` will be updated during parsing defaults.format_control = finder.format_control args_str, options_str = break_args_options(line) if sys.version_info < (2, 7, 3): # Prior to 2.7.3, shlex cannot deal with unicode entries options_str = options_str.encode('utf8') opts, _ = parser.parse_args(shlex.split(options_str), defaults) # preserve for the nested code path line_comes_from = '%s %s (line %s)' % ( '-c' if constraint else '-r', filename, line_number, ) # yield a line requirement if args_str: isolated = options.isolated_mode if options else False if options: cmdoptions.check_install_build_global(options, opts) # get the options that apply to requirements req_options = {} for dest in SUPPORTED_OPTIONS_REQ_DEST: if dest in opts.__dict__ and opts.__dict__[dest]: req_options[dest] = opts.__dict__[dest] yield InstallRequirement.from_line( args_str, line_comes_from, constraint=constraint, isolated=isolated, options=req_options, wheel_cache=wheel_cache ) # yield an editable requirement elif opts.editables: isolated = options.isolated_mode if options else False yield InstallRequirement.from_editable( opts.editables[0], comes_from=line_comes_from, constraint=constraint, isolated=isolated, wheel_cache=wheel_cache ) # parse a nested requirements file elif opts.requirements or opts.constraints: if opts.requirements: req_path = opts.requirements[0] nested_constraint = False else: req_path = opts.constraints[0] nested_constraint = True # original file is over http if SCHEME_RE.search(filename): # do a url join so relative paths work req_path = urllib_parse.urljoin(filename, req_path) # original file and nested file are paths elif not SCHEME_RE.search(req_path): # do a join so relative paths work req_path = os.path.join(os.path.dirname(filename), req_path) # TODO: Why not use `comes_from='-r {} (line {})'` here as well? parser = parse_requirements( req_path, finder, comes_from, options, session, constraint=nested_constraint, wheel_cache=wheel_cache ) for req in parser: yield req # percolate hash-checking option upward elif opts.require_hashes: options.require_hashes = opts.require_hashes # set finder options elif finder: if opts.index_url: finder.index_urls = [opts.index_url] if opts.no_index is True: finder.index_urls = [] if opts.extra_index_urls: finder.index_urls.extend(opts.extra_index_urls) if opts.find_links: # FIXME: it would be nice to keep track of the source # of the find_links: support a find-links local path # relative to a requirements file. value = opts.find_links[0] req_dir = os.path.dirname(os.path.abspath(filename)) relative_to_reqs_file = os.path.join(req_dir, value) if os.path.exists(relative_to_reqs_file): value = relative_to_reqs_file finder.find_links.append(value) if opts.pre: finder.allow_all_prereleases = True if opts.process_dependency_links: finder.process_dependency_links = True if opts.trusted_hosts: finder.secure_origins.extend( ("*", host, "*") for host in opts.trusted_hosts)
def _get_html_page(link, session=None): if session is None: raise TypeError( "_get_html_page() missing 1 required keyword argument: 'session'") url = link.url url = url.split('#', 1)[0] # Check for VCS schemes that do not support lookup as web pages. from pipenv.patched.notpip._internal.vcs import VcsSupport for scheme in VcsSupport.schemes: if url.lower().startswith(scheme) and url[len(scheme)] in '+:': logger.debug('Cannot look at %s URL %s', scheme, link) return None try: filename = link.filename for bad_ext in ARCHIVE_EXTENSIONS: if filename.endswith(bad_ext): content_type = _get_content_type(url, session=session) if content_type.lower().startswith('text/html'): break else: logger.debug( 'Skipping page %s because of Content-Type: %s', link, content_type, ) return logger.debug('Getting page %s', url) # Tack index.html onto file:// URLs that point to directories (scheme, netloc, path, params, query, fragment) = \ urllib_parse.urlparse(url) if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))): # add trailing slash if not present so urljoin doesn't trim # final segment if not url.endswith('/'): url += '/' url = urllib_parse.urljoin(url, 'index.html') logger.debug(' file: URL is directory, getting %s', url) resp = session.get( url, headers={ "Accept": "text/html", # We don't want to blindly returned cached data for # /simple/, because authors generally expecting that # twine upload && pip install will function, but if # they've done a pip install in the last ~10 minutes # it won't. Thus by setting this to zero we will not # blindly use any cached data, however the benefit of # using max-age=0 instead of no-cache, is that we will # still support conditional requests, so we will still # minimize traffic sent in cases where the page hasn't # changed at all, we will just always incur the round # trip for the conditional GET now instead of only # once per 10 minutes. # For more information, please see pypa/pip#5670. "Cache-Control": "max-age=0", }, ) resp.raise_for_status() # The check for archives above only works if the url ends with # something that looks like an archive. However that is not a # requirement of an url. Unless we issue a HEAD request on every # url we cannot know ahead of time for sure if something is HTML # or not. However we can check after we've downloaded it. content_type = resp.headers.get('Content-Type', 'unknown') if not content_type.lower().startswith("text/html"): logger.debug( 'Skipping page %s because of Content-Type: %s', link, content_type, ) return inst = HTMLPage(resp.content, resp.url, resp.headers) except requests.HTTPError as exc: _handle_get_page_fail(link, exc, url) except SSLError as exc: reason = "There was a problem confirming the ssl certificate: " reason += str(exc) _handle_get_page_fail(link, reason, url, meth=logger.info) except requests.ConnectionError as exc: _handle_get_page_fail(link, "connection error: %s" % exc, url) except requests.Timeout: _handle_get_page_fail(link, "timed out", url) else: return inst