Exemplo n.º 1
0
def test_determine_base_url(html, url, expected):
    """The base URL extracted from *html* (served at *url*) equals *expected*."""
    parsed = html5lib.parse(html,
                            transport_encoding=None,
                            namespaceHTMLElements=False)
    assert expected == _determine_base_url(parsed, url)
Exemplo n.º 2
0
def parse_links(
        html,  # type: bytes
        encoding,  # type: Optional[str]
        url,  # type: str
):
    # type: (...) -> Iterable[Link]
    """
    Parse an HTML document, and yield its anchor elements as Link objects.

    :param url: the URL from which the HTML was downloaded.
    """
    document = html5lib.parse(html,
                              transport_encoding=encoding,
                              namespaceHTMLElements=False)
    base_url = _determine_base_url(document, url)
    for tag in document.findall(".//a"):
        # Helper returns None for anchors that do not form a usable link.
        candidate = _create_link_from_element(
            tag, page_url=url, base_url=base_url)
        if candidate is not None:
            yield candidate
Exemplo n.º 3
0
def test_parse_links_caches_same_page():
    # Verify that parse_links() caches by page identity data: parsing a
    # second HTMLPage with identical content/encoding/url must not call
    # html5lib.parse again.
    html = (
        # Mark this as a unicode string for Python 2 since anchor_html
        # can contain non-ascii.
        u'<html><head><meta charset="utf-8"><head>'
        '<body><a href="/pkg1-1.0.tar.gz"></a></body></html>')
    html_bytes = html.encode('utf-8')

    page_1 = HTMLPage(
        html_bytes,
        encoding=None,
        url='https://example.com/simple/',
    )
    # A distinct object with equal content/url, so the test exercises the
    # cache key rather than object identity.
    page_2 = HTMLPage(
        html_bytes,
        encoding=None,
        url='https://example.com/simple/',
    )

    mock_parse = mock.patch("pip._internal.index.collector.html5lib.parse")
    with mock_parse as mock_parse:
        # Return a genuinely parsed document so parse_links() can walk it.
        mock_parse.return_value = html5lib.parse(
            page_1.content,
            transport_encoding=page_1.encoding,
            namespaceHTMLElements=False,
        )
        parsed_links_1 = list(parse_links(page_1))
        mock_parse.assert_called()

    # NOTE(review): the first `with` rebound `mock_parse` from the patcher
    # to the MagicMock it yielded; this second `with` therefore enters the
    # MagicMock (which supports the context-manager protocol), not the
    # patcher, so html5lib.parse is NOT re-patched here — confirm intended.
    with mock_parse as mock_parse:
        parsed_links_2 = list(parse_links(page_2))
        assert parsed_links_2 == parsed_links_1
        mock_parse.assert_not_called()
Exemplo n.º 4
0
    def __init__(self, content, url, headers=None):
        """Store the raw page *content* and parse it into a DOM.

        If *headers* carries a Content-Type charset parameter, it is
        passed to the parser as the document encoding.
        """
        encoding = None
        if headers and "Content-Type" in headers:
            _, params = cgi.parse_header(headers["Content-Type"])
            # Missing "charset" leaves encoding as None, same as before.
            encoding = params.get("charset", None)

        self.content = content
        self.parsed = html5lib.parse(
            content,
            encoding=encoding,
            namespaceHTMLElements=False,
        )
        self.url = url
        self.headers = headers
Exemplo n.º 5
0
 def iter_links(self):
     """Yield a Link for every anchor element on the page with an href."""
     doc = html5lib.parse(
         self.content,
         transport_encoding=_get_encoding_from_headers(self.headers),
         namespaceHTMLElements=False,
     )
     base = _determine_base_url(doc, self.url)
     for tag in doc.findall(".//a"):
         raw_href = tag.get("href")
         if not raw_href:
             continue
         target = _clean_link(urllib_parse.urljoin(base, raw_href))
         requires = tag.get('data-requires-python')
         yield Link(target, self.url,
                    requires_python=unescape(requires) if requires else None)
Exemplo n.º 6
0
Arquivo: index.py Projeto: jaraco/pip
 def iter_links(self):
     """Yield each anchor in the page, resolved against the base URL."""
     tree = html5lib.parse(
         self.content,
         transport_encoding=_get_encoding_from_headers(self.headers),
         namespaceHTMLElements=False,
     )
     page_base = _determine_base_url(tree, self.url)
     for node in tree.findall(".//a"):
         href_value = node.get("href")
         if href_value:
             cleaned = _clean_link(urllib_parse.urljoin(page_base, href_value))
             py_req = node.get('data-requires-python')
             if py_req:
                 py_req = unescape(py_req)
             else:
                 py_req = None
             yield Link(cleaned, self.url, requires_python=py_req)
Exemplo n.º 7
0
    def __init__(self, content, url, headers=None):
        """Keep *content*, *url* and *headers*, and pre-parse the document.

        The transport encoding is taken from a Content-Type charset
        parameter in *headers* when present, otherwise left unset.
        """
        encoding = None
        if headers and "Content-Type" in headers:
            content_type, params = cgi.parse_header(headers["Content-Type"])
            if "charset" in params:
                encoding = params["charset"]

        self.content = content
        self.url = url
        self.headers = headers
        # Parse once up front; attribute order above is independent.
        self.parsed = html5lib.parse(
            self.content,
            transport_encoding=encoding,
            namespaceHTMLElements=False,
        )
Exemplo n.º 8
0
def parse_links(page):
    # type: (HTMLPage) -> Iterable[Link]
    """
    Parse an HTML document, and yield its anchor elements as Link objects.
    """
    dom = html5lib.parse(
        page.content,
        transport_encoding=page.encoding,
        namespaceHTMLElements=False,
    )

    page_url = page.url
    resolved_base = _determine_base_url(dom, page_url)
    for element in dom.findall(".//a"):
        # None means the anchor could not be turned into a usable link.
        maybe_link = _create_link_from_element(
            element, page_url=page_url, base_url=resolved_base)
        if maybe_link is not None:
            yield maybe_link
Exemplo n.º 9
0
 def iter_links(self):
     # type: () -> Iterable[Link]
     """Yield every anchor on the page that maps to a valid Link."""
     dom = html5lib.parse(
         self.content,
         transport_encoding=_get_encoding_from_headers(self.headers),
         namespaceHTMLElements=False,
     )
     resolved_base = _determine_base_url(dom, self.url)
     for element in dom.findall(".//a"):
         maybe_link = _create_link_from_element(
             element,
             page_url=self.url,
             base_url=resolved_base,
         )
         if maybe_link is not None:
             yield maybe_link
Exemplo n.º 10
0
    def search(self, query: str) -> SearchResult:
        """Query the configured index's search endpoint for *query*.

        Falls back to the default index when the configured one responds
        404 on /search. Returns the matching packages as a list.
        """
        index_root = self.sources[0]["url"].rstrip("/")
        results = []

        # "…/simple" -> "…/search"; otherwise just append "/search".
        if index_root.endswith("/simple"):
            search_url = index_root[:-6] + "search"
        else:
            search_url = index_root + "/search"

        with self.environment.get_finder() as finder:
            session = finder.session
            resp = session.get(search_url, params={"q": query})
            if resp.status_code == 404:
                self.environment.project.core.ui.echo(
                    termui.yellow(
                        f"{index_root!r} doesn't support '/search' endpoint, fallback "
                        f"to {self.DEFAULT_INDEX_URL!r} now.\n"
                        "This may take longer depending on your network condition."
                    ),
                    err=True,
                )
                resp = session.get(
                    f"{self.DEFAULT_INDEX_URL}/search", params={"q": query})
            resp.raise_for_status()
            content = parse(resp.content, namespaceHTMLElements=False)

        for snippet in content.findall(".//*[@class='package-snippet']"):
            name = snippet.find("h3/*[@class='package-snippet__name']").text
            version = snippet.find(
                "h3/*[@class='package-snippet__version']").text
            if not name or not version:
                continue

            # A missing/empty description becomes the empty string.
            description = snippet.find(
                "p[@class='package-snippet__description']").text or ""

            results.append(Package(name, version, description))

        return results
Exemplo n.º 11
0
def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]:
    """
    Parse an HTML document, and yield its anchor elements as Link objects.

    TODO: Remove when `html5lib` is dropped.
    """
    dom = html5lib.parse(
        page.content,
        transport_encoding=page.encoding,
        namespaceHTMLElements=False,
    )

    page_url = page.url
    resolved_base = _determine_base_url(dom, page_url)
    for element in dom.findall(".//a"):
        # Note: the helper receives the attribute dict, not the element.
        maybe_link = _create_link_from_element(
            element.attrib,
            page_url=page_url,
            base_url=resolved_base,
        )
        if maybe_link is not None:
            yield maybe_link
Exemplo n.º 12
0
 def __init__(self, content, url, headers=None, trusted=None):
     """Record the page data and eagerly parse *content* into a DOM."""
     self.content = content
     self.url = url
     self.headers = headers
     self.trusted = trusted
     self.parsed = html5lib.parse(content, namespaceHTMLElements=False)
Exemplo n.º 13
0
 def __init__(self, content, url, headers=None, trusted=None):
     """Store raw content plus metadata; `parsed` holds the html5lib DOM."""
     self.parsed = html5lib.parse(content, namespaceHTMLElements=False)
     self.content = content
     self.url = url
     self.headers = headers
     self.trusted = trusted
Exemplo n.º 14
0
def test_determine_base_url(html, url, expected):
    """_determine_base_url on the parsed *html* must yield *expected*."""
    doc = html5lib.parse(
        html,
        transport_encoding=None,
        namespaceHTMLElements=False,
    )
    result = _determine_base_url(doc, url)
    assert result == expected
Exemplo n.º 15
0
    return wrapper_wrapper


@with_cached_html_pages
def parse_links(page):
    # type: (HTMLPage) -> Iterable[Link]
    """
    Parse an HTML document, and yield its anchor elements as Link objects.

    :param page: the HTMLPage whose content/encoding/url are parsed;
        results are cached per page by the decorator.
    """
    # FIX: resolved an unresolved git merge conflict that was left in the
    # file. Kept the html5lib.parse() call; the other side referenced
    # html5lib.bbc_parse, which is not part of html5lib's API.
    document = html5lib.parse(
        page.content,
        transport_encoding=page.encoding,
        namespaceHTMLElements=False,
    )

    url = page.url
    base_url = _determine_base_url(document, url)
    for anchor in document.findall(".//a"):
        # None signals an anchor that does not yield a usable link.
        link = _create_link_from_element(
            anchor,
            page_url=url,
            base_url=base_url,
        )
        if link is None:
            continue
        yield link