def test_extract_relative_urls(self):
        # Relative hrefs: a bare file name ("foo.html") and a rooted path
        # containing a ".." segment. The expectation below is that both come
        # back as normalised, rooted paths attached to the crawled page URL.
        html = """
        <html>
        <body>
            <p>
                <a href="foo.html">FooPage</a>
            </p>
            <div>
                <div>
                    <div>
                        <div>
                            <a href="/sub/page/../bar.html">BarPage</a>
                        </div>
                    </div>
                </div>
            </div>
        </body>
        </html>
        """
        # Order matters: the list comparison expects document order.
        wanted = [
            Link(self.crawled_page_url, path)
            for path in ("/foo.html", "/sub/bar.html")
        ]

        found = LinkExtractor.extract(self.crawled_page_url, html)
        self.assertEqual(found, wanted)
    def test_extract_multiple_links(self):
        # Two absolute links (one http, one https) at different nesting
        # depths. Both are expected back as path-only links, in document
        # order -- presumably because they share a host with
        # self.crawled_page_url (fixture not visible here; confirm in setUp).
        html = """
        <html>
        <head>
            <title>Test Page Two Links</title>
        </head>
        <body>
            <p>
                <a href="http://www.example.com/foo.html">FooPage</a>
            </p>
            <div>
                <div>
                    <div>
                        <div>
                            <a href="https://www.example.com/sub/page/bar.html">BarPage</a>
                        </div>
                    </div>
                </div>
            </div>
        </body>
        </html>
        """
        wanted = [
            Link(self.crawled_page_url, path)
            for path in ("/foo.html", "/sub/page/bar.html")
        ]

        found = LinkExtractor.extract(self.crawled_page_url, html)
        self.assertEqual(found, wanted)
    def test_extract_no_links(self):
        # A page whose body holds no anchors must yield an empty list.
        html = """
        <html>
        <body>
        </body>
        </html>
        """
        found = LinkExtractor.extract(self.crawled_page_url, html)

        self.assertEqual(found, [])
예제 #4
0
    def __init__(self, link, page_text):
        """Build a page from its link and its raw text.

            Both arguments are stored on the instance, and ``out_links`` is
            filled with whatever ``LinkExtractor.extract`` returns for the
            link's URL and the page text.

            Args:
                link: The link that describes this page; its ``url``
                    attribute is handed to the extractor
                page_text: The text of the page
        """
        self.link, self._page_text = link, page_text

        # Outgoing links are computed once, at construction time.
        self.out_links = LinkExtractor.extract(self.link.url, page_text)
    def test_discards_invalid_links(self):
        # Three anchors: an ftp:// URL, a scheme-less path that climbs above
        # its root with "..", and a plain rooted path. Only the last one is
        # expected to survive extraction.
        html = """
        <html>
        <body>
            <p>
                <a href="ftp://www.example.com/foo.html">FooPage</a>
                <a href="example.com/../../bar.html">BarPage</a>
                <a href="/baz.html">BazPage</a>
            </p>
        </body>
        </html>

        """
        # NOTE(review): the base URL is hard-coded here while sibling tests
        # use self.crawled_page_url -- confirm the two are meant to be equal.
        wanted = [Link("http://www.example.com", "/baz.html")]

        found = LinkExtractor.extract(self.crawled_page_url, html)
        self.assertEqual(found, wanted)
    def test_extract_one_link(self):
        # Simplest positive case: one absolute anchor, expected back as a
        # single path-only link attached to the crawled page URL.
        html = """
        <html>
        <head>
            <title>Test Page One Link</title>
        </head>
        <body>
            <p>
                <a href="http://www.example.com/foo.html">FooPage</a>
            </p>
        </body>
        </html>
        """
        wanted = [Link(self.crawled_page_url, "/foo.html")]

        found = LinkExtractor.extract(self.crawled_page_url, html)
        self.assertEqual(found, wanted)
    def test_extract_includes_external_links(self):
        # Three anchors on different hosts. The expectation is that the
        # www.example.com link is reduced to its path, while the example.com
        # and www.example.net links are kept as full URLs rather than dropped.
        html = """
        <html>
        <body>
            <p>
                <a href="http://www.example.com/foo.html">FooPage</a>
                <a href="http://example.com/bar.html">BarPage</a>
                <a href="http://www.example.net/baz.html">BazPage</a>
            </p>
        </body>
        </html>

        """
        targets = (
            "/foo.html",
            "http://example.com/bar.html",
            "http://www.example.net/baz.html",
        )
        wanted = [Link(self.crawled_page_url, target) for target in targets]

        found = LinkExtractor.extract(self.crawled_page_url, html)
        self.assertEqual(found, wanted)