Exemplo n.º 1
0
 def test(test_url):
     """Return the first URL extracted from an anchor pointing at test_url.

     The href is wrapped in a minimal ``<a>`` tag and handed to
     ``site_diff.extract_urls`` together with the surrounding scope's
     ``base_url``. Returns None when the extraction result is empty.
     """
     html = '<a href="%s">my link here</a>' % test_url
     found = site_diff.extract_urls(base_url, html)
     # An empty (falsy) result means no URL was extracted from the tag.
     return list(found)[0] if found else None
Exemplo n.º 2
0
    def testAll(self):
        """Checks site_diff.extract_urls against many href variations.

        Most cases wrap a candidate href in an anchor tag, extract URLs
        relative to ``base_url`` and compare the first extracted URL with
        the expected value (None when nothing should be extracted). The
        final cases feed raw markup/text straight to extract_urls and
        compare the whole returned set.
        """
        base_url = 'http://www.example.com/my-url/here'

        def test(test_url):
            """Returns the first URL extracted for test_url, or None."""
            data = '<a href="%s">my link here</a>' % test_url
            result = site_diff.extract_urls(base_url, data)
            if not result:
                return None
            return list(result)[0]

        # Relative and root-relative paths.
        self.assertEqual('http://www.example.com/my-url/dummy_page2.html',
                         test('dummy_page2.html'))

        self.assertEqual('http://www.example.com/',
                         test('/'))

        self.assertEqual('http://www.example.com/mypath-here',
                         test('/mypath-here'))

        # A fragment-only link is expected to produce no URL at all.
        self.assertIsNone(test('#fragment-only'))

        # "." and ".." segments are expected to be collapsed.
        self.assertEqual('http://www.example.com/my/path/over/here.html',
                         test('/my/path/01/13/../../over/here.html'))

        self.assertEqual('http://www.example.com/my/path/01/over/here.html',
                         test('/my/path/01/13/./../over/here.html'))

        self.assertEqual('http://www.example.com/my-url/same-directory.html',
                         test('same-directory.html'))

        # ".." segments that climb above the root are expected to stop there.
        self.assertEqual('http://www.example.com/relative-but-no/child',
                         test('../../relative-but-no/child'))

        self.assertEqual('http://www.example.com/too/many/relative/paths',
                         test('../../../../too/many/relative/paths'))

        self.assertEqual(
            'http://www.example.com/this/is/scheme-relative.html',
            test('//www.example.com/this/is/scheme-relative.html'))

        self.assertEqual(
            'http://www.example.com/okay-then',    # Scheme changed
            test('https://www.example.com/okay-then#blah'))

        self.assertEqual('http://www.example.com/another-one',
                         test('http://www.example.com/another-one'))

        # Query strings are expected to be dropped from the result.
        self.assertEqual('http://www.example.com/this-has/a',
                         test('/this-has/a?query=string'))

        self.assertEqual(
            'http://www.example.com/this-also-has/a/',
            test('/this-also-has/a/?query=string&but=more-complex'))

        self.assertEqual(
            'http://www.example.com/relative-with/some-(parenthesis%20here)',
            test('/relative-with/some-(parenthesis%20here)'))

        self.assertEqual(
            'http://www.example.com/relative-with/some-(parenthesis%20here)',
            test('//www.example.com/relative-with/some-(parenthesis%20here)'))

        self.assertEqual(
            'http://www.example.com/relative-with/some-(parenthesis%20here)',
            test('http://www.example.com/relative-with/some-'
                 '(parenthesis%20here)'))

        # mailto: links are expected to be skipped entirely.
        # NOTE(review): "[email protected]" looks like an e-mail
        # obfuscation placeholder rather than a real address -- confirm
        # the intended value.
        self.assertIsNone(test('mailto:[email protected]'))

        # Known bad results: the expected values below pin the current
        # behaviour, where these schemes are treated as relative paths.
        self.assertEqual(
            'http://www.example.com/my-url/ftp://[email protected]/',
            test('ftp://[email protected]/'))

        self.assertEqual(
            'http://www.example.com/my-url/javascript:runme()',
            test('javascript:runme()'))

        self.assertEqual(
            'http://www.example.com/my-url/tel:1-555-555-5555',
            test('tel:1-555-555-5555'))

        self.assertEqual('http://www.example.com/test.js',
                         test('/test.js'))

        # Escaped sources (e.g. inside inline JavaScript) are scraped,
        # even though they shouldn't be.
        # Raw strings keep the literal backslashes without relying on
        # invalid "\/" escape sequences.
        script_tag = (r'<script type="text\/javascript"'
                      r' src="\/\/platform.twitter.com\/widgets.js"><\/script>')
        self.assertEqual(
            {
                'http://www.example.com/my-url/'
                '\\/\\/platform.twitter.com\\/widgets.js'
            },
            site_diff.extract_urls(base_url, script_tag))

        spaces_in_tag = "<a href = 'spaced.html'>"
        self.assertEqual(
            {'http://www.example.com/my-url/spaced.html'},
            site_diff.extract_urls(base_url, spaces_in_tag))

        # JavaScript variable assignment isn't handled correctly.
        js_text = "var src = true;"
        self.assertEqual(
            {'http://www.example.com/my-url/true'},
            site_diff.extract_urls(base_url, js_text))
Exemplo n.º 3
0
 def test(test_url):
     """Extract URLs from a one-link snippet and return the first one.

     Builds ``<a href="...">`` markup around test_url, runs
     ``site_diff.extract_urls`` on it with the enclosing ``base_url``,
     and returns the first extracted URL, or None if nothing came back.
     """
     markup = '<a href="%s">my link here</a>' % test_url
     urls = site_diff.extract_urls(base_url, markup)
     if urls:
         return list(urls)[0]
     # Nothing was extracted from the anchor tag.
     return None
Exemplo n.º 4
0
    def testAll(self):
        """Tests site_diff.extract_urls against a table of href variations.

        Each table entry pairs the expected first extracted URL (None when
        nothing should be extracted) with the href that is wrapped in an
        anchor tag and resolved against ``base_url``. The trailing checks
        pass raw markup/text directly to extract_urls and compare the whole
        returned set.
        """
        base_url = "http://www.example.com/my-url/here"

        def test(test_url):
            """Return the first URL extracted for test_url, or None."""
            data = '<a href="%s">my link here</a>' % test_url
            result = site_diff.extract_urls(base_url, data)
            if not result:
                return None
            return list(result)[0]

        # (expected first extracted URL, href placed inside the anchor tag)
        cases = [
            ("http://www.example.com/my-url/dummy_page2.html", "dummy_page2.html"),
            ("http://www.example.com/", "/"),
            ("http://www.example.com/mypath-here", "/mypath-here"),
            # A fragment-only link is expected to produce no URL.
            (None, "#fragment-only"),
            # "." and ".." segments are expected to be collapsed.
            ("http://www.example.com/my/path/over/here.html", "/my/path/01/13/../../over/here.html"),
            ("http://www.example.com/my/path/01/over/here.html", "/my/path/01/13/./../over/here.html"),
            ("http://www.example.com/my-url/same-directory.html", "same-directory.html"),
            ("http://www.example.com/relative-but-no/child", "../../relative-but-no/child"),
            ("http://www.example.com/too/many/relative/paths", "../../../../too/many/relative/paths"),
            (
                "http://www.example.com/this/is/scheme-relative.html",
                "//www.example.com/this/is/scheme-relative.html",
            ),
            # Scheme changed
            ("http://www.example.com/okay-then", "https://www.example.com/okay-then#blah"),
            ("http://www.example.com/another-one", "http://www.example.com/another-one"),
            # Query strings are expected to be dropped from the result.
            ("http://www.example.com/this-has/a", "/this-has/a?query=string"),
            ("http://www.example.com/this-also-has/a/", "/this-also-has/a/?query=string&but=more-complex"),
            (
                "http://www.example.com/relative-with/some-(parenthesis%20here)",
                "/relative-with/some-(parenthesis%20here)",
            ),
            (
                "http://www.example.com/relative-with/some-(parenthesis%20here)",
                "//www.example.com/relative-with/some-(parenthesis%20here)",
            ),
            (
                "http://www.example.com/relative-with/some-(parenthesis%20here)",
                "http://www.example.com/relative-with/some-(parenthesis%20here)",
            ),
            # Known bad results: these pin the current behaviour, where the
            # schemes below are treated as relative paths.
            # NOTE(review): "[email protected]" looks like an e-mail
            # obfuscation placeholder -- confirm the intended value.
            ("http://www.example.com/my-url/ftp://[email protected]/", "ftp://[email protected]/"),
            ("http://www.example.com/my-url/mailto:[email protected]", "mailto:[email protected]"),
            ("http://www.example.com/my-url/javascript:runme()", "javascript:runme()"),
            ("http://www.example.com/my-url/tel:1-555-555-5555", "tel:1-555-555-5555"),
            ("http://www.example.com/test.js", "/test.js"),
        ]
        for expected, href in cases:
            # The message identifies which table entry failed.
            self.assertEqual(expected, test(href), "href: %r" % (href,))

        # Escaped sources (e.g. inside inline JavaScript) are scraped,
        # even though they shouldn't be.
        # Raw strings keep the literal backslashes without relying on
        # invalid "\/" escape sequences.
        script_tag = r'<script type="text\/javascript"' r' src="\/\/platform.twitter.com\/widgets.js"><\/script>'
        self.assertEqual(
            {r"http://www.example.com/my-url/\/\/platform.twitter.com\/widgets.js"},
            site_diff.extract_urls(base_url, script_tag),
        )

        spaces_in_tag = "<a href = 'spaced.html'>"
        self.assertEqual(
            {"http://www.example.com/my-url/spaced.html"}, site_diff.extract_urls(base_url, spaces_in_tag)
        )

        # JavaScript variable assignment isn't handled correctly.
        js_text = "var src = true;"
        self.assertEqual({"http://www.example.com/my-url/true"}, site_diff.extract_urls(base_url, js_text))
Exemplo n.º 5
0
    def testAll(self):
        """Verifies site_diff.extract_urls for a range of link forms.

        The nested helper wraps an href in an anchor tag and returns the
        first URL extracted relative to ``base_url`` (None if nothing is
        extracted). The last three checks call extract_urls directly on raw
        markup/text and compare the complete returned set.
        """
        base_url = 'http://www.example.com/my-url/here'

        def test(test_url):
            """Returns the first URL extracted for test_url, or None."""
            data = '<a href="%s">my link here</a>' % test_url
            result = site_diff.extract_urls(base_url, data)
            if not result:
                return None
            return list(result)[0]

        self.assertEqual('http://www.example.com/my-url/dummy_page2.html',
                         test('dummy_page2.html'))

        self.assertEqual('http://www.example.com/',
                         test('/'))

        self.assertEqual('http://www.example.com/mypath-here',
                         test('/mypath-here'))

        # Fragment-only hrefs are expected to yield nothing.
        self.assertIsNone(test('#fragment-only'))

        # Dot segments are expected to be resolved away.
        self.assertEqual('http://www.example.com/my/path/over/here.html',
                         test('/my/path/01/13/../../over/here.html'))

        self.assertEqual('http://www.example.com/my/path/01/over/here.html',
                         test('/my/path/01/13/./../over/here.html'))

        self.assertEqual('http://www.example.com/my-url/same-directory.html',
                         test('same-directory.html'))

        self.assertEqual('http://www.example.com/relative-but-no/child',
                         test('../../relative-but-no/child'))

        self.assertEqual('http://www.example.com/too/many/relative/paths',
                         test('../../../../too/many/relative/paths'))

        self.assertEqual(
            'http://www.example.com/this/is/scheme-relative.html',
            test('//www.example.com/this/is/scheme-relative.html'))

        self.assertEqual(
            'http://www.example.com/okay-then',    # Scheme changed
            test('https://www.example.com/okay-then#blah'))

        self.assertEqual('http://www.example.com/another-one',
                         test('http://www.example.com/another-one'))

        # The expected URLs below carry no query string.
        self.assertEqual('http://www.example.com/this-has/a',
                         test('/this-has/a?query=string'))

        self.assertEqual(
            'http://www.example.com/this-also-has/a/',
            test('/this-also-has/a/?query=string&but=more-complex'))

        self.assertEqual(
            'http://www.example.com/relative-with/some-(parenthesis%20here)',
            test('/relative-with/some-(parenthesis%20here)'))

        self.assertEqual(
            'http://www.example.com/relative-with/some-(parenthesis%20here)',
            test('//www.example.com/relative-with/some-(parenthesis%20here)'))

        self.assertEqual(
            'http://www.example.com/relative-with/some-(parenthesis%20here)',
            test('http://www.example.com/relative-with/some-'
                 '(parenthesis%20here)'))

        # Known bad results: the expectations below record the current
        # behaviour, where these schemes are handled like relative paths.
        # NOTE(review): "[email protected]" looks like an e-mail
        # obfuscation placeholder rather than a real address -- confirm
        # the intended value.
        self.assertEqual(
            'http://www.example.com/my-url/ftp://[email protected]/',
            test('ftp://[email protected]/'))

        self.assertEqual(
            'http://www.example.com/my-url/mailto:[email protected]',
            test('mailto:[email protected]'))

        self.assertEqual(
            'http://www.example.com/my-url/javascript:runme()',
            test('javascript:runme()'))

        self.assertEqual(
            'http://www.example.com/my-url/tel:1-555-555-5555',
            test('tel:1-555-555-5555'))

        self.assertEqual('http://www.example.com/test.js',
                         test('/test.js'))

        # Escaped sources (e.g. inside inline JavaScript) are scraped,
        # even though they shouldn't be.
        # Raw strings keep the literal backslashes without relying on
        # invalid "\/" escape sequences.
        script_tag = (r'<script type="text\/javascript"'
                      r' src="\/\/platform.twitter.com\/widgets.js"><\/script>')
        self.assertEqual(
            {
                'http://www.example.com/my-url/'
                '\\/\\/platform.twitter.com\\/widgets.js'
            },
            site_diff.extract_urls(base_url, script_tag))

        spaces_in_tag = "<a href = 'spaced.html'>"
        self.assertEqual(
            {'http://www.example.com/my-url/spaced.html'},
            site_diff.extract_urls(base_url, spaces_in_tag))

        # JavaScript variable assignment isn't handled correctly.
        js_text = "var src = true;"
        self.assertEqual(
            {'http://www.example.com/my-url/true'},
            site_diff.extract_urls(base_url, js_text))