def test_get_urls(self):
        """Verify the image URLs extracted from a small XHTML page.

        The fixture contains four ``<img>`` tags. Per the expected list,
        the test requires that:

        * an ``<img>`` with only ``src`` yields that ``src`` value;
        * an ``<img>`` carrying ``data-defer-src`` yields that value
          rather than its ``src``;
        * the root-relative and path-relative ``data-defer-src`` values
          come back as absolute URLs under ``http://www.test.com``.

        Order is not significant; only the set of URLs (with
        multiplicity) is compared.
        """
        # Fixture markup. It is kept verbatim (including the trailing
        # space after </table>) so the parser sees exactly the same input.
        html = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<table border="1">
<tr>
<td>test</td>
<td><img src="http://www.patrickwolf.net/wp-content/uploads/2008/09/mcp.png"></td>
<td><img src="http://abc/test.png" data-defer-src="http://abc/testdefer.jpg"></td>
<td><img src="http://abc/test.png" data-defer-src="/testdefer.jpg"></td>
<td><img src="abc/test.png" data-defer-src="wp/testdefer.jpg?test=true"></td>
</tr>
</table> 
</body></html>
"""
        correct_urls = [
            "http://www.patrickwolf.net/wp-content/uploads/2008/09/mcp.png",
            "http://abc/testdefer.jpg",
            "http://www.test.com/testdefer.jpg",
            "http://www.test.com/wp/testdefer.jpg?test=true",
        ]

        # NOTE(review): the constructor argument is presumably the page URL
        # used as the base for relative links -- confirm in ExtractImages.
        ei = ExtractImages("http://www.test.com/?q=test")
        urls = ei.get_image_urls(html)

        # assertItemsEqual only exists on Python 2.7's TestCase; Python 3
        # renamed it to assertCountEqual. Resolve whichever one is present
        # so the test does not fail with AttributeError on either version.
        assert_same_items = getattr(self, "assertCountEqual", None)
        if assert_same_items is None:
            assert_same_items = self.assertItemsEqual
        assert_same_items(urls, correct_urls)
Exemplo n.º 2
0
    def test_get_urls(self):
        # multi line string
        html = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<table border="1">
<tr>
<td>test</td>
<td><img src="http://www.patrickwolf.net/wp-content/uploads/2008/09/mcp.png"></td>
<td><img src="http://abc/test.png" data-defer-src="http://abc/testdefer.jpg"></td>
<td><img src="http://abc/test.png" data-defer-src="/testdefer.jpg"></td>
<td><img src="abc/test.png" data-defer-src="wp/testdefer.jpg?test=true"></td>
</tr>
</table> 
</body></html>
"""
        correct_urls = [
            "http://www.patrickwolf.net/wp-content/uploads/2008/09/mcp.png",
            "http://abc/testdefer.jpg", "http://www.test.com/testdefer.jpg",
            "http://www.test.com/wp/testdefer.jpg?test=true"
        ]

        ei = ExtractImages("http://www.test.com/?q=test")
        urls = ei.get_image_urls(html)
        self.assertItemsEqual(urls, correct_urls)