def test_get_urls(self):
        # multi line string
        html = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<table border="1">
<tr>
<td>test</td>
<td><img src="http://www.patrickwolf.net/wp-content/uploads/2008/09/mcp.png"></td>
<td><img src="http://abc/test.png" data-defer-src="http://abc/testdefer.jpg"></td>
<td><img src="http://abc/test.png" data-defer-src="/testdefer.jpg"></td>
<td><img src="abc/test.png" data-defer-src="wp/testdefer.jpg?test=true"></td>
</tr>
</table> 
</body></html>
"""        
        correct_urls = [
        "http://www.patrickwolf.net/wp-content/uploads/2008/09/mcp.png",
        "http://abc/testdefer.jpg",
        "http://www.test.com/testdefer.jpg",
        "http://www.test.com/wp/testdefer.jpg?test=true"]

        ei = ExtractImages("http://www.test.com/?q=test")
        urls = ei.get_image_urls(html)
        self.assertItemsEqual(urls, correct_urls)
Exemplo n.º 2
0
    def test_get_urls(self):
        # multi line string
        html = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<table border="1">
<tr>
<td>test</td>
<td><img src="http://www.patrickwolf.net/wp-content/uploads/2008/09/mcp.png"></td>
<td><img src="http://abc/test.png" data-defer-src="http://abc/testdefer.jpg"></td>
<td><img src="http://abc/test.png" data-defer-src="/testdefer.jpg"></td>
<td><img src="abc/test.png" data-defer-src="wp/testdefer.jpg?test=true"></td>
</tr>
</table> 
</body></html>
"""
        correct_urls = [
            "http://www.patrickwolf.net/wp-content/uploads/2008/09/mcp.png",
            "http://abc/testdefer.jpg", "http://www.test.com/testdefer.jpg",
            "http://www.test.com/wp/testdefer.jpg?test=true"
        ]

        ei = ExtractImages("http://www.test.com/?q=test")
        urls = ei.get_image_urls(html)
        self.assertItemsEqual(urls, correct_urls)
 def test_download_url(self):
     # get url to dummy.jpg
     pn = os.path.dirname(__file__)
     fpn = os.path.join(pn, "dummy.png")
     url = "file:///" + fpn.replace("\\", "/").replace(":", "|")
     # get target path
     target_fpn = os.path.join(pn, "dummy_1.png")
 
     ei = ExtractImages("http://www.test.com/?q=test")
     ei.download_image_from_url(url, pn)
     # test file was correctly downloaded
     self.assertTrue(os.path.exists(target_fpn), "target file missing")
     self.assertEqual(os.path.getsize(fpn), os.path.getsize(target_fpn), "file size mismatch")
     # cleanup 
     os.remove(target_fpn)
Exemplo n.º 4
0
    def test_download_url(self):
        # get url to dummy.jpg
        pn = os.path.dirname(__file__)
        fpn = os.path.join(pn, "dummy.png")
        url = "file:///" + fpn.replace("\\", "/").replace(":", "|")
        # get target path
        target_fpn = os.path.join(pn, "dummy_1.png")

        ei = ExtractImages("http://www.test.com/?q=test")
        ei.download_image_from_url(url, pn)
        # test file was correctly downloaded
        self.assertTrue(os.path.exists(target_fpn), "target file missing")
        self.assertEqual(os.path.getsize(fpn), os.path.getsize(target_fpn),
                         "file size mismatch")
        # cleanup
        os.remove(target_fpn)