def testDotPaths(self):
    # Check that hrefs containing dot segments are handled correctly.
    #
    # Even when links use "." and ".." path segments, the directory and
    # file names returned should only cover links that live under
    # http://example.com/foo/.
    page = (
        ' <html> <head> <title>Listing</title> '
        '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> '
        '</head> <body> <pre> '
        '<a href="../">Up a level</a> '
        '<a href="/foo/../">The same again</a> '
        '<a href="file1/../file2">file2</a> '
        '<a href=".">This directory</a> '
        '<a href="dir/.">A subdirectory</a> '
        '</pre> </html> '
    )
    base_url = 'http://example.com/foo/'
    # Canned responses: the listing page itself, plus a HEAD for the one
    # file the test expects to be reported.
    responses.add('GET', base_url, body=page)
    responses.add('HEAD', base_url + 'file2')

    lister = HTTPWalker(base_url, logging.getLogger())
    found_dirs, found_files = lister.list('/foo/')

    self.assertEqual(found_dirs, ['dir/'])
    self.assertEqual(found_files, ['file2'])
def testNonAsciiListing(self):
    # Check that list() copes with a listing containing non-ASCII bytes.
    #
    # The page also carries a link outside the listed directory, an
    # unclosed anchor with an absolute path, and file links given in a
    # different order from the one asserted below.
    page = (
        b' <html> <head> <title>Listing</title> '
        b'<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> '
        b'</head> <body> '
        b'<p>A non-breaking space: \xc2\xa0</p> '
        b'<p><a href="/elsewhere">Somewhere else on the site</a></p> '
        b'<!-- intentionally unclosed anchor below --> '
        b'<p><a href="/foo/file99">Absolute path</p> '
        b'<pre> '
        b'<a href="../">Parent directory</a> '
        b'<a href="subdir1/">subdir 1</a> '
        b'<a href="subdir2/">subdir 2</a> '
        b'<a href="subdir3/">subdir 3</a> '
        b'<a href="file3">file 3</a> '
        b'<a href="file2">file 2</a> '
        b'<a href="file1">file 1</a> '
        b'</pre> </html> '
    )
    base_url = 'http://example.com/foo/'
    wanted_files = ['file1', 'file2', 'file3', 'file99']
    wanted_dirs = ['subdir1/', 'subdir2/', 'subdir3/']

    responses.add('GET', base_url, body=page)
    # Register a HEAD response for every file expected in the result.
    for name in wanted_files:
        responses.add('HEAD', base_url + name)

    lister = HTTPWalker(base_url, logging.getLogger())
    found_dirs, found_files = lister.list('/foo/')

    self.assertEqual(found_dirs, wanted_dirs)
    self.assertEqual(found_files, wanted_files)
def testSquidFtpListing(self):
    # Check that an FTP directory listing rendered by Squid can be parsed.
    #
    # Each entry in the fixture appears several times (icon link, name
    # link, and ";type=a" / ";type=i" view/download links); the result is
    # expected to contain each file name exactly once and no directories.
    page = (
        ' <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" '
        '"http://www.w3.org/TR/html4/loose.dtd"> '
        '<!-- HTML listing generated by Squid 2.5.STABLE12 --> '
        '<!-- Wed, 06 Sep 2006 11:04:02 GMT --> '
        '<HTML><HEAD><TITLE> '
        'FTP Directory: ftp://ftp.gnome.org/pub/GNOME/sources/gnome-gpg/0.5/ '
        '</TITLE> '
        '<STYLE type="text/css"><!--BODY{background-color:#ffffff;'
        'font-family:verdana,sans-serif}--></STYLE> '
        '</HEAD><BODY> '
        '<H2> FTP Directory: '
        '<A HREF="/">ftp://ftp.gnome.org</A>/'
        '<A HREF="/pub/">pub</A>/'
        '<A HREF="/pub/GNOME/">GNOME</A>/'
        '<A HREF="/pub/GNOME/sources/">sources</A>/'
        '<A HREF="/pub/GNOME/sources/gnome-gpg/">gnome-gpg</A>/'
        '<A HREF="/pub/GNOME/sources/gnome-gpg/0.5/">0.5</A>/</H2> '
        '<PRE> '
        '<A HREF="../"><IMG border="0" '
        'SRC="http://squid:3128/squid-internal-static/icons/anthony-dirup.gif" '
        'ALT="[DIRUP]"></A> '
        '<A HREF="../">Parent Directory</A> '
        '<A HREF="LATEST-IS-0.5.0"><IMG border="0" '
        'SRC="http://squid:3128/squid-internal-static/icons/anthony-link.gif" '
        'ALT="[LINK]"></A> '
        '<A HREF="LATEST-IS-0.5.0">LATEST-IS-0.5.0</A>. . . . . . . . . '
        'Sep 02 07:07 '
        '<A HREF="LATEST-IS-0.5.0;type=a"><IMG border="0" '
        'SRC="http://squid:3128/squid-internal-static/icons/anthony-text.gif" '
        'ALT="[VIEW]"></A> '
        '<A HREF="LATEST-IS-0.5.0;type=i"><IMG border="0" '
        'SRC="http://squid:3128/squid-internal-static/icons/anthony-box.gif" '
        'ALT="[DOWNLOAD]"></A> '
        '-> <A HREF="gnome-gpg-0.5.0.tar.gz">gnome-gpg-0.5.0.tar.gz</A> '
        '<A HREF="gnome-gpg-0.5.0.md5sum"><IMG border="0" '
        'SRC="http://squid:3128/squid-internal-static/icons/anthony-unknown.gif" '
        'ALT="[FILE]"></A> '
        '<A HREF="gnome-gpg-0.5.0.md5sum">gnome-gpg-0.5.0.md5sum</A> '
        # The line break here is part of the fixture text.
        '. . . . . \n'
        'Sep 02 06:58 115 '
        '<A HREF="gnome-gpg-0.5.0.md5sum;type=a"><IMG border="0" '
        'SRC="http://squid:3128/squid-internal-static/icons/anthony-text.gif" '
        'ALT="[VIEW]"></A> '
        '<A HREF="gnome-gpg-0.5.0.md5sum;type=i"><IMG border="0" '
        'SRC="http://squid:3128/squid-internal-static/icons/anthony-box.gif" '
        'ALT="[DOWNLOAD]"></A> '
        '<A HREF="gnome-gpg-0.5.0.tar.bz2"><IMG border="0" '
        'SRC="http://squid:3128/squid-internal-static/icons/anthony-compressed.gif" '
        'ALT="[FILE]"></A> '
        '<A HREF="gnome-gpg-0.5.0.tar.bz2">gnome-gpg-0.5.0.tar.bz2</A>'
        '. . . . . Sep 02 06:58 68K '
        '<A HREF="gnome-gpg-0.5.0.tar.bz2;type=i"><IMG border="0" '
        'SRC="http://squid:3128/squid-internal-static/icons/anthony-box.gif" '
        'ALT="[DOWNLOAD]"></A> '
        '<A HREF="gnome-gpg-0.5.0.tar.gz"><IMG border="0" '
        'SRC="http://squid:3128/squid-internal-static/icons/anthony-tar.gif" '
        'ALT="[FILE]"></A> '
        '<A HREF="gnome-gpg-0.5.0.tar.gz">gnome-gpg-0.5.0.tar.gz</A> '
        '. . . . . Sep 02 06:58 81K '
        '<A HREF="gnome-gpg-0.5.0.tar.gz;type=i"><IMG border="0" '
        'SRC="http://squid:3128/squid-internal-static/icons/anthony-box.gif" '
        'ALT="[DOWNLOAD]"></A> '
        '</PRE> <HR noshade size="1px"> <ADDRESS> '
        'Generated Wed, 06 Sep 2006 11:04:02 GMT by squid '
        '(squid/2.5.STABLE12) </ADDRESS></BODY></HTML> '
    )
    base_url = 'ftp://ftp.gnome.org/pub/GNOME/sources/gnome-gpg/0.5/'
    wanted_files = [
        'LATEST-IS-0.5.0',
        'gnome-gpg-0.5.0.md5sum',
        'gnome-gpg-0.5.0.tar.bz2',
        'gnome-gpg-0.5.0.tar.gz',
    ]

    # Only the listing itself is registered; no per-file HEAD responses
    # are set up for this ftp:// URL.
    responses.add('GET', base_url, body=page)

    lister = HTTPWalker(base_url, logging.getLogger())
    found_dirs, found_files = lister.list(
        '/pub/GNOME/sources/gnome-gpg/0.5/')

    self.assertEqual(found_dirs, [])
    self.assertEqual(found_files, wanted_files)
def testApacheListing(self):
    # Check that list() handles a standard Apache directory index page.
    #
    # The fixture includes column-sorting links ("?C=N;O=D" etc.), a
    # parent directory link and a mailto: link, none of which are
    # expected to appear in the result.
    page = (
        ' <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"> '
        '<html> <head> '
        '<title>Index of /pub/GNOME/sources/gnome-gpg/0.5</title> '
        '</head> <body> '
        '<h1>Index of /pub/GNOME/sources/gnome-gpg/0.5</h1> '
        '<pre><img src="/icons/blank.gif" alt="Icon " width="24" height="24"> '
        '<a href="?C=N;O=D">Name</a> '
        '<a href="?C=M;O=A">Last modified</a> '
        '<a href="?C=S;O=A">Size</a> '
        '<a href="?C=D;O=A">Description</a><hr>'
        '<a href="/pub/GNOME/sources/gnome-gpg/">'
        '<img src="http://www.gnome.org/img/24x24/parent.png" alt="[DIR]" '
        'width="24" height="24"></a> '
        '<a href="/pub/GNOME/sources/gnome-gpg/">Parent Directory</a> - '
        '<a href="LATEST-IS-0.5.0">'
        '<img src="http://www.gnome.org/img/24x24/default.png" alt="[ ]" '
        'width="24" height="24"></a> '
        '<a href="LATEST-IS-0.5.0">LATEST-IS-0.5.0</a> 02-Sep-2006 08:58 81K '
        '<a href="gnome-gpg-0.5.0.md5sum">'
        '<img src="http://www.gnome.org/img/24x24/default.png" alt="[ ]" '
        'width="24" height="24"></a> '
        '<a href="gnome-gpg-0.5.0.md5sum">gnome-gpg-0.5.0.md5sum</a> '
        '02-Sep-2006 08:58 115 '
        '<a href="gnome-gpg-0.5.0.tar.bz2">'
        '<img src="http://www.gnome.org/img/24x24/archive.png" alt="[ ]" '
        'width="24" height="24"></a> '
        '<a href="gnome-gpg-0.5.0.tar.bz2">gnome-gpg-0.5.0.tar.bz2</a> '
        '02-Sep-2006 08:58 68K '
        '<a href="gnome-gpg-0.5.0.tar.gz">'
        '<img src="http://www.gnome.org/img/24x24/archive.png" alt="[ ]" '
        'width="24" height="24"></a> '
        '<a href="gnome-gpg-0.5.0.tar.gz">gnome-gpg-0.5.0.tar.gz</a> '
        '02-Sep-2006 08:58 81K '
        '<hr></pre> '
        '<address>Apache/2.2.3 (Unix) Server at '
        '<a href="mailto:[email protected]">ftp.acc.umu.se</a> Port 80</address> '
        '</body></html> '
    )
    base_url = 'http://ftp.gnome.org/pub/GNOME/sources/gnome-gpg/0.5/'
    wanted_files = [
        'LATEST-IS-0.5.0',
        'gnome-gpg-0.5.0.md5sum',
        'gnome-gpg-0.5.0.tar.bz2',
        'gnome-gpg-0.5.0.tar.gz',
    ]

    responses.add('GET', base_url, body=page)
    # Register a HEAD response for every file expected in the result.
    for name in wanted_files:
        responses.add('HEAD', base_url + name)

    lister = HTTPWalker(base_url, logging.getLogger())
    found_dirs, found_files = lister.list(
        '/pub/GNOME/sources/gnome-gpg/0.5/')

    self.assertEqual(found_dirs, [])
    self.assertEqual(found_files, wanted_files)
def testGarbageListing(self):
    # Check that a response body of arbitrary junk bytes does not upset
    # the directory lister: it should simply report nothing.
    junk = b'\x01\x02\x03\x00\xff\xf2\xablkjsdflkjsfkljfds'
    base_url = 'http://example.com/foo/'
    responses.add('GET', base_url, body=junk)

    lister = HTTPWalker(base_url, logging.getLogger())
    found_dirs, found_files = lister.list('/foo/')

    self.assertEqual(found_dirs, [])
    self.assertEqual(found_files, [])
def testNamedAnchors(self):
    # Check that the listing parser copes with named anchors, i.e. <a>
    # tags that carry a name attribute but no href.
    #
    # The fixture also has a "#top" fragment link, which is not expected
    # to appear in the result.
    page = (
        ' <html> <head> <title>Listing</title> </head> <body> '
        '<a name="top"></a> '
        '<pre> '
        '<a href="file1">file1</a> '
        '<a href="dir1/">dir1/</a> '
        '<a href="#top">Go to top</a> '
        '</pre> </html> '
    )
    base_url = 'http://example.com/foo/'
    responses.add('GET', base_url, body=page)
    responses.add('HEAD', base_url + 'file1')

    lister = HTTPWalker(base_url, logging.getLogger())
    found_dirs, found_files = lister.list('/foo/')

    self.assertEqual(found_dirs, ['dir1/'])
    self.assertEqual(found_files, ['file1'])