def test_filenaming(self):
    """
    Round-trip a URL and timestamp through archive filename creation
    and parsing, and verify malformed names are rejected.
    """
    timestamp = datetime.now()
    archive_name = storytracker.create_archive_filename(self.url, timestamp)
    parsed_url, parsed_timestamp = storytracker.reverse_archive_filename(
        archive_name
    )
    self.assertEqual(parsed_url, self.url)
    self.assertEqual(parsed_timestamp, timestamp)
    # A name that does not follow the archive convention should raise
    with self.assertRaises(storytracker.ArchiveFileNameError):
        storytracker.reverse_archive_filename("foo.bar")
def open_archive_filepath(path):
    """
    Accepts a file path and returns an ArchivedURL object

    Gzipped archives (``*.gz``) are decompressed transparently; any other
    file is read as raw bytes. The payload is decoded as UTF-8.
    """
    # Split the file extension from the name
    name = os.path.basename(path)
    name, ext = os.path.splitext(name)
    # Extract the URL and timestamp from the file name
    url, timestamp = storytracker.reverse_archive_filename(name)
    # If it is gzipped, then open it that way
    if ext == '.gz':
        # Context manager ensures the handle is closed (the original
        # version leaked the open file object)
        with gzip.open(path) as obj:
            html = obj.read().decode("utf-8")
        return ArchivedURL(
            url,
            timestamp,
            html,
            gzip_archive_path=path
        )
    # Otherwise handle it normally
    else:
        with open(path, "rb") as obj:
            html = obj.read().decode("utf-8")
        return ArchivedURL(
            url,
            timestamp,
            html,
            html_archive_path=path
        )
def open_pastpages_url(url, **kwargs):
    """
    Accepts an URL from PastPages and returns an ArchivedURL object
    if there is an HTML archive

    Raises ValueError when the PastPages API reports no HTML archive
    for the requested screenshot.
    """
    # Break out the unique ID for the page from the URL
    id_ = url.split(
        "http://www.pastpages.org/screenshot/"
    )[1].replace("/", "")
    # Use that to request the HTML archive url from the API
    html_url = requests.get(
        "http://www.pastpages.org/api/beta/screenshots/%s/" % id_
    ).json()['html']
    # Fail with a clear message when there is no HTML archive, rather
    # than crashing with an AttributeError on the None/empty value below
    if not html_url:
        raise ValueError("No HTML archive found for %s" % url)
    # Extract the URL and timestamp from the url
    html_filename = html_url.split("/html/")[1].replace(".gz", "")
    archive_url, timestamp = storytracker.reverse_archive_filename(
        html_filename
    )
    # Get the archived HTML data
    gzipped = requests.get(html_url).content
    html = gzip.GzipFile(fileobj=BytesIO(gzipped)).read().decode("utf-8")
    # Pass it all back
    return ArchivedURL(archive_url, timestamp, html)
def open_pastpages_url(url, **kwargs):
    """
    Accepts an URL from PastPages and returns an ArchivedURL object
    if there is an HTML archive
    """
    # Pull the screenshot's unique ID out of the page URL
    screenshot_prefix = "http://www.pastpages.org/screenshot/"
    id_ = url.split(screenshot_prefix)[1].replace("/", "")
    # Ask the API where this screenshot's HTML archive lives
    api_url = "http://www.pastpages.org/api/beta/screenshots/%s/" % id_
    html_url = requests.get(api_url).json()['html']
    # Recover the original URL and timestamp encoded in the archive name
    html_filename = html_url.split("/html/")[1].replace(".gz", "")
    archive_url, timestamp = storytracker.reverse_archive_filename(
        html_filename
    )
    # Download and decompress the archived HTML payload
    response = requests.get(html_url)
    buffer_ = BytesIO(response.content)
    html = gzip.GzipFile(fileobj=buffer_).read().decode("utf-8")
    # Bundle everything into an ArchivedURL
    return ArchivedURL(archive_url, timestamp, html)
def open_archive_filepath(path):
    """
    Accepts a file path and returns an ArchivedURL object
    """
    # Split the file extension from the name
    name = os.path.basename(path)
    name, ext = os.path.splitext(name)
    # Extract the URL and timestamp from the file name
    url, timestamp = storytracker.reverse_archive_filename(name)
    # Pick the right opener for gzipped versus plain archives
    is_gzipped = ext == '.gz'
    # Read inside a context manager so the handle is always closed
    # (the original version never closed the file object)
    if is_gzipped:
        with gzip.open(path) as obj:
            html = obj.read().decode("utf-8")
        return ArchivedURL(url, timestamp, html, gzip_archive_path=path)
    with open(path, "rb") as obj:
        html = obj.read().decode("utf-8")
    return ArchivedURL(url, timestamp, html, html_archive_path=path)