def page_to_version(url, cabinet_id, archive_id, page_key, *, agency, site):
    """
    Obtain URI, timestamp, metadata, hash, and title and return a Version.

    Parameters
    ----------
    url : string
        page URL, passed through to the Version
    cabinet_id, archive_id, page_key
        Identifiers handed to ``file_command_uri``, ``get_file_metadata`` and
        ``get_file`` to locate the stored page. ``archive_id`` must be
        convertible with ``int()``; it is interpreted as a POSIX timestamp.
    agency : string
        passed through to ``format_version``
    site : string
        passed through to ``format_version``

    Returns
    -------
    The value returned by ``format_version`` for this page.
    """
    uri = file_command_uri(cabinet_id, archive_id, page_key, 'file')
    # NOTE(review): fromtimestamp() without a tz argument produces a naive
    # datetime in the machine's local time zone -- confirm that is intended.
    dt = datetime.fromtimestamp(int(archive_id))
    metadata = get_file_metadata(cabinet_id, archive_id, page_key)
    content = get_file(cabinet_id, archive_id, page_key)
    version_hash = utils.hash_content(content)

    # Sniff whether this is text and, if so, what the encoding is.
    # PF provides its own 'ContentType' key, mapped to a string,
    # not to be confused with 'Content-Type' in the Header, mapped to a list.
    content_type = metadata['file']['ContentType']
    is_text = content_type.startswith('text/html')
    if is_text:
        # Take only the charset token itself: drop any parameters that follow
        # it (e.g. "charset=utf-8; foo=bar") and any surrounding quotes, so a
        # clean codec name is handed to extract_title.
        _, _, charset = content_type.partition('charset=')
        encoding = charset.split(';', 1)[0].strip().strip('"\'')
        if not encoding:
            encoding = 'utf-8'  # best effort
        title = utils.extract_title(content, encoding)
    else:
        title = ''

    return format_version(url=url, dt=dt, uri=uri,
                          version_hash=version_hash, title=title,
                          agency=agency, site=site, metadata=metadata)
def timestamped_uri_to_version(self, dt, uri, *, url, maintainers=None,
                               tags=None, view_url=None):
    """
    Fetch version content and combine it with metadata to build a Version.

    Parameters
    ----------
    dt : datetime.datetime
        capture time
    uri : string
        URI of version
    url : string
        page URL
    maintainers : list of string, optional
        Entities responsible for maintaining the page, as a list of strings
    tags : list of string, optional
        Any arbitrary "tags" to apply to the page for categorization
    view_url : string, optional
        The archive.org URL for viewing the page (with rewritten links, etc.)

    Returns
    -------
    dict : Version suitable for passing to :class:`Client.add_versions`
    """
    res = self.get_memento(uri, exact_redirects=False)
    version_hash = utils.hash_content(res.content)
    title = utils.extract_title(res.content)
    # Use .get() so a response without a Content-Type header yields an empty
    # MIME type instead of raising KeyError.
    content_type = (res.headers.get('content-type') or '').split(';', 1)

    # Get all headers from original response
    prefix = 'X-Archive-Orig-'
    original_headers = {
        k[len(prefix):]: v for k, v in res.headers.items()
        if k.startswith(prefix)
    }

    # Redirect information is only recorded when the final response URL
    # differs from the URI that was requested.
    redirected_url = None
    redirects = None
    if res.url != uri:
        redirected_url = original_url_for_memento(res.url)
        redirects = [original_url_for_memento(response.url)
                     for response in res.history]
        redirects.append(redirected_url)

    return format_version(url=url, dt=dt, uri=uri,
                          version_hash=version_hash, title=title,
                          tags=tags, maintainers=maintainers,
                          status=res.status_code,
                          mime_type=content_type[0],
                          encoding=res.encoding,
                          headers=original_headers,
                          view_url=view_url,
                          redirected_url=redirected_url,
                          redirects=redirects)
def test_extract_title_handles_whitespace():
    """Whitespace around the text inside <title> is not part of the result."""
    markup = b'''<html> <head> <meta charset="utf-8"> <title> THIS IS THE TITLE </title> </head> <body>Blah</body> </html>'''
    expected = 'THIS IS THE TITLE'
    assert extract_title(markup) == expected
def timestamped_uri_to_version(dt, uri, *, url, site, agency):
    """
    Obtain hash and title and return a Version.

    Parameters
    ----------
    dt : datetime.datetime
        capture time, passed through to the Version
    uri : string
        URI that is fetched to obtain the version content
    url : string
        page URL, passed through to the Version
    site : string
        passed through to ``format_version``
    agency : string
        passed through to ``format_version``

    Returns
    -------
    The value returned by ``format_version`` for this capture.

    Raises
    ------
    requests.HTTPError
        If the response for ``uri`` has an error (4xx/5xx) status.
    """
    res = requests.get(uri)
    # An explicit check rather than `assert res.ok`: assertions are removed
    # when Python runs with -O, which would let error pages be hashed and
    # recorded as if they were real versions.
    res.raise_for_status()

    version_hash = utils.hash_content(res.content)
    title = utils.extract_title(res.content)
    return format_version(url=url, dt=dt, uri=uri,
                          version_hash=version_hash, title=title,
                          agency=agency, site=site)
def timestamped_uri_to_version(self, dt, uri, *, url, maintainers=None,
                               tags=None, view_url=None):
    """
    Fetch version content and combine it with metadata to build a Version.

    Parameters
    ----------
    dt : datetime.datetime
        capture time
    uri : string
        URI of version
    url : string
        page URL
    maintainers : list of string, optional
        Entities responsible for maintaining the page, as a list of strings
    tags : list of string, optional
        Any arbitrary "tags" to apply to the page for categorization
    view_url : string, optional
        The archive.org URL for viewing the page (with rewritten links, etc.)

    Returns
    -------
    dict : Version suitable for passing to :class:`Client.add_versions`

    Raises
    ------
    MementoPlaybackError
        If the first response has no ``memento-datetime`` header and either
        carries an ``X-Archive-Wayback-Runtime-Error`` header or is otherwise
        a successful response.
    """
    with utils.rate_limited(group='timestamped_uri_to_version'):
        # Check to make sure we are actually getting a memento playback.
        res = utils.retryable_request('GET', uri, allow_redirects=False,
                                      session=self.session)
        if res.headers.get('memento-datetime') is None:
            message = res.headers.get('X-Archive-Wayback-Runtime-Error')
            if message:
                raise MementoPlaybackError(
                    f'Memento at {uri} could not be played: {message}')
            elif res.ok:
                raise MementoPlaybackError(
                    f'Memento at {uri} could not be played')
            else:
                res.raise_for_status()

        # If the playback includes a redirect, continue on.
        if 300 <= res.status_code < 400:
            original = res
            res = utils.retryable_request('GET',
                                          res.headers.get('location'),
                                          session=self.session)
            # Put the first (redirect) response at the front of the history
            # and keep its request on the final response.
            res.history.insert(0, original)
            res.request = original.request

    version_hash = utils.hash_content(res.content)
    title = utils.extract_title(res.content)
    # Use .get() so a response without a Content-Type header yields an empty
    # MIME type instead of raising KeyError.
    content_type = (res.headers.get('content-type') or '').split(';', 1)

    # Get all headers from original response
    prefix = 'X-Archive-Orig-'
    original_headers = {
        k[len(prefix):]: v for k, v in res.headers.items()
        if k.startswith(prefix)
    }

    # Redirect information is only recorded when the final response URL
    # differs from the URI that was requested.
    redirected_url = None
    redirects = None
    if res.url != uri:
        redirected_url = original_url_for_memento(res.url)
        redirects = [original_url_for_memento(response.url)
                     for response in res.history]
        redirects.append(redirected_url)

    return format_version(url=url, dt=dt, uri=uri,
                          version_hash=version_hash, title=title,
                          tags=tags, maintainers=maintainers,
                          status=res.status_code,
                          mime_type=content_type[0],
                          encoding=res.encoding,
                          headers=original_headers,
                          view_url=view_url,
                          redirected_url=redirected_url,
                          redirects=redirects)
def timestamped_uri_to_version(dt, uri, *, url, agency, site, view_url=None):
    """
    Fetch version content and combine it with metadata to build a Version.

    Parameters
    ----------
    dt : datetime.datetime
        capture time
    uri : string
        URI of version
    url : string
        page URL
    agency : string
        primer metadata (likely to change in the future)
    site : string
        primer metadata (likely to change in the future)
    view_url : string, optional
        The archive.org URL for viewing the page (with rewritten links, etc.)

    Returns
    -------
    dict : Version suitable for passing to :class:`Client.add_versions`

    Raises
    ------
    requests.HTTPError
        If the response has an error status and no ``memento-datetime``
        header.
    """
    res = requests.get(uri)
    # IA's memento server responds with the status of the original request, so
    # use the presence of the 'Memento-Datetime' header to determine if we
    # should use the response or there was an actual error.
    if not res.ok and not res.headers.get('memento-datetime'):
        res.raise_for_status()

    version_hash = utils.hash_content(res.content)
    title = utils.extract_title(res.content)
    # Use .get() so a response without a Content-Type header yields an empty
    # MIME type instead of raising KeyError.
    content_type = (res.headers.get('content-type') or '').split(';', 1)

    # Get all headers from original response
    prefix = 'X-Archive-Orig-'
    original_headers = {
        k[len(prefix):]: v for k, v in res.headers.items()
        if k.startswith(prefix)
    }

    # Redirect information is only recorded when the final response URL
    # differs from the URI that was requested.
    redirected_url = None
    redirects = None
    if res.url != uri:
        redirected_url = original_url_for_memento(res.url)
        redirects = [original_url_for_memento(response.url)
                     for response in res.history]
        redirects.append(redirected_url)

    return format_version(url=url, dt=dt, uri=uri,
                          version_hash=version_hash, title=title,
                          agency=agency, site=site,
                          status=res.status_code,
                          mime_type=content_type[0],
                          encoding=res.encoding,
                          headers=original_headers,
                          view_url=view_url,
                          redirected_url=redirected_url,
                          redirects=redirects)
def test_extract_title():
    """The text of the <title> element is returned for a simple page."""
    markup = b'''<html> <head><title>THIS IS THE TITLE</title></head> <body>Blah</body> </html>'''
    expected = 'THIS IS THE TITLE'
    assert extract_title(markup) == expected
def test_extract_title_from_titleless_page():
    """A page with no <title> element yields an empty string."""
    markup = b'''<html> <head><meta charset="utf-8"></head> <body>Blah</body> </html>'''
    expected = ''
    assert extract_title(markup) == expected