def page_to_version(url, cabinet_id, archive_id, page_key, *, agency, site):
    """
    Obtain URI, timestamp, metadata, hash, and title and return a Version.

    Parameters
    ----------
    url : string
        page URL, passed through to ``format_version``
    cabinet_id, archive_id, page_key
        Identifiers handed to ``file_command_uri``, ``get_file_metadata`` and
        ``get_file``. ``archive_id`` must be convertible with ``int()``; it is
        interpreted as a POSIX timestamp for the capture time.
    agency, site
        passed through to ``format_version`` unchanged

    Returns
    -------
    The value returned by ``format_version``.
    """
    uri = file_command_uri(cabinet_id, archive_id, page_key, 'file')
    # NOTE(review): fromtimestamp() without a tz argument produces a naive
    # datetime in the machine's local timezone -- confirm that is intended.
    dt = datetime.fromtimestamp(int(archive_id))
    metadata = get_file_metadata(cabinet_id, archive_id, page_key)
    content = get_file(cabinet_id, archive_id, page_key)
    version_hash = utils.hash_content(content)

    # Sniff whether this is text and, if so, what the encoding is.
    # PF provides its own 'ContentType' key, mapped to a string,
    # not to be confused with 'Content-Type' in the Header, mapped to a list.
    content_type = metadata['file']['ContentType']
    title = ''
    if content_type.startswith('text/html'):
        encoding = 'utf-8'  # best effort
        # partition() never raises, unlike unpacking the result of split(),
        # which fails if 'charset=' occurs more than once.
        _, found, charset = content_type.partition('charset=')
        if found:
            # Drop any parameters that follow the charset and any quoting
            # around the value, e.g. 'text/html; charset="utf-8"; x=y'.
            charset = charset.split(';', 1)[0].strip().strip('"\'')
            if charset:
                encoding = charset
        title = utils.extract_title(content, encoding)

    return format_version(url=url,
                          dt=dt,
                          uri=uri,
                          version_hash=version_hash,
                          title=title,
                          agency=agency,
                          site=site,
                          metadata=metadata)
예제 #2
0
    def timestamped_uri_to_version(self, dt, uri, *, url,
                                   maintainers=None, tags=None, view_url=None):
        """
        Fetch version content and combine it with metadata to build a Version.

        Parameters
        ----------
        dt : datetime.datetime
            capture time
        uri : string
            URI of version
        url : string
            page URL
        maintainers : list of string, optional
            Entities responsible for maintaining the page, as a list of strings
        tags : list of string, optional
            Any arbitrary "tags" to apply to the page for categorization
        view_url : string, optional
            The archive.org URL for viewing the page (with rewritten links, etc.)

        Returns
        -------
        dict : Version
            suitable for passing to :class:`Client.add_versions`
        """
        res = self.get_memento(uri, exact_redirects=False)
        version_hash = utils.hash_content(res.content)
        title = utils.extract_title(res.content)
        # Use .get() so a response without a Content-Type header yields an
        # empty MIME type instead of raising KeyError.
        content_type = (res.headers.get('content-type') or '').split(';', 1)

        # Get all headers from original response. HTTP field names are
        # case-insensitive, so match the prefix without regard to case.
        prefix = 'x-archive-orig-'
        original_headers = {
            k[len(prefix):]: v for k, v in res.headers.items()
            if k.lower().startswith(prefix)
        }

        redirected_url = None
        redirects = None
        if res.url != uri:
            # The final URL differs from the requested one: record the chain
            # of original URLs from the response history, ending with the
            # final URL.
            redirected_url = original_url_for_memento(res.url)
            redirects = [original_url_for_memento(response.url)
                         for response in res.history]
            redirects.append(redirected_url)

        return format_version(url=url, dt=dt, uri=uri,
                              version_hash=version_hash, title=title,
                              tags=tags, maintainers=maintainers,
                              status=res.status_code,
                              mime_type=content_type[0], encoding=res.encoding,
                              headers=original_headers, view_url=view_url,
                              redirected_url=redirected_url,
                              redirects=redirects)
def test_extract_title_handles_whitespace():
    """A title spread over several lines comes back with single spaces."""
    markup = b'''<html>
        <head>
            <meta charset="utf-8">
            <title>

                THIS IS
                THE  TITLE
            </title>
        </head>
        <body>Blah</body>
    </html>'''
    expected = 'THIS IS THE TITLE'
    assert extract_title(markup) == expected
def timestamped_uri_to_version(dt, uri, *, url, site, agency):
    """
    Obtain hash and title and return a Version.

    Parameters
    ----------
    dt : datetime.datetime
        capture time, passed through to ``format_version``
    uri : string
        URI of version; fetched with ``requests.get``
    url : string
        page URL
    site, agency
        passed through to ``format_version`` unchanged

    Returns
    -------
    The value returned by ``format_version``.

    Raises
    ------
    requests.HTTPError
        if the response has a 4xx or 5xx status
    """
    res = requests.get(uri)
    # Raise explicitly rather than `assert res.ok`: assert statements are
    # removed when Python runs with -O, which would let error pages be hashed
    # and stored as if they were real versions.
    res.raise_for_status()
    version_hash = utils.hash_content(res.content)
    title = utils.extract_title(res.content)
    return format_version(url=url,
                          dt=dt,
                          uri=uri,
                          version_hash=version_hash,
                          title=title,
                          agency=agency,
                          site=site)
예제 #5
0
    def timestamped_uri_to_version(self,
                                   dt,
                                   uri,
                                   *,
                                   url,
                                   maintainers=None,
                                   tags=None,
                                   view_url=None):
        """
        Fetch version content and combine it with metadata to build a Version.

        Parameters
        ----------
        dt : datetime.datetime
            capture time
        uri : string
            URI of version
        url : string
            page URL
        maintainers : list of string, optional
            Entities responsible for maintaining the page, as a list of strings
        tags : list of string, optional
            Any arbitrary "tags" to apply to the page for categorization
        view_url : string, optional
            The archive.org URL for viewing the page (with rewritten links, etc.)

        Returns
        -------
        dict : Version
            suitable for passing to :class:`Client.add_versions`

        Raises
        ------
        MementoPlaybackError
            if the response has no ``memento-datetime`` header and either
            carries an ``X-Archive-Wayback-Runtime-Error`` message or is
            otherwise an OK response
        """
        with utils.rate_limited(group='timestamped_uri_to_version'):
            # Check to make sure we are actually getting a memento playback.
            res = utils.retryable_request('GET',
                                          uri,
                                          allow_redirects=False,
                                          session=self.session)
            if res.headers.get('memento-datetime') is None:
                message = res.headers.get('X-Archive-Wayback-Runtime-Error')
                if message:
                    raise MementoPlaybackError(
                        f'Memento at {uri} could not be played: {message}')
                elif res.ok:
                    raise MementoPlaybackError(
                        f'Memento at {uri} could not be played')
                else:
                    res.raise_for_status()

            # If the playback includes a redirect, continue on.
            if 300 <= res.status_code < 400:
                original = res
                res = utils.retryable_request('GET',
                                              res.headers.get('location'),
                                              session=self.session)
                # The first request was made with redirects disabled, so
                # splice it into the followed response's history by hand and
                # keep the original request attached to the result.
                res.history.insert(0, original)
                res.request = original.request

        version_hash = utils.hash_content(res.content)
        title = utils.extract_title(res.content)
        # Use .get() so a response without a Content-Type header yields an
        # empty MIME type instead of raising KeyError.
        content_type = (res.headers.get('content-type') or '').split(';', 1)

        # Get all headers from original response. HTTP field names are
        # case-insensitive, so match the prefix without regard to case.
        prefix = 'x-archive-orig-'
        original_headers = {
            k[len(prefix):]: v
            for k, v in res.headers.items() if k.lower().startswith(prefix)
        }

        redirected_url = None
        redirects = None
        if res.url != uri:
            # The final URL differs from the requested one: record the chain
            # of original URLs from the response history, ending with the
            # final URL.
            redirected_url = original_url_for_memento(res.url)
            redirects = [original_url_for_memento(response.url)
                         for response in res.history]
            redirects.append(redirected_url)

        return format_version(url=url,
                              dt=dt,
                              uri=uri,
                              version_hash=version_hash,
                              title=title,
                              tags=tags,
                              maintainers=maintainers,
                              status=res.status_code,
                              mime_type=content_type[0],
                              encoding=res.encoding,
                              headers=original_headers,
                              view_url=view_url,
                              redirected_url=redirected_url,
                              redirects=redirects)
def timestamped_uri_to_version(dt, uri, *, url, agency, site, view_url=None):
    """
    Fetch version content and combine it with metadata to build a Version.

    Parameters
    ----------
    dt : datetime.datetime
        capture time
    uri : string
        URI of version
    url : string
        page URL
    agency : string
        primer metadata (likely to change in the future)
    site : string
        primer metadata (likely to change in the future)
    view_url : string, optional
        The archive.org URL for viewing the page (with rewritten links, etc.)

    Returns
    -------
    dict : Version
        suitable for passing to :class:`Client.add_versions`

    Raises
    ------
    requests.HTTPError
        if the response is an error status and carries no
        ``memento-datetime`` header
    """
    res = requests.get(uri)

    # IA's memento server responds with the status of the original request, so
    # use the presence of the 'Memento-Datetime' header to determine if we
    # should use the response or there was an actual error.
    if not res.ok and not res.headers.get('memento-datetime'):
        res.raise_for_status()

    version_hash = utils.hash_content(res.content)
    title = utils.extract_title(res.content)
    # Use .get() so a response without a Content-Type header yields an empty
    # MIME type instead of raising KeyError.
    content_type = (res.headers.get('content-type') or '').split(';', 1)

    # Get all headers from original response. HTTP field names are
    # case-insensitive, so match the prefix without regard to case.
    prefix = 'x-archive-orig-'
    original_headers = {
        k[len(prefix):]: v
        for k, v in res.headers.items() if k.lower().startswith(prefix)
    }

    redirected_url = None
    redirects = None
    if res.url != uri:
        # The final URL differs from the requested one: record the chain of
        # original URLs from the response history, ending with the final URL.
        redirected_url = original_url_for_memento(res.url)
        redirects = [original_url_for_memento(response.url)
                     for response in res.history]
        redirects.append(redirected_url)

    return format_version(url=url,
                          dt=dt,
                          uri=uri,
                          version_hash=version_hash,
                          title=title,
                          agency=agency,
                          site=site,
                          status=res.status_code,
                          mime_type=content_type[0],
                          encoding=res.encoding,
                          headers=original_headers,
                          view_url=view_url,
                          redirected_url=redirected_url,
                          redirects=redirects)
def test_extract_title():
    """A simple single-line <title> is returned verbatim."""
    markup = b'''<html>
        <head><title>THIS IS THE TITLE</title></head>
        <body>Blah</body>
    </html>'''
    expected = 'THIS IS THE TITLE'
    assert extract_title(markup) == expected
def test_extract_title_from_titleless_page():
    """A page with no <title> element yields an empty string."""
    markup = b'''<html>
        <head><meta charset="utf-8"></head>
        <body>Blah</body>
    </html>'''
    extracted = extract_title(markup)
    assert extracted == ''