예제 #1
0
def test_separate_rate_limited_groups_do_not_affect_each_other():
    """Check that rate limits for different groups do not delay each other.

    Enters ``rate_limited`` four times, alternating between groups ``'a'``
    and ``'b'`` at 2 calls per second each, and asserts that the whole
    sequence takes more than 0.5s but less than 0.55s.
    """
    # Function-scope import: perf_counter() is monotonic and high resolution,
    # so the measured duration cannot be skewed by wall-clock adjustments
    # (unlike differences between datetime.utcnow() readings).
    import time

    start = time.perf_counter()

    for group in ('a', 'b', 'a', 'b'):
        with rate_limited(calls_per_second=2, group=group):
            pass

    elapsed = time.perf_counter() - start
    # NOTE(review): the upper bound presumably reflects one wait per group
    # rather than waits accumulating across groups -- confirm against
    # rate_limited's implementation.
    assert elapsed > 0.5
    assert elapsed < 0.55
예제 #2
0
def test_rate_limited():
    """Check that two back-to-back rate-limited calls take over 0.5 seconds.

    Enters ``rate_limited(calls_per_second=2)`` twice in a row and asserts
    that the total elapsed time exceeds 0.5s.
    """
    # Function-scope import: perf_counter() is monotonic and high resolution,
    # so the measured duration cannot be skewed by wall-clock adjustments
    # (unlike differences between datetime.utcnow() readings).
    import time

    start = time.perf_counter()
    for _ in range(2):
        with rate_limited(calls_per_second=2):
            pass
    elapsed = time.perf_counter() - start
    assert elapsed > 0.5
예제 #3
0
    def timestamped_uri_to_version(self,
                                   dt,
                                   uri,
                                   *,
                                   url,
                                   maintainers=None,
                                   tags=None,
                                   view_url=None):
        """
        Fetch version content and combine it with metadata to build a Version.

        The memento is first requested without following redirects so the
        response can be checked for a ``Memento-Datetime`` header. If that
        response is a 3xx redirect, its ``Location`` target is then fetched
        and the redirect response is recorded at the front of the final
        response's history.

        Parameters
        ----------
        dt : datetime.datetime
            capture time
        uri : string
            URI of version
        url : string
            page URL
        maintainers : list of string, optional
            Entities responsible for maintaining the page, as a list of strings
        tags : list of string, optional
            Any arbitrary "tags" to apply to the page for categorization
        view_url : string, optional
            The archive.org URL for viewing the page (with rewritten links, etc.)

        Returns
        -------
        dict : Version
            suitable for passing to :class:`Client.add_versions`

        Raises
        ------
        MementoPlaybackError
            If the first response has no ``Memento-Datetime`` header and
            either carries an ``X-Archive-Wayback-Runtime-Error`` header or
            has an ``ok`` status.
        Exception
            Whatever ``res.raise_for_status()`` raises when the first response
            has no ``Memento-Datetime`` header, no runtime error header, and a
            non-``ok`` status.
        """
        with utils.rate_limited(group='timestamped_uri_to_version'):
            # Check to make sure we are actually getting a memento playback.
            res = utils.retryable_request('GET',
                                          uri,
                                          allow_redirects=False,
                                          session=self.session)
            if res.headers.get('memento-datetime') is None:
                message = res.headers.get('X-Archive-Wayback-Runtime-Error')
                if message:
                    raise MementoPlaybackError(
                        f'Memento at {uri} could not be played: {message}')
                elif res.ok:
                    raise MementoPlaybackError(
                        f'Memento at {uri} could not be played')
                else:
                    res.raise_for_status()

            # If the playback includes a redirect, continue on.
            if 300 <= res.status_code < 400:
                original = res
                res = utils.retryable_request('GET',
                                              res.headers.get('location'),
                                              session=self.session)
                # Keep the redirect itself as the first history entry and
                # report the request that was originally made.
                res.history.insert(0, original)
                res.request = original.request

        version_hash = utils.hash_content(res.content)
        title = utils.extract_title(res.content)
        # Use .get() so a response with no Content-Type header falls back to
        # an empty string instead of raising KeyError.
        content_type = (res.headers.get('content-type') or '').split(';', 1)

        # Get all headers from original response. HTTP header names are
        # case-insensitive, so match the prefix regardless of case while
        # keeping the remainder of the name exactly as it was received.
        prefix = 'x-archive-orig-'
        original_headers = {
            k[len(prefix):]: v
            for k, v in res.headers.items()
            if k.lower().startswith(prefix)
        }

        redirected_url = None
        redirects = None
        if res.url != uri:
            redirected_url = original_url_for_memento(res.url)
            redirects = [
                original_url_for_memento(response.url)
                for response in res.history
            ]
            redirects.append(redirected_url)

        return format_version(url=url,
                              dt=dt,
                              uri=uri,
                              version_hash=version_hash,
                              title=title,
                              tags=tags,
                              maintainers=maintainers,
                              status=res.status_code,
                              mime_type=content_type[0],
                              encoding=res.encoding,
                              headers=original_headers,
                              view_url=view_url,
                              redirected_url=redirected_url,
                              redirects=redirects)
예제 #4
0
    def get_memento(self,
                    url,
                    exact=True,
                    exact_redirects=None,
                    target_window=24 * 60 * 60):
        """
        Fetch a memento from the Wayback Machine. This retrieves the content
        that was ultimately returned from a memento, following any redirects
        that were present at the time the memento was captured. (That is, if
        `http://example.com/a` redirected to `http://example.com/b`, this
        returns the memento for `/b` when you request `/a`.)

        Parameters
        ----------
        url : string
            URL of memento in Wayback (e.g.
            `http://web.archive.org/web/20180816111911id_/http://www.nws.noaa.gov/sp/`)
        exact : boolean, optional
            If false and the requested memento either doesn't exist or can't be
            played back, this returns the closest-in-time memento to the
            requested one, so long as it is within `target_window`.
            Default: True
        exact_redirects : boolean, optional
            If false and the requested memento is a redirect whose *target*
            doesn't exist or can't be played back, this returns the closest-
            in-time memento to the intended target, so long as it is within
            `target_window`. If unset, this will be the same as `exact`.
        target_window : int, optional
            Maximum number of seconds allowed between the timestamp in the
            requested `url` and the timestamp of a capture that Wayback
            redirects to. (Note that when `exact` is true, a non-memento
            redirect on the originally requested URL always raises a
            MementoPlaybackError, regardless of this window.)
            Defaults to 86,400 (24 hours).

        Returns
        -------
        requests.Response
            An HTTP response with the content of the memento, including a
            history of any redirects involved.

        Raises
        ------
        MementoPlaybackError
            If a non-memento response cannot be accepted as a playable
            redirect and either carries an `X-Archive-Wayback-Runtime-Error`
            header or has an `ok` status, or if the chain of redirects is
            circular.
        Exception
            Whatever `response.raise_for_status()` raises for an unplayable
            non-memento response with a non-`ok` status.
        """
        if exact_redirects is None:
            exact_redirects = exact

        with utils.rate_limited(calls_per_second=30, group='get_memento'):
            # Correctly following redirects is actually pretty complicated. In
            # the simplest case, a memento is a simple web page, and that's
            # no problem. However...
            #   1.  If the response was a >= 400 status, we have to determine
            #       whether that status is coming from the memento or from
            #       the Wayback Machine itself.
            #   2.  If the response was a 3xx status (a redirect) we have to
            #       determine the same thing, but it's a little more complex...
            #       a) If the redirect *is* the memento, its target may be an
            #          actual memento (see #1) or it may be a redirect (#2).
            #          The targeted URL is frequently captured anywhere from
            #          the same second to a few hours later, so it is likely
            #          the target will result in case 2b (below).
            #       b) If there is no memento for the requested time, but there
            #          are mementos for the same URL at another time, Wayback
            #          *may* redirect to that memento.
            #          - If this was on the original request, that's *not* ok
            #            because it means we're getting a different memento
            #            than we asked for.
            #          - If the redirect came from a URL that was the target
            #            of a memento redirect (2a), then this is expected.
            #            Before following the redirect, though, we first sanity
            #            check it to make sure the memento we are redirecting
            #            to actually came from nearby in time (sometimes
            #            Wayback will redirect to captures *months* away).
            history = []
            urls = set()
            previous_was_memento = False
            # Only the timestamp of the requested memento is needed below.
            _, original_date = memento_url_data(url)
            response = self.session.request('GET', url, allow_redirects=False)
            protocol_and_www = re.compile(r'^https?://(www\d?\.)?')
            while True:
                is_memento = 'Memento-Datetime' in response.headers

                if not is_memento:
                    # The exactness requirements for redirects from memento
                    # playbacks and non-playbacks is different -- even with
                    # strict matching, a memento that redirects to a non-
                    # memento is normal and ok; the target of a redirect will
                    # rarely have been captured at the same time as the
                    # redirect itself. (See 2b)
                    playable = False
                    if response.next and (
                        (not history and not exact) or
                        (history and
                         (previous_was_memento or not exact_redirects))):
                        current_url = original_url_for_memento(response.url)
                        target_url, target_date = memento_url_data(
                            response.next.url)
                        # A non-memento redirect is generally taking us to the
                        # closest-in-time capture of the same URL. Note that is
                        # NOT the next capture -- i.e. the one that would have
                        # been produced by an earlier memento redirect -- it's
                        # just the *closest* one. The first job here is to make
                        # sure it fits within our target window.
                        #
                        # Use total_seconds(): `timedelta.seconds` is only the
                        # sub-day remainder, so a capture that is days or
                        # months away would otherwise slip through this check.
                        time_gap = abs(target_date - original_date)
                        if time_gap.total_seconds() <= target_window:
                            # The redirect will point to the closest-in-time
                            # SURT URL, which will often not be an exact URL
                            # match. If we aren't looking for exact matches,
                            # then just assume wherever we're redirecting to is
                            # ok. Otherwise, try to sanity-check the URL.
                            if exact_redirects:
                                # FIXME: what should *really* happen here, if
                                # we want exactness, is a CDX search for the
                                # next-in-time capture of the exact URL we
                                # redirected to. I'm not totally sure how
                                # great that is (also it seems high overhead to
                                # do a search in the middle of this series of
                                # memento lookups), so just do a loose URL
                                # check for now.
                                current_nice_url = protocol_and_www.sub(
                                    '', current_url).casefold()
                                target_nice_url = protocol_and_www.sub(
                                    '', target_url).casefold()
                                playable = current_nice_url == target_nice_url
                            else:
                                playable = True

                    if not playable:
                        message = response.headers.get(
                            'X-Archive-Wayback-Runtime-Error')
                        if message:
                            raise MementoPlaybackError(
                                f'Memento at {url} could not be played: {message}'
                            )
                        elif response.ok:
                            raise MementoPlaybackError(
                                f'Memento at {url} could not be played')
                        else:
                            response.raise_for_status()

                if response.next:
                    previous_was_memento = is_memento
                    urls.add(response.url)
                    # Wayback sometimes has circular memento redirects ¯\_(ツ)_/¯
                    if response.next.url in urls:
                        raise MementoPlaybackError(
                            f'Memento at {url} is circular')

                    history.append(response)
                    response = self.session.send(response.next,
                                                 allow_redirects=False)
                else:
                    break

            response.history = history
            return response
예제 #5
0
def timestamped_uri_to_version(dt,
                               uri,
                               *,
                               url,
                               maintainers=None,
                               tags=None,
                               view_url=None):
    """
    Fetch version content and combine it with metadata to build a Version.

    Parameters
    ----------
    dt : datetime.datetime
        capture time
    uri : string
        URI of version
    url : string
        page URL
    maintainers : list of string, optional
        Entities responsible for maintaining the page, as a list of strings
    tags : list of string, optional
        Any arbitrary "tags" to apply to the page for categorization
    view_url : string, optional
        The archive.org URL for viewing the page (with rewritten links, etc.)

    Returns
    -------
    dict : Version
        suitable for passing to :class:`Client.add_versions`

    Raises
    ------
    Exception
        Whatever ``res.raise_for_status()`` raises when the response has a
        non-``ok`` status and no ``Memento-Datetime`` header.
    """
    with utils.rate_limited(group='timestamped_uri_to_version'):
        res = utils.retryable_request('GET', uri)

    # IA's memento server responds with the status of the original request, so
    # use the presence of the 'Memento-Datetime' header to determine if we
    # should use the response or there was an actual error.
    if not res.ok and not res.headers.get('memento-datetime'):
        res.raise_for_status()

    version_hash = utils.hash_content(res.content)
    title = utils.extract_title(res.content)
    # Use .get() so a response with no Content-Type header falls back to an
    # empty string instead of raising KeyError.
    content_type = (res.headers.get('content-type') or '').split(';', 1)

    # Get all headers from original response. HTTP header names are
    # case-insensitive, so match the prefix regardless of case while keeping
    # the remainder of the name exactly as it was received.
    prefix = 'x-archive-orig-'
    original_headers = {
        k[len(prefix):]: v
        for k, v in res.headers.items()
        if k.lower().startswith(prefix)
    }

    redirected_url = None
    redirects = None
    if res.url != uri:
        redirected_url = original_url_for_memento(res.url)
        redirects = [
            original_url_for_memento(response.url)
            for response in res.history
        ]
        redirects.append(redirected_url)

    return format_version(url=url,
                          dt=dt,
                          uri=uri,
                          version_hash=version_hash,
                          title=title,
                          tags=tags,
                          maintainers=maintainers,
                          status=res.status_code,
                          mime_type=content_type[0],
                          encoding=res.encoding,
                          headers=original_headers,
                          view_url=view_url,
                          redirected_url=redirected_url,
                          redirects=redirects)