Example #1
def invalidate_by_url_later():
    # Order-preserving de-duplication of the list
    _seen = set()
    slated = [
        x for x in cache.get("invalidate_by_url", [])
        if x not in _seen and not _seen.add(x)
    ]
    if slated:
        cache.delete("invalidate_by_url")
        for url in slated:
            invalidate_by_url(url, revisit=True)
        CDNPurgeURL.add(slated)
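
For context, here is a minimal sketch of the enqueueing counterpart this function drains. The name invalidate_by_url_soon and the cache timeout are assumptions, not part of the original code; the only point is that something appends URLs to the "invalidate_by_url" cache key that the list comprehension above then de-duplicates.

from django.core.cache import cache

def invalidate_by_url_soon(url):
    # Hypothetical enqueue side: append the URL to the shared cache key so
    # that invalidate_by_url_later() picks it up on its next run.
    slated = cache.get("invalidate_by_url", [])
    slated.append(url)
    cache.set("invalidate_by_url", slated, 60 * 60)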
Example #2
def run_purge_cdn_urls():
    queue = CDNPurgeURL.get()
    # The `timezone.now()` in the printed output message is to keep an eye
    # on whether the periodic task sometimes fires repeatedly in a short
    # amount of time.
    if queue:
        print("{} queued CDN URLs for purging: {} ({})".format(
            len(queue), queue, timezone.now()))
        purge_cdn_urls(queue)
    else:
        print("No queued CDN URLs for purgning ({})".format(timezone.now()))
Example #3
def _post_process_cached_html(filepath, url, postprocessing, original_url):
    if "\n" in url:
        raise ValueError("URL can't have a linebreak in it ({!r})".format(url))
    if url.startswith("http://testserver"):
        # Do nothing; "http://testserver" means this is a test run.
        return
    if not os.path.exists(filepath):
        postprocessing.notes.append("{} no longer exists".format(filepath))
        return
        # raise ValueError(
        #     "{!r} does not exist and can't be post-processed".format(filepath)
        # )

    attempts = 0
    with open(filepath) as f:
        html = f.read()

    if has_been_css_minified(html):
        # This function has a lock decorator on it. That essentially makes sure
        # that if it's fired concurrently, at roughly the same time, by two
        # threads, they run in serial, one at a time. The second thread still
        # gets to run, so this check lets it bail out if the work is already done.
        msg = "HTML ({}) already post processed".format(filepath)
        postprocessing.notes.append(msg)
        return

    # Squeezing every little byte out of it!
    # That page doesn't need the little minimalcss stats block.
    # Otherwise, the default is to include it.
    include_minimalcss_stats = "/plog/blogitem-040601-1" not in url

    optimized_html = html
    while not url.endswith("/awspa"):
        t0 = time.perf_counter()
        try:
            print("CALLING mincss_html FOR", original_url or url)
            optimized_html = mincss_html(
                html,
                original_url or url,
                include_minimalcss_stats=include_minimalcss_stats,
            )
            t1 = time.perf_counter()
            if optimized_html is None:
                postprocessing.notes.append(
                    "At attempt number {} the optimized HTML "
                    "became None (Took {:.1f}s)".format(attempts + 1, t1 - t0))
            else:
                postprocessing.notes.append(
                    "Took {:.1f}s mincss_html HTML from {} to {}".format(
                        t1 - t0, len(html), len(optimized_html)))
        except ReadTimeout as exception:
            postprocessing.notes.append(
                "Timeout on mincss_html() ({})".format(exception))
            optimized_html = None
            # created = False

        attempts += 1
        if optimized_html is None:
            postprocessing.notes.append(
                "WARNING! mincss_html returned None for {} ({})".format(
                    filepath, url))
            if attempts < 3:
                print("Will try again!")
                time.sleep(1)
                continue
            postprocessing.notes.append(
                "Gave up after {} attempts".format(attempts))
            return

        try:
            shutil.move(filepath, filepath + ".original")
        except FileNotFoundError:
            postprocessing.notes.append(
                "Can't move to .original {} no longer exists".format(filepath))
            return
        with open(filepath, "w") as f:
            f.write(optimized_html)
        print("mincss optimized {}".format(filepath))
        break

    try:
        page, = re.findall(r"/p(\d+)$", url)
        page = int(page)
    except ValueError:
        page = 1

    if "/plog/blogitem-040601-1" in url:
        songsearch_autocomplete.insert(page=page)
    else:
        t0 = time.perf_counter()
        minified_html = _minify_html(filepath, url)
        t1 = time.perf_counter()
        if not minified_html:
            postprocessing.notes.append("Calling minify_html() failed")
        postprocessing.notes.append("Took {:.1f}s to minify HTML".format(t1 -
                                                                         t0))

        t0 = time.perf_counter()
        _zopfli_html(minified_html or optimized_html, filepath, url)
        t1 = time.perf_counter()
        postprocessing.notes.append(
            "Took {:.1f}s to Zopfli HTML".format(t1 - t0))

        t0 = time.perf_counter()
        _brotli_html(minified_html or optimized_html, filepath, url)
        t1 = time.perf_counter()
        postprocessing.notes.append(
            "Took {:.1f}s to Brotli HTML".format(t1 - t0))

    CDNPurgeURL.add(url)
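
The comment in the middle of the function mentions a lock decorator, which isn't included in the example. As an illustration only, a serializing lock like that can be built on Django's cache; the decorator name, the timeout and the polling interval below are all assumptions.

import functools
import time

from django.core.cache import cache

def one_at_a_time(timeout=60):
    # Hypothetical decorator: concurrent callers run in serial rather than
    # being rejected; the second caller waits for the first to finish.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            key = "lock:{}".format(func.__name__)
            # cache.add() only succeeds for one caller at a time on backends
            # like Memcached or Redis, so it works as a simple lock.
            while not cache.add(key, "locked", timeout):
                time.sleep(0.1)
            try:
                return func(*args, **kwargs)
            finally:
                cache.delete(key)
        return wrapper
    return decorator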
Example #4
def purge_cdn_urls(urls, api=None):
    if settings.USE_NGINX_BYPASS:
        # Note! This Nginx trick will not just purge the proxy_cache, it will
        # immediately trigger a refetch.
        x_cache_headers = []
        for url in urls:
            if "://" not in url:
                url = settings.NGINX_BYPASS_BASEURL + url
            try:
                r = requests.get(url, headers={"secret-header": "true"})
                r.raise_for_status()
                x_cache_headers.append({
                    "url": url,
                    "x-cache": r.headers.get("x-cache")
                })
                # print("X-CACHE-HEADERS", x_cache_headers)
                CDNPurgeURL.succeeded(urls)
            except Exception:
                CDNPurgeURL.failed(urls)
                raise
        return {"all_urls": urls, "result": x_cache_headers}

    if not keycdn_zone_check():
        print("WARNING! Unable to use KeyCDN API at the moment :(")
        return

    if not api:
        api = keycdn.Api(settings.KEYCDN_API_KEY)
        api.session = get_requests_retry_session()
    config = get_cdn_config(api)
    # See https://www.keycdn.com/api#purge-zone-url
    try:
        cachebr = config["data"]["zone"]["cachebr"] == "enabled"
    except KeyError:
        raise BrokenKeyCDNConfig("Config={!r}".format(config))
    all_urls = []

    # For KeyCDN we need to do some transformations. Our URLs are different
    # from the KeyCDN "URLs". When we make this transformation, maintain a map
    # *back* to the original URLs as they're known to us.
    original_urls = {}

    for absolute_url in urls:
        url = settings.KEYCDN_ZONE_URL + urlparse(absolute_url).path
        all_urls.append(url)
        original_urls[url] = absolute_url
        if cachebr:
            all_urls.append(url + "br")
            original_urls[url + "br"] = absolute_url

    # Make absolutely sure nothing's repeated.
    all_all_urls = sorted(set(all_urls))

    def get_original_urls(cdn_urls):
        original = set()
        for url in cdn_urls:
            original.add(original_urls[url])
        return original

    # Break it up into lists of 100
    def chunks(l, n):
        # Yield successive n-sized chunks from the list l.
        for i in range(0, len(l), n):
            yield l[i:i + n]

    for all_urls in chunks(all_all_urls, 100):
        call = "zones/purgeurl/{}.json".format(settings.KEYCDN_ZONE_ID)
        params = {"urls": all_urls}

        with open("/tmp/purge_cdn_urls.log", "a") as f:
            f.write("{}\t{!r}\t{}\n".format(timezone.now(), all_urls,
                                            get_stack_signature()))
        try:
            r = api.delete(call, params)
            CDNPurgeURL.succeeded(get_original_urls(all_urls))
        except Exception:
            CDNPurgeURL.failed(get_original_urls(all_urls))
            raise
        print(
            "SENT CDN PURGE FOR: {!r}\tORIGINAL URLS: {!r}\tRESULT: {}".format(
                all_urls, urls, r))
    return {"result": r, "all_urls": all_all_urls}
Example #5
def purge_outdated_cdn_urls(verbose=False, dry_run=False, revisit=False, max_files=100):
    """Periodically, go through fs cache files, by date, and compare each one
    with their CDN equivalent to see if the CDN version is too different.
    """
    paths = []
    for root, dirs, files in os.walk(settings.FSCACHE_ROOT):
        for file_ in files:
            if file_.endswith(".metadata"):
                continue
            path = os.path.join(root, file_)
            for attempt in range(3):
                if (
                    "index.html" in file_
                    and os.path.isfile(path)
                    and not os.stat(path).st_size
                ):
                    # If this happens, give it "one more chance" by sleeping
                    # a little, and only raise the error if the file is still
                    # empty after the retries.
                    time.sleep(1)
                    continue
                break
            else:
                raise EmptyFSCacheFile(path)
            if os.path.isfile(path + ".metadata") and "/awspa" not in path:
                paths.append((os.stat(path).st_mtime, path))

    # Oldest first
    paths.sort()

    for mtime, path in paths[:max_files]:
        uri = path.replace(settings.FSCACHE_ROOT, "")
        uri = re.sub(r"/index\.html$", "", uri)
        if uri == "":
            uri = "/"

        if verbose:
            age_seconds = time.time() - mtime
            if age_seconds > 10000:
                human_age = "{} hours".format(int(age_seconds / 60 / 60))
            elif age_seconds > 60:
                human_age = "{} minutes".format(int(age_seconds / 60))
            else:
                human_age = "{:.1f} seconds".format(age_seconds)
            print("{} last touched {} ago".format(uri, human_age))

        # Update the file's modification time so it sorts last next time.
        os.utime(path, (os.stat(path).st_atime, time.time()))

        cdn_url = get_cdn_base_url() + uri
        response = _download_cdn_url(cdn_url)
        if response.status_code == 404:
            # If it can't be viewed on the CDN, it has no business existing as a
            # fscached file.
            # os.remove(path)
            if verbose:
                print("Deleted {!r} because it 404'ed on {}".format(path, cdn_url))
            continue
        if response.status_code != 200:
            if verbose:
                print("{} on {} :(".format(response.status_code, cdn_url))
            continue

        if response.headers.get("x-cache") != "HIT":
            if verbose:
                print(
                    "Wasn't x-cache HIT anyway ({!r}) {}".format(
                        response.headers.get("x-cache"), cdn_url
                    )
                )
            continue

        with open(path) as f:
            local_html = f.read()
        remote_html = response.text

        if local_html != remote_html and not dry_run:
            CDNPurgeURL.add(cdn_url)
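
This example also relies on get_cdn_base_url() and _download_cdn_url(), which aren't shown. A plausible minimal version, assuming a plain requests GET and a settings-based base URL (both the setting name and the timeout are assumptions), could look like this:

import requests
from django.conf import settings

def get_cdn_base_url():
    # Hypothetical setting, e.g. "https://www-1234.kxcdn.com".
    return settings.CDN_BASE_URL

def _download_cdn_url(url):
    # Plain GET; the caller inspects status_code and the x-cache header.
    return requests.get(url, timeout=10)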