def invalidate_by_url_later():
    # Order preserving uniqify list
    _seen = set()
    slated = [
        x
        for x in cache.get("invalidate_by_url", [])
        if x not in _seen and not _seen.add(x)
    ]
    if slated:
        cache.delete("invalidate_by_url")
        for url in slated:
            invalidate_by_url(url, revisit=True)
        CDNPurgeURL.add(slated)

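# A minimal, hypothetical sketch (not called anywhere above) of the
# order-preserving dedupe idiom used in invalidate_by_url_later(): set.add()
# returns None, so `not _seen.add(x)` is always truthy and only serves to
# record x as seen, while `x not in _seen` filters out repeats without
# disturbing the original order.
def _dedupe_keep_order_sketch(items):
    seen = set()
    return [x for x in items if x not in seen and not seen.add(x)]


# For example, _dedupe_keep_order_sketch(["/a", "/b", "/a"]) == ["/a", "/b"].
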
def run_purge_cdn_urls():
    queue = CDNPurgeURL.get()
    # The `timezone.now()` in the printed output message is to keep an eye
    # on whether the periodic task sometimes fires repeatedly in a short
    # amount of time.
    if queue:
        print(
            "{} queued CDN URLs for purging: {} ({})".format(
                len(queue), queue, timezone.now()
            )
        )
        purge_cdn_urls(queue)
    else:
        print("No queued CDN URLs for purging ({})".format(timezone.now()))

def _post_process_cached_html(filepath, url, postprocessing, original_url):
    if "\n" in url:
        raise ValueError("URL can't have a linebreak in it ({!r})".format(url))
    if url.startswith("http://testserver"):
        # do nothing. testing.
        return
    if not os.path.exists(filepath):
        postprocessing.notes.append("{} no longer exists".format(filepath))
        return
        # raise ValueError(
        #     "{!r} does not exist and can't be post-processed".format(filepath)
        # )

    attempts = 0
    with open(filepath) as f:
        html = f.read()

    if has_been_css_minified(html):
        # This function has a lock decorator on it. That essentially makes sure,
        # if fired concurrently, at the same time'ish, by two threads, only one
        # of them will run at a time. In serial. The second thread will still
        # get to run. This check is to see whether there's any point running now.
        msg = "HTML ({}) already post processed".format(filepath)
        postprocessing.notes.append(msg)
        return

    # Squeezing every little byte out of it!
    # That page doesn't need the little minimalcss stats block.
    # Otherwise, the default is to include it.
    include_minimalcss_stats = "/plog/blogitem-040601-1" not in url

    optimized_html = html
    while not url.endswith("/awspa"):
        t0 = time.perf_counter()
        try:
            print("CALLING mincss_html FOR", original_url or url)
            optimized_html = mincss_html(
                html,
                original_url or url,
                include_minimalcss_stats=include_minimalcss_stats,
            )
            t1 = time.perf_counter()
            if optimized_html is None:
                postprocessing.notes.append(
                    "At attempt number {} the optimized HTML "
                    "became None (Took {:.1f}s)".format(attempts + 1, t1 - t0)
                )
            else:
                postprocessing.notes.append(
                    "Took {:.1f}s mincss_html HTML from {} to {}".format(
                        t1 - t0, len(html), len(optimized_html)
                    )
                )
        except ReadTimeout as exception:
            postprocessing.notes.append(
                "Timeout on mincss_html() ({})".format(exception)
            )
            optimized_html = None
            # created = False
        attempts += 1
        if optimized_html is None:
            postprocessing.notes.append(
                "WARNING! mincss_html returned None for {} ({})".format(filepath, url)
            )
            if attempts < 3:
                print("Will try again!")
                time.sleep(1)
                continue
            postprocessing.notes.append("Gave up after {} attempts".format(attempts))
            return
        try:
            shutil.move(filepath, filepath + ".original")
        except FileNotFoundError:
            postprocessing.notes.append(
                "Can't move to .original {} no longer exists".format(filepath)
            )
            return
        with open(filepath, "w") as f:
            f.write(optimized_html)
        print("mincss optimized {}".format(filepath))
        break

    try:
        (page,) = re.findall(r"/p(\d+)$", url)
        page = int(page)
    except ValueError:
        page = 1

    if "/plog/blogitem-040601-1" in url:
        songsearch_autocomplete.insert(page=page)
    else:
        t0 = time.perf_counter()
        minified_html = _minify_html(filepath, url)
        t1 = time.perf_counter()
        if not minified_html:
            postprocessing.notes.append("Calling minify_html() failed")
        postprocessing.notes.append("Took {:.1f}s to minify HTML".format(t1 - t0))

        # (An illustrative sketch of these compression helpers follows after
        # this function.)
        t0 = time.perf_counter()
        _zopfli_html(minified_html or optimized_html, filepath, url)
        t1 = time.perf_counter()
        postprocessing.notes.append("Took {:.1f}s to Zopfli HTML".format(t1 - t0))

        t0 = time.perf_counter()
        _brotli_html(minified_html or optimized_html, filepath, url)
        t1 = time.perf_counter()
        postprocessing.notes.append("Took {:.1f}s to Brotli HTML".format(t1 - t0))

    CDNPurgeURL.add(url)

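# The real _zopfli_html() and _brotli_html() helpers are defined elsewhere in
# this module and also receive the url; the sketch below is only an assumption
# of what such helpers could look like, using the `zopfli` and `brotli` PyPI
# packages to write precompressed .gz and .br siblings next to the HTML file
# so a web server can serve them directly. Names and behavior here are
# illustrative, not the project's actual implementation.
import brotli
import zopfli.gzip


def _zopfli_html_sketch(html, filepath):
    # Hypothetical: write a gzip-compatible, Zopfli-compressed sibling file.
    with open(filepath + ".gz", "wb") as f:
        f.write(zopfli.gzip.compress(html.encode("utf-8")))


def _brotli_html_sketch(html, filepath):
    # Hypothetical: write a Brotli-compressed sibling file.
    with open(filepath + ".br", "wb") as f:
        f.write(brotli.compress(html.encode("utf-8")))
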
def purge_cdn_urls(urls, api=None):
    if settings.USE_NGINX_BYPASS:
        # Note! This Nginx trick will not just purge the proxy_cache, it will
        # immediately trigger a refetch.
        x_cache_headers = []
        for url in urls:
            if "://" not in url:
                url = settings.NGINX_BYPASS_BASEURL + url
            try:
                r = requests.get(url, headers={"secret-header": "true"})
                r.raise_for_status()
                x_cache_headers.append(
                    {"url": url, "x-cache": r.headers.get("x-cache")}
                )
                # print("X-CACHE-HEADERS", x_cache_headers)
                CDNPurgeURL.succeeded(urls)
            except Exception:
                CDNPurgeURL.failed(urls)
                raise
        return {"all_urls": urls, "result": x_cache_headers}

    if not keycdn_zone_check():
        print("WARNING! Unable to use KeyCDN API at the moment :(")
        return

    if not api:
        api = keycdn.Api(settings.KEYCDN_API_KEY)
        api.session = get_requests_retry_session()
    config = get_cdn_config(api)
    # See https://www.keycdn.com/api#purge-zone-url
    try:
        cachebr = config["data"]["zone"]["cachebr"] == "enabled"
    except KeyError:
        raise BrokenKeyCDNConfig("Config={!r}".format(config))

    all_urls = []
    # For KeyCDN we need to do some transformations. Our URLs are different
    # from the KeyCDN "URLs". When we make this transformation, maintain a map
    # *back* to the original URLs as they're known to us.
    original_urls = {}
    for absolute_url in urls:
        url = settings.KEYCDN_ZONE_URL + urlparse(absolute_url).path
        all_urls.append(url)
        original_urls[url] = absolute_url
        if cachebr:
            all_urls.append(url + "br")
            original_urls[url + "br"] = absolute_url

    # Make absolutely sure nothing's repeated.
    all_all_urls = sorted(set(all_urls))

    def get_original_urls(cdn_urls):
        original = set()
        for url in cdn_urls:
            original.add(original_urls[url])
        return original

    # Break it up into lists of 100
    def chunks(l, n):
        # For item i in a range that is a length of l,
        for i in range(0, len(l), n):
            # Create an index range for l of n items:
            yield l[i:i + n]

    for all_urls in chunks(all_all_urls, 100):
        call = "zones/purgeurl/{}.json".format(settings.KEYCDN_ZONE_ID)
        params = {"urls": all_urls}
        with open("/tmp/purge_cdn_urls.log", "a") as f:
            f.write(
                "{}\t{!r}\t{}\n".format(
                    timezone.now(), all_urls, get_stack_signature()
                )
            )
        try:
            r = api.delete(call, params)
            CDNPurgeURL.succeeded(get_original_urls(all_urls))
        except Exception:
            CDNPurgeURL.failed(get_original_urls(all_urls))
            raise
        print(
            "SENT CDN PURGE FOR: {!r}\tORIGINAL URLS: {!r}\tRESULT: {}".format(
                all_urls, urls, r
            )
        )

    return {"result": r, "all_urls": all_all_urls}

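# Purely illustrative: purge_cdn_urls() above sends at most 100 URLs per
# KeyCDN purge request, so the deduplicated list is split into batches. This
# standalone sketch mirrors the inner chunks() helper to show the batching.
def _chunks_sketch():
    def chunks(l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]

    urls = ["/page/{}".format(i) for i in range(250)]
    assert [len(batch) for batch in chunks(urls, 100)] == [100, 100, 50]
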
def purge_outdated_cdn_urls(verbose=False, dry_run=False, revisit=False, max_files=100):
    """Periodically, go through fs cache files, by date, and compare each one
    with its CDN equivalent to see if the CDN version is too different.
    """
    paths = []
    for root, dirs, files in os.walk(settings.FSCACHE_ROOT):
        for file_ in files:
            if file_.endswith(".metadata"):
                continue
            path = os.path.join(root, file_)
            for attempt in range(3):
                if (
                    "index.html" in file_
                    and os.path.isfile(path)
                    and not os.stat(path).st_size
                ):
                    # If this happens, give it "one more chance" by sleeping
                    # a little and only raise the error if the file is still
                    # empty after some sleeping.
                    time.sleep(1)
                    continue
                break
            else:
                raise EmptyFSCacheFile(path)
            if os.path.isfile(path + ".metadata") and "/awspa" not in path:
                paths.append((os.stat(path).st_mtime, path))

    # Oldest first
    paths.sort()

    for mtime, path in paths[:max_files]:
        uri = path.replace(settings.FSCACHE_ROOT, "")
        uri = re.sub(r"/index\.html$", "", uri)
        if uri == "":
            uri = "/"
        if verbose:
            age_seconds = time.time() - mtime
            if age_seconds > 10000:
                human_age = "{} hours".format(int(age_seconds / 60 / 60))
            elif age_seconds > 60:
                human_age = "{} minutes".format(int(age_seconds / 60))
            else:
                human_age = "{:.1f} seconds".format(age_seconds)
            print("{} last touched {} ago".format(uri, human_age))

        # Update the file's modification time so it gets last in the sort order
        # next time.
        os.utime(path, (os.stat(path).st_atime, time.time()))

        cdn_url = get_cdn_base_url() + uri
        response = _download_cdn_url(cdn_url)
        if response.status_code == 404:
            # If it can't be viewed on the CDN, it has no business existing as a
            # fscached file.
            # os.remove(path)
            if verbose:
                print("Deleted {!r} because it 404'ed on {}".format(path, cdn_url))
            continue
        if response.status_code != 200:
            if verbose:
                print("{} on {} :(".format(response.status_code, cdn_url))
            continue
        if response.headers.get("x-cache") != "HIT":
            if verbose:
                print(
                    "Wasn't x-cache HIT anyway ({!r}) {}".format(
                        response.headers.get("x-cache"), cdn_url
                    )
                )
            continue

        with open(path) as f:
            local_html = f.read()
        remote_html = response.text

        if local_html != remote_html and not dry_run:
            CDNPurgeURL.add(cdn_url)

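# Illustration of the "touch to demote" trick used in purge_outdated_cdn_urls():
# bumping a file's mtime with os.utime() pushes it to the end of the
# oldest-first sort order, so the next periodic run looks at other (older)
# files first. The file names below are made up for the example.
def _touch_to_demote_sketch():
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        old = os.path.join(tmp, "old-index.html")
        new = os.path.join(tmp, "new-index.html")
        for i, p in enumerate((old, new)):
            with open(p, "w") as f:
                f.write("<html></html>")
            # Stagger the modification times so the sort order is deterministic.
            os.utime(p, (time.time(), time.time() - 100 + i))
        paths = sorted((os.stat(p).st_mtime, p) for p in (old, new))
        assert paths[0][1] == old  # oldest first
        # "Touch" the oldest one; now it sorts last.
        os.utime(old, (os.stat(old).st_atime, time.time()))
        paths = sorted((os.stat(p).st_mtime, p) for p in (old, new))
        assert paths[-1][1] == old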