async def _expire_old_entries(clock: Clock, expiry_seconds: int) -> None:
    """Walks the global cache list to find cache entries that haven't been
    accessed in the given number of seconds.
    """

    now = int(clock.time())
    node = GLOBAL_ROOT.prev_node
    assert node is not None

    i = 0

    logger.debug("Searching for stale caches")

    while node is not GLOBAL_ROOT:
        # Only the root node isn't a `_TimedListNode`.
        assert isinstance(node, _TimedListNode)

        if node.last_access_ts_secs > now - expiry_seconds:
            break

        cache_entry = node.get_cache_entry()
        next_node = node.prev_node

        # The node should always have a reference to a cache entry and a valid
        # `prev_node`, as we only drop them when we remove the node from the
        # list.
        assert next_node is not None
        assert cache_entry is not None
        cache_entry.drop_from_cache()

        # If we do lots of work at once we yield to allow other stuff to happen.
        if (i + 1) % 10000 == 0:
            logger.debug("Waiting during drop")
            await clock.sleep(0)
            logger.debug("Waking during drop")

        node = next_node

        # If we've yielded then our current node may have been evicted, so we
        # need to check that it's still valid.
        if node.prev_node is None:
            break

        i += 1

    logger.info("Dropped %d items from caches", i)
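To make the traversal above concrete, here is a minimal, self-contained sketch of the structure it assumes (these are stand-ins, not Synapse's actual classes): a circular doubly-linked list with a sentinel root, kept ordered by last access because every access re-links the node next to the root.

import time
from typing import Optional


class _SketchNode:
    """Stand-in for Synapse's `_TimedListNode` machinery."""

    def __init__(self) -> None:
        self.prev_node: Optional["_SketchNode"] = None
        self.next_node: Optional["_SketchNode"] = None
        self.last_access_ts_secs = 0


def _touch(root: _SketchNode, node: _SketchNode) -> None:
    """Record an access: stamp the time and re-link the node next to the root."""
    node.last_access_ts_secs = int(time.time())
    if node.prev_node is not None:
        # Unlink from the current position (a linked node has both pointers set).
        assert node.next_node is not None
        node.prev_node.next_node = node.next_node
        node.next_node.prev_node = node.prev_node
    old_first = root.next_node
    assert old_first is not None
    node.next_node = old_first
    node.prev_node = root
    old_first.prev_node = node
    root.next_node = node


root = _SketchNode()
root.prev_node = root.next_node = root  # an empty circular list
a, b = _SketchNode(), _SketchNode()
_touch(root, a)
_touch(root, b)
# `root.prev_node` is the least recently used entry, which is where the
# eviction loop above starts; following `prev_node` moves toward newer entries.
assert root.prev_node is a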
def _create_event_id(clock: Clock, hostname: str) -> str:
    """Create a new event ID

    Args:
        clock: Used to generate the timestamp part of the ID
        hostname: The server name for the event ID

    Returns:
        The new event ID
    """
    global _event_id_counter

    i = str(_event_id_counter)
    _event_id_counter += 1

    local_part = str(int(clock.time())) + i + random_string(5)

    e_id = EventID(local_part, hostname)

    return e_id.to_string()
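A runnable approximation of what this produces, with stand-ins for Synapse's `random_string` and `EventID` helpers; the "$local:server" layout is what `EventID.to_string()` renders for this style of ID:

import random
import string
import time

_demo_counter = 0


def _demo_create_event_id(hostname: str) -> str:
    global _demo_counter
    i = str(_demo_counter)
    _demo_counter += 1
    # Timestamp in whole seconds, then a process-local counter, then five
    # random letters, so concurrent creations on one server stay unique.
    rand = "".join(random.choices(string.ascii_letters, k=5))
    return "$%d%s%s:%s" % (int(time.time()), i, rand, hostname)


print(_demo_create_event_id("example.com"))  # e.g. "$17000000000AbCdE:example.com"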
def update_last_access(self, clock: Clock) -> None:
    self.last_access_ts_secs = int(clock.time())
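This is presumably a method on the list node (`_TimedListNode` above). A hypothetical call site, to show where it fits; `move_after` is an invented name standing in for whatever re-linking the real cache performs on access:

def on_cache_hit(node: "_TimedListNode", clock: Clock) -> None:
    # Refresh the access timestamp, then re-link the node at the fresh end of
    # the global list, which keeps the list ordered by last access.
    node.update_last_access(clock)
    node.move_after(GLOBAL_ROOT)  # hypothetical re-linking helper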
async def _expire_old_entries(
    clock: Clock, expiry_seconds: int, autotune_config: Optional[dict]
) -> None:
    """Walks the global cache list to find cache entries that haven't been
    accessed in the given number of seconds, or evicts entries when a given
    memory threshold has been breached.
    """
    if autotune_config:
        max_cache_memory_usage = autotune_config["max_cache_memory_usage"]
        target_cache_memory_usage = autotune_config["target_cache_memory_usage"]
        min_cache_ttl = autotune_config["min_cache_ttl"] / 1000

    now = int(clock.time())
    node = GLOBAL_ROOT.prev_node
    assert node is not None

    i = 0

    logger.debug("Searching for stale caches")

    evicting_due_to_memory = False

    # Determine if we're evicting due to memory usage.
    jemalloc_interface = get_jemalloc_stats()
    if jemalloc_interface and autotune_config:
        try:
            jemalloc_interface.refresh_stats()
            mem_usage = jemalloc_interface.get_stat("allocated")
            if mem_usage > max_cache_memory_usage:
                logger.info("Begin memory-based cache eviction.")
                evicting_due_to_memory = True
        except Exception:
            logger.warning(
                "Unable to read allocated memory, skipping memory-based cache eviction."
            )

    while node is not GLOBAL_ROOT:
        # Only the root node isn't a `_TimedListNode`.
        assert isinstance(node, _TimedListNode)

        # If the node has not aged past expiry_seconds and we are not evicting
        # due to memory usage, there's nothing to do here.
        if (
            node.last_access_ts_secs > now - expiry_seconds
            and not evicting_due_to_memory
        ):
            break

        # If the entry is newer than min_cache_ttl then do not evict it. The
        # list is ordered by access time, so everything beyond this point is
        # newer still, and we can stop here.
        if evicting_due_to_memory and now - node.last_access_ts_secs < min_cache_ttl:
            break

        cache_entry = node.get_cache_entry()
        next_node = node.prev_node

        # The node should always have a reference to a cache entry and a valid
        # `prev_node`, as we only drop them when we remove the node from the
        # list.
        assert next_node is not None
        assert cache_entry is not None
        cache_entry.drop_from_cache()

        # Check memory allocation periodically if we are evicting a bunch of caches.
        if jemalloc_interface and evicting_due_to_memory and (i + 1) % 100 == 0:
            try:
                jemalloc_interface.refresh_stats()
                mem_usage = jemalloc_interface.get_stat("allocated")
                if mem_usage < target_cache_memory_usage:
                    evicting_due_to_memory = False
                    logger.info("Stop memory-based cache eviction.")
            except Exception:
                logger.warning(
                    "Unable to read allocated memory, this may affect memory-based cache eviction."
                )

                # If we've failed to read the current memory usage then we
                # should stop trying to evict based on memory usage.
                evicting_due_to_memory = False

        # If we do lots of work at once we yield to allow other stuff to happen.
        if (i + 1) % 10000 == 0:
            logger.debug("Waiting during drop")
            # An entry that hasn't aged past expiry_seconds can only be here
            # because of memory pressure, so back off for a little longer.
            if node.last_access_ts_secs > now - expiry_seconds:
                await clock.sleep(0.5)
            else:
                await clock.sleep(0)
            logger.debug("Waking during drop")

        node = next_node

        # If we've yielded then our current node may have been evicted, so we
        # need to check that it's still valid.
        if node.prev_node is None:
            break

        i += 1

    logger.info("Dropped %d items from caches", i)
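Based on the keys read at the top of the function, `autotune_config` is expected to look roughly like the following. The values here are illustrative only, not recommendations; `min_cache_ttl` is in milliseconds, hence the division by 1000 above:

example_autotune_config = {
    "max_cache_memory_usage": 1024 * 1024 * 1024,    # begin evicting above ~1 GiB
    "target_cache_memory_usage": 768 * 1024 * 1024,  # evict until usage falls below this
    "min_cache_ttl": 5 * 60 * 1000,                  # never evict entries younger than 5 minutes
}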
class WellKnownResolver:
    """Handles well-known lookups for Matrix servers."""

    def __init__(
        self,
        reactor: IReactorTime,
        agent: IAgent,
        user_agent: bytes,
        well_known_cache: Optional[TTLCache] = None,
        had_well_known_cache: Optional[TTLCache] = None,
    ):
        self._reactor = reactor
        self._clock = Clock(reactor)

        if well_known_cache is None:
            well_known_cache = _well_known_cache

        if had_well_known_cache is None:
            had_well_known_cache = _had_valid_well_known_cache

        self._well_known_cache = well_known_cache
        self._had_valid_well_known_cache = had_well_known_cache
        self._well_known_agent = RedirectAgent(agent)
        self.user_agent = user_agent

    async def get_well_known(self, server_name: bytes) -> WellKnownLookupResult:
        """Attempt to fetch and parse a .well-known file for the given server

        Args:
            server_name: name of the server, from the requested url

        Returns:
            The result of the lookup
        """
        try:
            prev_result, expiry, ttl = self._well_known_cache.get_with_expiry(
                server_name
            )

            now = self._clock.time()
            if now < expiry - WELL_KNOWN_GRACE_PERIOD_FACTOR * ttl:
                return WellKnownLookupResult(delegated_server=prev_result)
        except KeyError:
            prev_result = None

        # TODO: should we linearise so that we don't end up doing two .well-known
        # requests for the same server in parallel?
        try:
            with Measure(self._clock, "get_well_known"):
                result, cache_period = await self._fetch_well_known(
                    server_name
                )  # type: Optional[bytes], float

        except _FetchWellKnownFailure as e:
            if prev_result and e.temporary:
                # This is a temporary failure and we have a still valid cached
                # result, so let's return that. Hopefully the next time we ask
                # the remote will be back up again.
                return WellKnownLookupResult(delegated_server=prev_result)

            result = None

            if self._had_valid_well_known_cache.get(server_name, False):
                # We have recently seen a valid well-known record for this
                # server, so we cache the lack of well-known for a shorter time.
                cache_period = WELL_KNOWN_DOWN_CACHE_PERIOD
            else:
                cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD

            # add some randomness to the TTL to avoid a stampeding herd
            cache_period *= random.uniform(
                1 - WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
                1 + WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
            )

        if cache_period > 0:
            self._well_known_cache.set(server_name, result, cache_period)

        return WellKnownLookupResult(delegated_server=result)

    async def _fetch_well_known(self, server_name: bytes) -> Tuple[bytes, float]:
        """Actually fetch and parse a .well-known, without checking the cache

        Args:
            server_name: name of the server, from the requested url

        Raises:
            _FetchWellKnownFailure if we fail to lookup a result

        Returns:
            The lookup result and cache period.
        """

        had_valid_well_known = self._had_valid_well_known_cache.get(
            server_name, False
        )

        # We do this in two steps to differentiate between possibly transient
        # errors (e.g. can't connect to host, 503 response) and more permanent
        # errors (such as getting a 404 response).
        response, body = await self._make_well_known_request(
            server_name, retry=had_valid_well_known
        )

        try:
            if response.code != 200:
                raise Exception("Non-200 response %s" % (response.code,))

            parsed_body = json_decoder.decode(body.decode("utf-8"))
            logger.info("Response from .well-known: %s", parsed_body)

            result = parsed_body["m.server"].encode("ascii")
        except defer.CancelledError:
            # Bail if we've been cancelled
            raise
        except Exception as e:
            logger.info("Error parsing well-known for %s: %s", server_name, e)
            raise _FetchWellKnownFailure(temporary=False)

        cache_period = _cache_period_from_headers(
            response.headers, time_now=self._reactor.seconds
        )
        if cache_period is None:
            cache_period = WELL_KNOWN_DEFAULT_CACHE_PERIOD

            # add some randomness to the TTL to avoid a stampeding herd every 24
            # hours after startup
            cache_period *= random.uniform(
                1 - WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
                1 + WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
            )
        else:
            cache_period = min(cache_period, WELL_KNOWN_MAX_CACHE_PERIOD)
            cache_period = max(cache_period, WELL_KNOWN_MIN_CACHE_PERIOD)

        # We got a success, mark as such in the cache
        self._had_valid_well_known_cache.set(
            server_name,
            bool(result),
            cache_period + WELL_KNOWN_REMEMBER_DOMAIN_HAD_VALID,
        )

        return result, cache_period

    async def _make_well_known_request(
        self, server_name: bytes, retry: bool
    ) -> Tuple[IResponse, bytes]:
        """Make the well-known request.

        This will retry the request if requested and it fails (either because
        we are unable to connect, or because we receive a 5xx error).

        Args:
            server_name: name of the server, from the requested url
            retry: Whether to retry the request if it fails.

        Raises:
            _FetchWellKnownFailure if we fail to lookup a result

        Returns:
            The response object and body. The response may be a non-200 response.
        """
        uri = b"https://%s/.well-known/matrix/server" % (server_name,)
        uri_str = uri.decode("ascii")

        headers = {
            b"User-Agent": [self.user_agent],
        }

        i = 0
        while True:
            i += 1

            logger.info("Fetching %s", uri_str)
            try:
                response = await make_deferred_yieldable(
                    self._well_known_agent.request(
                        b"GET", uri, headers=Headers(headers)
                    )
                )
                body_stream = BytesIO()
                await make_deferred_yieldable(
                    read_body_with_max_size(response, body_stream, WELL_KNOWN_MAX_SIZE)
                )
                body = body_stream.getvalue()

                if 500 <= response.code < 600:
                    raise Exception("Non-200 response %s" % (response.code,))

                return response, body
            except defer.CancelledError:
                # Bail if we've been cancelled
                raise
            except BodyExceededMaxSize:
                # If the well-known file was too large, do not keep attempting
                # to download it, but consider it a temporary error.
                logger.warning(
                    "Requested .well-known file for %s is too large > %r bytes",
                    server_name.decode("ascii"),
                    WELL_KNOWN_MAX_SIZE,
                )
                raise _FetchWellKnownFailure(temporary=True)
            except Exception as e:
                if not retry or i >= WELL_KNOWN_RETRY_ATTEMPTS:
                    logger.info("Error fetching %s: %s", uri_str, e)
                    raise _FetchWellKnownFailure(temporary=True)

                logger.info("Error fetching %s: %s. Retrying", uri_str, e)

                # Sleep briefly in the hopes that they come back up
                await self._clock.sleep(0.5)
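A hedged usage sketch of the resolver, assuming Twisted's stock `Agent`; the user agent string and server name are placeholders:

from twisted.internet import reactor
from twisted.web.client import Agent


async def _demo_lookup() -> None:
    resolver = WellKnownResolver(
        reactor,
        agent=Agent(reactor),
        user_agent=b"demo-agent/0.1",
    )
    result = await resolver.get_well_known(b"example.com")
    # `delegated_server` is the raw b"host[:port]" value of "m.server",
    # or None if no usable .well-known file was found.
    print(result.delegated_server)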