class QueryCompute(object):
  """A class for executing Jia queries

  Provides `compute` and `cache` methods. `compute(use_cache=False)` can be
  called to simply run the query within a given `timeframe`. Otherwise,
  `bucket_width` must be specified to get from the cache. In order to write
  to the cache via the `cache` method, both `bucket_width` and
  `untrusted_time` must be specified.
  """

  def __init__(self, query, timeframe, bucket_width=None, untrusted_time=None,
               metis=False):
    """Initialize QueryCompute

    :param query: A string of python code to execute as a Jia query.
    :param timeframe: A timeframe dictionary. It specifies a mode, which can
    be 'recent' or 'range'. Depending on which mode is selected, some of the
    other parameters will be unused. The unused parameters come from the
    frontend for the purposes of storing default/previous values. If the mode
    is recent, only 'value' and 'scale' are used. If the mode is 'range', only
    'from' and 'to' are used.
    Example timeframe:
    timeframe = {
      'mode': 'recent',
      'value': 1,
      'scale': 'days',
      'from': 'Sat Jun 10 2014 00:00:00',
      'to': 'Sun Jun 11 2014 00:00:00',
    }
    :param bucket_width: Optional bucket width in seconds
    :param untrusted_time: Optional untrusted time interval in seconds
    :param metis: Send `query` to metis for computation
    :raises ValueError: if `metis` is False and ALLOW_PYCODE is disabled,
    or if the timeframe mode is unrecognized.
    """
    # Prefer the Flask request-bound app; when running outside an app
    # context (e.g. from the scheduler) `current_app.config` raises
    # RuntimeError and we fall back to the scheduler's app instance.
    try:
      self._app = current_app
      self._app.config  # The above line won't fail, but this one will
    except RuntimeError:
      from scheduler import get_app
      self._app = get_app()

    self._query = query
    self._bucket_width = bucket_width
    self._untrusted_time = untrusted_time
    self._metis = metis
    self._start_time, self._end_time = self._get_timeframe_bounds(
      timeframe, bucket_width)
    self._cache_client = KronosClient(
      self._app.config['CACHE_KRONOS_URL'],
      namespace=self._app.config['CACHE_KRONOS_NAMESPACE'],
      blocking=False,
      sleep_block=0.2)

    # The query is sent through as an unused unique_id argument so that the
    # QueryCache hash can properly uniquely identify it
    unique = {
      'unique_id': self._query,
    }

    if self._metis:
      query_func = self._run_metis
    elif self._app.config['ALLOW_PYCODE']:
      query_func = self._run_query
    else:
      raise ValueError("`metis` must be `True` if ALLOW_PYCODE is not enabled")

    # Without a bucket_width there is nothing to cache against, so
    # self._query_cache is only created when bucketing is requested; the
    # caching entry points below guard against its absence.
    if self._bucket_width:
      bucket_width_timedelta = datetime.timedelta(seconds=bucket_width)
      self._query_cache = QueryCache(
        self._cache_client, query_func, bucket_width_timedelta,
        self._app.config['CACHE_KRONOS_NAMESPACE'],
        query_function_kwargs=unique)

  def _get_timeframe_bounds(self, timeframe, bucket_width):
    """Get a `bucket_width` aligned `start_time` and `end_time` from a
    `timeframe` dict

    :param timeframe: Timeframe dict (see `__init__` docstring).
    :param bucket_width: Optional bucket width in seconds; when given, the
    returned bounds are widened outward so both fall on bucket boundaries.
    :returns: (start, end) as Python datetimes.
    """
    if bucket_width:
      bucket_width_seconds = bucket_width
      bucket_width = epoch_time_to_kronos_time(bucket_width)

    # TODO(derek): Potential optimization by setting the end_time equal to
    # the untrusted_time if end_time > untrusted_time and the results are not
    # being output to the user (only for caching)
    if timeframe['mode']['value'] == 'recent':
      # Set end_time equal to now and align to bucket width
      end_time = datetime_to_kronos_time(datetime.datetime.now())
      original_end_time = end_time
      duration = get_seconds(timeframe['value'], timeframe['scale']['name'])
      duration = epoch_time_to_kronos_time(duration)
      start_time = original_end_time - duration

      if bucket_width:
        # Align values to the bucket width
        # TODO(derek): Warn the user that the timeframe has been altered to
        # fit the bucket width
        if (end_time % bucket_width) != 0:
          end_time += bucket_width - (end_time % bucket_width)
        if (start_time % bucket_width) != 0:
          start_time -= (start_time % bucket_width)

      start = kronos_time_to_datetime(start_time)
      end = kronos_time_to_datetime(end_time)
    elif timeframe['mode']['value'] == 'range':
      end = datetime.datetime.strptime(timeframe['to'], DT_FORMAT)
      end_seconds = datetime_to_epoch_time(end)
      start = datetime.datetime.strptime(timeframe['from'], DT_FORMAT)
      start_seconds = datetime_to_epoch_time(start)

      if bucket_width:
        # Align values to the bucket width
        # TODO(derek): Warn the user that the timeframe has been altered to
        # fit the bucket width
        start_bump = start_seconds % bucket_width_seconds
        start -= datetime.timedelta(seconds=start_bump)
        if (end_seconds % bucket_width_seconds) != 0:
          end_bump = bucket_width_seconds - (
            end_seconds % bucket_width_seconds)
          end += datetime.timedelta(seconds=end_bump)
    else:
      raise ValueError("Timeframe mode must be 'recent' or 'range'")

    return start, end

  def _run_query(self, start_time, end_time, unique_id=None):
    """Executes a Python query string and returns events

    Acts as a wrapper around exec that injects necessary local variables
    into the scope of the user-provided query blob.

    :param start_time: Python datetime to be injected into query
    :param end_time: Python datetime to be injected into query
    :param unique_id: An unused flag that allows the scheduler to hash this
    function uniquely based on its args when it passes through
    :raises PyCodeError: wrapping any exception raised by the query code.
    """
    client = KronosClient(self._app.config['KRONOS_URL'],
                          namespace=self._app.config['KRONOS_NAMESPACE'],
                          blocking=False,
                          sleep_block=0.2)
    locals_dict = {
      'kronos_client': client,
      'events': [],
      'start_time': start_time,
      'end_time': end_time,
    }
    try:
      # Call form of exec behaves identically on Python 2 and 3
      # (replaces the Py2-only `exec ... in` statement).
      exec(self._query, {}, locals_dict)  # No globals.
    except Exception:
      # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
      # propagate; everything raised by user code (including SyntaxError)
      # is still wrapped with its traceback for display.
      _, exception, tb = sys.exc_info()
      raise PyCodeError(exception, traceback.format_tb(tb))

    events = sorted(locals_dict.get('events', []),
                    key=lambda event: event['@time'])
    return events

  def _run_metis(self, start_time, end_time, unique_id=None):
    """Sends the query plan to metis and returns the resulting events

    :param start_time: Python datetime bound for the query plan
    :param end_time: Python datetime bound for the query plan
    :param unique_id: An unused flag that allows the scheduler to hash this
    function uniquely based on its args when it passes through
    """
    start_time = datetime_to_kronos_time(start_time)
    end_time = datetime_to_kronos_time(end_time)
    q = create_metis_query_plan(self._query, start_time, end_time)
    r = requests.post("%s/1.0/query" % self._app.config['METIS_URL'], data=q)
    # Metis responds with one JSON object per line; join them into a single
    # JSON array before parsing.
    return json.loads('[%s]' % ','.join(r.text.splitlines()))

  def compute(self, use_cache=True):
    """Call a user defined query and return events with optional help from
    the cache.

    :param use_cache: Specifies whether the cache should be used when
    possible
    :raises ValueError: if `use_cache` is True but no `bucket_width` was
    given at construction time.
    """
    if use_cache:
      if not self._bucket_width:
        raise ValueError('QueryCompute must be initialized with a bucket_width'
                         ' to use caching features.')
      return list(self._query_cache.retrieve_interval(self._start_time,
                                                      self._end_time,
                                                      compute_missing=True))
    elif self._metis:
      return self._run_metis(self._start_time, self._end_time)
    else:
      return self._run_query(self._start_time, self._end_time)

  def cache(self):
    """Call a user defined query and cache the results

    :raises ValueError: if `bucket_width` or `untrusted_time` was not given
    at construction time.
    """
    if not self._bucket_width or self._untrusted_time is None:
      raise ValueError('QueryCompute must be initialized with a bucket_width '
                       'and an untrusted_time in order to write to the cache.')
    now = datetime.datetime.now()
    untrusted_time = now - datetime.timedelta(seconds=self._untrusted_time)
    list(self._query_cache.compute_and_cache_missing_buckets(self._start_time,
                                                             self._end_time,
                                                             untrusted_time))
def test_cache_layer(self):
  """Exercise QueryCache end-to-end: compute, cache, retrieve, and recover.

  Walks through a fixed sequence of cache operations and checks, at each
  step, how many results come back and how many buckets had to be computed.
  NOTE(review): verify_results appears to take (query_fn, cache,
  expected_result_count, expected_computed_bucket_count) — confirm against
  the helper's definition.
  """
  cache = QueryCache(self.client, self.filter_and_sum, self.bucket_width,
                     self.computed_namespace)
  # Query a window padded 3 buckets beyond the event range on each side.
  start_time = self.start_time - (self.bucket_width * 3)
  end_time = self.start_time + (self.total_events * self.increment) + (
    self.bucket_width * 3)
  # Trusted cutoff lands mid-range, so only earlier buckets may be cached.
  untrusted_time = self.start_time + (
    timedelta(minutes=(self.total_events / 2) - 25))

  # Verify all results were computed correctly.
  self.verify_results(lambda: list(
    cache.compute_and_cache_missing_buckets(start_time, end_time,
                                            untrusted_time)),
    cache, 25, 31)

  # Verify only trusted results are cached.
  self.verify_results(
    lambda: list(cache.retrieve_interval(start_time, end_time)),
    cache, 11, 0)

  # Running the same operations twice should result in the same
  # results as before.
  self.verify_results(
    lambda: list(cache.compute_and_cache_missing_buckets(start_time,
                                                         end_time,
                                                         untrusted_time)),
    cache, 25, 17)
  self.verify_results(
    lambda: list(cache.retrieve_interval(start_time, end_time)),
    cache, 11, 0)

  # Expanding the time range without caching should also result in the same
  # results
  self.verify_results(
    lambda: list(cache.retrieve_interval(start_time - self.bucket_width,
                                         end_time + self.bucket_width)),
    cache, 11, 0)

  # But specifying compute_missing should get all results for the timerange
  self.verify_results(
    lambda: list(cache.retrieve_interval(start_time - self.bucket_width,
                                         end_time + self.bucket_width,
                                         compute_missing=True)),
    cache, 25, 19)

  # Overlapping time queries should result in the same
  # results as before, and benefit from the cache.
  self.verify_results(
    lambda: list(cache.compute_and_cache_missing_buckets(
      start_time - self.bucket_width, end_time + self.bucket_width,
      untrusted_time)),
    cache, 25, 19)
  self.verify_results(
    lambda: list(cache.retrieve_interval(start_time, end_time)),
    cache, 11, 0)

  # Increasing the trusted time should increase the cached results.
  untrusted_time = untrusted_time + timedelta(minutes=40)
  self.verify_results(
    lambda: list(cache.compute_and_cache_missing_buckets(start_time,
                                                         end_time,
                                                         untrusted_time)),
    cache, 25, 17)
  self.verify_results(
    lambda: list(cache.retrieve_interval(start_time, end_time)),
    cache, 13, 0)

  # Decreasing trusted time shouldn't remove results.
  untrusted_time = untrusted_time - timedelta(minutes=40)
  self.verify_results(
    lambda: list(cache.compute_and_cache_missing_buckets(start_time,
                                                         end_time,
                                                         untrusted_time)),
    cache, 25, 15)
  self.verify_results(
    lambda: list(cache.retrieve_interval(start_time, end_time)),
    cache, 13, 0)

  # If there are two cached entries, that cached time should no
  # longer be returned.
  results = list(cache.retrieve_interval(start_time, end_time))
  # Plant a conflicting duplicate directly in the cache's scratch stream.
  duplicate_result = dict(results[10])
  duplicate_result['b_sum'] = 0
  self.client.put({cache._scratch_stream: [duplicate_result]},
                  namespace=cache._scratch_namespace)
  self.client.flush()
  safe_results = list(cache.retrieve_interval(start_time, end_time))
  # The ambiguous bucket (index 10) is dropped from retrieval.
  self.assertEqual(results[:10] + results[11:], safe_results)

  # Rerunning the cache/computation should re-cache the corrupted
  # element.
  self.verify_results(
    lambda: list(cache.compute_and_cache_missing_buckets(start_time,
                                                         end_time,
                                                         untrusted_time)),
    cache, 25, 16)
  self.verify_results(
    lambda: list(cache.retrieve_interval(start_time, end_time)),
    cache, 13, 0)

  # Forcing computation should generate the same result set.
  self.verify_results(
    lambda: list(cache.compute_and_cache_missing_buckets(
      start_time, end_time, untrusted_time, force_recompute=True)),
    cache, 25, 31)
  self.verify_results(
    lambda: list(cache.retrieve_interval(start_time, end_time)),
    cache, 13, 0)