def _execute(task): """A wrapper around exec This exists outside the Scheduler class because it is pickled after it is sent to the executor. """ print "[%s] -- %s -- START" % (datetime.datetime.now(), task['id']) try: with get_app().app_context(): exec task['code'] in {}, {} print "[%s] -- %s -- COMPLETE" % (datetime.datetime.now(), task['id']) except Exception as e: if isinstance(e, PyCodeError): err_msg = "%s: %s\n%s" % (e.data['name'], e.data['message'], ''.join(e.data['traceback'])) else: err_msg = traceback.format_exc() sys.stderr.write(err_msg) sys.stderr.write("[%s] -- %s -- FAIL\n" % (datetime.datetime.now(), task['id'])) email_msg = 'Task %s failed at %s\n\n%s' % (task['id'], datetime.datetime.now(), err_msg) send_mail(get_app().config['SCHEDULER_FAILURE_EMAILS'], 'Scheduler Failure', email_msg) finally: return task
def _loop(self, reader):
  """Main execution loop of the scheduler.

  The loop runs every second. Between iterations, the loop listens for
  schedule or cancel requests coming from Flask over the gipc pipe (reader)
  and modifies the queue accordingly.

  When a task completes, it is rescheduled `interval` seconds after the
  start of the current iteration (tasks with an interval of 0 are not
  rescheduled). When the executor reports no value for a task, the failure
  is written to stderr and emailed.

  :param reader: Read end of the pipe carrying (command, payload) messages,
    where command is 'schedule' or 'cancel'.
  """
  # Maps every in-flight executor result to the task it was submitted for,
  # so that a failed result can be attributed to the right task.
  in_flight = {}
  while True:
    now = datetime.datetime.now()
    if self._task_queue and self._task_queue[0][0] <= now:
      task = heappop(self._task_queue)[1]
      if task['id'] not in self._pending_cancels:
        result = self._executor.submit(_execute, task)
        in_flight[result] = task
      else:
        # The task was cancelled while queued: drop it instead of running.
        self._pending_cancels.remove(task['id'])
    else:
      # Check for new tasks coming from HTTP. The message is handled inside
      # the Timeout block because `message` is never assigned when the
      # timeout fires.
      with gevent.Timeout(0.5, False) as t:
        message = reader.get(timeout=t)
        if message[0] == 'schedule':
          self._schedule(message[1], next_run=now)
        elif message[0] == 'cancel':
          self._cancel(message[1])

    # Reschedule completed tasks
    if not in_flight:
      gevent.sleep(0.5)
      continue

    ready = self._executor.wait(set(in_flight), num=1, timeout=0.5)
    for result in ready:
      submitted_task = in_flight.pop(result)
      if result.value:
        task = result.value
        interval = int(task['interval'])
        if interval:
          run_at = now + datetime.timedelta(seconds=interval)
          self._schedule(task, next_run=run_at)
      else:
        err_msg = result.exception
        sys.stderr.write("ERROR: %s" % err_msg)
        # Use the task recorded at submission time; the loop variable `task`
        # may refer to a different, more recently popped task by now.
        email_msg = 'Task %s failed at %s\n\n%s' % (
          submitted_task['id'],
          datetime.datetime.now(),
          err_msg
        )
        send_mail(get_app().config['SCHEDULER_FAILURE_EMAILS'],
                  'Scheduler Failure',
                  email_msg)
def __init__(self):
  """Set up the task queue and start the main loop.

  Every active task stored in the database is queued to run immediately.

  _task_queue is a priority queue maintained with Python's heapq functions.
  Its elements are (datetime, task) tuples, where datetime is the scheduled
  run time and task is a dictionary as defined in the docstring of the
  Scheduler class.

  For concurrency safety reasons, never write to _task_queue outside the
  _loop() thread.
  """
  self._task_queue = []  # Never write to this outside the _loop thread
  self._pending_cancels = set()
  self._executor = GIPCExecutor()

  # Restore the previously scheduled tasks from the database. Mutating
  # _task_queue directly is safe here because _loop has not been spawned yet.
  startup_time = datetime.datetime.now()
  with get_app().app_context():
    active_tasks = Task.query.filter_by(active=True)
    self._task_queue.extend(
      (startup_time, {
        'id': stored.id,
        'interval': stored.interval,
        'code': stored.code
      })
      for stored in active_tasks
    )

  # Turn the plain list into a heap-ordered priority queue
  heapify(self._task_queue)

  # Start the main loop on the read end of the pipe and keep the write end
  # for later communication with it
  pipe_reader, pipe_writer = gipc.pipe()
  self._main_thread = gevent.spawn(self._loop, pipe_reader)
  self._schedule_pipe = pipe_writer
  atexit.register(self._interrupt)
def __init__(self, query, timeframe, bucket_width=None, untrusted_time=None,
             metis=False):
  """Initialize QueryCompute

  :param query: A string of python code to execute as a Jia query.
  :param timeframe: A timeframe dictionary. It specifies a mode, which can be
    'recent' or 'range'. Depending on which mode is selected, some of the
    other parameters will be unused. The unused parameters come from the
    frontend for the purposes of storing default/previous values. If the mode
    is 'recent', only 'value' and 'scale' are used. If the mode is 'range',
    only 'from' and 'to' are used.

    Example timeframe:
    timeframe = {
      'mode': 'recent',
      'value': 1,
      'scale': 'days',
      'from': 'Sat Jun 10 2014 00:00:00',
      'to': 'Sun Jun 11 2014 00:00:00',
    }
  :param bucket_width: Optional bucket width in seconds
  :param untrusted_time: Optional untrusted time interval in seconds
  :param metis: Send `query` to metis for computation
  :raises ValueError: If `metis` is false and ALLOW_PYCODE is not enabled in
    the application config.
  """
  try:
    self._app = current_app
    # Binding current_app does not fail by itself; reading an attribute from
    # it is what raises RuntimeError when it cannot be used here.
    self._app.config
  except RuntimeError:
    # Fall back to the application object provided by the scheduler module.
    from scheduler import get_app
    self._app = get_app()

  self._query = query
  self._bucket_width = bucket_width
  self._untrusted_time = untrusted_time
  self._metis = metis
  self._start_time, self._end_time = self._get_timeframe_bounds(
    timeframe, bucket_width)

  config = self._app.config
  self._cache_client = KronosClient(
    config['CACHE_KRONOS_URL'],
    namespace=config['CACHE_KRONOS_NAMESPACE'],
    blocking=False,
    sleep_block=0.2)

  # The query text is passed along as an otherwise unused `unique_id`
  # argument so that the QueryCache hash can tell different queries apart.
  cache_kwargs = {'unique_id': self._query}

  # Choose the function that will actually compute the query.
  if self._metis:
    runner = self._run_metis
  elif config['ALLOW_PYCODE']:
    runner = self._run_query
  else:
    raise ValueError("`metis` must be `True` if ALLOW_PYCODE is not enabled")

  # A query cache is only created when a bucket width was given.
  if self._bucket_width:
    width = datetime.timedelta(seconds=bucket_width)
    self._query_cache = QueryCache(
      self._cache_client,
      runner,
      width,
      config['CACHE_KRONOS_NAMESPACE'],
      query_function_kwargs=cache_kwargs)