def janitor():
    """Ideally this is run periodically by a systemd service to clean up
    redis and the file system while Fractalis is running.
    """
    data_dir = os.path.join(app.config['FRACTALIS_TMP_DIR'], 'data')
    tracked_ids = [key.split(':')[1] for key in redis.scan_iter('data:*')]
    if not os.path.exists(data_dir):
        # data dir is gone: drop redis entries of finished tasks and stop
        for task_id in tracked_ids:
            async_result = celery.AsyncResult(task_id)
            if async_result.state == 'SUCCESS':
                redis.delete('data:{}'.format(task_id))
        return
    cached_files = [f for f in os.listdir(data_dir)
                    if os.path.isfile(os.path.join(data_dir, f))]
    # clean cached files
    for cached_file in cached_files:
        if cached_file not in tracked_ids:
            sync.remove_file(os.path.join(data_dir, cached_file))
    # clean tracked files
    for task_id in tracked_ids:
        path = os.path.join(data_dir, task_id)
        async_result = celery.AsyncResult(task_id)
        if async_result.state == 'SUCCESS' and not os.path.exists(path):
            redis.delete('data:{}'.format(task_id))
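
# A minimal sketch of how janitor() could be driven, e.g. from a small script
# supervised by systemd as the docstring suggests. The import path and the
# interval are assumptions, not part of the original code.
import time

from fractalis import janitor  # assumed import path

def run_janitor_forever(interval_seconds: int = 60) -> None:
    """Call janitor() every `interval_seconds` until the process is stopped."""
    while True:
        janitor()
        time.sleep(interval_seconds)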
def find_duplicate_task_id(self, data_tasks: List[str],
                           descriptor: dict) -> Union[str, None]:
    """Search for duplicates of the given descriptor and return the task id
    of the first one whose state is SUBMITTED or SUCCESS, meaning the data
    are reusable.
    :param data_tasks: Limit search to this list.
    :param descriptor: ETL descriptor. Used to identify duplicates.
    :return: Task id if a valid duplicate has been found, None otherwise.
    """
    task_ids = self.find_duplicates(data_tasks, descriptor)
    for task_id in task_ids:
        async_result = celery.AsyncResult(task_id)
        if async_result.state in ('SUBMITTED', 'SUCCESS'):
            return task_id
    return None
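
# Hedged usage sketch: how an ETL handler might call find_duplicate_task_id()
# before launching a new ETL. 'handler.start_etl' and the surrounding names
# are hypothetical; only find_duplicate_task_id() comes from the code above.
def create_or_reuse_task(handler, data_tasks: List[str],
                         descriptor: dict) -> str:
    duplicate_id = handler.find_duplicate_task_id(data_tasks, descriptor)
    if duplicate_id is not None:
        # A duplicate in state SUBMITTED or SUCCESS means the data are
        # reusable, so skip the expensive ETL run.
        return duplicate_id
    return handler.start_etl(descriptor)  # hypothetical launch method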
def cleanup_all() -> None:
    """Reset redis, celery and the filesystem. This is only useful
    for testing and should !!!NEVER!!! be used for anything else.
    """
    celery.control.purge()
    for key in redis.keys('data:*'):
        value = redis.get(key)
        try:
            data_state = json.loads(value)
        except ValueError:
            continue
        task_id = data_state.get('task_id')
        if task_id is not None:
            async_result = celery.AsyncResult(task_id)
            if async_result.state == 'SUBMITTED':
                # wait for the submitted task to settle before flushing
                async_result.get(propagate=False)
    redis.flushall()
    tmp_dir = app.config['FRACTALIS_TMP_DIR']
    if os.path.exists(tmp_dir):
        rmtree(tmp_dir)
    assert not os.path.exists(tmp_dir)
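
# Hedged sketch: since cleanup_all() is for testing only, a natural home is a
# pytest fixture that wipes all state after each test. The fixture name and
# import path are assumptions.
import pytest

from fractalis import sync  # assumed import path

@pytest.fixture(autouse=True)
def clean_slate():
    yield  # run the test first
    sync.cleanup_all()  # then reset redis, celery and FRACTALIS_TMP_DIR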
def get_data_state_for_task_id(task_id: str, wait: bool) -> Union[dict, None]:
    """Return data state associated with task id.
    :param task_id: The id associated with the ETL task.
    :param wait: If true and ETL is still running wait for it.
    :return: Data state that has been stored in Redis.
    """
    async_result = celery.AsyncResult(task_id)
    if wait and async_result.state == 'SUBMITTED':
        logger.debug("'wait' was set. Waiting for tasks to finish ...")
        async_result.get(propagate=False)
    value = redis.get('data:{}'.format(task_id))
    if not value:
        return None
    data_state = json.loads(value)
    # add additional information to data_state
    result = async_result.result
    if isinstance(result, Exception):  # Exception -> str
        result = "{}: {}".format(type(result).__name__, str(result))
    data_state['etl_message'] = result
    data_state['etl_state'] = async_result.state
    return data_state
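
# Hedged caller sketch: the keys 'etl_state' and 'etl_message' mirror what
# get_data_state_for_task_id() writes above; the helper itself is illustrative.
def is_etl_finished(task_id: str) -> bool:
    data_state = get_data_state_for_task_id(task_id, wait=False)
    if data_state is None:
        return False  # unknown task id or expired redis entry
    return data_state['etl_state'] == 'SUCCESS'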
def get_task_details(task_id: UUID) -> Tuple[Response, int]:
    """Get task details for the given task_id.
    See doc/api/ for more information.
    :param task_id: ID returned on task creation.
    :return: Flask Response
    """
    logger.debug("Received GET request on /analytics/task_id.")
    wait = request.args.get('wait') == '1'
    task_id = str(task_id)
    if task_id not in session['analytic_tasks']:
        error = "Task ID '{}' not found in session. " \
                "Refusing access.".format(task_id)
        logger.warning(error)
        return jsonify({'error': error}), 403
    async_result = celery.AsyncResult(task_id)
    if wait and async_result.state == 'SUBMITTED':
        async_result.get(propagate=False)
    result = async_result.result
    if isinstance(result, Exception):  # Exception -> str
        result = "{}: {}".format(type(result).__name__, str(result))
    logger.debug("Task found and has access. Sending response.")
    return jsonify({'state': async_result.state, 'result': result}), 200
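
# Hedged client sketch: polling the view above over HTTP. The base URL is an
# assumption; the '/analytics/<task_id>?wait=1' shape follows the logger
# message and the request.args handling in the view.
import requests

def fetch_task_details(base_url: str, task_id: str, cookies: dict) -> dict:
    response = requests.get(
        '{}/analytics/{}'.format(base_url, task_id),
        params={'wait': '1'},  # block server-side while state is SUBMITTED
        cookies=cookies,       # session must already know this task id (403 otherwise)
    )
    response.raise_for_status()
    return response.json()     # {'state': ..., 'result': ...}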
def get_state_data(state_id: UUID) -> Tuple[Response, int]:
    """Check whether every ETL linked to the state_id successfully executed
    for this session. If and only if every ETL completed successfully,
    grant access to the state information.
    :param state_id: ID of the state that is requested.
    :return: Previously saved state.
    """
    logger.debug("Received GET request on /state/<uuid:state_id>.")
    state_id = str(state_id)
    value = redis.get('state:{}'.format(state_id))
    if not value or state_id not in session['state_access']:
        error = "Cannot get state. Make sure to submit a POST request " \
                "to this very same URL containing credentials and server " \
                "data to launch access verification. Only after that may " \
                "a GET request return the saved state."
        logger.error(error)
        return jsonify({'error': error}), 404
    meta_state = json.loads(value)
    state = json.dumps(meta_state['state'])
    for task_id in session['state_access'][state_id]:
        async_result = celery.AsyncResult(task_id)
        if async_result.state == 'SUBMITTED':
            return jsonify({'message': 'ETLs are still running.'}), 202
        elif async_result.state == 'SUCCESS':
            continue
        else:
            error = "One or more ETLs failed or have an unknown status. " \
                    "Assuming no access to saved state."
            logger.error(error)
            return jsonify({'error': error}), 403
    # replace task ids in state with the ids of the freshly loaded data
    for i, task_id in enumerate(meta_state['task_ids']):
        state = re.sub(pattern=task_id,
                       repl=session['state_access'][state_id][i],
                       string=state)
    return jsonify({'state': json.loads(state)}), 200
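
# Hedged client sketch of the two-step flow the 404 message describes: POST
# to trigger access verification, then poll GET until the ETLs finish. URL
# shape and payload keys are assumptions, not part of the original code.
import time

import requests

def load_saved_state(base_url: str, state_id: str, payload: dict) -> dict:
    url = '{}/state/{}'.format(base_url, state_id)
    http = requests.Session()  # keep the session cookie between POST and GET
    http.post(url, json=payload).raise_for_status()  # launch verification ETLs
    while True:
        response = http.get(url)
        if response.status_code == 202:  # 'ETLs are still running.'
            time.sleep(1)
            continue
        response.raise_for_status()      # 403/404 mean no access
        return response.json()['state']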