def execute_query(self, query, data_source_id, metadata):
    start_time = time.time()
    logger.info("Loading data source (%d)...", data_source_id)

    # TODO: we should probably cache data sources in Redis
    data_source = models.DataSource.get_by_id(data_source_id)

    self.update_state(state='STARTED', meta={'start_time': start_time, 'custom_message': ''})

    logger.info("Executing query:\n%s", query)

    query_hash = gen_query_hash(query)
    query_runner = get_query_runner(data_source.type, data_source.options)

    if query_runner.annotate_query():
        metadata['Task ID'] = self.request.id
        metadata['Query Hash'] = query_hash
        metadata['Queue'] = self.request.delivery_info['routing_key']

        annotation = u", ".join([u"{}: {}".format(k, v) for k, v in metadata.iteritems()])

        logging.debug(u"Annotation: %s", annotation)

        annotated_query = u"/* {} */ {}".format(annotation, query)
    else:
        annotated_query = query

    with statsd_client.timer('query_runner.{}.{}.run_time'.format(data_source.type, data_source.name)):
        data, error = query_runner.run_query(annotated_query)

    run_time = time.time() - start_time
    logger.info("Query finished... data length=%s, error=%s", data and len(data), error)

    self.update_state(state='STARTED', meta={'start_time': start_time, 'error': error, 'custom_message': ''})

    # Delete query_hash
    redis_connection.delete(QueryTask._job_lock_id(query_hash, data_source.id))

    if not error:
        query_result, updated_query_ids = models.QueryResult.store_result(
            data_source.id, query_hash, query, data, run_time, utils.utcnow())
        for query_id in updated_query_ids:
            check_alerts_for_query.delay(query_id)
    else:
        raise Exception(error)

    return query_result.id
def send_failure_report(user_id):
    user = models.User.get_by_id(user_id)
    errors = [json_loads(e) for e in redis_connection.lrange(key(user_id), 0, -1)]

    if errors:
        errors.reverse()
        occurrences = Counter((e.get("id"), e.get("message")) for e in errors)
        unique_errors = {(e.get("id"), e.get("message")): e for e in errors}

        context = {
            "failures": [
                {
                    "id": v.get("id"),
                    "name": v.get("name"),
                    "failed_at": v.get("failed_at"),
                    "failure_reason": v.get("message"),
                    "failure_count": occurrences[k],
                    "comment": comment_for(v),
                }
                for k, v in unique_errors.items()
            ],
            "base_url": base_url(user.org),
        }

        subject = "Redash failed to execute {} of your scheduled queries".format(len(unique_errors.keys()))
        html, text = [
            render_template("emails/failures.{}".format(f), context)
            for f in ["html", "txt"]
        ]

        send_mail.delay([user.email], subject, html, text)

    redis_connection.delete(key(user_id))
def send_failure_report(user_id):
    user = models.User.get_by_id(user_id)
    errors = [json_loads(e) for e in redis_connection.lrange(key(user_id), 0, -1)]

    if errors:
        errors.reverse()
        occurrences = Counter((e.get('id'), e.get('message')) for e in errors)
        unique_errors = {(e.get('id'), e.get('message')): e for e in errors}

        context = {
            'failures': [{
                'id': v.get('id'),
                'name': v.get('name'),
                'failed_at': v.get('failed_at'),
                'failure_reason': v.get('message'),
                'failure_count': occurrences[k],
                'comment': comment_for(v)
            } for k, v in unique_errors.items()],
            'base_url': base_url(user.org)
        }

        html = render_template('emails/failures.html', **context)
        text = render_template('emails/failures.txt', **context)
        subject = "Redash failed to execute {} of your scheduled queries".format(len(unique_errors.keys()))
        send_mail.delay([user.email], subject, html, text)

    redis_connection.delete(key(user_id))
def cleanup_tasks():
    # in case of cold restart of the workers, there might be jobs that still have their "lock" object,
    # but aren't really going to run. this job removes them.
    lock_keys = redis_connection.keys("query_hash_job:*")  # TODO: use set instead of keys command
    if not lock_keys:
        return

    query_tasks = [QueryTask(job_id=j) for j in redis_connection.mget(lock_keys)]

    logger.info("Found %d locks", len(query_tasks))

    inspect = celery.control.inspect()
    active_tasks = inspect.active()
    if active_tasks is None:
        active_tasks = []
    else:
        active_tasks = active_tasks.values()

    all_tasks = set()
    for task_list in active_tasks:
        for task in task_list:
            all_tasks.add(task['id'])

    logger.info("Active jobs count: %d", len(all_tasks))

    for i, t in enumerate(query_tasks):
        if t.ready():
            # if locked task is ready already (failed, finished, revoked), we don't need the lock anymore
            logger.warning("%s is ready (%s), removing lock.", lock_keys[i], t.celery_status)
            redis_connection.delete(lock_keys[i])
def enqueue_query(query, data_source, user_id, scheduled_query=None, metadata={}):
    query_hash = gen_query_hash(query)
    logging.info("Inserting job for %s with metadata=%s", query_hash, metadata)
    try_count = 0
    job = None

    while try_count < 5:
        try_count += 1

        pipe = redis_connection.pipeline()
        try:
            pipe.watch(_job_lock_id(query_hash, data_source.id))
            job_id = pipe.get(_job_lock_id(query_hash, data_source.id))
            if job_id:
                logging.info("[%s] Found existing job: %s", query_hash, job_id)

                job = QueryTask(job_id=job_id)

                if job.ready():
                    logging.info("[%s] job found is ready (%s), removing lock", query_hash, job.celery_status)
                    redis_connection.delete(_job_lock_id(query_hash, data_source.id))
                    job = None

            if not job:
                pipe.multi()

                time_limit = None

                if scheduled_query:
                    queue_name = data_source.scheduled_queue_name
                    scheduled_query_id = scheduled_query.id
                else:
                    queue_name = data_source.queue_name
                    scheduled_query_id = None
                    time_limit = settings.ADHOC_QUERY_TIME_LIMIT

                result = execute_query.apply_async(args=(query, data_source.id, metadata, user_id, scheduled_query_id),
                                                   queue=queue_name,
                                                   time_limit=time_limit)
                job = QueryTask(async_result=result)
                tracker = QueryTaskTracker.create(result.id, 'created', query_hash, data_source.id,
                                                  scheduled_query is not None, metadata)
                tracker.save(connection=pipe)

                logging.info("[%s] Created new job: %s", query_hash, job.id)
                pipe.set(_job_lock_id(query_hash, data_source.id), job.id, settings.JOB_EXPIRY_TIME)
                pipe.execute()
            break

        except redis.WatchError:
            continue

    if not job:
        logging.error("[Manager][%s] Failed adding job for query.", query_hash)

    return job
def delete(self):
    Query.query.filter(Query.data_source == self).update(dict(data_source_id=None, latest_query_data_id=None))
    QueryResult.query.filter(QueryResult.data_source == self).delete()
    res = db.session.delete(self)
    db.session.commit()

    redis_connection.delete(self._schema_key)

    return res
def _compare_and_update(latest_version):
    # TODO: support alpha channel (allow setting which channel to check & parse build number)
    is_newer = semver.compare(current_version, latest_version) == -1
    logging.info("Latest version: %s (newer: %s)", latest_version, is_newer)

    if is_newer:
        redis_connection.set(REDIS_KEY, latest_version)
    else:
        redis_connection.delete(REDIS_KEY)
def setUp(self):
    self.list = "test_list"
    redis_connection.delete(self.list)
    self.keys = []
    for score in range(0, 100):
        key = 'k:{}'.format(score)
        self.keys.append(key)
        redis_connection.zadd(self.list, {key: score})
        redis_connection.set(key, 1)
def setUp(self):
    self.list = "test_list"
    redis_connection.delete(self.list)
    self.keys = []
    for score in range(0, 100):
        key = 'k:{}'.format(score)
        self.keys.append(key)
        redis_connection.zadd(self.list, score, key)
        redis_connection.set(key, 1)
def add_task(cls, query, data_source, scheduled=False, metadata={}):
    query_hash = gen_query_hash(query)
    logging.info("[Manager][%s] Inserting job", query_hash)
    logging.info("[Manager] Metadata: [%s]", metadata)
    try_count = 0
    job = None

    while try_count < cls.MAX_RETRIES:
        try_count += 1

        pipe = redis_connection.pipeline()
        try:
            pipe.watch(cls._job_lock_id(query_hash, data_source.id))
            job_id = pipe.get(cls._job_lock_id(query_hash, data_source.id))
            if job_id:
                logging.info("[Manager][%s] Found existing job: %s", query_hash, job_id)

                job = cls(job_id=job_id)
                if job.ready():
                    logging.info("[%s] job found is ready (%s), removing lock", query_hash, job.celery_status)
                    redis_connection.delete(QueryTask._job_lock_id(query_hash, data_source.id))
                    job = None

            if not job:
                pipe.multi()

                if scheduled:
                    queue_name = data_source.scheduled_queue_name
                else:
                    queue_name = data_source.queue_name

                result = execute_query.apply_async(args=(query, data_source.id, metadata), queue=queue_name)
                job = cls(async_result=result)

                logging.info("[Manager][%s] Created new job: %s", query_hash, job.id)
                pipe.set(cls._job_lock_id(query_hash, data_source.id), job.id, settings.JOB_EXPIRY_TIME)
                pipe.execute()
            break

        except redis.WatchError:
            continue

    if not job:
        logging.error("[Manager][%s] Failed adding job for query.", query_hash)

    return job
def prune(cls, list_name, keep_count, max_keys=100):
    count = redis_connection.zcard(list_name)
    if count <= keep_count:
        return 0

    remove_count = min(max_keys, count - keep_count)
    keys = redis_connection.zrange(list_name, 0, remove_count - 1)
    redis_connection.delete(*keys)
    redis_connection.zremrangebyrank(list_name, 0, remove_count - 1)

    return remove_count
def execute_query(self, query, data_source_id):
    # TODO: maybe this should be a class?
    start_time = time.time()

    logger.info("Loading data source (%d)...", data_source_id)

    # TODO: we should probably cache data sources in Redis
    data_source = models.DataSource.get_by_id(data_source_id)

    self.update_state(state='STARTED', meta={'start_time': start_time, 'custom_message': ''})

    logger.info("Executing query:\n%s", query)

    query_hash = gen_query_hash(query)
    query_runner = get_query_runner(data_source.type, data_source.options)

    if getattr(query_runner, 'annotate_query', True):
        # TODO: annotate with queue name
        annotated_query = "/* Task Id: %s, Query hash: %s */ %s" % (self.request.id, query_hash, query)
    else:
        annotated_query = query

    with statsd_client.timer('query_runner.{}.{}.run_time'.format(data_source.type, data_source.name)):
        data, error = query_runner(annotated_query)

    run_time = time.time() - start_time
    logger.info("Query finished... data length=%s, error=%s", data and len(data), error)

    self.update_state(state='STARTED', meta={'start_time': start_time, 'error': error, 'custom_message': ''})

    # Delete query_hash
    redis_connection.delete(QueryTask._job_lock_id(query_hash, data_source.id))

    # TODO: it is possible that storing the data will fail, and we will need to retry
    # while we already marked the job as done
    if not error:
        query_result = models.QueryResult.store_result(data_source.id, query_hash, query, data, run_time,
                                                       datetime.datetime.utcnow())
    else:
        raise Exception(error)

    return query_result.id
def enqueue_query(query, data_source, scheduled=False, metadata={}):
    query_hash = gen_query_hash(query)
    logging.info("Inserting job for %s with metadata=%s", query_hash, metadata)
    try_count = 0
    job = None

    while try_count < 5:
        try_count += 1

        pipe = redis_connection.pipeline()
        try:
            pipe.watch(_job_lock_id(query_hash, data_source.id))
            job_id = pipe.get(_job_lock_id(query_hash, data_source.id))
            if job_id:
                logging.info("[%s] Found existing job: %s", query_hash, job_id)

                job = QueryTask(job_id=job_id)
                tracker = QueryTaskTracker.get_by_task_id(job_id, connection=pipe)
                # tracker might not exist, if it's an old job
                if scheduled and tracker:
                    tracker.update(retries=tracker.retries + 1)
                elif tracker:
                    tracker.update(scheduled_retries=tracker.scheduled_retries + 1)

                if job.ready():
                    logging.info("[%s] job found is ready (%s), removing lock", query_hash, job.celery_status)
                    redis_connection.delete(_job_lock_id(query_hash, data_source.id))
                    job = None

            if not job:
                pipe.multi()

                if scheduled:
                    queue_name = data_source.scheduled_queue_name
                else:
                    queue_name = data_source.queue_name

                result = execute_query.apply_async(args=(query, data_source.id, metadata), queue=queue_name)
                job = QueryTask(async_result=result)
                tracker = QueryTaskTracker.create(result.id, 'created', query_hash, data_source.id, scheduled, metadata)
                tracker.save(connection=pipe)

                logging.info("[%s] Created new job: %s", query_hash, job.id)
                pipe.set(_job_lock_id(query_hash, data_source.id), job.id, settings.JOB_EXPIRY_TIME)
                pipe.execute()
            break

        except redis.WatchError:
            continue

    if not job:
        logging.error("[Manager][%s] Failed adding job for query.", query_hash)

    return job
def prune(cls, list_name, keep_count):
    count = redis_connection.zcard(list_name)
    if count <= keep_count:
        return 0

    remove_count = count - keep_count
    keys = redis_connection.zrange(list_name, 0, remove_count - 1)
    redis_connection.delete(*keys)  # DEL takes individual key names, not a single list argument
    redis_connection.zremrangebyrank(list_name, 0, remove_count - 1)

    return remove_count
def execute_query(self, query, data_source_id, metadata):
    signal.signal(signal.SIGINT, signal_handler)
    start_time = time.time()
    logger.info("Loading data source (%d)...", data_source_id)

    # TODO: we should probably cache data sources in Redis
    data_source = models.DataSource.get_by_id(data_source_id)

    self.update_state(state="STARTED", meta={"start_time": start_time, "custom_message": ""})

    logger.info("Executing query:\n%s", query)

    query_hash = gen_query_hash(query)
    query_runner = get_query_runner(data_source.type, data_source.options)

    if query_runner.annotate_query():
        metadata["Task ID"] = self.request.id
        metadata["Query Hash"] = query_hash
        metadata["Queue"] = self.request.delivery_info["routing_key"]

        annotation = u", ".join([u"{}: {}".format(k, v) for k, v in metadata.iteritems()])

        logging.debug(u"Annotation: %s", annotation)

        annotated_query = u"/* {} */ {}".format(annotation, query)
    else:
        annotated_query = query

    with statsd_client.timer("query_runner.{}.{}.run_time".format(data_source.type, data_source.name)):
        data, error = query_runner.run_query(annotated_query)

    run_time = time.time() - start_time
    logger.info("Query finished... data length=%s, error=%s", data and len(data), error)

    self.update_state(state="STARTED", meta={"start_time": start_time, "error": error, "custom_message": ""})

    # Delete query_hash
    redis_connection.delete(QueryTask._job_lock_id(query_hash, data_source.id))

    if not error:
        query_result, updated_query_ids = models.QueryResult.store_result(
            data_source.id, query_hash, query, data, run_time, utils.utcnow()
        )
        for query_id in updated_query_ids:
            check_alerts_for_query.delay(query_id)
    else:
        raise Exception(error)

    return query_result.id
def remove_ghost_locks():
    """
    Removes query locks that reference a non existing RQ job.
    """
    keys = redis_connection.keys("query_hash_job:*")
    locks = {k: redis_connection.get(k) for k in keys}
    jobs = list(rq_job_ids())

    count = 0

    for lock, job_id in locks.items():
        if job_id not in jobs:
            redis_connection.delete(lock)
            count += 1

    logger.info("Locks found: {}, Locks removed: {}".format(len(locks), count))
def execute_query(self, query, data_source_id, metadata):
    start_time = time.time()
    logger.info("Loading data source (%d)...", data_source_id)

    # TODO: we should probably cache data sources in Redis
    data_source = models.DataSource.get_by_id(data_source_id)

    self.update_state(state='STARTED', meta={'start_time': start_time, 'custom_message': ''})

    logger.info("Executing query:\n%s", query)

    query_hash = gen_query_hash(query)
    query_runner = get_query_runner(data_source.type, data_source.options)

    if query_runner.annotate_query():
        metadata['Task ID'] = self.request.id
        metadata['Query Hash'] = query_hash
        metadata['Queue'] = self.request.delivery_info['routing_key']

        annotation = u", ".join([u"{}: {}".format(k, v) for k, v in metadata.iteritems()])

        logging.debug(u"Annotation: %s", annotation)

        annotated_query = u"/* {} */ {}".format(annotation, query)
    else:
        annotated_query = query

    with statsd_client.timer('query_runner.{}.{}.run_time'.format(data_source.type, data_source.name)):
        data, error = query_runner.run_query(annotated_query)

    run_time = time.time() - start_time
    logger.info("Query finished... data length=%s, error=%s", data and len(data), error)

    self.update_state(state='STARTED', meta={'start_time': start_time, 'error': error, 'custom_message': ''})

    # Delete query_hash
    redis_connection.delete(QueryTask._job_lock_id(query_hash, data_source.id))

    if not error:
        query_result = models.QueryResult.store_result(data_source.id, query_hash, query, data, run_time, utils.utcnow())
    else:
        raise Exception(error)

    return query_result.id
def setUp(self):
    super(TestRefreshSchemas, self).setUp()
    self.COLUMN_NAME = "first_column"
    self.COLUMN_TYPE = "text"
    self.COLUMN_EXAMPLE = "some text for column value"
    self.EXPECTED_COLUMN_METADATA = {
        "id": 1,
        "org_id": 1,
        "table_id": 1,
        "name": self.COLUMN_NAME,
        "type": self.COLUMN_TYPE,
        "example": self.COLUMN_EXAMPLE,
        "exists": True,
        "description": None,
    }

    get_schema_patcher = patch("redash.query_runner.pg.PostgreSQL.get_schema")
    self.patched_get_schema = get_schema_patcher.start()
    self.addCleanup(get_schema_patcher.stop)
    self.default_schema_return_value = [{
        "name": "table",
        "columns": [self.COLUMN_NAME],
        "metadata": [{
            "name": self.COLUMN_NAME,
            "type": self.COLUMN_TYPE,
        }],
    }]
    self.patched_get_schema.return_value = self.default_schema_return_value

    get_table_sample_patcher = patch("redash.query_runner.BaseQueryRunner.get_table_sample")
    patched_get_table_sample = get_table_sample_patcher.start()
    self.addCleanup(get_table_sample_patcher.stop)
    patched_get_table_sample.return_value = {self.COLUMN_NAME: self.COLUMN_EXAMPLE}

    lock_key = "data_source:schema:refresh:{}:lock".format(self.factory.data_source.id)
    redis_connection.delete(lock_key)
def _compare_and_update(latest_version):
    # http://taobaofed.org/blog/2016/08/05/instructions-of-semver/
    # Semantic Versioning (semver):
    #   Regular version numbers, e.g. 0.1.0:
    #     major (incompatible changes), minor (backwards compatible), patch (small fixes).
    #   Pre-release version numbers, e.g. "1.0.0-beta.1"; the <stage> part is usually alpha, beta or rc.
    #   Comparison looks at the regular version number first; pre-release tags are then compared
    #   in ASCII order.
    is_newer = semver.compare(current_version, latest_version) == -1
    logging.info("Latest version: %s (newer: %s)", latest_version, is_newer)

    if is_newer:
        redis_connection.set(REDIS_KEY, latest_version)
    else:
        redis_connection.delete(REDIS_KEY)
def _unlock(query_hash, data_source_id):
    redis_connection.delete(_job_lock_id(query_hash, data_source_id))
def enqueue_query(query, data_source, user_id, is_api_key=False, scheduled_query=None, metadata={}):
    query_hash = gen_query_hash(query)
    logging.info("Inserting job for %s with metadata=%s", query_hash, metadata)
    try_count = 0
    job = None

    while try_count < 5:
        try_count += 1

        pipe = redis_connection.pipeline()
        try:
            pipe.watch(_job_lock_id(query_hash, data_source.id))
            job_id = pipe.get(_job_lock_id(query_hash, data_source.id))
            if job_id:
                logging.info("[%s] Found existing job: %s", query_hash, job_id)
                job = QueryTask(job_id=job_id)

                if job.ready():
                    logging.info("[%s] job found is ready (%s), removing lock", query_hash, job.celery_status)
                    redis_connection.delete(_job_lock_id(query_hash, data_source.id))
                    job = None

            if not job:
                pipe.multi()

                if scheduled_query:
                    queue_name = data_source.scheduled_queue_name
                    scheduled_query_id = scheduled_query.id
                else:
                    queue_name = data_source.queue_name
                    scheduled_query_id = None

                args = (query, data_source.id, metadata, user_id, scheduled_query_id, is_api_key)
                argsrepr = json_dumps({
                    'org_id': data_source.org_id,
                    'data_source_id': data_source.id,
                    'enqueue_time': time.time(),
                    'scheduled': scheduled_query_id is not None,
                    'query_id': metadata.get('Query ID'),
                    'user_id': user_id
                })

                time_limit = settings.dynamic_settings.query_time_limit(scheduled_query, user_id, data_source.org_id)

                result = execute_query.apply_async(args=args,
                                                   argsrepr=argsrepr,
                                                   queue=queue_name,
                                                   time_limit=time_limit)

                job = QueryTask(async_result=result)

                logging.info("[%s] Created new job: %s", query_hash, job.id)
                pipe.set(_job_lock_id(query_hash, data_source.id), job.id, settings.JOB_EXPIRY_TIME)
                pipe.execute()
            break

        except redis.WatchError:
            continue

    if not job:
        logging.error("[Manager][%s] Failed adding job for query.", query_hash)

    return job
def resume(self):
    redis_connection.delete(self._pause_key())
from redash import redis_connection

if __name__ == '__main__':
    redis_connection.delete('query_task_trackers')
import atfork
atfork.monkeypatch_os_fork_functions()
import atfork.stdlib_fixer
atfork.stdlib_fixer.fix_logging_module()

import time
from redash.data import worker
from redash import models, data_manager, redis_connection

if __name__ == '__main__':
    models.create_db(True, False)

    print "Creating data source..."
    data_source = models.DataSource.create(name="Concurrency", type="pg", options="dbname=postgres")

    print "Clear jobs/hashes:"
    redis_connection.delete("jobs")
    query_hashes = redis_connection.keys("query_hash_*")
    if query_hashes:
        redis_connection.delete(*query_hashes)

    starting_query_results_count = models.QueryResult.select().count()
    jobs_count = 5000
    workers_count = 10

    print "Creating jobs..."
    for i in xrange(jobs_count):
        query = "SELECT {}".format(i)
        print "Inserting: {}".format(query)
        data_manager.add_job(query=query, priority=worker.Job.LOW_PRIORITY, data_source=data_source)
def execute_query(self, query, data_source_id, metadata):
    signal.signal(signal.SIGINT, signal_handler)
    start_time = time.time()

    logger.info("task=execute_query state=load_ds ds_id=%d", data_source_id)

    data_source = models.DataSource.get_by_id(data_source_id)

    self.update_state(state='STARTED', meta={'start_time': start_time, 'custom_message': ''})

    logger.debug("Executing query:\n%s", query)

    query_hash = gen_query_hash(query)
    query_runner = data_source.query_runner

    logger.info("task=execute_query state=before query_hash=%s type=%s ds_id=%d task_id=%s queue=%s query_id=%s username=%s",
                query_hash, data_source.type, data_source.id, self.request.id,
                self.request.delivery_info['routing_key'],
                metadata.get('Query ID', 'unknown'), metadata.get('Username', 'unknown'))

    if query_runner.annotate_query():
        metadata['Task ID'] = self.request.id
        metadata['Query Hash'] = query_hash
        metadata['Queue'] = self.request.delivery_info['routing_key']

        annotation = u", ".join([u"{}: {}".format(k, v) for k, v in metadata.iteritems()])

        logging.debug(u"Annotation: %s", annotation)

        annotated_query = u"/* {} */ {}".format(annotation, query)
    else:
        annotated_query = query

    with statsd_client.timer('query_runner.{}.{}.run_time'.format(data_source.type, data_source.name)):
        data, error = query_runner.run_query(annotated_query)

    logger.info("task=execute_query state=after query_hash=%s type=%s ds_id=%d task_id=%s queue=%s query_id=%s username=%s",
                query_hash, data_source.type, data_source.id, self.request.id,
                self.request.delivery_info['routing_key'],
                metadata.get('Query ID', 'unknown'), metadata.get('Username', 'unknown'))

    run_time = time.time() - start_time
    logger.info("Query finished... data length=%s, error=%s", data and len(data), error)

    self.update_state(state='STARTED', meta={'start_time': start_time, 'error': error, 'custom_message': ''})

    # Delete query_hash
    redis_connection.delete(QueryTask._job_lock_id(query_hash, data_source.id))

    if not error:
        query_result, updated_query_ids = models.QueryResult.store_result(
            data_source.org_id, data_source.id, query_hash, query, data, run_time, utils.utcnow())
        logger.info("task=execute_query state=after_store query_hash=%s type=%s ds_id=%d task_id=%s queue=%s query_id=%s username=%s",
                    query_hash, data_source.type, data_source.id, self.request.id,
                    self.request.delivery_info['routing_key'],
                    metadata.get('Query ID', 'unknown'), metadata.get('Username', 'unknown'))
        for query_id in updated_query_ids:
            check_alerts_for_query.delay(query_id)
        logger.info("task=execute_query state=after_alerts query_hash=%s type=%s ds_id=%d task_id=%s queue=%s query_id=%s username=%s",
                    query_hash, data_source.type, data_source.id, self.request.id,
                    self.request.delivery_info['routing_key'],
                    metadata.get('Query ID', 'unknown'), metadata.get('Username', 'unknown'))
    else:
        raise QueryExecutionError(error)

    return query_result.id
def setUp(self):
    self.list = "test_list"
    redis_connection.delete(self.list)
    for score in range(0, 100):
        redis_connection.zadd(self.list, score, 'k:{}'.format(score))
def enqueue_query(query, data_source, user_id, is_api_key=False, scheduled_query=None, metadata={}):
    query_hash = gen_query_hash(query)
    logging.info("Inserting job for %s with metadata=%s", query_hash, metadata)
    try_count = 0
    job = None

    while try_count < 5:
        try_count += 1

        pipe = redis_connection.pipeline()
        try:
            pipe.watch(_job_lock_id(query_hash, data_source.id))
            job_id = pipe.get(_job_lock_id(query_hash, data_source.id))
            if job_id:
                logging.info("[%s] Found existing job: %s", query_hash, job_id)
                job = QueryTask(job_id=job_id)

                if job.ready():
                    logging.info("[%s] job found is ready (%s), removing lock", query_hash, job.celery_status)
                    redis_connection.delete(_job_lock_id(query_hash, data_source.id))
                    job = None

            if not job:
                pipe.multi()

                time_limit = None

                if scheduled_query:
                    queue_name = data_source.scheduled_queue_name
                    scheduled_query_id = scheduled_query.id
                else:
                    queue_name = data_source.queue_name
                    scheduled_query_id = None
                    time_limit = settings.ADHOC_QUERY_TIME_LIMIT

                args = (query, data_source.id, metadata, user_id, scheduled_query_id, is_api_key)
                argsrepr = json_dumps({
                    'org_id': data_source.org_id,
                    'data_source_id': data_source.id,
                    'enqueue_time': time.time(),
                    'scheduled': scheduled_query_id is not None,
                    'query_id': metadata.get('Query ID'),
                    'user_id': user_id
                })

                result = execute_query.apply_async(args=args,
                                                   argsrepr=argsrepr,
                                                   queue=queue_name,
                                                   time_limit=time_limit)

                job = QueryTask(async_result=result)

                logging.info("[%s] Created new job: %s", query_hash, job.id)
                pipe.set(_job_lock_id(query_hash, data_source.id), job.id, settings.JOB_EXPIRY_TIME)
                pipe.execute()
            break

        except redis.WatchError:
            continue

    if not job:
        logging.error("[Manager][%s] Failed adding job for query.", query_hash)

    return job
def enqueue_query(query, data_source, user_id, is_api_key=False, scheduled_query=None, metadata={}):
    query_hash = gen_query_hash(query)
    logger.info("Inserting job for %s with metadata=%s", query_hash, metadata)
    try_count = 0
    job = None

    while try_count < 5:
        try_count += 1

        pipe = redis_connection.pipeline()
        try:
            pipe.watch(_job_lock_id(query_hash, data_source.id))
            job_id = pipe.get(_job_lock_id(query_hash, data_source.id))
            if job_id:
                logger.info("[%s] Found existing job: %s", query_hash, job_id)
                job_complete = None

                try:
                    job = Job.fetch(job_id)
                    job_exists = True
                    status = job.get_status()
                    job_complete = status in [JobStatus.FINISHED, JobStatus.FAILED]

                    if job_complete:
                        message = "job found is complete (%s)" % status
                except NoSuchJobError:
                    message = "job found has expired"
                    job_exists = False

                if job_complete or not job_exists:
                    logger.info("[%s] %s, removing lock", query_hash, message)
                    redis_connection.delete(_job_lock_id(query_hash, data_source.id))
                    job = None

            if not job:
                pipe.multi()

                if scheduled_query:
                    queue_name = data_source.scheduled_queue_name
                    scheduled_query_id = scheduled_query.id
                else:
                    queue_name = data_source.queue_name
                    scheduled_query_id = None

                time_limit = settings.dynamic_settings.query_time_limit(scheduled_query, user_id, data_source.org_id)
                metadata["Queue"] = queue_name

                queue = Queue(queue_name)
                enqueue_kwargs = {
                    "user_id": user_id,
                    "scheduled_query_id": scheduled_query_id,
                    "is_api_key": is_api_key,
                    "job_timeout": time_limit,
                    "meta": {
                        "data_source_id": data_source.id,
                        "org_id": data_source.org_id,
                        "scheduled": scheduled_query_id is not None,
                        "query_id": metadata.get("Query ID"),
                        "user_id": user_id,
                    },
                }

                if not scheduled_query:
                    enqueue_kwargs["result_ttl"] = settings.JOB_EXPIRY_TIME

                job = queue.enqueue(execute_query, query, data_source.id, metadata, **enqueue_kwargs)

                logger.info("[%s] Created new job: %s", query_hash, job.id)
                pipe.set(
                    _job_lock_id(query_hash, data_source.id),
                    job.id,
                    settings.JOB_EXPIRY_TIME,
                )
                pipe.execute()
            break

        except redis.WatchError:
            continue

    if not job:
        logger.error("[Manager][%s] Failed adding job for query.", query_hash)

    return job
def enqueue_query(query, data_source, user_id, is_api_key=False, scheduled_query=None, metadata={}):
    query_id = metadata.get("Query ID", "unknown")
    query_hash = gen_query_hash(query)
    get_logger().info("[query_id=%s] [query_hash=%s] Inserting job", query_id, query_hash)
    try_count = 0
    job = None

    while try_count < 5:
        try_count += 1

        pipe = redis_connection.pipeline()
        try:
            pipe.watch(_job_lock_id(query_hash, data_source.id))
            job_id = pipe.get(_job_lock_id(query_hash, data_source.id))
            if job_id:
                job_status = "UNKNOWN"
                job_complete = False
                job_cancelled = "False"

                try:
                    job = Job.fetch(job_id)
                    job_exists = True
                    job_status = job.get_status()
                    job_complete = job_status in [JobStatus.FINISHED, JobStatus.FAILED]
                    if job.is_cancelled:
                        job_cancelled = "True"
                except NoSuchJobError:
                    job_exists = False
                    job_status = "EXPIRED"

                get_logger().info(
                    "[query_id=%s] [query_hash=%s] Found existing job [job.id=%s] [job_status=%s] [job_cancelled=%s]",
                    query_id, query_hash, job_id, job_status, job_cancelled)

                if job_complete or (not job_exists):
                    # get_logger().info("[query_id=%s] [query_hash=%s] [job.id=%s], removing redis lock", query_id, query_hash, job_id)
                    redis_connection.delete(_job_lock_id(query_hash, data_source.id))
                    job = None

            if not job:
                pipe.multi()

                if scheduled_query:
                    queue_name = data_source.scheduled_queue_name  # defaults to the scheduled_queries queue
                    scheduled_query_id = scheduled_query.id
                else:
                    queue_name = data_source.queue_name  # defaults to the queries queue
                    scheduled_query_id = None

                time_limit = settings.dynamic_settings.query_time_limit(scheduled_query, user_id, data_source.org_id)
                metadata["Queue"] = queue_name
                metadata["Enqueue Time"] = time.time()

                queue = Queue(queue_name)
                enqueue_kwargs = {
                    "user_id": user_id,
                    "scheduled_query_id": scheduled_query_id,
                    "is_api_key": is_api_key,
                    "job_timeout": time_limit,
                    "meta": {
                        "data_source_id": data_source.id,
                        "org_id": data_source.org_id,
                        "scheduled": scheduled_query_id is not None,
                        "query_id": query_id,
                        "user_id": user_id,
                    },
                }

                if not scheduled_query:
                    enqueue_kwargs["result_ttl"] = settings.JOB_EXPIRY_TIME

                job = queue.enqueue(execute_query, query, data_source.id, metadata, **enqueue_kwargs)

                get_logger().info("[query_id=%s] [query_hash=%s] Created new job [job.id=%s]", query_id, query_hash, job.id)
                pipe.set(
                    _job_lock_id(query_hash, data_source.id),
                    job.id,
                    settings.JOB_EXPIRY_TIME,
                )
                pipe.execute()
            break

        except redis.WatchError:
            get_logger().error("[query_id=%s] [query_hash=%s] redis.WatchError, try_count = %d", query_id, query_hash, try_count)
            continue

    if not job:
        get_logger().error("[Manager] [query_id=%s] [query_hash=%s] Failed adding job for query.", query_id, query_hash)

    return job
def enqueue_query(query, data_source, user_id, is_api_key=False, scheduled_query=None, metadata={}):
    query_hash = gen_query_hash(query)
    logging.info("Inserting job for %s with metadata=%s", query_hash, metadata)
    try_count = 0
    job = None

    while try_count < 5:
        try_count += 1

        pipe = redis_connection.pipeline()
        try:
            pipe.watch(_job_lock_id(query_hash, data_source.id))
            job_id = pipe.get(_job_lock_id(query_hash, data_source.id))
            if job_id:
                logging.info("[%s] Found existing job: %s", query_hash, job_id)
                job = Job.fetch(job_id)

                status = job.get_status()
                if status in [JobStatus.FINISHED, JobStatus.FAILED]:
                    logging.info("[%s] job found is ready (%s), removing lock", query_hash, status)
                    redis_connection.delete(_job_lock_id(query_hash, data_source.id))
                    job = None

            if not job:
                pipe.multi()

                if scheduled_query:
                    queue_name = data_source.scheduled_queue_name
                    scheduled_query_id = scheduled_query.id
                else:
                    queue_name = data_source.queue_name
                    scheduled_query_id = None

                time_limit = settings.dynamic_settings.query_time_limit(scheduled_query, user_id, data_source.org_id)
                metadata["Queue"] = queue_name

                queue = Queue(queue_name)
                job = queue.enqueue(
                    execute_query,
                    query,
                    data_source.id,
                    metadata,
                    user_id=user_id,
                    scheduled_query_id=scheduled_query_id,
                    is_api_key=is_api_key,
                    job_timeout=time_limit,
                )

                logging.info("[%s] Created new job: %s", query_hash, job.id)
                pipe.set(
                    _job_lock_id(query_hash, data_source.id),
                    job.id,
                    settings.JOB_EXPIRY_TIME,
                )
                pipe.execute()
            break

        except redis.WatchError:
            continue

    if not job:
        logging.error("[Manager][%s] Failed adding job for query.", query_hash)

    return job