def query_execution_to_es(query_execution, data_cell=None, session=None):
    """data_cell is added as a parameter so that bulk insert of query
    executions won't require re-retrieval of data_cell"""
    query_execution_id = query_execution.id
    engine_id = query_execution.engine_id
    engine = get_query_engine_by_id(engine_id, session=session)

    table_names, _ = process_query(
        query_execution.query, language=(engine and engine.language)
    )
    table_names = list(chain.from_iterable(table_names))

    duration = (
        DATETIME_TO_UTC(query_execution.completed_at)
        - DATETIME_TO_UTC(query_execution.created_at)
        if query_execution.completed_at is not None
        else None
    )

    environments = engine.environments
    environment_ids = [env.id for env in environments]
    title = data_cell.meta.get("title", "Untitled") if data_cell else None

    expand_query_execution = {
        "id": query_execution_id,
        "query_type": "query_execution",
        "title": title,
        "environment_id": environment_ids,
        "author_uid": query_execution.uid,
        "engine_id": engine_id,
        "statement_type": get_table_statement_type(query_execution.query),
        "created_at": DATETIME_TO_UTC(query_execution.created_at),
        "duration": duration,
        "full_table_name": table_names,
        "query_text": query_execution.query,
    }

    return expand_query_execution
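# A note on the flattening above: process_query (repo-internal) appears to
# return table names grouped per statement, so chain.from_iterable collapses
# the groups into one flat list. The exact shape is an assumption inferred
# from the call site, not a confirmed contract of process_query:
from itertools import chain

grouped = [["main.users"], ["main.orders", "main.users"]]  # hypothetical output
flattened = list(chain.from_iterable(grouped))
# -> ["main.users", "main.orders", "main.users"]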
def get_table_and_columns(
    self, schema_name, table_name
) -> Tuple[DataTable, List[DataColumn]]:
    raw_table_info = next(
        iter(
            self._engine.execute(
                f"""
                SELECT
                    TABLE_TYPE,
                    CREATE_TIME,
                    UPDATE_TIME,
                    data_length + index_length
                FROM INFORMATION_SCHEMA.TABLES
                WHERE TABLE_SCHEMA="{schema_name}"
                  AND TABLE_NAME="{table_name}"
                """
            )
        ),
        None,
    )
    if not raw_table_info:
        return None, []

    table = DataTable(
        name=table_name,
        type=raw_table_info[0],
        owner=None,
        table_created_at=DATETIME_TO_UTC(raw_table_info[1])
        if raw_table_info[1] is not None
        else None,
        table_updated_by=None,
        table_updated_at=DATETIME_TO_UTC(raw_table_info[2])
        if raw_table_info[2] is not None
        else None,
        data_size_bytes=raw_table_info[3],
        location=None,
        partitions=None,
        raw_description=ujson.dumps(list(raw_table_info)),
    )

    raw_columns = self._inspect.get_columns(table_name=table_name, schema=schema_name)
    columns = [
        DataColumn(
            name=col["name"],
            type=str(col["type"]),
            comment=f"Default:{col['default']} Nullable:{col['nullable']}",
        )
        for col in raw_columns
    ]
    return table, columns
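# The interpolated f-string above is injection-prone if a schema or table name
# contains quotes. A minimal sketch of a bound-parameter variant using
# sqlalchemy.text, assuming self._engine is a SQLAlchemy 1.x Engine; the
# helper name _fetch_raw_table_info is hypothetical:
def _fetch_raw_table_info(engine, schema_name, table_name):
    from sqlalchemy import text

    stmt = text(
        "SELECT TABLE_TYPE, CREATE_TIME, UPDATE_TIME, "
        "data_length + index_length "
        "FROM INFORMATION_SCHEMA.TABLES "
        "WHERE TABLE_SCHEMA = :schema_name AND TABLE_NAME = :table_name"
    )
    # Engine.execute(text_clause, **params) binds :schema_name / :table_name
    # as parameters instead of interpolating them into the SQL string.
    return next(
        iter(engine.execute(stmt, schema_name=schema_name, table_name=table_name)),
        None,
    )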
def datadocs_to_es(datadoc, session=None):
    title = datadoc.title

    cells_as_text = []
    for cell in datadoc.cells:
        if cell.cell_type == DataCellType.text:
            cells_as_text.append(richtext_to_plaintext(cell.context))
        elif cell.cell_type == DataCellType.query:
            cell_title = cell.meta.get("title", "")
            cell_text = (
                cell.context if not cell_title else f"{cell_title}\n{cell.context}"
            )
            cells_as_text.append(cell_text)
        else:
            cells_as_text.append("[... additional unparsable content ...]")
    joined_cells = escape("\n".join(cells_as_text))

    # There is no need to compute the list of editors for a public datadoc,
    # since everyone is able to see it
    editors = (
        [
            editor.uid
            for editor in get_data_doc_editors_by_doc_id(
                data_doc_id=datadoc.id, session=session
            )
        ]
        if not datadoc.public
        else []
    )
    expand_datadoc = {
        "id": datadoc.id,
        "environment_id": datadoc.environment_id,
        "owner_uid": datadoc.owner_uid,
        "created_at": DATETIME_TO_UTC(datadoc.created_at),
        "cells": joined_cells,
        "title": title,
        "public": datadoc.public,
        "readable_user_ids": editors,
    }
    return expand_datadoc
def query_cell_to_es(query_cell, session=None):
    query_cell_id = query_cell.id
    query_cell_meta = query_cell.meta

    engine_id = query_cell_meta.get("engine")
    engine = get_query_engine_by_id(engine_id, session=session)

    query = query_cell.context
    table_names, _ = process_query(query, language=(engine and engine.language))
    table_names = list(chain.from_iterable(table_names))

    datadoc = query_cell.doc

    expand_query = {
        "id": query_cell_id,
        "query_type": "query_cell",
        "title": query_cell_meta.get("title", "Untitled"),
        "data_doc_id": datadoc and datadoc.id,
        "environment_id": datadoc and datadoc.environment_id,
        "author_uid": datadoc and datadoc.owner_uid,
        "engine_id": engine_id,
        "statement_type": get_table_statement_type(query),
        "created_at": DATETIME_TO_UTC(query_cell.created_at),
        "full_table_name": table_names,
        "query_text": query,
    }

    return expand_query
def table_to_es(table, session=None):
    schema = table.data_schema

    column_names = [c.name for c in table.columns]
    schema_name = schema.name
    table_name = table.name
    description = (
        richtext_to_plaintext(table.information.description, escape=True)
        if table.information
        else ""
    )

    full_name = "{}.{}".format(schema_name, table_name)
    weight = get_table_weight(table.id, session=session)

    expand_table = {
        "id": table.id,
        "metastore_id": schema.metastore_id,
        "schema": schema_name,
        "name": table_name,
        "full_name": full_name,
        "full_name_ngram": full_name,
        "completion_name": {
            "input": [full_name, table_name],
            "weight": weight,
            "contexts": {"metastore_id": schema.metastore_id},
        },
        "description": description,
        "created_at": DATETIME_TO_UTC(table.created_at),
        "columns": column_names,
        "golden": table.golden,
        "importance_score": weight,
        "tags": [tag.tag_name for tag in table.tags],
    }
    return expand_table
def generate_presigned_url(self, blob_name, method="GET", expires_in=86400, params=None):
    # Avoid a mutable default argument; params is only forwarded to
    # generate_signed_url as extra keyword arguments.
    params = params or {}
    blob = self._bucket.blob(blob_name)
    if blob.exists():
        return blob.generate_signed_url(
            # expiration is an absolute epoch timestamp: now (UTC) + expires_in seconds
            expiration=expires_in + DATETIME_TO_UTC(datetime.utcnow()),
            method=method,
            **params,
        )
    return None
def table_to_es(table, fields=None, session=None):
    schema = table.data_schema

    schema_name = schema.name
    table_name = table.name
    full_name = "{}.{}".format(schema_name, table_name)

    def get_table_description():
        return (
            richtext_to_plaintext(table.information.description, escape=True)
            if table.information
            else ""
        )

    weight = None

    def compute_weight():
        nonlocal weight
        if weight is None:
            weight = get_table_weight(table.id, session=session)
        return weight

    def get_completion_name():
        return {
            "input": [full_name, table_name],
            "weight": compute_weight(),
            "contexts": {"metastore_id": schema.metastore_id},
        }

    field_to_getter = {
        "id": table.id,
        "metastore_id": schema.metastore_id,
        "schema": schema_name,
        "name": table_name,
        "full_name": full_name,
        "full_name_ngram": full_name,
        "completion_name": get_completion_name,
        "description": get_table_description,
        "created_at": lambda: DATETIME_TO_UTC(table.created_at),
        "columns": [c.name for c in table.columns],
        "golden": table.golden,
        "importance_score": compute_weight,
        "tags": [tag.tag_name for tag in table.tags],
    }

    return _get_dict_by_field(field_to_getter, fields=fields)
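# _get_dict_by_field is not shown in this section. A minimal sketch of what it
# plausibly does, given how field_to_getter mixes plain values with lazy
# getters; the fields-filtering behavior here is an assumption:
def _get_dict_by_field(field_to_getter, fields=None):
    # Restrict to the requested fields, or emit everything when fields is None.
    field_names = fields if fields is not None else field_to_getter.keys()
    result = {}
    for field in field_names:
        getter = field_to_getter[field]
        # Callables are resolved lazily so unrequested fields cost nothing.
        result[field] = getter() if callable(getter) else getter
    return result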
def serialize_value(value):
    if value:
        # TODO: since jsonify also converts some of these types,
        # decide on which conversion is required
        if isinstance(value, datetime):
            return DATETIME_TO_UTC(value)
        elif isinstance(value, date):
            return DATE_TO_UTC(value)
        elif isinstance(value, Enum):
            return value.value
        elif isinstance(value, dict):
            return {k: serialize_value(v) for k, v in value.items()}
        elif isinstance(value, (list, tuple)):
            return value.__class__(map(serialize_value, value))
        elif hasattr(value, "to_dict"):
            return value.to_dict()
    return value
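# A small usage sketch of serialize_value on a nested structure; the Enum and
# the payload values are made up for illustration:
from datetime import datetime
from enum import Enum

class Status(Enum):
    DONE = 2

payload = {
    "finished_at": datetime(2021, 1, 1),
    "status": Status.DONE,
    "attempts": [1, 2],
}
# Recurses through the dict and list, yielding
# {"finished_at": <DATETIME_TO_UTC(...), presumably epoch seconds>,
#  "status": 2, "attempts": [1, 2]}.
serialized = serialize_value(payload)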
def query_execution_to_es(query_execution, data_cell=None, fields=None, session=None):
    """data_cell is added as a parameter so that bulk insert of query
    executions won't require re-retrieval of data_cell"""
    engine_id = query_execution.engine_id
    engine = admin_logic.get_query_engine_by_id(engine_id, session=session)
    datadoc = data_cell.doc if data_cell else None

    def get_duration():
        return (
            DATETIME_TO_UTC(query_execution.completed_at)
            - DATETIME_TO_UTC(query_execution.created_at)
            if query_execution.completed_at is not None
            else None
        )

    field_to_getter = {
        "id": query_execution.id,
        "query_type": "query_execution",
        "title": data_cell.meta.get("title", "Untitled") if data_cell else None,
        "environment_id": [env.id for env in engine.environments],
        "author_uid": query_execution.uid,
        "engine_id": engine_id,
        "statement_type": lambda: get_table_statement_type(query_execution.query),
        "created_at": lambda: DATETIME_TO_UTC(query_execution.created_at),
        "duration": get_duration,
        "full_table_name": lambda: _get_table_names_from_query(
            query_execution.query, language=(engine and engine.language)
        ),
        "query_text": query_execution.query,
        "public": datadoc is None or datadoc.public,
        "readable_user_ids": lambda: _get_datadoc_editors(datadoc, session=session),
    }

    return _get_dict_by_field(field_to_getter, fields=fields)
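# Neither _get_table_names_from_query nor _get_datadoc_editors is shown in
# this section. Minimal sketches, assuming they factor out the logic that the
# eager variants above inline (the process_query flattening from
# query_execution_to_es, and the public-doc editor shortcut from
# datadocs_to_es):
def _get_table_names_from_query(query, language=None):
    table_names, _ = process_query(query, language=language)
    return list(chain.from_iterable(table_names))

def _get_datadoc_editors(datadoc, session=None):
    # Public docs are readable by everyone, so no editor list is needed.
    if datadoc is None or datadoc.public:
        return []
    return [
        editor.uid
        for editor in get_data_doc_editors_by_doc_id(
            data_doc_id=datadoc.id, session=session
        )
    ]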
def run_sample_query(
    self, table_id, engine_id, uid, limit, partition, where, order_by, order_by_asc
):
    # Initialize progress to 0 for polling purposes
    self.update_state(state="PROGRESS", meta=0)

    with DBSession() as session:
        query = make_samples_query(
            table_id,
            limit=limit,
            partition=partition,
            where=where,
            order_by=order_by,
            order_by_asc=order_by_asc,
            session=session,
        )
        async_execute_query = ExecuteQuery(True)
        async_execute_query(query, engine_id, uid=uid, session=session)
        while not async_execute_query.poll():
            self.update_state(state="PROGRESS", meta=async_execute_query.progress)

        results = {
            "created_at": DATETIME_TO_UTC(datetime.now()),
            "value": async_execute_query.result,
            "engine_id": engine_id,
            "created_by": uid,
        }

        mysql_cache.set_key(
            f"table_samples_{table_id}_{uid}",
            results,
            expires_after=seconds_in_a_day,
            session=session,
        )
def datadocs_to_es(datadoc, fields=None, session=None):
    field_to_getter = {
        "id": datadoc.id,
        "environment_id": datadoc.environment_id,
        "owner_uid": datadoc.owner_uid,
        "created_at": lambda: DATETIME_TO_UTC(datadoc.created_at),
        "cells": lambda: get_joined_cells(datadoc),
        "title": datadoc.title,
        "public": datadoc.public,
        "readable_user_ids": lambda: _get_datadoc_editors(datadoc, session=session),
    }
    return _get_dict_by_field(field_to_getter, fields=fields)
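# get_joined_cells is not shown in this section. A sketch assuming it extracts
# the cell-joining loop from the eager datadocs_to_es above:
def get_joined_cells(datadoc):
    cells_as_text = []
    for cell in datadoc.cells:
        if cell.cell_type == DataCellType.text:
            cells_as_text.append(richtext_to_plaintext(cell.context))
        elif cell.cell_type == DataCellType.query:
            cell_title = cell.meta.get("title", "")
            cells_as_text.append(
                cell.context if not cell_title else f"{cell_title}\n{cell.context}"
            )
        else:
            cells_as_text.append("[... additional unparsable content ...]")
    return escape("\n".join(cells_as_text))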
def query_cell_to_es(query_cell, fields=None, session=None):
    query_cell_meta = query_cell.meta
    query = query_cell.context
    datadoc = query_cell.doc

    engine_id = query_cell_meta.get("engine")
    engine = admin_logic.get_query_engine_by_id(engine_id, session=session)

    field_to_getter = {
        "id": query_cell.id,
        "query_type": "query_cell",
        "title": query_cell_meta.get("title", "Untitled"),
        "data_doc_id": datadoc and datadoc.id,
        "environment_id": datadoc and datadoc.environment_id,
        "author_uid": datadoc and datadoc.owner_uid,
        "engine_id": engine_id,
        "statement_type": lambda: get_table_statement_type(query),
        "created_at": lambda: DATETIME_TO_UTC(query_cell.created_at),
        "full_table_name": lambda: _get_table_names_from_query(
            query, language=(engine and engine.language)
        ),
        "query_text": query,
        "public": datadoc is not None and datadoc.public,
        "readable_user_ids": lambda: _get_datadoc_editors(datadoc, session=session),
    }

    return _get_dict_by_field(field_to_getter, fields=fields)
def get_server_status(cls, engine_id) -> EngineStatus:
    result: EngineStatus = {
        "status": QueryEngineStatus.UNAVAILABLE.value,
        "messages": [],
    }
    key = cls.generate_server_check_cache_key(engine_id)

    cache_updated_at = None
    try:
        raw_cache = get_raw_key(key)
        if raw_cache is not None:
            result = raw_cache["value"]
            cache_updated_at = raw_cache["updated_at"]
    except LookupError:
        pass  # Unable to get key

    if (
        cache_updated_at is None
        or DATETIME_TO_UTC(datetime.utcnow()) - cache_updated_at
        > cls.SERVER_RESULT_EXPIRY()
    ):
        # Result was expired, getting a new one
        poll_engine_status.delay(cls.NAME(), engine_id)

    return result