# Example #1
def query_execution_to_es(query_execution, data_cell=None, session=None):
    """Serialize a query execution into an Elasticsearch document.

    data_cell is accepted directly so that bulk indexing of query
    executions does not have to re-fetch each cell.
    """
    engine_id = query_execution.engine_id
    engine = get_query_engine_by_id(engine_id, session=session)

    # Flatten the per-statement table-name lists into one flat list.
    raw_tables, _ = process_query(
        query_execution.query, language=(engine and engine.language)
    )
    flat_table_names = list(chain.from_iterable(raw_tables))

    # Duration is only defined once the execution has completed.
    if query_execution.completed_at is not None:
        duration = DATETIME_TO_UTC(query_execution.completed_at) - DATETIME_TO_UTC(
            query_execution.created_at
        )
    else:
        duration = None

    return {
        "id": query_execution.id,
        "query_type": "query_execution",
        "title": data_cell.meta.get("title", "Untitled") if data_cell else None,
        "environment_id": [env.id for env in engine.environments],
        "author_uid": query_execution.uid,
        "engine_id": engine_id,
        "statement_type": get_table_statement_type(query_execution.query),
        "created_at": DATETIME_TO_UTC(query_execution.created_at),
        "duration": duration,
        "full_table_name": flat_table_names,
        "query_text": query_execution.query,
    }
    def get_table_and_columns(
            self, schema_name,
            table_name) -> Tuple[DataTable, List[DataColumn]]:
        """Fetch metadata for one table and its columns from INFORMATION_SCHEMA.

        Returns (None, []) when the table does not exist.
        """
        # NOTE(review): schema_name/table_name are interpolated straight into
        # the SQL text — a SQL-injection risk if they can come from untrusted
        # input; prefer bound parameters. TODO confirm callers sanitize these.
        # next(..., None) yields the first result row, or None when no row
        # matched.
        raw_table_info = next(
            iter(
                self._engine.execute(f"""
            SELECT
                TABLE_TYPE,
                CREATE_TIME,
                UPDATE_TIME,
                data_length + index_length
            FROM
                INFORMATION_SCHEMA.TABLES
            WHERE
                TABLE_SCHEMA="{schema_name}" AND TABLE_NAME="{table_name}"
        """)),
            None,
        )

        if not raw_table_info:
            return None, []

        # Row layout: (TABLE_TYPE, CREATE_TIME, UPDATE_TIME, size_in_bytes).
        table = DataTable(
            name=table_name,
            type=raw_table_info[0],
            owner=None,
            # Timestamps may be NULL in INFORMATION_SCHEMA; convert only when
            # they are set.
            table_created_at=DATETIME_TO_UTC(raw_table_info[1])
            if raw_table_info[1] is not None else None,
            table_updated_by=None,
            table_updated_at=DATETIME_TO_UTC(raw_table_info[2])
            if raw_table_info[2] is not None else None,
            data_size_bytes=raw_table_info[3],
            location=None,
            partitions=None,
            # NOTE(review): ujson normally exposes dumps(), not pdumps() —
            # verify this attribute exists on the project's ujson wrapper.
            raw_description=ujson.pdumps(list(raw_table_info)),
        )

        # Column details come from the SQLAlchemy inspector, not raw SQL.
        raw_columns = self._inspect.get_columns(table_name=table_name,
                                                schema=schema_name)
        columns = list(
            map(
                lambda col: DataColumn(
                    name=col["name"],
                    type=str(col["type"]),
                    comment=
                    f"Default:{col['default']} Nullable:{col['nullable']}",
                ),
                raw_columns,
            ))

        return table, columns
# Example #3
def datadocs_to_es(datadoc, session=None):
    """Serialize a DataDoc into an Elasticsearch document."""
    text_chunks = []
    for cell in datadoc.cells:
        if cell.cell_type == DataCellType.text:
            text_chunks.append(richtext_to_plaintext(cell.context))
        elif cell.cell_type == DataCellType.query:
            heading = cell.meta.get("title", "")
            if heading:
                text_chunks.append(f"{heading}\n{cell.context}")
            else:
                text_chunks.append(cell.context)
        else:
            text_chunks.append("[... additional unparsable content ...]")

    # There is no need to compute the list of editors
    # for public datadoc since everyone is able to see it
    if datadoc.public:
        editors = []
    else:
        editors = [
            editor.uid
            for editor in get_data_doc_editors_by_doc_id(
                data_doc_id=datadoc.id, session=session
            )
        ]

    return {
        "id": datadoc.id,
        "environment_id": datadoc.environment_id,
        "owner_uid": datadoc.owner_uid,
        "created_at": DATETIME_TO_UTC(datadoc.created_at),
        "cells": escape("\n".join(text_chunks)),
        "title": datadoc.title,
        "public": datadoc.public,
        "readable_user_ids": editors,
    }
# Example #4
def query_cell_to_es(query_cell, session=None):
    """Serialize a query cell into an Elasticsearch document."""
    meta = query_cell.meta
    query = query_cell.context
    datadoc = query_cell.doc

    engine_id = meta.get("engine")
    engine = get_query_engine_by_id(engine_id, session=session)

    # Flatten the per-statement table-name lists into one flat list.
    tables, _ = process_query(query, language=(engine and engine.language))
    flat_table_names = list(chain.from_iterable(tables))

    return {
        "id": query_cell.id,
        "query_type": "query_cell",
        "title": meta.get("title", "Untitled"),
        "data_doc_id": datadoc and datadoc.id,
        "environment_id": datadoc and datadoc.environment_id,
        "author_uid": datadoc and datadoc.owner_uid,
        "engine_id": engine_id,
        "statement_type": get_table_statement_type(query),
        "created_at": DATETIME_TO_UTC(query_cell.created_at),
        "full_table_name": flat_table_names,
        "query_text": query,
    }
# Example #5
def table_to_es(table, session=None):
    """Serialize a metastore table into an Elasticsearch document."""
    schema = table.data_schema
    schema_name = schema.name
    table_name = table.name
    full_name = "{}.{}".format(schema_name, table_name)

    if table.information:
        description = richtext_to_plaintext(table.information.description,
                                            escape=True)
    else:
        description = ""

    # Weight doubles as both the completion suggestion weight and the
    # document importance score.
    weight = get_table_weight(table.id, session=session)

    return {
        "id": table.id,
        "metastore_id": schema.metastore_id,
        "schema": schema_name,
        "name": table_name,
        "full_name": full_name,
        "full_name_ngram": full_name,
        "completion_name": {
            "input": [full_name, table_name],
            "weight": weight,
            "contexts": {"metastore_id": schema.metastore_id},
        },
        "description": description,
        "created_at": DATETIME_TO_UTC(table.created_at),
        "columns": [c.name for c in table.columns],
        "golden": table.golden,
        "importance_score": weight,
        "tags": [tag.tag_name for tag in table.tags],
    }
 def generate_presigned_url(self,
                            blob_name,
                            method="GET",
                            expires_in=86400,
                            params=None):
     """Return a signed URL for *blob_name*, or None when the blob is absent.

     Args:
         blob_name: Name of the blob within this bucket.
         method: HTTP method the URL authorizes (default "GET").
         expires_in: Lifetime of the URL in seconds (default one day).
         params: Extra keyword arguments forwarded to generate_signed_url.
     """
     # Fix: the original used a mutable default argument (params={}), which
     # is shared across calls; None + local dict is behaviorally equivalent
     # for callers but removes the shared-state pitfall.
     if params is None:
         params = {}
     blob = self._bucket.blob(blob_name)
     if not blob.exists():
         return None
     return blob.generate_signed_url(
         # DATETIME_TO_UTC(utcnow()) is presumably an epoch timestamp, so
         # adding expires_in (seconds) yields the expiry moment — TODO confirm.
         expiration=expires_in + DATETIME_TO_UTC(datetime.utcnow()),
         method=method,
         **params,
     )
# Example #7
def table_to_es(table, fields=None, session=None):
    """Serialize a table into an ES document, optionally limited to *fields*.

    Expensive lookups are wrapped in callables so they only run when the
    corresponding field is actually requested.
    """
    schema = table.data_schema
    schema_name = schema.name
    table_name = table.name
    full_name = "{}.{}".format(schema_name, table_name)

    weight = None

    def compute_weight():
        # Lazily fetch and memoize the table weight (shared by two fields).
        nonlocal weight
        if weight is None:
            weight = get_table_weight(table.id, session=session)
        return weight

    def get_table_description():
        if not table.information:
            return ""
        return richtext_to_plaintext(table.information.description,
                                     escape=True)

    def get_completion_name():
        return {
            "input": [full_name, table_name],
            "weight": compute_weight(),
            "contexts": {"metastore_id": schema.metastore_id},
        }

    field_to_getter = {
        "id": table.id,
        "metastore_id": schema.metastore_id,
        "schema": schema_name,
        "name": table_name,
        "full_name": full_name,
        "full_name_ngram": full_name,
        "completion_name": get_completion_name,
        "description": get_table_description,
        "created_at": lambda: DATETIME_TO_UTC(table.created_at),
        "columns": [c.name for c in table.columns],
        "golden": table.golden,
        "importance_score": compute_weight,
        "tags": [tag.tag_name for tag in table.tags],
    }
    return _get_dict_by_field(field_to_getter, fields=fields)
def serialize_value(value):
    """Recursively convert *value* into a JSON-friendly representation."""
    if not value:
        # Falsy values (None, 0, "", empty containers) pass through as-is.
        return value
    # TODO: since jsonsify also converts
    # Decide on which conversion is required
    # NOTE: datetime must be checked before date — datetime subclasses date.
    if isinstance(value, datetime):
        return DATETIME_TO_UTC(value)
    if isinstance(value, date):
        return DATE_TO_UTC(value)
    if isinstance(value, Enum):
        return value.value
    if isinstance(value, dict):
        return {key: serialize_value(val) for key, val in value.items()}
    if isinstance(value, (list, tuple)):
        # Preserve the container type (list stays list, tuple stays tuple).
        return value.__class__(map(serialize_value, value))
    if hasattr(value, "to_dict"):
        return value.to_dict()
    return value
# Example #9
def query_execution_to_es(query_execution,
                          data_cell=None,
                          fields=None,
                          session=None):
    """Serialize a query execution into an ES document, limited to *fields*.

    data_cell is accepted directly so that bulk indexing of query
    executions does not have to re-fetch each cell.
    """
    engine_id = query_execution.engine_id
    engine = admin_logic.get_query_engine_by_id(engine_id, session=session)
    datadoc = data_cell.doc if data_cell else None

    def get_duration():
        # Only completed executions have a measurable duration.
        if query_execution.completed_at is None:
            return None
        return DATETIME_TO_UTC(query_execution.completed_at) - DATETIME_TO_UTC(
            query_execution.created_at
        )

    # Expensive lookups are wrapped in callables so they only run when the
    # corresponding field is actually requested.
    field_to_getter = {
        "id": query_execution.id,
        "query_type": "query_execution",
        "title": data_cell.meta.get("title", "Untitled") if data_cell else None,
        "environment_id": [env.id for env in engine.environments],
        "author_uid": query_execution.uid,
        "engine_id": engine_id,
        "statement_type": lambda: get_table_statement_type(query_execution.query),
        "created_at": lambda: DATETIME_TO_UTC(query_execution.created_at),
        "duration": get_duration,
        "full_table_name": lambda: _get_table_names_from_query(
            query_execution.query, language=(engine and engine.language)
        ),
        "query_text": query_execution.query,
        "public": datadoc is None or datadoc.public,
        "readable_user_ids": lambda: _get_datadoc_editors(datadoc, session=session),
    }

    return _get_dict_by_field(field_to_getter, fields=fields)
# Example #10
def run_sample_query(
    self,
    table_id,
    engine_id,
    uid,
    limit,
    partition,
    where,
    order_by,
    order_by_asc,
):
    """Run a sampling query against *table_id* and cache the result.

    Presumably a bound Celery task (it calls self.update_state) — TODO
    confirm. Progress is surfaced through task state for pollers; the
    result is not returned but stored in mysql_cache under
    "table_samples_<table_id>_<uid>" for one day.
    """
    # Initialize progress to 0 for polling purposes
    self.update_state(state="PROGRESS", meta=0)

    with DBSession() as session:
        # Build the sampling SQL from the requested filters.
        query = make_samples_query(
            table_id,
            limit=limit,
            partition=partition,
            where=where,
            order_by=order_by,
            order_by_asc=order_by_asc,
            session=session,
        )

        # Execute asynchronously, relaying execution progress until done.
        async_execute_query = ExecuteQuery(True)
        async_execute_query(query, engine_id, uid=uid, session=session)
        while not async_execute_query.poll():
            self.update_state(state="PROGRESS",
                              meta=async_execute_query.progress)

        results = {
            "created_at": DATETIME_TO_UTC(datetime.now()),
            "value": async_execute_query.result,
            "engine_id": engine_id,
            "created_by": uid,
        }

        # Cache so repeated sample requests can skip re-running the query.
        mysql_cache.set_key(
            f"table_samples_{table_id}_{uid}",
            results,
            expires_after=seconds_in_a_day,
            session=session,
        )
# Example #11
def datadocs_to_es(datadoc, fields=None, session=None):
    """Serialize a DataDoc into an ES document, limited to *fields*.

    Expensive lookups are wrapped in callables so they only run when the
    corresponding field is actually requested.
    """
    field_to_getter = {
        "id": datadoc.id,
        "environment_id": datadoc.environment_id,
        "owner_uid": datadoc.owner_uid,
        "created_at": lambda: DATETIME_TO_UTC(datadoc.created_at),
        "cells": lambda: get_joined_cells(datadoc),
        "title": datadoc.title,
        "public": datadoc.public,
        "readable_user_ids": lambda: _get_datadoc_editors(datadoc, session=session),
    }
    return _get_dict_by_field(field_to_getter, fields=fields)
# Example #12
def query_cell_to_es(query_cell, fields=None, session=None):
    """Serialize a query cell into an ES document, limited to *fields*.

    Expensive lookups are wrapped in callables so they only run when the
    corresponding field is actually requested.
    """
    meta = query_cell.meta
    query = query_cell.context
    datadoc = query_cell.doc

    engine_id = meta.get("engine")
    engine = admin_logic.get_query_engine_by_id(engine_id, session=session)

    field_to_getter = {
        "id": query_cell.id,
        "query_type": "query_cell",
        "title": meta.get("title", "Untitled"),
        "data_doc_id": datadoc and datadoc.id,
        "environment_id": datadoc and datadoc.environment_id,
        "author_uid": datadoc and datadoc.owner_uid,
        "engine_id": engine_id,
        "statement_type": lambda: get_table_statement_type(query),
        "created_at": lambda: DATETIME_TO_UTC(query_cell.created_at),
        "full_table_name": lambda: _get_table_names_from_query(
            query, language=(engine and engine.language)
        ),
        "query_text": query,
        "public": datadoc is not None and datadoc.public,
        "readable_user_ids": lambda: _get_datadoc_editors(datadoc, session=session),
    }

    return _get_dict_by_field(field_to_getter, fields=fields)
# Example #13
    def get_server_status(cls, engine_id) -> EngineStatus:
        """Return the cached status of the engine's server check.

        Falls back to UNAVAILABLE with no messages when nothing is cached.
        When the cache entry is missing or stale, a background refresh is
        queued while the (possibly stale) value is returned immediately.
        """
        result: EngineStatus = {
            "status": QueryEngineStatus.UNAVAILABLE.value,
            "messages": [],
        }
        key = cls.generate_server_check_cache_key(engine_id)

        cache_updated_at = None
        try:
            raw_cache = get_raw_key(key)
            if raw_cache is not None:
                result = raw_cache["value"]
                cache_updated_at = raw_cache["updated_at"]
        except LookupError:
            pass  # Unable to get key

        # Refresh when there is no cache entry or it is older than the
        # expiry window. cache_updated_at is presumably the same unit as
        # DATETIME_TO_UTC's return (epoch timestamp) — TODO confirm.
        if (cache_updated_at is None
                or DATETIME_TO_UTC(datetime.utcnow()) - cache_updated_at >
                cls.SERVER_RESULT_EXPIRY()):

            # Result was expired, getting a new one
            poll_engine_status.delay(cls.NAME(), engine_id)
        return result
# Example #14
 def get_duration():
     """Return the execution's wall-clock duration, or None if unfinished."""
     if query_execution.completed_at is None:
         return None
     completed = DATETIME_TO_UTC(query_execution.completed_at)
     started = DATETIME_TO_UTC(query_execution.created_at)
     return completed - started