Example #1
File: mls.py Project: Robert-sktai/skt
def get_meta_table(meta_table: str,
                   aws_env: AWSENV = AWSENV.STG.value,
                   user="******",
                   edd: bool = False) -> Dict[str, Any]:
    """
    Get a meta_table information
    Args. :
        - meta_table   :   (str) the name of meta_table
        - aws_env      :   (str) AWS ENV in 'stg / prd' (default is 'stg')
        - user         :   (str) the name of user (default is 'reco')
        - edd          :   (bool) True if On-prem env is on EDD (default is False)
    Returns :
        - Dictionary value of meta_table (id / name / description / schema / items / created_at / updated_at)
    """
    assert type(meta_table) == str
    assert type(aws_env) == str

    secret = get_secrets("mls")
    token = secret.get("user_token").get(user)

    url = get_secrets("mls")[f"ab_{'onprem_' if edd else ''}{aws_env}_url"]
    url = f"{url}{MLS_META_API_URL}/{meta_table}"

    response = requests.get(url,
                            headers={
                                "Authorization": f"Basic {{{token}}}"
                            }).json()
    results = response.get("results")

    if not results:
        raise MLSModelError(response.get("error"))
    else:
        return results
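
A minimal usage sketch, assuming the surrounding skt.mls module context (AWSENV, get_secrets, a valid user token in Vault) is available; "my_meta_table" is a placeholder name:

# Hypothetical call: fetch the staging copy of a meta table and inspect its schema.
table_info = get_meta_table("my_meta_table", aws_env=AWSENV.STG.value)
print(table_info["name"], table_info["schema"])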
Example #2
class Hash:
    access_token = get_access_token()
    url = {
        "hash": get_secrets("lake/hash")["hash_url"],
        "unhash": get_secrets("lake/hash")["unhash_url"],
    }

    @classmethod
    def renew_token(cls):
        cls.access_token = get_access_token()

    @classmethod
    def make_headers(cls):
        return {
            "Authorization": f"Bearer {cls.access_token}",
        }

    @classmethod
    def map_s(cls, values, unhash=False):
        task = hash_task
        if unhash:
            task = unhash_task
        url = cls.url[task.url_key]
        data = {"type": "s", task.input_key: values}
        r = requests.post(url, headers=cls.make_headers(), json=data)
        if r.status_code == 401:
            cls.renew_token()
            r = requests.post(url, headers=cls.make_headers(), json=data)
        if r.status_code != 200:
            raise Exception(r.content.decode("utf8"))
        return [x[task.output_key] for x in r.json()["response"]]
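
hash_task and unhash_task are not shown in this excerpt; judging from how map_s reads .url_key, .input_key, and .output_key, they are small descriptor objects. A hedged sketch (the field values below are guesses, not the project's actual definitions):

from collections import namedtuple

# Guessed shape of the task descriptors consumed by Hash.map_s.
# "hash"/"unhash" match the keys of Hash.url; the input/output keys are assumptions.
Task = namedtuple("Task", ["url_key", "input_key", "output_key"])
hash_task = Task(url_key="hash", input_key="values", output_key="hashed")
unhash_task = Task(url_key="unhash", input_key="values", output_key="unhashed")

# Usage: hash a batch of strings, then reverse the mapping.
hashed = Hash.map_s(["value_1", "value_2"])
originals = Hash.map_s(hashed, unhash=True)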
Example #3
def get_github_util():
    from skt.github_utils import GithubUtil

    github_token = get_secrets("github/sktaiflow")["token"]
    proxy = get_secrets("proxy")["proxy"]
    proxies = {
        "http": proxy,
        "https": proxy,
    }
    g = GithubUtil(github_token, proxies)
    return g
Example #4
def publish_relation(source, destination, context=None):
    from datetime import datetime

    msg = {
        "source": source,
        "destination": destination,
        "timestamp": round(datetime.utcnow().timestamp() * 1000),
        "context": context,
    }
    proxies = get_secrets(path="proxies")
    url = get_secrets(path="data_lineage")["url"]

    return requests.post(url, proxies=proxies, json=msg)
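
A usage sketch, assuming Vault holds the "proxies" and "data_lineage" secrets; the table names and context are placeholders:

# Record that a (hypothetical) aggregation job read one table and wrote another.
resp = publish_relation(
    source="db.source_table",
    destination="db.target_table",
    context={"job": "daily_aggregation"},  # optional free-form metadata
)
resp.raise_for_status()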
Example #5
File: mls.py Project: Robert-sktai/skt
def update_meta_table_item(
    meta_table: str,
    item_name: str,
    item_dict: Dict[str, Any],
    aws_env: AWSENV = AWSENV.STG.value,
    user="******",
    edd: bool = False,
) -> None:
    """
    Update a meta_item
    Args. :
        - meta_table   :   (str) the name of meta_table
        - item_name    :   (str) the name of meta_item to be added
        - item_dict    :   (dict) A dictionary type (item-value) value to upload to or update of the item
        - aws_env      :   (str) AWS ENV in 'stg / prd' (default is 'stg')
        - user         :   (str) the name of user (default is 'reco')
        - edd          :   (bool) True if On-prem env is on EDD (default is False)
    """
    assert type(meta_table) == str
    assert type(item_name) == str
    assert type(item_dict) == dict
    assert type(aws_env) == str

    secret = get_secrets("mls")
    token = secret.get("user_token").get(user)

    meta_table_info = get_meta_table(meta_table, aws_env, user, edd)

    values_data = dict()
    for field_name, field_spec in meta_table_info["schema"].items():
        values_data[field_name] = item_dict.get(field_name)

    request_data = dict()
    request_data["name"] = item_name
    request_data["values"] = values_data

    url = get_secrets("mls")[f"ab_{'onprem_' if edd else ''}{aws_env}_url"]
    url = f"{url}{MLS_META_API_URL}/{meta_table}/meta_items/{item_name}"

    response = requests.put(url,
                            json=request_data,
                            headers={
                                "Authorization": f"Basic {{{token}}}"
                            }).json()
    results = response.get("results")

    if not results:
        raise MLSModelError(response.get("error"))
Example #6
File: mls.py Project: Robert-sktai/skt
def update_ml_model_meta(
    user: str,
    model_name: str,
    model_version: str,
    model_meta_dict: Dict[str, Any],
    aws_env: AWSENV = AWSENV.STG.value,
    edd: bool = False,
) -> None:
    """
    Update(or Create) model_meta
    Args. :
        - user            :   (str) the name of a MLModel user
        - model_name      :   (str) the name of MLModel
        - model_version   :   (str) the version of MLModel
        - model_meta_dict :   (dict) the model_meta dictionary to create or update
        - aws_env         :   (str) AWS ENV in 'stg / prd' (default is 'stg')
        - edd             :   (bool) True if On-prem env is on EDD (default is False)
    """
    assert type(model_name) == str
    assert type(model_version) == str
    assert type(model_meta_dict) == dict
    assert type(aws_env) == str

    url = get_secrets("mls")[f"ab_{'onprem_' if edd else ''}{aws_env}_url"]
    url = f"{url}{MLS_MLMODEL_API_URL}/{model_name}/versions/{model_version}/meta"

    request_data = dict()
    request_data["user"] = user
    request_data["model_meta"] = model_meta_dict

    requests.patch(url, json=request_data).json()
Example #7
File: gcp.py Project: Robert-sktai/skt
def _bq_table_to_df(dataset, table_name, col_list, partition=None, where=None, spark_session=None):
    import base64
    from skt.vault_utils import get_secrets

    if not spark_session:
        spark_session = get_spark()
    spark_session.conf.set("spark.sql.execution.arrow.enabled", "false")
    key = get_secrets("gcp/sktaic-datahub/dataflow")["config"]
    df = (
        spark_session.read.format("bigquery")
        .option("project", "sktaic-datahub")
        .option("table", f"sktaic-datahub:{dataset}.{table_name}")
        .option("credentials", base64.b64encode(key.encode()).decode())
    )
    if partition:
        table = get_bigquery_client().get_table(f"{dataset}.{table_name}")
        if "timePartitioning" in table._properties:
            partition_column_name = table._properties["timePartitioning"]["field"]
            partition_filter = f"{partition_column_name} = '{partition}'"
        elif "rangePartitioning" in table._properties:
            partition_column_name = table._properties["rangePartitioning"]["field"]
            partition_filter = f"{partition_column_name} = {partition}"
        else:
            partition_column_name = None
        if partition_column_name:
            df = df.option("filter", partition_filter)
    df = df.load().select(col_list)
    if where:
        df = df.where(where)
    return df
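
A sketch of calling this helper directly, assuming a Spark session built with the BigQuery connector on its classpath (get_spark in Example #27 arranges this); the dataset, table, and column names are placeholders:

spark = get_spark()
df = _bq_table_to_df(
    "my_dataset", "my_table",
    col_list=["col_a", "col_b"],
    partition="2021-01-01",  # matched against the table's partition column, if any
    spark_session=spark,
)
df.show(5)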
Example #8
File: mls.py Project: Robert-sktai/skt
def get_ml_model_meta(user: str,
                      model_name: str,
                      model_version: str,
                      aws_env: AWSENV = AWSENV.STG.value,
                      edd: bool = False) -> Dict[str, Any]:
    """
    Get a list of MLModel meta
    Args. :
        - user           :   (str) the name of a MLModel user
        - model_name     :   (str) the name of MLModel
        - model_version  :   (str) the version of MLModel
        - aws_env        :   (str) AWS ENV in 'stg / prd' (default is 'stg')
        - edd            :   (bool) True if On-prem env is on EDD (default is False)
    Returns :
        - Dictionary value of model_meta
    """
    assert type(user) == str
    assert type(model_name) == str
    assert type(model_version) == str
    assert type(aws_env) == str

    url = get_secrets("mls")[f"ab_{'onprem_' if edd else ''}{aws_env}_url"]
    url = f"{url}{MLS_MLMODEL_API_URL}/{model_name}/versions/{model_version}/meta"

    response = requests.get(url, params={"user": user}).json()
    results = response.get("results")

    if not results:
        raise MLSModelError(
            f"No MLModel for user: {user} / model_name: {model_name} / model_version: {model_version}"
        )
    else:
        return results[0].get("model_meta")
Example #9
def get_access_token():
    secrets = get_secrets("lake/hash")
    url = secrets["auth_url"]
    client_id = secrets["client_id"]
    client_secret = secrets["client_secret"]
    data = {"grant_type": "client_credentials"}
    res = requests.post(url, auth=(client_id, client_secret), data=data)
    return res.json()["access_token"]
Example #10
def get_sqlalchemy_engine():
    from sqlalchemy import create_engine

    hiveserver2 = get_secrets(path="ye/hiveserver2")
    host = hiveserver2["ip"]
    port = hiveserver2["port"]
    user = hiveserver2["user"]
    return create_engine(f"hive://{user}@{host}:{port}/tmp")
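
The engine can be passed straight to pandas; a sketch assuming the SQLAlchemy Hive dialect (PyHive) is installed, with a placeholder table name:

import pandas as pd

engine = get_sqlalchemy_engine()
df = pd.read_sql("SELECT * FROM tmp.some_table LIMIT 10", engine)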
Example #11
File: gcp.py Project: Robert-sktai/skt
def _df_to_bq_table(df, dataset, table_name, partition=None, mode="overwrite"):
    import base64
    from skt.vault_utils import get_secrets

    key = get_secrets("gcp/sktaic-datahub/dataflow")["config"]
    table = f"{dataset}.{table_name}${partition}" if partition else f"{dataset}.{table_name}"
    df.write.format("bigquery").option("project", "sktaic-datahub").option(
        "credentials", base64.b64encode(key.encode()).decode()
    ).option("table", table).option("temporaryGcsBucket", "temp-seoul-7d").save(mode=mode)
Example #12
def get_hive_conn():
    from pyhive import hive

    hiveserver2 = get_secrets(path="ye/hiveserver2")
    host = hiveserver2["ip"]
    port = hiveserver2["port"]
    user = hiveserver2["user"]
    conn = hive.connect(host, port=port, username=user)
    return conn
Example #13
def set_gcp_credentials():
    import os
    import tempfile
    from skt.vault_utils import get_secrets

    key = get_secrets("gcp/sktaic-datahub/dataflow")["config"]
    key_file_name = tempfile.mkstemp()[1]
    with open(key_file_name, "wb") as key_file:
        key_file.write(key.encode())
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_file.name
Example #14
def slack_send(
    text="This is default text",
    username="******",
    channel="#leavemealone",
    icon_emoji=":large_blue_circle:",
    blocks=None,
    dataframe=False,
):
    import requests
    from skt.vault_utils import get_secrets

    if dataframe:
        from tabulate import tabulate

        text = "```" + tabulate(text, tablefmt="simple",
                                headers="keys") + "```"

    token = get_secrets("slack")["bot_token"]["airflow"]
    proxy = get_secrets("proxy")["proxy"]
    proxies = {
        "http": proxy,
        "https": proxy,
    }
    headers = {
        "Content-Type": "application/json;charset=utf-8",
        "Authorization": f"Bearer {token}",
    }
    json_body = {
        "username": username,
        "channel": channel,
        "text": text,
        "blocks": blocks,
        "icon_emoji": icon_emoji,
    }
    r = requests.post(
        "https://www.slack.com/api/chat.postMessage",
        proxies=proxies,
        headers=headers,
        json=json_body,
    )
    r.raise_for_status()
    if not r.json()["ok"]:
        raise Exception(r.json())
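
Two usage sketches, assuming the Slack bot token and proxy secrets exist in Vault; the channel and contents are placeholders:

# Plain-text message.
slack_send(text="batch finished", channel="#data-alerts")

# A DataFrame rendered as a monospaced table (requires tabulate).
import pandas as pd

report = pd.DataFrame({"table": ["a", "b"], "rows": [100, 200]})
slack_send(text=report, channel="#data-alerts", dataframe=True)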
Example #15
def get_table_top_n_columns(n, start_date=None, end_date=None):
    lineage_secrets = get_secrets(DATA_LINEAGE_SECRETS_NAME)

    params = {"top_n": n, "start_date": start_date, "end_date": end_date}

    response = requests.get(lineage_secrets["url_prd"] +
                            "/relationships/queries/top_n/columns",
                            params=params).json()

    return response
Example #16
File: mls.py Project: Robert-sktai/skt
def meta_table_to_pandas(meta_table: str,
                         aws_env: AWSENV = AWSENV.STG.value,
                         user="******",
                         edd: bool = False) -> Any:
    """
    Get a meta_table as pandas dataframe
    Args. :
        - meta_table   :   (str) the name of meta_table
        - aws_env      :   (str) AWS ENV in 'stg / prd' (default is 'stg')
        - user         :   (str) the name of user (default is 'reco')
        - edd          :   (bool) True if On-prem env is on EDD (default is False)
    Returns :
        - A Pandas dataframe type of the item_meta
    """
    assert type(meta_table) == str
    assert type(aws_env) == str

    secret = get_secrets("mls")
    token = secret.get("user_token").get(user)

    url = get_secrets("mls")[f"ab_{'onprem_' if edd else ''}{aws_env}_url"]
    url = f"{url}{MLS_META_API_URL}/{meta_table}"

    response = requests.get(url,
                            headers={
                                "Authorization": f"Basic {{{token}}}"
                            }).json()

    if not response.get("results"):
        raise MLSModelError(
            f"No meta_table '{meta_table}' exists on AWS {aws_env}")

    items = response["results"]["items"]
    key = pd.DataFrame.from_records(items)["name"]
    values = pd.DataFrame.from_records(
        pd.DataFrame.from_records(items)["values"])

    df = pd.concat([key, values], axis=1)

    return df
Example #17
File: mls.py Project: Robert-sktai/skt
def pandas_to_meta_table(
    method: str,
    meta_table: str,
    df: pd.DataFrame,
    key: str,
    values: list,
    aws_env: AWSENV = AWSENV.STG.value,
    user="******",
    edd: bool = False,
) -> None:
    """
    Create or Update items of a meta_table from Pandas Dataframe
    Args. :
        - method       :   (str) requests method 'create' or 'update'
        - meta_table   :   (str) MLS meta table name
        - df           :   (pd.DataFrame) input table
        - key          :   (str) key column in dataframe
        - values       :   (list) Dataframe columns for input
        - aws_env      :   (str) AWS ENV in 'stg / prd' (default is 'stg')
        - user         :   (str) the name of user (default is 'reco')
        - edd          :   (bool) True if On-prem env is on EDD (default is False)
    """
    assert type(aws_env) == str
    assert method in ["create", "update"]
    assert type(meta_table) == str
    assert type(df) == pd.core.frame.DataFrame
    assert type(key) == str
    assert type(values) == list

    url = get_secrets("mls")[f"ab_{'onprem_' if edd else ''}{aws_env}_url"]
    url = f"{url}{MLS_META_API_URL}/{meta_table}/meta_items"

    def to_json(x):
        insert_dict = {}
        insert_dict["name"] = x[key]
        insert_dict["values"] = {}

        for value in values:
            insert_dict["values"][value] = x[value]

        return insert_dict

    json_series = df.apply(lambda x: to_json(x), axis=1)

    for meta in json_series:
        if method == "create":
            create_meta_table_item(meta_table, meta.get("name"),
                                   meta.get("values"), aws_env, user, edd)
        else:
            update_meta_table_item(meta_table, meta.get("name"),
                                   meta.get("values"), aws_env, user, edd)
Example #18
File: mls.py Project: Robert-sktai/skt
def set_model_name(comm_db, params, user="******", edd: bool = False):
    secret = get_secrets("mls")
    token = secret.get("user_token").get(user)
    if comm_db[-3:] == "dev":  # stg
        url = secret["ab_onprem_stg_url"] if edd else secret["ab_stg_url"]
        url = f"{url}{MLS_COMPONENTS_API_URL}"
    else:  # prd
        url = secret["ab_onprem_prd_url"] if edd else secret["ab_prd_url"]
        url = f"{url}{MLS_COMPONENTS_API_URL}"
    requests.post(
        url,
        json=params,
        headers={"Authorization": f"Basic {{{token}}}"},
    )
Example #19
File: gcp.py Project: Robert-sktai/skt
def get_bigquery_client():
    import os
    import tempfile
    from google.cloud import bigquery
    from skt.vault_utils import get_secrets

    if "GOOGLE_APPLICATION_CREDENTIALS" in os.environ and os.path.isfile(os.environ["GOOGLE_APPLICATION_CREDENTIALS"]):
        return bigquery.Client()
    key = get_secrets("gcp/sktaic-datahub/dataflow")["config"]
    with tempfile.NamedTemporaryFile() as f:
        f.write(key.encode())
        f.seek(0)
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = f.name
        client = bigquery.Client()
    return client
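
Once built, the client behaves like any google-cloud-bigquery client; a sketch with a trivial query:

client = get_bigquery_client()
rows = client.query("SELECT 1 AS x").result()
for row in rows:
    print(row.x)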
Example #20
File: mls.py Project: Robert-sktai/skt
def get_mls_meta_table_client(env="stg", user="******"):
    from sktmls.meta_tables.meta_table import MetaTableClient
    from sktmls import MLSENV

    if env == "prd":
        env = MLSENV.PRD
    else:
        env = MLSENV.STG

    secrets = get_secrets(path="mls")
    if user != "reco":
        user_id = secrets.get(f"{user}_id")
        user_pass = secrets.get(f"{user}_pass")
    else:
        user_id = secrets.get("reco_id")
        user_pass = secrets.get("reco_pass")

    if not user_id or not user_pass:
        raise Exception("No ID or Password for the user {user}")

    return MetaTableClient(env=env, username=user_id, password=user_pass)
Example #21
File: mls.py Project: Robert-sktai/skt
def get_all_recent_model_path(comm_db, user="******", edd: bool = False):
    secret = get_secrets("mls")
    token = secret.get("user_token").get(user)
    if comm_db[-3:] == "dev":  # stg
        url = secret["ab_onprem_stg_url"] if edd else secret["ab_stg_url"]
        url = f"{url}{MLS_COMPONENTS_API_URL}"
    else:  # prd
        url = secret["ab_onprem_prd_url"] if edd else secret["ab_prd_url"]
        url = f"{url}{MLS_COMPONENTS_API_URL}"

    response = requests.get(url,
                            headers={
                                "Authorization": f"Basic {{{token}}}"
                            }).json().get("results")

    results = {
        component.get("name"): component.get("info")
        for component in response if component.get("is_latest")
    }

    return results
Example #22
def search_queries_by_table_id(table_id, **kwargs):
    limit = kwargs.get("limit", 100)
    fuzziness = kwargs.get("fuzziness", "AUTO")
    operator = kwargs.get("operator", "and")
    offset = kwargs.get("offset", None)
    fields = kwargs.get("fields", None)
    must = kwargs.get("must", None)
    sort = kwargs.get("sort", "desc")
    start_date = kwargs.get("start_date", None)
    end_date = kwargs.get("end_date", None)

    secrets = get_secrets(DATA_CATALOG_SECRETS_NAME)

    es_sort = [{"start_time": sort}]

    params = {
        "inputs": table_id,
        "outputs": table_id,
        "limit": limit,
        "fuzziness": fuzziness,
        "offset": offset,
        "operator": operator,
        "fields": fields,
        "must": must,
        "sort": json.dumps(es_sort),
    }

    if start_date or end_date:
        range_filter = {"range": {"start_time": {}}}

        if start_date:
            range_filter["range"]["start_time"]["gte"] = start_date

        if end_date:
            range_filter["range"]["start_time"]["lt"] = end_date

        params["range_filter"] = json.dumps(range_filter)

    return requests.get(secrets["url_prd"] + "/v1/search/processes",
                        params=params).json()
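
A usage sketch; the table id and date range are placeholders, and the kwargs mirror the parameters unpacked above:

# Find recent queries that read from or wrote to a (hypothetical) table id.
hits = search_queries_by_table_id(
    "some_db.some_table",
    limit=10,
    start_date="2021-01-01",
    end_date="2021-02-01",
)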
Example #23
def get_user_queries(user_name, start_date=None, end_date=None, **kwargs):
    secrets = get_secrets(DATA_CATALOG_SECRETS_NAME)

    default_order = "asc" if (start_date or end_date) else "desc"
    order = kwargs.get("sort", default_order)
    limit = kwargs.get("limit", 100)

    es_sort = [{"start_time": order}]
    es_limit = min(100, limit)

    params = {
        "user_name": user_name,
        "limit": es_limit,
        "sort": json.dumps(es_sort)
    }

    gte = start_date or (datetime.datetime.now() -
                         datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    lt = end_date or datetime.datetime.now().strftime("%Y-%m-%d")

    range_filter = {"start_time": {"gte": gte, "lt": lt}}

    params["range_filter"] = json.dumps(range_filter)

    total_queries = []

    response = requests.get(secrets["url_prd"] + "/v1/search/processes",
                            params=params).json()

    total_queries.extend(response["user_name"]["hits"])
    total = response["user_name"]["total"]["value"]

    while total > len(total_queries) and len(total_queries) < limit:
        params["offset"] = json.dumps(total_queries[-1]["sort"])

        response = requests.get(secrets["url_prd"] + "/v1/search/processes",
                                params=params).json()
        total_queries.extend(response["user_name"]["hits"])

    return total_queries
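
A usage sketch; the user name and dates are placeholders, and the hit structure ("_id", "_source", "sort") is inferred from how the results are consumed in Example #29 below:

queries = get_user_queries("some_user", start_date="2021-01-01", end_date="2021-01-02")
for hit in queries[:5]:
    print(hit["_id"], hit["_source"].get("inputs"))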
Example #24
def search_table_by_name(name, **kwargs):
    secrets = get_secrets(DATA_CATALOG_SECRETS_NAME)
    kwargs["name"] = name

    return requests.get(f"{secrets['url_prd']}/v1/search/tables",
                        params=kwargs).json()
Example #25
def get_queries(source, limit=100):
    secrets = get_secrets(DATA_CATALOG_SECRETS_NAME)
    return requests.get(
        f"{secrets['url_prd']}/sources/{source}/processes?limit={limit}").json(
        )
Example #26
def get_source(source):
    secrets = get_secrets(DATA_CATALOG_SECRETS_NAME)
    return requests.get(f"{secrets['url_prd']}/sources/{source}").json()
Example #27
def get_spark(scale=0, queue=None):
    import os
    import uuid
    import tempfile
    from pyspark.sql import SparkSession
    from skt.vault_utils import get_secrets

    tmp_uuid = str(uuid.uuid4())
    app_name = f"skt-{os.environ.get('USER', 'default')}-{tmp_uuid}"
    if not queue:
        if "JUPYTERHUB_USER" in os.environ:
            queue = "dmig_eda"
        else:
            queue = "airflow_job"
    os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"

    key = get_secrets("gcp/sktaic-datahub/dataflow")["config"]
    key_file_name = tempfile.mkstemp()[1]
    with open(key_file_name, "wb") as key_file:
        key_file.write(key.encode())
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_file.name

    builder = (
        SparkSession.builder.config("spark.app.name", app_name)
        .config("spark.rpc.message.maxSize", "1024")
        .config("spark.yarn.queue", queue)
        .config("spark.ui.enabled", "false")
        .config("spark.port.maxRetries", "128")
        .config("spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT", "1")
        .config("spark.yarn.appMasterEnv.ARROW_PRE_0_15_IPC_FORMAT", "1")
        .config(
            "spark.jars",
            "gs://external_libs/spark/jars/spark-bigquery-with-dependencies_2.11-0.16.1.jar",
        )
    )
    if scale in [1, 2, 3, 4]:
        # Fixed-size cluster: resources grow linearly with the scale factor.
        builder = (
            builder.config("spark.driver.memory", f"{scale * 8}g")
            .config("spark.executor.memory", f"{scale * 3}g")
            .config("spark.executor.instances", f"{scale * 8}")
            .config("spark.driver.maxResultSize", f"{scale * 4}g")
        )
    else:
        # Default: modest driver/executor memory with dynamic allocation.
        builder = (
            builder.config("spark.driver.memory", "6g")
            .config("spark.executor.memory", "8g")
            .config("spark.shuffle.service.enabled", "true")
            .config("spark.dynamicAllocation.enabled", "true")
            .config("spark.dynamicAllocation.maxExecutors", "200")
            .config("spark.driver.maxResultSize", "6g")
        )
    spark = builder.enableHiveSupport().getOrCreate()
    spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    return spark
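
A usage sketch, assuming a YARN environment where the queues above exist; per the formulas in the function, scale=2 requests a 16g driver and 16 executors of 6g each:

spark = get_spark(scale=2)
# Hive tables are available because the session was built with enableHiveSupport().
spark.sql("SHOW DATABASES").show()
spark.stop()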
Example #28
def get_columns(source, table_id):
    secrets = get_secrets(DATA_CATALOG_SECRETS_NAME)
    return requests.get(
        f"{secrets['url_prd']}/sources/{source}/tables/{table_id}/columns"
    ).json()
Example #29
def get_user_data_access(user_name,
                         start_date=None,
                         end_date=None,
                         timeseries=False,
                         **kwargs):
    secrets = get_secrets(DATA_CATALOG_SECRETS_NAME)
    lineage_secrets = get_secrets(DATA_LINEAGE_SECRETS_NAME)

    default_order = "asc" if (start_date or end_date) else "desc"
    order = kwargs.get("sort", default_order)
    limit = kwargs.get("limit", 1000)

    es_sort = [{"start_time": order}]
    es_limit = min(1000, limit)

    params = {
        "user_name": user_name,
        "sort": json.dumps(es_sort),
        "limit": es_limit,
        "fields": json.dumps(["inputs", "outputs"]),
    }

    gte = start_date or (datetime.datetime.now() -
                         datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    lt = end_date or datetime.datetime.now().strftime("%Y-%m-%d")

    range_filter = {"start_time": {"gte": gte, "lt": lt}}

    params["range_filter"] = json.dumps(range_filter)

    total_queries = []

    response = requests.get(secrets["url_prd"] + "/v1/search/processes",
                            params=params).json()

    total_queries.extend(response["user_name"]["hits"])
    total = response["user_name"]["total"]["value"]

    while total > len(total_queries) and len(total_queries) < limit:
        params["offset"] = json.dumps(total_queries[-1]["sort"])

        response = requests.get(secrets["url_prd"] + "/v1/search/processes",
                                params=params).json()
        total_queries.extend(response["user_name"]["hits"])

    result = []

    table_dict = {}
    column_dict = {}

    for each_query in total_queries:
        query_id = each_query["_id"]

        if timeseries:
            inputs = each_query["_source"].get("inputs", [])
            outputs = each_query["_source"].get("outputs", [])

            response = requests.get(
                lineage_secrets["url_prd"] +
                f"/relationships/queries/query/{query_id}/columns",
                params=params).json()
            column_list = list(map(lambda each: each["target"], response))

            result.append({
                "inputs": inputs,
                "outputs": outputs,
                "columns": column_list,
                "start_time": each_query["sort"][0],
                "query_id": query_id,
            })
        else:
            inputs = each_query["_source"].get("inputs", []) or []
            outputs = each_query["_source"].get("outputs", []) or []
            for each in inputs:
                if each not in table_dict:
                    table_dict[each] = 1

            for each in outputs:
                if each not in table_dict:
                    table_dict[each] = 1

            response = requests.get(
                lineage_secrets["url_prd"] +
                f"/relationships/queries/query/{query_id}/columns",
                params=params).json()
            column_list = list(map(lambda each: each["target"], response))

            for each_column in column_list:
                if each_column not in column_dict:
                    column_dict[each_column] = 1

    if timeseries:
        return result
    else:
        return {
            "tables": list(table_dict.keys()),
            "columns": list(column_dict.keys())
        }
Example #30
def get_resource(resource_name, resource_id):
    secrets = get_secrets(DATA_CATALOG_SECRETS_NAME)
    return requests.get(
        f"{secrets['url_prd']}/v1/resources/{resource_name}/{resource_id}"
    ).json()