Exemplo n.º 1
0
 def execute(self, context):
     self.log.info(f'Checking DataQuality for {self._table}')
     snowflake_hook = SnowflakeHook(self._conn_id)
     self.log.info(f'Running query: {self._query}')
     records = snowflake_hook.get_records(self._query)
     if len(records) < 1 or len(records[0] < 1):
         raise ValueError(
             f"Data quality check failed. {self._table} returned no results"
         )
     logging.info(f'Passed Data Quality Check for {self._table}')
Exemplo n.º 2
0
 def execute(self, context):
     self.log.info(f'Checking HasRows for {self._table}')
     snowflake_hook = SnowflakeHook(self._conn_id)
     self.log.info(f'Running rcord count: {self._table}')
     records = snowflake_hook.get_records(
         f"SELECT COUNT(1) FROM {self._table}")
     if len(records) < 1 or len(records[0] < 1):
         raise ValueError(
             f"Data quality check failed. {self._table} returned no results"
         )
     logging.info(f'Passed Has Rows Check for {self._table}')
Exemplo n.º 3
0
def snowflake_db_monitor(**op_kwarg):
    snowflake_hook = SnowflakeHook(snowflake_conn_id="test_snowflake_conn")

    with snowflake_query_tracker(database=DATABASE, schema=SCHEMA) as st:
        snowflake_tables = snowflake_hook.get_pandas_df(GET_COLUMNS)
        snowflake_shapes = DataFrame()
        snowflake_tables = snowflake_tables[snowflake_tables["schema_name"] ==
                                            "{}".format(SCHEMA)]

    snowflake_shapes["column_count"] = snowflake_tables.groupby(
        "table_name").nunique("column_name")["column_name"]
    snowflake_shapes["table_name"] = snowflake_tables["table_name"].unique()

    table_row_info = {}
    snowflake_rows = snowflake_hook.get_records(GET_DB_ROW_INFO)
    for tablename, row_count in snowflake_rows:
        table_row_info[tablename] = row_count

    row_counts = list(table_row_info.values())
    log_metric("Max table row count", max(row_counts))
    log_metric("Min table row count", min(row_counts))
    log_metric("Mean table row count", round(mean(row_counts), 2))
    log_metric("Median table row count", median(row_counts))

    snowflake_shapes["row_count"] = (snowflake_shapes["table_name"].map(
        table_row_info).fillna(0).astype(int))

    for _, row in snowflake_shapes.iterrows():
        log_metric(
            "{} shape".format(row["table_name"]),
            (row["column_count"], row["row_count"]),
        )

    log_metric("Max table column count",
               snowflake_shapes["column_count"].max())
    log_metric("Min table column count",
               snowflake_shapes["column_count"].max())
    log_metric("Mean table column count",
               round(snowflake_shapes["column_count"].mean(), 2))
    log_metric("Median table column count",
               snowflake_shapes["column_count"].median())
Exemplo n.º 4
0
def process_customers(**kwargs):
    snowflake_hook = SnowflakeHook(snowflake_conn_id=SNOWFLAKE_CONNECTION_ID)
    customers = snowflake_hook.get_records(select_query)

    # Process records
    process_records(customers)
Exemplo n.º 5
0
def process_customers_with_monitoring(**kwargs):
    snowflake_hook = SnowflakeHook(snowflake_conn_id=SNOWFLAKE_CONNECTION_ID)
    with snowflake_query_tracker(log_tables=False, database=database):
        customers = snowflake_hook.get_records(select_query)
    # Process records - Same code
    process_records(customers)