def log_all_snowflake_tables(cd, tables, reset=True, **kwargs):
    # type: (ConnectionData, Union[bool, List[str]], bool, ...) -> Optional[List[str]]
    """Log a set of Snowflake tables through ``log_snowflake_table``.

    :param cd: ConnectionData object
    :param tables: if True - will log automatically discovered tables;
        if list of strings - will log only mentioned tables (each entry may be
        ``table``, ``schema.table`` or ``database.schema.table``; missing parts
        fall back to ``cd``'s defaults)
    :param with_preview:
    :param with_schema:
    :param raise_on_error:
    :param reset: should reset the ConnectionData.tables once done; only
        honoured when ``tables`` is True (auto-discovery)
    :return: the list of table names that was processed
    """
    if tables is True:
        tables = [table_op.name for table_op in cd.tables_ops]
    else:
        # Explicit table list supplied by the caller: never reset cd's state.
        reset = False

    for qualified_name in tables or []:
        name_parts = qualified_name.split(".")
        # Consume from the right: table, then schema, then database.
        table_name = name_parts.pop()
        schema = name_parts.pop() if name_parts else cd.schema
        database = name_parts.pop() if name_parts else cd.database
        log_snowflake_table(
            table_name=table_name,
            schema=schema,
            database=database,
            connection_string=cd.connection,
            **kwargs,
        )

    if reset:
        cd.reset_tables()
    return tables
def execute(self, context):
    """Log the operator's configured Snowflake table.

    Obtains a connection through the operator's hook and forwards the
    operator's settings to ``log_snowflake_table``.
    """
    snowflake_hook = self.get_hook()
    return log_snowflake_table(
        table_name=self.table,
        connection_string=snowflake_hook.get_conn(),
        database=snowflake_hook.database,
        schema=snowflake_hook.schema,
        key=self.key,
        with_preview=self.with_preview,
        with_schema=self.with_schema,
        raise_on_error=self.raise_on_error,
    )
def execute(self, context):
    """Run the example query, then log the result table and the Snowflake
    resource usage of the executed query ids."""
    sf_hook = SnowflakeHook(snowflake_conn_id=snowflake_conn_id)
    session_id, query_ids = snowflake_run(sf_hook, select_query)
    conn_str = sf_hook.get_uri()

    # Best-effort table logging: failures here must not fail the task.
    log_snowflake_table(
        table_name=table,
        connection_string=conn_str,
        database=database,
        schema=schema,
        key=f"example1.{table}",
        with_preview=False,
        raise_on_error=False,
    )

    # Resource usage is required: raise if it cannot be collected.
    log_snowflake_resource_usage(
        database=database,
        key=f"example1.{session_id}{query_ids[0]}",
        connection_string=conn_str,
        query_ids=query_ids,
        session_id=int(session_id),
        raise_on_error=True,
    )
def snowflake_table_monitor(**context):
    """Monitor a Snowflake table: log its record count, column metadata,
    optional sampled column statistics, and the delta against the state
    recorded on the previous run (persisted in Airflow Variables).

    Expects ``context["target_table"]`` as ``database.schema.table`` and an
    Airflow task instance under ``context['ti']`` for xcom pushes.
    """
    import json  # local import: used only for Variable (de)serialization

    full_table_path = context["target_table"]
    database, schema, table = full_table_path.split(".")
    snowflake_hook = SnowflakeHook(snowflake_conn_id="test_snowflake_conn")
    with snowflake_query_tracker(database=database, schema=schema):
        record_count = get_record_count(snowflake_hook, database, table)
        log_metric("records", record_count)
        col_metadata = get_column_info(snowflake_hook, database, schema, table)
        log_metric("column metadata", col_metadata)
        if ENABLE_SNOWFLAKE_TABLE_SAMPLE:
            data = get_random_sample(snowflake_hook, database, schema, table,
                                     SNOWFLAKE_TABLE_SAMPLE_ROW_PROB)
        log_snowflake_table(table,
                            connection_string=snowflake_hook.get_uri(),
                            database=database,
                            schema=schema,
                            with_preview=True,
                            with_schema=True)

        # Get difference between last known state of table and the current
        # state. Variable.get returns a *string*: the original code subtracted
        # it from an int (TypeError every run after the first, silently
        # swallowed) and set-diffed its characters. Convert explicitly and
        # round-trip the column list through JSON instead.
        table_delta = 0
        column_diff = []
        try:
            previous_record_count = int(
                Variable.get("{}_record_cnt".format(full_table_path)))
            table_delta = previous_record_count - record_count
            previous_col_names = json.loads(
                Variable.get("{}_column_names".format(full_table_path)))
            column_diff = list(
                set(previous_col_names) - set(col_metadata['column_names']))
        except (KeyError, ValueError, TypeError):
            # First run (Variables not set yet) or unparsable stored state:
            # keep the zero/empty defaults.
            pass
        col_changed = bool(column_diff)
        log_metric("table_delta", table_delta)
        log_metric("columns_changed", col_changed)
        Variable.set("{}_record_cnt".format(full_table_path), record_count)
        Variable.set("{}_column_names".format(full_table_path),
                     json.dumps(list(col_metadata['column_names'])))

        # Log metrics of the sampled data (if sampled).
        if ENABLE_SNOWFLAKE_TABLE_SAMPLE:
            log_metric("sample_size(%)", SNOWFLAKE_TABLE_SAMPLE_ROW_PROB)
            for column in data.columns:
                log_metric("{} null record count".format(column),
                           int(data[column].isna().sum()))
                if issubdtype(data[column].dtype, number):
                    log_metric("{} mean".format(column),
                               round(data[column].mean(), 2))
                    log_metric("{} median".format(column),
                               data[column].median())
                    log_metric("{} min".format(column), data[column].min())
                    log_metric("{} max".format(column), data[column].max())
                    log_metric("{} std".format(column),
                               round(data[column].std(), 2))

    context['ti'].xcom_push(key="{}_table_delta".format(full_table_path),
                            value=table_delta)
    context['ti'].xcom_push(key="{}_record_count".format(full_table_path),
                            value=record_count)