def execute(self, context):
    """Run a Snowflake ``COPY INTO`` loading ``self.s3_keys`` from ``self.stage``.

    Uses the connection identified by ``self.snowflake_conn_id``. When
    ``self.columns_array`` is set, the copy is restricted to those columns.
    """
    snowflake_hook = SnowflakeHook(snowflake_conn_id=self.snowflake_conn_id)

    # Snowflake won't accept a Python list repr; it wants a parenthesized,
    # tuple-style list of quoted file names, e.g. ('a.csv', 'b.csv').
    # Build it explicitly instead of bracket-replacing str(self.s3_keys),
    # which would corrupt any key containing '[' or ']'.
    files = "({})".format(", ".join("'{}'".format(key) for key in self.s3_keys))

    # we can extend this based on stage
    base_sql = """
    FROM @{stage}/
    files={files}
    file_format={file_format}
    """.format(stage=self.stage, files=files, file_format=self.file_format)

    if self.columns_array:
        copy_query = """
        COPY INTO {schema}.{table}({columns}) {base_sql}
        """.format(schema=self.schema, table=self.table,
                   columns=",".join(self.columns_array), base_sql=base_sql)
    else:
        copy_query = """
        COPY INTO {schema}.{table} {base_sql}
        """.format(schema=self.schema, table=self.table, base_sql=base_sql)

    self.log.info('Executing COPY command...')
    # NOTE(review): removed print(snowflake_hook.get_uri()) — the hook URI can
    # embed credentials and must not be written to stdout/task logs.
    snowflake_hook.run(copy_query, self.autocommit)
    self.log.info("COPY command completed")
def _log_snowflake_resources(
    query_text,
    snowflake_conn_id,
    session_id=None,
    warehouse=None,
    database=None,
    role=None,
    schema=None,
    account=None,
):
    """Report Snowflake resource usage for ``query_text`` via dbnd.

    Builds a hook from the given connection id (plus optional overrides) and
    forwards the connection URI and resolved user to
    ``log_snowflake_resource_usage``.
    """
    # Pass the conn id as a keyword for consistency with _log_snowflake_table;
    # DbApiHook-based hooks generally only accept it by keyword.
    hook = SnowflakeHook(
        snowflake_conn_id=snowflake_conn_id,
        warehouse=warehouse,
        account=account,
        database=database,
        role=role,
        schema=schema,
    )
    conn = hook.get_uri()
    # NOTE(review): _get_conn_params is a private SnowflakeHook API but is the
    # only visible way to recover the resolved user here — confirm it still
    # exists in the pinned provider version.
    conn_params = hook._get_conn_params()
    log_snowflake_resource_usage(
        query_text,
        database=hook.database,
        user=conn_params["user"],
        connection_string=conn,
        session_id=session_id,
    )
def _log_snowflake_table(
    table,
    snowflake_conn_id,
    warehouse=None,
    database=None,
    role=None,
    schema=None,
    account=None,
):
    """Log ``table`` to dbnd using a connection URI built from the hook.

    Optional keyword overrides (warehouse/database/role/schema/account) are
    forwarded to the hook; the result of ``log_snowflake_table`` is returned.
    """
    snowflake_hook = SnowflakeHook(
        snowflake_conn_id=snowflake_conn_id,
        database=database,
        schema=schema,
        warehouse=warehouse,
        role=role,
        account=account,
    )
    # Inline the URI fetch — it is only needed for this single call.
    return log_snowflake_table(table, snowflake_hook.get_uri(), database, schema)
def execute(self, context):
    """Run the select query on Snowflake and log both the produced table and
    the session's resource usage to dbnd.
    """
    # NOTE(review): snowflake_conn_id, select_query, table, database and
    # schema are bare names here — presumably module-level globals of an
    # example DAG outside this view; confirm they are not meant to be
    # operator attributes (self.*).
    snowflake_hook = SnowflakeHook(snowflake_conn_id=snowflake_conn_id)
    session_id, query_ids = snowflake_run(snowflake_hook, select_query)
    conn_uri = snowflake_hook.get_uri()

    # Record the resulting table (no preview; failures are non-fatal).
    log_snowflake_table(
        table_name=table,
        connection_string=conn_uri,
        database=database,
        schema=schema,
        key=f"example1.{table}",
        with_preview=False,
        raise_on_error=False,
    )

    # Record resource usage for the queries just executed (failures fatal).
    log_snowflake_resource_usage(
        database=database,
        key=f"example1.{session_id}{query_ids[0]}",
        connection_string=conn_uri,
        query_ids=query_ids,
        session_id=int(session_id),
        raise_on_error=True,
    )
def snowflake_table_monitor(**context):
    """Collect record-count and schema metrics for ``context['target_table']``,
    diff them against the previous state stored in Airflow Variables, and push
    the results to dbnd metrics and XCom.

    ``target_table`` must be a fully qualified ``database.schema.table`` path.
    """
    import ast  # local import: used to decode the list stored as a Variable

    full_table_path = context["target_table"]
    database, schema, table = full_table_path.split(".")
    snowflake_hook = SnowflakeHook(snowflake_conn_id="test_snowflake_conn")

    with snowflake_query_tracker(database=database, schema=schema):
        record_count = get_record_count(snowflake_hook, database, table)
        log_metric("records", record_count)

        col_metadata = get_column_info(snowflake_hook, database, schema, table)
        log_metric("column metadata", col_metadata)

        if ENABLE_SNOWFLAKE_TABLE_SAMPLE:
            data = get_random_sample(snowflake_hook, database, schema, table,
                                     SNOWFLAKE_TABLE_SAMPLE_ROW_PROB)

        log_snowflake_table(
            table,
            connection_string=snowflake_hook.get_uri(),
            database=database,
            schema=schema,
            with_preview=True,
            with_schema=True,
        )

    # get difference between last known state of table and the current state
    table_delta = 0
    column_diff = []
    try:
        # BUG FIX: Variable.get returns a string — the original subtracted it
        # from an int (TypeError) and set() over it iterated characters; both
        # faults were hidden by a bare `except: pass`.
        previous_record_count = int(
            Variable.get("{}_record_cnt".format(full_table_path)))
        table_delta = previous_record_count - record_count
        previous_col_names = ast.literal_eval(
            Variable.get("{}_column_names".format(full_table_path)))
        column_diff = list(
            set(previous_col_names) - set(col_metadata['column_names']))
    except KeyError:
        # First run: no previous state recorded yet.
        pass
    except (ValueError, SyntaxError):
        # Stored variable is malformed; treat as no previous state.
        pass

    col_changed = bool(column_diff)
    log_metric("table_delta", table_delta)
    log_metric("columns_changed", col_changed)

    Variable.set("{}_record_cnt".format(full_table_path), record_count)
    Variable.set("{}_column_names".format(full_table_path),
                 col_metadata['column_names'])

    # log metrics of the sampled data (if sampled)
    if ENABLE_SNOWFLAKE_TABLE_SAMPLE:
        log_metric("sample_size(%)", SNOWFLAKE_TABLE_SAMPLE_ROW_PROB)
        for column in data.columns:
            log_metric("{} null record count".format(column),
                       int(data[column].isna().sum()))
            if issubdtype(data[column].dtype, number):
                log_metric("{} mean".format(column),
                           round(data[column].mean(), 2))
                log_metric("{} median".format(column), data[column].median())
                log_metric("{} min".format(column), data[column].min())
                log_metric("{} max".format(column), data[column].max())
                log_metric("{} std".format(column),
                           round(data[column].std(), 2))

    context['ti'].xcom_push(key="{}_table_delta".format(full_table_path),
                            value=table_delta)
    context['ti'].xcom_push(key="{}_record_count".format(full_table_path),
                            value=record_count)