def execute(self, context):
    self.log.info(
        f"LoadDimensionOperator starts execution for table '{self.dim_table_name}'")

    # Connect to Redshift
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.log.info("Connection to Redshift has been made.")

    # Create the dimension table if it does not exist yet
    self.log.info(f"Creating dim table '{self.dim_table_name}' if it does not exist")
    redshift.run(self.dim_table_sql_create)
    self.log.info(f"Dimension table '{self.dim_table_name}' has been created")

    self.log.info(f"Inserting entries into dim table '{self.dim_table_name}'")

    # Row count before the insert, used for the STATS output in the log
    entries_before = redshift.get_first(
        f"SELECT COUNT(1) FROM {self.dim_table_name};")

    # Insert data into the dim table according to the configured operation_mode
    if self.operation_mode == "truncate_load":
        self.log.info(
            f"Dim table '{self.dim_table_name}' is loaded in "
            f"'{self.operation_mode}' mode.")
        redshift.run(self.dim_table_sql_truncate)
        redshift.run(self.dim_table_sql_insert)
        self.log.info(
            f"Data for dim table '{self.dim_table_name}' has been inserted.")
    elif self.operation_mode == "append_only":
        self.log.info(
            f"Dim table '{self.dim_table_name}' is loaded in "
            f"'{self.operation_mode}' mode.")
        redshift.run(self.dim_table_sql_insert)
        self.log.info(
            f"Appending to dimension table '{self.dim_table_name}'; this mode "
            f"inserts ONLY new entries. Use 'truncate_load' for a full reload.")
    else:
        raise ValueError(
            "Please configure operation_mode as 'truncate_load' or 'append_only'.")

    # Row count after the insert, for the STATS output in the log
    entries_after = redshift.get_first(
        f"SELECT COUNT(1) FROM {self.dim_table_name};")
    entries_inserted = entries_after[0] - entries_before[0]
    self.log.info(
        f"STATS: Before insert: {entries_before[0]}; After insert: {entries_after[0]}; "
        f"Diff: {entries_inserted}")
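# A hedged usage sketch for the operator above: the constructor arguments
# mirror the attributes used in execute(), but the actual __init__ signature,
# task id and SQL constants are assumptions, not taken from the source.
load_user_dim = LoadDimensionOperator(
    task_id="load_user_dim_table",
    redshift_conn_id="redshift",
    dim_table_name="users",
    dim_table_sql_create=CREATE_USERS_TABLE_SQL,    # assumed SQL constant
    dim_table_sql_truncate="TRUNCATE TABLE users;",
    dim_table_sql_insert=INSERT_USERS_TABLE_SQL,    # assumed SQL constant
    operation_mode="truncate_load",
)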
def create_common_countries_table():
    '''
    Creates a common country_or_area table from commodities_staging
    and temperature_staging
    '''
    table = "country_or_area"
    postgres_hook = PostgresHook(postgres_conn_id='postgres', schema='world')
    engine = postgres_hook.get_sqlalchemy_engine()

    # Restrict temperature countries to the year range covered by commodities
    min_year_commodities = postgres_hook.get_first(
        "select min(year) from commodities_staging;")[0]
    max_year_commodities = postgres_hook.get_first(
        "select max(year) from commodities_staging;")[0]

    get_countries_from_commodities_staging = \
        "select distinct(country_or_area) from commodities_staging;"
    get_countries_from_temperature_staging = (
        f"select distinct(country_or_area) from temperature_staging "
        f"where year >= {min_year_commodities} and year <= {max_year_commodities};")

    commodities_countries_records = postgres_hook.get_records(
        get_countries_from_commodities_staging)
    temperature_countries_records = postgres_hook.get_records(
        get_countries_from_temperature_staging)

    # get_records returns one-element tuples; flatten them into sets
    commodities_countries_set = set(
        reduce(operator.concat, commodities_countries_records))
    temperature_countries_set = set(
        reduce(operator.concat, temperature_countries_records))

    # Countries present in both staging tables
    common_country_set = commodities_countries_set.intersection(
        temperature_countries_set)
    print(f"common_country_set: {common_country_set}")

    country_or_area_df = pd.DataFrame(list(common_country_set),
                                      columns=['country_or_area'])
    country_or_area_df.to_sql(table, engine, index=False, if_exists="append")
def execute(self, context):
    redshift_hook = PostgresHook("redshift")
    for stmt in self.check_stmts:
        result = int(redshift_hook.get_first(sql=stmt['sql'])[0])

        # check if equal
        if stmt['op'] == 'eq':
            if result != stmt['val']:
                raise AssertionError(
                    f"Data Quality Check failed: {result} {stmt['op']} {stmt['val']}")
        # check if not equal
        elif stmt['op'] == 'ne':
            if result == stmt['val']:
                raise AssertionError(
                    f"Data Quality Check failed: {result} {stmt['op']} {stmt['val']}")
        # check if greater than
        elif stmt['op'] == 'gt':
            if result <= stmt['val']:
                raise AssertionError(
                    f"Data Quality Check failed: {result} {stmt['op']} {stmt['val']}")

        self.log.info(
            f"Data Quality Check Passed: {result} {stmt['op']} {stmt['val']}")
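# A minimal sketch of the check_stmts structure the operator above iterates
# over; the table and column names are hypothetical.
check_stmts = [
    {'sql': 'SELECT COUNT(*) FROM songs', 'op': 'gt', 'val': 0},
    {'sql': 'SELECT COUNT(*) FROM users WHERE userid IS NULL', 'op': 'eq', 'val': 0},
]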
def execute(self, context): """ Perform data quality checks on resulting fact and dimension tables. Parameters: ---------- redshift_conn_id: string airflow connection to redshift cluster table: string table located in redshift cluster test_stmt: string test SQL command to check validity of target table result: string result of test_stmt to check validity """ pg_hook = PostgresHook(self.redshift_conn_id) records = pg_hook.get_records(f"SELECT COUNT(*) FROM {self.table}") if len(records) < 1 or len(records[0]) < 1: raise ValueError(f"Fail: No results for {self.table}") num_records = records[0][0] if num_records < 1: raise ValueError(f"Fail: 0 rows in {self.table}") if self.test_stmt: output = pg_hook.get_first(self.test_stmt) if self.result != output: raise ValueError(f"Fail: {output} != {self.result}") self.log.info(f"Success: {self.table} has {records[0][0]} records")
def execute(self, context): """ Perform data quality checks on resulting fact and dimension tables. Parameters: ---------- redshift_conn_id: string airflow connection to redshift cluster table: string table located in redshift cluster test_stmt: string test SQL command to check validity of target table result: string result of test_stmt to check validity """ aws_hook = PostgresHook(self.redshift_conn_id) for table in self.tables: records = aws_hook.get_records(f"SELECT COUNT(*) FROM {table}") if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1: raise ValueError( f"Data quality check failed. {table} returned no results") self.log.error( f"Data quality check failed. {table} returned no results") self.log.info( f"Data quality on table {table} check passed with {records[0][0]} records" ) if self.test_stmt: output = aws_hook.get_first(self.test_stmt) if self.result != output: raise ValueError(f"Fail: {output} != {self.result}")
def execute(self, context):
    self.log.info(
        'Detect number of entries per table, optionally compare to expected numbers')
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    for idx, table in enumerate(self.tables):
        query = f"SELECT COUNT(*) FROM {table};"
        count = redshift.get_first(query)[0]
        self.log.info(f"result of {query} is {count}")

        if len(self.expected_counts) > 0:
            expected_count = self.expected_counts[idx]
            if count != expected_count:
                self.log.error(
                    f"Validation error: table {table} contains {count} records "
                    f"while {expected_count} were expected.")
            else:
                self.log.info(
                    f"Validation success: table {table} contains {count} records "
                    f"as expected.")
        else:
            if count > 0:
                self.log.info(
                    f"Validation success: table {table} contains {count} records.")
            else:
                self.log.error(
                    f"Validation error: table {table} contains no records while "
                    f"some records were expected.")
    return True
def execute(self, context):
    self.log.info('Running DataQualityOperator checks')
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    for table in self.tables:
        for stmt in self.stmts_checks:
            sql = stmt['sql'].format(self.sql_schema, table)
            self.log.info(sql)
            result = int(redshift.get_first(sql)[0])
            strError = 'Check failed: {} {} {}'.format(
                result, stmt['op'], stmt['val'])

            # Check greater than
            if stmt['op'] == 'gt' and result <= stmt['val']:
                raise AssertionError(strError)
            # Check equal
            elif stmt['op'] == 'eq' and result != stmt['val']:
                raise AssertionError(strError)
            # Check if not equal
            elif stmt['op'] == 'ne' and result == stmt['val']:
                raise AssertionError(strError)

            self.log.info('Passed check: {} {} {}'.format(
                result, stmt['op'], stmt['val']))
def execute(self, context):
    '''
    Perform data quality checks by running a list of queries.

    Parameters:
    ----------
    conn_id (string) : Airflow connection to redshift cluster
    queries (list)   : List of check queries, specified as
        {'sql': 'SELECT COUNT(*) FROM time WHERE hour < 0', 'expect': 0}
    '''
    redshift = PostgresHook(postgres_conn_id=self.conn_id)
    for query in self.queries:
        sql = query.get('sql')
        if sql is None:
            self.log.error('Data quality check: no SQL expression specified.')
            break
        expect = query.get('expect')
        if expect is None:
            expect = 0
        count = redshift.get_first(sql)[0]  # https://stackoverflow.com/a/59420411
        if count != expect:
            self.log.error(
                f'Check failed: {sql} returns {count}, expected: {expect}')
        else:
            self.log.info(f'Check passed: {sql} returns {count}.')
def execute(self, context) -> None:
    postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)
    credentials = s3_hook.get_credentials()
    credentials_block = build_credentials_block(credentials)

    copy_options = '\n\t\t\t'.join(self.copy_options)
    copy_statement = self._build_copy_query(credentials_block, copy_options)

    self.log.info("Creating the staging table...")
    postgres_hook.run(self.create_table_sql)
    self.log.info("Creating the staging table complete...")

    self.log.info("Executing COPY command...")
    postgres_hook.run(copy_statement)
    self.log.info("COPY command complete...")

    self.log.info("Logging the number of rows and files on S3 affected...")
    number_of_rows = postgres_hook.get_first(
        f"SELECT count(*) FROM {self.schema}.{self.table}")[0]
    number_of_keys_s3 = s3_hook.list_keys(
        bucket_name=self.s3_bucket, prefix=self.s3_key)
    self.log.info(f"{self.schema}.{self.table} has {number_of_rows} rows")
    self.log.info(f"{self.s3_bucket}/{self.s3_key} has {len(number_of_keys_s3)} files")
    self.log.info("Logging the number of rows and files on S3 affected complete...")
def execute(self, context):
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    for quality_check in self.queries_and_results:
        self.log.info("Running data validation query")
        result = redshift.get_first(quality_check['query'])
        self.log.info(f"result: {result}")
        if result[0] != quality_check['result']:
            raise ValueError(
                f"Data validation failed: {quality_check['query']} returned "
                f"{result[0]}, expected {quality_check['result']}")
def execute(self, context):
    redshift_hook = PostgresHook(self.redshift_conn_id)
    for (sql_query, result) in self.sql_queries:
        row = redshift_hook.get_first(sql_query)
        if row is not None and row[0] == result:
            self.log.info(
                'The result of {} matched the expected result {}\n'
                '==================================='.format(sql_query, result))
        else:
            raise ValueError(
                'Test failed: {} did not return {}\n'
                '==================================='.format(sql_query, result))
def test_writer(self):
    rows = [{"foo": "%s" % i} for i in range(0, 100)]
    with self.dataset.get_writer(chunksize=10) as writer:
        for row in rows:
            writer.write_row_dict(row)

    db = PostgresHook("postgres_test")
    count = db.get_first("SELECT COUNT(*) FROM test.test")[0]
    self.assertEqual(count, 100)
def check_table_exists(check_table_exists_sql):
    print("checking sql={}".format(check_table_exists_sql))
    hook = PostgresHook()
    records = hook.get_first(check_table_exists_sql)
    if records is None:
        return "create_table"
    else:
        return "skip_table_creation"
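# A hedged sketch (assumed wiring, not from the source) of using the branching
# function above with BranchPythonOperator; downstream task ids must match the
# strings it returns ("create_table" / "skip_table_creation").
from airflow.operators.python import BranchPythonOperator

branch_on_table = BranchPythonOperator(
    task_id="branch_on_table",
    python_callable=check_table_exists,
    op_args=["SELECT 1 FROM information_schema.tables "
             "WHERE table_name = 'events';"],  # hypothetical probe query
)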
def monitor_redshift_db(**op_kwarg):
    """Redshift database monitor collects the following metrics:
    - Number of tables in database
    - Shape of each table in the database
    - Min, max, mean, median number of rows across all tables
    - Min, max, mean, median number of columns across all tables
    - Total number of rows and columns
    - Largest tables by row and column count
    - Disk capacity, free space and used space on disk (in GB)
    - Disk percent usage
    """
    hook = PostgresHook(REDSHIFT_CONN_ID)
    num_redshift_tables = hook.get_first(COUNT_TABLES, parameters=[TARGET_SCHEMA])[0]
    log_metric("table count", num_redshift_tables)

    table_row_counts = hook.get_records(COUNT_TABLE_ROWS, parameters=[TARGET_SCHEMA])
    num_rows_per_table = {}
    for tablename, row_count in table_row_counts:
        num_rows_per_table[tablename] = int(round(row_count))

    row_counts = list(num_rows_per_table.values())
    log_metric("Max table row count", max(row_counts))
    log_metric("Min table row count", min(row_counts))
    log_metric("Mean table row count", round(mean(row_counts), 2))
    log_metric("Median table row count", median(row_counts))

    tables = hook.get_pandas_df(DESCRIBE_TABLES, parameters=[TARGET_SCHEMA])
    table_shapes = DataFrame()
    # Count distinct columns per table; the groupby index is the table name
    table_shapes["columns"] = tables.groupby("tablename")["column"].nunique()
    table_shapes["tablename"] = table_shapes.index
    table_shapes["rows"] = (
        table_shapes["tablename"].map(num_rows_per_table).fillna(0).astype(int)
    )

    for _, row in table_shapes.iterrows():
        log_metric("{} shape".format(row["tablename"]), (row["columns"], row["rows"]))

    log_metric("Max table column count", table_shapes["columns"].max())
    log_metric("Min table column count", table_shapes["columns"].min())
    log_metric("Mean table column count", round(table_shapes["columns"].mean(), 2))
    log_metric("Median table column count", table_shapes["columns"].median())
    log_metric("Total columns", table_shapes["columns"].sum())
    log_metric("Total rows", table_shapes["rows"].sum())

    max_row_table = table_shapes[table_shapes["rows"] == table_shapes["rows"].max()]
    max_col_table = table_shapes[
        table_shapes["columns"] == table_shapes["columns"].max()
    ]
    log_metric("Largest table (by row count)", max_row_table["tablename"].iloc[0])
    log_metric("Largest table (by col count)", max_col_table["tablename"].iloc[0])

    disk_stats = hook.get_records(DISK_USAGE).pop()
    disk_capacity, disk_used, disk_free = disk_stats
    log_metric("Disk capacity (GB)", disk_capacity)
    log_metric("Disk used (GB)", disk_used)
    log_metric("Disk free (GB)", disk_free)
    log_metric("Percent Disk usage", round((disk_used / disk_capacity) * 100, 2))
def execute(self, context):
    self.log.info(
        f"Running Data Quality {'tests' if len(self.sql_test) > 1 else 'test'}")
    hook = PostgresHook(self.conn_id)
    for (test_sql, expectation) in zip(self.sql_test, self.expected_result):
        result = hook.get_first(test_sql)[0]
        if str(result) != expectation:
            self.log.info(f"Running test SQL: \n{test_sql}")
            raise ValueError(f"This test did not pass \n{test_sql}")
    self.log.info("All tests passed!")
class PostgresXcomOperator(PostgresOperator):
    """ Regular PostgresOperator does not return a value, so cannot do Xcom """

    def execute(self, context):
        self.log.info("Executing: %s", self.sql)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                                 schema=self.database)
        return self.hook.get_first(self.sql, parameters=self.parameters)
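# Hedged usage sketch for PostgresXcomOperator: because execute() returns the
# get_first() row, a downstream task can pull it via XCom. The task id,
# connection id and query are illustrative assumptions.
fetch_max_id = PostgresXcomOperator(
    task_id="fetch_max_id",
    postgres_conn_id="postgres_default",
    sql="SELECT MAX(id) FROM events;",
)
# In a downstream callable:
#   max_id = context["ti"].xcom_pull(task_ids="fetch_max_id")[0]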
def last_timestamp_loaded(ops_type):
    pg_hook = PostgresHook(postgres_conn_id='ctd')
    ret = pg_hook.get_first(
        'select max(ops_timestamp) from ods.tz_data where ops_type = %s',
        parameters=(ops_type,))
    if not ret[0]:
        return utc_timestamp(INITIAL_LOAD_UTC)
    else:
        return ret[0]
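# A hedged example of consuming the helper above in an incremental load; the
# ops_type value and the downstream query are assumptions.
since = last_timestamp_loaded('tz_update')
new_rows = PostgresHook(postgres_conn_id='ctd').get_records(
    'select * from ods.tz_data where ops_type = %s and ops_timestamp > %s',
    parameters=('tz_update', since))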
def runSql(self, x):
    myObj = self.tables
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.log.info('connection to Redshift successful...')
    formatted_sql = DataQualityOperator.statement.format(
        myObj["tables"][x], myObj["fields"][x])
    num = redshift.get_first(formatted_sql)[0]
    return num
def execute(self, context):
    self.log.info('Checking data quality')
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    result = redshift.get_first(self.sql_test)[0]
    if result != self.expected_result:
        self.log.info(f'Generated result: {result}')
        raise ValueError("Data validation failed")
def execute(self, context):
    self.log.info('Running DataQualityOperator')
    redshift = PostgresHook(postgres_conn_id=self.conn_id)
    for f in self.fmt:
        query = self.query.format(f)
        res = redshift.get_first(query)[0]
        if res == self.failure_value:
            raise ValueError(
                f"failed query {query}, failure {self.failure_value}")
def init():
    hook = PostgresHook()
    query = '''
        SELECT 1
        FROM information_schema.tables
        WHERE table_schema = %s
          AND table_name = %s
    '''
    is_exist = hook.get_first(sql=query, parameters=(schema, table))
    return ['do_nothing' if is_exist else 'create_table']
def execute(self, context):
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    for sql, expected_result in self.checks:
        result = redshift.get_first(sql)[0]
        assert result == expected_result, f"""
            Data quality check failed!
            Query: {sql}
            Expected result: {expected_result}
            Actual result: {result}
        """
    self.log.info("All checks passed!")
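# A hedged example of the `checks` iterable of (sql, expected_result) pairs
# consumed by the operator above; table and column names are assumptions.
checks = [
    ("SELECT COUNT(*) FROM artists WHERE artistid IS NULL", 0),
    ("SELECT COUNT(*) FROM songplays WHERE playid IS NULL", 0),
]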
def execute_data_quality_checks(query, tables):
    logger = logging.getLogger(__name__)
    pg_hook = PostgresHook('redshift_lake')
    for table in tables:
        statement = query + table
        result = pg_hook.get_first(statement)
        if result is None:
            raise Exception(
                f'Load of data into table {table} failed, please review')
    return True
def _extract_last_updated_value(self) -> str:
    hook = PostgresHook(postgres_conn_id=self.destination_conn_id)
    last_updated_field = hook.get_first(sql=self.last_updated_sql)[0]
    if not last_updated_field:
        self.log.info(
            f'Last event value not found, '
            f'using default value - {self.default_last_updated_value}')
        return self.default_last_updated_value
    self.log.info(f'Last event value was {last_updated_field}')
    return last_updated_field
class PostgreSQLCountRowsOperator(BaseOperator):
    @apply_defaults
    def __init__(self, table_name, postgres_conn_id, *args, **kwargs):
        super(PostgreSQLCountRowsOperator, self).__init__(*args, **kwargs)
        self.table_name = table_name
        self.postgres_conn_id = postgres_conn_id

    def execute(self, context):
        # Create the hook at execution time, not at DAG-parse time
        hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        result = hook.get_first(
            sql=f'SELECT COUNT(*) FROM {self.table_name};')
        return result
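# Hedged usage sketch: the row count returned from execute() is pushed to XCom
# automatically by Airflow. The task id and table name are assumptions.
count_rows = PostgreSQLCountRowsOperator(
    task_id="count_rows",
    table_name="public.events",
    postgres_conn_id="postgres_default",
)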
def execute(self, context):
    redshift_hook = PostgresHook(self.redshift_conn_id)
    self.log.info('DataQualityOperator started.')
    for (sql_stmt, answer) in self.sql_stmts:
        row = redshift_hook.get_first(sql_stmt)
        if row is not None:
            if row[0] == answer:
                self.log.info("Test {} Passed.".format(sql_stmt))
            else:
                raise ValueError("Test {} Failed.".format(sql_stmt))
    self.log.info('DataQualityOperator finished.')
def execute(self, context):
    # Establish Connection Hooks
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    # Execute Tests & Compare to Expected Results
    for test in range(len(self.test_queries)):
        test_result = redshift.get_first(self.test_queries[test])
        if test_result[0] != self.expected_results[test]:
            raise ValueError('Test no. {} failed;\n {}'.format(
                test, self.test_queries[test]))
        else:
            self.log.info("Test no. {} passed".format(test))
def execute(self, context):
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    test_result = redshift.get_first(self.sql)
    # Test result is a tuple; extract only the first element, which should be a number
    if self.test_operator(test_result[0], self.result):
        self.log.info("Data quality check passed!")
    else:
        raise ValueError(
            f"Test did not pass. Test result: {test_result}, expected: {self.result}.")
def f_check_table_exists(table_name):
    connect = PostgresHook(postgres_conn_id=database)
    query = """
        select count(1)
        from information_schema.tables
        where table_schema not like %s
          and table_name = %s
    """
    res = connect.get_first(query, parameters=('', table_name))
    if res[0] == 0:
        return 'create_table'
    else:
        return 'table_exists'
def query_latest_id(task_type, etl_hook: PostgresHook):
    """
    Query the latest id for the given task type.
    :param task_type: task type to look up
    :param etl_hook: PostgresHook used for the lookup
    :return: the latest id, or None if no snapshot exists
    """
    la = etl_hook.get_first(
        "select task_type,latest_id from cn_spider_snapshot where task_type=%s",
        parameters=(task_type,))
    if la and la[1]:
        return la[1]