def execute(self, context):
    """Run data-quality checks against Redshift.

    For every table in ``self.tables``:
      1. Fail if the row-count query returns no result set.
      2. Fail if the table contains zero rows.
      3. For each column in ``self.columns[table]``, fail if that column
         contains any NULL value.

    Args:
        context: Airflow task context (unused).

    Raises:
        ValueError: when any of the checks above fails.
    """
    self.log.info('Connecting to redshift!')
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    for table in self.tables:
        records = redshift.get_records(f"SELECT COUNT(*) FROM {table}")
        if len(records) < 1 or len(records[0]) < 1:
            raise ValueError(
                f"Data quality check failed. {table} returned no results")
        num_records = records[0][0]
        if num_records < 1:
            raise ValueError(
                f"Data quality check failed. {table} contained 0 rows")
        # NOTE(review): assumes self.columns maps table name -> iterable of
        # column names; a missing key would raise KeyError here — confirm.
        for col in self.columns[table]:
            null_counts = redshift.get_records(
                f"SELECT COUNT(*) FROM {table} WHERE {col} IS NULL")
            # BUG FIX: use a separate variable for the NULL count; the
            # original overwrote num_records, so the success log below
            # reported the last NULL count (always 0) instead of the
            # table's row count.
            num_nulls = null_counts[0][0]
            if num_nulls > 0:
                raise ValueError(
                    f"The column {col} in table {table} had a NULL value!")
        self.log.info(
            f"Data quality on table {table} check passed with {num_records} records"
        )
def execute(self, context):
    """
    Data Quality Checks:
    1. Check the target table has a positive number of rows
    2. Check the target table has no duplicate primary key

    Args:
        context: Airflow task context (unused).

    Raises:
        ValueError: if the table is empty or contains duplicate keys.

    Returns:
        None
    """
    hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    qf_row_count = self.q_row_count.format(schema=self.schema,
                                           table=self.table)
    self.log.info('Starting Data Quality Checks')

    # Test for presence of any records.
    records = hook.get_records(qf_row_count)
    # BUG FIX: the original wrapped these conditions in any([...]), which
    # builds the whole list eagerly — records[0] / records[0][0] were
    # evaluated even when records was empty, raising IndexError instead of
    # the intended ValueError. Plain `or` short-circuits safely.
    if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1:
        self.log.error("{} returned no lines".format(self.table))
        raise ValueError("{} returned no lines".format(self.table))
    # Release the first result set before running the next query.
    del records

    qf_dupes = self.q_dupes.format(schema=self.schema,
                                   table=self.table,
                                   pkey=self.pkey)
    # Test for no duplicates on the primary key.
    records = hook.get_records(qf_dupes)
    if records[0][0] > 1:
        self.log.error("{} returned duplicates".format(self.table))
        raise ValueError("{} returned duplicates".format(self.table))

    self.log.info("Data Quality checked passed on {}".format(self.table))
def execute(self, context):
    """Run configurable data-quality checks for each table in self.tables_list.

    Each entry in ``self.tables_list`` is a dict with a ``name`` key and an
    optional ``data_quality`` dict supporting:
      - ``minimum_records``: minimum acceptable row count (default 0)
      - ``not_null_columns``: columns that must contain no NULL values

    Args:
        context: Airflow task context (unused).

    Raises:
        ValueError: when a check fails or a count query returns no result.
    """
    self.log.info('Start DataQualityOperator')
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    error_messages = "Data quality check failed"
    for table_dict in self.tables_list:
        # Tables without a data_quality section are skipped entirely.
        if 'data_quality' not in table_dict:
            continue
        table = table_dict['name']
        task_name = f"Check for records in table {table}"
        min_records = table_dict['data_quality'].get('minimum_records', 0)
        records_count = 0
        self.log.info(task_name)
        records = redshift.get_records(f"SELECT COUNT(*) FROM {table}")
        if len(records) >= 1 and len(records[0]) >= 1:
            records_count = records[0][0]
            if records_count < min_records:
                # BUG FIX: the original message interpolated
                # "${records_count}" — a JS-template leftover that printed
                # a literal '$' before the count.
                raise ValueError(
                    f"""
                    {error_messages} for {task_name} found {records_count},
                    Expected a minimum of {min_records} records.
                    """
                )
        else:
            raise ValueError(f"{error_messages}. No result for {task_name}")

        if 'not_null_columns' in table_dict['data_quality'] and records_count > 0:
            not_null_columns = table_dict['data_quality']['not_null_columns']
            for column in not_null_columns:
                check_null_task_name = f"Check for null values in {table}.{column}"
                self.log.info(check_null_task_name)
                count_nulls = redshift.get_records(
                    f"SELECT COUNT(*) FROM {table} WHERE {column} is null")
                if len(count_nulls) >= 1 and len(count_nulls[0]) >= 1:
                    null_values = count_nulls[0][0]
                    if null_values > 0:
                        raise ValueError(
                            f"""
                            {error_messages} for {check_null_task_name},
                            Found {null_values} null records in {table}.{column}.
                            """
                        )
                else:
                    raise ValueError(
                        f"{error_messages}. No result for {check_null_task_name}")
def transfer_oltp_olap(**kwargs):
    """Get records from OLTP and transfer to OLAP database"""
    target_table = kwargs.get('dest_table')
    query = kwargs.get('sql')
    query_params = kwargs.get('params')

    # Source (transactional) and destination (analytical) connections.
    source_hook = PostgresHook(postgres_conn_id='oltp')
    target_hook = PostgresHook(postgres_conn_id='olap')

    rows = source_hook.get_records(sql=query, parameters=query_params)
    # Batch the inserts so very large extracts commit every 1000 rows.
    target_hook.insert_rows(target_table, rows, commit_every=1000)
def execute(self, context):
    """
    Description:
        This custom function implements one or more data quality checks
        that are passed as SQL commands in the data_quality_checks list,
        executes them and compares each result against its expected value.
        A mismatch, or any query failure, raises an error.

    Arguments:
        self: Instance of the class
        context: Context dictionary

    Raises:
        ValueError: when a query result differs from its expected result.

    Returns:
        None
    """
    # Build connection
    postgres = PostgresHook(postgres_conn_id=self.postgres_conn_id)

    # If no quality checks were specified, the function is terminated.
    # (Truthiness check replaces the original `len(...) <= 0`.)
    if not self.data_quality_checks:
        self.log.info(
            'No data quality checks were specified. Data quality checks canceled.'
        )
        return

    # Run each quality check: execute the SQL and compare the first cell
    # of the result against the expected value.
    for check in self.data_quality_checks:
        sql_query = check.get('sql_query')
        expected_result = check.get('expected_result')
        try:
            self.log.info(
                'Starting SQL query for data check - {}'.format(sql_query))
            records = postgres.get_records(sql_query)
            num_records = records[0][0]
            if num_records != expected_result:
                # BUG FIX: corrected "excpected" typo in the error message.
                raise ValueError(
                    'Data quality check failed. {} entries expected. {} given'
                    .format(expected_result, num_records))
            else:
                self.log.info(
                    'Data Check passed for query - {}. Result: {}'.format(
                        sql_query, num_records))
        except ValueError as v:
            # Log the failed comparison, then propagate to fail the task.
            self.log.info(v.args)
            raise
        except Exception as e:
            # Query execution itself failed (bad SQL, connection, etc.).
            self.log.info(
                'SQL query for data check failed - {}. Exception: {}'.
                format(sql_query, e))
            raise
def execute(self, context):
    """Push the latest CDC case date from covid_per_popgroup to XCom.

    Queries ``max(cdc_case_earliest_dt)``, truncates it to midnight,
    and pushes its ISO-8601 string (with a ".000" millisecond suffix
    appended) under the XCom key ``last_cdc_date``.

    Args:
        context: Airflow task context; used for the XCom push.
    """
    connection = PostgresHook(postgres_conn_id=self.postgres_conn_id)
    results = connection.get_records(
        "SELECT max(cdc_case_earliest_dt) FROM covid_per_popgroup")
    # NOTE(review): assumes the table is non-empty and the column is a
    # timestamp — a NULL max would fail on .year below; confirm upstream.
    last_datetime = results[0][0]
    # Truncate to midnight of that day.
    last_date = datetime(last_datetime.year, last_datetime.month,
                         last_datetime.day)
    self.log.info(f"last_date : {last_date}, {type(last_date)}")
    # ".000" suffix keeps the format consistent with the consumer's
    # expected millisecond precision.
    str_date = last_date.isoformat() + ".000"
    self.log.info(f"str_date : {str_date}, {type(str_date)}")
    context["task_instance"].xcom_push(key="last_cdc_date", value=str_date)
def execute(self, context):
    """Run each SQL check and compare its result to the expected value.

    ``self.sql_check[i]`` must be a query whose first cell is compared
    against ``self.expected_results[i]``; the two lists are index-aligned.

    Args:
        context: Airflow task context (unused).

    Raises:
        ValueError: when a query returns no rows or the wrong count.
    """
    redshift = PostgresHook(self.redshift_conn_id)
    for num, query in enumerate(self.sql_check):
        self.log.info("Executing data quality check query:")
        self.log.info(query)
        result = redshift.get_records(query)
        if len(result) < 1 or len(result[0]) < 1:
            # BUG FIX: corrected garbled message "No rows returns".
            raise ValueError(
                "Data quality check failed. No rows returned for query {}".
                format(query))
        num_records = result[0][0]
        if self.expected_results[num] != num_records:
            raise ValueError(
                "Data quality check failed: {}. Expecting num of records {}, but returned {}"
                .format(query, self.expected_results[num], num_records))
    self.log.info("All data quality checks are passed")