def execute(self, context):
    postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
    credentials = s3_hook.get_credentials()
    unload_options = '\n\t\t\t'.join(self.unload_options)

    s3_key = '{}/{}_'.format(self.s3_key, self.table) if self.table_as_file_name else self.s3_key

    select_query = "SELECT * FROM {schema}.{table}".format(schema=self.schema, table=self.table)
    unload_query = """
                UNLOAD ('{select_query}')
                TO 's3://{s3_bucket}/{s3_key}'
                with credentials
                'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
                {unload_options};
                """.format(select_query=select_query,
                           s3_bucket=self.s3_bucket,
                           s3_key=s3_key,
                           access_key=credentials.access_key,
                           secret_key=credentials.secret_key,
                           unload_options=unload_options)

    self.log.info('Executing UNLOAD command...')
    postgres_hook.run(unload_query, self.autocommit)
    self.log.info("UNLOAD command complete...")
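A minimal usage sketch for the UNLOAD operator above, assuming it lives on a RedshiftToS3-style operator exposing the constructor fields the method reads; the class name, DAG id, connection IDs, bucket, and table are all illustrative.

# Hypothetical wiring of the operator above into a DAG; every literal below
# (connection IDs, bucket, schema, table) is an assumed example value.
from datetime import datetime
from airflow import DAG

with DAG(dag_id="redshift_unload_example",
         start_date=datetime(2023, 1, 1),
         schedule_interval=None) as dag:
    unload_users = RedshiftToS3Operator(  # assumed class name for the execute() above
        task_id="unload_users",
        redshift_conn_id="redshift_default",
        aws_conn_id="aws_default",
        s3_bucket="my-export-bucket",
        s3_key="exports",
        schema="public",
        table="users",
        table_as_file_name=True,
        unload_options=["FORMAT AS CSV", "PARALLEL OFF"],
    )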
def execute(self, context):
    """
    Data Quality Checks:
    1. Check the target table has a positive number of rows
    2. Check the target table has no duplicate primary key

    Args:
        context:

    Returns:
        None
    """
    hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    qf_row_count = self.q_row_count.format(schema=self.schema, table=self.table)

    self.log.info('Starting Data Quality Checks')

    # Test for presence of any records; short-circuit the checks so that
    # records[0] is only touched when the result set is non-empty (the
    # original any([...]) evaluated every element eagerly and could raise
    # IndexError on an empty result)
    records = hook.get_records(qf_row_count)
    if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1:
        self.log.error("{} returned no lines".format(self.table))
        raise ValueError("{} returned no lines".format(self.table))
    del records

    qf_dupes = self.q_dupes.format(schema=self.schema, table=self.table, pkey=self.pkey)

    # Test for no duplicates
    records = hook.get_records(qf_dupes)
    if records[0][0] > 1:
        self.log.error("{} returned duplicates".format(self.table))
        raise ValueError("{} returned duplicates".format(self.table))

    self.log.info("Data quality checks passed on {}".format(self.table))
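A sketch of what the two query templates referenced above might look like; the attribute names q_row_count and q_dupes and the placeholders {schema}, {table}, {pkey} come from the snippet, while the SQL bodies are assumptions consistent with how the results are read.

# Assumed templates: the row-count query returns a single count, and the
# duplicate query returns the largest number of rows sharing one primary key,
# so records[0][0] > 1 signals a duplicate.
q_row_count = "SELECT COUNT(*) FROM {schema}.{table};"
q_dupes = """
    SELECT MAX(per_key) FROM (
        SELECT {pkey}, COUNT(*) AS per_key
        FROM {schema}.{table}
        GROUP BY {pkey}
    ) AS key_counts;
"""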
def __datasource_to_csv(self, execution_date):
    final_query = self.extract_query.replace("$EXECUTION_DATE", """'%s'""" % execution_date)
    logging.info("QUERY : %s" % final_query)

    cursor = PostgresHook(self.connection).get_conn().cursor()
    cursor.execute(final_query)
    result = cursor.fetchall()

    # Write to CSV file
    temp_path = self.file_path + self.table_name + '_' + execution_date + '.csv'
    with open(temp_path, 'w') as fp:
        a = csv.writer(fp, quoting=csv.QUOTE_MINIMAL, delimiter='|')
        a.writerow([i[0] for i in cursor.description])
        a.writerows(result)

    # Read the CSV file back and compress it; the with blocks close both
    # files, so the explicit close() calls in the original were redundant
    full_path = temp_path + '.gz'
    with open(temp_path, 'rb') as f:
        data = f.read()
    with gzip.open(full_path, 'wb') as output:
        output.write(data)

    # Delete the uncompressed CSV file
    os.remove(temp_path)

    # Change access mode
    os.chmod(full_path, 0o777)
def execute(self, context):
    self.log.info('Executing: %s', self.sql)
    self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                             schema=self.database)
    self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
    for output in self.hook.conn.notices:
        self.log.info(output)
def execute(self, context):
    self._postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self._s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
    credentials = self._s3_hook.get_credentials()
    copy_options = '\n\t\t\t'.join(self.copy_options)

    copy_query = """
                COPY {schema}.{table}
                FROM 's3://{s3_bucket}/{s3_key}/{table}'
                with credentials
                'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
                {copy_options};
                """.format(schema=self.schema,
                           table=self.table,
                           s3_bucket=self.s3_bucket,
                           s3_key=self.s3_key,
                           access_key=credentials.access_key,
                           secret_key=credentials.secret_key,
                           copy_options=copy_options)

    self.log.info('Executing COPY command...')
    self._postgres_hook.run(copy_query, self.autocommit)
    self.log.info("COPY command complete...")
def execute(self, context: 'Context'):
    self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                             schema=self.database)
    if self.runtime_parameters:
        final_sql = []
        sql_param = {}
        for param in self.runtime_parameters:
            set_param_sql = f"SET {{}} TO %({param})s;"
            dynamic_sql = SQL(set_param_sql).format(Identifier(param))
            final_sql.append(dynamic_sql)
        for param, val in self.runtime_parameters.items():
            sql_param.update({f"{param}": f"{val}"})
        if self.parameters:
            sql_param.update(self.parameters)
        if isinstance(self.sql, str):
            final_sql.append(SQL(self.sql))
        else:
            final_sql.extend(list(map(SQL, self.sql)))
        self.hook.run(final_sql, self.autocommit, parameters=sql_param)
    else:
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
    for output in self.hook.conn.notices:
        self.log.info(output)
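A usage sketch for the runtime_parameters path above, assuming this execute() belongs to a PostgresOperator-style class; the class name, task id, connection, and parameter values are illustrative.

# Hypothetical instantiation: each runtime parameter is rendered as a quoted
# SET "name" TO %(name)s; statement executed before the main SQL.
run_with_timeout = PostgresOperator(  # assumed class name for the execute() above
    task_id="run_with_timeout",
    postgres_conn_id="postgres_default",
    sql="SELECT * FROM pg_stat_activity;",
    runtime_parameters={"statement_timeout": "3000ms"},
)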
def execute(self, context, testing=False):
    """
    Query data from Postgresql master database and then
    write into a json file.
    The default json file name is the table name + the utc time.
    """
    self.log.info('LoadToJsonOperator Starting...')
    try:
        self.log.info("Initializing Postgres Master DB Connection...")
        psql_hook = PostgresHook(postgres_conn_id=self._postgres_conn_id)
        psql_conn = psql_hook.get_conn()
        psql_cursor = psql_conn.cursor(cursor_factory=RealDictCursor)
        execution_date = context.get('execution_date')
        execution_date = execution_date.strftime(self._date_format)
        for table in self._tables:
            ports_select_all = self._query.format(table=table)
            psql_cursor.execute(ports_select_all)
            # Fall back to the default name per table; the original checked
            # the whole mapping for truthiness, so every table after the
            # first one raised KeyError below
            if table not in self._filenames:
                self._filenames[table] = f"{table}_{execution_date}.json"
            self.save_to_json(cursor=psql_cursor,
                              key=table,
                              file_name=self._filenames[table])
    except (UndefinedTable, OperationalError):
        self.log.error("LoadToJsonOperator FAILED.")
        raise Exception("LoadToJsonOperator FAILED. OperationalError")
    except Exception:
        self.log.error(traceback.format_exc())
        raise Exception("LoadToJsonOperator FAILED.")
    finally:
        if not testing:
            self.log.info('Closing database connections...')
            psql_conn.close()
    self.log.info('LoadToJsonOperator SUCCESS!')
def grower_export_wrap(ds, **kwargs):
    from lib.utils import print_time
    db = PostgresHook(postgres_conn_id=postgresConnId)
    conn = db.get_conn()
    try:
        date = datetime.now().strftime("%Y-%m-%d")
        CKAN_DOMAIN = Variable.get("CKAN_DOMAIN")
        # check if CKAN_DOMAIN exists
        assert CKAN_DOMAIN
        CKAN_DATASET_NAME_GROWER_DATA = Variable.get("CKAN_DATASET_NAME_GROWER_DATA")
        assert CKAN_DATASET_NAME_GROWER_DATA
        CKAN_API_KEY = Variable.get("CKAN_API_KEY")
        assert CKAN_API_KEY
        ckan_config = {
            "CKAN_DOMAIN": CKAN_DOMAIN,
            "CKAN_DATASET_NAME_GROWER_DATA": CKAN_DATASET_NAME_GROWER_DATA,
            "CKAN_API_KEY": CKAN_API_KEY,
        }
        print("ckan_config:", ckan_config)
        grower_export(conn, date, 178, ckan_config)
        return 0
    except Exception as e:
        print("got error during export:", e)
        raise ValueError('Error executing query')
def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id, client_type="redshift")
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    s3_path = "s3://{}/{}/{}".format(self.s3_bucket, self.s3_directory, self.table["s3"]["key"])
    files_format = self.table["s3"]["format"]
    delimiter = self.table["s3"]["delimiter"]
    ignoreheader = self.table["s3"]["ignoreheader"]

    delimiter_text = ''
    if delimiter:
        delimiter_text = f"delimiter '{delimiter}'"

    self.log.info(f"Start copying data from {s3_path} to table {self.table['name']}")

    redshift.run(f"""
        COPY {self.table['name']}
        FROM '{s3_path}'
        ACCESS_KEY_ID '{credentials.access_key}'
        SECRET_ACCESS_KEY '{credentials.secret_key}'
        {files_format}
        {delimiter_text}
        {ignoreheader}
    """)
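An illustrative shape for self.table, inferred from the keys the method reads; all values below are assumed examples, not the original configuration.

# Hypothetical config entry matching the lookups above ("name" plus the
# nested "s3" keys); the Redshift COPY clauses are example values.
table = {
    "name": "staging_events",
    "s3": {
        "key": "log_data/events.csv",
        "format": "FORMAT AS CSV",
        "delimiter": ",",
        "ignoreheader": "IGNOREHEADER 1",
    },
}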
def process_song_file(filepath):
    """
    Reads from song datasource and loads data into song and artist tables
    :param filepath: Source data location
    """
    postgres_hook = PostgresHook(postgres_conn_id='local_postgres', schema='postgres')
    conn = postgres_hook.get_conn()
    cur = conn.cursor()

    # open song file
    df = pd.read_json(filepath, lines=True)

    # insert song record
    song_data = df[['song_id', 'title', 'artist_id', 'year', 'duration']].values[0].tolist()
    cur.execute(song_table_insert, song_data)

    # insert artist record
    artist_data = df[[
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude'
    ]].values[0].tolist()
    cur.execute(artist_table_insert, artist_data)

    cur.close()
    conn.commit()
def tearDownClass(cls):
    postgres = PostgresHook()
    with postgres.get_conn() as conn:
        with conn.cursor() as cur:
            for table in TABLES:
                cur.execute("DROP TABLE IF EXISTS {} CASCADE;".format(table))
def query(self):
    """Queries Postgres and returns a cursor to the results."""
    hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql, self.parameters)
    return cursor
def test_get_conn_from_connection_with_schema(self, mock_connect):
    conn = Connection(login='******', password='******', host='host', schema='schema')
    hook = PostgresHook(connection=conn, schema='schema-override')
    hook.get_conn()
    mock_connect.assert_called_once_with(
        user='******', password='******', host='host', dbname='schema-override', port=None
    )
def execute(self, context):
    self.log.info('Connecting to redshift!')
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    for table in self.tables:
        records = redshift.get_records(f"SELECT COUNT(*) FROM {table}")
        if len(records) < 1 or len(records[0]) < 1:
            raise ValueError(f"Data quality check failed. {table} returned no results")
        num_records = records[0][0]
        if num_records < 1:
            raise ValueError(f"Data quality check failed. {table} contained 0 rows")

        # Use a separate variable for the NULL counts; the original reused
        # num_records, so the final log message reported the last NULL count
        # (always 0 on success) instead of the table's row count
        for col in self.columns[table]:
            null_records = redshift.get_records(
                f"SELECT COUNT(*) FROM {table} WHERE {col} IS NULL")
            num_nulls = null_records[0][0]
            if num_nulls > 0:
                raise ValueError(f"The column {col} in table {table} had a NULL value!")

        self.log.info(
            f"Data quality on table {table} check passed with {num_records} records")
def execute(self, context) -> None:
    postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
    credentials = s3_hook.get_credentials()
    copy_options = '\n\t\t\t'.join(self.copy_options)

    copy_statement = f"""
        COPY {self.schema}.{self.table}
        FROM 's3://{self.s3_bucket}/{self.s3_key}'
        with credentials
        'aws_access_key_id={credentials.access_key};aws_secret_access_key={credentials.secret_key}'
        {copy_options};
    """

    if self.truncate_table:
        truncate_statement = f'TRUNCATE TABLE {self.schema}.{self.table};'
        sql = f"""
            BEGIN;
            {truncate_statement}
            {copy_statement}
            COMMIT;
        """
    else:
        sql = copy_statement

    self.log.info('Executing COPY command...')
    postgres_hook.run(sql, self.autocommit)
    self.log.info("COPY command complete...")
def execute(self, **kwargs) -> None:
    """Execute the operator."""
    hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
    hook.bulk_load(
        table=self.target_table,
        tmp_file=self.source_file,
    )
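PostgresHook.bulk_load wraps a Postgres COPY ... FROM STDIN, so the source file is expected to be in COPY's default text format (tab-delimited, one row per line, no header). A usage sketch, with the operator class name and values assumed:

# Hypothetical instantiation of the operator that owns the execute() above.
load_staging = PostgresBulkLoadOperator(  # assumed class name
    task_id="load_staging_events",
    postgres_conn_id="postgres_default",
    target_table="staging.events",
    source_file="/tmp/events.tsv",  # tab-delimited, matching COPY's text format
)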
def check_if_table_exists(table_name, connection):
    hook = PostgresHook(postgres_conn_id=connection)
    query = hook.get_first(f"""SELECT * FROM information_schema.tables
                               WHERE table_name = '{table_name}'
                               AND table_schema = 'public'""")
    if query:
        return 'insert_new_row'
    else:
        return 'create_table'
def check_table_exist(sql_to_get_schema, sql_to_check_table_exist, table_name):
    """Callable function to check whether a table exists."""
    hook = PostgresHook()
    query = hook.get_first(sql=sql_to_check_table_exist.format(table_name.lower()))
    if query:
        return "insert_row"
    else:
        return "create_table"
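Both callables above return downstream task IDs, which is the contract of a branching task. A wiring sketch, with the DAG context, SQL templates, and table name assumed:

# Hypothetical branch task: the string returned by the callable selects which
# downstream task runs ("insert_row" or "create_table").
from airflow.operators.python import BranchPythonOperator

branch = BranchPythonOperator(
    task_id="check_table_exists",
    python_callable=check_table_exist,
    op_args=[
        "SELECT * FROM pg_tables;",                         # sql_to_get_schema (unused above)
        "SELECT * FROM pg_tables WHERE tablename = '{}';",  # sql_to_check_table_exist
        "users",                                            # table_name
    ],
)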
def insert_row(sql_query, table_name, custom_id, dt_now, **kwargs):
    """
    postgres hook to insert a new row:
    | id | user | timestamp |
    """
    hook = PostgresHook()
    connection = hook.get_conn()
    cursor = connection.cursor()
    cursor.execute(
        sql_query,
        (custom_id, kwargs["ti"].xcom_pull(task_ids="getting_current_user"), dt_now)
    )
    connection.commit()
def execute(self, context):
    hook = PostgresHook()
    connection = hook.get_conn()
    cursor = connection.cursor()
    cursor.execute(f"SELECT COUNT(*) FROM {self.table_name.lower()};")
    count_r = cursor.fetchall()
    context["ti"].xcom_push(
        key="{db_name}_rows_count".format(db_name=self.table_name),
        value=count_r,
    )
def create_new_person_records(ds, **kwargs):
    db = PostgresHook(postgres_conn_id=postgresConnId)
    conn = db.get_conn()
    cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    try:
        cursor.execute("""
            SELECT * FROM planter
            JOIN (
                SELECT regexp_replace((trim(lower(first_name)) || trim(lower(last_name))), '[ .-]', '', 'g') AS name_key, count(*)
                FROM planter
                WHERE planter.organization_id IN (
                    SELECT entity_id FROM getEntityRelationshipChildren(178)
                )
                GROUP BY name_key
                HAVING count(*) = 1
                ORDER BY name_key
            ) eligible_records
            ON regexp_replace((trim(lower(first_name)) || trim(lower(last_name))), '[ .-]', '', 'g') = eligible_records.name_key
            WHERE planter.organization_id IN (
                SELECT entity_id FROM getEntityRelationshipChildren(178)
            )
            AND person_id IS NULL
        """)
        print("SQL result:", cursor.query)
        for row in cursor:
            # process every eligible planter row; optionally print it
            print(row)
            updateCursor = conn.cursor()
            updateCursor.execute("""
                INSERT INTO entity (type, first_name, last_name, email, phone)
                VALUES ('p', %s, %s, %s, %s)
                RETURNING *
            """, (row['first_name'], row['last_name'], row['email'], row['phone']))
            personId = updateCursor.fetchone()[0]
            print(personId)
            updateCursor.execute("""
                UPDATE planter
                SET person_id = %s
                WHERE id = %s
            """, (personId, row['id']))
        conn.commit()
        return 0
    except Exception as e:
        print("got error when executing SQL:", e)
        print("SQL result:", updateCursor.query)
        raise ValueError('Error executing query')
def execute(self, context):
    es = ElasticHook(conn_id=self.elastic_conn_id)
    pg = PostgresHook(postgres_conn_id=self.postgres_conn_id)
    with closing(pg.get_conn()) as conn:
        with closing(conn.cursor()) as cursor:
            cursor.itersize = 1000
            cursor.execute(self.sql)
            for row in cursor:
                doc = json.dumps(row, indent=2)
                es.add_doc(index=self.index, doc_type='external', doc=doc)
def query(self):
    """Queries Postgres and returns a cursor to the results."""
    hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
    conn = hook.get_conn()
    cursor = conn.cursor(name=self._unique_name())
    cursor.execute(self.sql, self.parameters)
    if self.use_server_side_cursor:
        cursor.itersize = self.cursor_itersize
        return _PostgresServerSideCursorDecorator(cursor)
    return cursor
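A minimal sketch of the kind of wrapper _PostgresServerSideCursorDecorator might be, assuming its job is to iterate the named (server-side) cursor while exposing the metadata callers need; this is an illustration, not the library's actual implementation.

# Illustrative wrapper: yields rows from the server-side cursor one at a time
# while still exposing the column metadata a downstream writer would read.
class _PostgresServerSideCursorDecorator:
    def __init__(self, cursor):
        self.cursor = cursor

    def __iter__(self):
        return self

    def __next__(self):
        # psycopg2 named cursors fetch rows in itersize-sized batches
        return next(self.cursor)

    @property
    def description(self):
        return self.cursor.description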
def execute(self, context):
    self.log.info('Executing: %s', self.sql)
    self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                             schema=self.database)
    result = self.hook.get_first(self.sql, parameters=self.parameters)
    if not result:
        raise AirflowException("The query returned None")
    record = result[0]
    self.log.info('First record: %s', record)
    for output in self.hook.conn.notices:
        self.log.info(output)
    return record
def test_rowcount(self):
    hook = PostgresHook()
    input_data = ["foo", "bar", "baz"]

    with hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(f"CREATE TABLE {self.table} (c VARCHAR)")
            values = ",".join(f"('{data}')" for data in input_data)
            cur.execute(f"INSERT INTO {self.table} VALUES {values}")
            conn.commit()
            assert cur.rowcount == len(input_data)
def execute(self, context):
    self.log.info('Start DataQualityOperator')
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    error_message = "Data quality check failed"

    for table_dict in self.tables_list:
        if 'data_quality' in table_dict:
            table = table_dict['name']
            task_name = f"Check for records in table {table}"
            min_records = 0
            records_count = 0
            if 'minimum_records' in table_dict['data_quality']:
                min_records = table_dict['data_quality']['minimum_records']

            self.log.info(task_name)
            records = redshift.get_records(f"SELECT COUNT(*) FROM {table}")
            if len(records) >= 1 and len(records[0]) >= 1:
                records_count = records[0][0]
                if records_count < min_records:
                    raise ValueError(
                        f"{error_message} for {task_name}: found {records_count}, "
                        f"expected a minimum of {min_records} records."
                    )
            else:
                raise ValueError(f"{error_message}. No result for {task_name}")

            if 'not_null_columns' in table_dict['data_quality'] and records_count > 0:
                not_null_columns = table_dict['data_quality']['not_null_columns']
                for column in not_null_columns:
                    check_null_task_name = f"Check for null values in {table}.{column}"
                    self.log.info(check_null_task_name)
                    count_nulls = redshift.get_records(
                        f"SELECT COUNT(*) FROM {table} WHERE {column} IS NULL")
                    if len(count_nulls) >= 1 and len(count_nulls[0]) >= 1:
                        null_values = count_nulls[0][0]
                        if null_values > 0:
                            raise ValueError(
                                f"{error_message} for {check_null_task_name}: "
                                f"found {null_values} null records in {table}.{column}."
                            )
                    else:
                        raise ValueError(f"{error_message}. No result for {check_null_task_name}")
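An illustrative shape for tables_list, inferred from the keys the loop reads; the table names, threshold, and columns are assumed example values.

# Hypothetical configuration matching the lookups above.
tables_list = [
    {
        "name": "users",
        "data_quality": {
            "minimum_records": 1,
            "not_null_columns": ["userid", "level"],
        },
    },
    {"name": "staging_events"},  # entries without 'data_quality' are skipped
]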
def test_rowcount(self):
    hook = PostgresHook()
    input_data = ["foo", "bar", "baz"]

    with hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("CREATE TABLE {} (c VARCHAR)".format(self.table))
            values = ",".join("('{}')".format(data) for data in input_data)
            cur.execute("INSERT INTO {} VALUES {}".format(self.table, values))
            conn.commit()
            self.assertEqual(cur.rowcount, len(input_data))
def process_log_file(filepath):
    """
    Reads from log datasource and loads data into time, user and songplays tables
    :param filepath: Source data location
    """
    postgres_hook = PostgresHook(postgres_conn_id='local_postgres', schema='postgres')
    conn = postgres_hook.get_conn()
    cur = conn.cursor()

    # open log file
    df = pd.read_json(filepath, lines=True)

    # filter by NextSong action
    df = df[df['page'] == 'NextSong']

    # convert timestamp column to datetime
    t = pd.to_datetime(df['ts'], unit='ms')

    # insert time data records
    time_data = [t, t.dt.hour, t.dt.day, t.dt.week, t.dt.month, t.dt.year, t.dt.weekday]
    column_labels = ['start_time', 'hour', 'day', 'week', 'month', 'year', 'weekday']
    time_df = pd.DataFrame.from_dict(dict(zip(column_labels, time_data)))

    for i, row in time_df.iterrows():
        cur.execute(time_table_insert, list(row))

    # load user table
    user_df = df[['userId', 'firstName', 'lastName', 'gender', 'level']]

    # insert user records
    for i, row in user_df.iterrows():
        cur.execute(user_table_insert, row)

    # insert songplay records
    for index, row in df.iterrows():
        # get songid and artistid from song and artist tables
        cur.execute(song_select, (row.song, row.artist, row.length))
        results = cur.fetchone()

        if results:
            songid, artistid = results
        else:
            songid, artistid = None, None

        # insert songplay record
        songplay_data = (pd.to_datetime(row['ts'], unit='ms'), row['userId'], row['level'],
                         songid, artistid, row['sessionId'], row['location'], row['userAgent'])
        cur.execute(songplay_table_insert, songplay_data)

    cur.close()
    conn.commit()
def execute(self, context):
    """
    Description: This custom function implements one or more data quality
        checks that are passed as SQL commands in the data_quality_checks
        list, executes them and checks the return value for correctness.
        If everything matches, this function completes without any problems.
        If there is a mismatch, an error is raised.

    Arguments:
        self: Instance of the class
        context: Context dictionary

    Returns:
        None
    """
    # Build connection
    postgres = PostgresHook(postgres_conn_id=self.postgres_conn_id)

    # If no quality checks were specified, the function is terminated
    if len(self.data_quality_checks) <= 0:
        self.log.info('No data quality checks were specified. Data quality checks canceled.')
        return

    # Run every quality check: execute the associated SQL command and
    # compare the returned value with the expected result.
    for check in self.data_quality_checks:
        sql_query = check.get('sql_query')
        expected_result = check.get('expected_result')

        try:
            self.log.info('Starting SQL query for data check - {}'.format(sql_query))
            records = postgres.get_records(sql_query)
            num_records = records[0][0]

            if num_records != expected_result:
                raise ValueError(
                    'Data quality check failed. {} entries expected, {} given.'
                    .format(expected_result, num_records))
            else:
                self.log.info(
                    'Data check passed for query - {}. Result: {}'.format(sql_query, num_records))
        except ValueError as v:
            self.log.info(v.args)
            raise
        except Exception as e:
            self.log.info(
                'SQL query for data check failed - {}. Exception: {}'.format(sql_query, e))
            raise
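An illustrative shape for data_quality_checks, based on the two keys read above ('sql_query' and 'expected_result'); the queries and expected values are assumed examples.

# Hypothetical check list: each entry pairs a scalar-returning query with
# the value its first cell must equal.
data_quality_checks = [
    {'sql_query': 'SELECT COUNT(*) FROM songs WHERE songid IS NULL',
     'expected_result': 0},
    {'sql_query': 'SELECT COUNT(*) FROM artists WHERE name IS NULL',
     'expected_result': 0},
]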
def ingest_file_into_db(self, request_id):
    pg_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                           schema=self.database)
    with closing(pg_hook.get_conn()) as pg_conn:
        with closing(pg_conn.cursor()) as pg_cursor:
            csv_file = self.unzip_csv(self.get_file_data(request_id, pg_cursor))
            rowcount = self.upload_local_file_into_table(
                csv_file, pg_cursor, table.weather_daily)
            csv_file.close()
            pg_conn.commit()
            return rowcount