def test_load_to_table_delta():
    """Tests that upserts work.

    This is done by adding the delta_params inside the table metadata and
    checking that the row exists in both tables (master and delta) and that
    the row indeed matches the expected input.
    """
    hook = PgHook()
    table_md_mock = table_metadata_mock()
    table_md_mock.delta_params = {
        "master_table": table_md_mock.table_name,
        "delta_key": "id",
    }
    table_md_mock.table_name = f"{table_md_mock.table_name}_delta"
    header = ["name", "id"]
    data = ["david", "fz234kal"]
    input_data = [header, data]
    with NamedTemporaryFile(dir="/tmp", prefix=table_md_mock.load_prefix, mode="w+") as f:
        for row in input_data:
            f.write(",".join(row) + "\n")
        f.flush()
        hook.load_to_table(src_path=f.name, table_md=table_md_mock)
    with hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(
                f"SELECT * FROM {table_md_mock.schema_name}.{table_md_mock.table_name} "
                f"UNION ALL "
                f"SELECT * FROM {table_md_mock.schema_name}.{table_md_mock.delta_params['master_table']}"
            )
            result = cur.fetchall()
    # check that the row was indeed loaded
    assert result == [tuple(data), tuple(data)]
    # check that the same row is present both on the delta and the master table
    assert result[0] == result[1]
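# A hedged illustration of the upsert that delta_params presumably drive:
# rows land in the "<master_table>_delta" table first and are then merged
# into the master table on the delta_key. The exact SQL the hook emits is
# an assumption; conceptually something like:
#
#   INSERT INTO test.test_table
#   SELECT * FROM test.test_table_delta
#   ON CONFLICT (id) DO UPDATE SET name = EXCLUDED.name;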
def test_copy_expert_file_not_exists():
    """Tests that trying to load a file that does not exist raises an exception"""
    hook = PgHook()
    table_md_mock = table_metadata_mock()
    sql_gen = SQLGenerator(table_md=table_md_mock)
    with pytest.raises(FileNotFoundError):
        hook.copy_expert(query=sql_gen.copy_query(), src_path="/tmp/notexists.csv")
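# A minimal sketch (not the hook's actual implementation) of what
# PgHook.copy_expert presumably does, assuming it wraps psycopg2's
# cursor.copy_expert: the source file is opened eagerly, which is what makes
# the FileNotFoundError above surface before any SQL runs.
def _copy_expert_sketch(conn, query: str, src_path: str) -> None:
    with open(src_path, "r") as f:  # raises FileNotFoundError for a missing path
        with conn.cursor() as cur:
            cur.copy_expert(query, f)  # stream the file through COPY ... FROM STDIN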
@pytest.fixture
def drop_table():
    """A cleanup fixture for dropping a designated table after a specific test"""
    yield
    table_md_mock = table_metadata_mock()
    hook = PgHook()
    hook.execute(
        f"DROP TABLE {table_md_mock.schema_name}.{table_md_mock.table_name};")
def teardown_module(module):
    print("\nTEARING DOWN TEST ENVIRONMENT IN DATABASE")
    hook = PgHook()
    hook.execute("DROP SCHEMA test CASCADE;")
    print("\nTEARING DOWN TEST ENVIRONMENT IN RABBITMQ")
    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host=RABBIT_MQ_HOST))
    channel = connection.channel()
    channel.queue_delete(queue=TEST_QUEUE)
def test_get_conn(mocker):
    mock = MagicMock()
    mocker.patch("psql_client.psycopg2.connect", mock)
    hook = PgHook(**MOCK_DB_AUTH)
    hook.get_conn()
    mock.assert_called_once_with(database="test",
                                 user="******",
                                 password="******",
                                 host="test",
                                 port="test")
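# The shape of MOCK_DB_AUTH implied by the assertion above (the user and
# password values appear redacted in the source and are kept as-is):
#
# MOCK_DB_AUTH = {
#     "database": "test",
#     "user": "******",
#     "password": "******",
#     "host": "test",
#     "port": "test",
# }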
def setup_module(module):
    print("SETTING UP TEST QUEUE IN RABBITMQ")
    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host=RABBIT_MQ_HOST))
    channel = connection.channel()
    channel.queue_declare(queue=TEST_QUEUE)
    # wait for the queue to be set up
    sleep(5)
    hook = PgHook()
    hook.execute("CREATE SCHEMA test;")
def __load_to_pgres_callback(self, ch, method, properties, body):
    data = json.loads(body)
    # pull the values out of the message in field order; the SQL generator
    # wraps them in single quotes for loading into the database
    row = [data[field] for field in self.fields]
    hook = PgHook()
    sql_gen = SQLGenerator(self.table_md)
    queries = [
        sql_gen.create_table_query(),  # create table if not exists before loading
        sql_gen.insert_values_into(values=row),
    ]
    hook.execute(queries)
    ch.basic_ack(delivery_tag=method.delivery_tag)
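# A hedged example of the statement SQLGenerator.insert_values_into
# presumably produces, with values wrapped in single quotes as noted above;
# the column names and the exact quoting/escaping are assumptions:
#
#   INSERT INTO test.test_table (name, event, created_at)
#   VALUES ('foo', 'created', '2020-12-08T20:03:16.759617');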
def main(table_metadata_dir: str, raw_data_dir: str) -> None:
    while True:
        val = cli()
        if val == "3":
            break
        import_sources(
            tables_md_dir=table_metadata_dir,
            raw_data_dir=raw_data_dir,
            date_filter_val=val,
        )
        with open(join(SQL_MODELLING_DIR, "fact_events.sql"), "r") as f:
            sql = f.read()
        pg_hook = PgHook()
        pg_hook.execute(sql)
def test_import_sources():
    """This function exercises the entire mechanism of extracting the data
    and loading it to the database. It then verifies that the data exists
    and asserts its content. TemporaryDirectories and NamedTemporaryFiles
    are used for simulating the environment. The import_sources function
    uses a prefix to pick up files for loading, and therefore requires that
    the raw data is stored in the following structure:

    .
    └── raw_data directory
        └── subdir (e.g. organization_data)
            ├── foo.csv
            └── bar.csv
    """
    raw_data_dir = TemporaryDirectory(dir="/tmp", prefix="raw_data")
    test_data_dir = join(raw_data_dir.name, "test")
    mkdir(test_data_dir)
    raw_data_file = NamedTemporaryFile(dir=test_data_dir, prefix="test", suffix=".json")
    raw_data_file.write(get_mock_json().encode("utf-8"))
    raw_data_file.flush()
    table_md_yaml = get_mock_table_md_yaml()
    with TemporaryDirectory(dir="/tmp", prefix="table_metadata") as md_dir:
        with NamedTemporaryFile(dir=md_dir) as md:
            md.write(table_md_yaml.encode("utf-8"))
            md.flush()
            import_sources(tables_md_dir=md_dir, raw_data_dir=raw_data_dir.name)
    raw_data_file.close()
    raw_data_dir.cleanup()
    expected = [
        ("foo", "created", datetime(2020, 12, 8, 20, 3, 16, 759617)),
        ("bar", "created", datetime(2014, 12, 8, 20, 3, 16, 759617)),
    ]
    pg_hook = PgHook()
    with pg_hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT * FROM test.test_table_delta;")
            assert cur.fetchall() == expected
def test_load_to_table(drop_table):
    """Tests that data is loaded to a certain table"""
    hook = PgHook()
    table_md_mock = table_metadata_mock()
    header = ["name", "id"]
    row = ["david", "fz234kal"]
    input_data = [header, row]
    with NamedTemporaryFile(dir="/tmp", prefix=table_md_mock.load_prefix, mode="w+") as f:
        for line in input_data:
            f.write(",".join(line) + "\n")
        f.flush()
        hook.load_to_table(src_path=f.name, table_md=table_md_mock)
    with hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(
                f"SELECT * FROM {table_md_mock.schema_name}.{table_md_mock.table_name};"
            )
            assert list(cur.fetchall()[0]) == row
def test_copy_expert():
    hook = PgHook()
    table_md_mock = table_metadata_mock()
    sql_gen = SQLGenerator(table_md=table_md_mock)
    header = ["name", "id"]
    row = ["sarah", "fz234kal"]
    input_data = [header, row]
    with NamedTemporaryFile(dir="/tmp", prefix=table_md_mock.load_prefix, mode="w+") as f:
        for line in input_data:
            f.write(",".join(line) + "\n")
        f.flush()
        hook.copy_expert(src_path=f.name, query=sql_gen.copy_query())
    with hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(
                f"SELECT * FROM {table_md_mock.schema_name}.{table_md_mock.table_name} WHERE name='sarah';"
            )
            assert list(cur.fetchall()[0]) == row
def test_consumer_consume_events(publish_for_consumption):
    table_md = get_mock_table_md()
    pg_hook = PgHook()
    consumer = Consumer(host=RABBIT_MQ_HOST, queue=TEST_QUEUE, table_md=table_md)
    consumer_thread = threading.Thread(target=consumer.consume_events, daemon=True)
    consumer_thread.start()
    # give the consumer time to drain the queue
    sleep(4)
    expected = [
        ("foo", "created", datetime(2020, 12, 8, 20, 3, 16, 759617)),
        ("bar", "created", datetime(2014, 12, 8, 20, 3, 16, 759617)),
    ]
    with pg_hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(
                f"SELECT * FROM {table_md.schema_name}.{table_md.table_name};")
            assert cur.fetchall() == expected
    # best-effort stop signal; the daemon thread will not block shutdown anyway
    consumer_thread.__running = False
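# A hypothetical sketch of the `publish_for_consumption` fixture assumed by
# the test above. It assumes get_mock_json() returns a JSON array of event
# objects; the real fixture (likely in conftest.py) may differ.
@pytest.fixture
def publish_for_consumption():
    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host=RABBIT_MQ_HOST))
    channel = connection.channel()
    for event in json.loads(get_mock_json()):
        # each event becomes its own message on the test queue
        channel.basic_publish(exchange="", routing_key=TEST_QUEUE,
                              body=json.dumps(event))
    yield
    connection.close()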
def batch_load_to_pgres(self):
    """
    An inactivity timeout covers the case where fewer than 5 messages are
    left in the queue: if no further message arrives within 15 seconds, the
    timeout kicks in and triggers the processing of the batch currently
    stored in memory.
    """
    sql_gen = SQLGenerator(self.table_md)
    hook = PgHook()
    hook.execute(sql_gen.create_table_query())
    while True:
        try:
            connection, channel = self.__get_conn()
            batch = []
            # Get five messages and break out.
            for method_frame, properties, body in channel.consume(
                queue=self.queue, inactivity_timeout=15
            ):
                # if no more messages exist in the queue, break out of the loop
                if not method_frame:
                    break
                data = json.loads(body)
                row = [data[field] for field in self.fields]
                batch.append(sql_gen.insert_values_into(values=row))
                channel.basic_ack(method_frame.delivery_tag)
                # delivery tags start at 1 per channel, so tag 5 means a full batch
                if method_frame.delivery_tag == 5:
                    break
            # Requeue the rest of the messages after having pulled a batch
            channel.cancel()
            print("processing batch")
            hook.execute(batch)
        # close the channel and the connection safely when interrupted so we
        # don't leave hanging connections behind
        except KeyboardInterrupt:
            channel.close()
            connection.close()
            raise
def test_execute(queries):
    hook = PgHook()
    hook.execute(queries)
def test_execute_accepts_types(mocker, queries):
    """Tests that .execute accepts both a single string and a list of strings"""
    mock = MagicMock()
    mocker.patch("psql_client.psycopg2.connect", mock)
    hook = PgHook(**MOCK_DB_AUTH)
    hook.execute(queries=queries)
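# A hypothetical `queries` fixture matching the docstring above, parametrized
# so each dependent test runs once with a single string and once with a list
# of strings. The real fixture and its SQL statements are assumptions.
@pytest.fixture(params=[
    "SELECT 1;",
    ["SELECT 1;", "SELECT 2;"],
])
def queries(request):
    return request.param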
def teardown_module(module):
    print("\nTEARING DOWN TEST ENVIRONMENT IN DATABASE")
    hook = PgHook()
    hook.execute("DROP SCHEMA test CASCADE;")
def setup_module(module):
    print("SETTING UP TEST ENVIRONMENT IN DATABASE")
    hook = PgHook()
    hook.execute("CREATE SCHEMA test;")