Пример #1
0
def test_load_to_table_delta():
    """Tests that upserts work. This is done by adding the delta_params inside the table metadatabase and comparing
    that the row exists in both tables (master and delta) and that indeed the row matches the expected input
    """
    hook = PgHook()
    table_md_mock = table_metadata_mock()
    table_md_mock.delta_params = {
        "master_table": table_md_mock.table_name,
        "delta_key": "id",
    }
    table_md_mock.table_name = f"{table_md_mock.table_name}_delta"
    header = ["name", "id"]
    data = ["david", "fz234kal"]
    input_data = [header, data]
    with NamedTemporaryFile(dir="/tmp",
                            prefix=table_md_mock.load_prefix,
                            mode="w+") as f:
        for row in input_data:
            f.write(",".join(row) + "\n")
            f.flush()
        hook.load_to_table(src_path=f.name, table_md=table_md_mock)

    with hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(
                f"SELECT * FROM {table_md_mock.schema_name}.{table_md_mock.table_name} "
                f"UNION ALL "
                f"SELECT * FROM {table_md_mock.schema_name}.{table_md_mock.delta_params['master_table']}"
            )
            result = cur.fetchall()
            # check that the row was indeed loaded
            assert result == [tuple(data), tuple(data)]
            # check that the same row is present both on the delta and master table
            assert result[0] == result[1]
Пример #2
0
def test_copy_expert_file_not_exists():
    """Tests that when a file that does not exist is loaded (or tried to) then an exception is raised"""
    hook = PgHook()
    table_md_mock = table_metadata_mock()
    sql_gen = SQLGenerator(table_md=table_md_mock)
    with pytest.raises(FileNotFoundError):
        hook.copy_expert(query=sql_gen.copy_query(),
                         src_path="/tmp/notexists.csv")
Пример #3
0
def drop_table():
    """
    a cleanup fixture when needed after a specific test for dropping a designated table
    """
    yield
    table_md_mock = table_metadata_mock()
    hook = PgHook()
    hook.execute(
        f"DROP TABLE {table_md_mock.schema_name}.{table_md_mock.table_name};")
Пример #4
0
def teardown_module(module):
    print("\nTEARING DOWN TEST ENVIRONMENT IN DATABASE")
    hook = PgHook()
    hook.execute("DROP SCHEMA test CASCADE;")
    print("\nTEARING DOWN TEST ENVIRONMENT IN RABBITMQ")
    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host=RABBIT_MQ_HOST))
    channel = connection.channel()
    channel.queue_delete(queue=TEST_QUEUE)
Пример #5
0
def test_get_conn(mocker):
    mock = MagicMock()
    mocker.patch("psql_client.psycopg2.connect", mock)
    hook = PgHook(**MOCK_DB_AUTH)
    hook.get_conn()
    mock.assert_called_once_with(database="test",
                                 user="******",
                                 password="******",
                                 host="test",
                                 port="test")
Пример #6
0
def setup_module(module):
    print("SETTING UP TEST QUEUE IN RABBITMQ")
    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host=RABBIT_MQ_HOST))
    channel = connection.channel()
    channel.queue_declare(queue=TEST_QUEUE)
    # wait for the queue to setup
    sleep(5)
    hook = PgHook()
    hook.execute("CREATE SCHEMA test;")
Пример #7
0
 def __load_to_pgres_callback(self, ch, method, properties, body):
     data = json.loads(body)
     # encapsulating values inside single quotes for loading into the database
     row = [data[field] for field in self.fields]
     hook = PgHook()
     sql_gen = SQLGenerator(self.table_md)
     queries = [
         sql_gen.create_table_query(),  # create table if not exists for loading
         sql_gen.insert_values_into(values=row),
     ]
     hook.execute(queries)
     ch.basic_ack(delivery_tag=method.delivery_tag)
Пример #8
0
def main(table_metadata_dir: str, raw_data_dir: str) -> None:
    while True:
        val = cli()
        if val == "3":
            break
        import_sources(
            tables_md_dir=table_metadata_dir,
            raw_data_dir=raw_data_dir,
            date_filter_val=val,
        )

        with open(join(SQL_MODELLING_DIR, "fact_events.sql"), "r") as f:
            sql = f.read()

        pg_hook = PgHook()
        pg_hook.execute(sql)
Пример #9
0
def test_import_sources():
    """This function includes the entire mechanism of extracting the data and loading it to the database.
    It then verifies that the data exists and asserts its content.
    TemporaryDirectories and NamedTemporaryFiles are used for simulating the environment. The import sources function
    uses prefix to pick up files for loading, and therefore requires that the raw data is stored in the following
    structure:
    .
    └── raw_data directory
        └── subdir (e.g. organization_data)
            ├── foo.csv
            └── bar.csv

    """
    raw_data_dir = TemporaryDirectory(dir="/tmp", prefix="raw_data")
    test_data_dir = join(raw_data_dir.name, "test")
    mkdir(test_data_dir)
    raw_data_file = NamedTemporaryFile(dir=test_data_dir,
                                       prefix="test",
                                       suffix=".json")
    raw_data_file.write(get_mock_json().encode("utf-8"))
    raw_data_file.flush()

    table_md_yaml = get_mock_table_md_yaml()
    with TemporaryDirectory(dir="/tmp", prefix="table_metadata") as md_dir:
        with NamedTemporaryFile(dir=md_dir) as md:
            md.write(table_md_yaml.encode("utf-8"))
            md.flush()
            import_sources(tables_md_dir=md_dir,
                           raw_data_dir=raw_data_dir.name)

    raw_data_file.close()
    raw_data_dir.cleanup()

    expected = [
        ("foo", "created", datetime(2020, 12, 8, 20, 3, 16, 759617)),
        ("bar", "created", datetime(2014, 12, 8, 20, 3, 16, 759617)),
    ]
    pg_hook = PgHook()
    with pg_hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT * FROM test.test_table_delta;")
            assert cur.fetchall() == expected
Пример #10
0
def test_load_to_table(drop_table):
    """Tests that data is loaded to a certain table"""
    hook = PgHook()
    table_md_mock = table_metadata_mock()
    header = ["name", "id"]
    row = ["david", "fz234kal"]
    input_data = [header, row]
    with NamedTemporaryFile(dir="/tmp",
                            prefix=table_md_mock.load_prefix,
                            mode="w+") as f:
        for row in input_data:
            f.write(",".join(row) + "\n")
            f.flush()
        hook.load_to_table(src_path=f.name, table_md=table_md_mock)

    with hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(
                f"SELECT * FROM {table_md_mock.schema_name}.{table_md_mock.table_name};"
            )
            assert list(cur.fetchall()[0]) == row
Пример #11
0
def test_copy_expert():
    hook = PgHook()
    table_md_mock = table_metadata_mock()
    sql_gen = SQLGenerator(table_md=table_md_mock)
    header = ["name", "id"]
    row = ["sarah", "fz234kal"]
    input_data = [header, row]

    with NamedTemporaryFile(dir="/tmp",
                            prefix=table_md_mock.load_prefix,
                            mode="w+") as f:
        for row in input_data:
            f.write(",".join(row) + "\n")
            f.flush()
        hook.copy_expert(src_path=f.name, query=sql_gen.copy_query())

    with hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(
                f"SELECT * FROM {table_md_mock.schema_name}.{table_md_mock.table_name} WHERE name='sarah';"
            )
            assert list(cur.fetchall()[0]) == row
Пример #12
0
def test_consumer_consume_events(publish_for_consumption):
    table_md = get_mock_table_md()
    pg_hook = PgHook()
    consumer = Consumer(host=RABBIT_MQ_HOST,
                        queue=TEST_QUEUE,
                        table_md=table_md)
    consumer_thread = threading.Thread(target=consumer.consume_events,
                                       daemon=True)
    consumer_thread.start()
    sleep(4)

    expected = [
        ("foo", "created", datetime(2020, 12, 8, 20, 3, 16, 759617)),
        ("bar", "created", datetime(2014, 12, 8, 20, 3, 16, 759617)),
    ]

    with pg_hook.get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(
                f"SELECT * FROM {table_md.schema_name}.{table_md.table_name};")
            assert cur.fetchall() == expected

    consumer_thread.__running = False
Пример #13
0
    def batch_load_to_pgres(self):
        """
        Inactivity timeout is added in cases where there are less than 5 messages left in the queue. If no further messages
        arrive in 15 seconds, the inactivity timeout kicks in and triggers the processing of the batch currently
        stored in memory.
        """
        sql_gen = SQLGenerator(self.table_md)
        hook = PgHook()
        hook.execute(sql_gen.create_table_query())

        while True:
            try:
                connection, channel = self.__get_conn()
                batch = []
                # Get five messages and break out.
                for method_frame, properties, body in channel.consume(
                    queue=self.queue, inactivity_timeout=15
                ):

                    # if no more messages exist in the queue, break out of the loop
                    if not method_frame:
                        break
                    data = json.loads(body)
                    row = [data[field] for field in self.fields]
                    batch.append(sql_gen.insert_values_into(values=row))
                    channel.basic_ack(method_frame.delivery_tag)

                    if method_frame.delivery_tag == 5:
                        break
                # Requeing the rest of the messages after having pulled a batch
                channel.cancel()
                print("processing batch")
                hook.execute(batch)

            # Close the channel and the connection safely when interrupting so we don't get hanging connections
            except KeyboardInterrupt:  # safely
                channel.close()
                connection.close()
                raise
Пример #14
0
def test_execute(queries):
    hook = PgHook()
    hook.execute(queries)
Пример #15
0
def test_execute_accepts_types(mocker, queries):
    """Tests that .execute takes both a single string and a list of strings"""
    mock = MagicMock()
    mocker.patch("psql_client.psycopg2.connect", mock)
    hook = PgHook(**MOCK_DB_AUTH)
    hook.execute(queries=queries)
Пример #16
0
def teardown_module(module):
    print("\nTEARING DOWN TEST ENVIRONMENT IN DATABASE")
    hook = PgHook()
    hook.execute("DROP SCHEMA test CASCADE;")
Пример #17
0
def setup_module(module):
    print("SETTING UP TEST ENVIRONMENT IN DATABASE")
    hook = PgHook()
    hook.execute("CREATE SCHEMA test;")