Example #1
def test_kafka_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/kafka"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "kafka"
    ) as docker_services:

        wait_for_port(docker_services, "test_broker", 9092, timeout=120)

        # Set up topics and produce some data
        command = f"{test_resources_dir}/send_records.sh {test_resources_dir}"
        subprocess.run(command, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            config_file = (test_resources_dir / "kafka_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert result.exit_code == 0

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "kafka_mces.json",
            golden_path=test_resources_dir / "kafka_mces_golden.json",
            ignore_paths=[],
        )
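The examples in this listing rely on a wait_for_port helper from the test suite. A minimal sketch of what such a helper could look like is shown below, assuming pytest-docker's docker_services fixture with its port_for and wait_until_responsive methods; the body is an illustration, not the suite's actual implementation, and the localhost host name is an assumption.

import socket


def wait_for_port(docker_services, container_name, container_port, timeout=30.0, pause=0.5):
    # port_for maps the container port to the port published on the host.
    host_port = docker_services.port_for(container_name, container_port)

    def _can_connect() -> bool:
        # Consider the service up once the published port accepts a TCP connection.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(1)
            return sock.connect_ex(("localhost", host_port)) == 0

    docker_services.wait_until_responsive(timeout=timeout, pause=pause, check=_can_connect)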
Example #2
def test_hive_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/hive"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "hive"
    ) as docker_services:
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)

        # Set up the container.
        command = "docker exec testhiveserver2 /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -f /hive_setup.sql"
        subprocess.run(command, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            config_file = (test_resources_dir / "hive_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert result.exit_code == 0

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "hive_mces.json",
            golden_path=test_resources_dir / "hive_mces_golden.json",
            ignore_paths=[
                # example: root[1]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.dataset.DatasetProperties']['customProperties']['CreateTime:']
                # example: root[2]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.dataset.DatasetProperties']['customProperties']['Table Parameters: transient_lastDdlTime']
                r"root\[\d+\]\['proposedSnapshot'\]\['com\.linkedin\.pegasus2avro\.metadata\.snapshot\.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com\.linkedin\.pegasus2avro\.dataset\.DatasetProperties'\]\['customProperties'\]\['.*Time.*'\]"
            ],
        )
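The ignore_paths entries are regular expressions over deepdiff-style paths, used to skip values that change on every run (timestamps, file counts, and so on). Below is a rough sketch of how a golden-file check could apply them; this is an illustration built on deepdiff's exclude_regex_paths, not DataHub's actual mce_helpers code, and the --update-golden-files flag name is an assumption.

import json
import pathlib

from deepdiff import DeepDiff


def check_golden_file(pytestconfig, output_path, golden_path, ignore_paths=()):
    output = json.loads(pathlib.Path(output_path).read_text())

    # Optionally regenerate the golden file instead of asserting against it.
    # The flag name here is an assumption for illustration.
    if pytestconfig.getoption("--update-golden-files", default=False):
        pathlib.Path(golden_path).write_text(json.dumps(output, indent=2))
        return

    golden = json.loads(pathlib.Path(golden_path).read_text())
    diff = DeepDiff(
        golden,
        output,
        exclude_regex_paths=list(ignore_paths),
        ignore_order=True,
    )
    assert not diff, f"Output does not match golden file: {diff.pretty()}"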
Example #3
def test_mssql_ingest(docker_compose_runner, pytestconfig, tmp_path,
                      mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/sql_server"

    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "sql-server") as docker_services:
        # Wait for SQL Server to be ready. We wait an extra few seconds, as the port being open
        # does not mean the server is accepting connections yet.
        # TODO: find a better way to check for liveness (see the sketch after this example).
        wait_for_port(docker_services, "testsqlserver", 1433)
        time.sleep(5)

        # Run the setup.sql file to populate the database.
        command = "docker exec testsqlserver /opt/mssql-tools/bin/sqlcmd -S localhost -U sa -P 'test!Password' -d master -i /setup/setup.sql"
        ret = subprocess.run(command, shell=True, capture_output=True)
        assert ret.returncode == 0

        # Run the metadata ingestion pipeline.
        config_file = (test_resources_dir / "mssql_to_file.yml").resolve()
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert result.exit_code == 0

            output = mce_helpers.load_json_file("mssql_mces.json")

        # Verify the output.
        golden = mce_helpers.load_json_file(
            str(test_resources_dir / "mssql_mce_golden.json"))
        mce_helpers.assert_mces_equal(output, golden)
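One possible way to address the liveness TODO above, shown as a hedged sketch rather than the repo's actual solution: poll sqlcmd inside the container until a trivial query succeeds, instead of sleeping for a fixed interval after the port opens.

import subprocess
import time


def wait_for_sqlserver_ready(container="testsqlserver", password="test!Password", timeout=60, pause=2):
    # Poll sqlcmd inside the container until it can execute a trivial query.
    probe = (
        f"docker exec {container} /opt/mssql-tools/bin/sqlcmd "
        f"-S localhost -U sa -P '{password}' -Q 'SELECT 1'"
    )
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if subprocess.run(probe, shell=True, capture_output=True).returncode == 0:
            return
        time.sleep(pause)
    raise TimeoutError("SQL Server did not become ready in time")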
Example #4
def test_trino_ingest(docker_compose_runner, pytestconfig, tmp_path,
                      mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/trino"

    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "trino") as docker_services:
        wait_for_port(docker_services, "testtrino", 8080)
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)

        docker_services.wait_until_responsive(
            timeout=30,
            pause=1,
            check=lambda: requests.get("http://localhost:5300/v1/info").json()[
                "starting"] is False,
        )

        # Set up the hive db
        command = "docker exec testhiveserver2 /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -f /hive_setup.sql"
        subprocess.run(command, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            print(tmp_path)

            # Run the metadata ingestion pipeline for the Trino catalog that refers to the Postgres database.
            config_file = (test_resources_dir / "trino_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert_result_ok(result)

            # Verify the output.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="trino_mces.json",
                golden_path=test_resources_dir / "trino_mces_golden.json",
            )

            # Limitation 1 - The MCE contains "nullable": true for all fields in the Trino database, irrespective of NOT NULL constraints present in the underlying Postgres database.
            # This is a Trino issue, reported at https://github.com/trinodb/trino/issues/6400 (related: https://github.com/trinodb/trino/issues/4070).

            # Limitation 2 - Dataset properties for a Postgres view (view definition, etc.) are not part of the MCE emitted via Trino,
            # because Postgres views are exposed as tables in Trino. This behavior depends on the Trino connector implementation - https://trino.io/episodes/18.html

            # Run the metadata ingestion pipeline for the Trino catalog that refers to the Hive database.
            config_file = (test_resources_dir /
                           "trino_hive_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert_result_ok(result)

            # Verify the output.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="trino_hive_mces.json",
                golden_path=test_resources_dir / "trino_hive_mces_golden.json",
                ignore_paths=[
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastddltime'\]",
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
                ],
            )
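This example checks CLI results with assert_result_ok instead of a bare exit-code assertion. A minimal sketch of such a helper for click's CliRunner results is shown below; its exact behavior in the test suite is an assumption, but the intent is to surface the captured output and traceback before failing.

import traceback

from click.testing import Result


def assert_result_ok(result: Result) -> None:
    if result.exit_code != 0:
        # Surface whatever the CLI captured so the failure is debuggable in CI logs.
        print(result.output)
        if result.exc_info:
            traceback.print_exception(*result.exc_info)
    assert result.exit_code == 0, f"CLI exited with code {result.exit_code}"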
Example #5
def test_trino_ingest(loaded_trino, test_resources_dir, pytestconfig, tmp_path,
                      mock_time):

    # Run the metadata ingestion pipeline.
    with fs_helpers.isolated_filesystem(tmp_path):

        # Run the metadata ingestion pipeline for the Trino catalog that refers to the Postgres database.
        mce_out_file = "trino_mces.json"
        events_file = tmp_path / mce_out_file

        pipeline_config = {
            "run_id": "trino-test",
            "source": {
                "type":
                data_platform,
                "config":
                TrinoConfig(
                    host_port="localhost:5300",
                    database="postgresqldb",
                    database_alias="library_catalog",
                    username="******",
                    schema_pattern=AllowDenyPattern(allow=["^librarydb"]),
                    profile_pattern=AllowDenyPattern(
                        allow=["library_catalog.librarydb.*"]),
                    profiling=GEProfilingConfig(
                        enabled=True,
                        include_field_null_count=True,
                        include_field_min_value=True,
                        include_field_max_value=True,
                        include_field_mean_value=True,
                        include_field_median_value=True,
                        include_field_stddev_value=True,
                        include_field_quantiles=True,
                        include_field_distinct_value_frequencies=True,
                        include_field_histogram=True,
                        include_field_sample_values=True,
                    ),
                ).dict(),
            },
            "sink": {
                "type": "file",
                "config": FileSinkConfig(filename=str(events_file)).dict(),
            },
        }

        # Run the metadata ingestion pipeline.
        pipeline = Pipeline.create(pipeline_config)
        pipeline.run()
        pipeline.pretty_print_summary()
        pipeline.raise_from_status(raise_warnings=True)
        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=mce_out_file,
            golden_path=test_resources_dir / "trino_mces_golden.json",
        )
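This snippet is taken from a larger test module; roughly the following surrounding definitions are assumed. The import paths are best-effort guesses for illustration and may differ between DataHub versions.

# Assumed surrounding context for the snippet above (paths are assumptions):
from datahub.configuration.common import AllowDenyPattern
from datahub.ingestion.run.pipeline import Pipeline
from datahub.ingestion.sink.file import FileSinkConfig
from datahub.ingestion.source.sql.trino import TrinoConfig

# GEProfilingConfig has moved between modules across releases; adjust as needed.
from datahub.ingestion.source.ge_data_profiler import GEProfilingConfig

data_platform = "trino"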
Example #6
def run_datahub_cmd(
    command: List[str], tmp_path: Optional[Path] = None, check_result: bool = True
) -> Result:
    runner = CliRunner()

    if tmp_path is None:
        result = runner.invoke(datahub, command)
    else:
        with fs_helpers.isolated_filesystem(tmp_path):
            result = runner.invoke(datahub, command)

    if check_result:
        assert_result_ok(result)
    return result
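For context, a test like Example #7 could be rewritten on top of this helper roughly as follows. This is illustrative only; it reuses the MySQL resources from the other examples and assumes the helper is importable in the test module.

def test_mysql_ingest_via_helper(pytestconfig, tmp_path):
    config_file = (
        pytestconfig.rootpath / "tests/integration/mysql/mysql_to_file.yml"
    ).resolve()
    # run_datahub_cmd asserts a successful exit by default (check_result=True).
    result = run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)
    assert result.exit_code == 0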
Example #7
def test_mysql_ingest(docker_compose_runner, pytestconfig, tmp_path,
                      mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/mysql"

    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "mysql") as docker_services:
        wait_for_port(docker_services, "testmysql", 3306)

        # Run the metadata ingestion pipeline.
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            config_file = (test_resources_dir / "mysql_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert result.exit_code == 0

            # Verify the output.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="mysql_mces.json",
                golden_path=test_resources_dir / "mysql_mces_golden.json",
            )
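Most of these tests wrap the CLI call in fs_helpers.isolated_filesystem(tmp_path) so that relative output paths such as "mysql_mces.json" land in the pytest temp directory. A plausible sketch of that context manager is shown below; it is an assumption about the helper, not its actual source.

import contextlib
import os
import pathlib
from typing import Iterator


@contextlib.contextmanager
def isolated_filesystem(tmp_path: pathlib.Path) -> Iterator[pathlib.Path]:
    cwd = os.getcwd()
    try:
        # Relative paths written by the code under test now resolve into tmp_path.
        os.chdir(tmp_path)
        yield tmp_path
    finally:
        # Always restore the original working directory, even if the test fails.
        os.chdir(cwd)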
Example #8
def test_nifi_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/nifi"
    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "nifi") as docker_services:
        wait_for_port(
            docker_services,
            container_name="nifi1",
            container_port=9443,
            timeout=300,
        )
        wait_for_port(
            docker_services,
            container_name="nifi01",
            container_port=9080,
            timeout=60,
        )
        wait_for_port(
            docker_services,
            container_name="nifi02",
            container_port=9081,
            timeout=60,
        )
        wait_for_port(
            docker_services,
            container_name="nifi03",
            container_port=9082,
            timeout=60,
        )

        # Wait for NiFi to execute all processors (a polling alternative is sketched after this example).
        time.sleep(120)

        # Run the metadata ingestion pipeline.
        with fs_helpers.isolated_filesystem(tmp_path):

            # Run the standalone NiFi ingestion pipeline.
            pipeline = Pipeline.create({
                "run_id": "nifi-test-standalone",
                "source": {
                    "type": "nifi",
                    "config": {
                        "site_url": "http://localhost:9443/nifi/",
                        # "auth": "CLIENT_CERT",
                        # "client_cert_file": f"{test_resources_dir}/setup/ssl_files/client-cert.pem",
                        # "client_key_file": f"{test_resources_dir}/setup/ssl_files/client-private-key.pem",
                        # "client_key_password": "******",
                        # "ca_file": f"{test_resources_dir}/setup/ssl_files/server_certfile.pem",
                        "process_group_pattern": {
                            "deny": ["^WIP"]
                        },
                    },
                },
                "sink": {
                    "type": "file",
                    "config": {
                        "filename": "./nifi_mces.json"
                    },
                },
            })
            pipeline.run()
            pipeline.raise_from_status()

            # Verify the output, ignoring values for aspects that contain last_event_time values.
            # TODO: ignore paths with respect to the aspect value in the case of MCPs.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="nifi_mces.json",
                golden_path=test_resources_dir /
                "nifi_mces_golden_standalone.json",
                ignore_paths=[
                    r"root\[1\]\['aspect'\]\['value'\]",
                    r"root\[5\]\['aspect'\]\['value'\]",
                    r"root\[7\]\['aspect'\]\['value'\]",
                ],
            )

            # Run the clustered NiFi ingestion pipeline.
            pipeline = Pipeline.create({
                "run_id": "nifi-test-cluster",
                "source": {
                    "type": "nifi",
                    "config": {
                        "site_url": "http://localhost:9080/nifi/",
                        "auth": "NO_AUTH",
                        "site_url_to_site_name": {
                            "http://nifi01:9080/nifi/": "default",
                            "http://nifi02:9081/nifi/": "default",
                        },
                    },
                },
                "sink": {
                    "type": "file",
                    "config": {
                        "filename": "./nifi_mces_cluster.json"
                    },
                },
            })
            pipeline.run()
            pipeline.raise_from_status()

            # Verify the output.
            # TODO: ignore paths with respect to the aspect value in the case of MCPs.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="nifi_mces_cluster.json",
                golden_path=test_resources_dir /
                "nifi_mces_golden_cluster.json",
                ignore_paths=[
                    r"root\[5\]\['aspect'\]\['value'\]",
                    r"root\[7\]\['aspect'\]\['value'\]",
                    r"root\[15\]\['aspect'\]\['value'\]",
                    r"root\[19\]\['aspect'\]\['value'\]",
                ],
            )
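As a hedged alternative to the fixed time.sleep(120) above, the test could poll NiFi's REST API until the root process group reports no queued flowfiles. The sketch below follows the public NiFi REST API, but the endpoint, response shape, and base URL are assumptions that may need adjusting for this setup.

import time

import requests


def wait_for_nifi_flow_to_drain(base_url="http://localhost:9080", timeout=300, pause=10):
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            status = requests.get(
                f"{base_url}/nifi-api/flow/process-groups/root/status", timeout=5
            ).json()
            # Treat an empty root-level queue as "all processors have finished".
            queued = status["processGroupStatus"]["aggregateSnapshot"]["flowFilesQueued"]
            if queued == 0:
                return
        except requests.RequestException:
            pass  # NiFi may still be starting up; keep polling.
        time.sleep(pause)
    raise TimeoutError("NiFi flow did not drain in time")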