def test_trino_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Run both Trino ingestion recipes and compare each output with its golden file.

    Starts the docker-compose stack under ``tests/integration/trino``, loads
    ``/hive_setup.sql`` through ``beeline`` in the hive container, then runs the
    postgres-backed and the hive-backed recipes one after the other inside
    ``fs_helpers.isolated_filesystem(tmp_path)``.
    """
    resources = pytestconfig.rootpath / "tests/integration/trino"

    def trino_has_started():
        # The wait below succeeds once the info endpoint reports "starting": false.
        info = requests.get("http://localhost:5300/v1/info").json()
        return info["starting"] is False

    with docker_compose_runner(
        resources / "docker-compose.yml", "trino"
    ) as docker_services:
        wait_for_port(docker_services, "testtrino", 8080)
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)
        docker_services.wait_until_responsive(
            timeout=30,
            pause=1,
            check=trino_has_started,
        )

        # Set up the hive db
        hive_setup_cmd = "docker exec testhiveserver2 /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -f /hive_setup.sql"
        subprocess.run(hive_setup_cmd, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        cli = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            print(tmp_path)

            # Trino catalog referring to the postgres database.
            postgres_recipe = (resources / "trino_to_file.yml").resolve()
            postgres_result = cli.invoke(
                datahub, ["ingest", "-c", str(postgres_recipe)]
            )
            assert_result_ok(postgres_result)

            # Verify the output.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="trino_mces.json",
                golden_path=resources / "trino_mces_golden.json",
            )
            # Limitation 1 - MCE contains "nullable": true for all fields in trino database,
            # irrespective of not null constraints present in underlying postgres database.
            # This is issue with trino, also reported here - https://github.com/trinodb/trino/issues/6400,
            # Related : https://github.com/trinodb/trino/issues/4070
            # Limitation 2 - Dataset properties for postgres view (view definition, etc) are not
            # part of MCE from trino. Postgres views are exposed as tables in trino.
            # This setting depends on trino connector implementation - https://trino.io/episodes/18.html

            # Trino catalog referring to the hive database.
            hive_recipe = (resources / "trino_hive_to_file.yml").resolve()
            hive_result = cli.invoke(datahub, ["ingest", "-c", str(hive_recipe)])
            assert_result_ok(hive_result)

            # Verify the output; the ignored custom properties are excluded from the diff.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="trino_hive_mces.json",
                golden_path=resources / "trino_hive_mces_golden.json",
                ignore_paths=[
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastddltime'\]",
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
                ],
            )
def test_ldap_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Ingest from the dockerized OpenLDAP server and compare with the golden MCEs."""
    resources = pytestconfig.rootpath / "tests/integration/ldap"

    recipe = {
        "run_id": "ldap-test",
        "source": {
            "type": "ldap",
            "config": {
                "ldap_server": "ldap://localhost",
                "ldap_user": "******",
                "ldap_password": "******",
                "base_dn": "dc=example,dc=org",
            },
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/ldap_mces.json",
            },
        },
    }

    with docker_compose_runner(
        resources / "docker-compose.yml", "ldap"
    ) as docker_services:
        # The openldap container loads the sample data after exposing the port
        # publicly, so wait a little extra to make sure the data is in place.
        wait_for_port(docker_services, "openldap", 389)
        time.sleep(5)

        ldap_pipeline = Pipeline.create(recipe)
        ldap_pipeline.run()
        ldap_pipeline.raise_from_status()

        actual = mce_helpers.load_json_file(str(tmp_path / "ldap_mces.json"))
        expected = mce_helpers.load_json_file(
            str(resources / "ldap_mces_golden.json")
        )
        mce_helpers.assert_mces_equal(actual, expected)
def test_hive_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Populate the dockerized HiveServer2, ingest from it and diff against the golden file."""
    resources = pytestconfig.rootpath / "tests/integration/hive"
    recipe = (resources / "hive_to_file.yml").resolve()

    with docker_compose_runner(
        resources / "docker-compose.yml", "hive"
    ) as docker_services:
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)

        # Set up the container.
        setup_cmd = "docker exec testhiveserver2 /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -f /hive_setup.sql"
        subprocess.run(setup_cmd, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        run_datahub_cmd(["ingest", "-c", str(recipe)], tmp_path=tmp_path)

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "hive_mces.json",
            golden_path=resources / "hive_mces_golden.json",
            ignore_paths=[
                # example: root[1]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.dataset.DatasetProperties']['customProperties']['CreateTime:']
                # example: root[2]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.dataset.DatasetProperties']['customProperties']['Table Parameters: transient_lastDdlTime']
                r"root\[\d+\]\['proposedSnapshot'\]\['com\.linkedin\.pegasus2avro\.metadata\.snapshot\.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com\.linkedin\.pegasus2avro\.dataset\.DatasetProperties'\]\['customProperties'\]\['.*Time.*'\]",
                r"root\[6\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.schema.SchemaMetadata'\]\['fields'\]\[\d+\]\['nativeDataType'\]",
            ],
        )
def test_mongodb_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Ingest from the dockerized MongoDB instance and diff the file sink against the golden file."""
    resources = pytestconfig.rootpath / "tests/integration/mongodb"

    recipe = {
        "run_id": "mongodb-test",
        "source": {
            "type": "mongodb",
            "config": {
                "connect_uri": "mongodb://localhost:57017",
                "username": "******",
                "password": "******",
            },
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/mongodb_mces.json",
            },
        },
    }

    with docker_compose_runner(
        resources / "docker-compose.yml", "mongo"
    ) as docker_services:
        wait_for_port(docker_services, "testmongodb", 27017)

        # Run the metadata ingestion pipeline.
        mongo_pipeline = Pipeline.create(recipe)
        mongo_pipeline.run()
        mongo_pipeline.raise_from_status()

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "mongodb_mces.json",
            golden_path=resources / "mongodb_mces_golden.json",
        )
def test_mysql_ingest_with_db_alias(
    docker_compose_runner, pytestconfig, tmp_path, mock_time
):
    """Run the db-alias MySQL recipe and check every dataset urn carries the ``foogalaxy.`` prefix."""
    import re

    resources = pytestconfig.rootpath / "tests/integration/mysql"
    recipe = (resources / "mysql_to_file_dbalias.yml").resolve()

    with docker_compose_runner(
        resources / "docker-compose.yml", "mysql"
    ) as docker_services:
        wait_for_port(docker_services, "testmysql", 3306)

        # Run the metadata ingestion pipeline.
        run_datahub_cmd(
            ["ingest", "--strict-warnings", "-c", str(recipe)],
            tmp_path=tmp_path,
        )

        # Verify the output: all generated events must have instance specific urns.
        expected_prefix = re.escape(
            "urn:li:dataset:(urn:li:dataPlatform:mysql,foogalaxy."
        )
        mce_helpers.assert_mcp_entity_urn(
            filter="ALL",
            entity_type="dataset",
            regex_pattern="^" + expected_prefix,
            file=tmp_path / "mysql_mces_dbalias.json",
        )
def test_kafka_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Produce sample records to the dockerized broker, ingest via the CLI and diff against the golden file."""
    resources = pytestconfig.rootpath / "tests/integration/kafka"
    recipe = (resources / "kafka_to_file.yml").resolve()

    with docker_compose_runner(
        resources / "docker-compose.yml", "kafka"
    ) as docker_services:
        wait_for_port(docker_services, "test_broker", 9092, timeout=120)

        # Set up topics and produce some data
        produce_cmd = f"{resources}/send_records.sh {resources}"
        subprocess.run(produce_cmd, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        cli = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            outcome = cli.invoke(datahub, ["ingest", "-c", str(recipe)])
            assert outcome.exit_code == 0

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "kafka_mces.json",
            golden_path=resources / "kafka_mces_golden.json",
            ignore_paths=[],
        )
def test_kafka_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Produce sample records, ingest with ``run_datahub_cmd`` and diff against the golden file."""
    resources = pytestconfig.rootpath / "tests/integration/kafka"
    recipe = (resources / "kafka_to_file.yml").resolve()

    with docker_compose_runner(
        resources / "docker-compose.yml", "kafka"
    ) as docker_services:
        # Both the broker and the schema registry must accept connections first.
        wait_for_port(docker_services, "test_broker", 59092, timeout=120)
        wait_for_port(docker_services, "test_schema_registry", 8081, timeout=120)

        # Set up topics and produce some data
        produce_cmd = f"{resources}/send_records.sh {resources}"
        subprocess.run(produce_cmd, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        run_datahub_cmd(["ingest", "-c", str(recipe)], tmp_path=tmp_path)

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "kafka_mces.json",
            golden_path=resources / "kafka_mces_golden.json",
            ignore_paths=[],
        )
def test_feast_ingest(docker_compose_runner, pytestconfig, tmp_path):
    """Ingest from the dockerized Feast core service and compare with the golden MCEs."""
    resources = pytestconfig.rootpath / "tests/integration/feast"

    recipe = {
        "run_id": "feast-test",
        "source": {
            "type": "feast",
            "config": {
                "core_url": "localhost:6565",
                "use_local_build": True,
            },
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/feast_mces.json",
            },
        },
    }

    with docker_compose_runner(
        resources / "docker-compose.yml", "feast"
    ) as docker_services:
        wait_for_port(docker_services, "testfeast", 6565)

        # container listens to this port once test cases have been setup
        wait_for_port(docker_services, "testfeast_setup", 6789)

        # Run the metadata ingestion pipeline.
        feast_pipeline = Pipeline.create(recipe)
        feast_pipeline.run()
        feast_pipeline.raise_from_status()

        # Verify the output.
        actual = mce_helpers.load_json_file(str(tmp_path / "feast_mces.json"))
        expected = mce_helpers.load_json_file(
            str(resources / "feast_mce_golden.json")
        )
        mce_helpers.assert_mces_equal(actual, expected)
def test_mssql_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Populate the dockerized SQL Server, ingest via the CLI and compare with the golden MCEs."""
    resources = pytestconfig.rootpath / "tests/integration/sql_server"
    recipe = (resources / "mssql_to_file.yml").resolve()

    with docker_compose_runner(
        resources / "docker-compose.yml", "sql-server"
    ) as docker_services:
        # Wait for SQL Server to be ready. We wait an extra couple seconds, as the
        # port being available does not mean the server is accepting connections.
        # TODO: find a better way to check for liveness.
        wait_for_port(docker_services, "testsqlserver", 1433)
        time.sleep(5)

        # Run the setup.sql file to populate the database.
        setup_cmd = "docker exec testsqlserver /opt/mssql-tools/bin/sqlcmd -S localhost -U sa -P 'test!Password' -d master -i /setup/setup.sql"
        setup_proc = subprocess.run(
            setup_cmd,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        assert setup_proc.returncode == 0

        # Run the metadata ingestion pipeline.
        cli = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            outcome = cli.invoke(datahub, ["ingest", "-c", str(recipe)])
            assert outcome.exit_code == 0

            # Loaded by relative path, so this has to happen inside the
            # isolated filesystem block.
            actual = mce_helpers.load_json_file("mssql_mces.json")

        # Verify the output.
        expected = mce_helpers.load_json_file(
            str(resources / "mssql_mce_golden.json")
        )
        mce_helpers.assert_mces_equal(actual, expected)
def test_ge_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time, **kwargs):
    """Run the Great Expectations checkpoint and diff the captured MCPs against the golden file.

    ``DatahubRestEmitter.emit_mcp`` is patched so that every call is forwarded
    to a ``MockDatahubEmitter``, which is then dumped to ``ge_mcps.json``.
    """
    resources = pytestconfig.rootpath / "tests/integration/great-expectations"
    captured_mcps_file = tmp_path / "ge_mcps.json"

    with docker_compose_runner(
        resources / "docker-compose.yml", "great-expectations"
    ) as docker_services, mock.patch(
        "datahub.emitter.rest_emitter.DatahubRestEmitter.emit_mcp"
    ) as mock_emit_mcp:
        wait_for_port(docker_services, "ge_postgres", 5432)

        # Redirect emitted MCPs into the recording emitter.
        recording_emitter = MockDatahubEmitter("")
        mock_emit_mcp.side_effect = recording_emitter.emit_mcp

        # Copy the prepared GE project into the temp directory before running it.
        shutil.copytree(
            resources / "setup/great_expectations",
            tmp_path / "great_expectations",
        )
        ge_context = ge.DataContext.create(tmp_path)
        ge_context.run_checkpoint(checkpoint_name="test_checkpoint")

        recording_emitter.write_to_file(captured_mcps_file)

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=captured_mcps_file,
            golden_path=resources / "ge_mcps_golden.json",
            ignore_paths=[],
        )
def trino_runner(docker_compose_runner, pytestconfig):
    """Start the Trino docker-compose stack and yield its ``docker_services`` handle.

    Yields only after the Trino and HiveServer2 ports are reachable and the
    info endpoint reports ``"starting": false``.
    NOTE(review): presumably registered as a pytest fixture by a decorator
    outside this view - confirm.
    """
    resources = pytestconfig.rootpath / "tests/integration/trino"

    def trino_has_started():
        info = requests.get("http://localhost:5300/v1/info").json()
        return info["starting"] is False

    with docker_compose_runner(
        resources / "docker-compose.yml", "trino"
    ) as docker_services:
        wait_for_port(docker_services, "testtrino", 8080)
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)
        docker_services.wait_until_responsive(
            timeout=30,
            pause=1,
            check=trino_has_started,
        )
        yield docker_services
def test_mysql_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Ingest from the dockerized MySQL with ``--strict-warnings`` and diff against the golden file."""
    resources = pytestconfig.rootpath / "tests/integration/mysql"
    recipe = (resources / "mysql_to_file.yml").resolve()

    with docker_compose_runner(
        resources / "docker-compose.yml", "mysql"
    ) as docker_services:
        wait_for_port(docker_services, "testmysql", 3306)

        # Run the metadata ingestion pipeline.
        run_datahub_cmd(
            ["ingest", "--strict-warnings", "-c", str(recipe)],
            tmp_path=tmp_path,
        )

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "mysql_mces.json",
            golden_path=resources / "mysql_mces_golden.json",
        )
def test_mysql_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Ingest from the dockerized MySQL through ``CliRunner`` and diff against the golden file."""
    resources = pytestconfig.rootpath / "tests/integration/mysql"
    recipe = (resources / "mysql_to_file.yml").resolve()

    with docker_compose_runner(
        resources / "docker-compose.yml", "mysql"
    ) as docker_services:
        wait_for_port(docker_services, "testmysql", 3306)

        # Run the metadata ingestion pipeline.
        cli = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            outcome = cli.invoke(datahub, ["ingest", "-c", str(recipe)])
            assert outcome.exit_code == 0

            # Verify the output. The output path is relative, so the check
            # stays inside the isolated filesystem block.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="mysql_mces.json",
                golden_path=resources / "mysql_mces_golden.json",
            )
def hive_runner(docker_compose_runner, pytestconfig):
    """Start the Hive docker-compose stack and yield its ``docker_services`` handle.

    Yields once port 10000 of ``testhiveserver2`` is reachable.
    NOTE(review): presumably registered as a pytest fixture by a decorator
    outside this view - confirm.
    """
    compose_file = (
        pytestconfig.rootpath / "tests/integration/hive" / "docker-compose.yml"
    )
    with docker_compose_runner(compose_file, "hive") as docker_services:
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)
        yield docker_services
def test_kafka_connect_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Register a set of Kafka Connect connectors, ingest them and diff against the golden file.

    Each connector definition is POSTed to the Connect REST endpoint in order
    and must be answered with ``201``. The test then sleeps so the connectors
    can process the table data before the ingestion recipe is run.
    """
    resources = pytestconfig.rootpath / "tests/integration/kafka-connect"
    kafka_resources = pytestconfig.rootpath / "tests/integration/kafka"
    connectors_url = "http://localhost:58083/connectors"

    # Share Compose configurations between files and projects
    # https://docs.docker.com/compose/extends/
    compose_files = [
        str(kafka_resources / "docker-compose.yml"),
        str(resources / "docker-compose.override.yml"),
    ]

    # Connector definitions, registered in exactly this order.
    connector_payloads = [
        # MySQL source with no transformations , only topic prefix
        """{
            "name": "mysql_source1",
            "config": {
                "connector.class": "io.confluent.connect.jdbc.JdbcSourceConnector",
                "mode": "incrementing",
                "incrementing.column.name": "id",
                "topic.prefix": "test-mysql-jdbc-",
                "tasks.max": "1",
                "connection.url": "${env:MYSQL_CONNECTION_URL}"
            }
        }
        """,
        # MySQL source with regex router transformations , only topic prefix
        """{
            "name": "mysql_source2",
            "config": {
                "connector.class": "io.confluent.connect.jdbc.JdbcSourceConnector",
                "mode": "incrementing",
                "incrementing.column.name": "id",
                "tasks.max": "1",
                "connection.url": "${env:MYSQL_CONNECTION_URL}",
                "transforms": "TotalReplacement",
                "transforms.TotalReplacement.type": "org.apache.kafka.connect.transforms.RegexRouter",
                "transforms.TotalReplacement.regex": ".*(book)",
                "transforms.TotalReplacement.replacement": "my-new-topic-$1"
            }
        }
        """,
        # MySQL source with regex router transformations , no topic prefix, table whitelist
        """{
            "name": "mysql_source3",
            "config": {
                "connector.class": "io.confluent.connect.jdbc.JdbcSourceConnector",
                "mode": "incrementing",
                "incrementing.column.name": "id",
                "table.whitelist": "book",
                "tasks.max": "1",
                "connection.url": "${env:MYSQL_CONNECTION_URL}",
                "transforms": "TotalReplacement",
                "transforms.TotalReplacement.type": "org.apache.kafka.connect.transforms.RegexRouter",
                "transforms.TotalReplacement.regex": ".*",
                "transforms.TotalReplacement.replacement": "my-new-topic"
            }
        }
        """,
        # MySQL source with query , topic prefix
        """{
            "name": "mysql_source4",
            "config": {
                "connector.class": "io.confluent.connect.jdbc.JdbcSourceConnector",
                "mode": "incrementing",
                "incrementing.column.name": "id",
                "query": "select * from member",
                "topic.prefix": "query-topic",
                "tasks.max": "1",
                "connection.url": "${env:MYSQL_CONNECTION_URL}"
            }
        }
        """,
        # MySQL source with ExtractTopic router transformations - source dataset not added
        """{
            "name": "mysql_source5",
            "config": {
                "connector.class": "io.confluent.connect.jdbc.JdbcSourceConnector",
                "mode": "incrementing",
                "incrementing.column.name": "id",
                "table.whitelist": "book",
                "topic.prefix": "test-mysql-jdbc2-",
                "tasks.max": "1",
                "connection.url": "${env:MYSQL_CONNECTION_URL}",
                "transforms": "changetopic",
                "transforms.changetopic.type": "io.confluent.connect.transforms.ExtractTopic$Value",
                "transforms.changetopic.field": "name"
            }
        }
        """,
        # MySQL sink connector - not added
        """{
            "name": "mysql_sink",
            "config": {
                "connector.class": "io.confluent.connect.jdbc.JdbcSinkConnector",
                "insert.mode": "insert",
                "auto.create": true,
                "topics": "my-topic",
                "tasks.max": "1",
                "connection.url": "${env:MYSQL_CONNECTION_URL}"
            }
        }
        """,
        # Debezium MySQL source connector
        """{
            "name": "debezium-mysql-connector",
            "config": {
                "name": "debezium-mysql-connector",
                "connector.class": "io.debezium.connector.mysql.MySqlConnector",
                "database.hostname": "test_mysql",
                "database.port": "3306",
                "database.user": "******",
                "database.password": "******",
                "database.server.name": "debezium.topics",
                "database.history.kafka.bootstrap.servers": "test_broker:9092",
                "database.history.kafka.topic": "dbhistory.debeziummysql",
                "include.schema.changes": "false"
            }
        }
        """,
    ]

    with docker_compose_runner(compose_files, "kafka-connect") as docker_services:
        wait_for_port(docker_services, "test_broker", 59092, timeout=120)
        wait_for_port(docker_services, "test_connect", 58083, timeout=120)
        docker_services.wait_until_responsive(
            timeout=30,
            pause=1,
            check=lambda: requests.get(
                "http://localhost:58083/connectors",
            ).status_code
            == 200,
        )

        for payload in connector_payloads:
            response = requests.post(
                connectors_url,
                headers={"Content-Type": "application/json"},
                data=payload,
            )
            assert response.status_code == 201  # Created

        # Give time for connectors to process the table data
        time.sleep(45)

        # Run the metadata ingestion pipeline.
        recipe = (resources / "kafka_connect_to_file.yml").resolve()
        run_datahub_cmd(["ingest", "-c", str(recipe)], tmp_path=tmp_path)

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "kafka_connect_mces.json",
            golden_path=resources / "kafka_connect_mces_golden.json",
            ignore_paths=[],
        )
def test_kafka_ingest_with_stateful(
    docker_compose_runner, pytestconfig, tmp_path, mock_time, mock_datahub_graph
):
    """Check that stateful Kafka ingestion drops a deleted topic from the checkpoint.

    The pipeline is run twice against the dockerized broker with one topic
    deleted in between. The test asserts that the checkpoint states differ by
    exactly that topic's urn, that the checkpoint configuration is unchanged,
    and that the providers of *both* runs committed successfully.
    """
    test_resources_dir = pytestconfig.rootpath / "tests/integration/kafka"
    topic_prefix: str = "stateful_ingestion_test"
    topic_names: List[str] = [f"{topic_prefix}_t1", f"{topic_prefix}_t2"]
    platform_instance = "test_platform_instance_1"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "kafka"
    ) as docker_services:
        wait_for_port(docker_services, "test_broker", KAFKA_PORT, timeout=120)
        wait_for_port(docker_services, "test_schema_registry", 8081, timeout=120)

        source_config_dict: Dict[str, Any] = {
            "connection": {
                "bootstrap": KAFKA_BOOTSTRAP_SERVER,
            },
            "platform_instance": platform_instance,
            # enable stateful ingestion
            "stateful_ingestion": {
                "enabled": True,
                "remove_stale_metadata": True,
                "state_provider": {
                    "type": "datahub",
                    "config": {"datahub_api": {"server": GMS_SERVER}},
                },
            },
        }

        pipeline_config_dict: Dict[str, Any] = {
            "source": {
                "type": "kafka",
                "config": source_config_dict,
            },
            "sink": {
                # we are not really interested in the resulting events for this test
                "type": "console"
            },
            "pipeline_name": "test_pipeline",
            # enable reporting
            "reporting": [
                {
                    "type": "datahub",
                    "config": {"datahub_api": {"server": GMS_SERVER}},
                }
            ],
        }

        # topics will be automatically created and deleted upon test completion
        with KafkaTopicsCxtManager(
            topic_names, KAFKA_BOOTSTRAP_SERVER
        ) as kafka_ctx, patch(
            "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph",
            mock_datahub_graph,
        ) as mock_checkpoint, patch(
            "datahub.ingestion.reporting.datahub_ingestion_reporting_provider.DataHubGraph",
            mock_datahub_graph,
        ) as mock_reporting:
            # both checkpoint and reporting will use the same mocked graph instance
            mock_checkpoint.return_value = mock_datahub_graph
            mock_reporting.return_value = mock_datahub_graph

            # 1. Do the first run of the pipeline and get the default job's checkpoint.
            pipeline_run1 = run_and_get_pipeline(pipeline_config_dict)
            checkpoint1 = get_current_checkpoint_from_pipeline(pipeline_run1)

            assert checkpoint1
            assert checkpoint1.state

            # 2. Drop the first topic created during step 1 + rerun the pipeline
            #    and get the checkpoint state.
            kafka_ctx.delete_kafka_topics([kafka_ctx.topics[0]])
            # sleep to guarantee eventual consistency for kafka topic deletion
            time.sleep(1)
            pipeline_run2 = run_and_get_pipeline(pipeline_config_dict)
            checkpoint2 = get_current_checkpoint_from_pipeline(pipeline_run2)

            assert checkpoint2
            assert checkpoint2.state

            # 3. Perform all assertions on the states. The deleted topic should not be
            #    part of the second state
            state1 = cast(KafkaCheckpointState, checkpoint1.state)
            state2 = cast(KafkaCheckpointState, checkpoint2.state)
            difference_urns = list(state1.get_topic_urns_not_in(state2))

            assert len(difference_urns) == 1
            assert (
                difference_urns[0]
                == f"urn:li:dataset:(urn:li:dataPlatform:kafka,{platform_instance}.{kafka_ctx.topics[0]},PROD)"
            )

            # 4. Checkpoint configuration should be the same.
            assert checkpoint1.config == checkpoint2.config

            # 5. Validate that all providers have committed successfully.
            # NOTE: The following validation asserts for presence of state as well
            # and validates reporting. Both runs are validated; the second call
            # previously re-checked the first run, leaving run 2 unverified.
            validate_all_providers_have_committed_successfully(
                pipeline=pipeline_run1, expected_providers=2
            )
            validate_all_providers_have_committed_successfully(
                pipeline=pipeline_run2, expected_providers=2
            )
def test_nifi_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Ingest from the standalone and the clustered NiFi setups and diff each against its golden file.

    Both ingestion runs write to relative paths, so they execute inside
    ``fs_helpers.isolated_filesystem(tmp_path)``.
    """
    resources = pytestconfig.rootpath / "tests/integration/nifi"

    # (container name, container port, timeout) for every NiFi node, in wait order.
    nifi_endpoints = (
        ("nifi1", 9443, 300),
        ("nifi01", 9080, 60),
        ("nifi02", 9081, 60),
        ("nifi03", 9082, 60),
    )

    with docker_compose_runner(
        resources / "docker-compose.yml", "nifi"
    ) as docker_services:
        for node_name, node_port, wait_limit in nifi_endpoints:
            wait_for_port(
                docker_services,
                container_name=node_name,
                container_port=node_port,
                timeout=wait_limit,
            )

        # Wait for nifi to execute all processors
        time.sleep(120)

        # Run the metadata ingestion pipeline.
        with fs_helpers.isolated_filesystem(tmp_path):
            # Run nifi ingestion run.
            standalone_recipe = {
                "run_id": "nifi-test-standalone",
                "source": {
                    "type": "nifi",
                    "config": {
                        "site_url": "http://localhost:9443/nifi/",
                        # "auth": "CLIENT_CERT",
                        # "client_cert_file": f"{test_resources_dir}/setup/ssl_files/client-cert.pem",
                        # "client_key_file": f"{test_resources_dir}/setup/ssl_files/client-private-key.pem",
                        # "client_key_password": "******",
                        # "ca_file": f"{test_resources_dir}/setup/ssl_files/server_certfile.pem",
                        "process_group_pattern": {"deny": ["^WIP"]},
                    },
                },
                "sink": {
                    "type": "file",
                    "config": {"filename": "./nifi_mces.json"},
                },
            }
            standalone_pipeline = Pipeline.create(standalone_recipe)
            standalone_pipeline.run()
            standalone_pipeline.raise_from_status()

            # Verify the output. ignore values for aspects having last_event_time values
            # TODO: ignore paths with respect to aspect value in case of MCPs
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="nifi_mces.json",
                golden_path=resources / "nifi_mces_golden_standalone.json",
                ignore_paths=[
                    r"root\[1\]\['aspect'\]\['value'\]",
                    r"root\[5\]\['aspect'\]\['value'\]",
                    r"root\[7\]\['aspect'\]\['value'\]",
                ],
            )

            # Run nifi ingestion run.
            cluster_recipe = {
                "run_id": "nifi-test-cluster",
                "source": {
                    "type": "nifi",
                    "config": {
                        "site_url": "http://localhost:9080/nifi/",
                        "auth": "NO_AUTH",
                        "site_url_to_site_name": {
                            "http://nifi01:9080/nifi/": "default",
                            "http://nifi02:9081/nifi/": "default",
                        },
                    },
                },
                "sink": {
                    "type": "file",
                    "config": {"filename": "./nifi_mces_cluster.json"},
                },
            }
            cluster_pipeline = Pipeline.create(cluster_recipe)
            cluster_pipeline.run()
            cluster_pipeline.raise_from_status()

            # Verify the output.
            # TODO: ignore paths with respect to aspect value in case of MCPs
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="nifi_mces_cluster.json",
                golden_path=resources / "nifi_mces_golden_cluster.json",
                ignore_paths=[
                    r"root\[5\]\['aspect'\]\['value'\]",
                    r"root\[7\]\['aspect'\]\['value'\]",
                    r"root\[15\]\['aspect'\]\['value'\]",
                    r"root\[19\]\['aspect'\]\['value'\]",
                ],
            )