def test_example_F(test_cases_for_sql_data_connector_sqlite_execution_engine):
    """Self-check report for a table split on a foreign-key column (case F).

    Verifies asset discovery, batch counts, and example data references for
    ``table_partitioned_by_foreign_key__F`` split on ``session_id``.
    """
    random.seed(0)
    db = test_cases_for_sql_data_connector_sqlite_execution_engine

    # NOTE(review): sibling tests in this file configure assets under the
    # "assets" key (see test_basic_self_check); "data_assets" appears to be
    # the stale pre-rename key, so it is updated here for consistency.
    config = yaml.load(
        """
    name: my_sql_data_connector
    datasource_name: FAKE_Datasource_NAME

    assets:
        table_partitioned_by_foreign_key__F:
            splitter_method: _split_on_column_value
            splitter_kwargs:
                column_name: session_id
    """,
    )
    config["execution_engine"] = db

    my_data_connector = ConfiguredAssetSqlDataConnector(**config)

    report = my_data_connector.self_check()
    print(json.dumps(report, indent=2))

    assert report == {
        "class_name": "ConfiguredAssetSqlDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["table_partitioned_by_foreign_key__F"],
        "data_assets": {
            "table_partitioned_by_foreign_key__F": {
                "batch_definition_count": 49,
                # TODO Abe 20201029 : These values should be sorted
                "example_data_references": [
                    {"session_id": 3},
                    {"session_id": 2},
                    {"session_id": 4},
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590
        # (kept commented for parity with the other self-check tests):
        # "example_data_reference": {
        #     "n_rows": 2,
        #     "batch_spec": {
        #         "table_name": "table_partitioned_by_foreign_key__F",
        #         "batch_identifiers": {"session_id": 2},
        #         "splitter_method": "_split_on_column_value",
        #         "splitter_kwargs": {"column_name": "session_id"},
        #     },
        # },
    }
def test_example_A(test_cases_for_sql_data_connector_sqlite_execution_engine):
    """Self-check report for a table split on a date column (case A).

    Verifies asset discovery, batch counts, and example data references for
    ``table_partitioned_by_date_column__A`` split on ``date``.
    """
    random.seed(0)
    db = test_cases_for_sql_data_connector_sqlite_execution_engine

    # NOTE(review): sibling tests in this file configure assets under the
    # "assets" key (see test_basic_self_check); "data_assets" appears to be
    # the stale pre-rename key, so it is updated here for consistency.
    config = yaml.load(
        """
    name: my_sql_data_connector
    datasource_name: FAKE_Datasource_NAME

    assets:
        table_partitioned_by_date_column__A:
            splitter_method: _split_on_column_value
            splitter_kwargs:
                column_name: date
    """,
    )
    config["execution_engine"] = db

    my_data_connector = ConfiguredAssetSqlDataConnector(**config)

    report = my_data_connector.self_check()
    print(json.dumps(report, indent=2))

    assert report == {
        "class_name": "ConfiguredAssetSqlDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["table_partitioned_by_date_column__A"],
        "data_assets": {
            "table_partitioned_by_date_column__A": {
                "batch_definition_count": 30,
                "example_data_references": [
                    {"date": "2020-01-01"},
                    {"date": "2020-01-02"},
                    {"date": "2020-01-03"},
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590
        # (kept commented for parity with the other self-check tests):
        # "example_data_reference": {
        #     "n_rows": 8,
        #     "batch_spec": {
        #         "table_name": "table_partitioned_by_date_column__A",
        #         "batch_identifiers": {"date": "2020-01-02"},
        #         "splitter_method": "_split_on_column_value",
        #         "splitter_kwargs": {"column_name": "date"},
        #     },
        # },
    }
def test_basic_self_check(test_cases_for_sql_data_connector_sqlite_execution_engine):
    """Baseline self-check: one asset split on its date column.

    Confirms the connector discovers the asset, reports 30 batch
    definitions, and leaves no unmatched data references.
    """
    random.seed(0)
    db = test_cases_for_sql_data_connector_sqlite_execution_engine

    config = yaml.load(
        """
    name: my_sql_data_connector
    datasource_name: FAKE_Datasource_NAME

    assets:
        table_partitioned_by_date_column__A:
            #table_name: events # If table_name is omitted, then the table_name defaults to the asset name
            splitter_method: _split_on_column_value
            splitter_kwargs:
                column_name: date
    """,
    )
    config["execution_engine"] = db

    connector = ConfiguredAssetSqlDataConnector(**config)
    report = connector.self_check()
    print(json.dumps(report, indent=2))

    expected_report = {
        "class_name": "ConfiguredAssetSqlDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["table_partitioned_by_date_column__A"],
        "data_assets": {
            "table_partitioned_by_date_column__A": {
                "batch_definition_count": 30,
                "example_data_references": [
                    {"date": "2020-01-01"},
                    {"date": "2020-01-02"},
                    {"date": "2020-01-03"},
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {
        #     "n_rows": 8,
        #     "batch_spec": {
        #         "table_name": "table_partitioned_by_date_column__A",
        #         "data_asset_name": "table_partitioned_by_date_column__A",
        #         "batch_identifiers": {"date": "2020-01-02"},
        #         "splitter_method": "_split_on_column_value",
        #         "splitter_kwargs": {"column_name": "date"},
        #     },
        # },
    }
    assert report == expected_report
def test_example_G(test_cases_for_sql_data_connector_sqlite_execution_engine):
    """Self-check report for a table split across multiple columns (case G).

    The asset is partitioned on (y, m, d) via ``_split_on_multi_column_values``.
    """
    random.seed(0)
    db = test_cases_for_sql_data_connector_sqlite_execution_engine

    config = yaml.load(
        """
    name: my_sql_data_connector
    datasource_name: FAKE_Datasource_NAME

    assets:
        table_partitioned_by_multiple_columns__G:
            splitter_method: _split_on_multi_column_values
            splitter_kwargs:
                column_names:
                    - y
                    - m
                    - d
    """,
    )
    config["execution_engine"] = db

    connector = ConfiguredAssetSqlDataConnector(**config)
    report = connector.self_check()
    print(json.dumps(report, indent=2))

    expected_report = {
        "class_name": "ConfiguredAssetSqlDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["table_partitioned_by_multiple_columns__G"],
        "data_assets": {
            "table_partitioned_by_multiple_columns__G": {
                "batch_definition_count": 30,
                # TODO Abe 20201029 : These values should be sorted
                "example_data_references": [
                    {"y": 2020, "m": 1, "d": 1},
                    {"y": 2020, "m": 1, "d": 2},
                    {"y": 2020, "m": 1, "d": 3},
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {
        #     "n_rows": 8,
        #     "batch_spec": {
        #         "table_name": "table_partitioned_by_multiple_columns__G",
        #         "data_asset_name": "table_partitioned_by_multiple_columns__G",
        #         "batch_identifiers": {
        #             "y": 2020,
        #             "m": 1,
        #             "d": 2,
        #         },
        #         "splitter_method": "_split_on_multi_column_values",
        #         "splitter_kwargs": {"column_names": ["y", "m", "d"]},
        #     },
        # },
    }
    assert report == expected_report
def test_example_B(test_cases_for_sql_data_connector_sqlite_execution_engine):
    """Self-check report for a table split on a timestamp column (case B).

    Uses ``_split_on_converted_datetime`` so each batch corresponds to one
    calendar day of the ``timestamp`` column.
    """
    random.seed(0)
    db = test_cases_for_sql_data_connector_sqlite_execution_engine

    config = yaml.load(
        """
    name: my_sql_data_connector
    datasource_name: FAKE_Datasource_NAME

    assets:
        table_partitioned_by_timestamp_column__B:
            splitter_method: _split_on_converted_datetime
            splitter_kwargs:
                column_name: timestamp
    """
    )
    config["execution_engine"] = db

    connector = ConfiguredAssetSqlDataConnector(**config)
    report = connector.self_check()
    print(json.dumps(report, indent=2))

    expected_report = {
        "class_name": "ConfiguredAssetSqlDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["table_partitioned_by_timestamp_column__B"],
        "data_assets": {
            "table_partitioned_by_timestamp_column__B": {
                "batch_definition_count": 30,
                "example_data_references": [
                    {"timestamp": "2020-01-01"},
                    {"timestamp": "2020-01-02"},
                    {"timestamp": "2020-01-03"},
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {
        #     "n_rows": 8,
        #     "batch_spec": {
        #         "table_name": "table_partitioned_by_timestamp_column__B",
        #         "data_asset_name": "table_partitioned_by_timestamp_column__B",
        #         "batch_identifiers": {"timestamp": "2020-01-02"},
        #         "splitter_method": "_split_on_converted_datetime",
        #         "splitter_kwargs": {"column_name": "timestamp"},
        #     },
        # },
    }
    assert report == expected_report
def test_example_C(
    splitter_method_name_prefix,
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    """Self-check report for an integer-id table split by divisor (case C).

    Rows are bucketed into batches of 10 ids each via
    ``split_on_divided_integer`` (method name prefixed per fixture).
    """
    random.seed(0)
    db = test_cases_for_sql_data_connector_sqlite_execution_engine

    config = yaml.load(
        f"""
    name: my_sql_data_connector
    datasource_name: FAKE_Datasource_NAME

    assets:
        table_partitioned_by_regularly_spaced_incrementing_id_column__C:
            splitter_method: {splitter_method_name_prefix}split_on_divided_integer
            splitter_kwargs:
                column_name: id
                divisor: 10
    """,
    )
    config["execution_engine"] = db

    connector = ConfiguredAssetSqlDataConnector(**config)
    report = connector.self_check()
    print(json.dumps(report, indent=2))

    expected_report = {
        "class_name": "ConfiguredAssetSqlDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": [
            "table_partitioned_by_regularly_spaced_incrementing_id_column__C"
        ],
        "data_assets": {
            "table_partitioned_by_regularly_spaced_incrementing_id_column__C": {
                "batch_definition_count": 12,
                "example_data_references": [
                    {"id": 0},
                    {"id": 1},
                    {"id": 2},
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {
        #     "n_rows": 10,
        #     "batch_spec": {
        #         "table_name": "table_partitioned_by_regularly_spaced_incrementing_id_column__C",
        #         "data_asset_name": "table_partitioned_by_regularly_spaced_incrementing_id_column__C",
        #         "batch_identifiers": {"id": 1},
        #         "splitter_method": "_split_on_divided_integer",
        #         "splitter_kwargs": {"column_name": "id", "divisor": 10},
        #     },
        # },
    }
    assert report == expected_report
def test_more_complex_instantiation_of_ConfiguredAssetSqlDataConnector_include_schema_name_prefix_suffix(
    splitter_method_name_prefix,
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    """Schema-qualified asset naming with prefix/suffix.

    With ``include_schema_name=True`` the asset is registered as
    ``taxi__main.table_partitioned_by_date_column__A__asset``; supplying a
    ``schema_name`` while ``include_schema_name`` is False must raise a
    ``DataConnectorError``.
    """

    def _asset_config(include_schema_name):
        # Single asset definition; only the include_schema_name flag varies
        # between the two scenarios exercised below.
        return {
            "table_partitioned_by_date_column__A": {
                "splitter_method": f"{splitter_method_name_prefix}split_on_column_value",
                "splitter_kwargs": {"column_name": "date"},
                "include_schema_name": include_schema_name,
                "schema_name": "main",
                "data_asset_name_prefix": "taxi__",
                "data_asset_name_suffix": "__asset",
            },
        }

    connector: ConfiguredAssetSqlDataConnector = ConfiguredAssetSqlDataConnector(
        name="my_sql_data_connector",
        datasource_name="my_test_datasource",
        execution_engine="test_cases_for_sql_data_connector_sqlite_execution_engine",
        assets=_asset_config(include_schema_name=True),
    )
    assert (
        "taxi__main.table_partitioned_by_date_column__A__asset" in connector.assets
    )

    # schema_name provided, but include_schema_name is set to False
    with pytest.raises(ge_exceptions.DataConnectorError) as e:
        ConfiguredAssetSqlDataConnector(
            name="my_sql_data_connector",
            datasource_name="my_test_datasource",
            execution_engine="test_cases_for_sql_data_connector_sqlite_execution_engine",
            assets=_asset_config(include_schema_name=False),
        )
    assert (
        e.value.message
        == "ConfiguredAssetSqlDataConnector ran into an error while initializing Asset names. Schema main was specified, but 'include_schema_name' flag was set to False."
    )
def test_behavior_with_whole_table_splitter(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    """A whole-table splitter yields exactly one batch, however it is queried.

    Omitting ``data_connector_query``, passing an empty one, and passing an
    empty ``batch_filter_parameters`` dict must all behave identically:
    a single batch definition with empty batch identifiers.
    """
    db = test_cases_for_sql_data_connector_sqlite_execution_engine

    config = yaml.load(
        """
    name: my_sql_data_connector
    datasource_name: FAKE_Datasource_NAME

    assets:
        table_partitioned_by_date_column__A:
            splitter_method : "_split_on_whole_table"
            splitter_kwargs : {}
    """,
    )
    config["execution_engine"] = db

    connector = ConfiguredAssetSqlDataConnector(**config)
    report_object = connector.self_check()
    print(json.dumps(report_object, indent=2))

    # Three equivalent ways of asking for "everything".
    query_variants = (
        {},
        {"data_connector_query": {}},
        {"data_connector_query": {"batch_filter_parameters": {}}},
    )
    for extra_kwargs in query_variants:
        batch_definition_list = (
            connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_Datasource_NAME",
                    data_connector_name="my_sql_data_connector",
                    data_asset_name="table_partitioned_by_date_column__A",
                    **extra_kwargs,
                )
            )
        )
        assert len(batch_definition_list) == 1
        assert batch_definition_list[0]["batch_identifiers"] == {}
def test_get_batch_definition_list_from_batch_request(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    """Batch-request filtering against a date-split asset.

    Covers: a fully-specified filter (1 match), an empty filter (all 30),
    no filter at all (all 30), and under-specified BatchRequests (TypeError).
    """
    random.seed(0)
    db = test_cases_for_sql_data_connector_sqlite_execution_engine

    # NOTE(review): sibling tests configure assets under the "assets" key and
    # query via "data_connector_query"/"batch_filter_parameters" (see
    # test_behavior_with_whole_table_splitter); the stale
    # "data_assets"/"partition_request" spellings are updated for consistency.
    config = yaml.load(
        """
    name: my_sql_data_connector
    datasource_name: FAKE_Datasource_NAME

    assets:
        table_partitioned_by_date_column__A:
            splitter_method: _split_on_column_value
            splitter_kwargs:
                column_name: date
    """,
    )
    config["execution_engine"] = db

    my_data_connector = ConfiguredAssetSqlDataConnector(**config)
    my_data_connector._refresh_data_references_cache()

    # Fully-specified filter: matches exactly one date partition.
    batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_Datasource_NAME",
                data_connector_name="my_sql_data_connector",
                data_asset_name="table_partitioned_by_date_column__A",
                data_connector_query={
                    "batch_filter_parameters": {"date": "2020-01-01"}
                },
            )
        )
    )
    assert len(batch_definition_list) == 1

    # Empty filter: matches every partition of the asset.
    batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_Datasource_NAME",
                data_connector_name="my_sql_data_connector",
                data_asset_name="table_partitioned_by_date_column__A",
                data_connector_query={"batch_filter_parameters": {}},
            )
        )
    )
    assert len(batch_definition_list) == 30

    # Note: Abe 20201109: It would be nice to put in safeguards for mistakes like this.
    # In this case, "date" should go inside "batch_filter_parameters".
    # Currently, the method ignores "date" entirely, and matches on too many partitions.
    # I don't think this is unique to ConfiguredAssetSqlDataConnector.
    # with pytest.raises(DataConnectorError) as e:
    #     batch_definition_list = my_data_connector.get_batch_definition_list_from_batch_request(
    #         batch_request=BatchRequest(
    #             datasource_name="FAKE_Datasource_NAME",
    #             data_connector_name="my_sql_data_connector",
    #             data_asset_name="table_partitioned_by_date_column__A",
    #             data_connector_query={
    #                 "batch_filter_parameters": {},
    #                 "date": "2020-01-01",
    #             },
    #         )
    #     )
    # assert "Unmatched key" in e.value.message

    # No query at all: also matches every partition.
    batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_Datasource_NAME",
                data_connector_name="my_sql_data_connector",
                data_asset_name="table_partitioned_by_date_column__A",
            )
        )
    )
    assert len(batch_definition_list) == 30

    # Under-specified BatchRequests must fail construction with TypeError.
    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_Datasource_NAME",
                data_connector_name="my_sql_data_connector",
            )
        )

    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_Datasource_NAME",
            )
        )

    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest()
        )
def test_example_E(test_cases_for_sql_data_connector_sqlite_execution_engine):
    """Self-check report for a table split on an incrementing batch id (case E).

    Verifies asset discovery, batch counts, and example data references for
    ``table_partitioned_by_incrementing_batch_id__E`` split on ``batch_id``.
    """
    random.seed(0)
    db = test_cases_for_sql_data_connector_sqlite_execution_engine

    # NOTE(review): sibling tests in this file configure assets under the
    # "assets" key (see test_basic_self_check); "data_assets" appears to be
    # the stale pre-rename key, so it is updated here for consistency.
    config = yaml.load(
        """
    name: my_sql_data_connector
    datasource_name: FAKE_Datasource_NAME

    assets:
        table_partitioned_by_incrementing_batch_id__E:
            splitter_method: _split_on_column_value
            splitter_kwargs:
                column_name: batch_id
    """,
    )
    config["execution_engine"] = db

    my_data_connector = ConfiguredAssetSqlDataConnector(**config)

    report = my_data_connector.self_check()
    print(json.dumps(report, indent=2))

    assert report == {
        "class_name": "ConfiguredAssetSqlDataConnector",
        "data_asset_count": 1,
        "example_data_asset_names": ["table_partitioned_by_incrementing_batch_id__E"],
        "data_assets": {
            "table_partitioned_by_incrementing_batch_id__E": {
                "batch_definition_count": 11,
                "example_data_references": [
                    {"batch_id": 0},
                    {"batch_id": 1},
                    {"batch_id": 2},
                ],
            }
        },
        "unmatched_data_reference_count": 0,
        "example_unmatched_data_references": [],
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590
        # ("partition_definition" renamed to "batch_identifiers" for parity
        # with the other self-check tests):
        # "example_data_reference": {
        #     "n_rows": 9,
        #     "batch_spec": {
        #         "table_name": "table_partitioned_by_incrementing_batch_id__E",
        #         "data_asset_name": "table_partitioned_by_incrementing_batch_id__E",
        #         "batch_identifiers": {"batch_id": 1},
        #         "splitter_method": "_split_on_column_value",
        #         "splitter_kwargs": {"column_name": "batch_id"},
        #     },
        # },
    }
"class_name": "SqlAlchemyExecutionEngine", "connection_string": connection_string, }, ) # 2. Set sampler in data connector config data_connector_name: str = "test_data_connector" data_asset_name: str = table_name # Read from generated table name column_name: str = taxi_splitting_test_cases.test_column_name data_connector: ConfiguredAssetSqlDataConnector = ( ConfiguredAssetSqlDataConnector( name=data_connector_name, datasource_name=datasource_name, execution_engine=context.datasources[datasource_name]. execution_engine, assets={ data_asset_name: { "sampling_method": test_case.sampling_method_name, "sampling_kwargs": test_case.sampling_kwargs, } }, )) # 3. Check if resulting batches are as expected # using data_connector.get_batch_definition_list_from_batch_request() batch_request: BatchRequest = BatchRequest( datasource_name=datasource_name, data_connector_name=data_connector_name, data_asset_name=data_asset_name, ) batch_definition_list: List[ BatchDefinition] = data_connector.get_batch_definition_list_from_batch_request(