def test__get_data_reference_name(basic_datasource):
    partition_request: dict = {
        "batch_identifiers": {
            "airflow_run_id": 1234567890,
        }
    }
    partition_definition = PartitionDefinition(partition_request["batch_identifiers"])
    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )
    assert (
        test_runtime_data_connector._get_data_reference_name(partition_definition)
        == "1234567890"
    )

    partition_request: dict = {
        "batch_identifiers": {
            "run_id_1": 1234567890,
            "run_id_2": 1111111111,
        }
    }
    partition_definition = PartitionDefinition(partition_request["batch_identifiers"])
    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )
    assert (
        test_runtime_data_connector._get_data_reference_name(partition_definition)
        == "1234567890-1111111111"
    )
def test_sorter_instantiation_custom_list_with_periodic_table(
    periodic_table_of_elements,
):
    # CustomListSorter
    sorter_params: dict = {
        "reference_list": periodic_table_of_elements,
    }
    my_custom_sorter = CustomListSorter(name="element", orderby="asc", **sorter_params)
    # noinspection PyProtectedMember
    assert my_custom_sorter._reference_list == periodic_table_of_elements

    # This element exists: Hydrogen
    test_batch_def = BatchDefinition(
        datasource_name="test",
        data_connector_name="fake",
        data_asset_name="nowhere",
        partition_definition=PartitionDefinition({"element": "Hydrogen"}),
    )
    returned_partition_key = my_custom_sorter.get_partition_key(test_batch_def)
    assert returned_partition_key == 0

    # This element does not exist: Vibranium
    test_batch_def = BatchDefinition(
        datasource_name="test",
        data_connector_name="fake",
        data_asset_name="nowhere",
        partition_definition=PartitionDefinition({"element": "Vibranium"}),
    )
    with pytest.raises(ge_exceptions.SorterError):
        my_custom_sorter.get_partition_key(test_batch_def)
def test__build_batch_spec(basic_datasource):
    partition_request: dict = {
        "batch_identifiers": {
            "custom_key_0": "staging",
            "airflow_run_id": 1234567890,
        }
    }
    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )
    # noinspection PyProtectedMember
    batch_spec: BatchSpec = test_runtime_data_connector.build_batch_spec(
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="test_runtime_data_connector",
            data_asset_name="my_data_asset",
            partition_definition=PartitionDefinition(
                partition_request["batch_identifiers"]
            ),
        ),
        batch_data=pd.DataFrame({"x": range(10)}),
    )
    assert type(batch_spec) == RuntimeDataBatchSpec
    assert set(batch_spec.keys()) == {"batch_data"}
    assert batch_spec["batch_data"].shape == (10, 1)
def test__generate_batch_spec_parameters_from_batch_definition(
    basic_datasource,
):
    partition_request: dict = {
        "batch_identifiers": {
            "custom_key_0": "staging",
            "airflow_run_id": 1234567890,
        }
    }
    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )
    expected_batch_spec_parameters: dict = {}
    # noinspection PyProtectedMember
    batch_spec_parameters: dict = test_runtime_data_connector._generate_batch_spec_parameters_from_batch_definition(
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="test_runtime_data_connector",
            data_asset_name="my_data_asset",
            partition_definition=PartitionDefinition(
                partition_request["batch_identifiers"]
            ),
        )
    )
    assert batch_spec_parameters == expected_batch_spec_parameters
def map_data_reference_string_to_batch_definition_list_using_regex(
    datasource_name: str,
    data_connector_name: str,
    data_reference: str,
    regex_pattern: str,
    group_names: List[str],
    data_asset_name: Optional[str] = None,
) -> Optional[List[BatchDefinition]]:
    processed_data_reference: Optional[
        Tuple[str, PartitionDefinitionSubset]
    ] = convert_data_reference_string_to_partition_definition_using_regex(
        data_reference=data_reference,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    if processed_data_reference is None:
        return None
    data_asset_name_from_partition_definition: str = processed_data_reference[0]
    partition_definition: PartitionDefinitionSubset = processed_data_reference[1]
    if data_asset_name is None:
        data_asset_name = data_asset_name_from_partition_definition
    return [
        BatchDefinition(
            datasource_name=datasource_name,
            data_connector_name=data_connector_name,
            data_asset_name=data_asset_name,
            partition_definition=PartitionDefinition(partition_definition),
        )
    ]
def get_batch_definition_list_from_batch_request(
    self,
    batch_request: BatchRequest,
) -> List[BatchDefinition]:
    self._validate_batch_request(batch_request=batch_request)

    partition_identifiers: Optional[dict] = None
    if batch_request.partition_request:
        self._validate_partition_identifiers(
            partition_identifiers=batch_request.partition_request.get(
                "partition_identifiers"
            )
        )
        partition_identifiers = batch_request.partition_request.get(
            "partition_identifiers"
        )
    if not partition_identifiers:
        partition_identifiers = {}

    batch_definition_list: List[BatchDefinition]
    batch_definition: BatchDefinition = BatchDefinition(
        datasource_name=self.datasource_name,
        data_connector_name=self.name,
        data_asset_name=DEFAULT_DATA_ASSET_NAME,
        partition_definition=PartitionDefinition(partition_identifiers),
    )
    if batch_definition_matches_batch_request(
        batch_definition=batch_definition, batch_request=batch_request
    ):
        batch_definition_list = [batch_definition]
    else:
        batch_definition_list = []
    return batch_definition_list
def map_data_reference_string_to_batch_definition_list_using_regex(
    datasource_name: str,
    data_connector_name: str,
    data_asset_name: Optional[str],
    data_reference: str,
    regex_pattern: str,
    group_names: List[str],
) -> Optional[List[BatchDefinition]]:
    batch_request: BatchRequest = (
        convert_data_reference_string_to_batch_request_using_regex(
            data_reference=data_reference,
            regex_pattern=regex_pattern,
            group_names=group_names,
        )
    )
    if batch_request is None:
        return None
    if data_asset_name is None:
        data_asset_name = batch_request.data_asset_name
    return [
        BatchDefinition(
            datasource_name=datasource_name,
            data_connector_name=data_connector_name,
            data_asset_name=data_asset_name,
            partition_definition=PartitionDefinition(batch_request.partition_request),
        )
    ]
def _get_batch_definition_list_from_batch_request(
    self,
    batch_request: BatchRequest,
) -> List[BatchDefinition]:
    """
    <Will> 202103. The following behavior of the _data_references_cache follows a pattern that we are using for
    other data_connectors, including variations of FilePathDataConnector. When a BatchRequest contains batch_data
    that is passed in as an in-memory dataframe, the cache will contain the names of all data_assets
    (and data_references) that have been passed into the RuntimeDataConnector in this session, even though
    technically only the most recent batch_data is available. This can be misleading. However, allowing the
    RuntimeDataConnector to keep a record of all data_assets (and data_references) that have been passed in will
    allow for the proposed behavior of RuntimeBatchRequest, which will allow paths and queries to be passed in as
    part of the BatchRequest. Therefore, this behavior will be revisited when the design of RuntimeBatchRequest
    and related classes is complete.
    """
    self._validate_batch_request(batch_request=batch_request)

    batch_identifiers = batch_request.partition_request.get("batch_identifiers")
    self._validate_batch_identifiers(batch_identifiers=batch_identifiers)

    batch_definition_list: List[BatchDefinition]
    batch_definition: BatchDefinition = BatchDefinition(
        datasource_name=self.datasource_name,
        data_connector_name=self.name,
        data_asset_name=batch_request.data_asset_name,
        partition_definition=PartitionDefinition(batch_identifiers),
    )
    batch_definition_list = [batch_definition]
    self._update_data_references_cache(
        batch_request.data_asset_name, batch_definition_list, batch_identifiers
    )
    return batch_definition_list
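# A minimal, self-contained sketch (not Great Expectations code) of the caching pattern the
# docstring above describes: each data_asset_name maps to a dict keyed by a data reference
# derived from the batch identifiers, and repeated requests accumulate entries rather than
# replace them. The helper names below (_toy_data_reference_name, _toy_update, _toy_cache)
# are hypothetical and exist only for illustration; the real behavior is exercised by the
# cache-updating test later in this file.
from typing import Any, Dict, List


def _toy_data_reference_name(batch_identifiers: Dict[str, Any]) -> str:
    # Mirrors the "join identifier values with '-'" convention asserted in
    # test__get_data_reference_name above.
    return "-".join(str(value) for value in batch_identifiers.values())


def _toy_update(
    cache: Dict[str, Dict[str, List[Any]]],
    data_asset_name: str,
    batch_identifiers: Dict[str, Any],
    batch_definition: Any,
) -> None:
    # Accumulate under data_asset_name -> data_reference, never overwrite prior entries.
    data_reference = _toy_data_reference_name(batch_identifiers)
    cache.setdefault(data_asset_name, {}).setdefault(data_reference, []).append(
        batch_definition
    )


_toy_cache: Dict[str, Dict[str, List[Any]]] = {}
_toy_update(_toy_cache, "my_data_asset_1", {"airflow_run_id": 1234567890}, "bd_1")
_toy_update(_toy_cache, "my_data_asset_1", {"airflow_run_id": 987654321}, "bd_2")
assert list(_toy_cache["my_data_asset_1"].keys()) == ["1234567890", "987654321"]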
def test_get_batch_with_split_on_whole_table_s3_with_configured_asset_s3_data_connector(
    test_s3_files, test_df_small
):
    bucket, _keys = test_s3_files
    expected_df = test_df_small

    my_data_connector = ConfiguredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
        bucket=bucket,
        prefix="",
        assets={"alpha": {}},
    )
    batch_def = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_asset_name="alpha",
        data_connector_name="my_data_connector",
        partition_definition=PartitionDefinition(index=1),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    test_df = PandasExecutionEngine().get_batch_data(
        batch_spec=my_data_connector.build_batch_spec(batch_definition=batch_def)
    )
    assert test_df.dataframe.shape == expected_df.shape

    # if key does not exist
    batch_def_no_key = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_asset_name="alpha",
        data_connector_name="my_data_connector",
        partition_definition=PartitionDefinition(index=9),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    with pytest.raises(ClientError):
        PandasExecutionEngine().get_batch_data(
            batch_spec=my_data_connector.build_batch_spec(
                batch_definition=batch_def_no_key
            )
        )
def _map_data_reference_to_batch_definition_list(
    self, data_reference: str, data_asset_name: Optional[str] = None
) -> Optional[List[BatchDefinition]]:
    if data_asset_name is None:
        data_asset_name = DEFAULT_DATA_ASSET_NAME
    return [
        BatchDefinition(
            datasource_name=self.datasource_name,
            data_connector_name=self.name,
            data_asset_name=data_asset_name,
            partition_definition=PartitionDefinition(),
        )
    ]
def test_get_batch_definition_list_from_batch_request_length_one(
    basic_datasource,
):
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    partition_request: dict = {
        "batch_identifiers": {
            "airflow_run_id": 1234567890,
        }
    }
    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )
    batch_request: dict = {
        "datasource_name": basic_datasource.name,
        "data_connector_name": test_runtime_data_connector.name,
        "data_asset_name": "IN_MEMORY_DATA_ASSET",
        "batch_data": test_df,
        "partition_request": partition_request,
        "limit": None,
    }
    batch_request: BatchRequest = BatchRequest(**batch_request)

    expected_batch_definition_list: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="test_runtime_data_connector",
            data_asset_name="IN_MEMORY_DATA_ASSET",
            partition_definition=PartitionDefinition(
                partition_request["batch_identifiers"]
            ),
        )
    ]

    batch_definition_list: List[
        BatchDefinition
    ] = test_runtime_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=batch_request
    )

    assert batch_definition_list == expected_batch_definition_list
def test_data_references_cache_updating_after_batch_request(
    basic_datasource,
):
    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    # empty if data_connector has not been used
    assert test_runtime_data_connector.get_available_data_asset_names() == []

    partition_request: dict = {
        "batch_identifiers": {
            "airflow_run_id": 1234567890,
        }
    }
    batch_request: dict = {
        "datasource_name": basic_datasource.name,
        "data_connector_name": test_runtime_data_connector.name,
        "data_asset_name": "my_data_asset_1",
        "batch_data": test_df,
        "partition_request": partition_request,
        "limit": None,
    }
    batch_request: BatchRequest = BatchRequest(**batch_request)

    # run with my_data_asset_1
    test_runtime_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=batch_request
    )

    assert test_runtime_data_connector._data_references_cache == {
        "my_data_asset_1": {
            "1234567890": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset_1",
                    partition_definition=PartitionDefinition(
                        {"airflow_run_id": 1234567890}
                    ),
                )
            ],
        }
    }

    # update with a new batch for the same data_asset
    test_df_new: pd.DataFrame = pd.DataFrame(data={"col1": [5, 6], "col2": [7, 8]})
    partition_request: dict = {
        "batch_identifiers": {
            "airflow_run_id": 987654321,
        }
    }
    batch_request: dict = {
        "datasource_name": basic_datasource.name,
        "data_connector_name": test_runtime_data_connector.name,
        "data_asset_name": "my_data_asset_1",
        "batch_data": test_df_new,
        "partition_request": partition_request,
        "limit": None,
    }
    batch_request: BatchRequest = BatchRequest(**batch_request)

    # run with the same data_asset but a new batch
    test_runtime_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=batch_request
    )

    assert test_runtime_data_connector._data_references_cache == {
        "my_data_asset_1": {
            "1234567890": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset_1",
                    partition_definition=PartitionDefinition(
                        {"airflow_run_id": 1234567890}
                    ),
                )
            ],
            "987654321": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset_1",
                    partition_definition=PartitionDefinition(
                        {"airflow_run_id": 987654321}
                    ),
                )
            ],
        },
    }

    # new data_asset_name
    test_df_new_asset: pd.DataFrame = pd.DataFrame(
        data={"col1": [9, 10], "col2": [11, 12]}
    )
    partition_request: dict = {
        "batch_identifiers": {
            "airflow_run_id": 5555555,
        }
    }
    batch_request: dict = {
        "datasource_name": basic_datasource.name,
        "data_connector_name": test_runtime_data_connector.name,
        "data_asset_name": "my_data_asset_2",
        "batch_data": test_df_new_asset,
        "partition_request": partition_request,
        "limit": None,
    }
    batch_request: BatchRequest = BatchRequest(**batch_request)

    # run with a new data_asset_name
    test_runtime_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=batch_request
    )

    assert test_runtime_data_connector._data_references_cache == {
        "my_data_asset_1": {
            "1234567890": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset_1",
                    partition_definition=PartitionDefinition(
                        {"airflow_run_id": 1234567890}
                    ),
                )
            ],
            "987654321": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset_1",
                    partition_definition=PartitionDefinition(
                        {"airflow_run_id": 987654321}
                    ),
                )
            ],
        },
        "my_data_asset_2": {
            "5555555": [
                BatchDefinition(
                    datasource_name="my_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset_2",
                    partition_definition=PartitionDefinition(
                        {"airflow_run_id": 5555555}
                    ),
                )
            ]
        },
    }

    assert test_runtime_data_connector.get_available_data_asset_names() == [
        "my_data_asset_1",
        "my_data_asset_2",
    ]
    assert test_runtime_data_connector.get_data_reference_list_count() == 3