def test_get_compute_domain_with_unmeetable_row_condition(spark_session):
    pd_df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})
    df = spark_session.createDataFrame(
        [
            tuple(
                None if isinstance(x, (float, int)) and np.isnan(x) else x
                for x in record.tolist()
            )
            for record in pd_df.to_records(index=False)
        ],
        pd_df.columns.tolist(),
    )
    expected_df = df.filter(F.col("b") > 24)

    engine = SparkDFExecutionEngine()
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"row_condition": "b > 24", "condition_parser": "spark"},
        domain_type=MetricDomainTypes.TABLE,
    )

    # Ensuring data has been properly queried
    assert data.schema == expected_df.schema
    assert data.collect() == expected_df.collect()

    # Ensuring compute kwargs have not been modified
    assert "row_condition" in compute_kwargs.keys()
    assert accessor_kwargs == {}

def test_split_on_multi_column_values_and_sample_using_random(test_sparkdf):
    returned_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            splitter_method="_split_on_multi_column_values",
            splitter_kwargs={
                "column_names": ["y", "m", "d"],
                "partition_definition": {
                    "y": 2020,
                    "m": 1,
                    "d": 5,
                },
            },
            sampling_method="_sample_using_random",
            sampling_kwargs={
                "p": 0.5,
            },
        )
    )

    # The test dataframe contains 10 columns and 120 rows.
    assert len(returned_df.columns) == 10
    # The number of rows matching the value of "partition_definition" above is 4.
    assert 0 <= returned_df.count() <= 4
    # The sampling probability "p" used in "SparkDFExecutionEngine._sample_using_random()" is 0.5 (the equivalent of a
    # fair coin with a 50% chance of coming up "heads").  Hence, on average we should get 50% of the rows, which is 2;
    # however, for such a small sample (of 4 rows), the number of rows returned by an individual run can deviate from
    # this average.  Still, in the majority of trials, the number of rows should not be fewer than 2 or greater than 3.
    # The assertion in the next line, supporting this reasoning, is commented out to ensure zero failures.  Developers
    # are encouraged to uncomment it whenever the "_sample_using_random" feature is the main focus of a given effort.
    # assert 2 <= returned_df.count() <= 3
    for val in returned_df.collect():
        assert val.date == datetime.date(2020, 1, 5)

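# The "test_sparkdf" fixture used throughout these tests is not shown in this section.  From the assertions it must
# provide 120 rows and 10 columns, including at least "id" (0-119, per the divided-integer split), "date",
# "timestamp", the date parts "y"/"m"/"d", and "favorite_color".  The sketch below is a hypothetical stand-in under
# those assumptions; column names not referenced by the tests and the random seed are made up, so exact per-partition
# counts (e.g. the 8 rows asserted for the hashed-column split) are not guaranteed to match the real fixture.
import random


@pytest.fixture
def test_sparkdf(spark_session):
    k: int = 120
    random.seed(1)
    start: datetime.datetime = datetime.datetime(2020, 1, 1)
    # Spread the timestamps over a 30-day window so that several rows share each calendar day.
    timestamps = sorted(
        start + datetime.timedelta(seconds=random.randrange(30 * 24 * 3600))
        for _ in range(k)
    )
    dates = [ts.date() for ts in timestamps]
    pd_df = pd.DataFrame(
        {
            "id": range(k),
            "date": dates,
            "y": [d.year for d in dates],
            "m": [d.month for d in dates],
            "d": [d.day for d in dates],
            "timestamp": timestamps,
            "session_id": [random.randint(2, 60) for _ in range(k)],
            "event_type": [random.choice(["start", "stop", "continue"]) for _ in range(k)],
            "favorite_color": [
                "#" + "".join(random.choice("0123456789ABCDEF") for _ in range(6))
                for _ in range(k)
            ],
            "batch_id": [random.randint(0, 10) for _ in range(k)],
        }
    )
    return spark_session.createDataFrame(pd_df)
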
def test_get_batch_with_split_on_whole_table_s3(spark_session):
    def mocked_get_reader_function(*args, **kwargs):
        def mocked_reader_function(*args, **kwargs):
            pd_df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})
            df = spark_session.createDataFrame(
                [
                    tuple(
                        None if isinstance(x, (float, int)) and np.isnan(x) else x
                        for x in record.tolist()
                    )
                    for record in pd_df.to_records(index=False)
                ],
                pd_df.columns.tolist(),
            )
            return df

        return mocked_reader_function

    spark_engine = SparkDFExecutionEngine()
    spark_engine._get_reader_fn = mocked_get_reader_function

    test_sparkdf = spark_engine.get_batch_data(
        S3BatchSpec(
            s3="s3://bucket/test/test.csv",
            reader_method="csv",
            reader_options={"header": True},
            splitter_method="_split_on_whole_table",
        )
    )
    assert test_sparkdf.count() == 4
    assert len(test_sparkdf.columns) == 2

def test_get_batch_with_split_on_whole_table(test_sparkdf):
    test_sparkdf = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf, splitter_method="_split_on_whole_table"
        )
    )
    assert test_sparkdf.count() == 120
    assert len(test_sparkdf.columns) == 10

def test_reader_fn(spark_session):
    engine = SparkDFExecutionEngine()

    # Testing that the engine can recognize a basic csv file from its path
    fn = engine._get_reader_fn(reader=spark_session.read, path="myfile.csv")
    assert "<bound method DataFrameReader.csv" in str(fn)

    # Ensuring that the other way around works as well - an explicit reader_method is honored even without a path
    fn_new = engine._get_reader_fn(reader=spark_session.read, reader_method="csv")
    assert "<bound method DataFrameReader.csv" in str(fn_new)

def test_get_batch_with_split_on_whole_table_filesystem(
    test_folder_connection_path_csv,
):
    # reader_method is not configured, because Spark will configure its own reader by default
    test_sparkdf = SparkDFExecutionEngine().get_batch_data(
        PathBatchSpec(
            path=os.path.join(test_folder_connection_path_csv, "test.csv"),
            splitter_method="_split_on_whole_table",
        )
    )
    assert test_sparkdf.count() == 6
    assert len(test_sparkdf.columns) == 2

def test_sample_using_random(test_sparkdf):
    sampled_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf, sampling_method="_sample_using_random"
        )
    )
    # The test dataframe contains 10 columns and 120 rows.
    assert len(sampled_df.columns) == 10
    assert 0 <= sampled_df.count() <= 120
    # The sampling probability "p" used in "SparkDFExecutionEngine._sample_using_random()" is 0.1 (the equivalent of an
    # unfair coin with a 10% chance of coming up "heads").  Hence, we should practically never get as many as 25 rows
    # (just over 20% of the 120): the expected number of sampled rows is 12 with a standard deviation of roughly 3.3,
    # so 25 rows lies about four standard deviations above the mean.
    assert sampled_df.count() < 25

def test_get_batch_empty_splitter_parquet(test_folder_connection_path_parquet):
    # Note: reader_method and reader_options are not needed, because
    # SparkDFExecutionEngine automatically determines the file type as well as the schema of the Parquet file.
    test_sparkdf = SparkDFExecutionEngine().get_batch_data(
        PathBatchSpec(
            path=os.path.join(test_folder_connection_path_parquet, "test.parquet"),
            splitter_method=None,
        )
    )
    assert test_sparkdf.count() == 5
    assert len(test_sparkdf.columns) == 2

def test_get_batch_empty_splitter(test_folder_connection_path_csv):
    # reader_method is not configured, because Spark will configure its own reader by default
    # reader_options are needed to specify the fact that the first line of the test file is the header
    test_sparkdf = SparkDFExecutionEngine().get_batch_data(
        PathBatchSpec(
            path=os.path.join(test_folder_connection_path_csv, "test.csv"),
            reader_options={"header": True},
            splitter_method=None,
        )
    )
    assert test_sparkdf.count() == 5
    assert len(test_sparkdf.columns) == 2

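# The "test_folder_connection_path_csv" fixture is not shown in this section.  From the two tests that use it (this
# one and "test_get_batch_with_split_on_whole_table_filesystem" above), it is assumed to be a temporary directory
# containing a "test.csv" file with one header line and 5 data rows in 2 columns: without reader_options={"header":
# True} Spark reads 6 rows, with it 5.  A minimal sketch under those assumptions (column names and values are
# hypothetical):
@pytest.fixture
def test_folder_connection_path_csv(tmp_path_factory):
    pd_df = pd.DataFrame({"col_1": [1, 2, 3, 4, 5], "col_2": ["a", "b", "c", "d", "e"]})
    path = str(tmp_path_factory.mktemp("test_folder_connection_path_csv"))
    pd_df.to_csv(path_or_buf=os.path.join(path, "test.csv"), index=False)
    return path
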
def test_dataframe_property_given_loaded_batch(spark_session):
    engine = SparkDFExecutionEngine()
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    df = spark_session.createDataFrame(df)

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    # Ensuring that the loaded data has not been distorted
    assert engine.dataframe == df

def test_sample_using_a_list(test_sparkdf):
    sampled_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            sampling_method="_sample_using_a_list",
            sampling_kwargs={
                "column_name": "id",
                "value_list": [3, 5, 7, 11],
            },
        )
    )
    assert sampled_df.count() == 4
    assert len(sampled_df.columns) == 10

def test_get_batch_empty_splitter_tsv(test_folder_connection_path_tsv):
    # reader_method is not configured, because Spark will configure its own reader by default
    # reader_options are needed to specify the fact that the first line of the test file is the header
    # reader_options are also needed to specify the separator (otherwise, a comma would be used as the default)
    test_sparkdf = SparkDFExecutionEngine().get_batch_data(
        PathBatchSpec(
            path=os.path.join(test_folder_connection_path_tsv, "test.tsv"),
            reader_options={"header": True, "sep": "\t"},
            splitter_method=None,
        )
    )
    assert test_sparkdf.count() == 5
    assert len(test_sparkdf.columns) == 2

def test_sample_using_mod(test_sparkdf):
    sampled_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            sampling_method="_sample_using_mod",
            sampling_kwargs={
                "column_name": "id",
                "mod": 5,
                "value": 4,
            },
        )
    )
    assert sampled_df.count() == 24
    assert len(sampled_df.columns) == 10

def test_get_batch_with_split_on_converted_datetime(test_sparkdf):
    split_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            splitter_method="_split_on_converted_datetime",
            splitter_kwargs={
                "column_name": "timestamp",
                "partition_definition": {"timestamp": "2020-01-03"},
            },
        )
    )
    assert split_df.count() == 2
    assert len(split_df.columns) == 10

def test_dataframe_property_given_loaded_batch():
    from pyspark.sql import SparkSession

    engine = SparkDFExecutionEngine()
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(df)

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    # Ensuring that the loaded data has not been distorted
    assert engine.dataframe == df

def _build_spark_engine(df, spark_session):
    df = spark_session.createDataFrame(
        [
            tuple(
                None if isinstance(x, (float, int)) and np.isnan(x) else x
                for x in record.tolist()
            )
            for record in df.to_records(index=False)
        ],
        df.columns.tolist(),
    )
    engine = SparkDFExecutionEngine()
    engine.load_batch_data("my_id", SparkDFBatchData(engine, df))
    return engine

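# A hypothetical illustration of how the helper above might be used in a test.  The dataframe contents and the
# assertions are made up for this sketch; it relies on the loaded batch becoming the engine's active batch, as in
# the other compute-domain tests in this module.
def test_build_spark_engine_usage_example(spark_session):
    engine = _build_spark_engine(
        pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]}), spark_session
    )
    data, _, _ = engine.get_compute_domain(
        domain_kwargs={}, domain_type=MetricDomainTypes.TABLE
    )
    # The whole table comes back unfiltered when no row_condition is supplied.
    assert data.count() == 4
    assert len(data.columns) == 2
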
def test_get_batch_with_split_on_hashed_column(test_sparkdf):
    split_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            splitter_method="_split_on_hashed_column",
            splitter_kwargs={
                "column_name": "favorite_color",
                "hash_digits": 1,
                "hash_function_name": "sha256",
                "partition_definition": {
                    "hash_value": "a",
                },
            },
        )
    )
    assert split_df.count() == 8
    assert len(split_df.columns) == 10

def _spark(
    cls,
    execution_engine: SparkDFExecutionEngine,
    metric_domain_kwargs: dict,
    metric_value_kwargs: dict,
    metrics: Dict[str, Any],
    runtime_configuration: dict,
) -> List[pyspark_sql_Row]:
    query: Optional[str] = metric_value_kwargs.get(
        "query"
    ) or cls.default_kwarg_values.get("query")

    df: pyspark_sql_DataFrame
    df, _, _ = execution_engine.get_compute_domain(
        metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE
    )
    df.createOrReplaceTempView("tmp_view")

    column: str = metric_value_kwargs.get("column")
    query = query.format(col=column, active_batch="tmp_view")

    engine: pyspark_sql_SparkSession = execution_engine.spark
    result: List[pyspark_sql_Row] = engine.sql(query).collect()

    return result

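# Illustrative (hypothetical) value for the "query" metric_value_kwarg consumed by the method above.  The only
# structural requirement implied by the code is that the template expose "{col}" and "{active_batch}" placeholders;
# they are filled with the configured column name and the temporary view registered via createOrReplaceTempView,
# respectively.
EXAMPLE_QUERY_TEMPLATE: str = (
    "SELECT {col}, COUNT(*) AS occurrence_count "
    "FROM {active_batch} "
    "GROUP BY {col} "
    "HAVING COUNT(*) > 1"
)
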
def test_get_aggregate_count_aware_metric_dependencies(spark_session):
    mp = ColumnValuesNonNull()
    metric = MetricConfiguration(
        "column_values.nonnull.unexpected_count", dict(), dict()
    )
    dependencies = mp.get_evaluation_dependencies(
        metric, execution_engine=PandasExecutionEngine()
    )
    assert (
        dependencies["unexpected_condition"].id[0] == "column_values.nonnull.condition"
    )

    metric = MetricConfiguration(
        "column_values.nonnull.unexpected_count", dict(), dict()
    )
    dependencies = mp.get_evaluation_dependencies(
        metric, execution_engine=SparkDFExecutionEngine()
    )
    assert (
        dependencies["metric_partial_fn"].id[0]
        == "column_values.nonnull.unexpected_count.aggregate_fn"
    )

    metric = MetricConfiguration(
        "column_values.nonnull.unexpected_count.aggregate_fn", dict(), dict()
    )
    dependencies = mp.get_evaluation_dependencies(metric)
    assert (
        dependencies["unexpected_condition"].id[0] == "column_values.nonnull.condition"
    )

def test_for_self_check_using_InferredAssetFilesystemDataConnector_SparkDFExecutionEngine(
    spark_session, tmp_path_factory
):
    base_directory = str(
        tmp_path_factory.mktemp(
            "basic_data_connector_inferred_asset_filesystem_data_connector"
        )
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20201010_1000.csv",
            "abe_202011111_2000.csv",
            "will_20201212_3000.csv",
        ],
    )
    my_data_connector = InferredAssetFilesystemDataConnector(
        name="my_data_connector",
        base_directory=base_directory,
        glob_directive="*.csv",
        datasource_name="FAKE_DATASOURCE",
        execution_engine=SparkDFExecutionEngine(),
        default_regex={
            "pattern": "(.+)_(\\d+)_(\\d+)\\.csv",
            "group_names": ["data_asset_name", "timestamp", "size"],
        },
    )
    self_check_results = my_data_connector.self_check()
    assert self_check_results["data_asset_count"] == 3
    assert self_check_results["example_data_reference"]["n_rows"] == 3

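# "create_files_in_directory" is a shared test helper that is not defined in this section.  From its use above it is
# assumed to write each listed file name into the target directory with a small default CSV payload of three data
# rows (consistent with the "n_rows" == 3 assertion).  A minimal sketch under those assumptions; the default content
# is hypothetical:
def create_files_in_directory(
    directory, file_name_list, file_content_fn=lambda: "x,y\n1,2\n2,3\n3,5\n"
):
    for file_name in file_name_list:
        with open(os.path.join(directory, file_name), "w") as file_handle:
            file_handle.write(file_content_fn())
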
def test_sample_using_md5(test_sparkdf):
    sampled_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            sampling_method="_sample_using_hash",
            sampling_kwargs={
                "column_name": "date",
                "hash_function_name": "md5",
            },
        )
    )
    assert sampled_df.count() == 10
    assert len(sampled_df.columns) == 10

    collected = sampled_df.collect()
    for val in collected:
        assert val.date in [datetime.date(2020, 1, 15), datetime.date(2020, 1, 29)]

def test_get_batch_with_split_on_divided_integer(test_sparkdf):
    split_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            splitter_method="_split_on_divided_integer",
            splitter_kwargs={
                "column_name": "id",
                "divisor": 10,
                "partition_definition": {"id": 5},
            },
        )
    )
    assert split_df.count() == 10
    assert len(split_df.columns) == 10
    max_result = split_df.select([F.max("id")])
    assert max_result.collect()[0]["max(id)"] == 59
    min_result = split_df.select([F.min("id")])
    assert min_result.collect()[0]["min(id)"] == 50

def test_get_compute_domain_with_column_domain(spark_session):
    pd_df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})
    df = spark_session.createDataFrame(
        [
            tuple(
                None if isinstance(x, (float, int)) and np.isnan(x) else x
                for x in record.tolist()
            )
            for record in pd_df.to_records(index=False)
        ],
        pd_df.columns.tolist(),
    )
    engine = SparkDFExecutionEngine()
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={"column": "a"}, domain_type=MetricDomainTypes.COLUMN
    )
    assert compute_kwargs is not None, "Compute domain kwargs should exist"
    assert accessor_kwargs == {"column": "a"}
    assert data.schema == df.schema
    assert data.collect() == df.collect()

def test_get_batch_with_split_on_multi_column_values(test_sparkdf):
    split_df = (
        SparkDFExecutionEngine()
        .get_batch_data(
            RuntimeDataBatchSpec(
                batch_data=test_sparkdf,
                splitter_method="_split_on_multi_column_values",
                splitter_kwargs={
                    "column_names": ["y", "m", "d"],
                    "partition_definition": {
                        "y": 2020,
                        "m": 1,
                        "d": 5,
                    },
                },
            )
        )
        .dataframe
    )
    assert split_df.count() == 4
    assert len(split_df.columns) == 10
    collected = split_df.collect()
    for val in collected:
        assert val.date == datetime.date(2020, 1, 5)

    with pytest.raises(ValueError):
        split_df = (
            SparkDFExecutionEngine()
            .get_batch_data(
                RuntimeDataBatchSpec(
                    batch_data=test_sparkdf,
                    splitter_method="_split_on_multi_column_values",
                    splitter_kwargs={
                        "column_names": ["I", "dont", "exist"],
                        "partition_definition": {
                            "y": 2020,
                            "m": 1,
                            "d": 5,
                        },
                    },
                )
            )
            .dataframe
        )

def _spark(
    cls,
    execution_engine: SparkDFExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    min_value = metric_value_kwargs.get("min_value")
    max_value = metric_value_kwargs.get("max_value")
    strict_min = metric_value_kwargs.get("strict_min")
    strict_max = metric_value_kwargs.get("strict_max")

    # Validate the bounds once, up front.
    if min_value is not None and max_value is not None and min_value > max_value:
        raise ValueError("min_value cannot be greater than max_value")
    if min_value is None and max_value is None:
        raise ValueError("min_value and max_value cannot both be None")

    (
        df,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        domain_kwargs=metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN
    )
    column = df[accessor_domain_kwargs["column"]]

    # Build the filter condition, honoring whichever bounds are supplied and whether they are strict.
    if min_value is None:
        if strict_max:
            condition = column < max_value
        else:
            condition = column <= max_value
    elif max_value is None:
        if strict_min:
            condition = column > min_value
        else:
            condition = column >= min_value
    else:
        if strict_min and strict_max:
            condition = (column > min_value) & (column < max_value)
        elif strict_min:
            condition = (column > min_value) & (column <= max_value)
        elif strict_max:
            condition = (column >= min_value) & (column < max_value)
        else:
            condition = (column >= min_value) & (column <= max_value)

    return df.filter(condition).count()

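# A quick illustration (not part of the metric implementation) of how the strict_min/strict_max flags map onto the
# filter built above, for a column containing [1, 2, 3, 4, 5] with min_value=2 and max_value=4:
#   strict_min=False, strict_max=False  ->  2 <= x <= 4  ->  count == 3  (values 2, 3, 4)
#   strict_min=True,  strict_max=False  ->  2 <  x <= 4  ->  count == 2  (values 3, 4)
#   strict_min=False, strict_max=True   ->  2 <= x <  4  ->  count == 2  (values 2, 3)
#   strict_min=True,  strict_max=True   ->  2 <  x <  4  ->  count == 1  (value 3)
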
def test_get_batch_empty_sampler(test_sparkdf):
    sampled_df = (
        SparkDFExecutionEngine()
        .get_batch_data(
            RuntimeDataBatchSpec(batch_data=test_sparkdf, sampling_method=None)
        )
        .dataframe
    )
    assert sampled_df.count() == 120
    assert len(sampled_df.columns) == 10

def test_get_batch_data(test_sparkdf):
    test_sparkdf = (
        SparkDFExecutionEngine()
        .get_batch_data(
            RuntimeDataBatchSpec(batch_data=test_sparkdf, data_asset_name="DATA_ASSET")
        )
        .dataframe
    )
    assert test_sparkdf.count() == 120
    assert len(test_sparkdf.columns) == 10

def _build_spark_engine(spark_session, df):
    df = spark_session.createDataFrame(
        [
            tuple(
                None if isinstance(x, (float, int)) and np.isnan(x) else x
                for x in record.tolist()
            )
            for record in df.to_records(index=False)
        ],
        df.columns.tolist(),
    )
    engine = SparkDFExecutionEngine(batch_data_dict={"temp_id": df})
    return engine

def test_sample_using_md5_wrong_hash_function_name(test_sparkdf):
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        sampled_df = SparkDFExecutionEngine().get_batch_data(
            RuntimeDataBatchSpec(
                batch_data=test_sparkdf,
                sampling_method="_sample_using_hash",
                sampling_kwargs={
                    "column_name": "date",
                    "hash_function_name": "I_wont_work",
                },
            )
        )

def _spark(
    cls,
    execution_engine: SparkDFExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    df, _, _ = execution_engine.get_compute_domain(
        metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE
    )
    return _get_spark_column_metadata(
        df.schema, include_nested=metric_value_kwargs["include_nested"]
    )