def test_split_on_multi_column_values_and_sample_using_random(test_sparkdf):
    returned_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            splitter_method="_split_on_multi_column_values",
            splitter_kwargs={
                "column_names": ["y", "m", "d"],
                "partition_definition": {
                    "y": 2020,
                    "m": 1,
                    "d": 5,
                },
            },
            sampling_method="_sample_using_random",
            sampling_kwargs={
                "p": 0.5,
            },
        )
    )

    # The test dataframe contains 10 columns and 120 rows.
    assert len(returned_df.columns) == 10
    # The number of rows matching the "partition_definition" above is 4.
    assert 0 <= returned_df.count() <= 4
    # The sampling probability "p" used in "SparkDFExecutionEngine._sample_using_random()" is 0.5 (the equivalent of
    # a fair coin with a 50% chance of coming up "heads"). Hence, on average, we should get 50% of the rows, which is
    # 2; however, for such a small sample (of 4 rows), the number of rows returned by an individual run can deviate
    # from this average. Still, in the majority of trials, the number of rows should not be fewer than 2 or greater
    # than 3. The assertion in the next line, supporting this reasoning, is commented out to ensure zero failures.
    # Developers are encouraged to uncomment it whenever the "_sample_using_random" feature is the main focus of a
    # given effort.
    # assert 2 <= returned_df.count() <= 3
    for val in returned_df.collect():
        assert val.date == datetime.date(2020, 1, 5)
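# A minimal sketch (an assumption, not the library's actual implementation) of the
# row-level random sampling exercised above: each row is kept independently with
# probability "p", so the returned row count follows a Binomial(n, p) distribution,
# which is what the commented-out assertion reasons about.
# "demo_sample_using_random" is a hypothetical helper for illustration only.
import pyspark.sql.functions as F


def demo_sample_using_random(df, p=0.5):
    # F.rand() draws a uniform [0, 1) value per row; rows where it falls below
    # "p" survive, giving each row an independent 50% chance of being kept at p=0.5.
    return df.filter(F.rand() < p)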
def test_get_batch_with_split_on_column_value(test_sparkdf):
    split_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            splitter_method="_split_on_column_value",
            splitter_kwargs={
                "column_name": "batch_id",
                "partition_definition": {"batch_id": 2},
            },
        )
    )
    assert test_sparkdf.count() == 120
    assert len(test_sparkdf.columns) == 10
    collected = split_df.collect()
    for val in collected:
        assert val.batch_id == 2

    split_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            splitter_method="_split_on_column_value",
            splitter_kwargs={
                "column_name": "date",
                "partition_definition": {"date": datetime.date(2020, 1, 30)},
            },
        )
    )
    assert split_df.count() == 3
    assert len(split_df.columns) == 10
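# A minimal sketch (an assumption, not the library's actual implementation) of the
# column-value split exercised above: the batch keeps only the rows whose value in
# the split column equals the value supplied in "partition_definition".
# "demo_split_on_column_value" is a hypothetical helper for illustration only.
def demo_split_on_column_value(df, column_name, partition_definition):
    return df.filter(F.col(column_name) == partition_definition[column_name])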
def test_get_batch_with_split_on_multi_column_values(test_sparkdf):
    split_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            splitter_method="_split_on_multi_column_values",
            splitter_kwargs={
                "column_names": ["y", "m", "d"],
                "partition_definition": {
                    "y": 2020,
                    "m": 1,
                    "d": 5,
                },
            },
        )
    )
    assert split_df.count() == 4
    assert len(split_df.columns) == 10
    collected = split_df.collect()
    for val in collected:
        assert val.date == datetime.date(2020, 1, 5)

    # Splitting on columns that do not exist in the dataframe raises a ValueError.
    with pytest.raises(ValueError):
        split_df = SparkDFExecutionEngine().get_batch_data(
            RuntimeDataBatchSpec(
                batch_data=test_sparkdf,
                splitter_method="_split_on_multi_column_values",
                splitter_kwargs={
                    "column_names": ["I", "dont", "exist"],
                    "partition_definition": {
                        "y": 2020,
                        "m": 1,
                        "d": 5,
                    },
                },
            )
        )
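# A minimal sketch (an assumption, not the library's code) of why splitting on
# nonexistent columns can raise a ValueError: the requested column names are
# presumably validated against the dataframe's schema before any filtering.
# "demo_validate_split_columns" is a hypothetical helper for illustration only.
def demo_validate_split_columns(df, column_names):
    missing = [name for name in column_names if name not in df.columns]
    if missing:
        raise ValueError(f"Cannot split on missing columns: {missing}")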
def test_sample_using_md5(test_sparkdf):
    sampled_df = SparkDFExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_sparkdf,
            sampling_method="_sample_using_hash",
            sampling_kwargs={
                "column_name": "date",
                "hash_function_name": "md5",
            },
        )
    )
    assert sampled_df.count() == 10
    assert len(sampled_df.columns) == 10
    collected = sampled_df.collect()
    for val in collected:
        assert val.date in [datetime.date(2020, 1, 15), datetime.date(2020, 1, 29)]
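# A minimal sketch (an assumption, not the library's actual implementation) of
# hash-based sampling: each row's column value is hashed with the named hashlib
# function, and only rows whose digest ends in a chosen value are kept. The
# defaults below ("hash_digits", "hash_value") are illustrative assumptions, and
# "demo_sample_using_hash" is a hypothetical helper for illustration only.
import hashlib

from pyspark.sql.types import StringType


def demo_sample_using_hash(
    df, column_name, hash_function_name="md5", hash_digits=1, hash_value="f"
):
    # Hash the stringified column value and keep the last "hash_digits" hex chars.
    digest_suffix = F.udf(
        lambda v: hashlib.new(hash_function_name, str(v).encode("utf-8")).hexdigest()[
            -hash_digits:
        ],
        StringType(),
    )
    # Unlike "_sample_using_random", this selection is deterministic: the same
    # column value always hashes to the same suffix, so repeated runs return
    # the same rows -- which is why the test can assert exact dates and counts.
    return df.filter(digest_suffix(F.col(column_name)) == hash_value)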