def test_is_true(spark):
    """isTrue() should mirror the boolean column and propagate null."""
    df = spark.create_df(
        [(True, True), (False, False), (None, None)],
        [("has_stuff", BooleanType(), True), ("expected", BooleanType(), True)],
    )
    checked = df.withColumn("is_stuff_true", F.col("has_stuff").isTrue())
    chispa.assert_column_equality(checked, "is_stuff_true", "expected")
def test_regexp_extract_all(spark):
    """quinn.regexp_extract_all should collect every digit group into an array."""
    fixtures = [
        ("200 - 300 PA.", ["200", "300"]),
        ("400 PA.", ["400"]),
        (None, None),
    ]
    schema = [
        ("str", StringType(), True),
        ("expected", ArrayType(StringType(), True), True),
    ]
    df = spark.create_df(fixtures, schema)
    extracted = df.withColumn(
        "all_numbers", quinn.regexp_extract_all(F.col("str"), F.lit(r"(\d+)"))
    )
    chispa.assert_column_equality(extracted, "all_numbers", "expected")
def test_ip_country_match(spark):
    """get_country_from_ip should resolve IPv4 and IPv6 hosts to country codes."""
    schema = StructType(
        [
            StructField("host", StringType()),
            StructField("expected_country", StringType()),
        ]
    )
    rows = [
        Row(host="130.119.171.217", expected_country="US"),
        Row(host="2001:888:197d:0:250:fcff:fe23:3879", expected_country="NL"),
    ]
    result = get_country_from_ip(spark.createDataFrame(rows, schema))
    assert_column_equality(result, "expected_country", "ip_country")
def test_ip_country_no_match(spark):
    """Unknown IPs and non-IP hosts should get the sentinel country values."""
    schema = StructType(
        [
            StructField("host", StringType()),
            StructField("expected_country", StringType()),
        ]
    )
    rows = [
        Row(host="127.0.0.1", expected_country="NoMatch"),
        Row(host="random.domain.com", expected_country="NotIP"),
    ]
    result = get_country_from_ip(spark.createDataFrame(rows, schema))
    assert_column_equality(result, "expected_country", "ip_country")
def test_is_not_in(spark):
    """isNotIn should be True exactly when the value is absent from the list."""
    df = spark.create_df(
        [("surfing", True), ("swimming", True), ("dancing", False)],
        [("fun_thing", StringType(), True), ("expected", BooleanType(), True)],
    )
    bobs_hobbies = ["dancing", "snowboarding"]
    checked = df.withColumn(
        "is_not_bobs_hobby", F.col("fun_thing").isNotIn(bobs_hobbies)
    )
    chispa.assert_column_equality(checked, "is_not_bobs_hobby", "expected")
def test_time_parser(spark):
    """parse_log_time should parse Apache-log timestamp strings into timestamps.

    Fixes two defects in the original test:
    - the Row keyword was ``expected_time`` while the schema and the final
      assertion use ``expected_timestamp``, so the field names disagreed;
    - a leftover debugging ``show()`` call polluted the test output.

    NOTE(review): the expected values (e.g. 19:30:23 for ``12:30:23 -0400``)
    are not plain UTC — presumably they reflect the Spark session timezone;
    confirm against the fixture configuration.
    """
    schema = StructType(
        [
            StructField("timestamp", StringType()),
            StructField("expected_timestamp", TimestampType()),
        ]
    )
    data = [
        Row(
            timestamp="01/Jul/1995:12:30:23 -0400",
            expected_timestamp=datetime(1995, 7, 1, 19, 30, 23),
        ),
        Row(
            timestamp="10/Aug/2003:20:28:01 +0200",
            expected_timestamp=datetime(2003, 8, 10, 21, 28, 1),
        ),
    ]
    df = spark.createDataFrame(data, schema)
    parsed_df = parse_log_time(df)
    assert_column_equality(parsed_df, "expected_timestamp", "parsed_time")
def test_is_null_or_blank(spark):
    """isNullOrBlank is True for empty, whitespace-only, and null strings."""
    fixtures = [
        ("", True),
        ("  ", True),
        (None, True),
        ("hi", False),
    ]
    df = spark.create_df(
        fixtures,
        [("blah", StringType(), True), ("expected", BooleanType(), True)],
    )
    checked = df.withColumn(
        "is_blah_null_or_blank", F.col("blah").isNullOrBlank()
    )
    chispa.assert_column_equality(checked, "is_blah_null_or_blank", "expected")
def test_anti_trim(spark):
    """quinn.anti_trim removes interior whitespace but keeps the edges."""
    fixtures = [
        (" I like fish ", " Ilikefish "),
        (" zombies", " zombies"),
        (" simpsons cat lady ", " simpsonscatlady "),
        (None, None),
    ]
    df = spark.create_df(
        fixtures,
        [("words", StringType(), True), ("expected", StringType(), True)],
    )
    trimmed = df.withColumn("words_anti_trimmed", quinn.anti_trim(F.col("words")))
    chispa.assert_column_equality(trimmed, "words_anti_trimmed", "expected")
def test_exists(spark):
    """quinn.exists is True when any array element satisfies the predicate."""
    schema = StructType(
        [
            StructField("nums", ArrayType(IntegerType(), True), True),
            StructField("expected", BooleanType(), True),
        ]
    )
    rows = [
        ([1, 2, 3], False),
        ([4, 5, 6], True),
        ([10, 11, 12], True),
    ]
    df = spark.createDataFrame(rows, schema)
    checked = df.withColumn(
        "any_num_greater_than_5",
        quinn.exists(lambda n: n > 5)(F.col("nums")),
    )
    chispa.assert_column_equality(checked, "any_num_greater_than_5", "expected")
def test_remove_non_word_characters(spark):
    """quinn.remove_non_word_characters strips punctuation; null passes through."""
    fixtures = [
        ("I?like!fish>", "Ilikefish"),
        ("%%%zombies", "zombies"),
        ("si%$#@!#$!@#mpsons", "simpsons"),
        (None, None),
    ]
    df = spark.create_df(
        fixtures,
        [("words", StringType(), True), ("expected", StringType(), True)],
    )
    cleaned = df.withColumn(
        "words_without_nonword_chars",
        quinn.remove_non_word_characters(F.col("words")),
    )
    chispa.assert_column_equality(cleaned, "words_without_nonword_chars", "expected")
def test_remove_all_whitespace(spark):
    """quinn.remove_all_whitespace drops every space; null passes through."""
    fixtures = [
        (" I like fish ", "Ilikefish"),
        (" zombies", "zombies"),
        ("simpsons cat lady", "simpsonscatlady"),
        (None, None),
    ]
    df = spark.create_df(
        fixtures,
        [("words", StringType(), True), ("expected", StringType(), True)],
    )
    cleaned = df.withColumn(
        "words_without_whitespace",
        quinn.remove_all_whitespace(F.col("words")),
    )
    chispa.assert_column_equality(cleaned, "words_without_whitespace", "expected")
def it_works_with_end_date_of_sunday(spark):
    """week_end_date with an explicit "Sun" argument rolls dates forward to Sunday.

    Fixes: the derived column was misleadingly named ``week_start_date`` in a
    week-END test — renamed to ``week_end_date`` (the name is internal to this
    test) — and a comment typo ("day in a Sunday") was corrected.
    """
    df = spark.create_df(
        [
            # converts a Thursday to the Sunday after
            (datetime.datetime(2020, 1, 2), datetime.datetime(2020, 1, 5)),
            # converts a Wednesday to the Sunday after
            (datetime.datetime(2020, 7, 15), datetime.datetime(2020, 7, 19)),
            # doesn't change if the day is a Sunday
            (datetime.datetime(2020, 7, 19), datetime.datetime(2020, 7, 19)),
            (None, None),
        ],
        [("some_date", DateType(), True), ("expected", DateType(), True)],
    )
    actual_df = df.withColumn(
        "week_end_date", quinn.week_end_date(F.col("some_date"), "Sun")
    )
    chispa.assert_column_equality(actual_df, "week_end_date", "expected")
def it_defaults_to_saturday_week_end(spark):
    """week_end_date with no day argument defaults to Saturday as the week end.

    Fixes: the derived column was misnamed ``week_start_date`` in a week-END
    test (renamed internally to ``week_end_date``), and the first fixture's
    comment claimed 2020-01-02 was a Tuesday — it was a Thursday.
    """
    df = spark.create_df(
        [
            # converts a Thursday to the Saturday after
            (datetime.datetime(2020, 1, 2), datetime.datetime(2020, 1, 4)),
            # converts a Wednesday to the Saturday after
            (datetime.datetime(2020, 7, 15), datetime.datetime(2020, 7, 18)),
            # doesn't change if the day is Saturday
            (datetime.datetime(2020, 7, 25), datetime.datetime(2020, 7, 25)),
            (None, None),
        ],
        [("some_date", DateType(), True), ("expected", DateType(), True)],
    )
    actual_df = df.withColumn(
        "week_end_date", quinn.week_end_date(F.col("some_date"))
    )
    chispa.assert_column_equality(actual_df, "week_end_date", "expected")
def test_multi_equals(spark):
    """multi_equals('cat') is True only when every argument column equals 'cat'."""
    fixtures = [
        ("cat", "cat", True),
        ("cat", "dog", False),
        ("pig", "pig", False),
        ("", "", False),
        (None, None, False),
    ]
    schema = [
        ("s1", StringType(), True),
        ("s2", StringType(), True),
        ("expected", BooleanType(), True),
    ]
    df = spark.create_df(fixtures, schema)
    checked = df.withColumn(
        "are_s1_and_s2_cat",
        quinn.multi_equals("cat")(F.col("s1"), F.col("s2")),
    )
    chispa.assert_column_equality(checked, "are_s1_and_s2_cat", "expected")
def it_defaults_to_sunday_start_date(spark):
    """week_start_date with no day argument defaults to Sunday as week start."""
    df = spark.create_df(
        [
            # converts a Thursday to the Sunday before (2020-01-02 was a Thursday)
            (datetime.datetime(2020, 1, 2), datetime.datetime(
                2019, 12, 29)),
            # converts a Wednesday to the Sunday before
            (datetime.datetime(2020, 7, 15), datetime.datetime(
                2020, 7, 12)),
            # doesn't change if the day is already a Sunday
            (datetime.datetime(2020, 7, 26), datetime.datetime(
                2020, 7, 26)),
            # null input yields null output
            (None, None),
        ],
        [("some_date", DateType(), True), ("expected", DateType(), True)],
    )
    actual_df = df.withColumn("week_start_date", quinn.week_start_date(F.col("some_date")))
    chispa.assert_column_equality(actual_df, "week_start_date", "expected")
def it_works_with_integer_values(spark):
    """approx_equal with tolerance 5 on integer columns; null inputs yield null."""
    fixtures = [
        (12, 14, True),
        (20, 26, False),
        (44, 41, True),
        (32, 9, False),
        (None, None, None),
    ]
    schema = [
        ("num1", IntegerType(), True),
        ("num2", IntegerType(), True),
        ("expected", BooleanType(), True),
    ]
    df = spark.create_df(fixtures, schema)
    compared = df.withColumn(
        "are_nums_approx_equal",
        quinn.approx_equal(F.col("num1"), F.col("num2"), F.lit(5)),
    )
    chispa.assert_column_equality(compared, "are_nums_approx_equal", "expected")
def it_works_with_floating_values(spark):
    """approx_equal with tolerance 0.1 on float columns; null inputs yield null."""
    fixtures = [
        (1.1, 1.05, True),
        (1.1, 11.6, False),
        (1.02, 1.09, True),
        (1.02, 1.34, False),
        (None, None, None),
    ]
    schema = [
        ("num1", FloatType(), True),
        ("num2", FloatType(), True),
        ("expected", BooleanType(), True),
    ]
    df = spark.create_df(fixtures, schema)
    compared = df.withColumn(
        "are_nums_approx_equal",
        quinn.approx_equal(F.col("num1"), F.col("num2"), F.lit(0.1)),
    )
    chispa.assert_column_equality(compared, "are_nums_approx_equal", "expected")
def it_works_with_start_date_of_monday(spark):
    """week_start_date with an explicit "Mon" argument rolls dates back to Monday."""
    df = spark.create_df(
        [
            # converts a Thursday to the Monday before
            (datetime.datetime(2020, 1, 2), datetime.datetime(
                2019, 12, 30)),
            # converts a Wednesday to the Monday before
            (datetime.datetime(2020, 7, 15), datetime.datetime(
                2020, 7, 13)),
            # doesn't change if the day is already a Monday
            (datetime.datetime(2020, 7, 20), datetime.datetime(
                2020, 7, 20)),
            # null input yields null output
            (None, None),
        ],
        [("some_date", DateType(), True), ("expected", DateType(), True)],
    )
    actual_df = df.withColumn(
        "week_start_date", quinn.week_start_date(F.col("some_date"), "Mon"))
    chispa.assert_column_equality(actual_df, "week_start_date", "expected")
def test_null_between(spark):
    """nullBetween treats a null bound as open-ended; null value or two null bounds give False."""
    fixtures = [
        (17, None, 94, True),
        (17, None, 10, False),
        (None, 10, 5, True),
        (None, 10, 88, False),
        (10, 15, 11, True),
        (None, None, 11, False),
        (3, 5, None, False),
        (None, None, None, False),
    ]
    schema = [
        ("lower_age", IntegerType(), True),
        ("upper_age", IntegerType(), True),
        ("age", IntegerType(), True),
        ("expected", BooleanType(), True),
    ]
    df = spark.create_df(fixtures, schema)
    checked = df.withColumn(
        "is_between",
        F.col("age").nullBetween(F.col("lower_age"), F.col("upper_age")),
    )
    chispa.assert_column_equality(checked, "is_between", "expected")
def test_ip_anonymizer(spark):
    """anonymize_ip replaces an IP's final segment with its country code.

    Non-IP hosts pass through unchanged; IPs with no country match get "NaN"
    appended in place of the last segment.
    """
    schema = StructType(
        [
            StructField("host", StringType()),
            StructField("ip_country", StringType()),
            StructField("expected_anonymized_ip", StringType()),
        ]
    )
    rows = [
        Row(
            host="130.119.171.217",
            ip_country="US",
            expected_anonymized_ip="130.119.171.US",
        ),
        Row(
            host="2001:888:197d:0:250:fcff:fe23:3879",
            ip_country="NL",
            expected_anonymized_ip="2001:888:197d:0:250:fcff:fe23:NL",
        ),
        Row(
            host="random.domain.com",
            ip_country="NotIP",
            expected_anonymized_ip="random.domain.com",
        ),
        Row(
            host="127.0.0.1",
            ip_country="NoMatch",
            expected_anonymized_ip="127.0.0.NaN",
        ),
    ]
    result = anonymize_ip(spark.createDataFrame(rows, schema))
    assert_column_equality(result, "expected_anonymized_ip", "anonymized_ip")
def test_cast_arraytype(spark):
    """Casting an array<string> column to array<int> converts each element; null stays null."""
    data = [
        (["200", "300"], [200, 300]),
        (["400"], [400]),
        (None, None),
    ]
    df = spark.createDataFrame(data, ["nums", "expected"])
    casted = df.withColumn(
        "actual", F.col("nums").cast(ArrayType(IntegerType(), True))
    )
    assert_column_equality(casted, "actual", "expected")