def test_remove_non_word_characters(self):
    """remove_non_word_characters strips punctuation/symbols and passes nulls through."""
    input_rows = [
        ("I?like!fish>", 1),
        ("%%%zombies", 2),
        ("si%$#@!#$!@#mpsons", 2),
        (None, 3),
    ]
    input_schema = [
        ("words", StringType(), True),
        ("num", IntegerType(), True),
    ]
    df = self.spark.create_df(input_rows, input_schema)

    actual_df = df.withColumn(
        "words_without_nonword_chars",
        quinn.remove_non_word_characters(col("words")),
    )

    # Same rows with the expected cleaned value appended as a third column.
    expected_df = self.spark.create_df(
        [
            ("I?like!fish>", 1, "Ilikefish"),
            ("%%%zombies", 2, "zombies"),
            ("si%$#@!#$!@#mpsons", 2, "simpsons"),
            (None, 3, None),
        ],
        input_schema + [("words_without_nonword_chars", StringType(), True)],
    )

    assert expected_df.collect() == actual_df.collect()
def test_remove_non_word_characters(spark):
    """Pytest-style check: cleaned column must equal the precomputed 'expected' column."""
    rows = [
        ("I?like!fish>", "Ilikefish"),
        ("%%%zombies", "zombies"),
        ("si%$#@!#$!@#mpsons", "simpsons"),
        (None, None),
    ]
    schema = [
        ("words", StringType(), True),
        ("expected", StringType(), True),
    ]
    df = spark.create_df(rows, schema)

    cleaned = quinn.remove_non_word_characters(F.col("words"))
    actual_df = df.withColumn("words_without_nonword_chars", cleaned)

    # chispa compares the two columns row by row, including the null row.
    chispa.assert_column_equality(actual_df, "words_without_nonword_chars", "expected")
def remove_special_letter_quinn(dataframe, column_name: str):
    """Return *dataframe* with a new column "cleaned_<column_name>" added.

    The new column holds the values of ``column_name`` with non-word
    characters removed via ``quinn.remove_non_word_characters``. The
    original column is left untouched.

    :param dataframe: input Spark DataFrame (must contain ``column_name``)
    :param column_name: name of the string column to clean
    :return: a new DataFrame with the extra "cleaned_<column_name>" column
    """
    # f-string replaces the dated str.format call; produces the identical name.
    return dataframe.withColumn(
        f"cleaned_{column_name}",
        quinn.remove_non_word_characters(sql_fun.col(column_name)),
    )
def with_clean_first_name(df):
    """Append a "clean_first_name" column: "first_name" stripped of non-word characters."""
    cleaned = quinn.remove_non_word_characters(F.col("first_name"))
    return df.withColumn("clean_first_name", cleaned)