예제 #1
0
    def test_remove_non_word_characters(self):
        """Verify quinn.remove_non_word_characters against known inputs.

        Builds a two-column frame (``words``, ``num``), appends a
        ``words_without_nonword_chars`` column produced by quinn, and
        compares the collected rows with a hand-written expected frame.
        The fixture includes a ``None`` value, which is expected to stay
        ``None`` in the derived column.
        """
        input_rows = [
            ("I?like!fish>", 1),
            ("%%%zombies", 2),
            ("si%$#@!#$!@#mpsons", 2),
            (None, 3),
        ]
        input_schema = [
            ("words", StringType(), True),
            ("num", IntegerType(), True),
        ]
        source_df = self.spark.create_df(input_rows, input_schema)

        # Column expression under test, applied to the raw "words" column.
        cleaned_words = quinn.remove_non_word_characters(col("words"))
        actual_df = source_df.withColumn(
            "words_without_nonword_chars", cleaned_words)

        expected_rows = [
            ("I?like!fish>", 1, "Ilikefish"),
            ("%%%zombies", 2, "zombies"),
            ("si%$#@!#$!@#mpsons", 2, "simpsons"),
            (None, 3, None),
        ]
        expected_schema = [
            ("words", StringType(), True),
            ("num", IntegerType(), True),
            ("words_without_nonword_chars", StringType(), True),
        ]
        expected_df = self.spark.create_df(expected_rows, expected_schema)

        # Materialize both sides, then compare the row lists.
        expected_collected = expected_df.collect()
        actual_collected = actual_df.collect()
        assert expected_collected == actual_collected
예제 #2
0
def test_remove_non_word_characters(spark):
    """Verify quinn.remove_non_word_characters column by column.

    Each fixture row pairs a raw ``words`` value with its ``expected``
    cleaned form. The quinn result is written to
    ``words_without_nonword_chars`` and compared with ``expected`` via
    chispa's column-equality assertion.
    """
    cases = [
        ("I?like!fish>", "Ilikefish"),
        ("%%%zombies", "zombies"),
        ("si%$#@!#$!@#mpsons", "simpsons"),
        (None, None),
    ]
    schema = [
        ("words", StringType(), True),
        ("expected", StringType(), True),
    ]
    df = spark.create_df(cases, schema)

    # Column expression under test, applied to the raw "words" column.
    cleaned_words = quinn.remove_non_word_characters(F.col("words"))
    actual_df = df.withColumn("words_without_nonword_chars", cleaned_words)

    chispa.assert_column_equality(
        actual_df,
        "words_without_nonword_chars",
        "expected",
    )
예제 #3
0
def remove_special_letter_quinn(dataframe, column_name: str):
    """Add a ``cleaned_<column_name>`` column built by quinn.

    Args:
        dataframe: Object exposing ``withColumn`` (presumably a Spark
            DataFrame; confirm against callers).
        column_name: Name of the column passed to
            ``quinn.remove_non_word_characters``.

    Returns:
        The result of ``dataframe.withColumn`` with the new column added
        under the name ``cleaned_<column_name>``.
    """
    target_name = "cleaned_{}".format(column_name)
    cleaned_column = quinn.remove_non_word_characters(
        sql_fun.col(column_name)
    )
    return dataframe.withColumn(target_name, cleaned_column)
예제 #4
0
def with_clean_first_name(df):
    """Add a ``clean_first_name`` column derived from ``first_name``.

    The new column is the result of applying
    ``quinn.remove_non_word_characters`` to the ``first_name`` column.

    Args:
        df: Object exposing ``withColumn`` (presumably a Spark DataFrame
            with a ``first_name`` column; confirm against callers).

    Returns:
        The result of ``df.withColumn`` with ``clean_first_name`` added.
    """
    cleaned_first_name = quinn.remove_non_word_characters(F.col("first_name"))
    return df.withColumn("clean_first_name", cleaned_first_name)