def test_porter_stem():
    """Compare ``ceja.porter_stem`` output with the expected value per row.

    The final row is all ``None`` so a null input is compared against a
    null expectation as well.
    """
    rows = [
        ("chocolates", "chocol"),
        ("chocolatey", "chocolatei"),
        ("choco", "choco"),
        (None, None),
    ]
    result_column = "word_porter_stem"
    source_df = spark.createDataFrame(rows, ["word", "expected"])
    stemmed_df = source_df.withColumn(
        result_column,
        ceja.porter_stem(col("word")),
    )
    assert_column_equality(stemmed_df, result_column, "expected")
def test_match_rating_codex():
    """Compare ``ceja.match_rating_codex`` output with the expected codex.

    Includes an all-``None`` row so the null case is part of the comparison.
    """
    rows = [
        ("jellyfish", "JLYFSH"),
        ("li", "L"),
        ("luisa", "LS"),
        (None, None),
    ]
    result_column = "word_match_rating_codex"
    source_df = spark.createDataFrame(rows, ["word", "expected"])
    codex_expr = ceja.match_rating_codex(col("word"))
    encoded_df = source_df.withColumn(result_column, codex_expr)
    assert_column_equality(encoded_df, result_column, "expected")
def test_hamming_distance():
    """Compare ``ceja.hamming_distance`` on word pairs with expected values.

    Each row holds two words and the expected result; the last row is all
    ``None`` so the null case is compared too.
    """
    rows = [
        ("jellyfish", "smellyfish", 9),
        ("li", "lee", 2),
        ("luisa", "bruna", 4),
        (None, None, None),
    ]
    column_names = ["word1", "word2", "expected"]
    result_column = "word_hamming_distance"
    pairs_df = spark.createDataFrame(rows, column_names)
    distance_expr = ceja.hamming_distance(col("word1"), col("word2"))
    measured_df = pairs_df.withColumn(result_column, distance_expr)
    assert_column_equality(measured_df, result_column, "expected")
def test_match_rating_comparison():
    """Compare ``ceja.match_rating_comparison`` with expected booleans.

    Each row holds two words and the expected result; the last row is all
    ``None`` so the null case is compared too.
    """
    rows = [
        ("mat", "matt", True),
        ("there", "their", True),
        ("luisa", "bruna", False),
        (None, None, None),
    ]
    column_names = ["word1", "word2", "expected"]
    result_column = "word_match_rating_comparison"
    pairs_df = spark.createDataFrame(rows, column_names)
    comparison_expr = ceja.match_rating_comparison(col("word1"), col("word2"))
    compared_df = pairs_df.withColumn(result_column, comparison_expr)
    assert_column_equality(compared_df, result_column, "expected")
def test_damerau_levenshtein_distance():
    """Compare ``ceja.damerau_levenshtein_distance`` with expected values.

    The input frame is built through ``spark.create_df`` with an explicit
    schema: two string columns and an integer ``expected`` column, each
    passed ``True`` as the third field element.
    """
    rows = [
        ("jellyfish", "smellyfish", 2),
        ("li", "lee", 2),
        ("luisa", "bruna", 4),
        (None, None, None),
    ]
    schema = [
        ("word1", StringType(), True),
        ("word2", StringType(), True),
        ("expected", IntegerType(), True),
    ]
    result_column = "word_damerau_levenshtein_distance"
    pairs_df = spark.create_df(rows, schema)
    distance_expr = ceja.damerau_levenshtein_distance(col("word1"), col("word2"))
    measured_df = pairs_df.withColumn(result_column, distance_expr)
    assert_column_equality(measured_df, result_column, "expected")
def test_remove_non_word_characters(spark_session):
    """Compare ``remove_non_word_characters`` output with expected names.

    An ``OccupancyProcessing`` instance is built from ``spark_session`` and
    its method is applied to the ``name`` column; the result is compared
    with ``expected_name``, including an all-``None`` row.
    """
    processor = OccupancyProcessing(spark_session)
    rows = [
        ("jo&&se", "jose"),
        ("**li**", "li"),
        ("77,990", "77990"),
        (None, None),
    ]
    names_df = spark_session.createDataFrame(rows, ["name", "expected_name"])
    cleaned_df = names_df.withColumn(
        "clean_name",
        processor.remove_non_word_characters(F.col("name")),
    )
    assert_column_equality(cleaned_df, "clean_name", "expected_name")
def test_remove_special_letter_native(spark):
    """Compare ``remove_special_letter_native`` output with expected names.

    The helper is called with the source frame and the ``name`` column; the
    resulting ``cleaned_name`` column is compared with ``expected_name``.
    """
    column_names = ["name", "mail", "expected_name"]
    rows = [
        ("toto@123", "*****@*****.**", "toto123"),
        ("titi$___", "*****@*****.**", "titi___"),
        ("!!##tata", "*****@*****.**", "tata"),
    ]
    input_df = spark.createDataFrame(rows, column_names)
    cleaned_df = remove_special_letter_native(input_df, "name")
    # Display the resulting frame before asserting on it.
    cleaned_df.show()
    assert_column_equality(cleaned_df, "cleaned_name", "expected_name")
def test_metaphone():
    """Compare ``ceja.metaphone`` output with the expected code per row.

    "Klumpz" and "Clumps" share the same expected code; the final row is
    all ``None`` so the null case is compared too.
    """
    rows = [
        ("jellyfish", "JLFX"),
        ("li", "L"),
        ("luisa", "LS"),
        ("Klumpz", "KLMPS"),
        ("Clumps", "KLMPS"),
        (None, None),
    ]
    result_column = "word_metaphone"
    source_df = spark.createDataFrame(rows, ["word", "expected"])
    encoded_df = source_df.withColumn(
        result_column,
        ceja.metaphone(col("word")),
    )
    assert_column_equality(encoded_df, result_column, "expected")
def test_nysiis():
    """Compare ``ceja.nysiis`` output with the expected code per row.

    The final row is all ``None`` so the null case is compared too.
    """
    rows = [
        ("jellyfish", "JALYF"),
        ("li", "L"),
        ("luisa", "LAS"),
        (None, None),
    ]
    result_column = "word_nysiis"
    source_df = spark.createDataFrame(rows, ["word", "expected"])
    nysiis_expr = ceja.nysiis(col("word"))
    encoded_df = source_df.withColumn(result_column, nysiis_expr)
    assert_column_equality(encoded_df, result_column, "expected")