Example #1
def test_porter_stem():
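    # Porter stemming trims word suffixes; the stem need not be a dictionary
    # word (e.g. "chocolates" -> "chocol").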
    data = [("chocolates", "chocol"), ("chocolatey", "chocolatei"),
            ("choco", "choco"), (None, None)]
    df = spark.createDataFrame(data, ["word", "expected"])
    actual_df = df.withColumn("word_porter_stem",
                              ceja.porter_stem(col("word")))
    assert_column_equality(actual_df, "word_porter_stem", "expected")
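The examples in this section come from test suites built around chispa's
assert_column_equality and, in most cases, the ceja column functions. A minimal
sketch of the shared setup they assume is shown below; the app name, the local
master, and the fixture wiring are illustrative assumptions, not taken from the
original sources.

# Hypothetical shared setup for the examples in this section.
import pytest
import ceja  # PySpark wrappers around the jellyfish string-matching functions
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType
from chispa.column_comparer import assert_column_equality

# Examples #1-#5, #8 and #9 reference `spark` directly; #6 and #7 receive a
# SparkSession through a pytest fixture (named spark_session and spark there).
spark = (SparkSession.builder
         .master("local[*]")
         .appName("column-equality-tests")
         .getOrCreate())


@pytest.fixture(scope="session")
def spark_session():
    return spark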
Example #2
def test_match_rating_codex():
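    # The match rating codex is the phonetic encoding used by the
    # Match Rating Approach algorithm.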
    data = [("jellyfish", "JLYFSH"), ("li", "L"), ("luisa", "LS"),
            (None, None)]
    df = spark.createDataFrame(data, ["word", "expected"])
    actual_df = df.withColumn("word_match_rating_codex",
                              ceja.match_rating_codex(col("word")))
    assert_column_equality(actual_df, "word_match_rating_codex", "expected")
Example #3
def test_hamming_distance():
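    # The underlying jellyfish implementation also counts the extra characters
    # of the longer string as differences, hence ("li", "lee") -> 2.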
    data = [("jellyfish", "smellyfish", 9), ("li", "lee", 2),
            ("luisa", "bruna", 4), (None, None, None)]
    df = spark.createDataFrame(data, ["word1", "word2", "expected"])
    actual_df = df.withColumn(
        "word_hamming_distance",
        ceja.hamming_distance(col("word1"), col("word2")))
    assert_column_equality(actual_df, "word_hamming_distance", "expected")
Example #4
def test_match_rating_comparison():
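    # Returns a Boolean: whether the two strings are judged phonetically
    # similar under the Match Rating Approach.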
    data = [("mat", "matt", True), ("there", "their", True),
            ("luisa", "bruna", False), (None, None, None)]
    df = spark.createDataFrame(data, ["word1", "word2", "expected"])
    actual_df = df.withColumn(
        "word_match_rating_comparison",
        ceja.match_rating_comparison(col("word1"), col("word2")))
    assert_column_equality(actual_df, "word_match_rating_comparison",
                           "expected")
Example #5
def test_damerau_levenshtein_distance():
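    # Like Levenshtein distance, but a transposition of two adjacent
    # characters counts as a single edit.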
    data = [("jellyfish", "smellyfish", 2), ("li", "lee", 2),
            ("luisa", "bruna", 4), (None, None, None)]
    df = spark.create_df(data, [("word1", StringType(), True),
                                ("word2", StringType(), True),
                                ("expected", IntegerType(), True)])
    actual_df = df.withColumn(
        "word_damerau_levenshtein_distance",
        ceja.damerau_levenshtein_distance(col("word1"), col("word2")))
    assert_column_equality(actual_df, "word_damerau_levenshtein_distance",
                           "expected")
Example #6
def test_remove_non_word_characters(spark_session):

    occ_pr = OccupancyProcessing(spark_session)

    data = [("jo&&se", "jose"), ("**li**", "li"), ("77,990", "77990"),
            (None, None)]
    df = spark_session.createDataFrame(data, ["name", "expected_name"])
    df = df.withColumn("clean_name",
                       occ_pr.remove_non_word_characters(F.col("name")))
    assert_column_equality(df, "clean_name", "expected_name")
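OccupancyProcessing.remove_non_word_characters is project-specific code whose
definition is not shown here. Judging from the expected values ("jo&&se" ->
"jose", "77,990" -> "77990"), it behaves like quinn's remove_non_word_characters
helper; a hypothetical reconstruction of the column expression it returns:

from pyspark.sql import Column
from pyspark.sql import functions as F


def remove_non_word_characters(col: Column) -> Column:
    # Drop every character that is neither a word character nor whitespace,
    # so "jo&&se" -> "jose" and "77,990" -> "77990"; null input stays null.
    return F.regexp_replace(col, r"[^\w\s]+", "")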
Example #7
def test_remove_special_letter_native(spark):
    source_data = [
        ("toto@123", "*****@*****.**", "toto123"),
        ("titi$___", "*****@*****.**", "titi___"),
        ("!!##tata", "*****@*****.**", "tata"),
    ]

    source_df = spark.createDataFrame(source_data,
                                      ["name", "mail", "expected_name"])
    actual_df = remove_special_letter_native(source_df, "name")
    actual_df.show()

    assert_column_equality(actual_df, "cleaned_name", "expected_name")
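remove_special_letter_native is the function under test and its definition is
not included in this snippet. Based on the expected values ("toto@123" ->
"toto123", "!!##tata" -> "tata") and the asserted "cleaned_name" column, one
plausible sketch (the signature and column naming are inferred, not taken from
the original source):

from pyspark.sql import DataFrame
from pyspark.sql import functions as F


def remove_special_letter_native(df: DataFrame, colname: str) -> DataFrame:
    # Add a "cleaned_<colname>" column with all non-word characters removed;
    # underscores and digits are kept, which matches "titi$___" -> "titi___".
    return df.withColumn(f"cleaned_{colname}",
                         F.regexp_replace(F.col(colname), r"\W+", ""))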
Example #8
def test_metaphone():
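    # Metaphone is a phonetic code; near-homophones such as "Klumpz" and
    # "Clumps" map to the same code ("KLMPS").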
    data = [("jellyfish", "JLFX"), ("li", "L"), ("luisa", "LS"),
            ("Klumpz", "KLMPS"), ("Clumps", "KLMPS"), (None, None)]
    df = spark.createDataFrame(data, ["word", "expected"])
    actual_df = df.withColumn("word_metaphone", ceja.metaphone(col("word")))
    assert_column_equality(actual_df, "word_metaphone", "expected")
Example #9
def test_nysiis():
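    # NYSIIS is a phonetic code similar in spirit to Soundex.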
    data = [("jellyfish", "JALYF"), ("li", "L"), ("luisa", "LAS"),
            (None, None)]
    df = spark.createDataFrame(data, ["word", "expected"])
    actual_df = df.withColumn("word_nysiis", ceja.nysiis(col("word")))
    assert_column_equality(actual_df, "word_nysiis", "expected")