import pandas as pd

from splink import Splink, load_from_json

# Import location assumed from splink v1's module layout
from splink.default_settings import complete_settings_dict


def test_main_api(spark, sqlite_con_1):
    settings = {
        "link_type": "dedupe_only",
        "comparison_columns": [{"col_name": "surname"}, {"col_name": "mob"}],
        "blocking_rules": ["l.mob = r.mob", "l.surname = r.surname"],
        "max_iterations": 2,
    }
    settings = complete_settings_dict(settings, spark=None)

    dfpd = pd.read_sql("select * from test1", sqlite_con_1)
    df = spark.createDataFrame(dfpd)

    linker = Splink(settings, spark, df=df)
    df_e = linker.get_scored_comparisons()

    # Round-trip the trained model through JSON and re-score
    linker.save_model_as_json("saved_model.json", overwrite=True)
    linker_2 = load_from_json("saved_model.json", spark=spark, df=df)
    df_e = linker_2.get_scored_comparisons()

    from splink.intuition import intuition_report

    params = linker.params
    row_dict = df_e.toPandas().sample(1).to_dict(orient="records")[0]
    print(intuition_report(row_dict, params))
    linker.params._print_m_u_probs()
import pyspark.sql.functions as f

from splink import Splink


def test_freq_adj_divzero(spark, nulls_df):
    # Settings that request term-frequency adjustments on a column
    # ('always_none') that contains only nulls
    settings = {
        "link_type": "dedupe_only",
        "blocking_rules": [
            "l.surname = r.surname",
        ],
        "comparison_columns": [
            {
                "col_name": "firstname",
                "num_levels": 3,
            },
            {
                "col_name": "surname",
                "num_levels": 3,
                "term_frequency_adjustments": True,
            },
            {
                "col_name": "always_none",
                "num_levels": 3,
                "term_frequency_adjustments": True,
            },
        ],
        "additional_columns_to_retain": ["unique_id"],
        "max_iterations": 1,
    }

    # Create a column that could trigger a division by zero in the average
    # adjustment calculation before the fix
    nulls_df = nulls_df.withColumn("always_none", f.lit(None))

    test_passing = True
    try:
        linker = Splink(settings, nulls_df, spark)
        linker.get_scored_comparisons()
    except ZeroDivisionError:
        test_passing = False
    assert test_passing is True
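# Note: since an uncaught ZeroDivisionError fails a pytest test by itself,
# the flag-and-assert pattern above could be written more directly as a
# plain call. A sketch of that alternative (not part of the original test):
#
#     linker = Splink(settings, nulls_df, spark)
#     linker.get_scored_comparisons()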
import pytest
from pyspark.sql import Row

from splink import Splink


def test_fix_u(spark):
    # We expect u on the cartesian product of mob to be around 1/12
    rows = [
        {"unique_id": 1, "mob": "1", "first_name": "a", "surname": "a"},
        {"unique_id": 2, "mob": "2", "first_name": "b", "surname": "b"},
        {"unique_id": 3, "mob": "3", "first_name": "c", "surname": "c"},
        {"unique_id": 4, "mob": "4", "first_name": "d", "surname": "d"},
        {"unique_id": 5, "mob": "5", "first_name": "e", "surname": "e"},
        {"unique_id": 6, "mob": "6", "first_name": "f", "surname": "f"},
        {"unique_id": 7, "mob": "7", "first_name": "g", "surname": "g"},
        {"unique_id": 9, "mob": "9", "first_name": "h", "surname": "h"},
        {"unique_id": 10, "mob": "10", "first_name": "i", "surname": "i"},
        {"unique_id": 10, "mob": "10", "first_name": "i", "surname": "i"},
    ]
    df = spark.createDataFrame(Row(**x) for x in rows)

    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.1,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "u_probabilities": [0.8, 0.2],
                "fix_u_probabilities": True,
            },
            {
                "col_name": "first_name",
                "u_probabilities": [0.8, 0.2],
            },
            {"col_name": "surname"},
        ],
        "blocking_rules": [],
        "max_iterations": 1,
    }

    linker = Splink(settings, df, spark)
    df_e = linker.get_scored_comparisons()

    # The fixed u_probabilities should survive training unchanged
    mob = linker.model.current_settings_obj.get_comparison_column("mob")
    assert mob["u_probabilities"][0] == pytest.approx(0.8)
    assert mob["u_probabilities"][1] == pytest.approx(0.2)

    # first_name's u_probabilities were not fixed, so they should have moved
    first_name = linker.model.current_settings_obj.get_comparison_column(
        "first_name")
    assert first_name["u_probabilities"][0] != 0.8
    assert first_name["u_probabilities"][1] != 0.2

    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.1,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "u_probabilities": [0.8, 0.2],
                "fix_u_probabilities": False,
            },
            {"col_name": "first_name"},
            {"col_name": "surname"},
        ],
        "blocking_rules": [],
        "max_iterations": 1,
    }

    linker = Splink(settings, df, spark)
    df_e = linker.get_scored_comparisons()

    # With fix_u_probabilities off, the u_probabilities should no longer be
    # 0.8, 0.2 after training
    mob = linker.model.current_settings_obj.get_comparison_column("mob")
    assert mob["u_probabilities"][0] != 0.8
    assert mob["u_probabilities"][1] != 0.2

    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.1,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "m_probabilities": [0.04, 0.96],
                "fix_m_probabilities": True,
                "u_probabilities": [0.75, 0.25],
                "fix_u_probabilities": False,
            },
            {"col_name": "first_name"},
            {"col_name": "surname"},
        ],
        "blocking_rules": [],
        "max_iterations": 1,
    }

    linker = Splink(settings, df, spark)
    linker.get_scored_comparisons()

    # u was free to move, m was fixed
    mob = linker.model.current_settings_obj.get_comparison_column("mob")
    assert mob["u_probabilities"][0] != 0.75
    assert mob["u_probabilities"][1] != 0.25
    assert mob["m_probabilities"][0] == pytest.approx(0.04)
    assert mob["m_probabilities"][1] == pytest.approx(0.96)
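# Why "around 1/12" in test_fix_u: under the u distribution, record pairs are
# drawn at random, so if month of birth were uniform over 12 values the chance
# that two random records agree on mob would be 12 * (1/12)**2 = 1/12 ≈ 0.083.
# The check below is illustrative arithmetic only, not part of the original test.
expected_u_agreement = sum((1 / 12) ** 2 for _ in range(12))
assert abs(expected_u_agreement - 1 / 12) < 1e-12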
import os

# Names such as OUTPUT_PATH, it_num, write_local_file_to_s3, settings, df and
# persist_params_settings are defined earlier in the surrounding script and
# are not shown in this excerpt.
path = os.path.join(OUTPUT_PATH, f"params/saved_params_iteration_{it_num}.json")
write_local_file_to_s3("saved_params.json", path, overwrite=True)


# Lineage-breaking functions: write the intermediate DataFrame out to S3 and
# read it back, truncating Spark's query plan
def blocked_comparisons_to_s3(df, spark):
    df = df.repartition(50)
    path = os.path.join(OUTPUT_PATH, "data/df_gammas/")
    df.write.mode("overwrite").parquet(path)
    df_new = spark.read.parquet(path)
    return df_new


def scored_comparisons_to_s3(df, spark):
    path = os.path.join(OUTPUT_PATH, "data/df_e/")
    df.write.mode("overwrite").parquet(path)
    df_new = spark.read.parquet(path)
    return df_new


from splink import Splink

linker = Splink(
    settings,
    spark,
    df=df,
    save_state_fn=persist_params_settings,
    break_lineage_blocked_comparisons=blocked_comparisons_to_s3,
    break_lineage_scored_comparisons=scored_comparisons_to_s3,
)

df_e = linker.get_scored_comparisons()

write_local_file_to_s3("saved_params.json", path, overwrite=True)
df_e.write.mode("overwrite").parquet(path)
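# persist_params_settings (passed as save_state_fn above) is not shown in this
# excerpt. Below is a minimal sketch of what such a callback could look like,
# assuming Splink invokes it with the iteration number and the current model
# object; the name, signature and serialisation method are all assumptions for
# illustration, not the original implementation.
def persist_params_settings_sketch(iteration, model):
    local_file = "saved_params.json"
    # Assumed serialisation API; the real object may expose a different method
    model.save_model_to_json_file(local_file, overwrite=True)
    s3_path = os.path.join(
        OUTPUT_PATH, f"params/saved_params_iteration_{iteration}.json"
    )
    write_local_file_to_s3(local_file, s3_path, overwrite=True)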
from pyspark.sql import Row

from splink import Splink, load_from_json

# Import location assumed; the flattened source calls these without imports
from splink.intuition import bayes_factor_chart, intuition_report


def test_main_api(spark):
    rows = [
        {"unique_id": 1, "mob": 10, "surname": "Linacre"},
        {"unique_id": 2, "mob": 10, "surname": "Linacre"},
        {"unique_id": 3, "mob": 10, "surname": "Linacer"},
        {"unique_id": 4, "mob": 7, "surname": "Smith"},
        {"unique_id": 5, "mob": 8, "surname": "Smith"},
        {"unique_id": 6, "mob": 8, "surname": "Smith"},
        {"unique_id": 7, "mob": 8, "surname": "Jones"},
    ]
    df = spark.createDataFrame(Row(**x) for x in rows)

    settings = {
        "link_type": "dedupe_only",
        "comparison_columns": [{"col_name": "surname"}, {"col_name": "mob"}],
        "blocking_rules": ["l.mob = r.mob", "l.surname = r.surname"],
        "max_iterations": 1,
    }

    linker = Splink(settings, df, spark)
    df_e = linker.get_scored_comparisons()

    # Round-trip the trained model through JSON and re-score
    linker.save_model_as_json("saved_model.json", overwrite=True)
    linker_2 = load_from_json("saved_model.json", df, spark=spark)
    df_e = linker_2.get_scored_comparisons()

    # Run a scored row through the diagnostic reporting functions
    model = linker.model
    row_dict = df_e.toPandas().sample(1).to_dict(orient="records")[0]
    intuition_report(row_dict, model)
    bayes_factor_chart(row_dict, model)