Example #1
# Imports assumed by these examples (splink v1 API; exact module paths are assumptions):
import os

import pandas as pd
import pyspark.sql.functions as f
import pytest
from pyspark.sql import Row

from splink import Splink, load_from_json
from splink.intuition import bayes_factor_chart, intuition_report
from splink.settings import complete_settings_dict


def test_main_api(spark, sqlite_con_1):

    settings = {
        "link_type": "dedupe_only",
        "comparison_columns": [{
            "col_name": "surname"
        }, {
            "col_name": "mob"
        }],
        "blocking_rules": ["l.mob = r.mob", "l.surname = r.surname"],
        "max_iterations": 2
    }
    settings = complete_settings_dict(settings, spark=None)
    dfpd = pd.read_sql("select * from test1", sqlite_con_1)

    df = spark.createDataFrame(dfpd)

    linker = Splink(settings, spark, df=df)
    df_e = linker.get_scored_comparisons()
    # Round-trip the estimated model through JSON and re-score to check save/load.
    linker.save_model_as_json("saved_model.json", overwrite=True)
    linker_2 = load_from_json("saved_model.json", spark=spark, df=df)
    df_e = linker_2.get_scored_comparisons()

    # Print a step-by-step explanation of the match score for one sampled comparison.
    from splink.intuition import intuition_report
    params = linker.params
    row_dict = df_e.toPandas().sample(1).to_dict(orient="records")[0]
    print(intuition_report(row_dict, params))

    linker.params._print_m_u_probs()
Example #2
def test_freq_adj_divzero(spark, nulls_df):

    # create a settings object that requests term frequency adjustments, including on the all-null column 'always_none'

    settings = {
        "link_type": "dedupe_only",
        "blocking_rules": [
            "l.surname = r.surname",
        ],
        "comparison_columns": [
            {
                "col_name": "firstname",
                "num_levels": 3,
            },
            {
                "col_name": "surname",
                "num_levels": 3,
                "term_frequency_adjustments": True,
            },
            {
                "col_name": "always_none",
                "num_levels": 3,
                "term_frequency_adjustments": True,
            },
        ],
        "additional_columns_to_retain": ["unique_id"],
        "max_iterations": 1,
    }

    # create the column in a way that could trigger a division by zero in the average adjustment calculation before the fix
    nulls_df = nulls_df.withColumn("always_none", f.lit(None))

    test_passing = True
    try:
        linker = Splink(settings, nulls_df, spark)
        linker.get_scored_comparisons()
    except ZeroDivisionError:
        test_passing = False

    assert test_passing is True
Example #3
def test_fix_u(spark):

    # We expect u on the cartesian product of MoB to be around 1/12
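    # With "blocking_rules": [] below, every pair of records is compared (the
    # cartesian product). If month of birth were uniform over 12 values, two
    # non-matching records would agree by chance with probability 1/12 ≈ 0.083,
    # so the u probability for the agreement level should drift towards that
    # value unless it is fixed.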
    df = [
        {
            "unique_id": 1,
            "mob": "1",
            "first_name": "a",
            "surname": "a"
        },
        {
            "unique_id": 2,
            "mob": "2",
            "first_name": "b",
            "surname": "b"
        },
        {
            "unique_id": 3,
            "mob": "3",
            "first_name": "c",
            "surname": "c"
        },
        {
            "unique_id": 4,
            "mob": "4",
            "first_name": "d",
            "surname": "d"
        },
        {
            "unique_id": 5,
            "mob": "5",
            "first_name": "e",
            "surname": "e"
        },
        {
            "unique_id": 6,
            "mob": "6",
            "first_name": "f",
            "surname": "f"
        },
        {
            "unique_id": 7,
            "mob": "7",
            "first_name": "g",
            "surname": "g"
        },
        {
            "unique_id": 9,
            "mob": "9",
            "first_name": "h",
            "surname": "h"
        },
        {
            "unique_id": 10,
            "mob": "10",
            "first_name": "i",
            "surname": "i"
        },
        {
            "unique_id": 10,
            "mob": "10",
            "first_name": "i",
            "surname": "i"
        },
    ]

    df = spark.createDataFrame(Row(**x) for x in df)

    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.1,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "u_probabilities": [0.8, 0.2],
                "fix_u_probabilities": True,
            },
            {
                "col_name": "first_name",
                "u_probabilities": [0.8, 0.2],
            },
            {
                "col_name": "surname"
            },
        ],
        "blocking_rules": [],
        "max_iterations": 1,
    }

    linker = Splink(settings, df, spark)

    df_e = linker.get_scored_comparisons()

    # Want to check that the "u_probabilities" in the latest parameters are still 0.8, 0.2
    mob = linker.model.current_settings_obj.get_comparison_column("mob")
    assert mob["u_probabilities"][0] == pytest.approx(0.8)
    assert mob["u_probabilities"][1] == pytest.approx(0.2)

    first_name = linker.model.current_settings_obj.get_comparison_column(
        "first_name")
    assert first_name["u_probabilities"][0] != 0.8
    assert first_name["u_probabilities"][1] != 0.2

    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.1,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "u_probabilities": [0.8, 0.2],
                "fix_u_probabilities": False,
            },
            {
                "col_name": "first_name"
            },
            {
                "col_name": "surname"
            },
        ],
        "blocking_rules": [],
        "max_iterations": 1,
    }

    linker = Splink(settings, df, spark)

    df_e = linker.get_scored_comparisons()

    # Want to check that the "u_probabilities" in the latest parameters are no longer 0.8, 0.2
    mob = linker.model.current_settings_obj.get_comparison_column("mob")
    assert mob["u_probabilities"][0] != 0.8
    assert mob["u_probabilities"][1] != 0.2

    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.1,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "m_probabilities": [0.04, 0.96],
                "fix_m_probabilities": True,
                "u_probabilities": [0.75, 0.25],
                "fix_u_probabilities": False,
            },
            {
                "col_name": "first_name"
            },
            {
                "col_name": "surname"
            },
        ],
        "blocking_rules": [],
        "max_iterations": 1,
    }

    linker = Splink(settings, df, spark)

    linker.get_scored_comparisons()

    mob = linker.model.current_settings_obj.get_comparison_column("mob")
    assert mob["u_probabilities"][0] != 0.75
    assert mob["u_probabilities"][1] != 0.25

    mob = linker.model.current_settings_obj.get_comparison_column("mob")
    assert mob["m_probabilities"][0] == pytest.approx(0.04)
    assert mob["m_probabilities"][1] == pytest.approx(0.96)
Example #4
# Per-iteration save of the estimated params to S3 (a fragment: it_num, OUTPUT_PATH and
# write_local_file_to_s3 are defined elsewhere in the original script).
path = os.path.join(OUTPUT_PATH, f"params/saved_params_iteration_{it_num}.json")
write_local_file_to_s3("saved_params.json", path, overwrite=True)

# Lineage-breaking functions: write the intermediate DataFrame to Parquet and read it
# back, so Spark's lineage does not keep growing across EM iterations.
def blocked_comparisons_to_s3(df, spark):
    df = df.repartition(50)
    path = os.path.join(OUTPUT_PATH, "data/df_gammas/")
    df.write.mode("overwrite").parquet(path)
    df_new = spark.read.parquet(path)
    return df_new

def scored_comparisons_to_s3(df, spark):

    path = os.path.join(OUTPUT_PATH, "data/df_e/")
    df.write.mode("overwrite").parquet(path)
    df_new = spark.read.parquet(path)
    return df_new
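
Both helpers break lineage by round-tripping the DataFrame through Parquet on S3.
A sketch of an alternative, using Spark's built-in checkpointing instead (the
function name and checkpoint directory below are placeholders, not part of the
original example):

def blocked_comparisons_checkpoint(df, spark):
    # Eager checkpoint: materialises the DataFrame and truncates its logical plan.
    spark.sparkContext.setCheckpointDir("/tmp/splink_checkpoints")  # placeholder path
    return df.repartition(50).checkpoint()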

from splink import Splink
linker = Splink(settings,
                spark,
                df=df,
                save_state_fn=persist_params_settings,
                break_lineage_blocked_comparisons=blocked_comparisons_to_s3,
                break_lineage_scored_comparisons=scored_comparisons_to_s3)
df_e = linker.get_scored_comparisons()

# Copy the saved params file to S3 and write the scored comparisons out as Parquet.
write_local_file_to_s3("saved_params.json", path, overwrite=True)

df_e.write.mode("overwrite").parquet(path)
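
The persist_params_settings callback passed as save_state_fn above is not shown in
full. A minimal sketch of what it might look like, assuming (purely an assumption,
not a documented signature) that it is called after each iteration with the
iteration number, and reusing the S3 helper from the fragment at the top of this
example:

def persist_params_settings(it_num, *args):
    # Hypothetical sketch: copy the locally saved params file to an
    # iteration-specific S3 path. The callback signature is assumed.
    path = os.path.join(OUTPUT_PATH, f"params/saved_params_iteration_{it_num}.json")
    write_local_file_to_s3("saved_params.json", path, overwrite=True)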
Example #5
def test_main_api(spark):

    rows = [
        {
            "unique_id": 1,
            "mob": 10,
            "surname": "Linacre"
        },
        {
            "unique_id": 2,
            "mob": 10,
            "surname": "Linacre"
        },
        {
            "unique_id": 3,
            "mob": 10,
            "surname": "Linacer"
        },
        {
            "unique_id": 4,
            "mob": 7,
            "surname": "Smith"
        },
        {
            "unique_id": 5,
            "mob": 8,
            "surname": "Smith"
        },
        {
            "unique_id": 6,
            "mob": 8,
            "surname": "Smith"
        },
        {
            "unique_id": 7,
            "mob": 8,
            "surname": "Jones"
        },
    ]

    df = spark.createDataFrame(Row(**x) for x in rows)

    settings = {
        "link_type": "dedupe_only",
        "comparison_columns": [{
            "col_name": "surname"
        }, {
            "col_name": "mob"
        }],
        "blocking_rules": ["l.mob = r.mob", "l.surname = r.surname"],
        "max_iterations": 1,
    }

    linker = Splink(settings, df, spark)
    df_e = linker.get_scored_comparisons()
    linker.save_model_as_json("saved_model.json", overwrite=True)
    linker_2 = load_from_json("saved_model.json", df, spark=spark)
    df_e = linker_2.get_scored_comparisons()

    model = linker.model
    row_dict = df_e.toPandas().sample(1).to_dict(orient="records")[0]
    intuition_report(row_dict, model)
    bayes_factor_chart(row_dict, model)