예제 #1
0
def test_expectation(spark, sqlite_con_1, params_1, gamma_settings_1):
    dfpd = pd.read_sql("select * from test1", sqlite_con_1)
    df = spark.createDataFrame(dfpd)

    gamma_settings_1["blocking_rules"] = [
        "l.mob = r.mob",
        "l.surname = r.surname",
    ]

    df_comparison = block_using_rules(gamma_settings_1, df=df, spark=spark)

    df_gammas = add_gammas(df_comparison, gamma_settings_1, spark)

    # df_e = iterate(df_gammas, spark, params_1, num_iterations=1)
    df_e = run_expectation_step(df_gammas, params_1, gamma_settings_1, spark)

    df_e_pd = df_e.toPandas()
    df_e_pd = df_e_pd.sort_values(["unique_id_l", "unique_id_r"])

    correct_list = [
        0.893617021,
        0.705882353,
        0.705882353,
        0.189189189,
        0.189189189,
        0.893617021,
        0.375,
        0.375,
    ]
    result_list = list(df_e_pd["match_probability"].astype(float))

    for i in zip(result_list, correct_list):
        assert i[0] == pytest.approx(i[1])
예제 #2
0
def test_tiny_numbers(spark, sqlite_con_1):

    # Regression test, see https://github.com/moj-analytical-services/splink/issues/48

    dfpd = pd.read_sql("select * from test1", sqlite_con_1)
    df = spark.createDataFrame(dfpd)

    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.4,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "m_probabilities": [5.9380419956766985e-25, 1 - 5.9380419956766985e-25],
                "u_probabilities": [0.8, 0.2],
            },
            {"col_name": "surname", "num_levels": 2,},
        ],
        "blocking_rules": ["l.mob = r.mob", "l.surname = r.surname",],
    }

    settings = complete_settings_dict(settings, spark=None)

    df_comparison = block_using_rules(settings, df=df, spark=spark)

    df_gammas = add_gammas(df_comparison, settings, spark)
    params = Params(settings, spark="supress_warnings")

    df_e = run_expectation_step(df_gammas, params, settings, spark)
예제 #3
0
    def manually_apply_fellegi_sunter_weights(self):
        """Compute match probabilities from m and u probabilities specified in the splink settings object

        Returns:
            DataFrame: A spark dataframe including a match probability column
        """
        df_comparison = self._get_df_comparison()
        df_gammas = add_gammas(df_comparison, self.settings, self.spark)
        return run_expectation_step(df_gammas, self.params, self.settings,
                                    self.spark)
예제 #4
0
    def manually_apply_fellegi_sunter_weights(self):
        """Compute match probabilities from m and u probabilities specified in the splink settings object

        Returns:
            DataFrame: A spark dataframe including a match probability column
        """
        df_comparison = block_using_rules(self.settings_dict, self.df,
                                          self.spark)
        df_gammas = add_gammas(df_comparison, self.settings_dict, self.spark)
        # see https://github.com/moj-analytical-services/splink/issues/187
        df_gammas = self.break_lineage_blocked_comparisons(
            df_gammas, self.spark)
        return run_expectation_step(df_gammas, self.model, self.spark)
def test_expectation_and_maximisation(spark):
    settings = {
        "link_type":
        "dedupe_only",
        "proportion_of_matches":
        0.4,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "m_probabilities": [0.1, 0.9],
                "u_probabilities": [0.8, 0.2],
            },
            {
                "custom_name": "surname",
                "custom_columns_used": ["surname"],
                "num_levels": 3,
                "case_expression": """
                    case
                    when surname_l is null or surname_r is null then -1
                    when surname_l = surname_r then 2
                    when substr(surname_l,1, 3) =  substr(surname_r, 1, 3) then 1
                    else 0
                    end
                    as gamma_surname
                    """,
                "m_probabilities": [0.1, 0.2, 0.7],
                "u_probabilities": [0.5, 0.25, 0.25],
            },
        ],
        "blocking_rules": [
            "l.mob = r.mob",
            "l.surname = r.surname",
        ],
        "retain_intermediate_calculation_columns":
        True,
    }

    rows = [
        {
            "unique_id": 1,
            "mob": 10,
            "surname": "Linacre"
        },
        {
            "unique_id": 2,
            "mob": 10,
            "surname": "Linacre"
        },
        {
            "unique_id": 3,
            "mob": 10,
            "surname": "Linacer"
        },
        {
            "unique_id": 4,
            "mob": 7,
            "surname": "Smith"
        },
        {
            "unique_id": 5,
            "mob": 8,
            "surname": "Smith"
        },
        {
            "unique_id": 6,
            "mob": 8,
            "surname": "Smith"
        },
        {
            "unique_id": 7,
            "mob": 8,
            "surname": "Jones"
        },
    ]

    df_input = spark.createDataFrame(Row(**x) for x in rows)
    df_input.persist()
    params = Model(settings, spark)

    df_comparison = block_using_rules(
        params.current_settings_obj.settings_dict, df_input, spark)
    df_gammas = add_gammas(df_comparison,
                           params.current_settings_obj.settings_dict, spark)
    df_gammas.persist()
    df_e = run_expectation_step(df_gammas, params, spark)
    df_e = df_e.sort("unique_id_l", "unique_id_r")

    df_e.persist()

    ################################################
    # Test probabilities correctly assigned
    ################################################

    df = df_e.toPandas()
    cols_to_keep = [
        "prob_gamma_mob_match",
        "prob_gamma_mob_non_match",
        "prob_gamma_surname_match",
        "prob_gamma_surname_non_match",
    ]
    pd_df_result = df[cols_to_keep][:4]

    df_correct = [
        {
            "prob_gamma_mob_match": 0.9,
            "prob_gamma_mob_non_match": 0.2,
            "prob_gamma_surname_match": 0.7,
            "prob_gamma_surname_non_match": 0.25,
        },
        {
            "prob_gamma_mob_match": 0.9,
            "prob_gamma_mob_non_match": 0.2,
            "prob_gamma_surname_match": 0.2,
            "prob_gamma_surname_non_match": 0.25,
        },
        {
            "prob_gamma_mob_match": 0.9,
            "prob_gamma_mob_non_match": 0.2,
            "prob_gamma_surname_match": 0.2,
            "prob_gamma_surname_non_match": 0.25,
        },
        {
            "prob_gamma_mob_match": 0.1,
            "prob_gamma_mob_non_match": 0.8,
            "prob_gamma_surname_match": 0.7,
            "prob_gamma_surname_non_match": 0.25,
        },
    ]

    pd_df_correct = pd.DataFrame(df_correct)

    assert_frame_equal(pd_df_correct, pd_df_result)

    ################################################
    # Test match probabilities correctly calculated
    ################################################

    result_list = list(df["match_probability"])
    # See https://github.com/moj-analytical-services/splink/blob/master/tests/expectation_maximisation_test_answers.xlsx
    # for derivation of these numbers
    correct_list = [
        0.893617021,
        0.705882353,
        0.705882353,
        0.189189189,
        0.189189189,
        0.893617021,
        0.375,
        0.375,
    ]
    assert result_list == pytest.approx(correct_list)

    ################################################
    # Test new probabilities correctly calculated
    ################################################

    run_maximisation_step(df_e, params, spark)

    new_lambda = params.current_settings_obj["proportion_of_matches"]

    # See https://github.com/moj-analytical-services/splink/blob/master/tests/expectation_maximisation_test_answers.xlsx
    # for derivation of these numbers
    assert new_lambda == pytest.approx(0.540922141)

    rows = [
        ["mob", 0, 0.087438272, 0.441543191],
        ["mob", 1, 0.912561728, 0.558456809],
        ["surname", 0, 0.173315146, 0.340356209],
        ["surname", 1, 0.326240275, 0.160167628],
        ["surname", 2, 0.500444578, 0.499476163],
    ]

    settings_obj = params.current_settings_obj

    for r in rows:
        cc = settings_obj.get_comparison_column(r[0])
        level_dict = cc.level_as_dict(r[1])
        assert level_dict["m_probability"] == pytest.approx(r[2])
        assert level_dict["u_probability"] == pytest.approx(r[3])

    ################################################
    # Test revised probabilities correctly used
    ################################################

    df_e = run_expectation_step(df_gammas, params, spark)
    df_e = df_e.sort("unique_id_l", "unique_id_r")
    result_list = list(df_e.toPandas()["match_probability"])

    correct_list = [
        0.658602114,
        0.796821727,
        0.796821727,
        0.189486495,
        0.189486495,
        0.658602114,
        0.495063367,
        0.495063367,
    ]
    assert result_list == pytest.approx(correct_list)

    run_maximisation_step(df_e, params, spark)
    new_lambda = params.current_settings_obj["proportion_of_matches"]
    assert new_lambda == pytest.approx(0.534993426)

    rows = [
        ["mob", 0, 0.088546179, 0.435753788],
        ["mob", 1, 0.911453821, 0.564246212],
        ["surname", 0, 0.231340865, 0.27146747],
        ["surname", 1, 0.372351177, 0.109234086],
        ["surname", 2, 0.396307958, 0.619298443],
    ]

    settings_obj = params.current_settings_obj

    for r in rows:
        cc = settings_obj.get_comparison_column(r[0])
        level_dict = cc.level_as_dict(r[1])
        assert level_dict["m_probability"] == pytest.approx(r[2])
        assert level_dict["u_probability"] == pytest.approx(r[3])

    ################################################
    # Test whether saving and loading params works
    # (If we load params, does the expectation step yield same answer)
    ################################################
    import tempfile

    dir = tempfile.TemporaryDirectory()
    fname = os.path.join(dir.name, "params.json")

    df_e = run_expectation_step(df_gammas, params, spark)
    params.save_model_to_json_file(fname)

    from splink.model import load_model_from_json

    p = load_model_from_json(fname)

    df_e_2 = run_expectation_step(df_gammas, p, spark)

    assert_frame_equal(df_e.toPandas(), df_e_2.toPandas())