Пример #1
0
def test_iteration_known_data_generating_process(spark, gamma_settings_4,
                                                 params_4, sqlite_con_4):
    """Check that EM recovers the parameters of a known data generating process.

    The gammas are read from the ``df`` table of the sqlite fixture, ``iterate``
    is run with a convergence threshold of 0.001 and at most 40 iterations, and
    the fitted level probabilities are compared with the expected values
    (absolute tolerance 0.01).
    """
    gammas_pd = pd.read_sql("select * from df", sqlite_con_4)
    df_gammas = spark.createDataFrame(gammas_pd)

    overrides = (
        ("retain_matching_columns", False),
        ("em_convergence", 0.001),
        ("max_iterations", 40),
    )
    for key, value in overrides:
        gamma_settings_4[key] = value

    # Only the side effect on params_4 matters here; the scored dataframe
    # itself is not inspected.
    _ = iterate(
        df_gammas,
        params_4,
        gamma_settings_4,
        spark,
        compute_ll=False,
    )

    assert params_4.iteration < 20

    # (gamma column, distribution, level, expected probability)
    expectations = (
        ("gamma_col_2_levels", "prob_dist_match", "level_0", 0.05),
        ("gamma_col_5_levels", "prob_dist_match", "level_0", 0.1),
        ("gamma_col_20_levels", "prob_dist_match", "level_0", 0.05),
        ("gamma_col_2_levels", "prob_dist_non_match", "level_1", 0.05),
        ("gamma_col_5_levels", "prob_dist_non_match", "level_1", 0.2),
        ("gamma_col_20_levels", "prob_dist_non_match", "level_1", 0.5),
    )
    for gamma_col, dist, level, target in expectations:
        fitted = params_4.params["π"][gamma_col][dist][level]["probability"]
        assert fitted == pytest.approx(target, abs=0.01)
Пример #2
0
    def get_scored_comparisons(self):
        """Estimate model parameters with the EM algorithm and score the comparisons.

        Steps: build the comparison dataframe, add gamma columns, pass the
        gammas through ``break_lineage_blocked_comparisons``, run ``iterate``,
        pass the scored rows through ``break_lineage_scored_comparisons`` and
        finally hand them to ``_make_term_frequency_adjustments``.

        Returns:
            DataFrame: A spark dataframe including a match probability column,
                as returned by ``self._make_term_frequency_adjustments``
        """
        comparisons = self._get_df_comparison()

        gammas = add_gammas(comparisons, self.settings, self.spark)
        gammas = self.break_lineage_blocked_comparisons(gammas, self.spark)

        scored = iterate(
            gammas,
            self.params,
            self.settings,
            self.spark,
            compute_ll=False,
            save_state_fn=self.save_state_fn,
        )

        # The user's break-lineage function may have persisted the gammas,
        # so release them once the EM iterations are done.
        gammas.unpersist()

        scored = self.break_lineage_scored_comparisons(scored, self.spark)
        adjusted = self._make_term_frequency_adjustments(scored)

        # Same reasoning as above, for the scored comparisons.
        scored.unpersist()

        return adjusted
Пример #3
0
    def get_scored_comparisons(self):
        """Estimate model parameters with the EM algorithm and score the comparisons.

        The gamma dataframe is persisted for the duration of the call to
        ``iterate`` and unpersisted afterwards.

        Note: Does not compute term frequency adjustments.

        Returns:
            DataFrame: A spark dataframe including a match probability column
        """
        comparisons = self._get_df_comparison()
        gammas = add_gammas(comparisons, self.settings, self.spark)

        # Keep the gammas cached while iterate() runs over them.
        gammas.persist()
        scored = iterate(
            gammas,
            self.params,
            self.settings,
            self.spark,
            compute_ll=False,
            save_state_fn=self.save_state_fn,
        )
        gammas.unpersist()

        return scored
def estimate(df_gammas: DataFrame, settings: dict, spark: SparkSession):
    """Estimate a splink model from a pandas dataframe of gammas.

    Note that ``settings`` is modified in place: its
    ``"retain_matching_columns"`` entry is set to ``False``.

    Args:
        df_gammas (DataFrame): Pandas dataframe of gammas
        settings (dict): Splink settings dictionary
        spark (SparkSession): SparkSession object

    Returns:
        tuple: ``(df_e, model)`` - the result of ``iterate`` and the ``Model``
            built from ``settings``
    """
    settings["retain_matching_columns"] = False

    spark_gammas = spark.createDataFrame(df_gammas)
    fitted_model = Model(settings, spark)

    return iterate(spark_gammas, fitted_model, spark), fitted_model
Пример #5
0
def test_term_frequency_adjustments(spark):
    """Check term frequency adjustments on randomly generated comparison rows.

    100000 comparison rows are generated from the m/u probabilities in
    ``settings``. The ``name`` column is generated with a skewed value
    distribution (its ``term_frequency_adjustments`` flag is set), the other
    columns with a uniform one. ``iterate`` and
    ``make_adjustment_for_term_frequencies`` are then run and the test checks:

    * the adjustment for name 'a' is below 0.5 and for name 'e' is between
      0.6 and 0.95;
    * the adjustments for ``cat_12`` stay between 0.45 and 0.55;
    * ``tf_adjusted_match_prob`` equals the Bayes combination of
      ``match_probability`` with one, and with two, adjustment columns.

    The random generator is not seeded, so the thresholds are deliberately
    loose.
    """

    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.1,
        "comparison_columns": [
            {
                "col_name": "name",
                "term_frequency_adjustments": True,
                "m_probabilities": [
                    0.1,  # Amongst matches, 10% have typos
                    0.9,  # The remaining 90% have a match
                ],
                "u_probabilities": [
                    4 / 5,  # Among non-matches, 80% of the time there's no match
                    # But 20% of the time names 'collide'. We want these
                    # u probabilities to be dependent on name.
                    1 / 5,
                ],
            },
            {
                "col_name": "cat_12",
                "m_probabilities": [0.05, 0.95],
                "u_probabilities": [11 / 12, 1 / 12],
            },
            {
                "col_name": "cat_20",
                "m_probabilities": [0.2, 0.8],
                "u_probabilities": [19 / 20, 1 / 20],
            },
        ],
        "em_convergence": 0.001,
    }

    from string import ascii_letters
    import random
    from splink.settings import complete_settings_dict
    settings = complete_settings_dict(settings, spark="supress_warnings")

    def is_match(settings):
        """Draw 1 with probability ``proportion_of_matches``, otherwise 0."""
        p = settings["proportion_of_matches"]
        return random.choices([0, 1], [1 - p, p])[0]

    def get_row_portion(match, comparison_col, skew="auto"):
        """Generate the left value, right value and gamma for one column.

        With ``skew="auto"`` the column's ``term_frequency_adjustments``
        setting decides whether values are drawn from a skewed distribution.
        """
        # Without skew, a match on john would be just as likely to be a match
        # as a match on james. What we want is to generate more 'collisions'
        # for john than robin, i.e. if it's a non-match, we want more
        # gamma = 1 on name for john.

        if match:
            gamma_pdist = comparison_col["m_probabilities"]
        else:
            gamma_pdist = comparison_col["u_probabilities"]

        # To decide whether gamma = 0 or 1 in the case of skew, we first need
        # to decide what value the left hand column will take (well, what
        # probability it has of selection).

        # How many distinct values should we choose?
        num_values = int(round(1 / comparison_col["u_probabilities"][1]))

        if skew == "auto":
            skew = comparison_col["term_frequency_adjustments"]

        if skew:
            # First value most frequent, last value least frequent
            prob_dist = range(1, num_values + 1)[::-1]
            # Normalise
            prob_dist = [p / sum(prob_dist) for p in prob_dist]

            index_of_value = random.choices(range(num_values), prob_dist)[0]
            if not match:  # If it's a u probability
                # The chance of a collision is the frequency of the value
                this_prob = prob_dist[index_of_value]
                gamma_pdist = [1 - this_prob, this_prob]

        else:
            prob_dist = [1 / num_values] * num_values
            index_of_value = random.choices(range(num_values), prob_dist)[0]

        levels = comparison_col["num_levels"]
        gamma = random.choices(range(levels), gamma_pdist)[0]

        values = ascii_letters[:26]
        if num_values > 26:
            values = [
                a + b for a in ascii_letters[:26] for b in ascii_letters[:26]
            ]  # aa, ab etc

        values = values[:num_values]

        if gamma == 1:
            value_1 = values[index_of_value]
            value_2 = value_1

        if gamma == 0:
            value_1 = values[index_of_value]
            # Redraw the right hand value until it differs from the left
            same_value = True
            while same_value:
                value_2 = random.choices(values, prob_dist)[0]
                if value_1 != value_2:
                    same_value = False

        cname = comparison_col["col_name"]
        return {
            f"{cname}_l": value_1,
            f"{cname}_r": value_2,
            f"gamma_{cname}": gamma
        }

    import uuid
    rows = []
    for _ in range(100000):
        m = is_match(settings)
        row = {
            "unique_id_l": str(uuid.uuid4()),
            "unique_id_r": str(uuid.uuid4()),
            "match": m
        }
        for cc in settings["comparison_columns"]:
            row_portion = get_row_portion(m, cc)
            row = {**row, **row_portion}
        rows.append(row)

    all_rows = pd.DataFrame(rows)
    df_gammas = spark.createDataFrame(all_rows)

    # cat_12 was generated without skew above; switching the flag on only now
    # means its adjustments are computed on uniformly distributed values.
    settings["comparison_columns"][1]["term_frequency_adjustments"] = True

    from splink import Splink
    from splink.params import Params
    from splink.iterate import iterate
    from splink.term_frequencies import make_adjustment_for_term_frequencies

    # We have a table of gammas - need to work from there within splink
    params = Params(settings, spark)

    df_e = iterate(df_gammas, params, settings, spark, compute_ll=False)

    df_e_adj = make_adjustment_for_term_frequencies(
        df_e, params, settings, retain_adjustment_columns=True, spark=spark)

    # Registered once; every query below reads from this view.
    df_e_adj.createOrReplaceTempView("df_e_adj")
    sql = """
    select name_l, name_tf_adj,  count(*)
    from df_e_adj
    where name_l = name_r
    group by name_l, name_tf_adj
    order by name_l
    """
    df = spark.sql(sql).toPandas()
    df = df.set_index("name_l")
    df_dict = df.to_dict(orient='index')
    assert df_dict['a']["name_tf_adj"] < 0.5

    assert df_dict['e']["name_tf_adj"] > 0.5
    # Arbitrary numbers, but we do expect a big uplift here
    assert df_dict['e']["name_tf_adj"] > 0.6
    assert df_dict['e']["name_tf_adj"] < 0.95

    sql = """
    select cat_12_l, cat_12_tf_adj,  count(*) as count
    from df_e_adj
    where cat_12_l = cat_12_r
    group by cat_12_l, cat_12_tf_adj
    order by cat_12_l
    """
    df = spark.sql(sql).toPandas()
    # Keep these loose because when generating random data anything can happen!
    assert df["cat_12_tf_adj"].max() < 0.55
    assert df["cat_12_tf_adj"].min() > 0.45

    # Test adjustments are applied correctly when there is one
    sql = """
    select *
    from df_e_adj
    where name_l = name_r and cat_12_l != cat_12_r
    limit 1
    """
    df = spark.sql(sql).toPandas()
    df_dict = df.loc[0, :].to_dict()

    def bayes(p1, p2):
        """Combine two probabilities with Bayes' rule."""
        return p1 * p2 / (p1 * p2 + (1 - p1) * (1 - p2))

    assert df_dict["tf_adjusted_match_prob"] == pytest.approx(
        bayes(df_dict["match_probability"], df_dict["name_tf_adj"]))

    # Test adjustments are applied correctly when there are multiple
    sql = """
    select *
    from df_e_adj
    where name_l = name_r and cat_12_l = cat_12_r
    limit 1
    """
    df = spark.sql(sql).toPandas()
    df_dict = df.loc[0, :].to_dict()

    double_b = bayes(
        bayes(df_dict["match_probability"], df_dict["name_tf_adj"]),
        df_dict["cat_12_tf_adj"])

    assert df_dict["tf_adjusted_match_prob"] == pytest.approx(double_b)
Пример #6
0
def test_iterate(spark, sqlite_con_1, params_1, gamma_settings_1):
    """Check single EM iterations and the parameter history kept by params_1.

    Runs ``iterate`` twice with ``max_iterations = 1`` on the blocked ``test1``
    table and checks, in order: the fitted parameters and match probabilities
    after the first iteration, the parameters after the second iteration, the
    stored parameter history, the row representations of the parameters, and
    that lambda survives a save to / load from a JSON file.
    """

    original_params = copy.deepcopy(params_1.params)
    dfpd = pd.read_sql("select * from test1", sqlite_con_1)
    df = spark.createDataFrame(dfpd)

    rules = [
        "l.mob = r.mob",
        "l.surname = r.surname",
    ]

    gamma_settings_1["blocking_rules"] = rules

    df_comparison = block_using_rules(gamma_settings_1, df=df, spark=spark)

    df_gammas = add_gammas(df_comparison, gamma_settings_1, spark)

    gamma_settings_1["max_iterations"] = 1
    df_e = iterate(df_gammas, params_1, gamma_settings_1, spark)

    assert params_1.params["λ"] == pytest.approx(0.540922141)

    assert params_1.params["π"]["gamma_mob"]["prob_dist_match"]["level_0"][
        "probability"] == pytest.approx(0.087438272, abs=0.0001)
    assert params_1.params["π"]["gamma_surname"]["prob_dist_non_match"][
        "level_1"]["probability"] == pytest.approx(0.160167628, abs=0.0001)

    first_it_params = copy.deepcopy(params_1.params)

    df_e_pd = df_e.toPandas()
    df_e_pd = df_e_pd.sort_values(["unique_id_l", "unique_id_r"])

    correct_list = [
        0.658602114,
        0.796821727,
        0.796821727,
        0.189486495,
        0.189486495,
        0.658602114,
        0.495063367,
        0.495063367,
    ]
    result_list = list(df_e_pd["match_probability"].astype(float))

    for result, correct in zip(result_list, correct_list):
        assert result == pytest.approx(correct, abs=0.0001)

    # Does it still work with another iteration?
    gamma_settings_1["max_iterations"] = 1
    df_e = iterate(df_gammas, params_1, gamma_settings_1, spark)
    assert params_1.params["λ"] == pytest.approx(0.534993426, abs=0.0001)

    assert params_1.params["π"]["gamma_mob"]["prob_dist_match"]["level_0"][
        "probability"] == pytest.approx(0.088546179, abs=0.0001)
    assert params_1.params["π"]["gamma_surname"]["prob_dist_non_match"][
        "level_1"]["probability"] == pytest.approx(0.109234086, abs=0.0001)

    # Test whether the params object is correctly storing the iteration history
    assert params_1.param_history[0] == original_params
    assert params_1.param_history[1] == first_it_params

    # Check that the row representation of the original params contains the
    # expected entries
    data = params_1._convert_params_dict_to_dataframe(original_params)
    val1 = {
        "gamma": "gamma_mob",
        "match": 0,
        "value_of_gamma": "level_0",
        "probability": 0.8,
        "value": 0,
        "column": "mob",
    }
    val2 = {
        "gamma": "gamma_surname",
        "match": 1,
        "value_of_gamma": "level_1",
        "probability": 0.2,
        "value": 1,
        "column": "surname",
    }

    assert val1 in data
    assert val2 in data

    correct_list = [{
        "iteration": 0,
        "λ": 0.4
    }, {
        "iteration": 1,
        "λ": 0.540922141
    }]

    result_list = params_1._iteration_history_df_lambdas()

    for result, correct in zip(result_list, correct_list):
        assert result["iteration"] == correct["iteration"]
        assert result["λ"] == pytest.approx(correct["λ"])

    result_list = params_1._iteration_history_df_gammas()

    val1 = {
        "iteration": 0,
        "gamma": "gamma_mob",
        "match": 0,
        "value_of_gamma": "level_0",
        "probability": 0.8,
        "value": 0,
        "column": "mob",
    }
    assert val1 in result_list

    val2 = {
        "iteration": 1,
        "gamma": "gamma_surname",
        "match": 0,
        "value_of_gamma": "level_1",
        "probability": 0.160167628,
        "value": 1,
        "column": "surname",
    }

    # Pick out the history row for iteration 1 / surname / non-match / value 1.
    # If several rows qualify the last one is used.
    record = None
    for r in result_list:
        if (r["iteration"] == 1 and r["gamma"] == "gamma_surname"
                and r["match"] == 0 and r["value"] == 1):
            record = r

    # Fail with a clear message rather than a NameError when nothing matched
    assert record is not None, "no matching row found in the gamma history"

    for k, v in record.items():
        expected_value = val2[k]
        if k == "probability":
            assert v == pytest.approx(expected_value, abs=0.0001)
        else:
            assert v == expected_value

    # Test whether saving and loading parameters works
    import tempfile

    from splink.params import load_params_from_json

    # The context manager removes the directory even if an assertion fails
    with tempfile.TemporaryDirectory() as tmp_dir:
        fname = os.path.join(tmp_dir, "params.json")
        params_1.save_params_to_json_file(fname)
        p = load_params_from_json(fname)

    assert p.params["λ"] == pytest.approx(params_1.params["λ"])
def test_term_frequency_adjustments(spark):
    """Check term frequency adjustments using binarised multi-level gammas.

    Strategy: generate a fake dataframe whose gamma levels model a frequency
    imbalance between names:

    * gamma = 3: the name matches and is unusual (Robin / Linacre)
    * gamma = 2: the name matches and is ordinary (Matt / Hughes)
    * gamma = 1: the name matches and is very common (John / Smith)

    The imbalance is then hidden by pooling every gamma > 0 into a single
    binary level. Term frequency adjustments are expected to adjust the rare
    names up and the common names down, and the adjusted match probability is
    expected to be closer to the true match probability.
    """
    forename_probs = _probabilities_from_freqs([3, 2, 1])
    surname_probs = _probabilities_from_freqs([10, 5, 1])

    settings_true = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.5,
        "comparison_columns": [
            {
                "col_name": "forename",
                "term_frequency_adjustments": True,
                "m_probabilities": forename_probs["m_probabilities"],
                "u_probabilities": forename_probs["u_probabilities"],
                "num_levels": 4,
            },
            {
                "col_name": "surname",
                "term_frequency_adjustments": True,
                "m_probabilities": surname_probs["m_probabilities"],
                "u_probabilities": surname_probs["u_probabilities"],
                "num_levels": 4,
            },
            {
                "col_name": "cat_20",
                "m_probabilities": [0.2, 0.8],
                "u_probabilities": [19 / 20, 1 / 20],
            },
        ],
    }
    settings_true = complete_settings_dict(settings_true, spark)

    df = generate_df_gammas_random(10000, settings_true)

    # Binarise the more granular gammas: 0 stays 0, anything else becomes 1
    for col in ("forename", "surname"):
        granular = df[f"gamma_{col}"]
        df[f"gamma_{col}_binary"] = granular.where(granular == 0, 1)

    # Non-matches keep a unique value (the record id) on each side; matching
    # levels get the same name on both sides, chosen by the gamma level.
    names_by_column = (
        ("forename", ((3, "Robin"), (2, "Matt"), (1, "John"))),
        ("surname", ((3, "Linacre"), (2, "Hughes"), (1, "Smith"))),
    )
    for col, level_names in names_by_column:
        left_col = f"{col}_binary_l"
        right_col = f"{col}_binary_r"
        df[left_col] = df["unique_id_l"]
        df[right_col] = df["unique_id_r"]
        for level, name in level_names:
            at_level = df[f"gamma_{col}"] == level
            df.loc[at_level, left_col] = name
            df.loc[at_level, right_col] = name

    # cat_20 gets the same treatment with a single matching value
    df["cat_20_l"] = df["unique_id_l"]
    df["cat_20_r"] = df["unique_id_r"]
    cat_20_agrees = df["gamma_cat_20"] == 1
    df.loc[cat_20_agrees, "cat_20_l"] = "a"
    df.loc[cat_20_agrees, "cat_20_r"] = "a"

    df = add_match_prob(df, settings_true)
    df["match_probability"] = df["true_match_probability_l"]

    df_e = spark.createDataFrame(df)

    def four_to_two(probs):
        """Collapse a probability list to two levels: first entry vs the rest."""
        head, tail = probs[0], probs[1:]
        return [head, sum(tail)]

    settings_binary = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.5,
        "comparison_columns": [
            {
                "col_name": "forename_binary",
                "term_frequency_adjustments": True,
                "num_levels": 2,
                "m_probabilities": four_to_two(
                    forename_probs["m_probabilities"]),
                "u_probabilities": four_to_two(
                    forename_probs["u_probabilities"]),
            },
            {
                "col_name": "surname_binary",
                "term_frequency_adjustments": True,
                "num_levels": 2,
                "m_probabilities": four_to_two(
                    surname_probs["m_probabilities"]),
                "u_probabilities": four_to_two(
                    surname_probs["u_probabilities"]),
            },
            {
                "col_name": "cat_20",
                "m_probabilities": [0.2, 0.8],
                "u_probabilities": [19 / 20, 1 / 20],
            },
        ],
        "retain_intermediate_calculation_columns": True,
        "max_iterations": 0,
        "additional_columns_to_retain": ["true_match_probability"],
    }

    # Can't use linker = Splink() because we have df_gammas, not df
    settings_binary = complete_settings_dict(settings_binary, spark)
    model = Model(settings_binary, spark)
    df_e = iterate(df_e, model, spark)

    df_e = make_adjustment_for_term_frequencies(
        df_e, model, spark, retain_adjustment_columns=True)

    results = df_e.toPandas()

    #########
    # Tests start here
    #########

    # The overall squared error should be lower for the tf adjusted match prob
    truth = results["true_match_probability_l"]
    unadjusted_sq_err = (results["match_probability"] - truth)**2
    adjusted_sq_err = (results["tf_adjusted_match_prob"] - truth)**2
    assert unadjusted_sq_err.sum() > adjusted_sq_err.sum()

    # Common names (John, Smith) should be adjusted down, rare names
    # (Robin, Linacre) should be adjusted up
    direction_checks = (
        ("forename_binary", "John", False),
        ("forename_binary", "Robin", True),
        ("surname_binary", "Smith", False),
        ("surname_binary", "Linacre", True),
    )
    for col, name, adjusted_up in direction_checks:
        with_name = results[results[f"{col}_l"] == name]
        mean_adj = with_name[f"{col}_tf_adj"].mean()
        if adjusted_up:
            assert mean_adj > 0.5
        else:
            assert mean_adj < 0.5

    # Check adjustments are applied correctly
    is_robin = results["forename_binary_l"] == "Robin"
    is_linacre = results["surname_binary_l"] == "Linacre"
    robin_linacre = results[is_robin & is_linacre]
    row = robin_linacre.head(1).to_dict(orient="records")[0]

    prior = row["match_probability"]
    posterior = row["tf_adjusted_match_prob"]

    b1 = row["forename_binary_tf_adj"]
    b2 = row["surname_binary_tf_adj"]

    numerator = prior * b1 * b2
    expected_post = numerator / (numerator + (1 - prior) * (1 - b1) * (1 - b2))
    assert posterior == pytest.approx(expected_post)

    # Where neither surname nor forename match, the tf adjusted match
    # probability should equal the plain match probability
    surname_differs = results["surname_binary_l"] != results["surname_binary_r"]
    forename_differs = (results["forename_binary_l"] !=
                        results["forename_binary_r"])

    no_name_match = results[surname_differs & forename_differs]
    sum_difference = (no_name_match["tf_adjusted_match_prob"] -
                      no_name_match["match_probability"]).sum()

    assert 0 == pytest.approx(sum_difference)