Example #1
    def make_term_frequency_adjustments(self, df_e: DataFrame):
        """Take the outputs of 'get_scored_comparisons' and make term frequency adjustments on designated columns in the settings dictionary

        Args:
            df_e (DataFrame): A dataframe produced by the get_scored_comparisons method

        Returns:
            DataFrame: A spark dataframe including a column with term frequency adjusted match probabilities
        """

        return make_adjustment_for_term_frequencies(
            df_e,
            self.model,
            retain_adjustment_columns=True,
            spark=self.spark,
        )
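
A minimal usage sketch implied by the docstring above, not taken from the example itself: it assumes a splink 1.x-style `linker` object (built elsewhere from a settings dictionary, a source dataframe and a SparkSession) whose class exposes the two methods mentioned.

    # `linker` is a hypothetical, already-constructed Splink linker
    df_e = linker.get_scored_comparisons()  # score record comparisons
    df_e_adj = linker.make_term_frequency_adjustments(df_e)  # adds tf-adjusted match probabilities
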
Example #2
def test_term_frequency_adjustments(spark):

    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.1,
        "comparison_columns": [
            {
                "col_name": "name",
                "term_frequency_adjustments": True,
                "m_probabilities": [
                    0.1,  # Amongst matches, 10% have typos
                    0.9,  # The remaining 90% match exactly
                ],
                "u_probabilities": [
                    4 / 5,  # Among non-matches, 80% of the time the names differ
                    1 / 5,  # But 20% of the time names 'collide'. We want these u probabilities to depend on the name.
                ],
            },
            {
                "col_name": "cat_12",
                "m_probabilities": [0.05, 0.95],
                "u_probabilities": [11 / 12, 1 / 12],
            },
            {
                "col_name": "cat_20",
                "m_probabilities": [0.2, 0.8],
                "u_probabilities": [19 / 20, 1 / 20],
            }
        ],
        "em_convergence":
        0.001
    }

    from string import ascii_letters
    import random

    import pandas as pd  # used for pd.DataFrame below
    import pytest  # used for pytest.approx below

    from splink.settings import complete_settings_dict
    settings = complete_settings_dict(settings, spark="supress_warnings")

    def is_match(settings):
        p = settings["proportion_of_matches"]
        return random.choices([0, 1], [1 - p, p])[0]

    def get_row_portion(match, comparison_col, skew="auto"):
        # The problem is that, at the moment, we're guaranteeing that a match on 'john'
        # is just as likely to be a true match as a match on 'james'

        # What we want is to generate more 'collisions' for john than for robin,
        # i.e. if it's a non-match, we want more gamma = 1 on name for john

        if match:
            gamma_pdist = comparison_col["m_probabilities"]
        else:
            gamma_pdist = comparison_col["u_probabilities"]

        # To decide whether gamma = 0 or 1 in the case of skew, we first need to decide
        # what value the left-hand column will take (or rather, what probability each
        # value has of being selected)

        # How many distinct values should we choose?
        num_values = int(round(1 / comparison_col["u_probabilities"][1]))

        if skew == "auto":
            skew = comparison_col["term_frequency_adjustments"]

        if skew:

            # First value most frequent, last value least frequent
            prob_dist = range(1, num_values + 1)[::-1]
            # Normalise
            prob_dist = [p / sum(prob_dist) for p in prob_dist]

            index_of_value = random.choices(range(num_values), prob_dist)[0]
            if not match:  # If it's a u probability
                this_prob = prob_dist[index_of_value]
                gamma_pdist = [1 - this_prob, this_prob]

        else:
            prob_dist = [1 / num_values] * num_values
            index_of_value = random.choices(range(num_values), prob_dist)[0]

        levels = comparison_col["num_levels"]
        gamma = random.choices(range(levels), gamma_pdist)[0]

        values = ascii_letters[:26]
        if num_values > 26:
            values = [
                a + b for a in ascii_letters[:26] for b in ascii_letters[:26]
            ]  # aa, ab, etc.

        values = values[:num_values]

        if gamma == 1:
            value_1 = values[index_of_value]
            value_2 = value_1

        if gamma == 0:
            value_1 = values[index_of_value]
            same_value = True
            while same_value:
                value_2 = random.choices(values, prob_dist)[0]
                if value_1 != value_2:
                    same_value = False

        cname = comparison_col["col_name"]
        return {
            f"{cname}_l": value_1,
            f"{cname}_r": value_2,
            f"gamma_{cname}": gamma
        }

    import uuid
    rows = []
    for uid in range(100000):
        m = is_match(settings)
        row = {
            "unique_id_l": str(uuid.uuid4()),
            "unique_id_r": str(uuid.uuid4()),
            "match": m
        }
        for cc in settings["comparison_columns"]:
            row_portion = get_row_portion(m, cc)
            row = {**row, **row_portion}
        rows.append(row)

    all_rows = pd.DataFrame(rows)
    df_gammas = spark.createDataFrame(all_rows)

    settings["comparison_columns"][1]["term_frequency_adjustments"] = True

    from splink import Splink
    from splink.params import Params
    from splink.iterate import iterate
    from splink.term_frequencies import make_adjustment_for_term_frequencies

    # We have a table of gammas - need to work from there within splink
    params = Params(settings, spark)

    df_e = iterate(df_gammas, params, settings, spark, compute_ll=False)

    df_e_adj = make_adjustment_for_term_frequencies(
        df_e, params, settings, retain_adjustment_columns=True, spark=spark)

    df_e_adj.createOrReplaceTempView("df_e_adj")
    sql = """
    select name_l, name_tf_adj,  count(*)
    from df_e_adj
    where name_l = name_r
    group by name_l, name_tf_adj
    order by name_l
    """
    df = spark.sql(sql).toPandas()
    df = df.set_index("name_l")
    df_dict = df.to_dict(orient='index')
    assert df_dict['a']["name_tf_adj"] < 0.5

    assert df_dict['e']["name_tf_adj"] > 0.5
    assert df_dict['e'][
        "name_tf_adj"] > 0.6  #Arbitrary numbers, but we do expect a big uplift here
    assert df_dict['e'][
        "name_tf_adj"] < 0.95  #Arbitrary numbers, but we do expect a big uplift here

    df_e_adj.createOrReplaceTempView("df_e_adj")
    sql = """
    select cat_12_l, cat_12_tf_adj,  count(*) as count
    from df_e_adj
    where cat_12_l = cat_12_r
    group by cat_12_l, cat_12_tf_adj
    order by cat_12_l
    """
    df = spark.sql(sql).toPandas()
    # Keep these bounds loose because when generating random data anything can happen!
    assert df["cat_12_tf_adj"].max() < 0.55
    assert df["cat_12_tf_adj"].min() > 0.45

    # Test adjustments are applied correctly when there is a single adjustment
    df_e_adj.createOrReplaceTempView("df_e_adj")
    sql = """
    select *
    from df_e_adj
    where name_l = name_r and cat_12_l != cat_12_r
    limit 1
    """
    df = spark.sql(sql).toPandas()
    df_dict = df.loc[0, :].to_dict()

    def bayes(p1, p2):
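        # Combine the prior match probability p1 with the adjustment p2 as if they were
        # independent Bayes factors: posterior = p1*p2 / (p1*p2 + (1-p1)*(1-p2))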
        return p1 * p2 / (p1 * p2 + (1 - p1) * (1 - p2))

    assert df_dict["tf_adjusted_match_prob"] == pytest.approx(
        bayes(df_dict["match_probability"], df_dict["name_tf_adj"]))

    # Test adjustments are applied correctly when there are multiple adjustments
    df_e_adj.createOrReplaceTempView("df_e_adj")
    sql = """
    select *
    from df_e_adj
    where name_l = name_r and cat_12_l = cat_12_r
    limit 1
    """
    df = spark.sql(sql).toPandas()
    df_dict = df.loc[0, :].to_dict()

    double_b = bayes(
        bayes(df_dict["match_probability"], df_dict["name_tf_adj"]),
        df_dict["cat_12_tf_adj"])

    assert df_dict["tf_adjusted_match_prob"] == pytest.approx(double_b)
Example #3
def test_term_frequency_adjustments(spark):

    # The strategy is going to be to create a fake dataframe
    # where we have different levels to model frequency imbalance
    # gamma=3 is where name matches and name is robin (unusual name)
    # gamma=2 is where name matches and name is matt (normal name)
    # gamma=1 is where name matches and name is john (v common name)

    # We simulate the term frequency imbalance
    # by pooling this together, setting all gamma >0
    # to equal 1

    # We then expect that
    # term frequency adjustments should adjust up the
    # robins but adjust down the johns

    # We also expect that the tf adjusted match probability should be more accurate

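    # _probabilities_from_freqs is assumed here to be a helper defined alongside this
    # test that turns relative term frequencies into m/u probability vectors, one
    # gamma level per term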
    forename_probs = _probabilities_from_freqs([3, 2, 1])
    surname_probs = _probabilities_from_freqs([10, 5, 1])

    settings_true = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.5,
        "comparison_columns": [
            {
                "col_name": "forename",
                "term_frequency_adjustments": True,
                "m_probabilities": forename_probs["m_probabilities"],
                "u_probabilities": forename_probs["u_probabilities"],
                "num_levels": 4,
            },
            {
                "col_name": "surname",
                "term_frequency_adjustments": True,
                "m_probabilities": surname_probs["m_probabilities"],
                "u_probabilities": surname_probs["u_probabilities"],
                "num_levels": 4,
            },
            {
                "col_name": "cat_20",
                "m_probabilities": [0.2, 0.8],
                "u_probabilities": [19 / 20, 1 / 20],
            },
        ],
    }

    settings_true = complete_settings_dict(settings_true, spark)

    df = generate_df_gammas_random(10000, settings_true)

    # Create new binary columns that binarise the more granular gammas to 0 and 1
    df["gamma_forename_binary"] = df["gamma_forename"].where(
        df["gamma_forename"] == 0, 1)

    df["gamma_surname_binary"] = df["gamma_surname"].where(
        df["gamma_surname"] == 0, 1)

    # Populate non-matches with a random value
    # Then assign left and right values based on the gamma values
    df["forename_binary_l"] = df["unique_id_l"]
    df["forename_binary_r"] = df["unique_id_r"]

    f1 = df["gamma_forename"] == 3
    df.loc[f1, "forename_binary_l"] = "Robin"
    df.loc[f1, "forename_binary_r"] = "Robin"

    f1 = df["gamma_forename"] == 2
    df.loc[f1, "forename_binary_l"] = "Matt"
    df.loc[f1, "forename_binary_r"] = "Matt"

    f1 = df["gamma_forename"] == 1
    df.loc[f1, "forename_binary_l"] = "John"
    df.loc[f1, "forename_binary_r"] = "John"

    # Populate non-matches with a random value
    df["surname_binary_l"] = df["unique_id_l"]
    df["surname_binary_r"] = df["unique_id_r"]

    f1 = df["gamma_surname"] == 3
    df.loc[f1, "surname_binary_l"] = "Linacre"
    df.loc[f1, "surname_binary_r"] = "Linacre"

    f1 = df["gamma_surname"] == 2
    df.loc[f1, "surname_binary_l"] = "Hughes"
    df.loc[f1, "surname_binary_r"] = "Hughes"

    f1 = df["gamma_surname"] == 1
    df.loc[f1, "surname_binary_l"] = "Smith"
    df.loc[f1, "surname_binary_r"] = "Smith"

    # cat20
    df["cat_20_l"] = df["unique_id_l"]
    df["cat_20_r"] = df["unique_id_r"]

    f1 = df["gamma_cat_20"] == 1
    df.loc[f1, "cat_20_l"] = "a"
    df.loc[f1, "cat_20_r"] = "a"

    df = add_match_prob(df, settings_true)
    df["match_probability"] = df["true_match_probability_l"]

    df_e = spark.createDataFrame(df)

    def four_to_two(probs):
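        # Pool the three non-zero gamma levels into one, so the four-level m/u
        # probabilities line up with the binarised gamma columns created above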
        return [probs[0], sum(probs[1:])]

    settings_binary = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.5,
        "comparison_columns": [
            {
                "col_name": "forename_binary",
                "term_frequency_adjustments": True,
                "num_levels": 2,
                "m_probabilities":
                four_to_two(forename_probs["m_probabilities"]),
                "u_probabilities":
                four_to_two(forename_probs["u_probabilities"]),
            },
            {
                "col_name": "surname_binary",
                "term_frequency_adjustments": True,
                "num_levels": 2,
                "m_probabilities":
                four_to_two(surname_probs["m_probabilities"]),
                "u_probabilities":
                four_to_two(surname_probs["u_probabilities"]),
            },
            {
                "col_name": "cat_20",
                "m_probabilities": [0.2, 0.8],
                "u_probabilities": [19 / 20, 1 / 20],
            },
        ],
        "retain_intermediate_calculation_columns":
        True,
        "max_iterations":
        0,
        "additional_columns_to_retain": ["true_match_probability"],
    }

    # Can't use linker = Splink() because we have df_gammas, not df
    settings_binary = complete_settings_dict(settings_binary, spark)
    model = Model(settings_binary, spark)
    df_e = iterate(df_e, model, spark)

    df_e = make_adjustment_for_term_frequencies(df_e,
                                                model,
                                                spark,
                                                retain_adjustment_columns=True)

    df = df_e.toPandas()

    #########
    # Tests start here
    #########

    # Test that overall square error is better for tf adjusted match prob
    df["e1"] = (df["match_probability"] - df["true_match_probability_l"])**2
    df["e2"] = (df["tf_adjusted_match_prob"] -
                df["true_match_probability_l"])**2
    assert df["e1"].sum() > df["e2"].sum()

    # We expect Johns to be adjusted down...
    f1 = df["forename_binary_l"] == "John"
    df_filtered = df[f1]
    adj = df_filtered["forename_binary_tf_adj"].mean()
    assert adj < 0.5

    # And Robins to be adjusted up
    f1 = df["forename_binary_l"] == "Robin"
    df_filtered = df[f1]
    adj = df_filtered["forename_binary_tf_adj"].mean()
    assert adj > 0.5

    # We expect Smiths to be adjusted down...
    f1 = df["surname_binary_l"] == "Smith"
    df_filtered = df[f1]
    adj = df_filtered["surname_binary_tf_adj"].mean()
    assert adj < 0.5

    # And Linacres to be adjusted up
    f1 = df["surname_binary_l"] == "Linacre"
    df_filtered = df[f1]
    adj = df_filtered["surname_binary_tf_adj"].mean()
    assert adj > 0.5

    # Check adjustments are applied correctly

    f1 = df["forename_binary_l"] == "Robin"
    f2 = df["surname_binary_l"] == "Linacre"
    df_filtered = df[f1 & f2]
    row = df_filtered.head(1).to_dict(orient="records")[0]

    prior = row["match_probability"]
    posterior = row["tf_adjusted_match_prob"]

    b1 = row["forename_binary_tf_adj"]
    b2 = row["surname_binary_tf_adj"]

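    # Both tf adjustments enter as independent Bayes factors against the prior
    # match probability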
    expected_post = (prior * b1 * b2) / (
        prior * b1 * b2 + (1 - prior) * (1 - b1) * (1 - b2)
    )
    assert posterior == pytest.approx(expected_post)

    #  We expect match probability to be equal to tf_adjusted match probability in cases where surname and forename don't match
    f1 = df["surname_binary_l"] != df["surname_binary_r"]
    f2 = df["forename_binary_l"] != df["forename_binary_r"]

    df_filtered = df[f1 & f2]
    sum_difference = (df_filtered["tf_adjusted_match_prob"] -
                      df_filtered["match_probability"]).sum()

    assert 0 == pytest.approx(sum_difference)