Example #1
def gamma_settings_1():
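    # Presumably a pytest fixture (note the yield): builds and yields a completed
    # settings dict for a dedupe model with a default 2-level "mob" comparison
    # and a custom 3-level "surname" case expression.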
    gamma_settings = {
        "link_type":
        "dedupe_only",
        "proportion_of_matches":
        0.4,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "m_probabilities": [0.1, 0.9],
                "u_probabilities": [0.8, 0.2],
            },
            {
                "col_name": "surname",
                "num_levels": 3,
                "case_expression": """
            case
            when surname_l is null or surname_r is null then -1
            when surname_l = surname_r then 2
            when substr(surname_l,1, 3) =  substr(surname_r, 1, 3) then 1
            else 0
            end
            as gamma_surname
            """,
                "m_probabilities": [0.1, 0.2, 0.7],
                "u_probabilities": [0.5, 0.25, 0.25],
            },
        ],
        "blocking_rules": []
    }
    gamma_settings = complete_settings_dict(gamma_settings,
                                            spark="supress_warnings")
    yield gamma_settings
Example #2
def test_main_api(spark, sqlite_con_1):
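    # End-to-end check of the high-level API: build a Splink linker from a
    # settings dict and a Spark DataFrame, score comparisons, round-trip the
    # model through JSON, and print an intuition report for one scored row.
    # (pd, Splink, load_from_json and complete_settings_dict are assumed to be
    # imported elsewhere in the test module.)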

    settings = {
        "link_type": "dedupe_only",
        "comparison_columns": [{
            "col_name": "surname"
        }, {
            "col_name": "mob"
        }],
        "blocking_rules": ["l.mob = r.mob", "l.surname = r.surname"],
        "max_iterations": 2
    }
    settings = complete_settings_dict(settings, spark=None)
    dfpd = pd.read_sql("select * from test1", sqlite_con_1)

    df = spark.createDataFrame(dfpd)

    linker = Splink(settings, spark, df=df)
    df_e = linker.get_scored_comparisons()
    linker.save_model_as_json("saved_model.json", overwrite=True)
    linker_2 = load_from_json("saved_model.json", spark=spark, df=df)
    df_e = linker_2.get_scored_comparisons()

    from splink.intuition import intuition_report
    params = linker.params
    row_dict = df_e.toPandas().sample(1).to_dict(orient="records")[0]
    print(intuition_report(row_dict, params))

    linker.params._print_m_u_probs()
Example #3
def test_tiny_numbers(spark, sqlite_con_1):

    # Regression test, see https://github.com/moj-analytical-services/splink/issues/48
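    # An extremely small m_probability (~6e-25) is used below; the blocking,
    # gamma and expectation steps should all run without numerical errors.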

    dfpd = pd.read_sql("select * from test1", sqlite_con_1)
    df = spark.createDataFrame(dfpd)

    settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches": 0.4,
        "comparison_columns": [
            {
                "col_name": "mob",
                "num_levels": 2,
                "m_probabilities": [5.9380419956766985e-25, 1 - 5.9380419956766985e-25],
                "u_probabilities": [0.8, 0.2],
            },
            {"col_name": "surname", "num_levels": 2,},
        ],
        "blocking_rules": ["l.mob = r.mob", "l.surname = r.surname",],
    }

    settings = complete_settings_dict(settings, spark=None)

    df_comparison = block_using_rules(settings, df=df, spark=spark)

    df_gammas = add_gammas(df_comparison, settings, spark)
    params = Params(settings, spark="supress_warnings")

    df_e = run_expectation_step(df_gammas, params, settings, spark)
Example #4
def gamma_settings_4():
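    # Presumably a pytest fixture: yields completed settings for three columns,
    # each compared with a strict-equality case statement
    # (sql_gen_case_smnt_strict_equality_2) over two gamma levels.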
    gamma_settings = {
        "link_type": "dedupe_only",
        "proportion_of_matches":0.9,
        "comparison_columns": [
            {
                "col_name": "col_2_levels",
                "num_levels": 2,
                "case_expression": sql_gen_case_smnt_strict_equality_2("col_2_levels"),
            },
            {
                "col_name": "col_5_levels",
                "num_levels": 2,
                "case_expression": sql_gen_case_smnt_strict_equality_2("col_5_levels"),
            },
            {
                "col_name": "col_20_levels",
                "num_levels": 2,
                "case_expression": sql_gen_case_smnt_strict_equality_2("col_20_levels"),
            },
        ],
        "blocking_rules": []
    }

    gamma_settings = complete_settings_dict(gamma_settings, spark="supress_warnings")
    yield gamma_settings
Example #5
def test_link_option_link(spark, link_dedupe_data_repeat_ids):
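    # link_only: comparisons are generated between df_l and df_r (never within
    # a single frame) using the two blocking rules; the second half repeats the
    # check with no blocking rules, expecting the full cartesian product.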
    settings = {
        "link_type": "link_only",
        "comparison_columns": [{
            "col_name": "first_name"
        }, {
            "col_name": "surname"
        }],
        "blocking_rules":
        ["l.first_name = r.first_name", "l.surname = r.surname"]
    }
    settings = complete_settings_dict(settings, spark=None)
    dfpd_l = pd.read_sql("select * from df_l", link_dedupe_data_repeat_ids)
    df_l = spark.createDataFrame(dfpd_l)
    dfpd_r = pd.read_sql("select * from df_r", link_dedupe_data_repeat_ids)
    df_r = spark.createDataFrame(dfpd_r)
    df = block_using_rules(settings, spark, df_l=df_l, df_r=df_r)
    df = df.toPandas()

    df = df.sort_values(["unique_id_l", "unique_id_r"])

    assert list(df["unique_id_l"]) == [1, 1, 2, 2, 3, 3]
    assert list(df["unique_id_r"]) == [1, 3, 2, 3, 2, 3]

    # Test cartesian version

    settings = {
        "link_type": "link_only",
        "comparison_columns": [{
            "col_name": "first_name"
        }, {
            "col_name": "surname"
        }],
        "blocking_rules": []
    }
    settings = complete_settings_dict(settings, spark=None)
    dfpd_l = pd.read_sql("select * from df_l", link_dedupe_data_repeat_ids)
    df_l = spark.createDataFrame(dfpd_l)
    dfpd_r = pd.read_sql("select * from df_r", link_dedupe_data_repeat_ids)
    df_r = spark.createDataFrame(dfpd_r)
    df = block_using_rules(settings, spark, df_l=df_l, df_r=df_r)
    df = df.toPandas()

    df = df.sort_values(["unique_id_l", "unique_id_r"])

    assert list(df["unique_id_l"]) == [1, 1, 1, 2, 2, 2, 3, 3, 3]
    assert list(df["unique_id_r"]) == [1, 2, 3, 1, 2, 3, 1, 2, 3]
Example #6
def test_no_blocking(spark, link_dedupe_data):
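    # With an empty blocking_rules list, link_only blocking should fall back to
    # the cartesian product of df_l and df_r.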
    settings = {
        "link_type": "link_only",
        "comparison_columns": [{"col_name": "first_name"},
                            {"col_name": "surname"}],
        "blocking_rules": []
    }
    settings = complete_settings_dict(settings, spark=None)
    dfpd_l = pd.read_sql("select * from df_l", link_dedupe_data)
    dfpd_r = pd.read_sql("select * from df_r", link_dedupe_data)
    df_l = spark.createDataFrame(dfpd_l)
    df_r = spark.createDataFrame(dfpd_r)


    df_comparison = block_using_rules(settings, spark, df_l=df_l, df_r=df_r)
    df = df_comparison.toPandas()
    df = df.sort_values(["unique_id_l", "unique_id_r"])

    assert list(df["unique_id_l"]) == [1,1,1,2,2,2]
    assert list(df["unique_id_r"]) == [7,8,9,7,8,9]
Example #7
def test_link_option_dedupe_only(spark, link_dedupe_data_repeat_ids):
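    # dedupe_only: the single input frame is blocked against itself using the
    # two blocking rules; the assertions pin down which record pair survives.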
    settings = {
        "link_type": "dedupe_only",
        "comparison_columns": [{"col_name": "first_name"},
                            {"col_name": "surname"}],
        "blocking_rules": [
            "l.first_name = r.first_name",
            "l.surname = r.surname"
        ]
    }
    settings = complete_settings_dict(settings, spark=None)
    dfpd = pd.read_sql("select * from df_l", link_dedupe_data_repeat_ids)
    df = spark.createDataFrame(dfpd)

    df = block_using_rules(settings, spark, df=df)
    df = df.toPandas()

    df = df.sort_values(["unique_id_l", "unique_id_r"])

    assert list(df["unique_id_l"]) == [2]
    assert list(df["unique_id_r"]) == [3]
Example #8
def test_term_frequency_adjustments(spark):
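    # Simulate ~100k comparisons in which some name values are far more common
    # than others, run EM on the resulting gamma table, and check that
    # make_adjustment_for_term_frequencies pushes match probability down for
    # common names, up for rare ones, and combines adjustments via Bayes' rule.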

    settings = {
        "link_type":
        "dedupe_only",
        "proportion_of_matches":
        0.1,
        "comparison_columns": [
            {
                "col_name":
                "name",
                "term_frequency_adjustments":
                True,
                "m_probabilities": [
                    0.1,  # Amonst matches, 10% are have typose
                    0.9  # The reamining 90% have a match
                ],
                "u_probabilities": [
                    4 /
                    5,  # Among non matches, 80% of the time there's no match
                    1 /
                    5  # But 20% of the time names 'collide'  WE WANT THESE U PROBABILITIES TO BE DEPENDENT ON NAME.  
                ],
            },
            {
                "col_name": "cat_12",
                "m_probabilities": [0.05, 0.95],
                "u_probabilities": [11 / 12, 1 / 12],
            },
            {
                "col_name": "cat_20",
                "m_probabilities": [0.2, 0.8],
                "u_probabilities": [19 / 20, 1 / 20],
            }
        ],
        "em_convergence":
        0.001
    }

    from string import ascii_letters
    import statistics
    import random
    from splink.settings import complete_settings_dict
    settings = complete_settings_dict(settings, spark="supress_warnings")

    def is_match(settings):
        p = settings["proportion_of_matches"]
        return random.choices([0, 1], [1 - p, p])[0]

    def get_row_portion(match, comparison_col, skew="auto"):
        # Problem: at the moment we're guaranteeing that a match on 'john' is
        # just as likely to be a true match as a match on 'james'.

        # What we want is to generate more 'collisions' for 'john' than for
        # 'robin', i.e. if it's a non-match, we want more gamma = 1 on name
        # for 'john'.

        if match:
            gamma_pdist = comparison_col["m_probabilities"]
        else:
            gamma_pdist = comparison_col["u_probabilities"]

        # To decide whether gamma = 0 or 1 in the case of skew, we first need
        # to decide what value the left-hand column will take (or rather, what
        # probability it has of being selected).

        # How many distinct values should we choose?
        num_values = int(round(1 / comparison_col["u_probabilities"][1]))

        if skew == "auto":
            skew = comparison_col["term_frequency_adjustments"]

        if skew:

            # First value most frequent, last value least frequent
            prob_dist = range(1, num_values + 1)[::-1]
            # Normalise
            prob_dist = [p / sum(prob_dist) for p in prob_dist]

            index_of_value = random.choices(range(num_values), prob_dist)[0]
            if not match:  # If it's a u probability
                this_prob = prob_dist[index_of_value]
                gamma_pdist = [1 - this_prob, this_prob]

        else:
            prob_dist = [1 / num_values] * num_values
            index_of_value = random.choices(range(num_values), prob_dist)[0]

        levels = comparison_col["num_levels"]
        gamma = random.choices(range(levels), gamma_pdist)[0]

        values = ascii_letters[:26]
        if num_values > 26:
            values = [
                a + b for a in ascii_letters[:26] for b in ascii_letters[:26]
            ]  #aa, ab etc

        values = values[:num_values]

        if gamma == 1:
            value_1 = values[index_of_value]
            value_2 = value_1

        if gamma == 0:
            value_1 = values[index_of_value]
            same_value = True
            while same_value:
                value_2 = random.choices(values, prob_dist)[0]
                if value_1 != value_2:
                    same_value = False

        cname = comparison_col["col_name"]
        return {
            f"{cname}_l": value_1,
            f"{cname}_r": value_2,
            f"gamma_{cname}": gamma
        }

    import uuid
    rows = []
    for uid in range(100000):
        m = is_match(settings)
        row = {
            "unique_id_l": str(uuid.uuid4()),
            "unique_id_r": str(uuid.uuid4()),
            "match": m
        }
        for cc in settings["comparison_columns"]:
            row_portion = get_row_portion(m, cc)
            row = {**row, **row_portion}
        rows.append(row)

    all_rows = pd.DataFrame(rows)
    df_gammas = spark.createDataFrame(all_rows)

    settings["comparison_columns"][1]["term_frequency_adjustments"] = True

    from splink import Splink
    from splink.params import Params
    from splink.iterate import iterate
    from splink.term_frequencies import make_adjustment_for_term_frequencies

    # We have table of gammas - need to work from there within splink
    params = Params(settings, spark)

    df_e = iterate(df_gammas, params, settings, spark, compute_ll=False)

    df_e_adj = make_adjustment_for_term_frequencies(
        df_e, params, settings, retain_adjustment_columns=True, spark=spark)

    df_e_adj.createOrReplaceTempView("df_e_adj")
    sql = """
    select name_l, name_tf_adj,  count(*)
    from df_e_adj
    where name_l = name_r
    group by name_l, name_tf_adj
    order by name_l
    """
    df = spark.sql(sql).toPandas()
    df = df.set_index("name_l")
    df_dict = df.to_dict(orient='index')
    assert df_dict['a']["name_tf_adj"] < 0.5

    assert df_dict['e']["name_tf_adj"] > 0.5
    # Arbitrary numbers, but we do expect a big uplift here
    assert df_dict['e']["name_tf_adj"] > 0.6
    assert df_dict['e']["name_tf_adj"] < 0.95

    df_e_adj.createOrReplaceTempView("df_e_adj")
    sql = """
    select cat_12_l, cat_12_tf_adj,  count(*) as count
    from df_e_adj
    where cat_12_l = cat_12_r
    group by cat_12_l, cat_12_tf_adj
    order by cat_12_l
    """
    df = spark.sql(sql).toPandas()
    # Keep these bounds loose because when generating random data anything can happen!
    assert df["cat_12_tf_adj"].max() < 0.55
    assert df["cat_12_tf_adj"].min() > 0.45

    # Test adjustments are applied correctly when there is one
    df_e_adj.createOrReplaceTempView("df_e_adj")
    sql = """
    select *
    from df_e_adj
    where name_l = name_r and cat_12_l != cat_12_r
    limit 1
    """
    df = spark.sql(sql).toPandas()
    df_dict = df.loc[0, :].to_dict()

    def bayes(p1, p2):
        return p1 * p2 / (p1 * p2 + (1 - p1) * (1 - p2))

    assert df_dict["tf_adjusted_match_prob"] == pytest.approx(
        bayes(df_dict["match_probability"], df_dict["name_tf_adj"]))

    # Test adjustments are applied correctly when there are multiple
    df_e_adj.createOrReplaceTempView("df_e_adj")
    sql = """
    select *
    from df_e_adj
    where name_l = name_r and cat_12_l = cat_12_r
    limit 1
    """
    df = spark.sql(sql).toPandas()
    df_dict = df.loc[0, :].to_dict()

    double_b = bayes(
        bayes(df_dict["match_probability"], df_dict["name_tf_adj"]),
        df_dict["cat_12_tf_adj"])

    assert df_dict["tf_adjusted_match_prob"] == pytest.approx(double_b)
Example #9
def test_link_option_link_dedupe(spark, link_dedupe_data_repeat_ids):
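    # link_and_dedupe: comparisons are generated both within and between the two
    # frames; ids are suffixed with the first letter of their source table
    # ('l'/'r') so the expected pairs can be asserted. The check is repeated
    # with blocking_rules = [] and with the key omitted entirely, both of which
    # should yield every distinct pair of records.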
    settings = {
        "link_type": "link_and_dedupe",
        "comparison_columns": [{
            "col_name": "first_name"
        }, {
            "col_name": "surname"
        }],
        "blocking_rules":
        ["l.first_name = r.first_name", "l.surname = r.surname"]
    }
    settings = complete_settings_dict(settings, spark=None)
    dfpd_l = pd.read_sql("select * from df_l", link_dedupe_data_repeat_ids)
    df_l = spark.createDataFrame(dfpd_l)
    dfpd_r = pd.read_sql("select * from df_r", link_dedupe_data_repeat_ids)
    df_r = spark.createDataFrame(dfpd_r)
    df = block_using_rules(settings, spark, df_l=df_l, df_r=df_r)
    df = df.toPandas()
    df["u_l"] = df["unique_id_l"].astype(
        str) + df["_source_table_l"].str.slice(0, 1)
    df["u_r"] = df["unique_id_r"].astype(
        str) + df["_source_table_r"].str.slice(0, 1)
    df = df.sort_values(
        ["_source_table_l", "_source_table_r", "unique_id_l", "unique_id_r"])

    assert list(df["u_l"]) == ['2l', '1l', '1l', '2l', '2l', '3l', '3l', '1r', '2r']
    assert list(df["u_r"]) == ['3l', '1r', '3r', '2r', '3r', '2r', '3r', '3r', '3r']

    # Same for no blocking rules = cartesian product

    settings = {
        "link_type": "link_and_dedupe",
        "comparison_columns": [{
            "col_name": "first_name"
        }, {
            "col_name": "surname"
        }],
        "blocking_rules": []
    }
    settings = complete_settings_dict(settings, spark=None)
    dfpd_l = pd.read_sql("select * from df_l", link_dedupe_data_repeat_ids)
    df_l = spark.createDataFrame(dfpd_l)
    dfpd_r = pd.read_sql("select * from df_r", link_dedupe_data_repeat_ids)
    df_r = spark.createDataFrame(dfpd_r)
    df = block_using_rules(settings, spark, df_l=df_l, df_r=df_r)
    df = df.toPandas()

    df["u_l"] = df["unique_id_l"].astype(
        str) + df["_source_table_l"].str.slice(0, 1)
    df["u_r"] = df["unique_id_r"].astype(
        str) + df["_source_table_r"].str.slice(0, 1)
    df = df.sort_values(
        ["_source_table_l", "unique_id_l", "_source_table_r", "unique_id_r"])

    assert list(df["u_l"]) == [
        '1l', '1l', '1l', '1l', '1l', '2l', '2l', '2l', '2l', '3l', '3l', '3l',
        '1r', '1r', '2r'
    ]
    assert list(df["u_r"]) == [
        '2l', '3l', '1r', '2r', '3r', '3l', '1r', '2r', '3r', '1r', '2r', '3r',
        '2r', '3r', '3r'
    ]

    # Same result when blocking_rules is omitted entirely

    settings = {
        "link_type": "link_and_dedupe",
        "comparison_columns": [{
            "col_name": "first_name"
        }, {
            "col_name": "surname"
        }]
    }
    settings = complete_settings_dict(settings, spark=None)
    dfpd_l = pd.read_sql("select * from df_l", link_dedupe_data_repeat_ids)
    df_l = spark.createDataFrame(dfpd_l)
    dfpd_r = pd.read_sql("select * from df_r", link_dedupe_data_repeat_ids)
    df_r = spark.createDataFrame(dfpd_r)
    df = block_using_rules(settings, spark, df_l=df_l, df_r=df_r)
    df = df.toPandas()
    df["u_l"] = df["unique_id_l"].astype(
        str) + df["_source_table_l"].str.slice(0, 1)
    df["u_r"] = df["unique_id_r"].astype(
        str) + df["_source_table_r"].str.slice(0, 1)
    df = df.sort_values(
        ["_source_table_l", "unique_id_l", "_source_table_r", "unique_id_r"])

    assert list(df["u_l"]) == [
        '1l', '1l', '1l', '1l', '1l', '2l', '2l', '2l', '2l', '3l', '3l', '3l',
        '1r', '1r', '2r'
    ]
    assert list(df["u_r"]) == [
        '2l', '3l', '1r', '2r', '3r', '3l', '1r', '2r', '3r', '1r', '2r', '3r',
        '2r', '3r', '3r'
    ]
Example #10
def test_add_gammas(db):
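    # Generate the add-gammas SQL with _sql_gen_add_gammas, run it against the
    # "test2" table via the db cursor fixture, and compare the resulting gamma
    # values to the expected answer; then re-run with
    # retain_matching_columns=True and check the output column order.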

    gamma_settings = {
        "link_type":
        "dedupe_only",
        "proportion_of_matches":
        0.5,
        "comparison_columns": [
            {
                "col_name": "fname",
                "num_levels": 2
            },
            {
                "col_name":
                "sname",
                "num_levels":
                3,
                "case_expression":
                """
                                    case
                                    when sname_l is null or sname_r is null then -1
                                    when sname_l = sname_r then 2
                                    when substr(sname_l,1, 3) =  substr(sname_r, 1, 3) then 1
                                    else 0
                                    end
                                    as gamma_sname
                                    """
            },
        ],
        "blocking_rules": [],
        "retain_matching_columns":
        False
    }

    gamma_settings = complete_settings_dict(gamma_settings,
                                            spark="supress_warnings")

    sql = _sql_gen_add_gammas(gamma_settings, table_name="test2")
    db.execute(sql)
    result = db.fetchall()
    result = [dict(r) for r in result]

    correct_answer = [
        {
            "unique_id_l": 1,
            "unique_id_r": 2,
            "gamma_fname": 1,
            "gamma_sname": 2
        },
        {
            "unique_id_l": 3,
            "unique_id_r": 4,
            "gamma_fname": 1,
            "gamma_sname": 1
        },
        {
            "unique_id_l": 5,
            "unique_id_r": 6,
            "gamma_fname": -1,
            "gamma_sname": -1
        },
        {
            "unique_id_l": 7,
            "unique_id_r": 8,
            "gamma_fname": 0,
            "gamma_sname": 0
        },
    ]

    pd_correct = pd.DataFrame(correct_answer)
    pd_correct = pd_correct.sort_values(["unique_id_l", "unique_id_r"])
    pd_result = pd.DataFrame(result)
    pd_result = pd_result.sort_values(["unique_id_l", "unique_id_r"])

    assert_frame_equal(pd_correct, pd_result)

    gamma_settings["retain_matching_columns"] = True
    sql = _sql_gen_add_gammas(gamma_settings, table_name="test2")

    db.execute(sql)
    result = db.fetchone()
    col_names = list(dict(result).keys())
    correct_col_names = [
        "unique_id_l",
        "unique_id_r",
        "fname_l",
        "fname_r",
        "gamma_fname",
        "sname_l",
        "sname_r",
        "gamma_sname",
    ]
    assert col_names == correct_col_names