def test_jaro_warning(spark):
    assert _check_jaro_registered(spark) == True

    spark.sql("drop temporary function jaro_winkler_sim")
    with pytest.warns(UserWarning):
        assert _check_jaro_registered(spark) == False

    from pyspark.sql.types import DoubleType

    spark.udf.registerJavaFunction(
        "jaro_winkler_sim",
        "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
        DoubleType(),
    )
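
# A minimal sketch of the session setup the test above relies on: the
# Jaro-Winkler similarity UDF must be registered on the SparkSession before
# _check_jaro_registered will pass without warning.  The jar name below is an
# assumption for illustration, not taken from this repo.
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType

spark = (
    SparkSession.builder
    .config("spark.jars", "scala-udf-similarity.jar")  # assumed jar name/path
    .getOrCreate()
)
spark.udf.registerJavaFunction(
    "jaro_winkler_sim",
    "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
    DoubleType(),
)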
def __init__(
    self,
    settings: dict,
    df_or_dfs: Union[DataFrame, List[DataFrame]],
    spark: SparkSession,
    save_state_fn: Callable = None,
    break_lineage_blocked_comparisons: Callable = default_break_lineage_blocked_comparisons,
    break_lineage_scored_comparisons: Callable = default_break_lineage_scored_comparisons,
):
    """Splink data linker

    Provides easy access to the core user-facing functionality of splink

    Args:
        settings (dict): splink settings dictionary
        df_or_dfs (Union[DataFrame, List[DataFrame]]): Either a single Spark
            dataframe to dedupe, or a list of Spark dataframes to link and/or
            dedupe. Where `link_type` is `dedupe_only`, should be a single
            dataframe to dedupe. Where `link_type` is `link_only` or
            `link_and_dedupe`, should be a list of dfs. Requires conformant
            dataframes (i.e. they must have the same columns).
        spark (SparkSession): SparkSession object
        save_state_fn (function, optional): A function provided by the user
            that takes one argument, model (i.e. a Model from splink.model),
            and is executed each iteration. This is a hook that allows the
            user to save the state between iterations, which is mostly useful
            for very large jobs which may need to be restarted from where
            they left off if they fail.
        break_lineage_blocked_comparisons (function, optional): Large jobs
            will likely run into memory errors unless the lineage is broken
            after blocking. This is a user-provided function that takes one
            argument - df - and allows the user to break lineage. For example,
            the function might save df to the AWS s3 file system, and then
            reload it from the saved files.
        break_lineage_scored_comparisons (function, optional): Large jobs
            will likely run into memory errors unless the lineage is broken
            after comparisons are scored and before term frequency
            adjustments. This is a user-provided function that takes one
            argument - df - and allows the user to break lineage. For example,
            the function might save df to the AWS s3 file system, and then
            reload it from the saved files.
    """

    self.spark = spark
    self.break_lineage_blocked_comparisons = break_lineage_blocked_comparisons
    self.break_lineage_scored_comparisons = break_lineage_scored_comparisons

    _check_jaro_registered(spark)

    validate_settings_against_schema(settings)
    validate_link_type(df_or_dfs, settings)

    self.model = Model(settings, spark)
    self.settings_dict = self.model.current_settings_obj.settings_dict
    self.settings_dict = normalise_probabilities(self.settings_dict)
    validate_probabilities(self.settings_dict)

    # dfs is a list of dfs irrespective of whether input was a df or list of dfs
    if type(df_or_dfs) == DataFrame:
        dfs = [df_or_dfs]
    else:
        dfs = df_or_dfs

    self.df = vertically_concatenate_datasets(dfs)
    validate_input_datasets(self.df, self.model.current_settings_obj)
    self.save_state_fn = save_state_fn
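
# A hedged usage sketch for the constructor above.  The import path, class
# name and settings contents are assumptions for illustration, not verbatim
# from the repo: with `link_type` of `dedupe_only` a single dataframe is
# passed, with `link_only` or `link_and_dedupe` a list of conformant
# dataframes is passed.
from splink import Splink  # assumed import path

settings = {"link_type": "dedupe_only"}  # placeholder; a real settings dict has more keys

linker = Splink(settings, df, spark)                  # dedupe a single dataframe
# linker = Splink(settings, [df_a, df_b], spark)      # link and/or dedupe a list of dfs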
def __init__(
    self,
    settings: dict,
    spark: SparkSession,
    df_l: DataFrame = None,
    df_r: DataFrame = None,
    df: DataFrame = None,
    save_state_fn: Callable = None,
    break_lineage_blocked_comparisons: Callable = default_break_lineage_blocked_comparisons,
    break_lineage_scored_comparisons: Callable = default_break_lineage_scored_comparisons,
):
    """splink data linker

    Provides easy access to the core user-facing functionality of splink

    Args:
        settings (dict): splink settings dictionary
        spark (SparkSession): SparkSession object
        df_l (DataFrame, optional): A dataframe to link/dedupe. Where
            `link_type` is `link_only` or `link_and_dedupe`, one of the two
            dataframes to link. Should be omitted where `link_type` is
            `dedupe_only`.
        df_r (DataFrame, optional): A dataframe to link/dedupe. Where
            `link_type` is `link_only` or `link_and_dedupe`, one of the two
            dataframes to link. Should be omitted where `link_type` is
            `dedupe_only`.
        df (DataFrame, optional): The dataframe to dedupe. Where `link_type`
            is `dedupe_only`, the dataframe to dedupe. Should be omitted where
            `link_type` is `link_only` or `link_and_dedupe`.
        save_state_fn (function, optional): A function provided by the user
            that takes two arguments, params and settings, and is executed
            each iteration. This is a hook that allows the user to save the
            state between iterations, which is mostly useful for very large
            jobs which may need to be restarted from where they left off if
            they fail.
        break_lineage_blocked_comparisons (function, optional): Large jobs
            will likely run into memory errors unless the lineage is broken
            after blocking. This is a user-provided function that takes one
            argument - df - and allows the user to break lineage. For example,
            the function might save df to the AWS s3 file system, and then
            reload it from the saved files.
        break_lineage_scored_comparisons (function, optional): Large jobs
            will likely run into memory errors unless the lineage is broken
            after comparisons are scored and before term frequency
            adjustments. This is a user-provided function that takes one
            argument - df - and allows the user to break lineage. For example,
            the function might save df to the AWS s3 file system, and then
            reload it from the saved files.
    """

    self.spark = spark
    self.break_lineage_blocked_comparisons = break_lineage_blocked_comparisons
    self.break_lineage_scored_comparisons = break_lineage_scored_comparisons

    _check_jaro_registered(spark)

    settings = complete_settings_dict(settings, spark)
    validate_settings(settings)
    self.settings = settings

    self.params = Params(settings, spark)

    self.df_r = df_r
    self.df_l = df_l
    self.df = df
    self.save_state_fn = save_state_fn
    self._check_args()
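
# A minimal sketch of a user-provided break-lineage function as described in
# the docstring above: persist the dataframe and read it back so downstream
# stages start from the saved files rather than the full Spark plan.  The
# docstring describes a one-argument function; the checkpoint path is an
# assumption for illustration (the docstring suggests s3 for real large jobs),
# and the exact signature should match what the default implementations expect.
from pyspark.sql import SparkSession


def break_lineage_via_parquet(df):
    path = "/tmp/splink_checkpoint.parquet"  # illustrative path
    df.write.mode("overwrite").parquet(path)
    spark = SparkSession.builder.getOrCreate()
    return spark.read.parquet(path)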
def __init__(
    self,
    settings: dict,
    spark: SparkSession,
    df_l: DataFrame = None,
    df_r: DataFrame = None,
    df: DataFrame = None,
    save_state_fn: Callable = None,
):
    """splink data linker

    Provides easy access to the core user-facing functionality of splink

    Args:
        settings (dict): splink settings dictionary
        spark (SparkSession): SparkSession object
        df_l (DataFrame, optional): A dataframe to link/dedupe. Where
            `link_type` is `link_only` or `link_and_dedupe`, one of the two
            dataframes to link. Should be omitted where `link_type` is
            `dedupe_only`.
        df_r (DataFrame, optional): A dataframe to link/dedupe. Where
            `link_type` is `link_only` or `link_and_dedupe`, one of the two
            dataframes to link. Should be omitted where `link_type` is
            `dedupe_only`.
        df (DataFrame, optional): The dataframe to dedupe. Where `link_type`
            is `dedupe_only`, the dataframe to dedupe. Should be omitted where
            `link_type` is `link_only` or `link_and_dedupe`.
        save_state_fn (function, optional): A function provided by the user
            that takes two arguments, params and settings, and is executed
            each iteration. This is a hook that allows the user to save the
            state between iterations, which is mostly useful for very large
            jobs which may need to be restarted from where they left off if
            they fail.
    """

    self.spark = spark

    _check_jaro_registered(spark)

    settings = complete_settings_dict(settings, spark)
    validate_settings(settings)
    self.settings = settings

    self.params = Params(settings, spark)

    self.df_r = df_r
    self.df_l = df_l
    self.df = df
    self.save_state_fn = save_state_fn
    self._check_args()
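
# An illustrative save_state_fn hook matching the description above: it
# receives params and settings each iteration and persists them so a long job
# can be resumed roughly where it left off.  File names are assumptions, and
# whether the params object pickles cleanly is also an assumption.
import json
import pickle


def save_state(params, settings):
    with open("splink_settings_checkpoint.json", "w") as f:  # illustrative path
        json.dump(settings, f)
    with open("splink_params_checkpoint.pkl", "wb") as f:  # illustrative path
        pickle.dump(params, f)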
def test_case_statements(spark, sqlite_con_3):
    assert _check_jaro_registered(spark) == True

    spark.sql("drop temporary function jaro_winkler_sim")
    with pytest.warns(UserWarning):
        assert _check_jaro_registered(spark) == False

    from pyspark.sql.types import DoubleType

    spark.udf.registerJavaFunction(
        "jaro_winkler_sim",
        "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
        DoubleType(),
    )
    assert _check_jaro_registered(spark) == True

    dfpd = pd.read_sql("select * from str_comp", sqlite_con_3)
    df = spark.createDataFrame(dfpd)
    df.createOrReplaceTempView("str_comp")

    case_statement = sql_gen_case_stmt_levenshtein_3("str_col", "str_col")
    sql = f"""select {case_statement} from str_comp"""
    df = spark.sql(sql).toPandas()

    assert df.loc[0, "gamma_str_col"] == 2
    assert df.loc[1, "gamma_str_col"] == 1
    assert df.loc[2, "gamma_str_col"] == 0
    assert df.loc[3, "gamma_str_col"] == -1
    assert df.loc[4, "gamma_str_col"] == -1

    case_statement = sql_gen_case_stmt_levenshtein_4("str_col", "str_col")
    sql = f"""select {case_statement} from str_comp"""
    df = spark.sql(sql).toPandas()

    assert df.loc[0, "gamma_str_col"] == 3
    assert df.loc[1, "gamma_str_col"] == 2
    assert df.loc[2, "gamma_str_col"] == 0
    assert df.loc[3, "gamma_str_col"] == -1
    assert df.loc[4, "gamma_str_col"] == -1

    case_statement = sql_gen_gammas_case_stmt_jaro_2("str_col", "str_col")
    sql = f"""select {case_statement} from str_comp"""
    df = spark.sql(sql).toPandas()

    assert df.loc[0, "gamma_str_col"] == 1
    assert df.loc[1, "gamma_str_col"] == 1
    assert df.loc[2, "gamma_str_col"] == 0
    assert df.loc[3, "gamma_str_col"] == -1
    assert df.loc[4, "gamma_str_col"] == -1

    case_statement = sql_gen_gammas_case_stmt_jaro_3("str_col", "str_col")
    sql = f"""select {case_statement} from str_comp"""
    df = spark.sql(sql).toPandas()

    assert df.loc[0, "gamma_str_col"] == 2
    assert df.loc[1, "gamma_str_col"] == 2
    assert df.loc[2, "gamma_str_col"] == 0
    assert df.loc[3, "gamma_str_col"] == -1
    assert df.loc[4, "gamma_str_col"] == -1

    case_statement = sql_gen_gammas_case_stmt_jaro_4(
        "str_col", "str_col", threshold3=0.001
    )
    sql = f"""select {case_statement} from str_comp"""
    df = spark.sql(sql).toPandas()

    assert df.loc[0, "gamma_str_col"] == 3
    assert df.loc[1, "gamma_str_col"] == 3
    assert df.loc[2, "gamma_str_col"] == 1
    assert df.loc[3, "gamma_str_col"] == -1
    assert df.loc[4, "gamma_str_col"] == -1

    data = [
        {
            "surname_l": "smith",
            "forename1_l": "john",
            "forename2_l": "david",
            "surname_r": "smith",
            "forename1_r": "john",
            "forename2_r": "david",
        },
        {
            "surname_l": "smith",
            "forename1_l": "john",
            "forename2_l": "david",
            "surname_r": "smithe",
            "forename1_r": "john",
            "forename2_r": "david",
        },
        {
            "surname_l": "smith",
            "forename1_l": "john",
            "forename2_l": "david",
            "surname_r": "john",
            "forename1_r": "smith",
            "forename2_r": "david",
        },
        {
            "surname_l": "smith",
            "forename1_l": "john",
            "forename2_l": "david",
            "surname_r": "john",
            "forename1_r": "david",
            "forename2_r": "smithe",
        },
        {
            "surname_l": "linacre",
            "forename1_l": "john",
            "forename2_l": "david",
            "surname_r": "linaker",
            "forename1_r": "john",
            "forename2_r": "david",
        },
        {
            "surname_l": "smith",
            "forename1_l": "john",
            "forename2_l": "david",
            "surname_r": "john",
            "forename1_r": "david",
            "forename2_r": "smarty",
        },
    ]

    dfpd = pd.DataFrame(data)
    df = spark.createDataFrame(dfpd)
    df.createOrReplaceTempView("df_names")

    sql = sql_gen_gammas_name_inversion_4(
        "surname", ["forename1", "forename2"], "surname"
    )
    df_results = spark.sql(f"select {sql} from df_names").toPandas()

    assert df_results.loc[0, "gamma_surname"] == 3
    assert df_results.loc[1, "gamma_surname"] == 3
    assert df_results.loc[2, "gamma_surname"] == 2
    assert df_results.loc[3, "gamma_surname"] == 2
    assert df_results.loc[4, "gamma_surname"] == 1
    assert df_results.loc[5, "gamma_surname"] == 0