def persistLocal(self, dfName: str, df: DataFrame, persistType: str):
    """Persist the input DataFrame locally (memory/disk/none) and run
    `df.take(1)` to force materialisation."""
    df.persist(
        self.getSparkPersistType(persistTypStr=persistType.upper()))
    if self.__printcount is None:
        # Trigger an action so the persist is actually materialised
        df.take(1)
    return df
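# `getSparkPersistType` is called above but not defined in this excerpt. The
# sketch below shows what it plausibly does: map the upper-cased persist-type
# string to a pyspark StorageLevel. The accepted strings and the fallback
# level are assumptions, not the confirmed implementation.
def getSparkPersistType(self, persistTypStr: str):
    """Sketch: map "MEMORY"/"DISK"/"NONE" to a pyspark StorageLevel."""
    from pyspark import StorageLevel  # local import keeps the sketch self-contained

    mapping = {
        "MEMORY": StorageLevel.MEMORY_ONLY,
        "DISK": StorageLevel.DISK_ONLY,
        "NONE": StorageLevel.NONE,
    }
    # Assumed fallback: Spark's standard MEMORY_AND_DISK for unknown strings
    return mapping.get(persistTypStr, StorageLevel.MEMORY_AND_DISK)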
# Imports required below (normally placed at the top of the module)
from functools import reduce

from pyspark.sql import DataFrame, SparkSession


def _truth_space_table_old(
    df_labels_with_splink_scores: DataFrame,
    spark: SparkSession,
    threshold_actual: float = 0.5,
    score_colname: str = None,
):
    """Create a table of the ROC space, i.e. truth table statistics for each
    discrimination threshold

    Args:
        df_labels_with_splink_scores (DataFrame): A dataframe of labels and
            associated splink scores, usually the output of the
            truth.labels_with_splink_scores function
        threshold_actual (float, optional): Threshold to use in categorising
            clerical match scores into match or no match. Defaults to 0.5.
        score_colname (str, optional): Allows the user to explicitly state the
            name of the column in the Splink dataset containing the Splink
            score. If None, it will be inferred.

    Returns:
        DataFrame: Table of 'truth space', i.e. truth categories for each
            threshold level
    """

    # This dataframe is scanned once per threshold to generate the ROC curve,
    # so cache it
    df_labels_with_splink_scores.persist()

    score_colname = _get_score_colname(df_labels_with_splink_scores, score_colname)

    # Use percentiles of the distinct score values as candidate thresholds
    percentiles = [x / 100 for x in range(0, 101)]

    values_distinct = df_labels_with_splink_scores.select(score_colname).distinct()
    thresholds = values_distinct.stat.approxQuantile(score_colname, percentiles, 0.0)
    thresholds.append(1.01)
    thresholds = sorted(set(thresholds))

    roc_dfs = []
    for thres in thresholds:
        df_e_t = df_e_with_truth_categories(
            df_labels_with_splink_scores, thres, spark, threshold_actual, score_colname
        )
        df_roc_row = _summarise_truth_cats(df_e_t, spark)
        roc_dfs.append(df_roc_row)

    all_roc_df = reduce(DataFrame.unionAll, roc_dfs)

    return all_roc_df
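# A minimal usage sketch for the function above. The column names
# ("tf_adjusted_match_prob" for the Splink score, "clerical_match_score" for
# the human label) follow splink's conventions but are assumptions here;
# substitute whatever columns your labelled dataset actually uses.
if __name__ == "__main__":
    spark = SparkSession.builder.getOrCreate()

    labels = spark.createDataFrame(
        [
            ("a1", "b1", 0.95, 1.0),  # model and clerical review both say match
            ("a2", "b2", 0.40, 1.0),  # model is unsure, clerical says match
            ("a3", "b3", 0.05, 0.0),  # both say no match
        ],
        ["unique_id_l", "unique_id_r", "tf_adjusted_match_prob", "clerical_match_score"],
    )

    truth_space = _truth_space_table_old(labels, spark, threshold_actual=0.5)
    truth_space.show()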