Example #1
    def persistLocal(self, dfName: str, df: DataFrame, persistType: str):
        '''Persist the input DataFrame locally (memory/disk/none) and run `df.take(1)` to force materialisation.
        '''
        df.persist(
            self.getSparkPersistType(persistTypStr=persistType.upper()))

        if self.__printcount is None:
            # Trigger a small action so the persist actually takes effect
            df.take(1)
        return df
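
A minimal usage sketch, assuming getSparkPersistType simply maps a persist-type string onto a pyspark StorageLevel constant (that helper is not shown above, so the version here is an illustrative assumption):

from pyspark import StorageLevel
from pyspark.sql import SparkSession


def getSparkPersistType(persistTypStr: str) -> StorageLevel:
    # Hypothetical mapping, e.g. "MEMORY_ONLY" -> StorageLevel.MEMORY_ONLY;
    # falls back to MEMORY_AND_DISK for unrecognised values
    return getattr(StorageLevel, persistTypStr, StorageLevel.MEMORY_AND_DISK)


spark = SparkSession.builder.getOrCreate()
df = spark.range(10)
df.persist(getSparkPersistType("MEMORY_ONLY"))
df.take(1)  # small action to trigger the persist, mirroring the example above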
from functools import reduce

from pyspark.sql import DataFrame, SparkSession

# _get_score_colname, df_e_with_truth_categories and _summarise_truth_cats are
# helpers defined elsewhere in the same module.


def _truth_space_table_old(
    df_labels_with_splink_scores: DataFrame,
    spark: SparkSession,
    threshold_actual: float = 0.5,
    score_colname: str = None,
):
    """Create a table of the ROC space i.e. truth table statistics
    for each discrimination threshold

    Args:
        df_labels_with_splink_scores (DataFrame): A dataframe of labels and associated splink scores,
            usually the output of the truth.labels_with_splink_scores function
        spark (SparkSession): The SparkSession object
        threshold_actual (float, optional): Threshold to use in categorising clerical match
            scores into match or no match. Defaults to 0.5.
        score_colname (str, optional): Allows the user to explicitly state the name of the
            column in the Splink dataset containing the Splink score. If None, it will be inferred.

    Returns:
        DataFrame: Table of 'truth space' i.e. truth categories for each threshold level
    """

    # This dataframe is scanned once per threshold when generating the ROC curve,
    # so persist it to avoid recomputation
    df_labels_with_splink_scores.persist()

    # Infer the score column name if it was not supplied explicitly
    score_colname = _get_score_colname(df_labels_with_splink_scores,
                                       score_colname)

    # Use the 0th to 100th percentiles of the score as candidate thresholds
    percentiles = [x / 100 for x in range(0, 101)]

    values_distinct = df_labels_with_splink_scores.select(
        score_colname).distinct()
    thresholds = values_distinct.stat.approxQuantile(score_colname,
                                                     percentiles, 0.0)
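    # 1.01 is a sentinel above the maximum possible score, so the final
    # threshold categorises every pair as a predicted non-match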
    thresholds.append(1.01)
    thresholds = sorted(set(thresholds))

    roc_dfs = []
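    # Compute one row of truth-space statistics for each candidate threshold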
    for thres in thresholds:
        df_e_t = df_e_with_truth_categories(df_labels_with_splink_scores,
                                            thres, spark, threshold_actual,
                                            score_colname)
        df_roc_row = _summarise_truth_cats(df_e_t, spark)
        roc_dfs.append(df_roc_row)

    all_roc_df = reduce(DataFrame.unionAll, roc_dfs)
    return all_roc_df
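
A minimal usage sketch, assuming illustrative column names (unique_id_l, unique_id_r, clerical_match_score, match_probability) and toy data; in practice df_labels_with_splink_scores would usually be the output of splink's labels_with_splink_scores step:

spark = SparkSession.builder.getOrCreate()

# Hypothetical labelled comparisons with Splink scores
df_labels_with_splink_scores = spark.createDataFrame(
    [
        (1, 2, 1.0, 0.92),   # clerical match, high Splink score
        (3, 4, 0.0, 0.15),   # clerical non-match, low Splink score
        (5, 6, 1.0, 0.40),   # clerical match, ambiguous Splink score
    ],
    ["unique_id_l", "unique_id_r", "clerical_match_score", "match_probability"],
)

truth_space = _truth_space_table_old(
    df_labels_with_splink_scores,
    spark,
    threshold_actual=0.5,
    score_colname="match_probability",
)
truth_space.show()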