def read_table_bronze_loans_tbl_loans(df: DataFrame, logger: Logger,
                                      dbutils: DBUtils):
    base_year = dbutils.widgets.get("base_year")

    logger.info(f"Using base year: {base_year}")

    return df.filter(f.col("DefaultDate") >= base_year)
Example #2
def clean_immigration(df: SparkDataFrame) -> SparkDataFrame:
    """Clean immigration data

    :param df: immigration data frame to be cleaned.
    :return: cleaned immigration data frame
    """

    drop_cols = [
        'visapost', 'occup', 'entdepu', 'insnum', 'count', 'entdepa',
        'entdepd', 'matflag', 'dtaddto', 'biryear', 'admnum'
    ]
    int_cols = [
        'cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'i94mode', 'i94bir',
        'i94visa', 'dtadfile'
    ]
    date_cols = ['arrdate', 'depdate']
    # SAS dates are stored as a day offset from the 1960-01-01 epoch; convert to 'YYYY-MM-DD'
    date_udf = udf(lambda x: x and (timedelta(days=int(x)) + datetime(
        1960, 1, 1)).strftime('%Y-%m-%d'))

    df = df.drop(*drop_cols)
    df = convert_column_type(df, 'integer', int_cols)
    for col in date_cols:
        df = df.withColumn(col, date_udf(df[col]))

    # Drop rows where any foreign-key column is null
    fk_columns = ['i94cit', 'i94port', 'i94addr']
    for fk_col in fk_columns:
        df = df.filter(df[fk_col].isNotNull())

    return df
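# A small reference sketch of the SAS-date conversion performed by date_udf above
# (i94 dates are stored as a day offset from the 1960-01-01 SAS epoch; the offset
# value below is illustrative):
from datetime import datetime, timedelta

sample_offset = 20566  # days since 1960-01-01
print((timedelta(days=int(sample_offset)) + datetime(1960, 1, 1)).strftime('%Y-%m-%d'))
# -> 2016-04-22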
Example #3
def filter_by_time_context(df: DataFrame,
                           timecontext: Optional[TimeContext] = None
                           ) -> DataFrame:
    """ Filter a Dataframe by given time context
    Parameters
    ----------
    df : pyspark.sql.dataframe.DataFrame
    timecontext: TimeContext

    Returns
    -------
    filtered Spark Dataframe
    """
    if not timecontext:
        return df

    if TIME_COL in df.columns:
        # For py3.8, the underlying Spark type converter calls utctimetuple()
        # and will throw an exception for Timestamp type if tz is set.
        # See https://github.com/pandas-dev/pandas/issues/32174
        # Dropping tz will cause Spark to interpret begin/end with the session
        # timezone & os env TZ. We convert Timestamp to pydatetime as a
        # workaround.
        begin, end = timecontext
        return df.filter((F.col(TIME_COL) >= begin.to_pydatetime())
                         & (F.col(TIME_COL) < end.to_pydatetime()))
    else:
        raise com.TranslationError(
            "'time' column missing in DataFrame {}. To use a time context, "
            "a timestamp column named 'time' must be present "
            "in the table.".format(df))
    def process_batch(self, df: DataFrame, batch_id):
        checked_df = df.filter(f"{FieldsName.REQUEST_INFO_REQUEST_PATH} == '/product/car/checked_car_items.action'") \
                        .select(FieldsName.USER_ID, f.explode(FieldsName.ITEMS).alias(FieldsName.PRODUCT_ID)) \
                        .drop(FieldsName.ITEMS) \
                        .dropna(how="any")

        add_car_df = df.filter(f"{FieldsName.REQUEST_INFO_REQUEST_PATH} == '/product/car/add_to_car.action'") \
            .select(FieldsName.USER_ID, FieldsName.PRODUCT_ID) \
            .dropna(how="any")

        collect_df = checked_df.union(add_car_df).distinct()

        for row in collect_df.collect():
            self.send(topic="python_test1",
                      key=row[FieldsName.USER_ID],
                      value=row[FieldsName.PRODUCT_ID])
Example #5
def join_park_violation_with_centerline(df_park_violation: DataFrame,
                                        df_centerline: DataFrame) -> DataFrame:
    """
    Join the park_violation dataframe and the centerline dataframe on borocode, street name and house number.

    Basic steps:
    1. Join odd house numbers against L_LOW_HN & L_HIGH_HN of the centerline data
    2. Join even house numbers against R_LOW_HN & R_HIGH_HN of the centerline data
    3. Borocode and street name are used as additional join criteria

    :param df_park_violation:
    :param df_centerline:
    :return:
    """
    # df_park_violation = df_park_violation.repartition("BOROCODE", "Street Name", "House Number")
    df_park_violation.cache()
    df_centerline.cache()

    # Split violations into odd and even house numbers; odd numbers are matched
    # against the left (L_*) ranges of the centerline, even numbers against the
    # right (R_*) ranges.
    df_park_violation_odd = df_park_violation.filter(F.col("temp") % 2 != 0)
    df_park_violation_even = df_park_violation.filter(F.col("temp") % 2 == 0)
    df_centerline.count()  # action to materialize the centerline cache

    df_joined_1 = (df_park_violation_even.alias("park").join(
        df_centerline.alias("centerline").hint("broadcast"),
        ((F.col("Street Name") == F.col("ST_NAME")) |
         (F.col("Street Name") == F.col("FULL_STREE")))
        & (F.col("park.BOROCODE") == F.col("centerline.BOROCODE"))
        & ((F.col("park.House Number") >= F.col("centerline.R_LOW_HN"))
           & (F.col("park.House Number") <= F.col("centerline.R_HIGH_HN"))),
    ).select("total_cnt", "year", "PHYSICALID"))

    df_joined_2 = (df_park_violation_odd.alias("park").join(
        df_centerline.alias("centerline").hint("broadcast"),
        ((F.col("Street Name") == F.col("ST_NAME")) |
         (F.col("Street Name") == F.col("FULL_STREE")))
        & (F.col("park.BOROCODE") == F.col("centerline.BOROCODE"))
        & ((F.col("park.House Number") >= F.col("centerline.L_LOW_HN"))
           & (F.col("park.House Number") <= F.col("centerline.L_HIGH_HN"))),
    ).select("total_cnt", "year", "PHYSICALID"))
    # Return the union of the two joined dataframes
    return df_joined_1.union(df_joined_2)
Example #6
    def preprocess(self, df: DataFrame):
        preprocess_df = df.filter(~F.isnull("se_property")) \
            .withColumn("se_label", F.lower(F.col("se_label"))) \
            .filter(~F.isnull("se_label")) \
            .withColumn("se_property_type", self.classify("event_sub_type", "se_label")) \
            .withColumn("isSuspect", F.col("isSuspect").cast("int")) \
            .filter(F.col("author_id").isNotNull() | F.col("discovery_id").isNotNull()) \
            .drop_duplicates(["event_id", "user_token", "device_id", "user_ipaddress", "isSuspect"]) \
            .withColumn("hour", F.hour("collector_tstamp"))
        return preprocess_df
def defaults_per_country(df: DataFrame):
    year = dbutils.widgets.get("year")  # noqa: F821
    rating = dbutils.widgets.get("rating")  # noqa: F821

    return (
        df.filter((f.col("Year") == year) & (f.col("Rating") == rating))
        .groupBy("Country")
        .agg(
            f.count("Defaulted").alias("Defaults"),
        )
        .orderBy("Defaults", ascending=False)
    )
Example #8
def transform_parking_violation_data(df: DataFrame,
                                     column: str = "Violation County"
                                     ) -> DataFrame:
    """Transform parking violation data to make it joinable; the high-level steps are:

    1. Added Borocode
    2. Converted house number in case it is separated by '-'
    3. Converted 'Street Name' to upper case
    4. Removed any data having no house number
    """

    df = (
        df.select("Violation County", "House Number", "Street Name",
                  "Summons Number", "Issue Date")
        .distinct()
        .withColumn("year", F.year(F.to_date(F.col("Issue Date"), "MM/dd/yyyy")))
        .orderBy("Violation County", "House Number", "Street Name", "year")
        .coalesce(100)
        .groupBy("Violation County", "House Number", "Street Name", "year")
        .agg({"Summons Number": "count"})
        .withColumnRenamed("count(Summons Number)", "total_cnt")
        .withColumn(
            "BOROCODE",
            F.when(F.col(column).isin(["MAN", "MH", "MN", "NEWY", "NEW Y", "NY"]), 1)
            .when(F.col(column).isin(["BRONX", "BX"]), 2)
            .when(F.col(column).isin(["BK", "K", "KING", "KINGS"]), 3)
            .when(F.col(column).isin(["Q", "QN", "QNS", "QU", "QUEEN"]), 4)
            .when(F.col(column).isin(["R", "RICHMOND"]), 5)
            .otherwise(0),
        )
    )

    # Convert hyphenated house numbers to a numeric value and keep the leading
    # segment in "temp" for the odd/even split performed downstream.
    df = (
        df.filter(F.col("House Number").isNotNull())
        .withColumn("temp", F.split("House Number", "-"))
        .withColumn(
            "House Number",
            F.col("temp").getItem(0).cast("int")
            + F.when(F.col("temp").getItem(1).isNull(), "0")
            .otherwise(F.col("temp").getItem(1)).cast("int") / 1000,
        )
        .withColumn("temp", F.col("temp").getItem(0).cast("int"))
        .withColumn("Street Name", F.upper(F.col("Street Name")))
    )
    return df
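# Worked example of the hyphenated house-number conversion above (plain Python
# mirroring the Spark expressions; the values are illustrative):
for raw in ["123-45", "678"]:
    head, _, tail = raw.partition("-")
    print(raw, "->", int(head) + int(tail or "0") / 1000)
# 123-45 -> 123.045
# 678 -> 678.0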
    def process_batch(self, df: DataFrame, batch_id):
        valid_df = df.filter(
            f"{FieldsName.LOG_LEVEL} == 'INFO' and {FieldsName.MESSAGE} == 'OK'"
        )

        collect_df = valid_df.select(
            FieldsName.USER_ID,
            f.explode(FieldsName.ITEMS).alias(
                FieldsName.PRODUCT_ID)).dropna(how="any")

        for row in collect_df.collect():
            self.send(topic="python_test1",
                      key=row[FieldsName.USER_ID],
                      value=row[FieldsName.PRODUCT_ID])
Example #10
def remove_false_positives(az_phewas_df: DataFrame) -> DataFrame:
    """Remove associations present in the synonymous negative control."""

    false_positives = az_phewas_df.filter(
        col('CollapsingModel') == 'syn').select('Gene',
                                                'Phenotype').distinct()
    true_positives = az_phewas_df.join(false_positives,
                                       on=['Gene', 'Phenotype'],
                                       how='left_anti').distinct()
    logging.info(
        f'{az_phewas_df.count() - true_positives.count()} false positive association records have been dropped.'
    )

    return true_positives
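# Illustrative sketch of the left_anti join used above: rows of the left DataFrame
# are kept only when no match exists on the join keys (toy data; assumes an active
# SparkSession named `spark`):
pairs = spark.createDataFrame([('G1', 'P1'), ('G2', 'P2')], ['Gene', 'Phenotype'])
negatives = spark.createDataFrame([('G2', 'P2')], ['Gene', 'Phenotype'])
pairs.join(negatives, on=['Gene', 'Phenotype'], how='left_anti').show()
# only the ('G1', 'P1') row remains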
def calculate_bathrooms_bedrooms(df: DataFrame):
    """Calculate the average number of bathrooms and bedrooms and save the result as CSV.

    Args:
        df (DataFrame): Spark dataframe
    """
    (df.filter((df.price > 5000) & (df.review_scores_value == 10))
        .agg(f.avg('bathrooms').alias('avg_bathrooms'),
            f.avg('bedrooms').alias('avg_bedrooms'))
        .coalesce(1)
        .write
        .option("header","true")
        .format('csv')
        .save('out/out_2_3.txt'))
Example #12
def clean_temperature(df: SparkDataFrame) -> SparkDataFrame:
    """Clean temperature data

    :param df: temperature data frame to be cleaned.
    :return: cleaned temperature data frame
    """

    df = df.filter(df['AverageTemperature'].isNotNull())
    # rename columns to snake_case
    df = normalize_column_name(
        df,
        lambda x: ''.join(['_' + c.lower() if c.isupper() else c
                           for c in x]).lstrip('_'))
    return df
Example #13
def clean_airports(df: SparkDataFrame) -> SparkDataFrame:
    """Clean airport data

    :param df: airport data frame to be cleaned.
    :return: cleaned airport data frame
    """

    df = df.filter((df['iso_country'] == 'US')
                   & (df['type'].contains('airport'))
                   & (df['local_code'].isNotNull()))
    # iso_region has the form 'US-XX'; keep only the two-letter region code
    df = df \
        .withColumn('iso_region', df['iso_region'].substr(4, 2)) \
        .withColumn('elevation_ft', df['elevation_ft'].cast('float'))

    return df
Example #14
def filter(dataframe: DataFrame, condition):
    """Filters DataFrame's rows using the given condition and value.

    Args:
        dataframe: Spark DataFrame.
        condition: SQL expression with column, operation and value
            to filter the dataframe.

    Returns:
        Filtered dataframe
    """
    if not isinstance(condition, str):
        raise TypeError("condition should be string.")

    return dataframe.filter(condition)
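# Hypothetical example calls for the filter helper above:
#
#     adults = filter(df, "age >= 21 AND country = 'US'")
#     filter(df, 42)  # raises TypeError: condition should be string.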
Example #15
    def _add_data(self,
                  datalist: dict,
                  name: str,
                  table: DataFrame,
                  partition_cols=[]):
        # if name not in datalist:

        if len(partition_cols) > 0:
            p_filter = self.dtm.scoped_partition_filter(
                start=self.partition_start,
                end=self.partition_end,
                partition_col=partition_cols[0],
                input_fmt=self.dtm.partition_dt_format)
            d = Data(table.filter(p_filter), partition_cols)
        else:
            d = Data(table, partition_cols)

        # TODO - Add this back in to support nested partition columns
        # TODO - But it will require a few tweaks
        # if len(partition_cols) > 0 and len(self.partition_start) > 0:
        #     tf_filters = [col(tfc) >= self.partition_start[i] for i, tfc in enumerate(d.partition_cols)]
        #     where_clause = tf_filters[0]
        #     for f in tf_filters[1:]:
        #         where_clause &= f
        #     print("Applying filter {} to dataframe {}".format(where_clause, name))
        #     d.df = d.df.filter(where_clause)
        #
        # if len(partition_cols) > 0 and len(self.partition_end) > 0:
        #     tf_filters = [col(tfc) <= self.partition_end[i] for i, tfc in enumerate(d.partition_cols)]
        #     where_clause = tf_filters[0]
        #     for f in tf_filters[1:]:
        #         where_clause &= f
        #     print("Applying filter {} to dataframe {}".format(where_clause, name))
        #     d.df = d.df.filter(where_clause)

        datalist[name] = d
Example #16
def filter_by_time_context(df: DataFrame,
                           timecontext: Optional[TimeContext] = None
                           ) -> DataFrame:
    """ Filter a Dataframe by given time context
    Parameters
    ----------
    df : pyspark.sql.dataframe.DataFrame
    timecontext: TimeContext

    Returns
    -------
    filtered Spark Dataframe
    """
    if not timecontext:
        return df

    begin, end = timecontext
    if TIME_COL in df.columns:
        return df.filter((F.col(TIME_COL) >= begin) & (F.col(TIME_COL) < end))
    else:
        raise com.TranslationError(
            "'time' column missing in DataFrame {}. To use a time context, "
            "a timestamp column named 'time' must be present "
            "in the table.".format(df))
Example #17
    def _apply_metric_filters(self, name: str, df: DataFrame):
        metric_filters = self.config.get_config("metric_filters")
        metric_filter = metric_filters.get_or_else(name, None)
        if metric_filter is not None:
            # filter() returns a new DataFrame; reassign so the filter takes effect
            df = df.filter(metric_filter)
        return df
def dedupe_splink_scores(
    df_e_with_dupes: DataFrame,
    unique_id_colname: str,
    score_colname: str = None,
    selection_fn: str = "abs_val",
):
    """Sometimes, multiple Splink jobs with different blocking rules are combined
    into a single dataset of edges.  Sometimes, the same pair of nodes will be
    scored multiple times, once by each job.  We need to deduplicate this dataset
    so that each pair of nodes appears only once.

    Args:
        df_e_with_dupes (DataFrame): Dataframe with dupes
        unique_id_colname (str): Unique id column name e.g. unique_id
        score_colname (str, optional): Which column contains scores? If none, inferred from
            df_e_with_dupes.columns. Defaults to None.
        selection_fn (str, optional): Where we have several different scores for a given
            pair of records, how do we decide the final score?
            Options are 'abs_val' and 'mean'.
            abs_val:  Take the value furthest from 0.5 i.e. the value that expresses most certainty
            mean: Take the mean of all values
            Defaults to 'abs_val'.
    """

    # Looking in blocking.py, the position of unique ids
    # (whether they appear in _l or _r) is guaranteed
    # in blocking outputs so we don't need to worry about
    # inversions

    # This is not the case for labelled data - hence the need
    # _sql_gen_unique_id_keygen to join labels to df_e

    possible_vals = ["abs_val", "mean"]
    if selection_fn not in possible_vals:
        raise ValueError(
            f"selection function should be in {possible_vals}, you passed {selection_fn}"
        )

    score_colname = _get_score_colname(df_e_with_dupes, score_colname)

    if selection_fn == "abs_val":
        df_e_with_dupes = df_e_with_dupes.withColumn(
            "absval", f.expr(f"0.5 - abs({score_colname})"))

        win_spec = Window.partitionBy(
            [f"{unique_id_colname}_l",
             f"{unique_id_colname}_r"]).orderBy(f.col("absval").desc())
        df_e_with_dupes = df_e_with_dupes.withColumn(
            "ranking",
            f.row_number().over(win_spec))
        df_e = df_e_with_dupes.filter(f.col("ranking") == 1)
        df_e = df_e.drop("absval")
        df_e = df_e.drop("ranking")

    if selection_fn == "mean":

        win_spec = Window.partitionBy(
            [f"{unique_id_colname}_l",
             f"{unique_id_colname}_r"]).orderBy(f.col(score_colname).desc())

        df_e_with_dupes = df_e_with_dupes.withColumn(
            "ranking",
            f.row_number().over(win_spec))

        df_e_with_dupes = df_e_with_dupes.withColumn(
            score_colname,
            f.avg(score_colname).over(
                win_spec.rowsBetween(Window.unboundedPreceding,
                                     Window.unboundedFollowing)),
        )
        df_e = df_e_with_dupes.filter(f.col("ranking") == 1)

        df_e = df_e.drop("ranking")

    return df_e
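# Hypothetical usage of dedupe_splink_scores above, where df_scored is a DataFrame
# of scored Splink edges with unique_id_l / unique_id_r columns:
#
#     deduped = dedupe_splink_scores(df_scored, "unique_id", selection_fn="abs_val")
#     averaged = dedupe_splink_scores(df_scored, "unique_id", selection_fn="mean")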
Example #19
    def _filter_duplicated_rows(self, df: DataFrame) -> DataFrame:
        """Filter dataframe duplicated rows.

        Attributes:
            df: transformed dataframe.

        Returns:
            Spark dataframe with filtered rows.

        Example:
            Suppose, for instance, that the transformed dataframe received
            by the construct method has the following rows:

            +---+----------+--------+--------+--------+
            | id| timestamp|feature1|feature2|feature3|
            +---+----------+--------+--------+--------+
            |  1|         1|       0|    null|       1|
            |  1|         2|       0|       1|       1|
            |  1|         3|    null|    null|    null|
            |  1|         4|       0|       1|       1|
            |  1|         5|       0|       1|       1|
            |  1|         6|    null|    null|    null|
            |  1|         7|    null|    null|    null|
            +---+----------+--------+--------+--------+

            We will then create four columns, the first one, rn_by_key_columns
            (rn1) is the row number over a key columns partition ordered by timestamp.
            The second, rn_by_all_columns (rn2), is the row number over all columns
            partition (also ordered by timestamp). The third column,
            lag_rn_by_key_columns (lag_rn1), returns the last occurrence of the
            rn_by_key_columns over all columns partition. The last column, diff, is
            the difference between rn_by_key_columns and lag_rn_by_key_columns:

            +---+----------+--------+--------+--------+----+----+--------+-----+
            | id| timestamp|feature1|feature2|feature3| rn1| rn2| lag_rn1| diff|
            +---+----------+--------+--------+--------+----+----+--------+-----+
            |  1|         1|       0|    null|       1|   1|   1|    null| null|
            |  1|         2|       0|       1|       1|   2|   1|    null| null|
            |  1|         3|    null|    null|    null|   3|   1|    null| null|
            |  1|         4|       0|       1|       1|   4|   2|       2|    2|
            |  1|         5|       0|       1|       1|   5|   3|       4|    1|
            |  1|         6|    null|    null|    null|   6|   2|       3|    3|
            |  1|         7|    null|    null|    null|   7|   3|       6|    1|
            +---+----------+--------+--------+--------+----+----+--------+-----+

            Finally, this dataframe will then be filtered with the following condition:
            rn_by_all_columns = 1 or diff > 1 and only the original columns will be
            returned:

            +---+----------+--------+--------+--------+
            | id| timestamp|feature1|feature2|feature3|
            +---+----------+--------+--------+--------+
            |  1|         1|       0|    null|       1|
            |  1|         2|       0|       1|       1|
            |  1|         3|    null|    null|    null|
            |  1|         4|       0|       1|       1|
            |  1|         6|    null|    null|    null|
            +---+----------+--------+--------+--------+

        """
        window_key_columns = Window.partitionBy(
            self.keys_columns  # type: ignore
        ).orderBy(TIMESTAMP_COLUMN)
        window_all_columns = Window.partitionBy(
            self.keys_columns + self.features_columns  # type: ignore
        ).orderBy(TIMESTAMP_COLUMN)

        df = (df.withColumn(
            "rn_by_key_columns",
            F.row_number().over(window_key_columns)).withColumn(
                "rn_by_all_columns",
                F.row_number().over(window_all_columns)).withColumn(
                    "lag_rn_by_key_columns",
                    F.lag("rn_by_key_columns", 1).over(window_all_columns),
                ).withColumn(
                    "diff",
                    F.col("rn_by_key_columns") -
                    F.col("lag_rn_by_key_columns")))
        df = df.filter("rn_by_all_columns = 1 or diff > 1")

        return df.select([column for column in self.columns])
Example #20
def parse_genebass_evidence(genebass_df: DataFrame) -> DataFrame:
    """
    Parse Genebass's disease/target evidence.
    Args:
        genebass_df: DataFrame with Genebass's portal data
    Returns:
        evd_df: DataFrame with Genebass's data following the t/d evidence schema.
    """
    to_keep = [
        'datasourceId',
        'datatypeId',
        'targetFromSourceId',
        'diseaseFromSource',
        'diseaseFromSourceId',
        'diseaseFromSourceMappedId',
        'pValueMantissa',
        'pValueExponent',
        'beta',
        'betaConfidenceIntervalLower',
        'betaConfidenceIntervalUpper',
        'oddsRatio',
        'oddsRatioConfidenceIntervalLower',
        'oddsRatioConfidenceIntervalUpper',
        'resourceScore',
        'ancestry',
        'ancestryId',
        'projectId',
        'cohortId',
        'studySampleSize',
        'studyCases',
        'statisticalMethod',
        'statisticalMethodOverview',
    ]

    # WARNING: There are some associations with a p-value of 0.0 in Genebass.
    # This is a bug we still have to elucidate; it might be due to a floating-point underflow.
    # This evidence needs to be corrected manually so that it is not lost and passes validation.
    # As an interim solution, its p-value is set to the minimum in the evidence set.
    logging.warning(
        f"There are {genebass_df.filter(col('Pvalue_Burden') == 0.0).count()} evidence with a p-value of 0.0."
    )
    minimum_pvalue = (
        genebass_df.filter(col('Pvalue_Burden') > 0.0).agg({'Pvalue_Burden': 'min'}).collect()[0]['min(Pvalue_Burden)']
    )
    genebass_df = genebass_df.withColumn(
        'Pvalue_Burden', when(col('Pvalue_Burden') == 0.0, lit(minimum_pvalue)).otherwise(col('Pvalue_Burden'))
    )

    return (
        genebass_df.withColumn('datasourceId', lit('gene_burden'))
        .withColumn('datatypeId', lit('genetic_association'))
        .withColumn('projectId', lit('Genebass'))
        .withColumn('cohortId', lit('UK Biobank 450k'))
        .withColumn('ancestry', lit('EUR'))
        .withColumn('ancestryId', lit('HANCESTRO_0009'))
        .withColumnRenamed('gene_id', 'targetFromSourceId')
        .withColumnRenamed('description', 'diseaseFromSource')
        .withColumnRenamed('phenocode', 'diseaseFromSourceId')
        .join(
            import_trait_mappings(),
            on='diseaseFromSource',
            how='left',
        )
        .withColumnRenamed('Pvalue_Burden', 'resourceScore')
        .withColumn('pValueExponent', log10(col('resourceScore')).cast(IntegerType()) - lit(1))
        .withColumn('pValueMantissa', round(col('resourceScore') / pow(lit(10), col('pValueExponent')), 3))
        # Stats are split taking into consideration the type of the trait
        # Those that are not continuous or categorical were reviewed and all of them are considered as categorical
        .withColumn(
            'beta',
            when(col('trait_type') == 'continuous', col('BETA_Burden')),
        )
        .withColumn(
            'betaConfidenceIntervalLower',
            when(col('trait_type') == 'continuous', col('BETA_Burden') - col('SE_Burden')),
        )
        .withColumn(
            'betaConfidenceIntervalUpper',
            when(col('trait_type') == 'continuous', col('BETA_Burden') + col('SE_Burden')),
        )
        .withColumn(
            'oddsRatio',
            when(col('trait_type').isin(['categorical', 'icd_first_occurrence', 'icd10']), col('BETA_Burden')),
        )
        .withColumn(
            'oddsRatioConfidenceIntervalLower',
            when(
                col('trait_type').isin(['categorical', 'icd_first_occurrence', 'icd10']),
                col('BETA_Burden') - col('SE_Burden'),
            ),
        )
        .withColumn(
            'oddsRatioConfidenceIntervalUpper',
            when(
                col('trait_type').isin(['categorical', 'icd_first_occurrence', 'icd10']),
                col('BETA_Burden') + col('SE_Burden'),
            ),
        )
        .withColumn('studySampleSize', (col('n_cases') + coalesce('n_controls', lit(0))))
        .withColumnRenamed('n_cases', 'studyCases')
        .withColumnRenamed('annotation', 'statisticalMethod')
        .withColumn('statisticalMethodOverview', col('statisticalMethod'))
        .replace(to_replace=METHOD_DESC, subset=['statisticalMethodOverview'])
        .select(to_keep)
        .distinct()
    )
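# Worked example of the mantissa/exponent split applied to resourceScore above
# (illustrative p-value; plain Python mirroring the Spark expressions, where int()
# truncates towards zero like Spark's cast to IntegerType):
from math import log10

p = 3.2e-8
exponent = int(log10(p)) - 1             # int(-7.49...) - 1 = -8
mantissa = round(p / 10 ** exponent, 3)  # round(3.2e-8 / 1e-8, 3) = 3.2
print(mantissa, exponent)                # 3.2 -8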