def check_fact_table_integrity(fact: SparkDataFrame,
                               dim_demographics: SparkDataFrame,
                               dim_airports: SparkDataFrame,
                               dim_countries: SparkDataFrame) -> bool:
    """Check the integrity of models based on fact table.

    :param fact: the data frame of fact table, immigration.
    :param dim_demographics: the data frame of dimension demographics table
    :param dim_airports: the data frame of dimension airports table
    :param dim_countries: the data frame of dimension of countries table
    :return: True if all the integrity check pass otherwise False
    """

    integrity_demographics = fact.select(col('i94addr')).distinct()\
                                 .join(dim_demographics,
                                       fact['i94addr'] == dim_demographics['state_code'], 'left_anti').count() == 0

    integrity_airports = fact.select(col('i94port')).distinct()\
                             .join(dim_airports,
                                   fact['i94port'] == dim_airports['local_code'], 'left_anti').count() == 0

    integrity_countries = fact.select(col('i94cit')).distinct().\
                              join(dim_countries, fact['i94cit'] == dim_countries['code'], 'left_anti').count() == 0

    return integrity_demographics and integrity_airports and integrity_countries
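A minimal smoke test for the check above, assuming a local SparkSession; the tiny frames and their values are invented and only mirror the join keys used in the function (i94addr/state_code, i94port/local_code, i94cit/code).

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Toy frames whose only purpose is to exercise the three left_anti joins.
fact = spark.createDataFrame([("CA", "LAX", 101)], ["i94addr", "i94port", "i94cit"])
dim_demographics = spark.createDataFrame([("CA",)], ["state_code"])
dim_airports = spark.createDataFrame([("LAX",)], ["local_code"])
dim_countries = spark.createDataFrame([(101,)], ["code"])

# Every fact key is present in its dimension, so the check should return True.
assert check_fact_table_integrity(fact, dim_demographics, dim_airports, dim_countries)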
Example #2
def loadIntoRaw(database, table, source: DataFrame):
    # RAW is a bit special since it does not have a fixed schema.
    # When writing, we instruct it to use the same schema as the source dataset.
    # The ensureParent parameter tells the data source to create the RAW table if it does not already exist.
    destination = cdfRaw(database, table,
                         ensureParent=True).schema(source.schema).load()
    destination.createOrReplaceTempView("destinationRawTable")
    source.select(*destination.columns).write.insertInto("destinationRawTable")
Example #3
    def get_categoricals_multiplier(self,
                                    df: DataFrame,
                                    col_list: list = [],
                                    ignore_cols: list = [],
                                    approx_distinct=100,
                                    rsd=0.05):
        """
        Gets a dictionary of col names and the distinct values in the column.
        :param df:
        :param col_list: Subset list of columns to use as categoricals; if null, all columns will be checked for
        approx_distinct values and considered categoricals
        :param ignore_cols: when not selecting a subset of columns using col_list, ignore columns is a list of
        columns that will be skipped when searching for categoricals with approx_distinct columns.
        :param approx_distinct: log a warning message if the approx number of distinct values is greater than this threshold.
        :param rsd:
        :return:
        """
        # TODO - Add logging of findings
        filter_vals = []
        filter_cols = list(col_list)  # copy so the mutable default argument is not mutated

        if len(col_list) == 0:
            for (dcol, dtype) in df.drop(*ignore_cols).dtypes:
                if dtype == 'string':
                    if self._get_approx_distinct_count_for_col(
                            df, dcol, _rsd=rsd) <= approx_distinct:
                        # LOG print("{} has approx {} distincts".format(dcol, cnt))
                        # LOG print("appending {}".format(dcol))
                        filter_vals.append(df.select(col(dcol)) \
                                           .filter((col(dcol).isNotNull()) &
                                                   (~col(dcol).isin("", "Y", "N"))) \
                                           .distinct().rdd.map(lambda row: str(row[0])).collect())
                        filter_cols.append(dcol)
            # ?? TODO - What about the rest of the potential categorical types (i.e. bools/ints/floats/etc)
            return feature_factory.feature.Multiplier.create_from_cats(
                filter_cols, filter_vals)
        else:
            for dcol in col_list:
                if self._get_approx_distinct_count_for_col(
                        df, dcol) > approx_distinct:
                    print("WARN! {} has more than {} distinct values".format(
                        dcol, approx_distinct))
                filter_vals.append(df.select(col(dcol)) \
                                   .filter((col(dcol).isNotNull()) &
                                           (~col(dcol).isin("", "Y", "N"))) \
                                   .distinct().rdd.map(lambda row: str(row[0])).collect())
            return feature_factory.feature.Multiplier._create_from_cats(
                filter_cols, filter_vals)
Example #4
    def _struct_df(self, df: DataFrame) -> DataFrame:
        """Struct the output dataframe generated by the reader.

        Under the default "value" column coming from Kafka there are the custom
        fields created by some producer. This function will struct the dataframe as
        to get all desired fields from "value" and insert all Kafka default columns,
        including "value", under "kafka_metadata" column. It is important to notice
        that the declared value_schema suffer from the same effects described in
        explode_json_column method in pre_processing module.

        Args:
            df: direct dataframe output from from KafkaReader.

        Returns:
            Structured dataframe with kafka value fields as columns.
                All other default fields from Kafka will be stored under
                "kafka_metadata" column.

        """
        df = df.withColumn("kafka_metadata", struct(*self.KAFKA_COLUMNS))
        df = explode_json_column(df,
                                 column="value",
                                 json_schema=self.value_schema)
        return df.select([field.name
                          for field in self.value_schema] + ["kafka_metadata"])
Example #5
 def df_add_cols_literal(self, df: DataFrame, cols: list):
     existing_cols = df.columns
     missing_cols = [c for c in cols if c not in existing_cols]
     missing_cols = list(set(missing_cols))  # remove duplicates
     missing_cols_expr = [lit(None).alias(c) for c in missing_cols]
     all_expr = ["*"] + missing_cols_expr
     return df.select(all_expr)
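A hypothetical call, assuming helper is an instance of the class that defines df_add_cols_literal and spark is an active SparkSession; the frame and column names are invented.

df = spark.createDataFrame([(1, "x")], ["a", "b"])
out = helper.df_add_cols_literal(df, ["a", "b", "c"])
out.printSchema()  # keeps a and b, adds c as a NULL literal column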
Example #6
    def json_flatten(self, df: DataFrame):
        schema = df.schema
        flatten_df = df.select(self.on_flatten(schema))

        flatten_df_agg = self.concat_similar_columns(df=flatten_df)

        return flatten_df_agg
Example #7
 def remove_illegal_chars(self, dataframe: DataFrame, source_column: str,
                          target_column: str):
     # translate() replaces each character in the matching string with the
     # corresponding character in the replacement string
     df2 = dataframe.select(
         col('id'),
         translate(col(source_column), "".join(self.chars),
                   self.replacement).alias(target_column))
     return df2.select('id', target_column)
Example #8
 def _unpack_struct(self, df: DataFrame, col_name):
     # Promote each field of the struct column to a top-level "<col_name>_<field>"
     # column, then drop the original struct and recurse into any remaining nesting.
     sub_df = df.select(col_name + '.*')
     for subcol_name in sub_df.columns:
         df = df.withColumn(f'{col_name}_{subcol_name}',
                            df[col_name][subcol_name])
     df = df.drop(col_name)
     return self.unpack_nested(df)
Example #9
def process_data(df: DataFrame, ml_model: PipelineModel = model) -> DataFrame:
    df = convert_types_for_ml(df)
    df = convert_heroes_to_lineup(df)
    df = ml_model.transform(df)
    df = convert_types_for_es(df)

    return df.select("probability_arr", "radiant_win_prediction",
                     "match_seq_num")
 def mass_regex_replace(self, df: DataFrame, pattern_replacement: list):
     pattern = pattern_replacement[0]
     replacement = pattern_replacement[1]
     cols = df.schema.fieldNames()
     selectExpr = [
         regexp_replace(col(c), pattern, replacement).alias(c) for c in cols
     ]
     return df.select(selectExpr)
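A hypothetical call, assuming cleaner is an instance of the class that defines mass_regex_replace and spark is an active SparkSession; the data and the pattern/replacement pair are illustrative.

df = spark.createDataFrame([("a-1", "b-2")], ["x", "y"])
cleaner.mass_regex_replace(df, ["-", "_"]).show()  # every '-' becomes '_' in all columns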
Example #11
def generate_idx_for_df(df: DataFrame, col_name: str, col_schema):
    idx_udf = udf(lambda x: udf_array_to_map(x),
                  MapType(IntegerType(), col_schema, True))
    df = df.withColumn("map", idx_udf(col(col_name)))
    df = df.select("problem_type", "user_id", "oms_protected", "problem_id",
                   "create_at",
                   explode("map").alias("item_id", "answer"))
    return df
Example #12
 def unpack_nested(self, dataframe: DataFrame):
     columns_to_select = []
     for field in dataframe.schema.fields:
         if type(field.dataType) in (ArrayType, StructType):
             c = explode(field.name).alias('int_array')
         else:
             c = col(field.name)
         columns_to_select.append(c)
     return dataframe.select(*columns_to_select)
Example #13
def unpack_df_col(
        df: sparkDataFrame,
        col_name: str,
) -> List:

    df = df.withColumnRenamed(col_name, 'col_to_extract')
    list_col_contents = [row.col_to_extract for row in df.select('col_to_extract').collect()]

    return list_col_contents
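A quick illustration, assuming an active SparkSession named spark; the sample frame is invented.

df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "letter"])
unpack_df_col(df, "letter")  # -> ['a', 'b'] (row order is not guaranteed in general)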
Example #14
def flatten_schema(df: DataFrame):
    """
    :param df: Spark dataframe
    :return: Spark dataframe with flat schema
    """
    # Define the mapping of the column names
    mapping = flat_schema_mapping(df)

    return df.select(*mapping)
Example #15
def convert_columns_and_save(df: DataFrame):
    date_cols = [c for c in df.columns if "Date" in c and "Till" not in c]
    date_cols.append("ReportAsOfEOD")

    return (df.select(*(f.col(c).cast("date").alias(c) if c in date_cols
                        else f.col(c) for c in df.columns))
            .withColumn("ListedOnUTC", f.to_timestamp("ListedOnUTC"))
            .withColumn("BiddingStartedOn", f.to_timestamp("BiddingStartedOn"))
            .withColumn("StageActiveSince", f.to_timestamp("StageActiveSince"))
            .withColumnRenamed("ReportAsOfEOD", "LoanReportAsOfEOD"))
Example #16
def transform_parking_violation_data(df: DataFrame,
                                     column: str = "Violation County"
                                     ) -> DataFrame:
    """Transforming parking vialation data to make it joinable, below are the things steps in high level

    1. Added Borocode
    2. Converted house number in case it is separated by '-'
    3. Converted 'Street Name' to upper case
    4. Removed any data having no house number
    """

    df = (df.select("Violation County", "House Number", "Street Name",
                    "Summons Number", "Issue Date")
          .distinct()
          .withColumn("year",
                      F.year(F.to_date(F.col("Issue Date"), "MM/dd/yyyy")))
          .orderBy("Violation County", "House Number", "Street Name", "year")
          .coalesce(100)
          .groupBy("Violation County", "House Number", "Street Name", "year")
          .agg({"Summons Number": "count"})
          .withColumnRenamed("count(Summons Number)", "total_cnt")
          .withColumn(
              "BOROCODE",
              F.when(F.col(column).isin(
                  ["MAN", "MH", "MN", "NEWY", "NEW Y", "NY"]), 1)
              .when(F.col(column).isin(["BRONX", "BX"]), 2)
              .when(F.col(column).isin(["BK", "K", "KING", "KINGS"]), 3)
              .when(F.col(column).isin(["Q", "QN", "QNS", "QU", "QUEEN"]), 4)
              .when(F.col(column).isin(["R", "RICHMOND"]), 5)
              .otherwise(0)))

    df = (df.filter(F.col("House Number").isNotNull()).withColumn(
        "temp", F.split("House Number", "-")).withColumn(
            "House Number",
            F.col("temp").getItem(0).cast("int") +
            F.when(F.col("temp").getItem(1).isNull(), "0").otherwise(
                F.col("temp").getItem(1)).cast("int") / 1000,
        ).withColumn("temp",
                     F.col("temp").getItem(0).cast("int")).withColumn(
                         "Street Name", F.upper(F.col("Street Name"))))
    return df
Example #17
    def __validate_col(
        self,
        df: DataFrame,
        partition_cols: List[str],
        target_cols: List[str],
        ts_col: str,
    ):
        """
        Validate if target column exists and is of numeric type, and validates if partition column exists.

        :param df: DataFrame to be validated
        :param partition_cols: Partition columns to be validated
        :param target_cols: Target columns to be validated
        :param ts_col: Timestamp column to be validated
        """

        for column in partition_cols:
            if column not in df.columns:
                raise ValueError(
                    f"Partition Column: '{column}' does not exist in DataFrame."
                )
        for column in target_cols:
            if column not in df.columns:
                raise ValueError(
                    f"Target Column: '{column}' does not exist in DataFrame.")
            if df.select(
                    column).dtypes[0][1] not in supported_target_col_types:
                raise ValueError(
                    f"Target Column needs to be one of the following types: {supported_target_col_types}"
                )

        if ts_col not in df.columns:
            raise ValueError(
                f"Timestamp Column: '{ts_col}' does not exist in DataFrame.")

        if df.select(ts_col).dtypes[0][1] != "timestamp":
            raise ValueError(
                "Timestamp Column needs to be of timestamp type.")
Example #18
def json_transform(dataframe: DataFrame) -> DataFrame:
    """Filters DataFrame's rows using the given condition and value.

    Args:
        dataframe: Spark DataFrame.

    Returns:
        Converted dataframe.
    """
    return dataframe.select(
        to_json(
            struct([dataframe[column] for column in dataframe.columns])  # type: ignore
        ).alias("value")
    )
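A small illustration, assuming an active SparkSession named spark; the input frame is invented.

df = spark.createDataFrame([(1, "a")], ["id", "letter"])
json_transform(df).show(truncate=False)  # the value column holds {"id":1,"letter":"a"}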
Example #19
    def orbat_normalize(self, df: DataFrame, mapping_sheet: str):
        mapping_dict = self.transformExtension.parse_mapping_sheet(
            mapping_str=mapping_sheet)
        selectExpr = []
        for field in df.schema.fields:
            name = field.name
            dtype = field.dataType
            if isinstance(dtype, (ArrayType, StructType, MapType)):
                selectExpr.append(to_json(col(name)).alias(name))
            else:
                selectExpr.append(col(name).alias(name))
        df = df.select(selectExpr)
        mapped_df = self.transformExtension.coalesce_map_col(
            df=df, mapping_dict=mapping_dict)

        return mapped_df
Example #20
    def _transform(self, df: DataFrame) -> DataFrame:
        view: str = self.getView()
        analysis_views_prefix: Optional[str] = self.getAnalysisViewsPrefix()
        output_folder: Optional[Union[Path, str]] = self.getOutputFolder()
        columns_to_analyze: Optional[List[str]] = self.getColumnsToAnalyze()
        columns_to_skip: Optional[List[str]] = self.getColumnsToSkip()
        progress_logger: Optional[ProgressLogger] = self.getProgressLogger()

        # load the registered view into a data frame
        df = df.sql_ctx.table(view)

        if columns_to_analyze:
            columns_to_analyze = [
                c for c in df.columns if c in columns_to_analyze
            ]
        else:
            columns_to_analyze = df.columns

        assert columns_to_analyze

        if columns_to_skip:
            columns_to_analyze = [
                c for c in columns_to_analyze if c not in columns_to_skip
            ]

        column_name: str
        for column_name in columns_to_analyze:
            result_df: DataFrame = (
                df.select(column_name).groupBy(column_name).count().orderBy(
                    col("count").desc()))
            if output_folder:
                target_path: str = str(
                    os.path.join(str(output_folder), column_name))
                if progress_logger:
                    progress_logger.write_to_log(
                        f"Writing analysis for column {column_name} to {target_path}"
                    )
                result_df.coalesce(1).write.csv(target_path,
                                                header=True,
                                                mode="overwrite")
            if analysis_views_prefix:
                result_df.createOrReplaceTempView(
                    f"{analysis_views_prefix}{column_name}"
                    if analysis_views_prefix.endswith(
                        "_") else f"{analysis_views_prefix}_{column_name}")

        return df
Example #21
def transform_read_centerline_data(df: DataFrame) -> DataFrame:
    """Transforming centerline data to make it joinable, below are the things steps in high level

    1. Converted ST_LABEL & FULL_STREE to upper case
    2. Converted L_LOW_HN & L_HIGH_HN  separated by '-' for odd house number
    3. Converted R_LOW_HN & R_HIGH_HN  separated by '-' for even house number
    4. Removed any data having no house number in L_LOW_HN and R_LOW_HN
    """
    df = (df.select("PHYSICALID", "BOROCODE", "FULL_STREE", "ST_NAME",
                    "L_LOW_HN", "L_HIGH_HN", "R_LOW_HN", "R_HIGH_HN").orderBy(
                        "PHYSICALID", "BOROCODE", "FULL_STREE", "ST_NAME",
                        "L_LOW_HN", "L_HIGH_HN", "R_LOW_HN",
                        "R_HIGH_HN").coalesce(200).withColumn(
                            "ST_NAME", F.upper(F.col("ST_NAME"))).withColumn(
                                "FULL_STREE",
                                F.upper(F.col("FULL_STREE"))).filter(
                                    (F.col("L_LOW_HN").isNotNull())
                                    | (F.col("R_LOW_HN").isNotNull())))
    df = df.withColumn("L_TEMP_ODD", F.split("L_LOW_HN", "-")).withColumn(
        "L_LOW_HN",
        F.col("L_TEMP_ODD").getItem(0).cast("int") +
        F.when(F.col("L_TEMP_ODD").getItem(1).isNull(), "0").otherwise(
            F.col("L_TEMP_ODD").getItem(1)).cast("int") / 1000,
    )

    df = df.withColumn("L_TEMP_ODD", F.split("L_HIGH_HN", "-")).withColumn(
        "L_HIGH_HN",
        F.col("L_TEMP_ODD").getItem(0).cast("int") +
        F.when(F.col("L_TEMP_ODD").getItem(1).isNull(), "0").otherwise(
            F.col("L_TEMP_ODD").getItem(1)).cast("int") / 1000,
    )

    df = df.withColumn("L_TEMP_ODD", F.split("R_LOW_HN", "-")).withColumn(
        "R_LOW_HN",
        F.col("L_TEMP_ODD").getItem(0).cast("int") +
        F.when(F.col("L_TEMP_ODD").getItem(1).isNull(), "0").otherwise(
            F.col("L_TEMP_ODD").getItem(1)).cast("int") / 1000,
    )

    df = df.withColumn("L_TEMP_ODD", F.split("R_HIGH_HN", "-")).withColumn(
        "R_HIGH_HN",
        F.col("L_TEMP_ODD").getItem(0).cast("int") +
        F.when(F.col("L_TEMP_ODD").getItem(1).isNull(), "0").otherwise(
            F.col("L_TEMP_ODD").getItem(1)).cast("int") / 1000,
    )

    return df
Example #22
def _truth_space_table_old(
    df_labels_with_splink_scores: DataFrame,
    spark: SparkSession,
    threshold_actual: float = 0.5,
    score_colname: str = None,
):
    """Create a table of the ROC space i.e. truth table statistics
    for each discrimination threshold

    Args:
        df_labels_with_splink_scores (DataFrame): A dataframe of labels and associated splink scores
            usually the output of the truth.labels_with_splink_scores function
        threshold_actual (float, optional): Threshold to use in categorising clerical match
            scores into match or no match. Defaults to 0.5.
        score_colname (str, optional): Allows user to explicitly state the column name
            in the Splink dataset containing the Splink score. If None, it will be inferred

    Returns:
        DataFrame: Table of 'truth space' i.e. truth categories for each threshold level
    """

    # This is used repeatedly to generate the roc curve
    df_labels_with_splink_scores.persist()

    # We want percentiles of score to compute
    score_colname = _get_score_colname(df_labels_with_splink_scores,
                                       score_colname)

    percentiles = [x / 100 for x in range(0, 101)]

    values_distinct = df_labels_with_splink_scores.select(
        score_colname).distinct()
    thresholds = values_distinct.stat.approxQuantile(score_colname,
                                                     percentiles, 0.0)
    thresholds.append(1.01)
    thresholds = sorted(set(thresholds))

    roc_dfs = []
    for thres in thresholds:
        df_e_t = df_e_with_truth_categories(df_labels_with_splink_scores,
                                            thres, spark, threshold_actual,
                                            score_colname)
        df_roc_row = _summarise_truth_cats(df_e_t, spark)
        roc_dfs.append(df_roc_row)

    all_roc_df = reduce(DataFrame.unionAll, roc_dfs)
    return all_roc_df
Example #23
def naaccr_read_fwf(flat_file: DataFrame,
                    record_layout: DataFrame,
                    value_col: str = 'value',
                    exclude_pfx: str = 'reserved') -> DataFrame:
    """
    @param flat_file: as from spark.read.text()
                      typically with .value
    @param record_layout: as from http://datadictionary.naaccr.org/?c=7
                          with .start, .length, .xmlId
    """
    fields = [
        func.substring(flat_file[value_col], item.start,
                       item.length).alias(item.xmlId)
        for item in record_layout.collect()
        if not item.xmlId.startswith(exclude_pfx)
    ]  # type: List[Union[Column, str]]
    return flat_file.select(fields)
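A toy illustration, assuming an active SparkSession named spark; the layout rows and the fixed-width line are fabricated and far shorter than a real NAACCR layout.

flat_file = spark.createDataFrame([("20230101X",)], ["value"])
record_layout = spark.createDataFrame(
    [(1, 8, "dateOfDiagnosis"), (9, 1, "reserved01")],
    ["start", "length", "xmlId"])
naaccr_read_fwf(flat_file, record_layout).show()
# yields a single dateOfDiagnosis column with value 20230101;
# reserved01 is skipped by the exclude_pfx filter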
Example #24
def countNullsAcrossAllColumns(df: DataFrame) -> int:
    """Given a spark dataframe count the total number of cells with a null in them

    Args:
        df (DataFrame): [Spark dataframe]

    Returns:
        int: [Total number of cells with a null value]
    """
    # https://www.datasciencemadesimple.com/count-of-missing-nanna-and-null-values-in-pyspark/
    from pyspark.sql.functions import isnull, when, count, expr

    # when(isnull(c), c) returns the literal column *name* (a non-null string)
    # for null cells and NULL otherwise, so count() tallies the null cells
    nullCountDf = df.select(
        [count(when(isnull(c), c)).alias(c) for c in df.columns])
    sumExpr = "+".join(nullCountDf.columns) + " as TOTAL"
    sumDf = nullCountDf.select(expr(sumExpr))
    return sumDf.collect()[0].TOTAL
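A quick check, assuming an active SparkSession named spark; the two-row frame is invented.

df = spark.createDataFrame([(1, None), (None, "x")], ["a", "b"])
countNullsAcrossAllColumns(df)  # -> 2 (one null in each column)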
Example #25
def countWSAcrossAllStringColumns(df: DataFrame) -> int:
    """Given a spark dataframe count the Total number of cells with a BLANK or all SPACES value

    Args:
        df (DataFrame): [Spark dataframe]

    Returns:
        int: [Total number of cells with a BLANK or all SPACES value]
    """
    from pyspark.sql.functions import col, when, count, trim, expr

    stringCols = [cn for (cn, ct) in df.dtypes if ct == "string"]
    blanksCountdf = df.select(
        [count(when(trim(col(c)) == "", True)).alias(c) for c in stringCols])
    sumExpr = "+".join(blanksCountdf.columns) + " as TOTAL"
    sumDf = blanksCountdf.select(expr(sumExpr))
    return sumDf.collect()[0].TOTAL
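A matching check for the blank counter, again with an invented frame; note that NULLs are not counted as blanks here.

df = spark.createDataFrame([("  ", "x"), ("", None)], ["a", "b"])
countWSAcrossAllStringColumns(df)  # -> 2 ("  " and "" count, the NULL does not)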
Example #26
def _traverse_graph_spark_dataframe(
    df: DataFrame, root_node: T, graph: nx.DiGraph
) -> Tuple[DataFrame, Dict[str, List[T]], Dict[str, dict]]:
    inferred_values = {
        col: traverse_graph_with_series(root_node, df.select(col), graph)
        for col in df.columns
    }

    inferred_series = {}
    inferred_paths: Dict[str, List[T]] = {}
    inferred_states: Dict[str, dict] = {}
    for col, (inf_series, inf_path, inf_state) in inferred_values.items():
        assert isinstance(inf_path, list)  # Placate the MyPy Gods

        inferred_series[col] = inf_series
        inferred_paths[col] = inf_path
        inferred_states[col] = inf_state

    # note inference disabled, return df
    return df, inferred_paths, inferred_states
Example #27
    def concat_similar_columns(self, df: DataFrame):
        """
        Replaces '-' with '_' in column names and concatenates columns that only
        differ in that character with ' | '.

        :param df: DataFrame whose similar columns should be concatenated
        :return: DataFrame with one concatenated column per group of similar columns
        """
        fields = df.schema.fieldNames()
        pandas_df = pandas.DataFrame(data={'field': fields})
        pandas_df['group_field'] = pandas_df.apply(
            func=lambda r: r['field'].replace('-', '_'), axis=1)
        group_ser = pandas_df.groupby('group_field',
                                      as_index=False)['field'].apply(list)
        group_fields = group_ser.to_list()
        SEP = ' | '
        selectExpr = [
            concat_ws(SEP, *[col(f) for f in group]).alias(group[0])
            for group in group_fields
        ]
        return df.select(selectExpr)
Example #28
    def __check_labels(sdf: SparkDataFrame, column_name: str,
                       label1: Optional[str], label2: Optional[str]) -> bool:
        r""" checks whether labels meet required conditions for method `execute`.

        Called only when category = 'boolean'

        Args:
            sdf: Spark DataFrame object
            column_name: specific column which is to be executed using 'Binary' mechanism
            label1: label to be used by 'Binary' mechanism
            label2: label to be used by 'Binary' mechanism

        Returns: True if parameters satisfy the conditions

        Raises: TypeError, ValueError if parameters do not obey the rules

        """
        if not isinstance(label1, str) or not isinstance(label2, str):
            raise TypeError("Labels must be strings.")

        if len(label1) == 0 or len(label2) == 0:
            raise ValueError("Labels must be non-empty strings")

        if label1 == label2:
            raise ValueError("Labels must not match")

        # finds unique values in a column
        labels: List[str] = [
            row[column_name]
            for row in sdf.select(column_name).distinct().collect()
        ]

        if len(labels) != 2 or label1 not in labels or label2 not in labels:
            # checks whether all the rows of the column have either label1 or label2
            raise ValueError(
                "Labels in column `%s` do not match with the labels entered" %
                column_name)

        return True
Example #29
 def append(self, df: DataFrame, full_table_name: str, schema: StructType, options: dict):
     # insertInto() requires dataframe columns order to match schema columns order
     df.select([field.name for field in schema.fields]).write.options(**options).insertInto(full_table_name, overwrite=False)
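A hypothetical call, assuming writer is an instance of the class that defines append and df is an existing DataFrame; the table name is made up and the target table is assumed to already exist with this schema.

from pyspark.sql.types import StructType, StructField, LongType, StringType

schema = StructType([StructField("id", LongType()), StructField("name", StringType())])
writer.append(df, "analytics.customers", schema, {})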
Example #30
 def _get_approx_distinct_count_for_col(self,
                                        df: DataFrame,
                                        _dcol: str,
                                        _rsd=0.05):
     # approx_count_distinct is a HyperLogLog++ estimate; _rsd caps its relative standard deviation
     return df.select(F.approx_count_distinct(col(_dcol), rsd=_rsd)) \
         .rdd.map(lambda row: row[0]).collect()[0]
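A hypothetical call, assuming ff is an instance of the class that defines this helper and spark is an active SparkSession; the frame is synthetic.

df = spark.createDataFrame([(i % 10,) for i in range(1000)], ["bucket"])
ff._get_approx_distinct_count_for_col(df, "bucket", _rsd=0.05)  # roughly 10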