def check_fact_table_integrity(fact: SparkDataFrame,
                               dim_demographics: SparkDataFrame,
                               dim_airports: SparkDataFrame,
                               dim_countries: SparkDataFrame) -> bool:
    """Check the integrity of the model, starting from the fact table.

    For every foreign key, the distinct key values of the fact table are
    left-anti-joined against the matching dimension table; a check passes
    when that join returns no rows.

    :param fact: the data frame of fact table, immigration.
    :param dim_demographics: the data frame of dimension demographics table
    :param dim_airports: the data frame of dimension airports table
    :param dim_countries: the data frame of dimension of countries table
    :return: True if all the integrity checks pass, otherwise False
    """
    # (fact key column, dimension table, dimension key column)
    relations = (
        ('i94addr', dim_demographics, 'state_code'),
        ('i94port', dim_airports, 'local_code'),
        ('i94cit', dim_countries, 'code'),
    )

    # Results are gathered in a list (not a generator) so that every count
    # runs even when an earlier check has already failed.
    outcomes = []
    for fact_key, dimension, dim_key in relations:
        orphan_keys = fact.select(col(fact_key)).distinct().join(
            dimension, fact[fact_key] == dimension[dim_key], 'left_anti')
        outcomes.append(orphan_keys.count() == 0)
    return all(outcomes)
def loadIntoRaw(database, table, source: DataFrame):
    """Insert the rows of ``source`` into the RAW table ``table`` of ``database``."""
    # RAW is a bit special since it does not have a fixed schema, so the
    # destination is loaded with the same schema as the source dataset.
    # ensureParent instructs the data source to create the RAW table if it
    # does not exist already.
    raw_reader = cdfRaw(database, table, ensureParent=True)
    destination = raw_reader.schema(source.schema).load()

    view_name = "destinationRawTable"
    destination.createOrReplaceTempView(view_name)

    # Project the source in the destination's column order before inserting.
    ordered_source = source.select(*destination.columns)
    ordered_source.write.insertInto(view_name)
def get_categoricals_multiplier(self,
                                df: DataFrame,
                                col_list: list = None,
                                ignore_cols: list = None,
                                approx_distinct=100,
                                rsd=0.05):
    """
    Build a Multiplier from categorical column names and their distinct values.

    :param df: data frame to inspect
    :param col_list: Subset list of columns to use as categoricals; if empty
        or None, all string columns are checked for their approximate
        distinct count and those at or below ``approx_distinct`` are treated
        as categoricals
    :param ignore_cols: when not selecting a subset of columns using
        col_list, a list of columns that are skipped when searching for
        categoricals
    :param approx_distinct: discovery threshold when ``col_list`` is empty;
        when ``col_list`` is given, a warning is printed for every column
        whose approximate distinct count exceeds it
    :param rsd: passed as ``_rsd`` to ``_get_approx_distinct_count_for_col``
    :return: the result of the Multiplier factory call
    """
    # TODO - Add logging of findings
    # Work on copies: the original aliased ``col_list`` and appended to it,
    # which mutated the caller's list (and the shared default argument).
    explicit_cols = list(col_list) if col_list else []
    skipped_cols = list(ignore_cols) if ignore_cols else []

    def _category_values(dcol):
        # Distinct non-null values of the column, excluding "", "Y" and "N",
        # collected to the driver as strings.
        return df.select(col(dcol)) \
            .filter(col(dcol).isNotNull() & ~col(dcol).isin("", "Y", "N")) \
            .distinct().rdd.map(lambda row: str(row[0])).collect()

    filter_vals = []

    if not explicit_cols:
        filter_cols = []
        for dcol, dtype in df.drop(*skipped_cols).dtypes:
            if dtype != 'string':
                continue
            distinct_cnt = self._get_approx_distinct_count_for_col(
                df, dcol, _rsd=rsd)
            if distinct_cnt <= approx_distinct:
                filter_vals.append(_category_values(dcol))
                filter_cols.append(dcol)
        # ?? TODO - What about the rest of the potential categorical types
        # (i.e. bools/ints/floats/etc)
        return feature_factory.feature.Multiplier.create_from_cats(
            filter_cols, filter_vals)

    for dcol in explicit_cols:
        # ``rsd`` is now forwarded here too, consistent with the discovery branch.
        if self._get_approx_distinct_count_for_col(
                df, dcol, _rsd=rsd) > approx_distinct:
            print("WARN! {} has more than {} distinct values".format(
                dcol, approx_distinct))
        filter_vals.append(_category_values(dcol))
    # NOTE(review): this branch calls ``_create_from_cats`` while the
    # discovery branch calls ``create_from_cats``; both names are kept as in
    # the original - confirm that both exist and are meant to differ.
    return feature_factory.feature.Multiplier._create_from_cats(
        explicit_cols, filter_vals)
def _struct_df(self, df: DataFrame) -> DataFrame:
    """Struct the output dataframe generated by the reader.

    The custom fields created by a producer arrive under the default Kafka
    "value" column. This method exposes the fields declared in
    ``value_schema`` as columns and packs the columns listed in
    ``KAFKA_COLUMNS`` into a single "kafka_metadata" struct column.

    Note that the declared value_schema is subject to the same effects
    described in the explode_json_column method of the pre_processing module.

    Args:
        df: direct dataframe output from KafkaReader.

    Returns:
        Dataframe with the value_schema fields as columns, followed by the
        "kafka_metadata" column.

    """
    metadata_column = struct(*self.KAFKA_COLUMNS)
    with_metadata = df.withColumn("kafka_metadata", metadata_column)

    exploded = explode_json_column(
        with_metadata, column="value", json_schema=self.value_schema
    )

    output_columns = [field.name for field in self.value_schema]
    output_columns = output_columns + ["kafka_metadata"]
    return exploded.select(output_columns)
def df_add_cols_literal(self, df: DataFrame, cols: list):
    """Return ``df`` plus a null literal column for every name in ``cols`` it lacks.

    Existing columns are kept as they are (``"*"``); each missing name is
    added once, in the order it first appears in ``cols``.

    :param df: input data frame
    :param cols: column names that must exist in the result
    :return: data frame with the missing columns appended as ``lit(None)``
    """
    present = set(df.columns)
    # dict.fromkeys removes duplicates while keeping first-seen order; the
    # previous list(set(...)) produced an arbitrary column order that could
    # differ between runs.
    missing = list(dict.fromkeys(
        name for name in cols if name not in present))
    null_exprs = [lit(None).alias(name) for name in missing]
    return df.select(["*"] + null_exprs)
def json_flatten(self, df: DataFrame):
    """Select the expressions produced by ``on_flatten`` for the schema of
    ``df`` and pass the result through ``concat_similar_columns``."""
    flattened = df.select(self.on_flatten(df.schema))
    return self.concat_similar_columns(df=flattened)
def remove_illegal_chars(self, dataframe: DataFrame, source_column: str,
                         target_column: str):
    """Translate the characters in ``self.chars`` out of ``source_column``.

    :param dataframe: input data frame; must contain an ``id`` column
    :param source_column: name of the column to clean
    :param target_column: name given to the cleaned column
    :return: data frame with the columns ``id`` and ``target_column``
    """
    # The original passed the f-string '["".join({self.chars})]' to
    # translate(), so the matching set was the literal text of that
    # expression (brackets, quotes, the letters of "join", ...) rather than
    # the configured characters.
    # assumes self.chars is an iterable of single-character strings - TODO confirm
    illegal_chars = "".join(self.chars)
    cleaned = translate(col(source_column), illegal_chars,
                        self.replacament).alias(target_column)
    # Select the requested target column; the original selected the
    # hard-coded name 'string_filtered', which failed for any other target.
    return dataframe.select(col('id'), cleaned)
def _unpack_struct(self, df: DataFrame, col_name):
    """Promote every field of the struct column ``col_name`` to a top-level
    column named ``<col_name>_<field>``, drop the struct column, and hand the
    result to ``unpack_nested``."""
    nested_names = df.select(col_name + '.*').columns
    for nested in nested_names:
        flat_name = f'{col_name}_{nested}'
        df = df.withColumn(flat_name, df[col_name][nested])
    return self.unpack_nested(df.drop(col_name))
def process_data(df: DataFrame, ml_model: PipelineModel = model) -> DataFrame:
    """Run the conversion steps and ``ml_model`` over ``df`` and return the
    prediction columns together with ``match_seq_num``."""
    model_input = convert_heroes_to_lineup(convert_types_for_ml(df))
    scored = ml_model.transform(model_input)
    output = convert_types_for_es(scored)
    return output.select(
        "probability_arr", "radiant_win_prediction", "match_seq_num")
def mass_regex_replace(self, df: DataFrame, pattern_replacement: list):
    """Apply one ``regexp_replace`` to every column of ``df``.

    :param df: input data frame
    :param pattern_replacement: sequence whose item 0 is the regex pattern
        and item 1 the replacement
    :return: data frame with the same column names, each passed through
        ``regexp_replace``
    """
    regex = pattern_replacement[0]
    substitute = pattern_replacement[1]

    replaced_columns = []
    for name in df.schema.fieldNames():
        replaced = regexp_replace(col(name), regex, substitute)
        replaced_columns.append(replaced.alias(name))
    return df.select(replaced_columns)
def generate_idx_for_df(df: DataFrame, col_name: str, col_schema):
    """Convert the ``col_name`` column to a map with ``udf_array_to_map`` and
    explode it into ``item_id`` / ``answer`` columns next to the fixed
    identifying columns."""
    map_type = MapType(IntegerType(), col_schema, True)
    to_indexed_map = udf(lambda items: udf_array_to_map(items), map_type)

    with_map = df.withColumn("map", to_indexed_map(col(col_name)))
    # Exploding a map yields two columns: key and value.
    exploded_entries = explode("map").alias("item_id", "answer")
    return with_map.select("problem_type", "user_id", "oms_protected",
                           "problem_id", "create_at", exploded_entries)
def unpack_nested(self, dataframe: DataFrame):
    """Select every column of ``dataframe``, replacing array and struct
    columns with an ``explode`` of that column aliased ``'int_array'``."""
    # NOTE(review): every nested column receives the same alias, and struct
    # columns are passed to explode() as well - confirm both are intended.
    nested_types = (ArrayType, StructType)
    columns_to_select = [
        explode(field.name).alias('int_array')
        if type(field.dataType) in nested_types else col(field.name)
        for field in dataframe.schema.fields
    ]
    return dataframe.select(*columns_to_select)
def unpack_df_col(
        df: sparkDataFrame,
        col_name: str,
) -> List:
    """Collect the values of column ``col_name`` into a Python list."""
    # The column is renamed to a fixed name before it is selected and read
    # back from the collected rows.
    temp_name = 'col_to_extract'
    renamed = df.withColumnRenamed(col_name, temp_name)

    values = []
    for row in renamed.select(temp_name).collect():
        values.append(row[temp_name])
    return values
def flatten_schema(df: DataFrame):
    """
    Select the column mapping produced by ``flat_schema_mapping``.

    :param df: Spark dataframe
    :return: Spark dataframe with flat schema
    """
    return df.select(*flat_schema_mapping(df))
def convert_columns_and_save(df: DataFrame):
    """Cast the date and timestamp columns and rename ``ReportAsOfEOD``.

    Columns whose name contains "Date" but not "Till", plus "ReportAsOfEOD",
    are cast to ``date``; three fixed columns are parsed with
    ``to_timestamp``; "ReportAsOfEOD" is renamed to "LoanReportAsOfEOD".
    Despite the name, nothing is written inside this function - it returns
    the converted data frame.
    """
    date_names = {
        name for name in df.columns if "Date" in name and "Till" not in name
    }
    date_names.add("ReportAsOfEOD")

    projected = []
    for name in df.columns:
        column = f.col(name)
        if name in date_names:
            column = column.cast("date").alias(name)
        projected.append(column)
    converted = df.select(*projected)

    for ts_name in ("ListedOnUTC", "BiddingStartedOn", "StageActiveSince"):
        converted = converted.withColumn(ts_name, f.to_timestamp(ts_name))

    return converted.withColumnRenamed("ReportAsOfEOD", "LoanReportAsOfEOD")
def transform_parking_violation_data(df: DataFrame,
                                     column: str = "Violation County"
                                     ) -> DataFrame:
    """Transform parking violation data to make it joinable.

    High-level steps:
    1. Count distinct summonses per county, house number, street name and
       issue year (``total_cnt``).
    2. Add BOROCODE, derived from the county abbreviations in ``column``
       (0 when no abbreviation matches).
    3. Remove rows having no house number.
    4. Convert the house number when it is separated by '-': the part before
       the dash as int plus the part after the dash divided by 1000.
    5. Convert 'Street Name' to upper case.

    NOTE(review): the grouping always uses "Violation County" while BOROCODE
    reads ``column``; confirm that other values of ``column`` are supported.
    """
    key_cols = ("Violation County", "House Number", "Street Name", "year")

    # Borough code -> county abbreviations mapped to it, in evaluation order.
    borough_aliases = (
        (1, ["MAN", "MH", "MN", "NEWY", "NEW Y", "NY"]),
        (2, ["BRONX", "BX"]),
        (3, ["BK", "K", "KING", "KINGS"]),
        (4, ["Q", "QN", "QNS", "QU", "QUEEN"]),
        (5, ["R", "RICHMOND"]),
    )
    boro_code = None
    for code, aliases in borough_aliases:
        is_borough = F.col(column).isin(aliases)
        if boro_code is None:
            boro_code = F.when(is_borough, code)
        else:
            boro_code = boro_code.when(is_borough, code)
    boro_code = boro_code.otherwise(0)

    summonses = df.select("Violation County", "House Number", "Street Name",
                          "Summons Number", "Issue Date").distinct()
    summonses = summonses.withColumn(
        "year", F.year(F.to_date(F.col("Issue Date"), "MM/dd/yyyy")))

    counted = summonses.orderBy(*key_cols).coalesce(100).groupBy(*key_cols)
    counted = counted.agg({"Summons Number": "count"})
    counted = counted.withColumnRenamed("count(Summons Number)", "total_cnt")
    counted = counted.withColumn("BOROCODE", boro_code)

    # "temp" first holds the dash-separated parts of the house number and is
    # afterwards overwritten with the integer part only.
    parts = F.col("temp")
    whole_part = parts.getItem(0).cast("int")
    dash_part = F.when(parts.getItem(1).isNull(),
                       "0").otherwise(parts.getItem(1)).cast("int")

    result = counted.filter(F.col("House Number").isNotNull())
    result = result.withColumn("temp", F.split("House Number", "-"))
    result = result.withColumn("House Number", whole_part + dash_part / 1000)
    result = result.withColumn("temp", parts.getItem(0).cast("int"))
    result = result.withColumn("Street Name", F.upper(F.col("Street Name")))
    return result
def __validate_col(
    self,
    df: DataFrame,
    partition_cols: List[str],
    target_cols: List[str],
    ts_col: str,
):
    """
    Validate that the partition, target and timestamp columns exist, that
    every target column has a supported type and that the timestamp column
    is of timestamp type.

    :param df: DataFrame to be validated
    :param partition_cols: Partition columns to be validated
    :param target_cols: Target columns to be validated
    :param ts_col: Timestamp column to be validated
    :raises ValueError: if a column is missing or has an unsupported type
    """
    # Test membership against the list of names. The original tested against
    # str(df.columns), a substring match on the list's repr, so any partial
    # column name (e.g. "a" for a column "abc") passed validation.
    existing_cols = df.columns

    for column in partition_cols:
        if column not in existing_cols:
            raise ValueError(
                f"Partition Column: '{column}' does not exist in DataFrame."
            )

    for column in target_cols:
        if column not in existing_cols:
            raise ValueError(
                f"Target Column: '{column}' does not exist in DataFrame.")
        if df.select(
                column).dtypes[0][1] not in supported_target_col_types:
            raise ValueError(
                f"Target Column needs to be one of the following types: {supported_target_col_types}"
            )

    if ts_col not in existing_cols:
        raise ValueError(
            f"Timestamp Column: '{ts_col}' does not exist in DataFrame.")
    if df.select(ts_col).dtypes[0][1] != "timestamp":
        raise ValueError(
            "Timestamp Column needs to be of timestamp type.")
def json_transform(dataframe: DataFrame) -> DataFrame:
    """Serialize every row of the DataFrame to a JSON string.

    All columns are packed into one struct which is converted with
    ``to_json``; the result is a single column named "value".

    Args:
        dataframe: Spark DataFrame.

    Returns:
        Converted dataframe.

    """
    every_column = [dataframe[name] for name in dataframe.columns]
    json_value = to_json(struct(every_column))  # type: ignore
    return dataframe.select(json_value.alias("value"))
def orbat_normalize(self, df: DataFrame, mapping_sheet: str):
    """Serialize array/struct/map columns to JSON strings, then apply the
    column mapping parsed from ``mapping_sheet`` via
    ``transformExtension.coalesce_map_col``."""
    mapping_dict = self.transformExtension.parse_mapping_sheet(
        mapping_str=mapping_sheet)

    complex_types = (ArrayType, StructType, MapType)
    normalized_columns = [
        (to_json(col(field.name)) if isinstance(field.dataType, complex_types)
         else col(field.name)).alias(field.name)
        for field in df.schema.fields
    ]

    return self.transformExtension.coalesce_map_col(
        df=df.select(normalized_columns), mapping_dict=mapping_dict)
def _transform(self, df: DataFrame) -> DataFrame:
    """Compute value counts for columns of the configured view.

    For each analyzed column the values are grouped, counted and ordered by
    descending count. The result is written as a single CSV under the output
    folder (when one is set) and/or registered as a temp view named
    ``<prefix>_<column>`` (when a prefix is set).

    Returns the data frame of the view, not the analysis results.
    """
    view: str = self.getView()
    analysis_views_prefix: Optional[str] = self.getAnalysisViewsPrefix()
    output_folder: Optional[Union[Path, str]] = self.getOutputFolder()
    columns_to_analyze: Optional[List[str]] = self.getColumnsToAnalyze()
    columns_to_skip: Optional[List[str]] = self.getColumnsToSkip()
    progress_logger: Optional[ProgressLogger] = self.getProgressLogger()

    # get columns in data frame
    df = df.sql_ctx.table(view)

    # Keep the data frame's column order; restrict to the requested columns
    # only when a selection was configured.
    selected: List[str] = (
        [c for c in df.columns if c in columns_to_analyze]
        if columns_to_analyze else df.columns)
    assert selected

    if columns_to_skip:
        selected = [c for c in selected if c not in columns_to_skip]

    for column_name in selected:
        value_counts: DataFrame = df.select(column_name).groupBy(
            column_name).count()
        result_df: DataFrame = value_counts.orderBy(col("count").desc())

        if output_folder:
            target_path: str = str(
                os.path.join(str(output_folder), column_name))
            if progress_logger:
                progress_logger.write_to_log(
                    f"Writing analysis for column {column_name} to {target_path}"
                )
            result_df.coalesce(1).write.csv(target_path,
                                            header=True,
                                            mode="overwrite")

        if analysis_views_prefix:
            # Insert exactly one underscore between prefix and column name.
            separator = "" if analysis_views_prefix.endswith("_") else "_"
            result_df.createOrReplaceTempView(
                f"{analysis_views_prefix}{separator}{column_name}")

    return df
def transform_read_centerline_data(df: DataFrame) -> DataFrame:
    """Transform centerline data to make it joinable.

    High-level steps:
    1. Convert ST_NAME & FULL_STREE to upper case.
    2. Keep only rows having a house number in L_LOW_HN or R_LOW_HN.
    3. Convert L_LOW_HN, L_HIGH_HN, R_LOW_HN & R_HIGH_HN when separated by
       '-': the part before the dash as int plus the part after the dash
       divided by 1000.

    The helper column L_TEMP_ODD is left in the result and holds the split
    of the last converted column (R_HIGH_HN).
    """
    wanted = ("PHYSICALID", "BOROCODE", "FULL_STREE", "ST_NAME", "L_LOW_HN",
              "L_HIGH_HN", "R_LOW_HN", "R_HIGH_HN")
    df = df.select(*wanted).orderBy(*wanted).coalesce(200)

    for street_col in ("ST_NAME", "FULL_STREE"):
        df = df.withColumn(street_col, F.upper(F.col(street_col)))

    df = df.filter(
        F.col("L_LOW_HN").isNotNull() | F.col("R_LOW_HN").isNotNull())

    # The same conversion is applied to each house-number column in turn,
    # reusing L_TEMP_ODD as the scratch column for the split parts.
    parts = F.col("L_TEMP_ODD")
    for house_col in ("L_LOW_HN", "L_HIGH_HN", "R_LOW_HN", "R_HIGH_HN"):
        whole_part = parts.getItem(0).cast("int")
        dash_part = F.when(parts.getItem(1).isNull(),
                           "0").otherwise(parts.getItem(1)).cast("int")
        df = df.withColumn("L_TEMP_ODD", F.split(house_col, "-"))
        df = df.withColumn(house_col, whole_part + dash_part / 1000)

    return df
def _truth_space_table_old(
    df_labels_with_splink_scores: DataFrame,
    spark: SparkSession,
    threshold_actual: float = 0.5,
    score_colname: str = None,
):
    """Create a table of the ROC space i.e. truth table statistics
    for each discrimination threshold

    Args:
        df_labels_with_splink_scores (DataFrame): A dataframe of labels and
            associated splink scores, usually the output of the
            truth.labels_with_splink_scores function
        spark (SparkSession): Session passed on to the helper functions
        threshold_actual (float, optional): Threshold to use in categorising
            clerical match scores into match or no match. Defaults to 0.5.
        score_colname (str, optional): Allows user to explicitly state the
            column name in the Splink dataset containing the Splink score.
            If none will be inferred

    Returns:
        DataFrame: Table of 'truth space' i.e. truth categories for each
            threshold level
    """
    # The input is reused for every threshold below, so keep it cached.
    df_labels_with_splink_scores.persist()

    score_colname = _get_score_colname(df_labels_with_splink_scores,
                                       score_colname)

    # Candidate thresholds: the percentiles of the distinct scores plus 1.01.
    quantile_probs = [pct / 100 for pct in range(101)]
    distinct_scores = df_labels_with_splink_scores.select(
        score_colname).distinct()
    cutoffs = distinct_scores.stat.approxQuantile(score_colname,
                                                  quantile_probs, 0.0)
    cutoffs = sorted({*cutoffs, 1.01})

    roc_rows = [
        _summarise_truth_cats(
            df_e_with_truth_categories(df_labels_with_splink_scores, cutoff,
                                       spark, threshold_actual,
                                       score_colname), spark)
        for cutoff in cutoffs
    ]
    return reduce(DataFrame.unionAll, roc_rows)
def naaccr_read_fwf(flat_file: DataFrame,
                    record_layout: DataFrame,
                    value_col: str = 'value',
                    exclude_pfx: str = 'reserved') -> DataFrame:
    """
    Cut a fixed-width text column into one column per record-layout item.

    @param flat_file: as from spark.read.text()
                      typically with .value
    @param record_layout: as from http://datadictionary.naaccr.org/?c=7
                          with .start, .length, .xmlId
    @param value_col: column of flat_file holding the fixed-width line
    @param exclude_pfx: layout items whose xmlId starts with this prefix
                        are skipped
    """
    line = flat_file[value_col]
    fields = []  # type: List[Union[Column, str]]
    # The layout is collected to the driver; each kept item becomes one
    # substring column named after its xmlId.
    for item in record_layout.collect():
        if item.xmlId.startswith(exclude_pfx):
            continue
        fields.append(
            func.substring(line, item.start, item.length).alias(item.xmlId))
    return flat_file.select(fields)
def countNullsAcrossAllColumns(df: DataFrame) -> int:
    """Given a spark dataframe count the total number of cells with a null in them

    Args:
        df (DataFrame): [Spark dataframe]

    Returns:
        int: [Total number of cells with a null value]; 0 when the dataframe
            has no columns
    """
    # https://www.datasciencemadesimple.com/count-of-missing-nanna-and-null-values-in-pyspark/
    from pyspark.sql.functions import isnull, when, count

    if not df.columns:
        return 0

    # One aggregated row holding the null count of every column.
    null_counts = df.select(
        [count(when(isnull(c), c)).alias(c) for c in df.columns]).collect()[0]

    # Add the per-column counts on the driver. The original concatenated the
    # raw column names into a SQL expression ("a+b+... as TOTAL"), which
    # fails to parse for names that are not plain identifiers (spaces,
    # hyphens, reserved words).
    return sum(null_counts)
def countWSAcrossAllStringColumns(df: DataFrame) -> int:
    """Given a spark dataframe count the Total number of cells with a BLANK or all SPACES value

    Args:
        df (DataFrame): [Spark dataframe]

    Returns:
        int: [Total number of cells with a BLANK or all SPACES value]; 0 when
            the dataframe has no string columns
    """
    from pyspark.sql.functions import col, when, count, trim

    string_cols = [name for (name, dtype) in df.dtypes if dtype == "string"]
    if not string_cols:
        # Nothing to inspect; the original built the invalid expression
        # " as TOTAL" in this case.
        return 0

    # One aggregated row holding the blank count of every string column.
    blank_counts = df.select(
        [count(when(trim(col(c)) == "", True)).alias(c)
         for c in string_cols]).collect()[0]

    # Add the counts on the driver instead of concatenating raw column names
    # into a SQL expression, which breaks for names that are not plain
    # identifiers (spaces, hyphens, reserved words).
    return sum(blank_counts)
def _traverse_graph_spark_dataframe(
    df: DataFrame, root_node: T, graph: nx.DiGraph
) -> Tuple[DataFrame, Dict[str, List[T]], Dict[str, dict]]:
    """Traverse ``graph`` from ``root_node`` once per column of ``df``.

    Returns the unchanged data frame together with the inferred path and the
    inferred state of every column, keyed by column name. The series returned
    by each traversal is not used.
    """
    # Run all traversals first, then unpack their results.
    traversals = {
        name: traverse_graph_with_series(root_node, df.select(name), graph)
        for name in df.columns
    }

    inferred_paths: Dict[str, List[T]] = {}
    inferred_states: Dict[str, dict] = {}
    for name, (_series, path, state) in traversals.items():
        assert isinstance(path, list)  # Placate the MyPy Gods
        inferred_paths[name] = path
        inferred_states[name] = state

    # note inference disabled, return df
    return df, inferred_paths, inferred_states
def concat_similar_columns(self, df: DataFrame):
    """
    Concatenate columns whose names are equal once '-' is replaced by '_'.

    The columns of each such group are joined with ' | ' and the result is
    named after the first column of the group. Groups are emitted in sorted
    order of the normalized name.

    :param df: input data frame
    :return: data frame with one column per group
    """
    # Group the field names by their normalized form with a plain dict.
    # The original used a pandas groupby(..., as_index=False)[...].apply(list)
    # followed by .to_list(); the type returned by that chain depends on the
    # pandas version, so the call could fail.
    grouped = {}
    for name in df.schema.fieldNames():
        grouped.setdefault(name.replace('-', '_'), []).append(name)

    SEP = ' | '
    select_exprs = []
    # sorted() reproduces the key ordering of the default pandas groupby.
    for key in sorted(grouped):
        members = grouped[key]
        merged = concat_ws(SEP, *[col(member) for member in members])
        select_exprs.append(merged.alias(members[0]))
    return df.select(select_exprs)
def __check_labels(sdf: SparkDataFrame, column_name: str,
                   label1: Optional[str], label2: Optional[str]) -> bool:
    r"""
    checks whether labels meet required conditions for method `execute`.
    Called only when category = 'boolean'

    Args:
        sdf:            Spark DataFrame object
        column_name:    specific column which is to be executed using 'Binary' mechanism
        label1:         label to be used by 'Binary' mechanism
        label2:         label to be used by 'Binary' mechanism

    Returns:
        True if parameters satisfy the conditions

    Raises:
        TypeError:  if either label is not a string
        ValueError: if a label is empty, both labels are equal, or the
                    distinct values of the column are not exactly the two labels
    """
    if not isinstance(label1, str) or not isinstance(label2, str):
        raise TypeError("Labels must be strings.")

    if len(label1) == 0 or len(label2) == 0:
        raise ValueError("Labels must be non-empty strings")

    if label1 == label2:
        raise ValueError("Labels must not match")

    # finds unique values in a column
    labels: List[str] = [
        row[column_name]
        for row in sdf.select(column_name).distinct().collect()
    ]

    # checks whether all the rows of column have either label1 or label2.
    # Compared by value: the original `is not 2` tested object identity.
    if len(labels) != 2 or label1 not in labels or label2 not in labels:
        raise ValueError(
            "Labels in column `%s` does not match with labels entered" %
            column_name)

    return True
def append(self, df: DataFrame, full_table_name: str, schema: StructType, options: dict):
    """Insert ``df`` into ``full_table_name`` without overwriting, using the
    given writer ``options``."""
    # insertInto() requires dataframe columns order to match schema columns order
    ordered_names = [field.name for field in schema.fields]
    writer = df.select(ordered_names).write.options(**options)
    writer.insertInto(full_table_name, overwrite=False)
def _get_approx_distinct_count_for_col(self,
                                       df: DataFrame,
                                       _dcol: str,
                                       _rsd=0.05):
    """Return the approximate number of distinct values in column ``_dcol``.

    :param df: data frame to inspect
    :param _dcol: name of the column
    :param _rsd: passed as ``rsd`` to ``approx_count_distinct``
    :return: first value of the single aggregated row
    """
    distinct_expr = F.approx_count_distinct(col(_dcol), rsd=_rsd)
    first_values = df.select(distinct_expr) \
        .rdd.map(lambda record: record[0]).collect()
    return first_values[0]