from pyspark.sql import DataFrame
from pyspark.sql import functions as f


def most_valuable_users(df: DataFrame) -> DataFrame:
    """Rank users by total interest repaid, along with loan counts and late fees."""
    return (
        df.groupBy("UserName")
        .agg(
            f.countDistinct("LoanID").alias("Loans"),
            f.sum("InterestRepayment").alias("TotalInterestRepayment"),
            f.sum("LateFeesRepayment").alias("TotalLateFeesRepayment"),
        )
        .orderBy("TotalInterestRepayment", ascending=False)
    )
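
# A minimal usage sketch. The sample rows and values below are illustrative,
# not from the source; the function only assumes the four columns it reads.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
repayments = spark.createDataFrame(
    [
        ("alice", "L1", 120.0, 0.0),
        ("alice", "L2", 80.0, 5.0),
        ("bob", "L3", 60.0, 15.0),
    ],
    ["UserName", "LoanID", "InterestRepayment", "LateFeesRepayment"],
)
most_valuable_users(repayments).show()
# alice ranks first (Loans=2, TotalInterestRepayment=200.0), then bob (60.0).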

from pyspark.sql import DataFrame as SparkDataFrame  # presumably aliased this way upstream
from pyspark.sql.functions import lower


def transform_temperature(df: SparkDataFrame) -> SparkDataFrame:
    """Transform temperature data.

    Aggregate the data to obtain the average temperature at country level,
    and add a lower-cased country name column for joining purposes.

    :param df: the temperature data frame to be transformed
    :return: the transformed data frame
    """
    df = df.groupBy(['country']).agg({
        'average_temperature': 'avg',
        'latitude': 'first',
        'longitude': 'first'
    })
    # Dict-style agg produces names like "avg(average_temperature)"; restore clean names.
    df = rename_column_name(df, [
        ('avg(average_temperature)', 'average_temperature'),
        ('first(latitude)', 'latitude'),
        ('first(longitude)', 'longitude'),
    ])
    df = df.withColumn('lower_name', lower(df['country']))
    return df
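
# rename_column_name is called here and in transform_demographics below but is
# not defined in this section. A minimal sketch of what it presumably does,
# assuming it takes a list of (old_name, new_name) pairs:
from typing import List, Tuple


def rename_column_name(df: SparkDataFrame,
                       mapping: List[Tuple[str, str]]) -> SparkDataFrame:
    # withColumnRenamed is a no-op when the old name is absent, so entries
    # for columns missing from a given frame do not raise.
    for old_name, new_name in mapping:
        df = df.withColumnRenamed(old_name, new_name)
    return df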

from typing import List

from pyspark.sql import DataFrame


def append_features(self, df: DataFrame, groupBy_cols,
                    feature_sets: List[FeatureSet],
                    withTrendsForFeatures: List[FeatureSet] = None):
    """
    Append features to the incoming df.

    The feature columns and groupBy cols are deduped and validated. If there
    is a group by, the groupBy cols are applied before the aggregate features
    are appended; non-aggregate features are appended to df directly.

    :param df: data frame to append features to
    :param groupBy_cols: columns to group by; normalized to a list if needed
    :param feature_sets: list of FeatureSet whose features will be appended
    :param withTrendsForFeatures: optional FeatureSets to compute trends for
    :return: the data frame with features appended
    """
    # Normalize the groupBy columns and split out any joiner mappings.
    groupBy_cols, groupBy_joiners = self.helpers._extract_groupby_joiner(groupBy_cols)
    features, dups = self.helpers._dedup_fast(df, [
        feature
        for feature_set in feature_sets
        for feature in feature_set.features.values()
    ])
    df = self.helpers._resolve_feature_joiners(
        df, features, groupBy_joiners).repartition(*groupBy_cols)

    agg_cols = []
    non_agg_cols = {}
    features_to_drop = []

    for feature in features:
        # A feature must either carry both aggs and groupBy columns, or have
        # no aggregation function at all.
        assert (len(feature.aggs) > 0 and len(groupBy_cols) > 0) \
            or feature.agg_func is None, \
            "{} has either aggs or groupBys but not both; " \
            "ensure both are present".format(feature.name)
        agg_cols += [agg_col for agg_col in feature.aggs]
        if feature.agg_func is None:
            non_agg_cols[feature.output_alias] = feature.assembled_column
        else:
            df = df.withColumn(feature.output_alias, feature.assembled_column)
        if feature.is_temporary:
            features_to_drop.append(feature.name)

    if len(groupBy_cols) > 0:
        df = df.groupBy(*groupBy_cols).agg(*agg_cols)
        # Non-aggregate (derived) features are layered on the grouped result.
        for output_alias, assembled_column in non_agg_cols.items():
            df = df.withColumn(output_alias, assembled_column)
    final_df = df.drop(*features_to_drop)
    return final_df
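
# Stripped of the FeatureSet machinery, the core pattern above is: collect
# aggregate expressions, apply groupBy().agg(), then layer derived (non-agg)
# columns on top of the grouped result. A standalone sketch with hypothetical
# column names, not taken from the source:
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sales = spark.createDataFrame(
    [("US", 10.0), ("US", 20.0), ("EU", 5.0)],
    ["region", "amount"],
)
agg_cols = [F.sum("amount").alias("total_amount"), F.count("*").alias("n_rows")]
grouped = sales.groupBy("region").agg(*agg_cols)
# Equivalent of a feature with agg_func None: computed after the aggregation.
grouped.withColumn("avg_amount", F.col("total_amount") / F.col("n_rows")).show()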

import warnings
from typing import List

from pyspark.sql import DataFrame
from pyspark.sql.functions import expr, max, min, percentile_approx, sum

# checkAllowableFreq, freq_dict, and ResampleWarning come from the surrounding
# module and are not shown in this section.


def calculate_time_horizon(df: DataFrame, ts_col: str, freq: str,
                           partition_cols: List[str]):
    # Convert the frequency using the resample dictionary
    parsed_freq = checkAllowableFreq(freq)
    freq = f"{parsed_freq[0]} {freq_dict[parsed_freq[1]]}"

    # Get max and min timestamp per partition
    partitioned_df: DataFrame = df.groupBy(*partition_cols).agg(
        max(ts_col).alias("max_ts"),
        min(ts_col).alias("min_ts"),
    )

    # Generate upsample metrics: floor each partition's min/max epoch to an
    # interval boundary, then count the interval steps between them.
    normalized_time_df: DataFrame = (
        partitioned_df
        .withColumn("min_epoch_ms", expr("unix_millis(min_ts)"))
        .withColumn("max_epoch_ms", expr("unix_millis(max_ts)"))
        .withColumn(
            "interval_ms",
            expr(
                f"unix_millis(cast('1970-01-01 00:00:00.000+0000' as TIMESTAMP) "
                f"+ INTERVAL {freq})"
            ),
        )
        .withColumn("rounded_min_epoch",
                    expr("min_epoch_ms - (min_epoch_ms % interval_ms)"))
        .withColumn("rounded_max_epoch",
                    expr("max_epoch_ms - (max_epoch_ms % interval_ms)"))
        .withColumn("diff_ms", expr("rounded_max_epoch - rounded_min_epoch"))
        .withColumn("num_values", expr("(diff_ms / interval_ms) + 1"))
    )

    (
        min_ts,
        max_ts,
        min_value_partition,
        max_value_partition,
        p25_value_partition,
        p50_value_partition,
        p75_value_partition,
        total_values,
    ) = normalized_time_df.select(
        min("min_ts"),
        max("max_ts"),
        min("num_values"),
        max("num_values"),
        percentile_approx("num_values", 0.25),
        percentile_approx("num_values", 0.5),
        percentile_approx("num_values", 0.75),
        sum("num_values"),
    ).first()

    warnings.simplefilter("always", ResampleWarning)
    warnings.warn(
        f"""
        Resample Metrics Warning:
            Earliest Timestamp: {min_ts}
            Latest Timestamp: {max_ts}
            No. of Unique Partitions: {normalized_time_df.count()}
            Resampled Min No. Values in a Single Partition: {min_value_partition}
            Resampled Max No. Values in a Single Partition: {max_value_partition}
            Resampled P25 No. Values in a Single Partition: {p25_value_partition}
            Resampled P50 No. Values in a Single Partition: {p50_value_partition}
            Resampled P75 No. Values in a Single Partition: {p75_value_partition}
            Resampled Total No. Values Across All Partitions: {total_values}
        """,
        ResampleWarning,
    )
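
# To see what num_values measures, here is the same rounding arithmetic in
# plain Python for a hypothetical 5-minute frequency with illustrative epoch
# values. Flooring both endpoints to interval boundaries makes the count the
# number of resample slots, inclusive of both ends:
interval_ms = 5 * 60 * 1000          # 5-minute resample interval, in ms
min_epoch_ms = 1_650_000_123_000     # illustrative per-partition extremes
max_epoch_ms = 1_650_001_987_000

rounded_min = min_epoch_ms - (min_epoch_ms % interval_ms)  # floor to boundary
rounded_max = max_epoch_ms - (max_epoch_ms % interval_ms)
num_values = (rounded_max - rounded_min) / interval_ms + 1
print(num_values)  # 7.0 resample slots between the rounded endpoints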

def transform_demographics(df: SparkDataFrame) -> SparkDataFrame:
    """Transform demographic data.

    Pivot the race values so each race becomes its own column, then aggregate
    the data to state level.

    :param df: demographics data frame to be transformed.
    :return: transformed demographics data frame.
    """
    # The first aggregation collapses the data to one row per city so the
    # race pivot can be joined in.
    aggregated = df.groupBy(['city', 'state', 'state_code']).agg({
        'median_age': 'first',
        'male_population': 'first',
        'female_population': 'first',
        'total_population': 'first',
        'number_of_veterans': 'first',
        'foreign_born': 'first',
        'average_household_size': 'first'
    })
    pivot = df.groupBy(['city', 'state', 'state_code']).pivot('race').sum('count')
    aggregated = aggregated.join(other=pivot,
                                 on=['city', 'state', 'state_code'],
                                 how='inner')
    aggregated = rename_column_name(
        aggregated,
        [
            ('first(median_age)', 'median_age'),
            ('first(male_population)', 'male_population'),
            ('first(female_population)', 'female_population'),
            ('first(total_population)', 'total_population'),
            ('first(number_of_veterans)', 'number_of_veterans'),
            ('first(foreign_born)', 'foreign_born'),
            ('first(average_household_size)', 'average_household_size'),
            ('Hispanic or Latino', 'hispanic_or_latino'),
            ('Black or African-American', 'black_or_african_american'),
            ('American Indian and Alaska Native', 'american_indian_and_alaska_native'),
            ('White', 'white'),
            ('Asian', 'asian')
        ]
    )
    # The second aggregation rolls the city-level rows up to state level.
    df = aggregated.groupBy(['state', 'state_code']).agg({
        'male_population': 'sum',
        'female_population': 'sum',
        'total_population': 'sum',
        'number_of_veterans': 'sum',
        'foreign_born': 'sum',
        'median_age': 'avg',
        'average_household_size': 'avg',
        'hispanic_or_latino': 'sum',
        'black_or_african_american': 'sum',
        'american_indian_and_alaska_native': 'sum',
        'white': 'sum',
        'asian': 'sum'
    })
    df = rename_column_name(
        df,
        [
            ('sum(male_population)', 'male_population'),
            ('sum(female_population)', 'female_population'),
            ('sum(total_population)', 'total_population'),
            ('sum(number_of_veterans)', 'number_of_veterans'),
            ('sum(foreign_born)', 'foreign_born'),
            ('avg(median_age)', 'median_age'),
            ('avg(average_household_size)', 'average_household_size'),
            ('sum(hispanic_or_latino)', 'hispanic_or_latino'),
            ('sum(black_or_african_american)', 'black_or_african_american'),
            ('sum(american_indian_and_alaska_native)', 'american_indian_and_alaska_native'),
            ('sum(white)', 'white'),
            ('sum(asian)', 'asian')
        ]
    )
    return df
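
# To see what the pivot step produces, a toy sketch (race labels and counts
# are illustrative): each distinct race value becomes a column, with null
# where a city has no row for that race.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [
        ("Dallas", "Texas", "TX", "White", 700000),
        ("Dallas", "Texas", "TX", "Asian", 50000),
        ("Austin", "Texas", "TX", "White", 600000),
    ],
    ["city", "state", "state_code", "race", "count"],
)
toy.groupBy(["city", "state", "state_code"]).pivot("race").sum("count").show()
# Roughly:
# |  city|state|state_code|Asian| White|
# |Dallas|Texas|        TX|50000|700000|
# |Austin|Texas|        TX| null|600000|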