def _weighted_entropy(
    countdf: pyspark.sql.dataframe.DataFrame,
    total_count: int,
    split_columns: Optional[List[str]],
    target_column_name: str,
    weighted: bool = True,
) -> float:
    """Compute the entropy of ``target_column_name`` within each split group.

    Args:
        countdf: DataFrame containing a ``count`` column plus the split and
            target columns.
        total_count: Total row count represented by ``countdf`` (the sum of
            its ``count`` column) — used to weight each group.
        split_columns: Columns that define the groups. ``None`` is treated
            the same as an empty list (single global group).
        target_column_name: Column whose value distribution the entropy is
            computed over inside each group.
        weighted: When True, each group's entropy is weighted by the group's
            share of ``total_count`` before summing; otherwise the per-group
            entropies are summed unweighted.

    Returns:
        The aggregated entropy as a float.
    """
    # The annotation allows None, but slicing (`split_columns[:]`) would
    # crash on it — normalize to a fresh list so the caller's list is
    # never mutated either.
    split_columns = list(split_columns) if split_columns else []
    split_columns_plus_target = split_columns + [target_column_name]

    # Collapse duplicate (split, target) rows into a single group count.
    groupdf = countdf.groupby(split_columns_plus_target).agg(
        F.sum("count").alias("group_count")
    )

    # p = within-split probability of each target value.
    w = Window.partitionBy(split_columns)
    groupdf = groupdf.withColumn(
        "p", F.col("group_count") / F.sum("group_count").over(w)
    )

    # Per-split Shannon entropy (base 2) and the split's share of the total.
    # NOTE: the original also built a window-summed "weight" column on
    # groupdf, but it was dead code — "weight" is (re)computed here.
    entropydf = groupdf.groupby(split_columns).agg(
        (-F.sum(F.col("p") * F.log2("p"))).alias("entropy"),
        (F.sum(F.col("group_count") / total_count)).alias("weight"),
    )

    if weighted:
        result = (
            entropydf.groupby()
            .agg(F.sum(F.col("entropy") * F.col("weight")))
            .collect()[0][0]
        )
    else:
        result = entropydf.groupby().sum("entropy").collect()[0][0]
    return result
def calculate_mean_std_and_geometric_mean(
    df: pyspark.sql.dataframe.DataFrame,
) -> pyspark.sql.dataframe.DataFrame:
    """Attach per-(month, SubCategory) statistics of qtySold and netSale.

    For every (month, SubCategory) pair, computes the mean and sample
    standard deviation of ``totalMonthlyQtySold`` and ``totalMonthlyNetSale``,
    adds the geometric-mean columns via ``calculate_geometric_mean``, and
    joins all of it back onto the input rows.

    Args:
        df: Input DataFrame with ``month``, ``SubCategory``,
            ``totalMonthlyQtySold`` and ``totalMonthlyNetSale`` columns.

    Returns:
        ``df`` extended with the geometric-mean columns and the four
        per-group statistic columns (inner-joined on month/SubCategory).
    """
    # Compute all four statistics in ONE aggregation instead of three
    # separate groupbys + three joins (same result, two fewer shuffles).
    # Explicit .alias() also avoids depending on Spark's auto-generated
    # column names like 'stddev_samp(totalMonthlyQtySold)'.
    # Stats are taken from the input df BEFORE calculate_geometric_mean,
    # matching the original evaluation order.
    group_stats = df.groupby('month', 'SubCategory').agg(
        F.avg('totalMonthlyQtySold').alias('Qty_mean_by_month_Subcat'),
        F.avg('totalMonthlyNetSale').alias('NS_mean_by_month_Subcat'),
        stddev('totalMonthlyQtySold').alias('Qty_std_by_month_Subcat'),
        stddev('totalMonthlyNetSale').alias('NS_std_by_month_Subcat'),
    )

    # Add geometric-mean columns (helper defined elsewhere in this module).
    df = calculate_geometric_mean(df)

    # Single inner join brings every per-group statistic onto each row.
    return df.join(group_stats, on=['month', 'SubCategory'], how='inner')