예제 #1
0
def _weighted_entropy(
    countdf: pyspark.sql.dataframe.DataFrame, total_count: int, split_columns: Optional[List[str]], target_column_name: str, weighted: bool = True
) -> float:
    """Compute the entropy of the target column within each split group.

    ``countdf`` is aggregated by ``split_columns`` plus the target column,
    summing its ``count`` column. Within every distinct combination of the
    split columns, the share ``p`` of each target value is derived and the
    Shannon entropy ``-sum(p * log2(p))`` is computed.

    Args:
        countdf: DataFrame holding a ``count`` column together with the split
            columns and the target column.
        total_count: Denominator used to turn a split group's summed count
            into its weight (presumably the total number of observations
            behind ``countdf`` -- confirm with the caller).
        split_columns: Columns defining the split groups. ``None`` is treated
            like an empty list, i.e. the whole DataFrame forms one group.
        target_column_name: Column whose value distribution is measured.
        weighted: When true, return the sum of ``entropy * weight`` over the
            split groups; otherwise return the plain sum of the per-group
            entropies.

    Returns:
        The aggregated entropy as collected from Spark.
        NOTE(review): for an empty ``countdf`` the collected aggregate is
        presumably ``None`` rather than a float -- confirm if callers care.
    """
    # The annotation allows None, but slicing None would raise TypeError;
    # normalise to a fresh list so the caller's list is never mutated.
    split_cols = list(split_columns) if split_columns is not None else []
    group_cols = split_cols + [target_column_name]

    groupdf = countdf.groupby(group_cols).agg(F.sum("count").alias("group_count"))

    # p = share of each target value inside its split group.
    per_split = Window.partitionBy(split_cols)
    groupdf = groupdf.withColumn(
        "p", F.col("group_count") / F.sum(F.col("group_count")).over(per_split)
    )

    # The per-split weight is computed once here, as part of the aggregation;
    # no separate windowed "weight" column is needed on groupdf.
    entropydf = groupdf.groupby(split_cols).agg(
        (-F.sum(F.col("p") * F.log2("p"))).alias("entropy"),
        F.sum(F.col("group_count") / total_count).alias("weight"),
    )

    if weighted:
        result = entropydf.groupby().agg(F.sum(F.col("entropy") * F.col("weight"))).collect()[0][0]
    else:
        result = entropydf.groupby().sum("entropy").collect()[0][0]

    return result
예제 #2
0
def calculate_mean_std_and_geometric_mean(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    Attach per-(month, SubCategory) mean and standard deviation columns of
    totalMonthlyQtySold and totalMonthlyNetSale to the output of
    calculate_geometric_mean(df).

    The added columns are, in order:
    Qty_mean_by_month_Subcat, NS_mean_by_month_Subcat,
    Qty_std_by_month_Subcat, NS_std_by_month_Subcat.

    The group statistics are computed from ``df`` as passed in (before
    calculate_geometric_mean is applied) and inner-joined back on
    'month' and 'SubCategory'.
    """
    # Function-scope import so the block does not depend on how the rest of
    # the file imports pyspark.sql.functions.
    from pyspark.sql.functions import avg

    group_keys = ['month', 'SubCategory']

    # Name every aggregate explicitly with alias() instead of renaming Spark's
    # auto-generated column names: withColumnRenamed is a silent no-op when
    # the generated name differs from the expected one. A single agg() also
    # replaces three separate aggregations and joins.
    group_stats = df.groupby(*group_keys).agg(
        avg('totalMonthlyQtySold').alias("Qty_mean_by_month_Subcat"),
        avg('totalMonthlyNetSale').alias("NS_mean_by_month_Subcat"),
        stddev('totalMonthlyQtySold').alias("Qty_std_by_month_Subcat"),
        stddev('totalMonthlyNetSale').alias("NS_std_by_month_Subcat"),
    )

    df = calculate_geometric_mean(df)

    # join to get final dataset
    return df.join(group_stats, on=group_keys, how='inner')