def calculate_geometric_mean(
    df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """Attach per-(month, SubCategory) geometric means of quantity and net sale.

    The geometric mean is computed as ``exp(avg(log(x)))`` within each
    ``(month, SubCategory)`` group. Two columns are added to the input:

    * ``Qty_GeoMean_by_month_Subcat`` -- from ``totalMonthlyQtySold``
    * ``NS_GeoMean_by_month_Subcat`` -- from ``totalMonthlyNetSale``

    Args:
        df: Input DataFrame. It must contain the columns ``month``,
            ``SubCategory``, ``totalMonthlyQtySold`` and
            ``totalMonthlyNetSale``.

    Returns:
        ``df`` inner-joined with the per-group aggregates on
        ``month`` and ``SubCategory``.
    """
    group_keys = ['month', 'SubCategory']

    # Name the aggregates with alias() instead of renaming Spark's
    # auto-generated column name afterwards: withColumnRenamed() silently
    # does nothing when the generated name does not match exactly, which
    # would leave the output without the expected column names.
    # NOTE(review): log() of a non-positive value presumably yields NULL,
    # which avg() then skips -- confirm that is acceptable for this data.
    geo_means = df.groupBy(*group_keys).agg(
        exp(avg(log(col('totalMonthlyQtySold')))).alias(
            'Qty_GeoMean_by_month_Subcat'),
        exp(avg(log(col('totalMonthlyNetSale')))).alias(
            'NS_GeoMean_by_month_Subcat'),
    )

    # Both aggregates share the same grouping keys, so a single join adds
    # both columns to the original dataset.
    # NOTE(review): an inner equi-join presumably drops rows whose month or
    # SubCategory is NULL -- confirm the row count is expected to be preserved.
    return df.join(geo_means, on=group_keys, how='inner')
def remove_no_stock_item(
    df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """Keep only rows whose (SKU, Store) pair has a non-zero total StockQty.

    ``StockQty`` is summed per ``(SKU, Store)`` group; pairs whose sum passes
    the ``!= 0`` filter are inner-joined back onto ``df``.

    Args:
        df: Input DataFrame containing ``SKU``, ``Store`` and ``StockQty``.

    Returns:
        The rows of ``df`` belonging to the retained ``(SKU, Store)`` pairs,
        with ``SKU`` and ``Store`` as the leading columns.
    """
    pair_keys = ["SKU", "Store"]

    # Total stock per (SKU, Store); Spark names the result 'sum(StockQty)'.
    stock_totals = df.groupBy(*pair_keys).agg({"StockQty": "sum"})

    # Retain just the key columns of the pairs with a non-zero total.
    stocked_pairs = stock_totals.where(
        col('sum(StockQty)') != 0
    ).select(*pair_keys)

    # Restrict the original rows to the retained pairs.
    return stocked_pairs.join(df, on=pair_keys, how='inner')