import os
import re
import socket
from typing import Dict, Tuple

import pandas as pd
import pyspark
import sh
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.functions import (col, lit, regexp_replace, split, stddev,
                                   to_timestamp, udf, when)
from pyspark.sql.types import FloatType, StringType


def calculate_Capacity_to_sales(
        df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
    """
    1. Capacity / Qty sold
    2. Capacity / NetSales
    """
    df = df.withColumn("Capacity_to_qty", (df.Capacity / df.totalMonthlyQtySold))
    df = df.withColumn("Capacity_to_sales", (df.Capacity / df.totalMonthlyNetSale))
    return df
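
# Usage sketch (hypothetical values; assumes an active SparkSession `spark`):
#
#   df = spark.createDataFrame(
#       [(120.0, 30.0, 1500.0)],
#       ["Capacity", "totalMonthlyQtySold", "totalMonthlyNetSale"])
#   calculate_Capacity_to_sales(df).show()
#   # Capacity_to_qty = 4.0, Capacity_to_sales = 0.08
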
def calculate_Depths(
        df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
    """
    Depth = Capacity / Facings
    ProposedDepth = 3 if Depth >= 4, otherwise null
    VarianceDepth = ProposedDepth - Depth (should be negative)
    """
    df = df.withColumn("Depth", (df.Capacity / df.Facings))
    # use a typed null instead of '' so the derived columns stay numeric
    df = df.withColumn("ProposedDepth",
                       when(col('Depth') >= 4, 3).otherwise(lit(None)))
    df = df.withColumn(
        "VarianceDepth",
        when(col('Depth') >= 4, (df.ProposedDepth - df.Depth)).otherwise(lit(None)))
    return df
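
# Usage sketch (hypothetical values): a SKU with Capacity=12 and Facings=2 has
# Depth=6.0, so ProposedDepth=3 and VarianceDepth=-3.0; with Facings=6 the
# Depth is 2.0 and both derived columns stay null.
#
#   df = spark.createDataFrame([(12.0, 2.0), (12.0, 6.0)], ["Capacity", "Facings"])
#   calculate_Depths(df).show()
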
def convertColumn(df: pyspark.sql.dataframe.DataFrame, names: list,
                  newType) -> pyspark.sql.dataframe.DataFrame:
    """A custom function to convert the data type of DataFrame columns."""
    for name in names:
        df = df.withColumn(name, df[name].cast(newType))
    return df
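
# Usage sketch: cast several string columns to FloatType in one call
# (hypothetical column names):
#
#   df = convertColumn(df, ['NetSales', 'QtySold'], FloatType())
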
def find_and_analysis_atLeastOneMonth_SKU(
    df: pyspark.sql.dataframe.DataFrame
) -> Tuple[pyspark.sql.dataframe.DataFrame, ...]:
    """
    For each SKU with "soldQty < capacity" in at least one month:
    1. Calculate the average of REAL NS and the standard deviation of avg NS;
       REAL NS means that if an item has 0 sales in one month, the average
       only considers the other 2 months
    2. Calculate Capacity_to_avg_qty and Facing_to_avg_qty

    Output: 4 datasets:
    1. df_atLeastOneMonth: the full dataset
    2. unchanged_SKU: SKUs whose Depth is unchanged
    3. changed_SKU: SKUs whose Depth changed
    4. df_full: the combination of unchanged_SKU and changed_SKU
    """
    ## Find SKUs with at least one such month
    df = df.withColumn(
        'qty_less_than_capacity',
        when((col("totalMonthlyQtySold") < col('Capacity')), 1).otherwise(0))
    # keep SKUs where qtySold < capacity in at least one month
    df_atLeastOneMonth = df.filter(df.qty_less_than_capacity == 1)

    ## Calculate the average of REAL NS:
    # group by each SKU, ignoring zero-sale months
    df_groupbySKU = df.filter(df.totalMonthlyNetSale != 0).groupBy(
        'MatID', "SubCategory", 'Vendor')

    ## get the average quantity sold of each product
    SKU_avg_Qty = df_groupbySKU.avg("totalMonthlyQtySold").withColumnRenamed(
        "avg(totalMonthlyQtySold)", "AvgQtySold")
    SKU_avg_std = df_groupbySKU.agg(stddev('totalMonthlyQtySold'))\
        .withColumnRenamed('stddev_samp(totalMonthlyQtySold)', "Qty_std_by_SKU")

    ## Join datasets
    df_1 = SKU_avg_Qty.join(df_atLeastOneMonth,
                            on=["MatID", 'SubCategory', 'Vendor'],
                            how="right")
    df_1 = df_1.join(SKU_avg_std,
                     on=["MatID", 'SubCategory', 'Vendor'],
                     how="left")
    df_1 = df_1.withColumn('Capacity_to_avg_qty',
                           (col('Capacity') / col("AvgQtySold")))
    df_1 = df_1.withColumn('Facing_to_avg_qty',
                           (col('Facings') / col("AvgQtySold")))
    # Calculate the ratio of the qty std to the average qty sold of each SKU;
    # a missing standard deviation means the SKU was sold in only one month
    df_1 = df_1.withColumn('StdQty_to_AvgQty',
                           (col('Qty_std_by_SKU') / col("AvgQtySold")))

    # `selected_column_atLeastOneMonth` is a module-level column list
    df_full = df_1.select(selected_column_atLeastOneMonth).dropDuplicates()

    # separate SKUs into 2 groups
    unchanged_SKU = df_full.filter(col('Depth') < 3)
    changed_SKU = df_full.filter(col('ProposedDepth') == 3)

    return df_atLeastOneMonth, unchanged_SKU, changed_SKU, df_full
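
# Usage sketch (assumes the monthly-aggregated frame already carries MatID,
# SubCategory, Vendor, Capacity, Facings, Depth, ProposedDepth and the
# totalMonthly* columns):
#
#   df_all, unchanged_SKU, changed_SKU, df_full = \
#       find_and_analysis_atLeastOneMonth_SKU(monthly_df)
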
def zip_explode_cols(df: pyspark.sql.dataframe.DataFrame,
                     cols: list,
                     result_name: str,
                     rename_fields: Dict[str, str] = None):
    """
    Explode multiple equally-sized arrays into one struct by zipping all
    arrays into one `ArrayType[StructType]`.

    Args:
        df: The input Spark DataFrame
        cols: The array columns that should be zipped
        result_name: The name of the column that will contain the newly created struct
        rename_fields: Dictionary mapping column names to new struct field names.
            Used to rename columns in the newly created struct.

    Returns:
        `df.withColumn(result_name, zip(explode(cols)))`
    """
    df = df.withColumn(result_name, F.explode(F.arrays_zip(*cols)))

    if rename_fields:
        # create the schema of the new struct by simply renaming the
        # top-level struct fields
        old_schema: T.StructType = df.schema[result_name].dataType

        # rename a field only if it is in `old_schema.fieldNames()`
        new_field_names = [
            rename_fields[field] if field in rename_fields else field
            for field in old_schema.fieldNames()
        ]
        new_schema = T.StructType([
            T.StructField(name, field.dataType)
            for name, field in zip(new_field_names, old_schema.fields)
        ])

        df = df.withColumn(result_name, F.col(result_name).cast(new_schema))

    # # old method using withColumn and a new struct; breaks with PySpark 3.0
    # df = df.withColumn(target_struct, F.struct(*[
    #     F.col(target_struct + "." + actualName).alias(targetName)
    #     for targetName, actualName in zip(target_colnames, df.schema[target_struct].dataType.fieldNames())
    # ]))

    return df
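
# Usage sketch: zip two parallel arrays and explode them into one struct
# column (hypothetical data):
#
#   df = spark.createDataFrame([([1, 2], ['a', 'b'])], ["ids", "labels"])
#   out = zip_explode_cols(df, cols=["ids", "labels"], result_name="pair",
#                          rename_fields={"ids": "id", "labels": "label"})
#   out.select("pair.id", "pair.label").show()  # two rows: (1, a), (2, b)
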
def clean_dist_df(
        dist_df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
    # filter data
    dist_df = dist_df.select("Name", "Facings", "Capacity", 'Days Supply',
                             'Classification', 'Mat ID', '# POGs')

    ### Rename columns
    dist_df = dist_df.withColumnRenamed("Name", "SKU")
    dist_df = dist_df.withColumnRenamed("Days Supply", "DaysSupply")
    dist_df = dist_df.withColumnRenamed("Mat ID", "MatID")
    dist_df = dist_df.withColumnRenamed("# POGs", "POGS")

    # Convert columns to `FloatType()` / `IntegerType()`
    dist_df = dist_df.withColumn("Facings", dist_df.Facings.cast('float'))
    dist_df = dist_df.withColumn("Capacity", dist_df.Capacity.cast('float'))
    dist_df = dist_df.withColumn("DaysSupply", dist_df.DaysSupply.cast('float'))
    dist_df = dist_df.withColumn("MatID", dist_df.MatID.cast('integer'))
    dist_df = dist_df.withColumn("POGS", dist_df.POGS.cast('integer'))

    return dist_df
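
# Note: the float casts above could also be written with the convertColumn
# helper defined earlier, e.g.:
#
#   dist_df = convertColumn(dist_df, ["Facings", "Capacity", "DaysSupply"],
#                           FloatType())
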
def get_parquets_from_sdf(sdf: pyspark.sql.dataframe.DataFrame):
    # build a temp file name unique to this process/host
    name = 'tmp_file' + f'{os.getpid()}_{socket.gethostname().replace(".", "")}'
    while os.path.exists(name):
        name += '_'
    # `check_hdfs_file_ex` is a helper defined elsewhere in this codebase
    if check_hdfs_file_ex(name):
        sh.hdfs('dfs', '-rm', '-r', '-skipTrash', '{}'.format(name))
    # cast any date columns to timestamps before writing (presumably for
    # pandas/pyarrow compatibility)
    for column in sdf.dtypes:
        if 'date' in column[1]:
            sdf = sdf.withColumn(
                column[0],
                F.col(column[0]).cast(T.TimestampType()).alias(column[0]))
    # write to HDFS, copy to the local working directory, then clean up HDFS
    sdf.write.mode('overwrite').parquet(name)
    sh.hdfs('dfs', '-get', '{}'.format(name), '{}'.format(os.getcwd()))
    sh.hdfs('dfs', '-rm', '-r', '-skipTrash', '{}'.format(name))
    # read the local copy into pandas, then remove it
    data = pd.read_parquet(name + '/')
    os.system(f'rm -r {os.getcwd()}/{name}')
    return data
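
# Usage sketch: an alternative to `sdf.toPandas()` that round-trips the data
# through parquet files on HDFS and the local working directory:
#
#   pdf = get_parquets_from_sdf(sdf)
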
def clean_data(df: pyspark.sql.dataframe.DataFrame,
               spark: pyspark.sql.session.SparkSession):
    """
    Apply data processing.
    1) Rename columns
    2) Cast column types
    3) Remove the closed store
    4) Shorten SKU names by removing the itemID
    5) Remove items with no sales in the whole month, since they are not OOS
    6) Remove items with no stock in the whole month, since they are not OOS
    7) Add rows so each item in each store has full-month records
    8) Replace none with 0
    9) Convert float numbers between -1 and 1 to 0
    10) Save the cleaned dataset
    """
    ### 1) Rename columns
    df = df.withColumnRenamed("POS Margin on Net Sales", "Margin")
    df = df.withColumnRenamed("POS Net Sales", "NetSales")
    df = df.withColumnRenamed("Stock Balance Qty", "StockQty")
    df = df.withColumnRenamed("POS Qty Sold", "QtySold")

    # 2) Convert the `df` columns to `FloatType()`
    columns = ['NetSales', 'QtySold', 'Margin', 'StockQty']
    df = convertColumn(df, columns, FloatType())
    # Convert Date column to timestamp
    df = df.withColumn("Date", to_timestamp(df.Date, "yyyyMMdd"))

    # 3) Remove the closed store (helper defined elsewhere in this codebase)
    df = remove_closed_store(df)

    # 4) Shorten SKU names by removing the itemID
    """
    short_column_udf = udf(lambda name: short_column(name), StringType())
    count = df.count()
    df = df.withColumn("SKU", short_column_udf(col("SKU")))
    assert df.count() == count, "Some error here"  # test on the overall dataset
    print(df.count())
    """

    # 5) Remove items with no sales in the whole month, since they are not OOS
    df = remove_no_sale_item(df)

    # 6) Remove items with no stock in the whole month, since they are not OOS
    df = remove_no_stock_item(df)

    # 7) Add rows so each item in each store has full-month records
    date_generated = create_list_dates(df)
    df = clean_and_add_date(df, date_generated, spark)

    # 8) Replace none with 0
    df = df.fillna(0)

    # 9) Convert float numbers between -1 and 1 to 0
    # clean_numeric_column_udf = udf(lambda name: clean_numeric_column(name), FloatType())
    # df = df.withColumn("StockQty", clean_numeric_column(col("StockQty")))

    # 10) Save the cleaned dataset, overwriting the old one
    # (only the folder name is specified)
    # df.coalesce(1).write.option("header", "true").mode('overwrite').csv("../data/cleanedData")

    print("Data processing finished.")
    return df, date_generated
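
# Usage sketch (assumes the raw POS export is already loaded as a DataFrame
# and the helper functions referenced above are importable):
#
#   df, date_generated = clean_data(raw_df, spark)
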
def Data_clean_and_merge(
        df: pyspark.sql.dataframe.DataFrame,
        Subcat_info: pyspark.sql.dataframe.DataFrame,
        Vendor_info: pyspark.sql.dataframe.DataFrame,
        store_name: str,
        begin_date="2019-04-01",
        end_date="2019-10-01") -> pyspark.sql.dataframe.DataFrame:
    # select useful columns
    Subcat_info = Subcat_info.select('SKU', 'SubCategory')
    Vendor_info = Vendor_info.select('SKU', 'Vendor')

    # clean data entries: extract MatID and remove letters from it
    split_col = split(Subcat_info['SKU'], '-')
    Subcat_info = Subcat_info.withColumn('MatID', split_col.getItem(0))
    Subcat_info = Subcat_info.withColumn(
        'MatID', regexp_replace(col("MatID"), "[ZNDF]", ""))
    split_col2 = split(Vendor_info['SKU'], '-')
    Vendor_info = Vendor_info.withColumn('MatID', split_col2.getItem(0))
    Vendor_info = Vendor_info.withColumn(
        'MatID', regexp_replace(col("MatID"), "[ZNDF]", ""))

    split_col = split(Subcat_info['SubCategory'], '-')
    split_col2 = split(Vendor_info['Vendor'], '-')
    Subcat_info = Subcat_info.withColumn('SubCategory', split_col.getItem(1))
    Vendor_info = Vendor_info.withColumn('Vendor', split_col2.getItem(1))

    # filter data
    df = df.select("Date", "Store", 'item', 'POS Gross Sales', 'POS Net Sales',
                   'POS Total Discount', 'POS Qty Sold', 'POS COGS (INV)')

    # keep only one store
    df = df.filter(df.Store == store_name)

    # remove commas from integers (e.g. 1,333 -> 1333)
    remove_commas = udf(lambda x: re.sub(',', '', x), StringType())
    df = df.select(*[remove_commas(column).alias(column) for column in df.columns])

    # keep only half a year of data:
    # convert Date column to timestamp, then filter by date range
    df = df.withColumn("Date", to_timestamp(df.Date, "yyyyMM"))
    df = df.filter(df.Date >= begin_date)
    df = df.filter(df.Date < end_date)  # April - September

    # separate item name into MatID and SKU
    split_col = split(df['item'], '-')
    df = df.withColumn('MatID', split_col.getItem(0))
    df = df.withColumn('MatID',
                       regexp_replace(col("MatID"), "[ZNDF]", ""))  # remove letters
    df = df.withColumn('SKU', split_col.getItem(1))

    ### Rename columns
    df = df.withColumnRenamed("Sales Type", "SalesType")
    df = df.withColumnRenamed("POS Gross Sales", "GrossSales")
    df = df.withColumnRenamed("POS Net Sales", "NetSales")
    df = df.withColumnRenamed("POS Total Discount", "TotalDiscount")
    df = df.withColumnRenamed("POS Qty Sold", "QtySold")
    df = df.withColumnRenamed("POS COGS (INV)", "COGS")

    # Convert the numeric columns to `FloatType()`
    columns = ['TotalDiscount', 'QtySold', 'GrossSales', 'NetSales', 'COGS']
    df = convertColumn(df, columns, FloatType())

    # drop unnecessary columns
    columns_to_drop = ['item']
    df = df.drop(*columns_to_drop)

    # Create the new columns (COGS appears to carry a negative sign,
    # hence the additions below)
    df = df.withColumn("Price", df.GrossSales / df.QtySold)
    df = df.withColumn("FrontMargin", (df.GrossSales + df.COGS))
    df = df.withColumn("SellMargin", (df.NetSales + df.COGS))

    # add subcategory and vendor columns
    df = df.join(Subcat_info.select("MatID", 'SubCategory'),
                 on=["MatID"], how="left")
    df = df.join(Vendor_info.select("MatID", 'Vendor'),
                 on=["MatID"], how="left")

    return df
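
# Usage sketch (hypothetical input frames; column names as expected above):
#
#   merged = Data_clean_and_merge(raw_df, subcat_df, vendor_df,
#                                 store_name="Store A",
#                                 begin_date="2019-04-01",
#                                 end_date="2019-10-01")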