def clean_dist_df( dist_df: pyspark.sql.dataframe.DataFrame ) -> pyspark.sql.dataframe.DataFrame:
    """Select, rename, and type-cast the distribution-report columns.

    Keeps only the columns used downstream, normalizes their names
    (no spaces or special characters), and casts the numeric fields.
    """
    # Keep only the columns needed downstream.
    dist_df = dist_df.select(
        "Name", "Facings", "Capacity", "Days Supply",
        "Classification", "Mat ID", "# POGs",
    )

    # Normalize column names.
    for old_name, new_name in (
        ("Name", "SKU"),
        ("Days Supply", "DaysSupply"),
        ("Mat ID", "MatID"),
        ("# POGs", "POGS"),
    ):
        dist_df = dist_df.withColumnRenamed(old_name, new_name)

    # Cast measure columns to float and identifier/count columns to integer.
    for float_col in ("Facings", "Capacity", "DaysSupply"):
        dist_df = dist_df.withColumn(float_col, dist_df[float_col].cast("float"))
    for int_col in ("MatID", "POGS"):
        dist_df = dist_df.withColumn(int_col, dist_df[int_col].cast("integer"))

    return dist_df
def clean_dataset( df: pyspark.sql.dataframe.DataFrame ) -> pyspark.sql.dataframe.DataFrame:
    """Select the target features, rename columns, cast numerics, and fill nulls.

    Returns the cleaned frame with `NetSales` and `rank` as floats and all
    remaining nulls replaced by 0.
    """
    # Keep only the target features.
    df = df.select(
        'Index', 'Date Detail', 'Company', 'Business Unit', 'Concept_NEW',
        'Product Category', 'Company and Cost Centre', 'SKU',
        'POS Net Sales', 'Rank Total',
    )

    # Normalize column names.
    for old_name, new_name in (
        ("POS Net Sales", "NetSales"),
        ("Date Detail", "Date"),
        ("Product Category", "Category"),
        ("Company and Cost Centre", "Store"),
        ("Business Unit", "BusinessUnit"),
        ("Rank Total", "rank"),
    ):
        df = df.withColumnRenamed(old_name, new_name)

    # Cast the numeric columns to FloatType.
    df = convertColumn(df, ['NetSales', 'rank'], FloatType())

    # Replace nulls with 0.
    return df.na.fill(0)
def clean_data(df: pyspark.sql.dataframe.DataFrame, spark: pyspark.sql.session.SparkSession):
    """Apply data processing to the raw sales frame.

    1) Rename columns
    2) Cast numeric columns and parse the Date column
    3) Remove the closed store
    4) Remove items with no sales in the whole month (they are not OOS)
    5) Remove items with no stock in the whole month (they are not OOS)
    6) Add rows so each item in each store has full-month records
    7) Replace nulls with 0

    Returns the cleaned frame and the generated list of dates.
    """
    # 1) Normalize column names.
    for old_name, new_name in (
        ("POS Margin on Net Sales", "Margin"),
        ("POS Net Sales", "NetSales"),
        ("Stock Balance Qty", "StockQty"),
        ("POS Qty Sold", "QtySold"),
    ):
        df = df.withColumnRenamed(old_name, new_name)

    # 2) Cast measures to FloatType and parse Date (yyyyMMdd strings).
    df = convertColumn(df, ['NetSales', 'QtySold', 'Margin', 'StockQty'], FloatType())
    df = df.withColumn("Date", to_timestamp(df.Date, "yyyyMMdd"))

    # 3) Drop rows belonging to the closed store.
    df = remove_closed_store(df)

    # 4) Items with zero sales all month cannot be out-of-stock cases.
    df = remove_no_sale_item(df)

    # 5) Items with zero stock all month cannot be out-of-stock cases.
    df = remove_no_stock_item(df)

    # 6) Densify: one row per item/store/date over the full month.
    date_generated = create_list_dates(df)
    df = clean_and_add_date(df, date_generated, spark)

    # 7) Replace nulls with 0.
    df = df.fillna(0)

    print("Data processing finished.")
    return df, date_generated
def Data_clean_and_merge(
        df: pyspark.sql.dataframe.DataFrame,
        Subcat_info: pyspark.sql.dataframe.DataFrame,
        Vendor_info: pyspark.sql.dataframe.DataFrame,
        store_name: str,
        begin_date="2019-04-01",
        end_date="2019-10-01") -> pyspark.sql.dataframe.DataFrame:
    """Clean one store's monthly sales and join sub-category / vendor lookups.

    Parameters
    ----------
    df : raw sales frame ("item" encodes "<MatID> - <SKU>")
    Subcat_info / Vendor_info : lookup frames keyed by "SKU" ("<MatID> - <name>")
    store_name : keep only rows for this store
    begin_date, end_date : half-open date window [begin_date, end_date)

    Returns the cleaned frame with Price/FrontMargin/SellMargin columns and
    SubCategory/Vendor joined in on MatID.
    """
    # Select useful lookup columns.
    Subcat_info = Subcat_info.select('SKU', 'SubCategory')
    Vendor_info = Vendor_info.select('SKU', 'Vendor')

    # Derive MatID from the "SKU" entry and strip the Z/N/D/F prefix letters.
    split_col = split(Subcat_info['SKU'], '-')
    Subcat_info = Subcat_info.withColumn('MatID', split_col.getItem(0))
    Subcat_info = Subcat_info.withColumn('MatID', regexp_replace(
        col("MatID"), "[ZNDF]", ""))  # remove letters from matID
    split_col2 = split(Vendor_info['SKU'], '-')
    Vendor_info = Vendor_info.withColumn('MatID', split_col2.getItem(0))
    Vendor_info = Vendor_info.withColumn('MatID', regexp_replace(
        col("MatID"), "[ZNDF]", ""))  # remove letters from matID

    # Keep only the human-readable part after the dash.
    split_col = split(Subcat_info['SubCategory'], '-')
    split_col2 = split(Vendor_info['Vendor'], '-')
    Subcat_info = Subcat_info.withColumn('SubCategory', split_col.getItem(1))
    Vendor_info = Vendor_info.withColumn('Vendor', split_col2.getItem(1))

    # Filter sales data down to the needed columns and the one store.
    df = df.select("Date", "Store", 'item', 'POS Gross Sales', 'POS Net Sales',
                   'POS Total Discount', 'POS Qty Sold', 'POS COGS (INV)')
    df = df.filter(df.Store == store_name)

    # Remove thousands separators (e.g. "1,333" -> "1333").
    # FIX: guard against nulls — re.sub raises TypeError on None input.
    # Also renamed the local from `udf` to avoid shadowing pyspark.sql.functions.udf.
    strip_commas = UserDefinedFunction(
        lambda x: re.sub(',', '', x) if x is not None else x, StringType())
    df = df.select(*[strip_commas(column).alias(column) for column in df.columns])

    # Parse Date (yyyyMM strings) and keep only [begin_date, end_date).
    df = df.withColumn("Date", to_timestamp(df.Date, "yyyyMM"))
    df = df.filter(df.Date >= begin_date)
    df = df.filter(df.Date < end_date)  # April - Sep

    # Separate the item name into MatID and SKU.
    split_col = split(df['item'], '-')
    df = df.withColumn('MatID', split_col.getItem(0))
    df = df.withColumn('MatID', regexp_replace(
        col("MatID"), "[ZNDF]", ""))  # remove letters from matID
    df = df.withColumn('SKU', split_col.getItem(1))

    # Normalize column names.
    # (Dropped the no-op rename of "Sales Type": that column was never selected.)
    for old_name, new_name in (
        ("POS Gross Sales", "GrossSales"),
        ("POS Net Sales", "NetSales"),
        ("POS Total Discount", "TotalDiscount"),
        ("POS Qty Sold", "QtySold"),
        ("POS COGS (INV)", "COGS"),
    ):
        df = df.withColumnRenamed(old_name, new_name)

    # Cast the measure columns to FloatType.
    columns = ['TotalDiscount', 'QtySold', 'GrossSales', 'NetSales', 'COGS']
    df = convertColumn(df, columns, FloatType())

    # Drop the raw item column; MatID/SKU carry its content now.
    df = df.drop('item')

    # FIX: removed a second to_timestamp(df.Date, "yyyyMM") that was applied
    # here — Date is already a timestamp at this point, and re-parsing its
    # string form against "yyyyMM" nulls out the dates.

    # Derived measures.
    df = df.withColumn("Price", df.GrossSales / df.QtySold)
    df = df.withColumn("FrontMargin", (df.GrossSales + df.COGS))
    df = df.withColumn("SellMargin", (df.NetSales + df.COGS))

    # Attach sub-category and vendor info by MatID.
    df = df.join(Subcat_info.select("MatID", 'SubCategory'),
                 on=["MatID"], how="left")
    df = df.join(Vendor_info.select("MatID", 'Vendor'),
                 on=["MatID"], how="left")

    return df