import math
import re
from typing import Dict, List, Optional, Tuple, Union

import pyspark
from pyspark.sql import functions as F
from pyspark.sql.functions import (UserDefinedFunction, col, regexp_replace,
                                   split, stddev, to_timestamp, when)
from pyspark.sql.types import FloatType, StringType

# Helpers referenced below (_simple_entropy, _find_best_split, get_percentile,
# convertColumn) and the column list `selected_column_atLeastOneMonth` are
# assumed to be defined elsewhere in this module.


def find_and_analysis_atLeastOneMonth_SKU(
    df: pyspark.sql.dataframe.DataFrame
) -> Tuple[pyspark.sql.dataframe.DataFrame, pyspark.sql.dataframe.DataFrame,
           pyspark.sql.dataframe.DataFrame, pyspark.sql.dataframe.DataFrame]:
    """
    For SKUs with "soldQty < capacity" in at least one month:
    1. Calculate the average and standard deviation of REAL NS;
       REAL NS means that if an item had zero sales in one month, the
       average is computed over the remaining months only (e.g. 2 of 3).
    2. Calculate Capacity_to_avg_qty and Facing_to_avg_qty.

    Output: 4 datasets:
    1. df_atLeastOneMonth: the full filtered dataset
    2. unchanged_SKU: SKUs whose Depth is unchanged
    3. changed_SKU: SKUs whose Depth is changed
    4. df_full: the combination of unchanged_SKU and changed_SKU
    """
    ## Find at-least-one-month SKUs
    df = df.withColumn(
        'qty_less_than_capacity',
        when(col("totalMonthlyQtySold") < col('Capacity'), 1).otherwise(0))
    # keep SKUs with qtySold < capacity in at least one month
    df_atLeastOneMonth = df.filter(df.qty_less_than_capacity == 1)

    ## Calculate the average of REAL NS: ignore months with zero net sales
    df_groupbySKU = df.filter(df.totalMonthlyNetSale != 0).groupBy(
        'MatID', "SubCategory", 'Vendor')  # group by each SKU

    ## Average quantity sold of each product
    SKU_avg_Qty = df_groupbySKU.avg("totalMonthlyQtySold").withColumnRenamed(
        "avg(totalMonthlyQtySold)", "AvgQtySold")
    ## Standard deviation of quantity sold of each product
    SKU_avg_std = df_groupbySKU.agg(stddev('totalMonthlyQtySold')) \
        .withColumnRenamed('stddev_samp(totalMonthlyQtySold)', "Qty_std_by_SKU")

    ## Join the aggregates back onto the filtered rows
    df_1 = SKU_avg_Qty.join(df_atLeastOneMonth,
                            on=["MatID", 'SubCategory', 'Vendor'],
                            how="right")
    df_1 = df_1.join(SKU_avg_std,
                     on=["MatID", 'SubCategory', 'Vendor'],
                     how="left")
    df_1 = df_1.withColumn('Capacity_to_avg_qty',
                           col('Capacity') / col("AvgQtySold"))
    df_1 = df_1.withColumn('Facing_to_avg_qty',
                           col('Facings') / col("AvgQtySold"))
    # Ratio of the std of quantity sold to the average quantity sold;
    # a null standard deviation means the SKU was sold in only one month
    df_1 = df_1.withColumn('StdQty_to_AvgQty',
                           col('Qty_std_by_SKU') / col("AvgQtySold"))

    df_full = df_1.select(selected_column_atLeastOneMonth).dropDuplicates()

    # Separate SKUs into two groups
    unchanged_SKU = df_full.filter(col('Depth') < 3)
    changed_SKU = df_full.filter(col('ProposedDepth') == 3)
    return df_atLeastOneMonth, unchanged_SKU, changed_SKU, df_full
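# A minimal usage sketch for the function above. `month_merge_agg` is a
# hypothetical DataFrame assumed to carry the MatID / SubCategory / Vendor /
# totalMonthlyQtySold / totalMonthlyNetSale / Capacity / Facings / Depth /
# ProposedDepth columns referenced above.
def _demo_atLeastOneMonth_analysis(month_merge_agg):
    df_atLeastOneMonth, unchanged_SKU, changed_SKU, df_full = \
        find_and_analysis_atLeastOneMonth_SKU(month_merge_agg)
    # SKUs with a high capacity-to-average-sales ratio are likely over-allocated
    df_full.orderBy(col('Capacity_to_avg_qty').desc()).show(10)
    return df_full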
def find_Incorrect_record_items(
        month_merge: pyspark.sql.dataframe.DataFrame,
        output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    Items with an extremely high capacity-to-facings ratio (ratio > 6);
    these are treated as incorrectly recorded items.
    """
    Incorrect_record_items = month_merge.filter(
        col('Capacity') / col('Facings') > 6)
    Incorrect_record_items = Incorrect_record_items.withColumn(
        "Depth", col('Capacity') / col('Facings')).select(output_columns)
    return Incorrect_record_items
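# A minimal usage sketch, assuming `month_merge` has Capacity and Facings
# columns; `demo_output_columns` is a hypothetical selection list.
def _demo_incorrect_records(month_merge):
    demo_output_columns = ['MatID', 'SKU', 'Capacity', 'Facings', 'Depth']
    return find_Incorrect_record_items(month_merge, demo_output_columns)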
def find_Depth2_items(month_merge: pyspark.sql.dataframe.DataFrame,
                      output_columns: list) -> pyspark.sql.dataframe.DataFrame:
    """
    Items with Capacity / Facings < 2, equivalent to Capacity < Facings * 2.
    These are problematic records, for example:
    1. Capacity = Facings = 1 is incorrect;
    2. Facings = 3, Capacity = 5 is incorrect;
    3. Capacity should be greater than Facings.
    """
    Depth2_items = month_merge.filter(col('Capacity') / col('Facings') < 2)
    Depth2_items = Depth2_items.withColumn(
        "Depth", col('Capacity') / col('Facings')).select(output_columns)
    return Depth2_items
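# Worked check of the rule above: Facings = 3, Capacity = 5 gives
# Depth = 5 / 3 ~ 1.67 < 2, so the record is flagged. A minimal sketch,
# with the same hypothetical column list as the previous demo:
def _demo_depth2_items(month_merge):
    demo_output_columns = ['MatID', 'SKU', 'Capacity', 'Facings', 'Depth']
    return find_Depth2_items(month_merge, demo_output_columns)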
def estimate_segments(
    df: pyspark.sql.dataframe.DataFrame,
    target_field: str = None,
    max_segments: int = 30,
    include_columns: List[str] = [],
    unique_perc_bounds: Tuple[float, float] = (None, 0.8),
    null_perc_bounds: Tuple[float, float] = (None, 0.2),
) -> Optional[Union[List[Dict], List[str]]]:
    """
    Estimates the most important features and values on which to segment data
    profiling using entropy-based methods.

    If no target column is provided, the maximum-entropy column is substituted.

    :param df: the dataframe of data to profile
    :param target_field: target field (optional)
    :param max_segments: upper threshold for total combinations of segments,
        default 30
    :param include_columns: additional non-string columns to consider in
        automatic segmentation. Warning: high-cardinality columns will degrade
        performance.
    :param unique_perc_bounds: tuple of form (lower, upper) with bounds on the
        percentage of unique values (|unique| / |X|). Upper bound exclusive.
    :param null_perc_bounds: tuple of form (lower, upper) with bounds on the
        percentage of null values. Upper bound exclusive.
    :return: a list of segmentation feature names
    """
    current_split_columns = []
    segments = []
    segments_used = 1
    max_entropy_column = (float("-inf"), None)

    # Copy the bounds into local lists (so the defaults are never mutated)
    # and replace missing bounds with infinities
    unique_perc_bounds = [
        unique_perc_bounds[0] if unique_perc_bounds[0] else float("-inf"),
        unique_perc_bounds[1] if unique_perc_bounds[1] else float("inf"),
    ]
    null_perc_bounds = [
        null_perc_bounds[0] if null_perc_bounds[0] else float("-inf"),
        null_perc_bounds[1] if null_perc_bounds[1] else float("inf"),
    ]

    count = df.count()

    print("Limiting to categorical (string) data columns...")
    valid_column_names = {
        c for c in df.columns
        if df.select(c).dtypes[0][1] == "string" or c in include_columns
    }

    print("Gathering cardinality information...")
    n_uniques = {
        c: df.agg(F.approx_count_distinct(c)).collect()[0][0]
        for c in valid_column_names
    }

    print("Gathering missing value information...")
    n_nulls = {c: df.filter(df[c].isNull()).count() for c in valid_column_names}

    print("Finding valid columns for autosegmentation...")
    for c in valid_column_names.copy():
        null_perc = 0.0 if count == 0 else n_nulls[c] / count
        unique_perc = 0.0 if count == 0 else n_uniques[c] / count
        if (
            c in segments
            or n_uniques[c] <= 1
            or null_perc < null_perc_bounds[0]
            or null_perc >= null_perc_bounds[1]
            or unique_perc < unique_perc_bounds[0]
            or unique_perc >= unique_perc_bounds[1]
        ):
            valid_column_names.remove(c)

    if not valid_column_names:
        return []

    if not target_field:
        print("Finding alternative target field since none was specified...")
        for c in valid_column_names:
            col_entropy = _simple_entropy(df, c)
            if n_uniques[c] > 1:
                col_entropy /= math.log(n_uniques[c])  # normalize to [0, 1]
            if col_entropy > max_entropy_column[0]:
                max_entropy_column = (col_entropy, c)
        target_field = max_entropy_column[1]
        print(f"Using {target_field} column as target field.")

    assert target_field in df.columns
    valid_column_names.add(target_field)
    valid_column_names = list(valid_column_names)
    countdf = df.select(valid_column_names).groupby(valid_column_names).count().cache()

    print("Calculating segments...")
    while segments_used < max_segments:
        # Drop columns already used, or whose cardinality would exceed the
        # remaining segment budget
        valid_column_names = {
            c for c in valid_column_names
            if c not in segments
            and n_uniques[c] * segments_used <= (max_segments - segments_used)
        }

        _, segment_column_name = _find_best_split(
            countdf,
            current_split_columns,
            list(valid_column_names),
            target_column_name=target_field,
            normalization=n_uniques,
        )

        if not segment_column_name:
            break
        segments.append(segment_column_name)
        current_split_columns.append(segment_column_name)
        segments_used *= n_uniques[segment_column_name]

    return segments
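# A minimal sketch of driving estimate_segments on a toy DataFrame. The
# SparkSession setup and column names here are illustrative only, and the
# `_simple_entropy` and `_find_best_split` helpers must be available in this
# module for the call to succeed.
def _demo_estimate_segments():
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.getOrCreate()
    toy = spark.createDataFrame(
        [("north", "A"), ("north", "B"), ("south", "A"), ("south", "A")],
        ["region", "product"])
    # Segment on at most 10 combinations, using `product` as the target
    return estimate_segments(toy, target_field="product", max_segments=10)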
def Data_clean_and_merge(
        df: pyspark.sql.dataframe.DataFrame,
        Subcat_info: pyspark.sql.dataframe.DataFrame,
        Vendor_info: pyspark.sql.dataframe.DataFrame,
        store_name: str,
        begin_date: str = "2019-04-01",
        end_date: str = "2019-10-01") -> pyspark.sql.dataframe.DataFrame:
    # Select useful columns
    Subcat_info = Subcat_info.select('SKU', 'SubCategory')
    Vendor_info = Vendor_info.select('SKU', 'Vendor')

    # Clean data entry: extract the numeric ID and strip letters from MatID
    split_col = split(Subcat_info['SKU'], '-')
    Subcat_info = Subcat_info.withColumn('MatID', split_col.getItem(0))
    Subcat_info = Subcat_info.withColumn(
        'MatID', regexp_replace(col("MatID"), "[ZNDF]", ""))

    split_col2 = split(Vendor_info['SKU'], '-')
    Vendor_info = Vendor_info.withColumn('MatID', split_col2.getItem(0))
    Vendor_info = Vendor_info.withColumn(
        'MatID', regexp_replace(col("MatID"), "[ZNDF]", ""))

    split_col = split(Subcat_info['SubCategory'], '-')
    split_col2 = split(Vendor_info['Vendor'], '-')
    Subcat_info = Subcat_info.withColumn('SubCategory', split_col.getItem(1))
    Vendor_info = Vendor_info.withColumn('Vendor', split_col2.getItem(1))

    # Keep only the columns we need
    df = df.select("Date", "Store", 'item', 'POS Gross Sales', 'POS Net Sales',
                   'POS Total Discount', 'POS Qty Sold', 'POS COGS (INV)')
    # Keep only one store
    df = df.filter(df.Store == store_name)

    # Remove thousands separators from numbers (e.g. "1,333" -> "1333")
    udf = UserDefinedFunction(lambda x: re.sub(',', '', x), StringType())
    df = df.select(*[udf(column).alias(column) for column in df.columns])

    # Convert the Date column to a timestamp and keep only half a year
    df = df.withColumn("Date", to_timestamp(df.Date, "yyyyMM"))
    df = df.filter(df.Date >= begin_date)
    df = df.filter(df.Date < end_date)  # April - September

    # Separate the item name into MatID and SKU
    split_col = split(df['item'], '-')
    df = df.withColumn('MatID', split_col.getItem(0))
    # remove letters from MatID
    df = df.withColumn('MatID', regexp_replace(col("MatID"), "[ZNDF]", ""))
    df = df.withColumn('SKU', split_col.getItem(1))

    ### Rename columns
    df = df.withColumnRenamed("POS Gross Sales", "GrossSales")
    df = df.withColumnRenamed("POS Net Sales", "NetSales")
    df = df.withColumnRenamed("POS Total Discount", "TotalDiscount")
    df = df.withColumnRenamed("POS Qty Sold", "QtySold")
    df = df.withColumnRenamed("POS COGS (INV)", "COGS")

    # Convert the numeric columns to FloatType()
    # (`convertColumn` is assumed to be defined elsewhere in this module)
    columns = ['TotalDiscount', 'QtySold', 'GrossSales', 'NetSales', 'COGS']
    df = convertColumn(df, columns, FloatType())

    # Drop unnecessary columns
    columns_to_drop = ['item']
    df = df.drop(*columns_to_drop)

    # Create the derived columns (COGS is negative in this extract, so the
    # margins are computed by addition)
    df = df.withColumn("Price", df.GrossSales / df.QtySold)
    df = df.withColumn("FrontMargin", df.GrossSales + df.COGS)
    df = df.withColumn("SellMargin", df.NetSales + df.COGS)

    # Add SubCategory and Vendor columns
    df = df.join(Subcat_info.select("MatID", 'SubCategory'),
                 on=["MatID"], how="left")
    df = df.join(Vendor_info.select("MatID", 'Vendor'),
                 on=["MatID"], how="left")
    return df
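# A minimal usage sketch, assuming the raw sales extract and the two lookup
# tables are CSVs with headers; all paths and the store name are hypothetical.
def _demo_clean_and_merge(spark):
    raw = spark.read.csv('data/pos_sales.csv', header=True)
    subcat = spark.read.csv('data/subcategory_lookup.csv', header=True)
    vendor = spark.read.csv('data/vendor_lookup.csv', header=True)
    return Data_clean_and_merge(raw, subcat, vendor, store_name='Store 001')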
def compute_summary(df: pyspark.sql.dataframe.DataFrame,
                    valuecol: str = 'amtallowed',
                    groupbycol: str = None,
                    percentiles: list = None):
    """Compute summary statistics.

    Computes the following columns by default for each category in groupbycol:

    - default percentiles: [.01, .25, .50, .75, .99]
    - mean
    - num_zeros
    - percent_zeros
    - stddev
    - sum

    Args:
        df (`pyspark.sql.dataframe.DataFrame`)
        valuecol (str): A numeric column to compute statistics on.
        groupbycol (str): A categorical column to group the statistics on.
        percentiles (list): Percentiles to compute in addition to the defaults.

    Returns:
        `pyspark.sql.dataframe.DataFrame`
    """
    assert groupbycol is not None, \
        'groupbycol should be a column name (str) in the dataframe, not NoneType'
    assert isinstance(groupbycol, str), \
        'groupbycol should be a single column name of type str'

    default_percentiles = [.01, .25, .50, .75, .99]
    percentiles = sorted(set(default_percentiles + (percentiles or [])))

    def make_alias(p):
        """Return a formatted column name, keeping exactly two decimal places
        (a plain str(p) split would drop trailing zeros).

        Example:
            >>> make_alias(.2)
            'p20'
            >>> make_alias(.201)
            'p20'
        """
        return 'p' + f'{p:.2f}'[-2:]

    percentile_cols = [make_alias(p) for p in percentiles]

    dfagg = (
        df.filter(F.col(valuecol) >= 0)
        # Treat values at or below 0.01 as zeros
        .withColumn('is_zero',
                    F.when(F.col(valuecol) <= 1e-2, 1).otherwise(0))
        .groupby(groupbycol)
        .agg(
            F.sum(valuecol).alias('sum'),
            F.count(groupbycol).alias('count'),
            # `get_percentile` is assumed to be defined elsewhere in this
            # module and to return a named percentile expression
            *[get_percentile(valuecol, p, make_alias(p)) for p in percentiles],
            F.mean(valuecol).alias('mean'),
            F.stddev(valuecol).alias('stddev'),
            F.sum('is_zero').alias('num_zeros'))
        .withColumn('percent_zeros',
                    F.col('num_zeros') / F.col('count') * 100)
        .select(groupbycol, 'sum', 'count', 'mean',
                *percentile_cols,
                'stddev', 'num_zeros', 'percent_zeros')
    )
    return dfagg
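# A minimal usage sketch, assuming `claims` has a numeric `amtallowed` column
# and a categorical `specialty` column (both hypothetical), and that
# `get_percentile` is defined in this module.
def _demo_compute_summary(claims):
    # Adds the 5th and 95th percentiles on top of the defaults
    return compute_summary(claims, valuecol='amtallowed',
                           groupbycol='specialty', percentiles=[.05, .95])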