Example #1
def calculate_Capacity_to_sales(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    Add two ratio columns:
    1. Capacity_to_qty   = Capacity / totalMonthlyQtySold
    2. Capacity_to_sales = Capacity / totalMonthlyNetSale
    """
    df = df.withColumn("Capacity_to_qty",
                       (df.Capacity / df.totalMonthlyQtySold))
    df = df.withColumn("Capacity_to_sales",
                       (df.Capacity / df.totalMonthlyNetSale))
    return df
Example #2
def calculate_Depths(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    Depth = Capacity / Facings
    ProposedDepth = 3 if Depth >= 4, otherwise null
    VarianceDepth = ProposedDepth - Depth (should be negative)
    """
    df = df.withColumn("Depth", (df.Capacity / df.Facings))
    df = df.withColumn("ProposedDepth",
                       when(col('Depth') >= 4, 3).otherwise(''))
    df = df.withColumn(
        "VarianceDepth",
        when(col('Depth') >= 4, (df.ProposedDepth - df.Depth)).otherwise(''))
    return df
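A minimal usage sketch for the two helpers above, assuming an active SparkSession named `spark` and that the module imports `col` and `when` from pyspark.sql.functions; the sample values are made up:

# Hypothetical usage sketch (assumes `spark` is an active SparkSession)
from pyspark.sql.functions import col, when

sample = spark.createDataFrame(
    [(12.0, 2.0, 3.0, 150.0), (4.0, 4.0, 8.0, 90.0)],
    ["Capacity", "Facings", "totalMonthlyQtySold", "totalMonthlyNetSale"])
sample = calculate_Capacity_to_sales(sample)
sample = calculate_Depths(sample)
# rows with Depth < 4 get NULL in ProposedDepth and VarianceDepth
sample.select("Depth", "ProposedDepth", "VarianceDepth").show()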
Example #3
def convertColumn(df: pyspark.sql.dataframe.DataFrame, names: object,
                  newType: object) -> object:
    """
    Convert the data type of DataFrame columns
    """
    for name in names:
        df = df.withColumn(name, df[name].cast(newType))
    return df
Example #4
def convertColumn(df: pyspark.sql.dataframe.DataFrame, names: list,
                  newType) -> pyspark.sql.dataframe.DataFrame:
    """
    A custom function to convert the data type of DataFrame columns
    """
    for name in names:
        df = df.withColumn(name, df[name].cast(newType))
    return df
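A minimal usage sketch for convertColumn, assuming an active SparkSession named `spark`; the column names are illustrative only:

# Hypothetical usage sketch for convertColumn
from pyspark.sql.types import FloatType

raw = spark.createDataFrame([("1.5", "2"), ("3.0", "4")],
                            ["NetSales", "QtySold"])
raw = convertColumn(raw, ["NetSales", "QtySold"], FloatType())
raw.printSchema()  # both columns are now float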
Example #5
def find_and_analysis_atLeastOneMonth_SKU(
        df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    """
    For SKUs where "soldQty < capacity" in at least one month:
       1. Calculate the average and standard deviation of the REAL monthly quantity sold;
          REAL means that months with zero sales are excluded, so an item with 0 sales
          in one month is averaged over the other 2 months only
       2. Calculate Capacity_to_avg_qty and Facing_to_avg_qty

    Output: 4 datasets:
       1. df_atLeastOneMonth: the full filtered dataset
       2. unchanged_SKU: SKUs whose Depth is unchanged
       3. changed_SKU: SKUs whose Depth is changed
       4. df_full: the combination of unchanged_SKU and changed_SKU
    """

    ## Find SKUs with qtySold < capacity in at least one month
    df = df.withColumn(
        'qty_less_than_capacity',
        when((col("totalMonthlyQtySold") < col('Capacity')), 1).otherwise(0))
    df_atLeastOneMonth = df.filter(
        df.qty_less_than_capacity == 1
    )  # SKUs where qtySold < capacity in at least one month

    ## Calculate the average over months with non-zero net sales
    df_groupbySKU = df.filter(df.totalMonthlyNetSale != 0).groupBy(
        'MatID', "SubCategory", 'Vendor')  # group by each SKU
    ## get the average quantity sold of each product
    SKU_avg_Qty = df_groupbySKU.avg("totalMonthlyQtySold").withColumnRenamed(
        "avg(totalMonthlyQtySold)", "AvgQtySold")
    SKU_avg_std = df_groupbySKU.agg(stddev('totalMonthlyQtySold')) \
        .withColumnRenamed('stddev_samp(totalMonthlyQtySold)', "Qty_std_by_SKU")

    ## Join datasets
    df_1 = SKU_avg_Qty.join(df_atLeastOneMonth,
                            on=["MatID", 'SubCategory', 'Vendor'],
                            how="right")
    df_1 = df_1.join(SKU_avg_std,
                     on=["MatID", 'SubCategory', 'Vendor'],
                     how="left")
    df_1 = df_1.withColumn('Capacity_to_avg_qty',
                           (col('Capacity') / col("AvgQtySold")))
    df_1 = df_1.withColumn('Facing_to_avg_qty',
                           (col('Facings') / col("AvgQtySold")))
    # Calculate the ratio of the SKU's qty standard deviation to its average qty sold
    df_1 = df_1.withColumn('StdQty_to_AvgQty',
                           (col('Qty_std_by_SKU') / col("AvgQtySold")))

    # if there is no standard deviation, the SKU was sold in only one month
    df_full = df_1.select(selected_column_atLeastOneMonth).dropDuplicates()
    # separate SKU to 2 groups
    unchanged_SKU = df_full.filter(col('Depth') < 3)
    changed_SKU = df_full.filter(col('ProposedDepth') == 3)

    return df_atLeastOneMonth, unchanged_SKU, changed_SKU, df_full
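A short usage sketch for the function above. It assumes `selected_column_atLeastOneMonth` (a list of output column names) and the pyspark.sql.functions imports (`col`, `when`, `stddev`) are defined elsewhere in the module, and that the Depth and ProposedDepth columns come from the earlier helpers:

# Hypothetical usage: run the earlier helpers first so Depth / ProposedDepth exist
df = calculate_Capacity_to_sales(df)
df = calculate_Depths(df)
df_atLeastOneMonth, unchanged_SKU, changed_SKU, df_full = \
    find_and_analysis_atLeastOneMonth_SKU(df)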
Example #6
def zip_explode_cols(df: pyspark.sql.dataframe.DataFrame,
                     cols: list,
                     result_name: str,
                     rename_fields: Dict[str, str] = None):
    """
    Explode multiple equally-sized arrays into one struct by zipping all arrays into one `ArrayType[StructType]`

    Args:
        df: The input Spark DataFrame
        cols: The array columns that should be zipped
        result_name: The name of the column that will contain the newly created struct
        rename_fields: dictionary mapping column names to new struct field names.
            Used to rename columns in the newly created struct.

    Returns: `df.withColumn(result_name, explode(arrays_zip(*cols)))`

    """
    df = df.withColumn(result_name, f.explode(f.arrays_zip(*cols)))

    if rename_fields:  # create schema of new struct by simply renaming the top-level struct fields
        old_schema: t.StructType = df.schema[result_name].dataType

        # rename a field if it appears in `rename_fields`
        new_field_names = [
            rename_fields[field] if field in rename_fields else field
            for field in old_schema.fieldNames()
        ]

        new_schema = t.StructType([
            t.StructField(name, field.dataType)
            for name, field in zip(new_field_names, old_schema.fields)
        ])

        df = df.withColumn(result_name, f.col(result_name).cast(new_schema))

        # # old method using withColumn and a new struct; breaks with PySpark 3.0
        # df = df.withColumn(target_struct, f.struct(*[
        #     f.col(target_struct + "." + actualName).alias(targetName)
        #     for targetName, actualName in zip(target_colnames, df.schema[target_struct].dataType.fieldNames())
        # ]))

    return df
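A minimal usage sketch for zip_explode_cols. The function body assumes `import pyspark.sql.functions as f`, `import pyspark.sql.types as t`, and `from typing import Dict` at module level; the data here is made up:

# Hypothetical usage: zip two equally-sized array columns and explode them
pairs = spark.createDataFrame([([1, 2, 3], ["a", "b", "c"])],
                              ["ids", "labels"])
pairs = zip_explode_cols(pairs, cols=["ids", "labels"],
                         result_name="zipped",
                         rename_fields={"ids": "id"})
pairs.select("zipped.id", "zipped.labels").show()  # three rows: (1, a), (2, b), (3, c)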
Example #7
def clean_dist_df(
    dist_df: pyspark.sql.dataframe.DataFrame
) -> pyspark.sql.dataframe.DataFrame:
    # keep only the needed columns
    dist_df = dist_df.select("Name", "Facings", "Capacity", 'Days Supply',
                             'Classification', 'Mat ID', '# POGs')

    ### Rename columns
    dist_df = dist_df.withColumnRenamed("Name", "SKU")
    dist_df = dist_df.withColumnRenamed("Days Supply", "DaysSupply")
    dist_df = dist_df.withColumnRenamed("Mat ID", "MatID")
    dist_df = dist_df.withColumnRenamed("# POGs", "POGS")

    # Convert columns to numeric types (float for measures, integer for IDs/counts)
    dist_df = dist_df.withColumn("Facings", dist_df.Facings.cast('float'))
    dist_df = dist_df.withColumn("Capacity", dist_df.Capacity.cast('float'))
    dist_df = dist_df.withColumn("DaysSupply",
                                 dist_df.DaysSupply.cast('float'))
    dist_df = dist_df.withColumn("MatID", dist_df.MatID.cast('integer'))
    dist_df = dist_df.withColumn("POGS", dist_df.POGS.cast('integer'))
    return dist_df
Example #8
def get_parquets_from_sdf(sdf: pyspark.sql.dataframe.DataFrame):
    """
    Write a Spark DataFrame to HDFS as Parquet, pull the files into the local
    working directory, and read them back into a pandas DataFrame.
    """
    # build a temporary directory name that is unique per process and host
    name = 'tmp_file' + f'{os.getpid()}_{socket.gethostname().replace(".", "")}'
    while os.path.exists(name):
        name += '_'
    # remove any leftover directory with the same name on HDFS
    if check_hdfs_file_ex(name):
        sh.hdfs('dfs', '-rm', '-r', '-skipTrash', '{}'.format(name))
    # cast date columns to timestamps before writing, so pandas reads them back cleanly
    for column in sdf.dtypes:
        if 'date' in column[1]:
            sdf = sdf.withColumn(
                column[0],
                F.col(column[0]).cast(T.TimestampType()).alias(column[0]))
    sdf.write.mode('overwrite').parquet(name)
    # copy the Parquet directory from HDFS to the local working directory
    sh.hdfs('dfs', '-get', '{}'.format(name), '{}'.format(os.getcwd()))
    sh.hdfs('dfs', '-rm', '-r', '-skipTrash', '{}'.format(name))
    # read the local copy with pandas, then clean up
    data = pd.read_parquet(name + '/')
    os.system(f'rm -r {os.getcwd()}/{name}')
    return data
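A short usage sketch for the helper above; it assumes the modules it references (`os`, `socket`, `sh`, `pandas as pd`, `pyspark.sql.functions as F`, `pyspark.sql.types as T`) are imported and that `check_hdfs_file_ex` is defined elsewhere. The Parquet round trip avoids collecting rows through the Spark driver the way `toPandas()` would, at the cost of an HDFS write and read:

# Hypothetical usage: materialize a Spark DataFrame as a pandas DataFrame
sdf = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
pandas_df = get_parquets_from_sdf(sdf)
print(pandas_df.head())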
Example #9
def clean_data(df: pyspark.sql.dataframe.DataFrame,
               spark: pyspark.sql.session.SparkSession):
    """
    Apply data processing:
        1)  Rename columns
        2)  Cast column types
        3)  Remove closed stores
        4)  Shorten SKU names by removing the itemID
        5)  Remove items with no sales in the whole month, since they are not OOS
        6)  Remove items with no stock in the whole month, since they are not OOS
        7)  Add rows so that each item in each store has full-month records
        8)  Replace nulls with 0
        9)  Convert float numbers between -1 and 1 to 0
        10) Save the cleaned dataset
    """

    ### 1)  Rename columns
    df = df.withColumnRenamed("POS Margin on Net Sales", "Margin")
    df = df.withColumnRenamed("POS Net Sales", "NetSales")
    df = df.withColumnRenamed("Stock Balance Qty", "StockQty")
    df = df.withColumnRenamed("POS Qty Sold", "QtySold")

    # 2)  Convert the `df` columns to `FloatType()`
    columns = ['NetSales', 'QtySold', 'Margin', 'StockQty']
    df = convertColumn(df, columns, FloatType())
    # Convert Date column to timestamp
    df = df.withColumn("Date", to_timestamp(df.Date, "yyyyMMdd"))

    # 3)  Remove the closed store
    df = remove_closed_store(df)

    # 4)  Shorten SKU names by removing the itemID
    """
    short_column_udf = udf(lambda name: short_column(name), StringType())
    count = df.count()
    df = df.withColumn("SKU", short_column_udf(col("SKU")))
    assert df.count() == count, "Some error here" # test on overall dataset
    print(df.count())
    """

    # 5)  Remove items if no sales in the whole month, since they are not OOS
    df = remove_no_sale_item(df)

    # 6)  Remove items if no stock in the whole month, since they are not OOS
    df = remove_no_stock_item(df)

    # 7)  Add more rows to ensure each item in each store has the full-month records
    date_generated = create_list_dates(df)
    df = clean_and_add_date(df, date_generated, spark)

    # 8)  Replace nulls with 0
    df = df.fillna(0)

    # 9)  Convert float numbers between -1 and 1 to 0
    #clean_numeric_column_udf = udf(lambda name: clean_numeric_column(name), FloatType())
    #df = df.withColumn("StockQty", clean_numeric_column(col("StockQty")))

    # 10)  Save the cleaned dataset, overwriting the old one.
    #df.coalesce(1).write.option("header", "true").mode('overwrite').csv("../data/cleanedData") # only specify folder name
    print("Data processing finished.")

    return df, date_generated
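A hedged sketch of how the cleaning pipeline might be driven; the file path and app name are illustrative, and the helper functions it calls (`remove_closed_store`, `remove_no_sale_item`, `remove_no_stock_item`, `create_list_dates`, `clean_and_add_date`, `convertColumn`) are assumed to be defined elsewhere in the module:

# Hypothetical driver code for clean_data (path and app name are illustrative)
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("oos-cleaning").getOrCreate()
raw = spark.read.csv("../data/rawData.csv", header=True)
df_clean, date_generated = clean_data(raw, spark)
df_clean.show(5)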
Example #10
def Data_clean_and_merge(
        df: pyspark.sql.dataframe.DataFrame,
        Subcat_info: pyspark.sql.dataframe.DataFrame,
        Vendor_info: pyspark.sql.dataframe.DataFrame,
        store_name: str,
        begin_date="2019-04-01",
        end_date="2019-10-01") -> pyspark.sql.dataframe.DataFrame:
    # select useful columns
    Subcat_info = Subcat_info.select('SKU', 'SubCategory')
    Vendor_info = Vendor_info.select('SKU', 'Vendor')
    # clean data entries: split off the MatID and remove letters
    split_col = split(Subcat_info['SKU'], '-')
    Subcat_info = Subcat_info.withColumn('MatID', split_col.getItem(0))
    Subcat_info = Subcat_info.withColumn('MatID',
                                         regexp_replace(
                                             col("MatID"), "[ZNDF]",
                                             ""))  # remove letters from matID

    split_col2 = split(Vendor_info['SKU'], '-')
    Vendor_info = Vendor_info.withColumn('MatID', split_col2.getItem(0))
    Vendor_info = Vendor_info.withColumn('MatID',
                                         regexp_replace(
                                             col("MatID"), "[ZNDF]",
                                             ""))  # remove letters from matID

    split_col = split(Subcat_info['SubCategory'], '-')
    split_col2 = split(Vendor_info['Vendor'], '-')
    Subcat_info = Subcat_info.withColumn('SubCategory', split_col.getItem(1))
    Vendor_info = Vendor_info.withColumn('Vendor', split_col2.getItem(1))
    # keep only the needed columns
    df = df.select("Date", "Store", 'item', 'POS Gross Sales', 'POS Net Sales',
                   'POS Total Discount', 'POS Qty Sold', 'POS COGS (INV)')

    # Keep only the specified store
    df = df.filter(df.Store == store_name)

    # Remove commas from numbers (e.g. "1,333" -> "1333")
    remove_comma_udf = UserDefinedFunction(lambda x: re.sub(',', '', x),
                                           StringType())
    #num_columns = ['TotalDiscount', 'QtySold', 'GrossSales', 'NetSales', 'COGS']
    df = df.select(*[remove_comma_udf(column).alias(column)
                     for column in df.columns])

    # keep only half a year of data (April - September by default)
    # Convert Date column to timestamp
    df = df.withColumn("Date", to_timestamp(df.Date, "yyyyMM"))
    df = df.filter(df.Date >= begin_date)
    df = df.filter(df.Date < end_date)  # April - Sep

    # split the item name into MatID and SKU
    split_col = split(df['item'], '-')
    df = df.withColumn('MatID', split_col.getItem(0))
    df = df.withColumn('MatID',
                       regexp_replace(col("MatID"), "[ZNDF]",
                                      ""))  # remove letters from matID
    df = df.withColumn('SKU', split_col.getItem(1))

    ### Rename columns
    df = df.withColumnRenamed("Sales Type", "SalesType")
    df = df.withColumnRenamed("POS Gross Sales", "GrossSales")
    df = df.withColumnRenamed("POS Net Sales", "NetSales")
    df = df.withColumnRenamed("POS Total Discount", "TotalDiscount")
    df = df.withColumnRenamed("POS Qty Sold", "QtySold")
    df = df.withColumnRenamed("POS COGS (INV)", "COGS")

    # Assign the numeric column names to `columns`
    columns = ['TotalDiscount', 'QtySold', 'GrossSales', 'NetSales', 'COGS']
    # Convert the `df` columns to `FloatType()`
    df = convertColumn(df, columns, FloatType())

    # drop unnecessary columns
    columns_to_drop = ['item']
    df = df.drop(*columns_to_drop)
    # Date was already converted to a timestamp above

    # Create the new columns
    df = df.withColumn("Price", df.GrossSales / df.QtySold)
    df = df.withColumn("FrontMargin", (df.GrossSales + df.COGS))
    df = df.withColumn("SellMargin", (df.NetSales + df.COGS))

    # add SubCategory and Vendor columns
    df = df.join(Subcat_info.select("MatID", 'SubCategory'),
                 on=["MatID"],
                 how="left")
    df = df.join(Vendor_info.select("MatID", 'Vendor'),
                 on=["MatID"],
                 how="left")
    return df
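A hedged usage sketch for Data_clean_and_merge; the CSV paths and store name are illustrative, and the imports the function relies on (`split`, `regexp_replace`, `to_timestamp`, `UserDefinedFunction`, `StringType`, `FloatType`, `re`, plus `convertColumn` above) are assumed to be in scope:

# Hypothetical usage sketch (paths and store name are illustrative)
sales_raw = spark.read.csv("../data/sales.csv", header=True)
subcat_raw = spark.read.csv("../data/subcategory.csv", header=True)
vendor_raw = spark.read.csv("../data/vendor.csv", header=True)

store_df = Data_clean_and_merge(sales_raw, subcat_raw, vendor_raw,
                                store_name="Store A",
                                begin_date="2019-04-01",
                                end_date="2019-10-01")
store_df.printSchema()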