def clean_and_add_date(
    df: pyspark.sql.dataframe.DataFrame, date_generated: list,
    spark: pyspark.sql.session.SparkSession
) -> pyspark.sql.dataframe.DataFrame:
    """
    Densify the data so every (Store, SKU) pair has a row for every date.

    Rows can be missing from the raw data when both stock and sales are 0,
    so we build the full (Store, SKU) x date grid and right-join the
    original data onto it; measures for previously-missing days come back
    as nulls.

    Args:
        df: input DataFrame with at least 'Date', 'Store' and 'SKU' columns.
        date_generated: list of dates spanning the dataset, from its first
            day to its last day (one element per calendar day).
        spark: active SparkSession used to build the date DataFrame and
            run SQL.

    Returns:
        DataFrame with one row per (Date, Store, SKU) combination.
    """
    # One-column DataFrame holding every date of the period; the default
    # column name 'value' is renamed to 'Date' to match df's schema.
    date_df = spark.createDataFrame(date_generated, DateType())
    date_df = date_df.withColumnRenamed("value", "Date")

    # Register the input as a SQL temporary view so we can query it.
    df.createOrReplaceTempView("dfView")

    # Distinct (SKU, Store) combinations actually present in the data.
    sqlDF = spark.sql("SELECT DISTINCT SKU, Store FROM dfView")

    # Cross join builds the full (SKU, Store) x Date grid.
    schema = sqlDF.crossJoin(date_df)

    # Right join keeps every grid row; columns from the original data are
    # null wherever the raw data had no record for that day.
    df = df.join(schema, on=['Date', 'Store', 'SKU'], how='right')
    return df
def get_store_item_concept_list(df: pyspark.sql.dataframe.DataFrame,
                                spark) -> list:
    """
    Collect every distinct (SKU, Store, Concept_NEW) combination.

    Args:
        df: DataFrame containing 'SKU', 'Store' and 'Concept_NEW' columns.
        spark: active SparkSession used to run the SQL query.

    Returns:
        List of (SKU, Store, Concept_NEW) tuples, one per distinct
        combination found in the data.
    """
    # Expose the DataFrame to Spark SQL under a temporary view name.
    df.createOrReplaceTempView("dfView")
    # Pull the unique combinations via SQL.
    distinct_df = spark.sql("SELECT DISTINCT SKU, Store, Concept_NEW FROM dfView")
    # Convert each Row to a plain tuple, then bring the result to the driver.
    rows_as_tuples = distinct_df.rdd.map(tuple)
    return rows_as_tuples.collect()