def clean_and_add_date(
    df: pyspark.sql.dataframe.DataFrame,
    date_generated: list,
    spark: pyspark.sql.session.SparkSession,
) -> pyspark.sql.dataframe.DataFrame:
    """Expand *df* so each (SKU, Store) pair has a row for every date.

    The raw data omits a (SKU, Store, Date) row when both stock and sales
    are zero for that day. To restore a full month of records per item per
    store, build the cross product of all distinct (SKU, Store) pairs with
    every date in ``date_generated``, then right-join the original data
    onto that full schema.

    Args:
        df: Input data containing at least ``Date``, ``Store`` and ``SKU``
            columns.
        date_generated: Dates the output must cover, from the first to the
            last day of the dataset.
        spark: Active SparkSession used to create the date frame and run
            the SQL query.

    Returns:
        A DataFrame with one row per (SKU, Store, Date) combination; the
        original columns are null where the raw data had no record.
    """
    # Single-column DataFrame of every date the output must cover; the
    # auto-generated column is called "value", so rename it to "Date" to
    # match the join key below.
    date_df = spark.createDataFrame(date_generated, DateType())
    date_df = date_df.withColumnRenamed("value", "Date")

    # Register the DataFrame as a SQL temporary view, then pull the
    # distinct (SKU, Store) combinations present in the raw data.
    df.createOrReplaceTempView("dfView")
    sku_store_df = spark.sql("SELECT DISTINCT SKU, Store FROM dfView")

    # Full schema: every (SKU, Store) pair crossed with every date.
    full_schema = sku_store_df.crossJoin(date_df)

    # Right join keeps every schema row, adding the missing (all-zero)
    # days as rows with null measure columns.
    df = df.join(full_schema, on=["Date", "Store", "SKU"], how="right")
    return df
def get_store_item_concept_list(df: pyspark.sql.dataframe.DataFrame, spark) -> list:
    """Return the distinct (SKU, Store, Concept_NEW) combinations in *df*.

    Registers *df* as a temporary SQL view, selects the unique
    SKU/Store/Concept_NEW triples, and collects them to the driver.

    Returns:
        A list of 3-tuples ``(SKU, Store, Concept_NEW)``.
    """
    # Expose the DataFrame to Spark SQL under a temporary view name.
    df.createOrReplaceTempView("dfView")
    distinct_rows = spark.sql("SELECT DISTINCT SKU, Store, Concept_NEW FROM dfView")
    # Each Row is iterable, so tuple(row) yields (SKU, Store, Concept_NEW).
    return [tuple(row) for row in distinct_rows.collect()]