import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
from typing import Any, Dict, List

from pyspark.sql import SparkSession


def extract_datepart(df: pyspark.sql.DataFrame, dt_col: str, to_extract: str,
                     drop: bool = False) -> pyspark.sql.DataFrame:
    """
    Base function for extracting dateparts. Used in less abstracted functions.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Base dataframe which contains ``dt_col`` column for extracting ``to_extract``.
    dt_col : str
        Name of date column to extract ``to_extract`` from.
    to_extract : str
        Name of the ``pyspark.sql.functions`` datepart function to apply to
        ``dt_col``, e.g. ``"year"``, ``"month"``, or ``"dayofweek"``.
    drop : bool
        Whether or not to drop ``dt_col`` after extraction (default is False).

    Returns
    -------
    df : pyspark.sql.DataFrame
        df with ``to_extract`` column, optionally without the original ``dt_col`` column.
    """
    df = df.withColumn(to_extract, getattr(F, to_extract)(F.col(dt_col)))
    if drop:
        df = df.drop(dt_col)
    return df
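# Usage sketch (illustrative, not part of the original pipeline): extract the
# month from a date column. Assumes an active SparkSession named `spark`;
# `sale_date` is a hypothetical column name.
sales = spark.createDataFrame([("2015-07-31",)], ["sale_date"])
sales = sales.withColumn("sale_date", F.col("sale_date").cast(T.DateType()))
sales = extract_datepart(sales, "sale_date", "month")  # adds a `month` column == 7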
def lookup_columns(df: pyspark.sql.DataFrame,
                   vocab: Dict[str, List[Any]]) -> pyspark.sql.DataFrame:
    def lookup(mapping):
        def fn(v):
            # Encode a value as its position in the vocabulary list.
            return mapping.index(v)
        return F.udf(fn, returnType=T.IntegerType())

    for col, mapping in vocab.items():
        df = df.withColumn(col, lookup(mapping)(df[col]))
    return df
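# Usage sketch (illustrative): integer-encode a categorical column by its
# position in a vocabulary list. Note that `mapping.index(v)` raises
# ValueError for values missing from the vocabulary, so the vocab must cover
# every value in the column. Assumes an active SparkSession named `spark`;
# `StoreType` is a hypothetical column name.
vocab = {"StoreType": ["a", "b", "c", "d"]}
stores = spark.createDataFrame([("a",), ("c",)], ["StoreType"])
stores = lookup_columns(stores, vocab)  # "a" -> 0, "c" -> 2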
def prepare_google_trend(
        google_trend_csv: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
    # parse the range start date out of `week` and the state code out of the
    # file name, e.g. "Rossmann_DE_SN" -> "SN"
    google_trend_all = google_trend_csv.withColumn(
        "Date", F.regexp_extract(google_trend_csv.week, "(.*?) -", 1)).withColumn(
            "State", F.regexp_extract(google_trend_csv.file, "Rossmann_DE_(.*)", 1))

    # map state NI -> HB,NI to align with other data sources
    google_trend_all = google_trend_all.withColumn(
        "State",
        F.when(google_trend_all.State == "NI",
               "HB,NI").otherwise(google_trend_all.State),
    )

    # expand dates
    return expand_date(google_trend_all)
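# Usage sketch (illustrative): the regexes pull the range start date from
# `week` and the state code from `file`. Assumes an active SparkSession named
# `spark`; the row values mimic the Rossmann Google Trend file layout.
trend_csv = spark.createDataFrame(
    [("2012-12-02 - 2012-12-08", "Rossmann_DE_SN", "96")],
    ["week", "file", "trend"])
prepare_google_trend(trend_csv).select("Date", "State", "trend").show()
# -> Date=2012-12-02, State=SN, trend=96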
def cast(df: pyspark.sql.DataFrame, col_name: str, dtype: str) -> pyspark.sql.DataFrame:
    return df.withColumn(col_name, F.col(col_name).cast(dtype))
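# Usage sketch (illustrative): cast a single column in place. Assumes an
# active SparkSession named `spark`; `Sales` is a hypothetical column name.
sales_df = spark.createDataFrame([("100",)], ["Sales"])
sales_df = cast(sales_df, "Sales", "float")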
def cast_columns(df: pyspark.sql.DataFrame, cols: List[str]) -> pyspark.sql.DataFrame:
    for col in cols:
        # cast to float, replacing nulls (and unparseable values) with 0.0
        df = df.withColumn(col, F.coalesce(df[col].cast(T.FloatType()), F.lit(0.0)))
    return df
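# Usage sketch (illustrative): nulls and unparseable strings both become 0.0.
# Assumes an active SparkSession named `spark`; the column name is hypothetical.
weather = spark.createDataFrame([("3.5",), (None,), ("n/a",)], ["Max_TemperatureC"])
weather = cast_columns(weather, ["Max_TemperatureC"])  # 3.5, 0.0, 0.0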
def prepare_df(
    df: pyspark.sql.DataFrame,
    store_csv: pyspark.sql.DataFrame,
    store_states_csv: pyspark.sql.DataFrame,
    state_names_csv: pyspark.sql.DataFrame,
    google_trend_csv: pyspark.sql.DataFrame,
    weather_csv: pyspark.sql.DataFrame,
) -> pyspark.sql.DataFrame:
    num_rows = df.count()

    # expand dates
    df = expand_date(df)

    # convert the string flag columns to booleans; each flag is True for any
    # value other than "0" (e.g. Open, Promo, state/school holiday markers)
    df = (df.withColumn("Open", df.Open != "0").withColumn(
        "Promo", df.Promo != "0").withColumn(
            "StateHoliday", df.StateHoliday != "0").withColumn(
                "SchoolHoliday", df.SchoolHoliday != "0"))

    # merge store information
    store = store_csv.join(store_states_csv, "Store")
    df = df.join(store, "Store")

    # merge Google Trend information
    google_trend_all = prepare_google_trend(google_trend_csv)
    df = df.join(google_trend_all, ["State", "Year", "Week"]).select(
        df["*"], google_trend_all.trend)

    # merge in Google Trend for whole Germany
    google_trend_de = google_trend_all[
        google_trend_all.file == "Rossmann_DE"].withColumnRenamed("trend", "trend_de")
    df = df.join(google_trend_de, ["Year", "Week"]).select(
        df["*"], google_trend_de.trend_de)

    # merge weather
    weather = weather_csv.join(state_names_csv,
                               weather_csv.file == state_names_csv.StateName)
    df = df.join(weather, ["State", "Date"])

    # fix null values
    df = (df.withColumn(
        "CompetitionOpenSinceYear",
        F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900)),
    ).withColumn(
        "CompetitionOpenSinceMonth",
        F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1)),
    ).withColumn(
        "Promo2SinceYear", F.coalesce(df.Promo2SinceYear, F.lit(1900))
    ).withColumn(
        "Promo2SinceWeek", F.coalesce(df.Promo2SinceWeek, F.lit(1))))

    # days and months since the competition has been open, capped to 2 years
    df = df.withColumn(
        "CompetitionOpenSince",
        F.to_date(
            F.format_string("%s-%s-15", df.CompetitionOpenSinceYear,
                            df.CompetitionOpenSinceMonth)),
    )
    df = df.withColumn(
        "CompetitionDaysOpen",
        F.when(
            df.CompetitionOpenSinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(360 * 2), F.datediff(df.Date, df.CompetitionOpenSince)),
            ),
        ).otherwise(0),
    )
    df = df.withColumn("CompetitionMonthsOpen",
                       (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

    # days and weeks of promotion, capped to 25 weeks
    df = df.withColumn(
        "Promo2Since",
        F.expr(
            'date_add(format_string("%s-01-01", Promo2SinceYear), '
            '(cast(Promo2SinceWeek as int) - 1) * 7)'),
    )
    df = df.withColumn(
        "Promo2Days",
        F.when(
            df.Promo2SinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since))),
        ).otherwise(0),
    )
    df = df.withColumn("Promo2Weeks", (df.Promo2Days / 7).cast(T.IntegerType()))

    # ensure that no row was lost through inner joins
    assert num_rows == df.count(), "lost rows in joins"
    return df
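# Usage sketch (illustrative): wire the raw CSVs into prepare_df. Assumes an
# active SparkSession named `spark`; the paths are assumptions, though the
# file names follow the Rossmann dataset layout.
train_csv = spark.read.csv("data/train.csv", header=True)
store_csv = spark.read.csv("data/store.csv", header=True)
store_states_csv = spark.read.csv("data/store_states.csv", header=True)
state_names_csv = spark.read.csv("data/state_names.csv", header=True)
google_trend_csv = spark.read.csv("data/googletrend.csv", header=True)
weather_csv = spark.read.csv("data/weather.csv", header=True)
train_df = prepare_df(train_csv, store_csv, store_states_csv, state_names_csv,
                      google_trend_csv, weather_csv)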
def expand_date(df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
    df = df.withColumn("Date", df.Date.cast(T.DateType()))
    return (df.withColumn("Year", F.year(df.Date))
              .withColumn("Month", F.month(df.Date))
              .withColumn("Week", F.weekofyear(df.Date))
              .withColumn("Day", F.dayofmonth(df.Date)))
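# Usage sketch (illustrative): a single date expands into its parts.
# Assumes an active SparkSession named `spark`.
dates = spark.createDataFrame([("2014-03-18",)], ["Date"])
expand_date(dates).show()  # Year=2014, Month=3, Week=12, Day=18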
def metrics(session: SparkSession, dataframe: pyspark.sql.DataFrame, actual: str,
            predicted: str) -> pyspark.sql.DataFrame:
    """
    Calculates evaluation metrics from predicted results.

    :param session: active SparkSession used to build the result DataFrame
    :param dataframe: pyspark.sql.DataFrame with the actual and predicted values
    :param actual: name of column with observed target values
    :param predicted: name of column with predicted values
    :return: single-row pyspark.sql.DataFrame with accuracy, sensitivity,
             specificity, precision, recall, and F1-score
    """
    # In the crosstab, rows hold the actual values and columns the predicted.
    dataframe = dataframe.withColumn(actual, F.col(actual).cast("integer"))
    dataframe = dataframe.withColumn(predicted, F.col(predicted).cast("integer"))
    cm = dataframe.crosstab(actual, predicted)
    cm = cm.sort(cm.columns[0], ascending=True)

    # add the missing column in case only one class was predicted
    if "0" not in cm.columns:
        cm = cm.withColumn("0", F.lit(0))
    if "1" not in cm.columns:
        cm = cm.withColumn("1", F.lit(0))

    # subset values from the confusion matrix, accessing the count columns
    # by name rather than by position
    first_0 = cm.filter(cm[cm.columns[0]] == 0.0).take(1)
    first_1 = cm.filter(cm[cm.columns[0]] == 1.0).take(1)
    tn = first_0[0]["0"]
    fp = first_0[0]["1"]
    fn = first_1[0]["0"]
    tp = first_1[0]["1"]

    # calculate metrics from the values in the confusion matrix, guarding
    # every ratio against a zero denominator when a class is absent
    total = tp + tn + fp + fn
    acc = (tp + tn) / total if total else 0.0
    sen = tp / (tp + fn) if (tp + fn) else 0.0
    spe = tn / (tn + fp) if (tn + fp) else 0.0
    prec = tp / (tp + fp) if (tp + fp) else 0.0
    rec = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0

    # print results
    print('Confusion Matrix and Statistics: \n')
    cm.show()
    print('True Positives:', tp)
    print('True Negatives:', tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Total:', dataframe.count(), '\n')
    print('Accuracy: {0:.2f}'.format(acc))
    print('Sensitivity: {0:.2f}'.format(sen))
    print('Specificity: {0:.2f}'.format(spe))
    print('Precision: {0:.2f}'.format(prec))
    print('Recall: {0:.2f}'.format(rec))
    print('F1-score: {0:.2f}'.format(f1))

    # create a Spark DataFrame with the results
    results = [(acc, sen, spe, prec, rec, f1)]
    return session.createDataFrame(
        results, ['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Recall', 'F1'])
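# Usage sketch (illustrative): evaluate a handful of binary predictions.
# Assumes an active SparkSession named `spark`; the column names and toy
# data are hypothetical.
preds = spark.createDataFrame(
    [(1, 1), (0, 0), (1, 0), (0, 1), (1, 1)],
    ["label", "prediction"])
metrics(spark, preds, actual="label", predicted="prediction").show()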