Example #1
from typing import Any, Dict, List

import pyspark.sql
from pyspark.sql import functions as F
from pyspark.sql import types as T


def extract_datepart(df: pyspark.sql.DataFrame,
                     dt_col: str,
                     to_extract: str,
                     drop: bool = False) -> pyspark.sql.DataFrame:
    """
    Base function for extracting dateparts. Used in less abstracted functions.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Base dataframe which contains ``dt_col`` column for extracting ``to_extract``.
    dt_col : str
        Name of date column to extract ``to_extract`` from.
    to_extract : str
        Name of the date part to extract; must match a ``pyspark.sql.functions``
        function name such as ``year``, ``month``, ``weekofyear`` or ``dayofmonth``.
    drop : bool
        Whether to drop ``dt_col`` after extraction (default is False).

    Returns
    -------
    df : pyspark.sql.DataFrame
        df with ``to_extract`` column, optionally without original ``dt_col`` column.
    """
    df = df.withColumn(to_extract, getattr(F, to_extract)(F.col(dt_col)))
    if drop:
        df = df.drop(dt_col)
    return df
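A minimal usage sketch for extract_datepart; the session and the one-row DataFrame below are illustrative and not part of the original example.

# Illustrative only: build a tiny DataFrame and pull out two date parts.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sales = spark.createDataFrame([("2015-07-31",)], ["Date"])
sales = sales.withColumn("Date", F.col("Date").cast(T.DateType()))
with_year = extract_datepart(sales, "Date", "year")               # adds a "year" column
with_month = extract_datepart(sales, "Date", "month", drop=True)  # adds "month", drops "Date"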
def lookup_columns(df: pyspark.sql.DataFrame,
                   vocab: Dict[str, List[Any]]) -> pyspark.sql.DataFrame:
    """Replace categorical values with their integer index in ``vocab``."""
    def lookup(mapping):
        def fn(v):
            # position of the value in the per-column vocabulary list
            return mapping.index(v)

        return F.udf(fn, returnType=T.IntegerType())

    for col, mapping in vocab.items():
        df = df.withColumn(col, lookup(mapping)(df[col]))
    return df
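A hypothetical call to lookup_columns; the vocabulary and column values below are made up to show the mapping.

# Illustrative only: each categorical value becomes its index in the vocab list.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
raw = spark.createDataFrame([("a", "b"), ("0", "a")], ["StateHoliday", "Assortment"])
vocab = {"StateHoliday": ["0", "a", "b", "c"], "Assortment": ["a", "b", "c"]}
indexed = lookup_columns(raw, vocab)   # e.g. "a" in StateHoliday -> 1, "b" in Assortment -> 1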
def prepare_google_trend(
        google_trend_csv: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
    # extract the week start date from the "week" range and the state code
    # from the file name (e.g. "Rossmann_DE_SN" -> "SN")
    google_trend_all = (google_trend_csv
                        .withColumn("Date",
                                    F.regexp_extract(google_trend_csv.week, "(.*?) -", 1))
                        .withColumn("State",
                                    F.regexp_extract(google_trend_csv.file, "Rossmann_DE_(.*)", 1)))

    # map state NI -> HB,NI to align with other data sources
    google_trend_all = google_trend_all.withColumn(
        "State",
        F.when(google_trend_all.State == "NI",
               "HB,NI").otherwise(google_trend_all.State),
    )

    # expand dates
    return expand_date(google_trend_all)
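A sketch of the input prepare_google_trend expects: a week column holding a date range and a file column holding the trends file name (the row below is illustrative). Note that it also calls expand_date, which is shown in Example #4.

# Illustrative only: the regexes pull out the week start date and the state code.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
trend_csv = spark.createDataFrame(
    [("2013-01-06 - 2013-01-12", "Rossmann_DE_SN", 96)],
    ["week", "file", "trend"])
trend = prepare_google_trend(trend_csv)   # adds Date="2013-01-06", State="SN", plus Year/Month/Week/Day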
Example #4
from typing import List

import pyspark.sql
from pyspark.sql import functions as F
from pyspark.sql import types as T


def cast(df: pyspark.sql.DataFrame, col_name: str,
         dtype: str) -> pyspark.sql.DataFrame:
    return df.withColumn(col_name, F.col(col_name).cast(dtype))
def cast_columns(df: pyspark.sql.DataFrame,
                 cols: List[str]) -> pyspark.sql.DataFrame:
    for col in cols:
        df = df.withColumn(col,
                           F.coalesce(df[col].cast(T.FloatType()), F.lit(0.0)))
    return df
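Illustrative calls to the two casting helpers above; the column names and values are hypothetical.

# Illustrative only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
raw = spark.createDataFrame([("3", "1270"), ("4", None)], ["Sales", "CompetitionDistance"])
as_int = cast(raw, "Sales", "int")                      # plain cast of one column
filled = cast_columns(raw, ["CompetitionDistance"])     # float cast, nulls replaced with 0.0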
def prepare_df(
    df: pyspark.sql.DataFrame,
    store_csv: pyspark.sql.DataFrame,
    store_states_csv: pyspark.sql.DataFrame,
    state_names_csv: pyspark.sql.DataFrame,
    google_trend_csv: pyspark.sql.DataFrame,
    weather_csv: pyspark.sql.DataFrame,
) -> pyspark.sql.DataFrame:
    num_rows = df.count()

    # expand dates
    df = expand_date(df)

    # convert the string flag columns to booleans (True whenever the flag is not "0")
    df = (df
          .withColumn("Open", df.Open != "0")
          .withColumn("Promo", df.Promo != "0")
          .withColumn("StateHoliday", df.StateHoliday != "0")
          .withColumn("SchoolHoliday", df.SchoolHoliday != "0"))

    # merge store information
    store = store_csv.join(store_states_csv, "Store")
    df = df.join(store, "Store")

    # merge Google Trend information
    google_trend_all = prepare_google_trend(google_trend_csv)
    df = (df.join(google_trend_all, ["State", "Year", "Week"])
          .select(df["*"], google_trend_all.trend))

    # merge in Google Trend for whole Germany
    google_trend_de = (google_trend_all[google_trend_all.file == "Rossmann_DE"]
                       .withColumnRenamed("trend", "trend_de"))
    df = (df.join(google_trend_de, ["Year", "Week"])
          .select(df["*"], google_trend_de.trend_de))

    # merge weather
    weather = weather_csv.join(state_names_csv,
                               weather_csv.file == state_names_csv.StateName)
    df = df.join(weather, ["State", "Date"])

    # fix null values
    df = (df
          .withColumn("CompetitionOpenSinceYear",
                      F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900)))
          .withColumn("CompetitionOpenSinceMonth",
                      F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1)))
          .withColumn("Promo2SinceYear",
                      F.coalesce(df.Promo2SinceYear, F.lit(1900)))
          .withColumn("Promo2SinceWeek",
                      F.coalesce(df.Promo2SinceWeek, F.lit(1))))

    # days and months since the competition has been open, cap it to 2 years
    df = df.withColumn(
        "CompetitionOpenSince",
        F.to_date(
            F.format_string("%s-%s-15", df.CompetitionOpenSinceYear,
                            df.CompetitionOpenSinceMonth)),
    )
    df = df.withColumn(
        "CompetitionDaysOpen",
        F.when(
            df.CompetitionOpenSinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(360 * 2),
                        F.datediff(df.Date, df.CompetitionOpenSince)),
            ),
        ).otherwise(0),
    )
    df = df.withColumn("CompetitionMonthsOpen",
                       (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

    # days and weeks of promotion, cap it to 25 weeks
    df = df.withColumn(
        "Promo2Since",
        F.expr(
            'date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)'
        ),
    )
    df = df.withColumn(
        "Promo2Days",
        F.when(
            df.Promo2SinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since))),
        ).otherwise(0),
    )
    df = df.withColumn("Promo2Weeks",
                       (df.Promo2Days / 7).cast(T.IntegerType()))

    # ensure that no row was lost through inner joins
    assert num_rows == df.count(), "lost rows in joins"
    return df
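A hypothetical driver for prepare_df; the file names below are placeholders rather than paths from the original example.

# Illustrative only: read the raw CSVs and run the full preparation pipeline.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

def read_csv(path):
    return spark.read.csv(path, header=True)

train_df = prepare_df(
    read_csv("train.csv"),
    store_csv=read_csv("store.csv"),
    store_states_csv=read_csv("store_states.csv"),
    state_names_csv=read_csv("state_names.csv"),
    google_trend_csv=read_csv("googletrend.csv"),
    weather_csv=read_csv("weather.csv"),
)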
def expand_date(df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
    df = df.withColumn("Date", df.Date.cast(T.DateType()))
    return (df
            .withColumn("Year", F.year(df.Date))
            .withColumn("Month", F.month(df.Date))
            .withColumn("Week", F.weekofyear(df.Date))
            .withColumn("Day", F.dayofmonth(df.Date)))
Example #8
import pyspark.sql
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit


def metrics(session: SparkSession, dataframe: pyspark.sql.DataFrame, actual: str,
            predicted: str) -> pyspark.sql.DataFrame:
    '''
    Calculates evaluation metrics from predicted results

    :param session: active SparkSession used to build the result dataframe
    :param dataframe: spark.sql.dataframe with the real and predicted values
    :param actual: Name of column with observed target values
    :param predicted: Name of column with predicted values
    :return: single-row spark.sql.dataframe with accuracy, sensitivity,
             specificity, precision, recall and F1-score
    '''

    # Build a confusion matrix with crosstab: rows are the actual classes,
    # columns are the predicted classes
    dataframe = dataframe.withColumn(actual, col(actual).cast('integer'))
    dataframe = dataframe.withColumn(predicted, col(predicted).cast('integer'))
    cm = dataframe.crosstab(actual, predicted)
    cm = cm.sort(cm.columns[0], ascending=True)

    # Add a missing column in case only one class was predicted
    if '0' not in cm.columns:
        cm = cm.withColumn('0', lit(0))
    if '1' not in cm.columns:
        cm = cm.withColumn('1', lit(0))

    # Subset values from the confusion matrix; access columns by name so that
    # the column order returned by crosstab does not matter
    zero = cm.filter(cm[cm.columns[0]] == 0.0)
    first_0 = zero.take(1)

    one = cm.filter(cm[cm.columns[0]] == 1.0)
    first_1 = one.take(1)

    tn = first_0[0]['0']
    fp = first_0[0]['1']
    fn = first_1[0]['0']
    tp = first_1[0]['1']

    # Calculate metrics from values in the confusion matrix
    if (tp == 0):
        acc = float((tp + tn) / (tp + tn + fp + fn))
        sen = 0
        spe = float((tn) / (tn + fp))
        prec = 0
        rec = 0
        f1 = 0
    elif (tn == 0):
        acc = float((tp + tn) / (tp + tn + fp + fn))
        sen = float((tp) / (tp + fn))
        spe = 0
        prec = float((tp) / (tp + fp))
        rec = float((tp) / (tp + fn))
        f1 = 2 * float((prec * rec) / (prec + rec))
    else:
        acc = float((tp + tn) / (tp + tn + fp + fn))
        sen = float((tp) / (tp + fn))
        spe = float((tn) / (tn + fp))
        prec = float((tp) / (tp + fp))
        rec = float((tp) / (tp + fn))
        f1 = 2 * float((prec * rec) / (prec + rec))

    # Print results
    print('Confusion Matrix and Statistics: \n')
    cm.show()

    print('True Positives:', tp)
    print('True Negatives:', tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Total:', dataframe.count(), '\n')

    print('Accuracy: {0:.2f}'.format(acc))
    print('Sensitivity: {0:.2f}'.format(sen))
    print('Specificity: {0:.2f}'.format(spe))
    print('Precision: {0:.2f}'.format(prec))
    print('Recall: {0:.2f}'.format(rec))
    print('F1-score: {0:.2f}'.format(f1))

    # Create a Spark dataframe with the results
    rows = [(acc, sen, spe, prec, rec, f1)]
    df = session.createDataFrame(
        rows, ['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Recall', 'F1'])
    return df
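A hypothetical call to metrics on a small DataFrame of labels and predictions; the column names and values are illustrative.

# Illustrative only: two classes, five rows.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
preds = spark.createDataFrame(
    [(1, 1), (0, 0), (1, 0), (0, 1), (1, 1)],
    ['label', 'prediction'])
results = metrics(spark, preds, actual='label', predicted='prediction')
results.show()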