Example #1
    def compare_dataframes(
            cls,
            df1: DataFrame,
            df2: DataFrame,
            excluded_keys: Union[List, str, None] = None) -> bool:
        """
        Compares two DataFrames for an exact match;
        internally it uses pandas.testing.assert_frame_equal.

        :param df1: processed data
        :type df1: DataFrame
        :param df2: gold standard expected data
        :type df2: DataFrame
        :param excluded_keys: columns to be excluded from the comparison, optional
        :type excluded_keys: Union[List, str, None]
        :return: True if the DataFrames match
        :rtype: bool
        :raises AssertionError: if the DataFrames do not match
        """
        if excluded_keys is None:
            excluded_keys = []
        excluded_keys = excluded_keys if isinstance(excluded_keys, list) else [
            excluded_keys
        ]
        df1 = df1.drop(*excluded_keys)
        df2 = df2.drop(*excluded_keys)
        sort_columns = [cols[0] for cols in df1.dtypes]
        df1_sorted = df1.toPandas().sort_values(by=sort_columns,
                                                ignore_index=True)
        df2_sorted = df2.toPandas().sort_values(by=sort_columns,
                                                ignore_index=True)
        assert_frame_equal(df1_sorted, df2_sorted)
        return True
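# A minimal usage sketch for the comparison method above; the class name
# DataFrameComparer, the @classmethod decorator it implies, and the SparkSession
# named `spark` are assumptions for illustration, not part of the original snippet.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
expected = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "val"])
actual = spark.createDataFrame([(2, "b"), (1, "a")], ["id", "val"])
# Row order does not matter: both frames are sorted on all columns before comparing.
assert DataFrameComparer.compare_dataframes(actual, expected)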
Example #2
def standardise_names(df: DataFrame, name_cols: list, drop_orig: bool = True):
    """Take a one or more name columns in a list and standardise the names
    so one name appears in each column consistently

    Args:
        df (DataFrame): Spark DataFrame
        name_cols (list): A list of columns that contain names, in order from first name to last name
        drop_orig (bool, optional): Drop the original columns after standardisation. Defaults to True.

    Returns:
        DataFrame: A Spark DataFrame with standardised name columns
    """

    name_col_joined = ", ".join(name_cols)
    surname_col_name = name_cols[-1]
    df = df.withColumn('name_concat',
                       expr(f"concat_ws(' ', {name_col_joined})"))
    df = df.withColumn('name_concat', expr('lower(name_concat)'))
    df = df.withColumn('name_concat',
                       expr("regexp_replace(name_concat, '[\\-\\.]', ' ')"))
    df = df.withColumn('name_arr', expr("split(name_concat, ' ')"))
    df = df.withColumn(
        'surname_std',
        expr(
            f"case when {surname_col_name} is not null then element_at(name_arr,-1) else null end"
        ))
    # forename1_std ... forename5_std take successive tokens, only when the name array is long enough
    for i in range(1, 6):
        df = df.withColumn(
            f'forename{i}_std',
            expr(
                f"case when size(name_arr) > {i} then element_at(name_arr,{i}) else null end"
            ))
    df = df.drop("name_arr", "name_concat")
    if drop_orig:
        for n in name_cols:
            df = df.drop(n)
    return df
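# A minimal usage sketch for standardise_names, assuming an active SparkSession
# named `spark`; the sample data and column names are illustrative only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
people = spark.createDataFrame([("John-Paul", "Smith"), ("Jane", "Doe")],
                               ["first_name", "last_name"])
# Adds surname_std and forename1_std ... forename5_std, dropping the original columns.
standardise_names(people, ["first_name", "last_name"]).show()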
Example #3
    def _unpack_struct(self, df: DataFrame, col_name):
        sub_df = df.select(col_name + '.*')
        for subcol_name in sub_df.columns:
            df = df.withColumn(f'{col_name}_{subcol_name}',
                               df[col_name][subcol_name])
        df = df.drop(col_name)
        return self.unpack_nested(df)
def null_out_values_array(df: DataFrame, array_colname: str, values_to_null: list):
    """Null out a user defined list of undesirable values in a column that contains an array of values
    Useful for columns that mostly contain valid data but occasionally
    contain other values such as 'unknown'
   
    Args:
        df (DataFrame): The dataframe to clean
        colname (string): The name of the column to clean
        values_to_null (list): A list of values to be nulled.
    Returns:
        DataFrame: The cleaned dataframe with column containing array that has values in values_to_null nulled
    """
    if len(values_to_null) > 0:

        if dict(df.dtypes)[array_colname].startswith("array"):

            array_args = [f.lit(v) for v in values_to_null]
            df = df.withColumn("vals_to_remove", f.array(*array_args))
            df = df.withColumn(
                array_colname, f.expr(f"array_except({array_colname}, vals_to_remove)")
            )
            df = df.drop("vals_to_remove")

        else:
            # warn if the column is not an array type
            warnings.warn(
                f"Column {array_colname} is not an array. Please use the function null_out_values instead."
            )

    return df
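# A minimal usage sketch for null_out_values_array, assuming an active SparkSession
# named `spark` and the `from pyspark.sql import functions as f` alias that the
# function body relies on.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, ["red", "unknown", "blue"]), (2, ["unknown"])],
                           ["id", "colours"])
# 'unknown' is removed from each array via array_except; other values are kept.
null_out_values_array(df, "colours", ["unknown"]).show()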
Example #5
def clean_immigration(df: SparkDataFrame) -> SparkDataFrame:
    """Clean immigration data

    :param df: immigration data frame to be cleaned.
    :return: cleaned immigration data frame
    """

    drop_cols = [
        'visapost', 'occup', 'entdepu', 'insnum', 'count', 'entdepa',
        'entdepd', 'matflag', 'dtaddto', 'biryear', 'admnum'
    ]
    int_cols = [
        'cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'i94mode', 'i94bir',
        'i94visa', 'dtadfile'
    ]
    date_cols = ['arrdate', 'depdate']
    date_udf = udf(lambda x: x and (timedelta(days=int(x)) + datetime(
        1960, 1, 1)).strftime('%Y-%m-%d'))

    df = df.drop(*drop_cols)
    df = convert_column_type(df, 'integer', int_cols)
    for col in date_cols:
        df = df.withColumn(col, date_udf(df[col]))

    # Drop rows where any of the foreign-key columns is null
    fk_columns = ['i94cit', 'i94port', 'i94addr']
    for fk_col in fk_columns:
        df = df.filter(df[fk_col].isNotNull())

    return df
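# clean_immigration calls convert_column_type, which is not shown above. A minimal
# sketch of what such a helper might look like, assuming it simply casts each listed
# column to the requested Spark SQL type (a hypothetical implementation, not the
# original project's).
from typing import List
from pyspark.sql import DataFrame


def convert_column_type(df: DataFrame, data_type: str, columns: List[str]) -> DataFrame:
    """Cast every column in `columns` to `data_type` (hypothetical helper)."""
    for column in columns:
        df = df.withColumn(column, df[column].cast(data_type))
    return df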
Example #6
def postcode_to_inward_outward(df: DataFrame,
                               pc_field: str,
                               drop_orig: bool = True):
    """Given a field containing a postcode, creates new columns in the dataframe
    called outward_postcode_std and inward_postcode_std

    Original postcode can have spaces or not and be in any case

    Args:
        df (DataFrame): Spark DataFrame
        pc_field (str): Name of the field containing the postcode
        drop_orig (bool, optional): Drop the original postcode column. Defaults to True.

    Returns:
        DataFrame: Spark DataFrame with outward_postcode_std and inward_postcode_std columns added
    """

    sql = f"upper(replace({pc_field}, ' ', ''))"
    df = df.withColumn("pc_nospace_temp__", expr(sql))

    # If the postcode is long enough, split it into outward and inward parts;
    # if it's too short, assume we only have the outward part

    sql = """
    case
    when length(pc_nospace_temp__) >= 5 then left(pc_nospace_temp__, length(pc_nospace_temp__) - 3)
    else left(pc_nospace_temp__, 4)
    end
    """

    df = df.withColumn("outward_postcode_std", expr(sql))

    sql = """
    case 
    when length(pc_nospace_temp__) >= 5 then right(pc_nospace_temp__, 3)
    else null 
    end
    """

    df = df.withColumn("inward_postcode_std", expr(sql))

    df = df.drop("pc_nospace_temp__")

    if drop_orig:
        df = df.drop(pc_field)

    return df
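# A minimal usage sketch for postcode_to_inward_outward, assuming an active
# SparkSession named `spark`; the sample postcodes are illustrative.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
addresses = spark.createDataFrame([("sw1a 1aa",), ("W1A",)], ["postcode"])
# "sw1a 1aa" -> outward "SW1A", inward "1AA"; "W1A" is too short, so inward is null.
postcode_to_inward_outward(addresses, "postcode").show()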
Example #7
    def get_categoricals_multiplier(self,
                                    df: DataFrame,
                                    col_list: list = [],
                                    ignore_cols: list = [],
                                    approx_distinct=100,
                                    rsd=0.05):
        """
        Gets a dictionary of col names and the distinct values in the column.
        :param df:
        :param col_list: Subset list of columns to use as categoricals; if null, all columns will be checked for
        approx_distinct values and considered categoricals
        :param ignore_cols: when not selecting a subset of columns using col_list, ignore columns is a list of
        columns that will be skipped when searching for categoricals with approx_distinct columns.
        :param approx_distinct: log a warning message if the approx number of distinct values is greater than this threshold.
        :param rsd:
        :return:
        """
        # TODO - Add logging of findings
        filter_vals = []
        filter_cols = list(col_list)  # copy to avoid mutating the (mutable) default argument

        if len(col_list) == 0:
            for (dcol, dtype) in df.drop(*ignore_cols).dtypes:
                if dtype == 'string':
                    if self._get_approx_distinct_count_for_col(
                            df, dcol, _rsd=rsd) <= approx_distinct:
                        # LOG print("{} has approx {} distincts".format(dcol, cnt))
                        # LOG print("appending {}".format(dcol))
                        filter_vals.append(df.select(col(dcol)) \
                                           .filter((col(dcol).isNotNull()) &
                                                   (col(dcol).isin("", "Y", "N") == False)) \
                                           .distinct().rdd.map(lambda row: str(row[0])).collect())
                        filter_cols.append(dcol)
            # ?? TODO - What about the rest of the potential categorical types (i.e. bools/ints/floats/etc)
            return feature_factory.feature.Multiplier.create_from_cats(
                filter_cols, filter_vals)
        else:
            for dcol in col_list:
                if self._get_approx_distinct_count_for_col(
                        df, dcol) > approx_distinct:
                    print("WARN! {} has more than {} distinct values".format(
                        dcol, approx_distinct))
                filter_vals.append(df.select(col(dcol)) \
                                   .filter((col(dcol).isNotNull()) &
                                           (col(dcol).isin("", "Y", "N") == False)) \
                                   .distinct().rdd.map(lambda row: str(row[0])).collect())
            return feature_factory.feature.Multiplier._create_from_cats(
                filter_cols, filter_vals)
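# The categorical detection above delegates to _get_approx_distinct_count_for_col,
# which is not shown. A minimal standalone sketch of the same idea using the standard
# pyspark.sql.functions.approx_count_distinct, whose `rsd` argument bounds the relative
# standard deviation of the estimate; the DataFrame and column are illustrative.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("gold",), ("silver",), ("gold",)], ["tier"])
approx_distinct = df.select(
    F.approx_count_distinct("tier", rsd=0.05).alias("n")).first()["n"]
print(approx_distinct)  # approximately 2 distinct values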
Example #8
def standardise_dob(
    df: DataFrame,
    dob_col: str,
    date_fmt_if_string: str = "yyyy-MM-dd",
    drop_orig: bool = True,
):
    """Create column called dob_std with dob as a string in yyyy-MM-dd format
    or null otherwise

    Args:
        df (DataFrame): Spark dataframe
        dob_col (str): Name of dob column
        date_fmt_if_string (str, optional): Date format if incoming dates are already string. Defaults to "yyyy-MM-dd".
        drop_orig (bool, optional): Drop original date of birth column. Defaults to True.

    Returns:
        DataFrame: Spark DataFrame with new standardised dob column called dob_std
    """

    dtypes = dict(df.dtypes)

    if dtypes[dob_col] == "date":
        df = df.withColumn("dob_std", date_format(dob_col, "yyyy-MM-dd"))

    if dtypes[dob_col] == "timestamp":
        df = df.withColumn("dob_std", date_format(dob_col, "yyyy-MM-dd"))

    if dtypes[dob_col] == "string":
        df = df.withColumn("dob_std", to_timestamp(dob_col, date_fmt_if_string))
        df = df.withColumn("dob_std", date_format("dob_std", "yyyy-MM-dd"))

    if drop_orig:
        if dob_col != "dob_std":
            df = df.drop(dob_col)

    return df
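# A minimal usage sketch for standardise_dob, assuming an active SparkSession named
# `spark`; here the incoming dates are strings in dd/MM/yyyy format purely for illustration.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
people = spark.createDataFrame([("01/02/1990",), ("31/12/1985",)], ["dob"])
# dob_std is produced as a 'yyyy-MM-dd' string and the original dob column is dropped.
standardise_dob(people, "dob", date_fmt_if_string="dd/MM/yyyy").show()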
Example #9
    def append_features(self,
                        df: DataFrame,
                        groupBy_cols,
                        feature_sets: [FeatureSet],
                        withTrendsForFeatures: [FeatureSet] = None):
        """
        Appends features to incoming df. The features columns and groupby cols will be deduped and validated.
        If there's a group by, the groupby cols will be applied before appending features.
        If there's not a group by and no agg features then the features will be appended to df.
        :param df:
        :param groupBy_cols:
        :param feature_sets: input of FeatureSet
        :return:
        """
        # If groupBy_cols is passed in as something other than a list, convert it to a list
        # Validation - if the features passed in are a dict, convert to a list of values, etc.
        # groupBy_cols = self.helpers._to_list(groupBy_cols)
        groupBy_cols, groupBy_joiners = self.helpers._extract_groupby_joiner(
            groupBy_cols)
        features, dups = self.helpers._dedup_fast(df, [
            feature for feature_set in feature_sets
            for feature in feature_set.features.values()
        ])
        df = self.helpers._resolve_feature_joiners(
            df, features, groupBy_joiners).repartition(*groupBy_cols)

        # feature_cols = []
        agg_cols = []
        non_agg_cols = {}
        features_to_drop = []
        # base_cols = [f.base_col for f in features]

        # column validation
        # valid_result, undef_cols = self.helpers.validate_col(df, *base_cols)
        # assert valid_result, "base cols {} are not defined in df columns {}".format(undef_cols, df.columns)

        # valid_result, undef_cols = self.helpers._validate_col(df, *groupBy_cols)
        # assert valid_result, "groupby cols {} are not defined in df columns {}".format(undef_cols, df.columns)
        for feature in features:
            assert (len(feature.aggs) > 0 and len(groupBy_cols) > 0) or feature.agg_func is None, \
                "{} has either aggs or groupBys but not both, ensure both are present".format(feature.name)
            # feature_cols.append(feature.assembled_column)
            # feature_cols.append(F.col(feature.output_alias))
            agg_cols += [agg_col for agg_col in feature.aggs]
            if feature.agg_func is None:
                non_agg_cols[feature.output_alias] = feature.assembled_column
            else:
                df = df.withColumn(feature.output_alias,
                                   feature.assembled_column)

            if feature.is_temporary:
                features_to_drop.append(feature.name)

        if len(groupBy_cols) > 0:
            df = df.groupBy(*groupBy_cols)\
                .agg(*agg_cols)
        for fn, col in non_agg_cols.items():
            df = df.withColumn(fn, col)

        final_df = df.drop(*features_to_drop)
        # else:
        #     new_df = df.select(*df.columns + feature_cols)
        return final_df
Example #10
def process_log_data(spark, input_data: str, output_data: str,
                     schema: StructType, songs_df: DataFrame) -> None:
    """
    Extract the raw data from S3, transform it to our liking, and load it back into S3 in Parquet format.

    Arguments:
        spark: An active Spark connection.
        input_data: Path to the S3 bucket with input data.
        output_data: Path to the S3 bucket where the transformed data is going to be stored.
        schema: DataFrame schema of the log data JSON files.
        songs_df: DataFrame from the song_data function which is needed to create the songplays table.
    Returns:
        None.
    """
    log_data = spark.read.json(path=input_data, schema=schema)

    log_data_cached = log_data.repartition(8).cache()
    print(log_data_cached.count())

    users_df = (log_data_cached.select(
        col('userId').alias('user_id'),
        col('firstName').alias('first_name'),
        col('lastName').alias('last_name'), col('gender'), col('level'),
        col('ts')).withColumn(
            'row_number',
            sql_f.row_number().over(
                Window.partitionBy('user_id').orderBy(
                    col('ts').desc()))).where("row_number = 1").drop(
                        'row_number', 'ts').repartition(8))

    users_df_write = (users_df.write.mode('overwrite').parquet(
        f"{output_data}/analytical/users"))

    time_df = (log_data_cached.where("page = 'NextSong'").withColumn(
        'start_time',
        sql_f.from_unixtime(col('ts') / 1000).cast(TimestampType())).select(
            col('start_time'),
            hour('start_time').alias('hour'),
            dayofmonth('start_time').alias('day'),
            weekofyear('start_time').alias('week'),
            month("start_time").alias('month'),
            year("start_time").alias('year')).withColumn(
                'weekday',
                sql_f.when(dayofweek(col('start_time')) < 6,
                           True).otherwise(False)).dropDuplicates(
                               ['start_time']).repartition(8, 'year', 'month'))

    time_df_export = (time_df.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(f"{output_data}/analytical/time"))

    events_df = (log_data_cached.where("page = 'NextSong'").withColumn(
        "songplay_id", monotonically_increasing_id()).withColumn(
            'start_time',
            sql_f.from_unixtime(
                col('ts') / 1000).cast(TimestampType())).select(
                    col('songplay_id'), col('start_time'),
                    col('userId').alias('user_id'), col('level'),
                    col('sessionId').alias('session_id'), col('location'),
                    col('userAgent').alias('user_agent'), col('song'),
                    col('length'),
                    month("start_time").alias('month'),
                    year("start_time").alias('year')).repartition(8))

    join_condition = [
        songs_df.title == events_df.song, songs_df.duration == events_df.length
    ]

    songplays_df = (events_df.join(broadcast(songs_df.drop('year')),
                                   on=join_condition,
                                   how='left').select(col('songplay_id'),
                                                      col('start_time'),
                                                      col('user_id'),
                                                      col('level'),
                                                      col('song_id'),
                                                      col('artist_id'),
                                                      col('session_id'),
                                                      col('location'),
                                                      col('user_agent'),
                                                      col('month'),
                                                      col('year')).repartition(
                                                          8, 'year', 'month'))

    songplays_df_write = (songplays_df.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(f"{output_data}/analytical/songplays"))
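# A hedged usage sketch for process_log_data; the bucket paths are placeholders, and
# log_schema / songs_df are assumed to be built elsewhere (the schema must cover the
# fields the function reads: userId, firstName, lastName, gender, level, page,
# sessionId, location, userAgent, song, length and ts).
process_log_data(spark,
                 input_data="s3a://my-bucket/log_data/",
                 output_data="s3a://my-bucket/analytics",
                 schema=log_schema,
                 songs_df=songs_df)
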
from pyspark.sql.dataframe import DataFrame
sax = spark._jvm.com.ralib.notebook.spark.SAXSparkSession.getSession(
    spark._jsparkSession)

ds_Churn_Modelling = DataFrame(sax.getDataSet("Churn_Modelling"), spark)

#imports
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler

#data preparation
ds_Churn_Modelling = ds_Churn_Modelling.drop('RowNumber', 'Gender',
                                             'CustomerId', 'Surname',
                                             'Geography')

assembler = VectorAssembler(inputCols=[
    'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
    'IsActiveMember', 'EstimatedSalary'
],
                            outputCol="features")

scaler = StandardScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        withStd=True,
                        withMean=False)

#model preparation

#create Logistic Regression object