Example #1
def parse_dates(df, format):
    """
    Parses a date into year, month and day columns
    :param df: input df
    :param format: the format of the timestamp
    :return: dataframe
    """
    return df.withColumn('parsed_date',
                         f.to_timestamp(f.col('transaction_date'), format)) \
        .withColumn("year", f.year(f.col('parsed_date'))) \
        .withColumn("month", f.month(f.col('parsed_date'))) \
        .withColumn("day", f.dayofmonth(f.col('parsed_date'))) \
        .withColumn("unix_ts", f.unix_timestamp('parsed_date')) \
        .drop("transaction_date")
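
# A minimal usage sketch (not part of the original snippet): it assumes `f`
# is pyspark.sql.functions, as the function above does, and an input
# DataFrame with a string `transaction_date` column in 'MM/dd/yyyy' format.
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame([("01/15/2020",)], ["transaction_date"])
parse_dates(sample, "MM/dd/yyyy").show()
# expected columns: parsed_date, year, month, day, unix_ts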
Example #2
    def shared_test_partition_preserving(self, func, preserve, create=None):
        from pyspark.sql.functions import month
        from tests.test_data import FORECAST_DATA

        flintContext = self.flintContext

        def create_dataframe():
            return flintContext.read.pandas(make_pdf(FORECAST_DATA, ["time", "id", "forecast"]))

        if create is None:
            create = create_dataframe

        df_lazy = create()

        df_eager = create()
        df_eager.timeSeriesRDD

        df = create()
        df_joined = df.leftJoin(df, right_alias="right")

        df = create()
        df_cached = df.cache()
        df_cached.count()

        df_cached_joined = df_cached.leftJoin(df_cached, right_alias="right")

        partition_preserving_input_transforms = [
            lambda df: df,
            lambda df: df.withColumn("f2", df.forecast * 2),
            lambda df: df.select("time", "id", "forecast"),
            lambda df: df.filter(month(df.time) == 1)
        ]

        order_preserving_input_transforms = [
            lambda df: df.orderBy("time")
        ]

        input_dfs = [df_lazy, df_eager, df_joined, df_cached, df_cached_joined]

        for transform in partition_preserving_input_transforms:
            for input_df in input_dfs:
                self.assert_partition_preserving(transform(input_df), func, preserve)

        for transform in order_preserving_input_transforms:
            for input_df in input_dfs:
                self.assert_order_preserving(transform(input_df), func, preserve)

        df_cached.unpersist()
Example #3
def process_log_data(spark, input_data, output_data):
    """
    Description: This function reads the files in the filepath (log_data)
    to get the user, time and songplay info and uses it to populate the users and time dimension tables and the songplays fact table.
    
    Arguments:
        input_data: the path where the input json files are present. 
        output_data: the path where the output parquet files are written to
    
    Returns:
        None
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*.json"

    # read log data file
    log_df = spark.read.json(log_data).drop_duplicates()

    # filter by actions for song plays
    log_df = log_df.where(col("page") == "NextSong")

    # extract columns for users table
    users_table = log_df.withColumn("rn", row_number().over(Window.partitionBy("userId").orderBy(col("ts").desc()))).where(col("rn")==1).\
                                                    select(col('userId'),
                                                           col('firstName').alias('first_name'),
                                                           col('lastName').alias('last_name'),
                                                           col('gender'),
                                                           col('level'))

    # write users table to parquet files
    users_table.write.mode("overwrite").parquet(output_data +
                                                'analytics/users')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda ms: datetime.utcfromtimestamp(ms / 1000),
                        TimestampType())
    log_df = log_df.withColumn("start_time", get_timestamp("ts"))

    # extract columns to create time table
    time_table = log_df.select('start_time').dropDuplicates().select(
        'start_time',
        hour('start_time').alias('hour'),
        dayofmonth('start_time').alias('day'),
        weekofyear('start_time').alias('week'),
        month('start_time').alias('month'),
        year('start_time').alias('year'),
        dayofweek('start_time').alias('weekday'))

    # write time table to parquet files partitioned by year and month
    time_table.write.mode("overwrite").partitionBy(
        'year', 'month').parquet(output_data + 'analytics/time')

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data +
                              "song_data/*/*/*/*.json").drop_duplicates()

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = log_df.join(song_df,log_df.artist == song_df.artist_name).drop_duplicates().select(monotonically_increasing_id().alias('songplay_id'),\
                                                                                   'start_time', col('userId').alias('user_Id'), 'level',\
                                                                                   'song_id','artist_id',\
                                                                                   col('sessionId').alias('session_id'), 'location',\
                                                                                   col('userAgent').alias('user_agent'))

    songplays_table = songplays_table.join(time_table, songplays_table.start_time == time_table.start_time).select('songplay_id',\
                                                                                                                  songplays_table.start_time,\
                                                                                                                   'user_Id','level','song_id',\
                                                                                                                   'artist_id','session_id',\
                                                                                                                   'location','user_agent','year',\
                                                                                                                   'month').drop_duplicates()
    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode("overwrite").partitionBy(
        'year', 'month').parquet(output_data + 'analytics/songplays')
Example #4
def main():
    spark = SparkSession \
        .builder \
        .config('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:2.7.0') \
        .getOrCreate()

    # use for only one file
    # filename = 'chicago_taxi_trips_2016_01.csv'

    # use for reading all files
    filename = '*'

    df = spark.read \
        .format('csv') \
        .options(header=True, inferSchema=True) \
        .load(os.path.join(etl_conf['s3_taxi_dir_path'], filename))
    # df.printSchema()

    # Take a look at the top rows
    # df.limit(5).toPandas()

    # Check initial number of records
    # df.count()

    df_with_hour = df.withColumn('year', year(df.trip_start_timestamp))\
                     .withColumn('month', month(df.trip_start_timestamp))\
                     .withColumn('day', dayofmonth(df.trip_start_timestamp))\
                     .withColumn('hour', hour(df.trip_start_timestamp))

    df_features = df_with_hour.select('year', 'month', 'day', 'hour',
                                      'pickup_community_area',
                                      'dropoff_community_area')

    df_no_nulls = df_features.dropna()

    # df_no_nulls.count()

    # Create StringIndexer and fit + transform pickup data
    pickup_indexer = StringIndexer(inputCol='pickup_community_area',
                                   outputCol='pickup_community_area_indexed')

    pickup_indexer_model = pickup_indexer.fit(df_no_nulls)
    df_pickup_indexed = pickup_indexer_model.transform(df_no_nulls)

    # Create StringIndexer and fit + transform dropoff data
    dropoff_indexer = StringIndexer(inputCol='dropoff_community_area',
                                    outputCol='dropoff_community_area_indexed')

    dropoff_indexer_model = dropoff_indexer.fit(df_pickup_indexed)
    df_dropoff_indexed = dropoff_indexer_model.transform(df_pickup_indexed)

    # Create OneHotEncoder and fit + transform pickup & dropoff data
    encoder = OneHotEncoderEstimator() \
        .setInputCols(['hour',
                       'pickup_community_area_indexed',
                       'dropoff_community_area_indexed']) \
        .setOutputCols(['hour_encoded',
                        'pickup_community_area_encoded',
                        'dropoff_community_area_encoded'])

    encoder_model = encoder.fit(df_dropoff_indexed)
    df_encoded = encoder_model.transform(df_dropoff_indexed)

    # df_encoded.printSchema()

    bucket = output_conf['s3_bucket']
    key = output_conf['s3_model_key']

    # save the pickup StringIndexer and its fitted model
    pickup_indexer_name = 'pickup_indexer_name'
    pickup_indexer_path = os.path.join(bucket, key, pickup_indexer_name)
    pickup_indexer.write().overwrite().save(pickup_indexer_path)

    pickup_indexer_model_name = 'pickup_indexer_model_name'
    pickup_indexer_model_name_path = os.path.join(bucket, key,
                                                  pickup_indexer_model_name)
    pickup_indexer_model \
        .write() \
        .overwrite() \
        .save(pickup_indexer_model_name_path)

    # save the dropoff StringIndexer and its fitted model
    dropoff_indexer_name = 'dropoff_indexer_name'
    dropoff_indexer_path = os.path.join(bucket, key, dropoff_indexer_name)
    dropoff_indexer.write().overwrite().save(dropoff_indexer_path)

    dropoff_indexer_model_name = 'dropoff_indexer_model_name'
    dropoff_indexer_model_name_path = os.path.join(bucket, key,
                                                   dropoff_indexer_model_name)
    dropoff_indexer_model \
        .write() \
        .overwrite() \
        .save(dropoff_indexer_model_name_path)

    # save the one-hot encoder and model
    encoder_name = 'encoder_name'
    encoder_name_path = os.path.join(bucket, key, encoder_name)
    encoder.write().overwrite().save(encoder_name_path)

    encoder_model_name = 'encoder_model_name'
    encoder_model_name_path = os.path.join(bucket, key, encoder_model_name)
    encoder_model.write().overwrite().save(encoder_model_name_path)

    # make final dataframe and store back to S3
    df_final = df_encoded.select('year', 'month', 'day', 'hour_encoded',
                                 'pickup_community_area_encoded',
                                 'dropoff_community_area_encoded')

    bucket = output_conf['s3_bucket']
    key = output_conf['s3_data_key']

    output_path = os.path.join(bucket, key)

    df_final.write.partitionBy('year', 'month', 'day') \
            .parquet(output_path, mode='overwrite')
Example #5
# the snippets below assume these imports
from pyspark.sql import functions
from pyspark.sql.functions import col, lit

# add a new column (here with a placeholder constant value)
new_column = df.withColumn("Continent", lit("Europe"))
# rename a column
renamed = df.withColumnRenamed("first", "first_name")

# string manipulations ----> don't forget: from pyspark.sql import functions

df.select(functions.upper(df.country)).show()

df.select(functions.split('email','@'))

# concatenate strings
df.select(functions.concat_ws(':','country','first')).collect()
# extract a particular year, date or time from a column
df.select(functions.year('created_at')).show()
df.select(functions.month('created_at')).show()

#filter data
df.filter(col('email').contains('@gmail.com')).show()

df.filter(col('country') == 'Switzerland').show()

df.filter(col('country').isin('Switzerland')).show()

df.filter(col('first').like('T%')).show()

df.filter(col('id').between(1,10)).show()

#some dataframe api

df.select('country').sort('country').show()
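
# A hedged setup sketch (not from the original example): a tiny DataFrame
# with the columns the snippets above assume (id, first, email, country,
# created_at), so they can be run end to end.
from pyspark.sql import SparkSession, functions
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, "Tom", "tom@gmail.com", "Switzerland", "2020-01-15 10:30:00")],
    ["id", "first", "email", "country", "created_at"],
)
df.filter(col("country") == "Switzerland").show()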
Example #6
def month(self) -> "ks.Series":
    """
    The month of the timestamp as January = 1, December = 12.
    """
    return column_op(lambda c: F.month(c).cast(LongType()))(
        self._data).alias(self._data.name)
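
# A minimal usage sketch of the equivalent public API, assuming the method
# above is the databricks.koalas ks.Series.dt.month accessor:
import pandas as pd
import databricks.koalas as ks

s = ks.from_pandas(pd.Series(pd.date_range("2020-01-31", periods=3, freq="M")))
print(s.dt.month)  # 1, 2, 3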
Example #7
def expand_date(df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
    df = df.withColumn("Date", df.Date.cast(T.DateType()))
    return (df.withColumn("Year", F.year(df.Date)).withColumn(
        "Month",
        F.month(df.Date)).withColumn("Week", F.weekofyear(df.Date)).withColumn(
            "Day", F.dayofmonth(df.Date)))
Example #8
def process_log_data(spark, input_data, output_data):
    """
    Load log data from the S3 bucket, extract columns for the users, time and
    songplays tables, and write them to parquet files which are saved in S3
    :param spark: spark session object
    :param input_data: path to the S3 bucket with log data
    :param output_data: output S3 bucket where parquet files are saved
    :return: None
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, LOG_DATA_FILES)

    # read log data file
    actions = spark.read.json(log_data)
    print("Number of rows in action data: %s" % actions.count())

    # filter by actions for song plays
    actions = actions.filter(actions.page == "NextSong")
    print("Filtered  rows in action data: %s" % actions.count())

    # extract columns for users table
    users_table = actions.select('userId', 'firstName', 'lastName', 'gender',
                                 'level').dropDuplicates()

    print("users_table writing to parquet")
    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users'), 'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda ts: str(int(int(ts) / 1000)))
    actions = actions.withColumn('timestamp', get_timestamp(actions.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda ts: str(datetime.fromtimestamp(int(ts) / 1000)))
    actions = actions.withColumn('datetime', get_datetime(actions.ts))

    # extract columns to create time table
    time_table = actions.select('datetime').withColumn(
        'start_time',
        actions.datetime).withColumn('hour', hour('datetime')).withColumn(
            'day', dayofmonth('datetime')).withColumn(
                'week', weekofyear('datetime')).withColumn(
                    'month', month('datetime')).withColumn(
                        'year', year('datetime')).withColumn(
                            'weekday', dayofweek('datetime')).dropDuplicates()

    print("time_table writing to parquet")
    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time'), 'overwrite')

    print("reading files for songs df")
    # read in song data to use for songplays table
    songs = spark.read.json(os.path.join(input_data, SONG_DATA_FILES))

    # extract columns from joined song and log datasets to create songplays table
    joined_actions = actions.join(songs, songs.title == actions.song)
    songplays_table = joined_actions['datetime', 'userId', 'level', 'song_id',
                                     'artist_id', 'sessionId', 'location',
                                     'userAgent']
    songplays_table = songplays_table.withColumn(
        'songplay_id', monotonically_increasing_id())

    print("songplays_table writing to parquet")
    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(os.path.join(output_data, 'songplays'),
                                  'overwrite')
Example #9
def process_log_data(spark, input_data, output_data):
    """
        Description: This function reads the data from S3 and extracts the following three tables:
            1. users table
            2. time table
            3. songplay table

        Parameters:
            spark       : Spark Session from function (create_spark_session)
            input_data  : S3 location of log data  files with the songs metadata. Files are in json format
            output_data : S3 bucket where dimensional tables are stored in parquet format
    """


    # get filepath to log data file
    log_data = input_data+'log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

###Users Table
    # extract columns for users table
    users_table = df.selectExpr("userId as user_id", "firstName as first_name",
                                "lastName as last_name", "gender", "level").dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + 'users/')


###Time Table

    # create timestamp column from original timestamp column
    get_timestamp = udf(date_convert, TimestampType())
    df = df.withColumn("timestamp", get_timestamp(col("ts")))

    # create datetime column from the parsed timestamp column
    df = df.withColumn("start_time", to_date(col("timestamp")))


    df = df.withColumn("hour", hour("timestamp"))
    df = df.withColumn("day", dayofmonth("timestamp"))
    df = df.withColumn("month", month("timestamp"))
    df = df.withColumn("year", year("timestamp"))
    df = df.withColumn("week", weekofyear("timestamp"))
    df = df.withColumn("weekday", dayofweek("timestamp"))

    time_table = df.select(col("start_time"), col("hour"), col("day"), col("week"), \
                           col("month"), col("year"), col("weekday")).distinct()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(output_data + 'time/')





###SONGPLAYS

    # read in song data to use for songplays table

    songs_df = spark.read.parquet(output_data + 'songs/*/*/*')
    artists_df = spark.read.parquet(output_data + 'artists/*')
    songs_logs_df = df.join(songs_df, (df.song == songs_df.title))
    artists_songs_logs_df = songs_logs_df.join(artists_df, (songs_logs_df.artist == artists_df.name))


    # extract columns from joined song and log datasets to create songplays table

    songplays_df = artists_songs_logs_df.join(
        time_table,
        artists_songs_logs_df.ts == time_table.start_time, 'left'
    ).drop(artists_songs_logs_df.year)

    # write songplays table to parquet files partitioned by year and month

    songplays_table = songplays_df.select(
        col('start_time').alias('start_time'),
        col('userId').alias('user_id'),
        col('level').alias('level'),
        col('song_id').alias('song_id'),
        col('artist_id').alias('artist_id'),
        col('sessionId').alias('session_id'),
        col('location').alias('location'),
        col('userAgent').alias('user_agent'),
        col('year').alias('year'),
        col('month').alias('month'),
    ).repartition("year", "month")

    songplays_table.write.partitionBy("year", "month").parquet(output_data + 'songplays/')
Example #10
def process_log_data(spark, input_data, output_data):
    """
    Description: Loads log_data from the S3 bucket, processes it by extracting
                 the songplays fact table along with the users and time dimension tables,
                 and then loads them back to S3
    Parameters:
        spark: SparkSession object
        input_data: path to the S3 bucket containing log_data
        output_data: path to the S3 bucket where the dimensional tables
                     will be stored in parquet format
    Returns:
        None
    """

    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"
    #     log_data = input_data + "log-data-unzipped/*.json"      for using data locally

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    users_table = df.select('userId', 'firstName', 'lastName', 'gender',
                            'level').dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + 'users_table', mode='overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime(
        '%Y-%m-%d %H:%M:%S'))
    df = df.withColumn('start_time', get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_date = udf(
        lambda x: datetime.fromtimestamp(x / 1000).strftime('%Y-%m-%d'))
    df = df.withColumn('date', get_date(df.ts))

    # extract columns to create time table
    time_table = df.select('start_time').withColumn('year', year(col('start_time'))) \
                                        .withColumn('month', month(col('start_time'))) \
                                        .withColumn('week', weekofyear(col('start_time'))) \
                                        .withColumn('weekday', date_format(col('start_time'),'E')) \
                                        .withColumn('day', dayofmonth(col('start_time'))) \
                                        .withColumn('hour', hour(col('start_time'))) \
                                    .dropDuplicates()

    # dayofweek vs date_format ref: https://stackoverflow.com/questions/25006607/how-to-get-day-of-week-in-sparksql

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + 'time_table',
                             mode='overwrite',
                             partitionBy=['year', 'month'])

    # read in song data to use for songplays table
    song_data = input_data + "song_data/*/*/*/*.json"  # "song_data/A/A/A/*.json" for sample data
    #     song_data = input_data + "song-data-unzipped/song_data/*/*/*/*.json"     for using data locally

    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    song_df.createOrReplaceTempView('song_df')
    df.createOrReplaceTempView('log_df')
    time_table.createOrReplaceTempView('time_table')

    songplays_table = spark.sql("""SELECT DISTINCT  
                                             t.start_time,
                                             t.year as year,
                                             t.month as month,                                             
                                             l.userId, 
                                             l.level, 
                                             s.song_id,
                                             s.artist_id, 
                                             l.sessionid, 
                                             s.artist_location,
                                             l.useragent
                                FROM song_df s
                                JOIN log_df l
                                     ON s.artist_name = l.artist
                                     AND s.title = l.song
                                     AND s.duration = l.length
                                JOIN time_table t
                                     ON t.start_time = l.start_time
                         """).dropDuplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + 'songplays_table',
                                  mode='overwrite',
                                  partitionBy=['year', 'month'])
Example #11
    StructField("hashTags", ArrayType(StringType()), True),
    StructField("lang", StringType(), True),
    StructField("text", StringType(), True),
    StructField("createdAt", LongType(), True)
])

(spark.readStream.table("tweets.`bronze`")
    .withColumn("json", from_json(col("tweet"), schema))
    .filter(col("json.id").isNotNull())
    .withColumn("hashtag", explode("json.hashTags"))
    .withColumn("hashtag", lower(col("hashtag")))
    .withColumn("createdAt",
                (col("json.createdAt").cast(LongType()) / 1000).cast(TimestampType()))
    .withColumn("year", year(col("createdAt")))
    .withColumn("month", month(col("createdAt")))
    .withColumn("day", dayofmonth(col("createdAt")))
    .select("json.id", "json.user", "hashtag", "json.lang", "json.text",
            "createdAt", "year", "month", "day")
    .writeStream.format("delta")
    .option("checkpointLocation", silverCheckpointPath)
    .outputMode("append")
    .queryName(silverStreamName)
    .table("tweets.`silver`"))

# COMMAND ----------

# %sql select * from tweets.`silver` order by createdAt desc limit 10;

# COMMAND ----------
Example #12
def process_log_data(spark, input_data, output_data):
    """Process log data with spark and store output.

    Keyword arguments:
    spark -- spark session object
    input_data -- filepath to input data files
    output_data -- filepath to store output data files
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.select('*').where(df['page'] == 'NextSong')

    # extract columns for users table
    users_table = df.select(df['userId'].alias('user_id'), \
                        df['firstName'].alias('first_name'), \
                        df['lastName'].alias('last_name'), \
                        df['gender'], \
                        df['level']).distinct()

    # write users table to parquet files
    users_table.write.parquet(path=output_data + 'users/')

    # create epoch-seconds column from the original millisecond timestamp column
    get_timestamp = udf(lambda x: int(x / 1000), IntegerType())
    df = df.withColumn('start_time', get_timestamp('ts'))

    # create datetime column from the epoch-seconds column
    # (from_unixtime is a column function, so no udf is needed here)
    df = df.withColumn('datetime', from_unixtime('start_time'))

    # extract columns to create time table
    time_table = df.select('start_time', \
                       hour('datetime').alias('hour'), \
                       dayofmonth('datetime').alias('day'), \
                       weekofyear('datetime').alias('week'), \
                       month('datetime').alias('month'), \
                       year('datetime').alias('year'), \
                       date_format('datetime', 'u').alias('weekday'))

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(path = output_data + 'time/', \
                             partitionBy = ('year', 'month'))

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + 'song_data/*/*/*/*.json')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, df['song'] == song_df['title']) \
                    .select(monotonically_increasing_id().alias('songplay_id'),
                            'start_time',
                            year('datetime').alias('year'),
                            month('datetime').alias('month'),
                            df['userId'].alias('user_id'),
                            'level',
                            'song_id',
                            'artist_id',
                            df['sessionId'].alias('session_id'),
                            'location',
                            df['userAgent'].alias('user_agent'))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(path = output_data + 'songplays/', \
                              partitionBy = ('year', 'month'))
Example #13
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('dates').getOrCreate()
df = spark.read.csv('appl_stock.csv', header=True, inferSchema=True)

df.select(['Date', 'Open']).show()

# Working with dates

from pyspark.sql.functions import (dayofmonth, hour, dayofyear, month, year,
                                   weekofyear, format_number, date_format)

df.select(dayofmonth(df['Date'])).show()

df.select(hour(df['Date'])).show()

df.select(month(df['Date'])).show()

# Average closing price per year
df.select(year(df['Date'])).show()
newdf = df.withColumn("Year", year(df['Date']))
result = newdf.groupBy("Year").mean().select(["Year", "avg(Close)"])
new = result.withColumnRenamed("avg(Close)", "Average Closing Price")
new.select(
    ['Year',
     format_number('Average Closing Price', 2).alias("Avg Close")]).show()
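
# A hedged equivalent of the grouping above using agg() with an alias,
# which avoids the separate withColumnRenamed step:
from pyspark.sql.functions import avg

newdf.groupBy("Year").agg(
    format_number(avg("Close"), 2).alias("Avg Close")
).show()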
Example #14
def process_log_data(spark, input_data, output_data):
    """
    Description: This function reads the log_data from S3 and processes it by using
                 Spark and then loads the resulting tables onto S3 in parquet format.
    Parameters:
            @input: spark - Spark Session
            @input: input_data - root location of the log_data and song_data files
            @input: output_data - S3 location where output files are stored
    """
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level') \
                        .withColumnRenamed('userId', 'user_id') \
                        .withColumnRenamed('firstName', 'first_name') \
                        .withColumnRenamed('lastName', 'last_name')

    users_table = users_table.dropDuplicates(['user_id'])

    # write users table to parquet files
    users_table.write.parquet(output_data + "users/", 'overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime(
        '%Y-%m-%d %H:%M:%S'))
    df = df.withColumn("timestamp", get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(
        lambda x: datetime.fromtimestamp(x / 1000).strftime('%Y-%m-%d'))
    df = df.withColumn("start_time", get_datetime(df.ts))

    # extract columns to create time table
    df = df.withColumn('hour', hour('timestamp'))
    df = df.withColumn('day', dayofmonth('timestamp'))
    df = df.withColumn('week', weekofyear('timestamp'))
    df = df.withColumn('month', month('timestamp'))
    df = df.withColumn('year', year('timestamp'))
    df = df.withColumn('weekday', dayofweek('timestamp'))

    time_table = df.select('start_time', 'hour', 'day', 'week', 'month',
                           'year', 'weekday')
    time_table = time_table.dropDuplicates(['start_time'])

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year',
                                 'month').parquet(output_data + "time/",
                                                  'overwrite')

    # read in song data to use for songplays table
    song_data = input_data + "song_data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, df.artist == song_df.artist_name, how = 'left') \
                        .select('start_time', 'userId', 'level', 'song_id', 'artist_id', 'sessionId', 'location', 'userAgent') \
                        .withColumn('songplay_id', monotonically_increasing_id()) \
                        .withColumnRenamed('userId', 'user_id') \
                        .withColumnRenamed('sessionId', 'session_id') \
                        .withColumnRenamed('userAgent', 'user_agent') \
                        .withColumn('year', year('start_time')) \
                        .withColumn('month', month('start_time'))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        output_data + 'songplays/', 'overwrite')
Example #15
## collecting date for partitioning
date = data.select(f.max('nav_date')).collect()[0][0]
year = date.year
month = date.month

if month < 10:
    month = '0' + str(month)

part_date = str(year) + str(month)

## extracting weekday column using date
data = data.withColumn('week_day', f.date_format('nav_date', 'E'))

sqlctxt.sql("set hive.exec.dynamic.partition.mode=nonstrict")
funds = data.filter(f.month('nav_date') == 4).select('fund_id').distinct()

funds.registerTempTable("fund_id")
#sqlctxt.sql("insert overwrite table h011gtcsandbox.xnd_pricing_fund_id PARTITION (YYYYMM = " + str(part_date) + ") select * from fund_id")

funds = funds.select(f.collect_set('fund_id')).collect()[0][0]
funds = [str(f) for f in funds]

data_without_funds = data.where(~data.fund_id.isin(funds))
funds_miss = data.filter((data.fund_id == '2DEC'))
data_with_funds = data.where(data.fund_id.isin(funds))
data = data_with_funds.unionAll(funds_miss)


## function to convert weekday from string to numerical EX: Monday as 0 so on
def week_day(x):
Example #16
# Transform
transformed_temp_df = cleaned_temp_df\
    .select("dt",
            "AverageTemperature",
            "AverageTemperatureUncertainty",
            "City",
            "Country",
            "Latitude",
            "Longitude")\
    .withColumn("dt", udf_parse_datetime("dt"))\
    .withColumnRenamed("AverageTemperature", "avg_temp")\
    .withColumnRenamed("AverageTemperatureUncertainty", "avg_temp_uncertainty")\
    .withColumn("city_code", udf_map_country("country"))\
    .withColumnRenamed("City", "city")\
    .withColumnRenamed("Country", "country")\
    .withColumnRenamed("Latitude", "latitude")\
    .withColumnRenamed("Longitude", "longitude")\
    .withColumnRenamed("dt", "date_time")\
    .withColumn('month', month('date_time')) \
    .withColumn('year', year('date_time'))

transformed_temp_df = transformed_temp_df.filter(transformed_temp_df.city_code != 'null')

# Write
transformed_temp_df.write\
    .partitionBy("city_code", "year", "month")\
    .mode("append")\
    .parquet("{}/transformed/temperature/".format(s3_bucket_name))

Example #17
def process_i94_data(spark, input_data, output_data, local):
    """ - Reads i94 fact data from filepath
        - Converts datetime formats to date
        - Reads and joins mappings for countries, visa_categories and us_states
        - Cleans i94addr and age columns
        - Creates duration measure
        - Creates date dimension table
        - Writes data to output location in parquet file format:
            - i94, visa_categories, us_states, dates


    :param spark: spark session
    :param input_data: list of filepaths, file order must match execution steps
    :param output_data: filepath
    :param local: if True, limit the rows and additionally write csv output for local runs
    """

    ### read i94 data
    df_spark = get_i94_data(spark, input_data[0])

    if local:
        df_spark = df_spark.limit(300)

    ### Datetime conversions
    # register udfs
    udf_date_from_sas = udf(lambda x: convert_sas_date(x), DateType())
    udf_date_from_str = udf(lambda x: convert_str_to_date(x), DateType())

    # add date columns
    df_spark = df_spark\
        .withColumn("arrival_date", udf_date_from_sas("arrdate")) \
        .withColumn("departure_date", udf_date_from_sas("depdate")) \
        .withColumn("dtadfile_date", udf_date_from_str("dtadfile"))

    ### i94cit/res number to iso-code mapping
    # read country data
    df_con = get_country_mapping(spark, input_data[1])

    # join i94cit
    joinExpr = [df_spark.i94cit == df_con.i94_code]
    df_spark =\
        df_spark.join(df_con.select("i94_code","iso_code"), joinExpr, "left_outer")\
            .withColumn("cit_country_id", coalesce("iso_code", lit(99))).drop("i94_code","iso_code")

    # join i94res
    joinExpr = [df_spark.i94res == df_con.i94_code]
    df_spark =\
        df_spark.join(df_con.select("i94_code","iso_code"), joinExpr, "left_outer")\
            .withColumn("res_country_id", coalesce("iso_code", lit(99))).drop("i94_code","iso_code")

    ### Visatype to visa_id mapping
    # read visa
    df_visa = get_visa_mapping(spark, input_data[2])
    joinExpr = [df_spark.visatype == df_visa.visa]
    df_spark =\
        df_spark.join(df_visa.select("visa","visa_id").dropna(), joinExpr, "left_outer")\
            .withColumn("visa_id", coalesce("visa_id", lit(1))).drop("visa")

    ### Clean i94addr - US-States
    df_states = get_us_states_mapping(spark, input_data[3])
    joinExpr = [df_spark.i94addr == df_states.state_id]
    df_spark = \
        df_spark.join(df_states.select("state_id").dropna(), joinExpr, "left_outer")\
            .withColumn("state_id_clean", coalesce("state_id", lit(99))).drop("state_id")

    ### Clean i94mode - replace nulls with 9 not reported
    df_spark = df_spark.fillna({"i94mode": 9})

    ### Clean i94visa - travel purpose - replace nulls with 9 not reported
    df_spark = df_spark.fillna({"i94visa": 9})

    ### Clean gender
    udf_clean_gender = udf(lambda x: clean_gender(x), StringType())
    df_spark = df_spark.withColumn("gender_clean", udf_clean_gender("gender"))

    ### Clean Age and register udf
    clean_age = udf(lambda x: clean_negative_age(x), IntegerType())
    df_spark = df_spark.withColumn("age", clean_age("i94bir"))

    ### Create time dimension
    df_dates = df_spark.select(col("arrival_date").alias("date")).dropDuplicates().dropna() \
        .withColumn("year", year("date")) \
        .withColumn("month", month("date")) \
        .withColumn("day", dayofmonth("date")) \
        .withColumn("week", weekofyear("date"))

    # Calculate duration in days departure - arrival
    df_spark = df_spark.withColumn("duration",
                                   datediff("departure_date", "arrival_date"))

    ### Select final fields for fact table
    df_spark = df_spark \
                .withColumn("i94_id", monotonically_increasing_id()) \
                .select(
                        "i94_id",
                        "cit_country_id",
                        "res_country_id",
                        col("state_id_clean").alias("state_id"),
                        col("i94mode").alias("mode_id"),
                        col("i94visa").alias("purpose_id"),
                        "visa_id",
                        "arrival_date",
                        col("cicid").alias("cic_id"),
                        col("gender_clean").alias("gender"),
                        "count",
                        "duration",
                        "age",
                        col("i94yr").alias("year"),
                        col("i94mon").alias("month")
                    )

    ### Write out
    df_spark.write.parquet(output_data + "i94.parquet",
                           mode="append",
                           partitionBy=['year', 'month'])
    df_visa.write.parquet(output_data + "visa_categories.parquet",
                          mode="overwrite")
    df_states.write.parquet(output_data + "us_states.parquet",
                            mode="overwrite")
    df_dates.write.parquet(output_data + "dates.parquet",
                           mode="append",
                           partitionBy=['year', 'month'])
    if local:
        df_spark.write.csv(output_data + "i94.csv",
                           header=True,
                           mode="overwrite",
                           sep=";")

        df_visa = df_visa.repartition(1)
        df_visa.write.csv(output_data + "visa_categories.csv",
                          header=True,
                          mode="overwrite",
                          sep=";")

        df_states = df_states.repartition(1)
        df_states.write.parquet(output_data + "us_states.parquet",
                                mode="overwrite")
        df_states.write.csv(output_data + "us_states.csv",
                            header=True,
                            mode="overwrite",
                            sep=";")

        df_dates = df_dates.repartition(1)
        df_dates.write.csv(output_data + "dates.csv",
                           header=True,
                           mode="overwrite",
                           sep=";")
Example #18
def process_log_data(spark, input_data, output_data):
    """Process log data and create the fact and dimension tables, then save them to S3"""
    log_data = input_data + "log_data/*/*/*"
    log_data = "s3a://data-cap/log_data/*/*/*"
    
    df = spark.read.format("json").load(log_data)
    df = df.where(df.page == "NextSong")

    def get_ts(x):
        """udf to convert ts to datetime"""
        return datetime.fromtimestamp(x/1000)
    
    get_time_stamp = udf(get_ts, TimestampType())
    df = df.withColumn('start_time', get_time_stamp('ts'))
    df = df.withColumn("songplay_id", F.monotonically_increasing_id())
    df.createOrReplaceTempView("log_data")
    df.show()

    # create users table and write to s3
    users_table = df.select(
        F.col("userid").alias("user_id"),
        F.col("firstName").alias("first_name"),
        F.col("lastName").alias("last_name"),
        F.col("gender").alias("gender"),
        F.col("level").alias("level")
    ).distinct()
    users_table.write.parquet(output_data + "users_table", mode='overwrite')
    users_table.show()

    # Create time_table
    time_table = df.select("start_time",
                           F.hour("start_time").alias('hour'),
                           F.dayofmonth("start_time").alias('day'),
                           F.weekofyear("start_time").alias('week'),
                           F.month("start_time").alias('month'),
                           F.year("start_time").alias('year'),
                           F.date_format("start_time","u").alias('weekday')
                          ).distinct()
    # Write time_table to s3
    time_table.write.partitionBy("year", "month").parquet(output_data + "time_table", mode='overwrite')

    # read songs data from s3
    song_df = spark.read.parquet(output_data + "songs_table")
    song_df.createOrReplaceTempView("songs_table")
    song_df.show()

    # read artists data from s3
    artist_df = spark.read.parquet(output_data + "artists_table")
    artist_df.createOrReplaceTempView("artists_table")
    artist_df.show()

    # Create a time_table view for exploratory
    time_table.createOrReplaceTempView("time_table")
    time_table.show()

    # Create the fact table by joining logs, songs and artist tables
    songplays_table = spark.sql(""" SELECT log.start_time,
                                       log.userid,
                                       log.level,
                                       art.artist_id,
                                       song.song_id,
                                       log.sessionid,
                                       log.location,
                                       log.useragent
                                       FROM log_data log JOIN artists_table art ON (log.artist = art.artist_name)
                                       JOIN songs_table song ON (song.artist_id = art.artist_id)""")
    # songplays_table.write.partitionBy("userid").parquet(output_data + "songplays_table", mode='overwrite')
    songplays_table.show()
    print(f"number of records in songplays_table {songplays_table.count()}")
Example #19
def process_log_data(spark, input_data, output_data):
    """Imports the log data. Generates user table, time table, and songplay table and
        saves them to parquet files."""

    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"

    # define log_data_schema
    log_data_schema = StructType([
        StructField('artist', StringType(), True),
        StructField('auth', StringType(), True),
        StructField('firstName', StringType(), True),
        StructField('gender', StringType(), True),
        StructField('itemInSession', IntegerType(), True),
        StructField('lastName', StringType(), True),
        StructField('length', FloatType(), True),
        StructField('level', StringType(), True),
        StructField('location', StringType(), True),
        StructField('method', StringType(), True),
        StructField('page', StringType(), True),
        StructField('registration', FloatType(), True),
        StructField('sessionId', IntegerType(), True),
        StructField('song', StringType(), True),
        StructField('status', StringType(), True),
        StructField('ts', LongType(), True),
        StructField('userAgent', StringType(), True),
        StructField('userId', IntegerType(), True)
    ])

    # read log data file
    df = spark.read.json(log_data, log_data_schema)

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    #create a spark sql view of the log data
    df.createOrReplaceTempView("df_log_data")

    # extract columns for users table
    users_table = spark.sql("SELECT DISTINCT user_id, firstName, lastName, \
                                             gender, level FROM df_log_data")

    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(
        os.path.join(output_data, "users_table"))

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda ts: datetime.fromtimestamp(ts / 1000.0),
                        TimestampType())
    df = df.withColumn("start_time", get_timestamp('ts'))

    # set df_log_data table to new newly modified version
    df.createOrReplaceTempView("df_log_data")

    # extract columns to create time table using pyspark sql functions
    time_table = spark.sql("SELECT DISTINCT start_time FROM df_log_data")
    time_table = time_table.withColumn("hour", hour("start_time")) \
                           .withColumn("day", dayofmonth("start_time")) \
                           .withColumn("week", weekofyear("start_time")) \
                           .withColumn("month", month("start_time")) \
                           .withColumn("year", year("start_time")) \
                           .withColumn("weekday", date_format('start_time','E'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month")\
              .mode('overwrite').parquet(os.path.join(output_data, "timetable"))

    # read in song data to use for songplays table
    song_df = spark.read.parquet(os.path.join(output_data, "songtable"))
    song_df.createOrReplaceTempView("songs_table")

    artist_df = spark.read.parquet(os.path.join(output_data, "artists_table"))
    artist_df.createOrReplaceTempView("artists_table")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = spark.sql(
        "SELECT ts, user_id, level, songs_table.song_id, \
                                songs_table.artist_id, sessionid, df_log_data.location, userAgent\
                                FROM df_log_data \
                                JOIN songs_table \
                                ON df_log_data.song = songs_table.title\
                                JOIN artists_table \
                                ON df_log_data.artist = artists_table.name")

    songplays_table = songplays_table.withColumn("start_time",
                                                 get_timestamp('ts'))
    songplays_table = songplays_table.withColumn("month", month("start_time")) \
                                     .withColumn("year", year("start_time"))

    # drop ts column
    songplays_table = songplays_table.drop("ts")
    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").mode('overwrite')\
                         .parquet(os.path.join(output_data, "songplays_table"))
Example #20
def select_range_time(df, day_ini, day_fin):
    # Keep rows whose created_at date falls within [day_ini, day_fin].
    # Filtering year, month and day independently breaks for ranges that span
    # month or year boundaries, so compare the full date instead
    # (requires: from pyspark.sql.functions import col, to_date).
    df_ret = df.filter(to_date(col("created_at")).between(day_ini, day_fin))
    return df_ret
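
# A hedged usage sketch (assumes a DataFrame `df` with a created_at
# timestamp column):
from datetime import date

df_q1 = select_range_time(df, date(2020, 1, 1), date(2020, 3, 31))
df_q1.show()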
Example #21
def process_log_data(spark, input_data, output_data):
    """
        In this function we load the log_data files and create the songplays, users and time tables.
        Input: SparkSession,
               input_data filepath for the log data
               output_data filepath where the output tables are written

        Output: We produce parquet files for the songplays, users and time tables.
    """
    # get filepath to log data file
    log_data = input_data

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(col("page") == "NextSong")

    # extract columns for users table
    users_table = df['userId', 'firstName', 'lastName', 'gender', 'level']
    #drop duplicates
    users_table = users_table.drop_duplicates(subset=['userId'])

    # write users table to parquet files
    users_table = users_table.write.partitionBy('userId').parquet(
        os.path.join(output_data, 'users.parquet'), 'overwrite')
    print("users_table partitioned!")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: tstodatetime(x))
    df = df.withColumn('daytime', get_timestamp(col("ts")))

    # extract columns to create time table
    time_table = df.select(
        col("ts").alias('start_time'),
        year('daytime').alias('year'),
        month('daytime').alias('month'),
        dayofmonth('daytime').alias('day'),
        hour('daytime').alias('hour'),
        weekofyear('daytime').alias('weekofyear'))
    #We are going to partition later in the code!

    # read in song data to use for songplays table
    sqlContext = SQLContext(spark)
    songs_table = sqlContext.read.parquet(
        'data/outputs/song_data/songs.parquet')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df['ts', 'userId', 'level', 'sessionId', 'location',
                         'userAgent', 'song']
    #add artists id and song id by joining with songs_table
    songplays_table = songplays_table.alias('s').join(songs_table.alias('e'),col('e.title') == col('s.song'))\
    .select(col('s.ts').alias('start_time'),
        col('s.userId'),
        col('s.level'),
        col('s.sessionId'),
        col('s.location'),
        col('s.userAgent'),
        col('s.song'),
        col('e.artist_id').alias('artist_id'),
        col('e.song_id').alias('song_id'))
    #add month and year for partitioning later based on those
    time_table_short = time_table['start_time', 'month', 'year']
    songplays_table = songplays_table.alias('s').join(time_table_short.alias('t'),col('t.start_time') == col('s.start_time'))\
    .select(col('s.start_time'),
        col('s.userId'),
        col('s.level'),
        col('s.sessionId'),
        col('s.location'),
        col('s.userAgent'),
        col('s.song'),
        col('s.artist_id'),
        col('s.song_id'),
        col('t.year'),
        col('t.month'),
       )
    # write time table to parquet files partitioned by year and month
    time_table = time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'times.parquet'), 'overwrite')
    print("time_table partitioned!")

    # write songplays table to parquet files partitioned by year and month
    songplays_table = songplays_table.write.partitionBy(
        'year',
        'month').parquet(os.path.join(output_data, 'songplays.parquet'),
                         'overwrite')
    print("songplays_table partitioned!")
Example #22
def process_log_data(spark, input_data, output_data):
    """
    Process the Log dataset of files in JSON format and create the users, time 
    and songplays dimension tables in Spark space.
    Then the tables are written in parquet format to the output_data location.
    
    :param: spark: a sparkSession object
    :param input_data: The URI or local location of input datasets
    :param output_data: the URI of S3 bucket or local location for the output files
    """
    
    # get filepath to log data file
    log_data = input_data + "log_data/*/*/*.json"

    # read log data file
    df = spark.read.json(log_data)
    
    # filter by actions for song plays
    df = df.filter("page = 'NextSong'")

    # extract columns for users table    
    users = df.select(["ts", "userId", "firstName", "lastName", "gender", "level"]) \
                    .withColumnRenamed("userId", "user_id") \
                    .withColumnRenamed("firstName", "first_name") \
                    .withColumnRenamed("lastName", "last_name")
    users.createOrReplaceTempView("users")
    users_table = spark.sql(
            """
                SELECT DISTINCT user_id, first_name, last_name, gender, level
                FROM users
                WHERE user_id is NOT NULL
            """
        )    
    
    # write users table to parquet files
    users_table.write.parquet(output_data + "users", mode='overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000.0), TimestampType())
    df = df.withColumn("start_time", get_timestamp("ts"))
    
    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: date.fromtimestamp(x / 1000.0), DateType())
    df = df.withColumn("date", get_datetime("ts"))
    
    # extract columns to create time table
    time_table = df.select("start_time", 
                           hour("date").alias("hour"), 
                           dayofmonth("date").alias("day"), 
                           weekofyear("date").alias("week"), 
                           month("date").alias("month"),
                           year("date").alias("year"),
                           dayofweek("date").alias("weekday")
                        ).distinct()
    
    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + "time", 
                              mode='overwrite',
                              partitionBy=["year", "month"]
                            )

    # read in song data to use for songplays table
    song_df = df.select("artist",
                        "song",
                        "length", 
                        "page", 
                        "start_time",
                        "userId", 
                        "level", 
                        "sessionId",
                        "location", 
                        "userAgent",
                        month("date").alias("month"),
                        year("date").alias("year"),
                        )

    # extract columns from joined song and log datasets to create songplays table 
    song_df.createOrReplaceTempView("staging_events")

    songplays_table = spark.sql(
            """
            SELECT row_number() OVER (PARTITION BY start_time ORDER BY start_time) as songplay_id,
                   e.start_time, 
                   e.userId AS user_id, 
                   e.level AS level, 
                   s.song_id AS song_id, 
                   s.artist_id AS artist_id, 
                   e.sessionId AS session_id, 
                   e.location AS location, 
                   e.userAgent AS user_agent,
                   e.year,
                   e.month
            FROM staging_events e
            LEFT JOIN staging_songs s 
                   ON e.song = s.title
                  AND e.artist = s.artist_name
                  AND e.length = s.duration
            """
        )
    
    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + "songplays", 
                              mode='overwrite',
                              partitionBy=["year", "month"]
                            )
Example #23
# There's an API named agg(*exprs) that takes a list of column names and expressions for the type of aggregation you'd like to compute.
# You can leverage the built-in functions mentioned above as part of the expressions for each column.

# Provide the min, count, and avg, and groupBy the location column. Display the results
agg_df = df.groupBy("location").agg(F.min("id"), F.count("id"),
                                    F.avg("date_diff"))
display(agg_df)
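
# A small follow-up sketch (not in the original notebook): the same
# aggregation with aliased output columns, which are easier to reference
# downstream than the default "min(id)"-style names.
agg_named_df = df.groupBy("location").agg(
    F.min("id").alias("min_id"),
    F.count("id").alias("id_count"),
    F.avg("date_diff").alias("avg_date_diff"),
)
display(agg_named_df)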

# COMMAND ----------

# DBTITLE 1,I’d like to write out the DataFrames to Parquet, but would like to partition on a particular column.
# You can use the following APIs to accomplish this.
# Ensure the code does not create a large number of partition values for the dataset, otherwise the metadata overhead can cause significant slowdowns.
# If there is a SQL table backed by this directory, you will need to call refresh table <table-name> to update the metadata prior to the query.

df = df.withColumn('end_month', F.month('end_date'))
df = df.withColumn('end_year', F.year('end_date'))
df.write.partitionBy("end_year", "end_month").parquet("/tmp/sample_table")
display(dbutils.fs.ls("/tmp/sample_table"))
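# If a SQL table is backed by this directory, refresh its metadata before querying it again.
# A one-line sketch, assuming a hypothetical table name `sample_table`:
spark.sql("REFRESH TABLE sample_table")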

# COMMAND ----------

# DBTITLE 1,How do I properly handle cases where I want to filter out NULL data?
null_item_schema = StructType([
    StructField("col1", StringType(), True),
    StructField("col2", IntegerType(), True)
])
null_df = spark.createDataFrame([("test", 1), (None, 2)], null_item_schema)
display(null_df.filter("col1 IS NOT NULL"))
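# Equivalent ways to express the same NULL filter (a sketch using the null_df defined above):
display(null_df.filter(F.col("col1").isNotNull()))
display(null_df.na.drop(subset=["col1"]))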

# COMMAND ----------
Example #24
0
def process_log_data(spark, input_data, output_data):
    """
    Function to load source data and process the data. In the below function
    we are processing the 'log_data' and creating our Fact table:
    songplays and also dimension tables: time and users in parquet format.

    :param spark: SparkSession object
    :param input_data: Source data (log_data)
    :param output_data: Data destination
    :return: None
    """
    # load data into dataframe
    log_data = input_data + "log_data/*/*"
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(df.page == 'NextSong')

    # extract columns for users_table
    users_table = (df.select(
        col('userId').alias('user_id'),
        col('firstName').alias('first_name'),
        col('lastName').alias('last_name'),
        col('gender').alias('gender'),
        col('level').alias('level')).distinct())

    # write users table to parquet files
    users_table.write.mode("overwrite").parquet(output_data + '/users')

    # create timestamp column from original timestamp column
    df = df.withColumn(
        "ts_timestamp",
        F.to_timestamp(
            F.from_unixtime((col("ts") / 1000),
                            'yyyy-MM-dd HH:mm:ss.SSS')).cast("Timestamp"))

    def get_weekday(date):
        import datetime
        import calendar
        date = date.strftime("%m-%d-%Y")  # , %H:%M:%S
        month, day, year = (int(x) for x in date.split('-'))
        weekday = datetime.date(year, month, day)
        return calendar.day_name[weekday.weekday()]

    udf_week_day = udf(get_weekday, T.StringType())

    # extract columns to create time table
    time_table = (df.withColumn("hour", hour(col("ts_timestamp"))).withColumn(
        "day", dayofmonth(col("ts_timestamp"))).withColumn(
            "week", weekofyear(col("ts_timestamp"))).withColumn(
                "month", month(col("ts_timestamp"))).withColumn(
                    "year", year(col("ts_timestamp"))).withColumn(
                        "weekday", udf_week_day(col("ts_timestamp"))).select(
                            col("ts_timestamp").alias("start_time"),
                            col("hour"), col("day"), col("week"), col("month"),
                            col("year"), col("weekday")))

    # write time table to parquet files partitioned by year and month
    time_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + "/time")

    # read in song data to use for songplays table
    songs_df = spark.read.parquet(os.path.join(output_data, "songs/*/*/*"))
    songs_logs = df.join(songs_df, (df.song == songs_df.title))

    # extract columns from joined song and log datasets
    # to create songplays table
    artists_df = spark.read.parquet(os.path.join(output_data, "artists"))
    artists_songs_logs = songs_logs.alias('a').join(
        artists_df.alias('t'), (songs_logs.artist == artists_df.name) |
        (songs_logs.location == artists_df.location), 'left')
    songplays = artists_songs_logs.join(
        time_table, artists_songs_logs.ts_timestamp == time_table.start_time,
        'left')

    # write songplays table to parquet files partitioned by year and month
    songplays_table = songplays.select(
        col('start_time'),
        col('userId').alias('user_id'),
        col('level'),
        col('song_id'),
        col('artist_id'),
        col('sessionId').alias('session_id'),
        col('a.location'),
        col('userAgent').alias('user_agent'),
        col('year'),
        col('month'),
    ).distinct().repartition("year", "month")

    songplays_table.write.mode("overwrite").partitionBy(
        "year", "month").parquet(output_data + '/songplays')
Example #25
0
def process_log_data(spark, input_data, output_data):
    """
    Processes a log file. Writes time, users and songplay tables to S3.
    Arguments:
    input_data -- input S3 directory with `song` and `log` files
    output_data -- output S3 directory
    """

    print("Read log data")
    # read log data file
    df_log_data = spark.read.json(input_data + "log-data/*/*/*.json")

    # filter by actions for song plays
    df_log_data = df_log_data[df_log_data['page'] == 'NextSong']

    # extract columns for users table
    users_table = df_log_data[[
        'userId', 'firstName', 'lastName', 'gender', 'level'
    ]].drop_duplicates()

    print("Write...")
    # write users table to parquet files
    users_table.write.save(path=output_data + 'users_table',
                           format='parquet',
                           mode='overwrite')

    df_log_data = df_log_data.withColumn('timestamp', F.from_unixtime(df_log_data['ts']/1000))\
                            .withColumn('hour', F.hour(F.col('timestamp')))\
                            .withColumn('day', F.dayofmonth(F.col('timestamp')))\
                            .withColumn('month', F.month(F.col('timestamp')))\
                            .withColumn('year', F.year(F.col('timestamp')))\
                            .withColumn('weekofyear', F.weekofyear(F.col('timestamp')))\
                            .withColumn('dayofweek', F.dayofweek(F.col('timestamp')))

    # extract columns to create time table
    time_table = df_log_data[[
        'timestamp',
        'hour',
        'day',
        'month',
        'year',
        'weekofyear',
        'dayofweek',
    ]].drop_duplicates()

    print("Write...")
    # write time table to parquet files partitioned by year and month
    time_table.write.save(path=output_data + 'time_table',
                          format='parquet',
                          mode='overwrite',
                          partitionBy=['year', 'month'])

    # read in song data to use for songplays table
    df_song = spark.read.json(input_data + "song_data/*/*/*/*.json",
                              schema=build_song_schema())

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df_log_data.join(df_song,
                                       on = (df_song['title'] == df_log_data['song']) & \
                                           (df_song['artist_name'] == df_log_data['artist']) & \
                                           (df_song['duration'] == df_log_data['length'])
                                      )

    print("Write...")
    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.save(path=output_data + 'songplays_table',
                               format='parquet',
                               mode='overwrite',
                               partitionBy=['year', 'month'])
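# Note: the join above carries every log and song column into songplays_table; if the schema from
# build_song_schema() also defines a `year` column, partitionBy('year', 'month') can hit an
# ambiguous-column error. A minimal sketch of trimming to the usual songplays columns before the
# write (song_id and artist_id are assumed to exist in the song schema):
songplays_table = songplays_table.select(
    F.col('timestamp').alias('start_time'),
    F.col('userId').alias('user_id'),
    'level', 'song_id', 'artist_id',
    F.col('sessionId').alias('session_id'),
    'location',
    F.col('userAgent').alias('user_agent'),
    df_log_data['year'], df_log_data['month'])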
Example #26
0
def process_log_data(spark, input_data, output_data):
    """
    Extract raw log data, transform into tables for users, songplays, and time, and save into parquet files.

    Arguments:
        spark: The SparkSession object
        input_data: Path to input data where log_data is placed
        output_data: Output path where songplays.parquest, users.parquet and time.parquet will be saved
    """
    # get filepath to log data file
    log_data = os.path.join(input_data, 'log_data/*.json')

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.selectExpr('userId as user_id', 'firstName as first_name', 'lastName as last_name', 'gender', 'level') \
        .dropDuplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + 'users/users.parquet',
                              mode='overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: x / 1000.0)
    df = df.withColumn('timestamp', get_timestamp(df.ts))

    # create datetime column from original timestamp column
    # (the udf above returns a string column by default, hence the float() cast here)
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(float(x))))
    df = df.withColumn('datetime', get_datetime(df.timestamp))

    # extract columns to create time table
    time_table = df.withColumn('hour', hour('datetime')) \
        .withColumn('day', dayofmonth('datetime')) \
        .withColumn('week', weekofyear('datetime')) \
        .withColumn('month', month('datetime')) \
        .withColumn('year', year('datetime')) \
        .withColumn('weekday', date_format('datetime', 'E')) \
        .select(['ts', 'hour', 'day', 'week', 'month', 'year', 'weekday']) \
        .dropDuplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month') \
        .parquet(output_data + 'time/time.parquet', mode='overwrite')

    # read in song data to use for songplays table
    spark.read.json(os.path.join(input_data, 'song_data')) \
        .createOrReplaceTempView('songs')

    # extract columns from joined song and log datasets to create songplays table
    df.withColumn('month', month('datetime')) \
        .withColumn('year', year('datetime')) \
        .createOrReplaceTempView('log_data')
    songplays_table = spark.sql("""
        SELECT l.ts as start_time, l.userId as user_id, l.level, s.song_id, s.artist_id, l.sessionId as session_id, l.location, l.userAgent as user_agent, l.year, l.month
        FROM log_data l
        LEFT JOIN songs s ON s.artist_name = l.artist AND s.title = l.song
    """)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month') \
        .parquet(output_data + 'songplays/songplays.parquet', mode='overwrite')
Example #27
0
def process_log_data(spark, input_data, output_data):
    """
    Reads log data from S3 and writes results back to S3 as parquet files.
    :param spark: a sparkSession object
    :param input_data: S3 bucket/directory for song data json files
    :param output_data: S3 bucket where result parquet files are written

    :returns: Nothing
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*/*/*.json'

    # read log data file and filter by actions for songplays
    df = spark.read.json(log_data)
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    # write users table to parquet files
    window = Window\
        .partitionBy('userId')\
        .orderBy(df['ts'].desc())

    users_table = df.withColumn('order_users', F.rank().over(window))\
        .filter('order_users=1')\
        .select('userId', 'firstName', 'lastName', 'gender', 'level')

    users_table\
        .write\
        .format('parquet')\
        .save(output_data + "users_table.parquet")

    # create timestamp column from original timestamp column
    # extract columns to create time table
    # write time table to parquet files partitioned by year and month
    time_table = df.select('ts').distinct()
    time_table = time_table.withColumn('start_time',
                                        F.from_unixtime(time_table['ts']/1000))\
        .withColumn('year', F.year('start_time'))\
        .withColumn('month', F.month('start_time'))\
        .withColumn('day_of_month', F.dayofmonth('start_time'))\
        .withColumn('day_of_week', F.dayofweek('start_time'))\
        .withColumn('week', F.weekofyear('start_time'))\
        .withColumn('hour', F.hour('start_time'))

    time_table\
        .write\
        .partitionBy('year', 'month')\
        .format("parquet")\
        .save(output_data + "time_table.parquet")

    # read in song data to use for songplays table
    song_data = input_data + 'song_data/*/*/*/*.json'
    songs_df = spark.read.json(song_data)

    # join song and log datasets to create songplays table and extract columns
    # write songplays table to parquet files partitioned by year and month
    joinExpression = [
        df.song == songs_df.title, df.length == songs_df.duration,
        df.artist == songs_df.artist_name
    ]
    joinType = "inner"
    songplays_table = df.join(songs_df, joinExpression, joinType)\
        .select('ts',
                'userId',
                'level',
                'song_id',
                'artist_id',
                'sessionId',
                'location',
                'userAgent')
    songplays_table = songplays_table.withColumn('year', F.year(F.to_timestamp(songplays_table['ts']/1000)))\
        .withColumn('month', F.month(F.to_timestamp(songplays_table['ts']/1000)))\
        .withColumn('songplay_id', F.monotonically_increasing_id())

    songplays_table\
        .write\
        .partitionBy('year', 'month')\
        .format('parquet')\
        .save(output_data + 'songplays_table.parquet')
Example #28
0
    os.getenv("HOME") + "/.ivy2/jars/org.postgresql_postgresql-42.1.1.jar")
conf.set("spark.executor.extrajavaoptions", "-Xmx15000m")
conf.set("spark.executor.memory", "15g")
conf.set("spark.driver.memory", "15g")
conf.set("spark.storage.memoryFraction", "0")

spark = SparkSession.builder \
    .config(conf=conf) \
    .master("local") \
    .appName("SAIDI Calculator") \
    .getOrCreate()

with open('config.yaml') as config_file:
    config = yaml.safe_load(config_file)

#connect to the database
pw_df = spark.read.jdbc(
    "jdbc:postgresql://timescale.lab11.eecs.umich.edu/powerwatch",
    "pw_dedupe",
    properties={
        "user": config['user'],
        "password": config['password'],
        "driver": "org.postgresql.Driver"
    })

#read the data that we care about
pw_df = pw_df.select(pw_df['core_id'], pw_df['time'], pw_df['product_id'])
pw_df = pw_df.filter("product_id = 7008 OR product_id= 7009")

pw_df.groupBy(month("time")).agg(countDistinct('core_id')).show()
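# A variant of the aggregation above with aliased output columns, sorted by month
# (the alias names here are illustrative assumptions):
pw_df.groupBy(month("time").alias("month")) \
    .agg(countDistinct('core_id').alias('distinct_devices')) \
    .orderBy('month') \
    .show()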
Example #29
0
def test_month(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, data_gen).select(f.month(f.col('a'))))
Example #30
0
def process_log_data(spark, input_data, output_data):
    """
    Load and extract song and artist data from source data and save them back to S3
        
    param spark       : the Spark Session
    param input_data  : the source location of song_data
    param output_data : The destination where the results are saved
            
    """
    # get filepath to log data file
    log_data = os.path.join(input_data,"log_data/*/*/*.json")

    # read log data file
    df = spark.read.json(log_data).dropDuplicates()

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong').dropDuplicates()
    
    # created log view 
    df = df.withColumn('songplay_id', monotonically_increasing_id())
    df.createOrReplaceTempView("log_data_table")
    

    # extract columns for users table    
    users_table = spark.sql("""
    SELECT DISTINCT userT.userId   AS user_id, 
           userT.firstName         AS first_name,
           userT.lastName          AS last_name,
           userT.gender            AS gender,
           userT.level             AS level
    FROM log_data_table userT
    WHERE userT.userId IS NOT NULL
    """)
    
    # write users table to parquet files
    #users_table.write.mode('overwrite').parquet(output_data + 'users_table/')
    users_table.write.parquet(os.path.join(output_data,"users_table"), mode="overwrite")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(int(x) / 1000), TimestampType())
    df = df.withColumn("hour", hour(get_timestamp(df.ts))) \
            .withColumn("day", dayofmonth(get_timestamp(df.ts))) \
            .withColumn("week", weekofyear(get_timestamp(df.ts))) \
            .withColumn("month", month(get_timestamp(df.ts))) \
            .withColumn("year", year(get_timestamp(df.ts))) \
            .withColumn("weekday", dayofweek(get_timestamp(df.ts))) \
    
    # extract columns to create time table
    time_table = df.select(["ts", "hour", "day", "week", "month", "year", "weekday"]).withColumnRenamed("ts", "start_time")
    
    # write time table to parquet files partitioned by year and month
    #time_table.write.mode('overwrite').partitionBy("year", "month").parquet(output_data + 'time_table/')
    time_table.write.partitionBy("year", "month").parquet(os.path.join(output_data,"time_table"), mode="overwrite")
    
    # created song view
    song_data_view = input_data + "song_data/*/*/*/*.json"
    Song_df = spark.read.json(song_data_view).dropDuplicates()
    Song_df.createOrReplaceTempView("song_data_table")

    # extract columns from joined song and log datasets to create songplays table 
    songplays_table = spark.sql("""
    SELECT logT.songplay_id                     AS songplay_id,
           to_timestamp(logT.ts/1000)           AS start_time,
           month(to_timestamp(logT.ts/1000))    AS month,
           year(to_timestamp(logT.ts/1000))     AS year,
           logT.userId                          AS user_id,
           logT.level                           AS level,
           songT.song_id                        AS song_id,
           songT.artist_id                      AS artist_id,
           logT.sessionId                       AS session_id,
           logT.location                        AS location,
           logT.userAgent                       AS user_agent
    FROM log_data_table logT
    INNER JOIN song_data_table songT ON logT.artist = songT.artist_name
    AND logT.song = songT.title
    AND logT.length = songT.duration
    """)

    # write songplays table to parquet files partitioned by year and month
    #songplays_table.write.mode('overwrite').partitionBy("year", "month").parquet(output_data + 'songplays_table/')
    songplays_table.write.partitionBy("year", "month").parquet(os.path.join(output_data,"songplays_table"), mode="overwrite")
Example #31
0
# In[30]:

# 2. Add a date column
orders_2 = orders_1.withColumn('Date',convertToDate(orders_1['OrderDate']))


# In[31]:

orders_2.show(2)


# In[32]:

# 3. Add month and year
#orders_3 = orders_2.withColumn('Month',getMonth(orders_2['Date'])).withColumn('Year',getYear(orders_2['Date']))
#orders_3 = orders_2.withColumn('Month',getM(orders_2['Date'])).withColumn('Year',getY(orders_2['Date']))
orders_3 = orders_2.withColumn('Month',F.month(orders_2['Date'])).withColumn('Year',F.year(orders_2['Date']))


# In[33]:

orders_3.show(5)


# In[34]:

# 4. What are the order totals by month/year?
import time
start_time = time.time()
orders_3.groupBy("Year","Month").sum('Total').show()
print "%s Elapsed : %f" % (datetime.today(), time.time() - start_time)
Example #32
0
collectibles_df.createOrReplaceTempView("collectibles")
collectibles_df = spark.sql(
    "SELECT ROW_NUMBER() OVER(ORDER BY Collectible) as Id, * FROM collectibles"
)

# Generate dim_Time feed
combined_timestamp = source_glasses.select("timestamp") \
    .union(source_report.select("timestamp")) \
    .union(source_smartphone.select("timestamp")) \
    .union(source_smartwatch.select("timestamp"))
time_df = combined_timestamp.select("timestamp") \
    .where(col("Timestamp").isNotNull()) \
    .distinct() \
    .orderBy("timestamp")
time_df = time_df.withColumn("Year", year(time_df["timestamp"])) \
    .withColumn("Month", month(time_df["timestamp"])) \
    .withColumn("Day", dayofmonth(time_df["timestamp"])) \
    .withColumn("Hour", hour(time_df["timestamp"])) \
    .withColumn("Minute", minute(time_df["timestamp"])) \
    .withColumn("Second", second(time_df["timestamp"]))

# prepare glasses activities
glasses_activities_acc_x = time_df.join(glasses_df, "timestamp", how="inner") \
    .select(
    [F.lit(1).alias("PersonId"), F.lit(3).alias("SourceId"), F.lit("ACC_X").alias("Collectible"), "timestamp", "ACC_X"])
glasses_activities_acc_y = time_df.join(glasses_df, "timestamp", how="inner") \
    .select(
    [F.lit(1).alias("PersonId"), F.lit(3).alias("SourceId"), F.lit("ACC_Y").alias("Collectible"), "timestamp", "ACC_Y"])
glasses_activities_acc_z = time_df.join(glasses_df, "timestamp", how="inner") \
    .select(
    [F.lit(1).alias("PersonId"), F.lit(3).alias("SourceId"), F.lit("ACC_Z").alias("Collectible"), "timestamp", "ACC_Z"])
# NOTE: the original chain is truncated above this point; the assignment and 'smile' column
# below are an assumed reconstruction, based on the later df.smile.value reference.
df = df \
  .withColumn('smile', \
    F.from_json( \
      F.get_json_object('data', '$.facedetails[*].smile'), \
      StructType().add('confidence', DoubleType()).add('value', BooleanType()) \
    ) \
  ) \
.withColumn('eyesopen', \
    F.from_json( \
      F.get_json_object('data', '$.facedetails[*].eyesopen'), \
      StructType().add('confidence', DoubleType()).add('value', BooleanType()) \
    ) \
  ) \
.withColumn('mouthopen', \
    F.from_json( \
      F.get_json_object('data', '$.facedetails[*].mouthopen'), \
      StructType().add('confidence', DoubleType()).add('value', BooleanType()) \
    ) \
  ) \
.drop('ts') \
.withColumnRenamed('n_ts', 'ts') \
.withColumn('year', F.year('ts')) \
.withColumn('month', F.month('ts'))

## Sometimes we need to distribute the data based on a specific column, higher cardinality is better.
## To see the number of spark partitions being used: df.rdd.getNumPartitions()
df = df.repartition('ts')

## Finally write the data back out to S3 in partitioned Parquet format
## maxRecordsPerFile is recommended over the old method of using coalesce()
df \
  .withColumn('smiling', df.smile.value) \
  .write \
  .option('maxRecordsPerFile', 1000) \
  .partitionBy('year', 'month', 'smiling') \
  .mode('overwrite') \
  .parquet('s3://bucket/prefix')
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    #log_data = os.path.join(input_data,'log_data/2018/11/2018-11-30-events.json')
    log_data = os.path.join(input_data, "log-data/*/*/*.json")

    # read log data file
    print(log_data)
    df = spark.read.json(log_data)
    df.printSchema()

    # filter by actions for song plays
    df = df.where(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.select('userId', 'firstName', 'lastName', 'gender',
                            'level')

    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data +
                                                'users_table.parquet')

    # create timestamp column from original timestamp column
    get_timestamp = F.udf(lambda x: datetime.fromtimestamp((x / 1000.0)),
                          T.TimestampType())
    df = df.withColumn('timestamp', get_timestamp(df.ts))

    # create datetime column from original timestamp column
    df = df.withColumn('datetime', from_unixtime(F.col('ts') / 1000))

    # extract columns to create time table
    time_table = df.select('datetime', \
                           hour('datetime').alias('hour'), \
                           dayofmonth('datetime').alias('day'), \
                           weekofyear('datetime').alias('week'), \
                           month('datetime').alias('month'), \
                           year('datetime').alias('year'), \
                           date_format('timestamp', 'u').alias('weekday'), \
                           'ts')

    # write time table to parquet files partitioned by year and month
    time_table.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(output_data + 'time_table.parquet')

    # read in song data to use for songplays table
    song_table = spark.read.parquet(output_data + 'songs_table.parquet')
    artists_table = spark.read.parquet(output_data + 'artists_table.parquet')
    time_table = spark.read.parquet(output_data + 'time_table.parquet')

    song_table = song_table.withColumnRenamed("artist_id", "artistId")

    condition = [song_table.artistId == artists_table.artist_id]
    songs_artists_table = song_table.join(artists_table, condition)

    songs_artists_table.show(2)
    condition = [
        songs_artists_table.duration == df.length,
        songs_artists_table.title == df.song,
        songs_artists_table.artist_name == df.artist
    ]
    song_log_data = songs_artists_table.join(df, condition)
    song_log_data.printSchema()

    #condition = [song_log_data.ts == time_table.start_time]
    #song_long_time_data = song_log_data.join(time_table,condition)

    # extract columns from joined song and log datasets to create songplays table

    song_log_data = song_log_data.withColumn('datetime',
                                             from_unixtime(F.col('ts') / 1000))

    songplays_table = song_log_data.select(\
                        monotonically_increasing_id().alias('songplay_id'),\
                        'datetime', \
                        'userId', \
                        'level', \
                        'song_id', \
                        'artist_id', \
                        'sessionId', \
                        'location', \
                        'userAgent',\
                        month('datetime').alias('month'), \
                        year('datetime').alias('year')
                        )

    songplays_table.printSchema()
    songplays_table.show(2)
    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(output_data + 'songplays_table.parquet')