Example #1
def schema_song_data():
    """
    Description:
        Schema design for song datasets.
    """
    try:
        print("schema_song_data fuction is statrting.")
        print("**************************************")

        schema = R([
            Fld("artist_id", Str()),
            Fld("artist_latitude", Dbl()),
            Fld("artist_location", Str()),
            Fld("artist_longitude", Dbl()),
            Fld("artist_name", Str()),
            Fld("duration", Dbl()),
            Fld("num_songs", Int()),
            Fld("song_id", Str()),
            Fld("title", Str()),
            Fld("year", Int()),
        ])

        print("schema_song_data is successfull created")
        print("***************************************")
        return schema

    except Exception as e:
        print("schema_song_data creation failed: {}".format(e))
        print("************************************************")
Example #2
def process_song_data(spark, input_data, output_data):
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'
    
    songSchema = R([
        Fld("artist_id",Str()),
        Fld("artist_latitude",Dbl()),
        Fld("artist_location",Str()),
        Fld("artist_longitude",Dbl()),
        Fld("artist_name",Str()),
        Fld("duration",Dbl()),
        Fld("num_songs",Int()),
        Fld("title",Str()),
        Fld("year",Int()),
    ])
    
    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    song_fields = ["title", "artist_id","year", "duration"]
    
    # extract columns to create songs table
    songs_table = df.select(song_fields).dropDuplicates().withColumn("song_id", monotonically_increasing_id())
    
    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(output_data + 'songs/')
    
    artists_fields = ["artist_id", "artist_name as name", "artist_location as location", "artist_latitude as latitude", "artist_longitude as longitude"]

    # extract columns to create artists table
    artists_table = df.selectExpr(artists_fields).dropDuplicates()
    
    # write artists table to parquet files
    artists_table.write.parquet(output_data + 'artists/')
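This example, like several of the later ones, mints a surrogate song_id with monotonically_increasing_id() rather than reading it from the source schema. A tiny standalone sketch of what that function produces, using demo data rather than the Sparkify dataset:

from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id

spark = SparkSession.builder.getOrCreate()
demo = spark.range(5).withColumn("surrogate_id", monotonically_increasing_id())
demo.show()  # ids are unique and increasing, but not consecutive across partitions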
Example #3
def process_song_data(spark, input_data, output_data):
    '''
        Description: This function can be used to load the song data from the input S3 bucket
                     and write the parquet files to the output S3 bucket.
        Arguments:
            spark: SparkSession
            input_data: location for the input data
            output_data: location for the output data
        Returns:
            None
    '''
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")
    print(song_data)

    # read song data file
    songsSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int())
    ])

    df = spark.read.json(song_data, schema=songsSchema).distinct()
    print(df.count())
    df.show(5, truncate=False)

    df.printSchema()

    # extract columns to create songs table

    songs_table = df.select("song_id", "title", "artist_id", "year",
                            "duration").distinct()
    songs_table.printSchema()
    songs_table.show(5)
    print('songs', songs_table.count())

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet(output_data + "songs")

    # extract columns to create artists table
    df.createOrReplaceTempView("df")
    artists_table = spark.sql(
        "select artist_id, artist_name as name, artist_location as location, artist_latitude as latitude, artist_longitude as longitude from df"
    ).distinct()
    artists_table.printSchema()
    artists_table.show(5)
    print('artists', artists_table.count())

    # write artists table to parquet files
    artists_table.repartitionByRange(
        3,
        "artist_id").write.mode('overwrite').parquet(output_data + "artists")
Example #4
File: etl.py Project: zilnus/cityuda
def process_song_data(spark, input_data, output_data):
    """ Process song_data json files which located in S3
        Create table songs_table and artists_table
        Store the table in parque format in S3
        Return the table to be used in process_log_data function
    
    Args:
      spark                 : Spark Session
      input_data  (string)  : location json files (input)
      output_data (string)  : location parque files (output)
      
    Returns:
      songs_data    (Spark Dataframe) : Song Data tables
    
    """

    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # set schema song data
    songSchema = StructType([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year", "duration") \
                    .where("song_id is not null") \
                    .dropDuplicates(['song_id'])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(
        os.path.join(output_data, 'songs'), 'overwrite')

    # extract columns to create artists table
    artists_table = df.select(col("artist_id"),
                              col("artist_name").alias("name"),
                              col("artist_location").alias("location"),
                              col("artist_latitude").alias("latitude"),
                              col("artist_longitude").alias("longitude")) \
                      .where("artist_id is not null") \
                      .dropDuplicates(['artist_id'])

    # write artists table to parquet files
    artists_table.write.parquet(os.path.join(output_data, 'artists'),
                                'overwrite')

    # return song_data table to be used in process_log_data
    return df
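Example #4 returns the raw song DataFrame so it can be reused by process_log_data. A minimal driver sketch showing how the function might be invoked; the bucket paths are placeholders, not taken from the original project:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("sparkify-etl").getOrCreate()
input_data = "s3a://udacity-dend/"        # assumed source bucket
output_data = "s3a://my-sparkify-lake/"   # hypothetical output bucket
song_df = process_song_data(spark, input_data, output_data)
# song_df would then be handed to process_log_data to build the songplays table
spark.stop()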
Example #5
def get_log_schema():
    """
    Creates a schema for log data.
    
    :return: schema
    """
    log_schema = R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Str()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Dbl()),
        Fld("sessionId", Str()),
        Fld("song", Str()),
        Fld("status", Str()),
        Fld("ts", Long()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])
    return log_schema
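A short sketch of how the schema returned by get_log_schema() would be applied when reading the log dataset; the path layout is an assumption, and the SparkSession is created here only to keep the snippet self-contained:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
log_path = "s3a://udacity-dend/log_data/*/*/*.json"   # assumed path layout for the log files
log_df = spark.read.json(log_path, schema=get_log_schema())
log_df.printSchema()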
Example #6
def process_song_data(spark, input_data, output_data):
    """This function loads song_data from S3 and processes it by extracting the songs and artist tables
        and then again loaded back to S3
    Args:
        spark(:obj:`pyspark.sql.session.SparkSession`): SparkSession
        input_data (str): S3 bucket where song files are stored
        output_data (str): S3 bucket file path to store resulting files

    Returns:
        None
    """
    print("**** Starting to process song data *****")
    # get filepath to song data file
    song_data = input_data+'song_data/*/*/*/*.json'
    
    # read song data file
    
    songSchema = R([
        Fld("artist_id",Str()),
        Fld("artist_latitude",Dbl()),
        Fld("artist_location",Str()),
        Fld("artist_longitude",Dbl()),
        Fld("artist_name",Str()),
        Fld("song_id",Str()),
        Fld("duration",Dbl()),
        Fld("num_songs",Int()),
        Fld("title",Str()),
        Fld("year",Int()),
    ])
    
    
    try:
        df = spark.read.json(song_data, schema=songSchema)
    except Exception as e:
        print(e)
        
    # extract columns to create songs table
    songs_fields = ["song_id", "title", "artist_id", "year", "duration"]
    songs_table = df.select(songs_fields).dropDuplicates(["song_id"])
    
    # write songs table to parquet files partitioned by year and artist
    try:
        songs_table.write.parquet(output_data + "songs.parquet", partitionBy=("year", "artist_id"), mode="overwrite")
    except Exception as e:
        print(e)
    
    print("**** songs table data load is complete *****")
    
    # extract columns to create artists table
    artists_fields = ["artist_id", "artist_name as name", "artist_location as location", "artist_latitude as lattitude", "artist_longitude as longitude"]
    artists_table = df.selectExpr(artists_fields).dropDuplicates(["artist_id"])
    
    # write artists table to parquet files
    try:
        artists_table.write.parquet(output_data + "artists.parquet",  mode="overwrite")
    except Exception as e:
        print(e)
    print("**** artists table data load is complete *****")
    
    print("**** song data processing is finished *****")
Example #7
def process_song_data(spark, input_data, output_data):
    """import Song dataset extract columns and create songs and artist tables
    write those tables to parquet files
    
    Parameters:
    spark: name of spark session
    input_data: location of the source data s3 bucket 
    output_data: location of the destination data s3 bucket
    
    Returns:
    writes songs table in parquet to output_data location + songs
    writes artists_table in parquet to output_data location + artists
    
    """

    # Setting up the JSON table structure for the Song dataset
    song_dataset_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Str()),
    ])
    """get filepath to song data file 
    use "song_data/*/*/*/*.json" for full dataset
    use "song_data/A/B/C/TRABCEI128F424C983.json" to pull a single record

    """
    song_data = input_data + "song_data/*/*/*/*.json"

    # read song data file with dataset_schema
    df = spark.read.json(song_data, schema=song_dataset_schema)

    # extract columns to create songs table
    songs_table = df.select('song_id', 'title', 'artist_id', 'year', 'duration')

    # drop duplicate rows in songs table
    songs_table = songs_table.dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode('append').partitionBy(
        'year', 'artist_id').parquet(output_data + "songs")

    # extract columns to create artists table
    artists_table = df.select('artist_id', 'artist_name', 'artist_location',
                              'artist_latitude', 'artist_longitude')

    # drop duplicate rows in artists table
    artists_table = artists_table.dropDuplicates()

    # write artists table to parquet files
    artists_table.write.mode('append').parquet(output_data + "artists")
Example #8
def process_song_data(spark, input_data, output_data):
    '''
    load song data in JSON format from the S3 bucket, extract the
    songs and artists tables, and save these tables back to the S3 bucket
    
    :param spark: spark session
    :param input_data: data location for input data
    :param output_data: data location for output data
    :return: no return value
    '''
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # create songs schema
    songSchema = R([
        Fld('artist_id', Str()),
        Fld('artist_latitude', Dbl()),
        Fld('artist_location', Str()),
        Fld('artist_longitude', Dbl()),
        Fld('artist_name', Str()),
        Fld('duration', Dbl()),
        Fld('num_songs', Int()),
        Fld('title', Str()),
        Fld('year', Int()),
    ])

    # load songs json files from S3
    df_songs = spark.read.json(song_data, schema=songSchema)

    # select columns for songs_table
    songs_attr = ['title', 'artist_id', 'year', 'duration']
    songs_table = df_songs.select(songs_attr)\
    .dropDuplicates()\
    .withColumn('song_id', monotonically_increasing_id())

    # write songs_table to S3
    songs_table.write.partitionBy('year',
                                  'artist_id').parquet(output_data + 'songs/')

    # select artists columns
    artists_attr = [
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]
    artists_table = df_songs.select(artists_attr)\
    .dropDuplicates()

    artists_table = artists_table\
    .withColumnRenamed('artist_name','name')\
    .withColumnRenamed('artist_location','location')\
    .withColumnRenamed('artist_latitude','latitude')\
    .withColumnRenamed('artist_longitude','longitude')

    # write artists_table to S3
    artists_table.write.parquet(output_data + 'artists/')
Example #9
def process_song_data(spark, input_data, output_data):
    """
    This function loads the songs JSON dataset from S3, 
    then uses the data to create the songs and artists tables
    
    Input:
    spark = SparkSession object
    input_data = Start of path variable for input files
    output_data = Start of path variable for output files
    
    Output: None
    """

    # get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')

    # Define schema
    SongSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, schema=SongSchema)

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year",
                            "duration").dropduplicates()

    # write songs table to parquet files partitioned by year and artist
    output_path = os.path.join(output_data, 'songs_table.parquet')
    songs_table.write.partitionBy("year",
                                  "artist_id").parquet(output_path,
                                                       mode="overwrite")

    # extract columns to create artists table
    artists_table = df.select("artist_id", "artist_name", "artist_location",
                              "artist_latitude",
                              "artist_longitude").dropduplicates()

    # write artists table to parquet files
    output_path = os.path.join(output_data, 'artists_table.parquet')
    artists_table.write.parquet(output_path, mode="overwrite")

    #export whole songs data file to parquet
    output_path = os.path.join(output_data, 'songs_data_table.parquet')
    df.write.parquet(output_path, mode="overwrite")
Example #10
def process_song_data(spark, input_data, output_data):
    """Process song data, transform the data into songs and artists tables
    and store it in parquet files on S3.

    Parameters
    ----------
    spark : SparkSession
        the active Spark session
    input_data : string
        input data prepend path
    output_data : string
        output data prepend path
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")

    song_schema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str(), False),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str(), False),
        Fld("song_id", Str(), False),
        Fld("title", Str(), False),
        Fld("duration", Dbl(), False),
        Fld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, song_schema)

    # extract columns to create songs table
    songs_table = df.select(
        ["song_id", "title", "artist_id", "year", "duration"])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode("overwrite").partitionBy('year', 'artist_id') \
        .parquet(os.path.join(output_data, 'analytics/songs'))

    # extract columns to create artists table
    artists_table = df.select([
        "artist_id", "artist_name", "artist_location", "artist_latitude",
        "artist_longitude"
    ])
    artists_table = artists_table.withColumnRenamed("artist_name", "name") \
        .withColumnRenamed("artist_location", "location") \
        .withColumnRenamed("artist_latitude", "latitude") \
        .withColumnRenamed("artist_longitude", "longitude")

    # write artists table to parquet files
    artists_table.write.mode("overwrite") \
        .parquet(os.path.join(output_data, 'analytics/artists'))
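Example #10 passes False as the third StructField argument to mark the required columns. A spelled-out sketch of that argument (the keyword is nullable and it defaults to True):

from pyspark.sql.types import StructField, StringType

artist_id_field = StructField("artist_id", StringType(), nullable=False)
print(artist_id_field.nullable)   # prints: False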
Example #11
def process_song_data(spark, input_data, output_data):

    print('%%%%% Starting up the SONG data process')

    # get filepath to song data file
    song_data = 'song_data/A/*/*/*.json'

    # setting up the schema for the data that we're about to pull
    songSchema = ST([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # read song data file - the commented-out line below reads the data without the explicit schema
    #df = spark.read.json(input_data + song_data)
    raw_song_df = spark.read.json(input_data + song_data, songSchema)

    # extract columns to create songs table
    songs_table = raw_song_df.select(raw_song_df.song_id, \
                                 raw_song_df.title, \
                                 raw_song_df.artist_id, \
                                 raw_song_df.year.cast(Int()), \
                                 raw_song_df.duration.cast(Dbl()))

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode('overwrite').partitionBy(
        'year', 'artist_id').parquet(output_data + 'songs')

    print('%%%%% Songs table has been created and written to the S3 Bucket')

    # extract columns to create artists table
    artists_table = raw_song_df.select(raw_song_df.artist_id , \
                                  raw_song_df.artist_latitude.alias('latitude'), \
                                  raw_song_df.artist_location.alias('location'), \
                                  raw_song_df.artist_longitude.alias('longitude'), \
                                  raw_song_df.artist_name.alias('name')).dropDuplicates(['artist_id','name'])

    # write artists table to parquet files
    artists_table.write.mode('overwrite').parquet(output_data + 'artist')

    print('%%%%% Artists table has been created and written to the S3 Bucket')
    print('%%%%% SONG data has been completed and returning the raw_song_df')
    return raw_song_df
Example #12
def process_song_data(spark, input_data, output_data):
    """
    The function loads data from song_data dataset and extract columns
    for songs and artist tables and write the data into parquet
    files which will be loaded on s3.
    
    """
    song_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    # get filepath to song data file
    song_data = 'song_data/*/*/*/*.json'

    # read song data file
    df = spark.read.json(os.path.join(input_data, song_data),
                         schema=song_schema)

    # extract columns to create songs table
    songs_table = df.select('song_id', 'title', 'artist_id', 'year',
                            'duration').dropDuplicates()

    songs_table.createOrReplaceTempView('songs')

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(
        os.path.join(output_data, 'songs/songs.parquet'), 'overwrite')

    # extract columns to create artists table
    artists_table = df.select('artist_id', 'artist_name', 'artist_location',
                              'artist_latitude', 'artist_longitude') \
                      .withColumnRenamed('artist_name', 'name') \
                      .withColumnRenamed('artist_location', 'location') \
                      .withColumnRenamed('artist_latitude', 'latitude') \
                      .withColumnRenamed('artist_longitude', 'longitude') \
                      .dropDuplicates()

    artists_table.createOrReplaceTempView('artists')

    # write artists table to parquet files
    artists_table.write.parquet(
        os.path.join(output_data, 'artists/artists.parquet'), 'overwrite')
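Example #12 also registers the two tables as temporary views, which makes them queryable with plain SQL. A short follow-up sketch, assuming process_song_data above has already run in the same SparkSession:

# hypothetical ad-hoc queries against the views registered above
spark.sql("SELECT year, COUNT(*) AS n_songs FROM songs GROUP BY year ORDER BY year").show()
spark.sql("SELECT name, location FROM artists LIMIT 5").show()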
Example #13
def process_song_data(spark, input_data, output_data):
    """
    Loads the song_data from AWS S3 (input_data), extracts the songs and artists tables,
    and then loads the processed data back to S3 (output_data)
    
    :param spark: Spark Session object
    :param input_data: Location (AWS S3 path) of songs metadata (song_data) JSON files
    :param output_data: Location (AWS S3 path) where dimensional tables will be stored in parquet format 
    """

    # Get filepath to song data file
    song_data = input_data + "song_data/*/*/*/*.json"

    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # Read song data file
    print("Reading song_data JSON files from S3")
    df = spark.read.json(song_data, mode='PERMISSIVE', schema=songSchema, \
                         columnNameOfCorruptRecord='corrupt_record').dropDuplicates()
    print("Read completed")

    # Extract columns to create songs table
    songs_table = df.select("title", "artist_id", "year", "duration").dropDuplicates() \
                    .withColumn("song_id", monotonically_increasing_id())

    print("Writing Songs table to S3 after processing")
    # Write songs table to parquet files partitioned by year and artist
    songs_table.write.parquet(output_data + "songs/",
                              mode="overwrite",
                              partitionBy=["year", "artist_id"])
    print("Completed")

    # Extract columns to create artists table
    artists_table = df.select("artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude") \
                        .dropDuplicates()

    print("Writing Artists table to S3 after processing")
    # Write artists table to parquet files
    artists_table.write.parquet(output_data + "artists/", mode="overwrite")
    print("Completed")
Example #14
def process_song_data(spark, input_data, output_data):
    """
    read song data from s3 and then create the songs_table and artists_table. load them back to s3.
    
    parameters:
    spark: spark session
    input_data: path of song data
    output_data: path of output table
    
    """
    # get filepath to song data file
    # song_data = input_data + "song_data/*/*/*/*.json"
    song_data = input_data + "song_data/A/B/C/TRABCEI128F424C983.json"

    # create song table schema
    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # extract columns to create songs table, drop if year and artist_id are missing and year should not equal to 0
    song_field = ["title", "duration", "year", "artist_id"]
    songs_table = df.select(song_field).dropDuplicates().withColumn("song_id",F.monotonically_increasing_id())\
    .filter(~col("year").isin([0]) & col("year").isNotNull() & col("artist_id").isNotNull())

    # extract columns to create artists table, drop if artist_id and name containing any null values
    artist_field = [
        "artist_id", "artist_name", "artist_location", "artist_latitude",
        "artist_longitude"
    ]
    artists_table = df.select(artist_field).dropDuplicates().dropna(
        subset=["artist_id", "artist_name"])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year",
                                  "artist_id").parquet(output_data + "songs/",
                                                       mode="overwrite")

    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists/", mode="overwrite")
Example #15
def process_song_data(spark, input_data, output_data):
    """
    Method to process song data and create tables: songs, artists
    :param spark: Spark session
    :param input_data: S3 bucket
    :param output_data: S3 bucket
    :return: Data frame of song data
    """
    # get filepath to song data file
    song_data = input_data + '/song-data/A/A/B/*.json'

    songs_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int())
    ])

    # read song data file
    print('Reading song data.')
    df = spark.read.json(song_data, schema=songs_schema)

    song_columns = ['song_id', 'title', 'artist_id', 'year', 'duration']

    # extract columns to create songs table
    songs_table = df.selectExpr(song_columns).dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    print('Writing songs to parquet.')
    write_parquet(songs_table, output_data, 'songs', 'year', 'artist_id')

    artist_columns = [
        'artist_id', 'artist_name as name', 'artist_location as location',
        'artist_latitude as latitude', 'artist_longitude as longitude'
    ]

    # extract columns to create artists table
    artists_table = df.selectExpr(artist_columns).dropDuplicates()

    # write artists table to parquet files
    print('Writing artists to parquet.')
    write_parquet(artists_table, output_data, 'artists', None, None)

    return df
Example #16
def process_song_data(spark, input_data, output_data):
    """
        Description: This function fetches song_data from S3 into a staging dataframe, 
        then extracts the songs and artist tables,
        and eventually exports data back to S3
        
        Parameters:
            spark       : object for Spark Session
            input_data  : location of song_data 
            output_data : location of target S3 bucket
            
    """

    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'
    
    # define schema
    songdata_schema = R([
    Fld("artist_id",Str()),
    Fld("artist_latitude",Dbl()),
    Fld("artist_location",Str()),
    Fld("artist_longitude",Dbl()),
    Fld("artist_name",Str()),
    Fld("duration",Dbl()),
    Fld("num_songs",Int()),
    Fld("title",Str()),
    Fld("year",Int()),
    ])
    
    # read song data file
    df = spark.read.json(song_data, schema=songdata_schema)

    # extract columns to create songs table
    songs_table = df.select(['title', 'artist_id', 'year', 'duration'])

    songs_table = songs_table.dropDuplicates().withColumn('song_id', monotonically_increasing_id()).\
    select(['song_id', 'title', 'artist_id', 'year', 'duration'])
    
    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(output_data + 'songs/')

    # extract columns to create artists table
    selection = ['artist_id', 'artist_name as name', \
                 'artist_location as location', 'artist_latitude as latitude', \
                 'artist_longitude as longitude']
    artists_table = df.selectExpr(selection).dropDuplicates()
    
    # write artists table to parquet files
    artists_table.write.parquet(output_data + 'artists/')
Example #17
def process_song_data(spark, input_data, output_data):
    """
    process_song_data - Loads the song data files from S3, and saves the song information to a parquet file
    (partitioned by year and artist_id), and then extracts the distinct artists and saves them to a parquet file.
    """

    # Get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')
    #    song_data = os.path.join(input_data,'song_data/A/A/A/TRAAAAK128F9318786.json')

    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # Read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # Extract columns to create songs table
    songs_table = df.select(
        ['song_id', 'title', 'artist_id', 'year', 'duration'])

    # Write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year",
                                  "artist_id").mode('overwrite').parquet(
                                      os.path.join(output_data,
                                                   'songs.parquet'))

    # Extract columns to create artists table, and find the distinct artists
    artists_table = df.select([
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]).withColumnRenamed('artist_name', 'name').withColumnRenamed(
        'artist_location', 'location').withColumnRenamed(
            'artist_latitude',
            'latitude').withColumnRenamed('artist_longitude',
                                          'longitude').distinct()

    # Write artists table to parquet files
    artists_table.write.mode('overwrite').parquet(
        os.path.join(output_data, 'artists.parquet'))
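Writing with partitionBy("year", "artist_id"), as most of these examples do, lays the songs out in year=<y>/artist_id=<id>/ directories, so filters on those columns can prune partitions on read. A small self-contained sketch using demo data and a local path, neither taken from the original project:

import os
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
out = "/tmp/sparkify_demo"   # hypothetical local output path
demo = spark.createDataFrame(
    [("S1", "Title A", "A1", 2008, 190.0), ("S2", "Title B", "A2", 2010, 201.5)],
    ["song_id", "title", "artist_id", "year", "duration"],
)
demo.write.mode("overwrite").partitionBy("year", "artist_id").parquet(os.path.join(out, "songs.parquet"))
songs = spark.read.parquet(os.path.join(out, "songs.parquet"))
songs.where("year = 2008").explain()   # the plan should list `year` under the partition filters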
Example #18
def process_song_data(spark, input_data, output_data):
    # get filepath to song data file
    song_data = input_data + "./data/song_data/*/*/*/*.json"
    """Creating the song_data file schema that we are going to add to spark"""
    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # reading song data file json structure
    df = spark.read.json(song_data, schema=songSchema)
    """Filtering out only the needed columns for the songs table"""
    song_fields = ["title", "artist_id", "year", "duration"]

    print('Creating the songs table and dropping duplicates')
    songs_table = df.select(song_fields).dropDuplicates().withColumn(
        "song_id", monotonically_increasing_id())
    print(
        "--- All duplicate songs have been dropped and the songs table created ---"
    )
    print('Printing some rows from the songs_table')
    songs_table.show(15)
    print('Saving the songs table to the s3 bucket')
    songs_table.write.partitionBy('year',
                                  'artist_id').parquet(output_data + "songs")
    print("--- songs.parquet completed ---")
    """Filtering out only the needed columns for the artists table"""
    artists_data = [
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]

    print("--- Starting to drop duplicate artists....")
    artists_table = df.selectExpr(artists_data).dropDuplicates()
    print("All duplicate artists have been dropped......")

    print('Printing some rows from the artists_table')
    artists_table.show(15)
    """writing the artists table to the parquets file"""
    artists_table.write.parquet(output_data + "artists")
    print("--- artists.parquet completed ---")
    print("*** process_song_data completed ***\n\n")
Example #19
def process_song_data(spark, input_data, output_data):
    """
		Loads song_data from S3, extracting needed columns for 'song_table' and 'artist_table' 
		and writting their parquet format on S3
		
		Parameters:
			spark       : Spark Session
			input_data  : Location of song_data json files with the songs metadata
			output_data : S3 bucket were tables in parquet format store
	"""

    # get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')

    # Define the expected schema for the input JSON structure
    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # extract columns to create songs table
    songs_table = df.selectExpr(
        "song_id", "title", "artist_id", "year",
        "duration").orderBy("song_id").drop_duplicates()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(
        os.path.join(output_data, 'songs'))

    # extract columns to create artists table
    artists_table = df.selectExpr("artist_id", "artist_name as name",
                                  "artist_location as location",
                                  "artist_latitude as latitude",
                                  "artist_longitude as longitude").orderBy(
                                      "artist_id").drop_duplicates()

    # write artists table to parquet files
    artists_table.write.parquet(os.path.join(output_data, 'artists'))
Example #20
def create_log_schema():
    """
    Schema structure for log data
    :return: StructType
    """
    log_schema = R([
        Fld('artist', St()),
        Fld('auth', St()),
        Fld('firstName', St()),
        Fld('gender', St()),
        Fld('itemInSession', LInt()),
        Fld('lastName', St()),
        Fld('length', Fl()),
        Fld('level', St()),
        Fld('location', St()),
        Fld('method', St()),
        Fld('page', St()),
        Fld('registration', Dbl()),
        Fld('sessionId', LInt()),
        Fld('song', St()),
        Fld('status', LInt()),
        Fld('ts', LInt()),
        Fld('userAgent', St()),
        Fld('userId', St())
    ])

    return log_schema
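Example #20 uses yet another alias set (St, LInt, Fl); note that in examples #11, #24, and #26, ST/St instead stands for StructType and Sfld for StructField. A plausible mapping for this snippet, stated as an assumption since the original imports are not shown:

# assumed aliases for this snippet only
from pyspark.sql.types import (
    StructType as R,
    StructField as Fld,
    StringType as St,
    LongType as LInt,
    FloatType as Fl,
    DoubleType as Dbl,
)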
Example #21
def get_log_src_schema():
    """
    Get the source spark schema definition
    :return: The schema definition
    """

    return R([
        Fld("artist", Str()),
        Fld("auth", Str()),
        Fld("firstName", Str()),
        Fld("gender", Str()),
        Fld("itemInSession", Int()),
        Fld("lastName", Str()),
        Fld("length", Dbl()),
        Fld("level", Str()),
        Fld("location", Str()),
        Fld("method", Str()),
        Fld("page", Str()),
        Fld("registration", Str()),
        Fld("sessionId", Int()),
        Fld("song", Str()),
        Fld("status", Int()),
        Fld("ts", Str()),
        Fld("userAgent", Str()),
        Fld("userId", Str())
    ])
Example #22
def create_log_data():
    """
    Create schema for log data.
    
    return: schema
    """
    log_schema = StructType([
        StructField("artist", Str()), 
        StructField('auth', Str()),
        StructField('firstName', Str()),
        StructField('gender', Str()),
        StructField('itemInSession', Int()),
        StructField('lastName', Str()),
        StructField('length', Dbl()),
        StructField('level', Str()),
        StructField('location', Str()),
        StructField('method', Str()),
        StructField('page', Str()),
        StructField('registration', Dec()),
        StructField('sessionId', Int()),
        StructField('song', Str()),
        StructField('status', Int()),
        StructField('ts', Long()),
        StructField('userAgent', Str()),
        StructField('userId', Int())
    ])
    return log_schema
Example #23
File: etl.py Project: jfvanreu/DataLake
def process_song_data(spark, input_data_songs, output_data):
    """
    Read song data by providing it an expected schema.
    Create songs and artists tables.
    """
    # define song data schema to improve performance
    song_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    song_data = input_data_songs

    t_start = time()
    dfSongs = spark.read.json(song_data, schema=song_schema)
    t_end = time() - t_start
    print('Read song data in {} secs'.format(t_end))
    dfSongs.printSchema()

    dfSongs.count()
    dfSongs.show(5)

    songs_table = dfSongs.filter(dfSongs.song_id != '')\
                     .select(['song_id', 'title', 'artist_id', 'year', 'duration'])
    songs_table.show(5)
    songs_table.write.partitionBy(
        "year",
        "artist_id").mode('overwrite').parquet(output_data +
                                               'songs/songs_table.parquet')

    artists_table = dfSongs.filter(dfSongs.artist_id !='') \
                        .select(col("artist_id"),col("artist_name").alias("name"), col("artist_location").alias("location"),
                                 col("artist_longitude").alias("longitude"), col("artist_latitude").alias("latitude"))\
                        .dropDuplicates()

    artists_table.show(5)

    artists_table.write.mode('overwrite').parquet(
        output_data + 'artists/artists_table.parquet')
Example #24
def process_song_data(spark, input_data, output_data):
    """
    Reads from song files, 
    transforms them into songs and artists data, 
    and writes them in parquet format.
    
    params:
    - spark: spark session object
    - input_data: input data path
    - output_data: output data path
    """

    # get filepath to song data file
    song_data = input_data + "/song_data/*/*/*/*.json"

    # use schema when read json files
    song_schema = St([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, schema=song_schema)

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year",
                            "duration").dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.parquet(output_data + "songs", mode="overwrite", \
                              partitionBy=["year", "artist_id"])

    # extract columns to create artists table
    artists_table = df.selectExpr("artist_id", "artist_name as name", "artist_location as location", \
                                  "artist_latitude as latitude", "artist_longitude as longitude") \
                                  .dropDuplicates()

    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists", mode="overwrite")
Example #25
def get_song_src_schema():
    """
    Get the source spark schema definition
    :return: The schema definition
    """
    return R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])
Example #26
def process_song_data(spark, input_data, output_data):
    # get filepath to song data file
    song_data = input_data + '*/*/*/*.json'

    # creating the schema for our song_data

    songs_model = St([
        Sfld("num_songs", Int()),
        Sfld("artist_id", Str()),
        Sfld("artist_latitude", Dbl()),
        Sfld("artist_longitude", Dbl()),
        Sfld("artist_location", Str()),
        Sfld("artist_name", Str()),
        Sfld("song_id", Str()),
        Sfld("title", Str()),
        Sfld("duration", Dbl()),
        Sfld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songs_model)

    # extract columns to create songs table
    songs_table = df.select(["title", "artist_id", "year",
                             "duration"]).dropDuplicates().withColumn(
                                 "song_id", monotonically_increasing_id())

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(
        output_data + 'songs_table/', mode='overwrite')

    # extract columns to create artists table
    artists_fields = [
        "artist_id", "artist_name", "artist_location", "artist_latitude",
        "artist_longitude"
    ]
    artists_table = df.select(artists_fields).withColumnRenamed(
        'artist_name', 'name').withColumnRenamed(
            'artist_location', 'location').withColumnRenamed(
                'artist_latitude',
                'latitude').withColumnRenamed('artist_longitude',
                                              'longitude').dropDuplicates()

    # write artists table to parquet files
    artists_table.write.parquet(output_data + 'artists_table/',
                                mode='overwrite')
Example #27
def process_song_data(spark, input_data, output_data):
    """
    Extract data from song_data and write songs and artists table
    
    Arguments:
    - spark : SparkSession object
    - input_data : input data root dir path
    - output_data : output data root dir path
    """
    # schema for song_data 
    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year",Int())
    ])
    
    # get filepath to song data file
    song_data = input_data + "song_data/*/*/*"
    
    # read song data file
    df = spark.read.json(song_data, schema=songSchema).dropDuplicates(["song_id"])

    # extract columns to create songs table
    df.createOrReplaceTempView("song_data")
    songs_table = spark.sql("""
        SELECT song_id, title, artist_id, year, duration FROM song_data
    """)
    
    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(output_data + "songs_table", 'overwrite')

    # extract columns to create artists table
    artists_table = spark.sql("""
        SELECT artist_id, artist_name as name, artist_location as location, artist_latitude as latitude, artist_longitude as longitude 
        FROM song_data
    """).dropDuplicates(["artist_id"])
    
    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists_table")
Example #28
def process_song_data(spark, input_data, output_data):
    """
        The function processes song data using Spark on AWS.
        Input:
              song_data in JSON format
        Output:
              Processed data in parquet format loaded back to S3
        Args:
              Spark session, input_data, output_data
        Return:
              none
    """

    song_data = input_data + 'song_data/*/*/*/*.json'

    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    df = spark.read.json(song_data, schema=songSchema)

    song_fields = ["title", "artist_id", "year", "duration"]

    songs_table = df.select(song_fields).dropDuplicates().withColumn(
        "song_id", monotonically_increasing_id())

    songs_table.write.partitionBy("year",
                                  "artist_id").parquet(output_data + 'songs/')

    artists_fields = [
        "artist_id", "artist_name as name", "artist_location as location",
        "artist_latitude as latitude", "artist_longitude as longitude"
    ]

    artists_table = df.selectExpr(artists_fields).dropDuplicates()

    artists_table.write.parquet(output_data + 'artists/')
Example #29
def process_song_data(spark, input_data, output_data):
    """
    Description:
                Function that processes the raw data from the S3 bucket
    Parameters:
    
                :spark:       uses the earlier instantiated spark session
                :input_data:  path of the location where the files are residing
                :output_data: path of the location where the files will be saved after processing
                :return:      none
    
    """

    song_data = input_data + 'song_data/A/A/A/*.json'

    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    df = spark.read.json(song_data, schema=songSchema)

    song_fields = ["title", "artist_id", "year", "duration"]

    songs_table = df.select(song_fields).dropDuplicates().withColumn(
        "song_id", monotonically_increasing_id())

    songs_table.write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet(output_data + 'songs/')

    artists_fields = [
        "artist_id", "artist_name as name", "artist_location as location",
        "artist_latitude as latitude", "artist_longitude as longitude"
    ]

    artists_table = df.selectExpr(artists_fields).dropDuplicates()

    artists_table.write.mode('overwrite').parquet(output_data + 'artists/')
Example #30
def process_song_data(spark, input_data, output_data):
    """
    This function processes the song data of sparkify and creates
    facts/dimensions via spark and saves them to our data lake afterwards
    Arguments:
        spark {SparkSession}: Spark session to launch the program
        input_data {str}: location (local/s3) where the (root) input song data resides
        output_data {str}: location (local/s3) where the (root) output files should be written
    """
    # get filepath to song data file
    # song_data = f"{input_data}song_data/A/A/A/*.json"
    song_data = f"{input_data}song_data/*/*/*/*.json"

    # read song data file
    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Str()),
        Fld("artist_longitude", Str()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])

    # specify the schema explicitly up front instead of relying on automatic schema inference
    df_song = spark.read.json(song_data, schema=songSchema)
    df_song.cache()

    # extract columns to create songs table
    songs_table = df_song.filter(df_song.song_id != '') \
        .select(['song_id',
                 'title',
                 'artist_id',
                 'year',
                 'duration']) \
        .dropDuplicates(['song_id'])

    # write songs table to parquet files partitioned by year and artist
    output_song_data = f"{output_data}song_data/"
    songs_table.write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet(output_song_data)

    # extract columns to create artists table
    artists_table = df_song.filter(df_song.artist_id != '') \
        .selectExpr(['artist_id',
                     'artist_name as name',
                     'artist_location as location',
                     'artist_latitude as latitude',
                     'artist_longitude as longitude']) \
        .dropDuplicates(['artist_id'])

    # write artists table to parquet files
    output_artist_data = f"{output_data}artist_data/"
    artists_table.write.mode('overwrite').parquet(output_artist_data)
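Example #30 calls df_song.cache() because the parsed song data feeds two separate writes (songs, then artists). A minimal sketch of that cache lifecycle on demo data, not the real dataset:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df_song = spark.createDataFrame([("S1", "A1"), ("S2", "A2")], ["song_id", "artist_id"])
df_song.cache()            # keep the parsed rows in memory across both downstream jobs
print(df_song.count())     # the first action materializes the cache
print(df_song.count())     # the second scan is served from memory
df_song.unpersist()        # release the memory once both tables have been written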