Example #1
def process_song_data(spark, input_data, output_data):
    """This function loads song_data from S3 and processes it by extracting the songs and artist tables
        and then again loaded back to S3
    Args:
        spark(:obj:`pyspark.sql.session.SparkSession`): SparkSession
        input_data (str): S3 bucket where song files are stored
        output_data (str): S3 bucket file path to store resulting files

    Returns:
        None
    """
    print("**** Starting to process song data *****")
    # get filepath to song data file
    song_data = input_data+'song_data/*/*/*/*.json'
    
    # read song data file
    
    songSchema = R([
        Fld("artist_id",Str()),
        Fld("artist_latitude",Dbl()),
        Fld("artist_location",Str()),
        Fld("artist_longitude",Dbl()),
        Fld("artist_name",Str()),
        Fld("song_id",Str()),
        Fld("duration",Dbl()),
        Fld("num_songs",Int()),
        Fld("title",Str()),
        Fld("year",Int()),
    ])
    
    
    try:
        df = spark.read.json(song_data, schema=songSchema)
    except Exception as e:
        print(e)
        raise
        
    # extract columns to create songs table
    songs_fields = ["song_id", "title", "artist_id", "year", "duration"]
    songs_table = df.select(songs_fields).dropDuplicates(["song_id"])
    
    # write songs table to parquet files partitioned by year and artist
    try:
        songs_table.write.parquet(output_data + "songs.parquet", partitionBy=("year", "artist_id"), mode="overwrite")
    except Exception as e:
        print(e)
    
    print("**** songs table data load is complete *****")
    
    # extract columns to create artists table
    artists_fields = ["artist_id", "artist_name as name", "artist_location as location", "artist_latitude as latitude", "artist_longitude as longitude"]
    artists_table = df.selectExpr(artists_fields).dropDuplicates(["artist_id"])
    
    # write artists table to parquet files
    try:
        artists_table.write.parquet(output_data + "artists.parquet",  mode="overwrite")
    except Exception as e:
        print(e)
    print("**** artists table data load is complete *****")
    
    print("**** song data processing is finished *****")
Example #2
def process_song_data(spark, input_data, output_data):
    '''
        Description: This function can be used to load the song data from the input S3 bucket
                     and write the parquet files to the output S3 bucket.
        Arguments:
            spark: SparkSession
            input_data: location for the input data
            output_data: location for the output data
        Returns:
            None
    '''
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")
    print(song_data)

    # read song data file
    songsSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int())
    ])

    df = spark.read.json(song_data, schema=songsSchema).distinct()
    print(df.count())
    print(df.show(5, truncate=False))

    df.printSchema()

    # extract columns to create songs table

    songs_table = df.select("song_id", "title", "artist_id", "year",
                            "duration").distinct()
    songs_table.printSchema()
    songs_table.show(5)
    print('songs', songs_table.count())

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet(output_data + "songs")

    # extract columns to create artists table
    df.createOrReplaceTempView("df")
    artists_table = spark.sql(
        "select artist_id, artist_name as name, artist_location as location, artist_latitude as latitude, artist_longitude as longitude from df"
    ).distinct()
    artists_table.printSchema()
    artists_table.show(5)
    print('artists', artists_table.count())

    # write artists table to parquet files
    artists_table.repartitionByRange(
        3,
        "artist_id").write.mode('overwrite').parquet(output_data + "artists")
Example #3
def process_song_data(spark, input_data, output_data):
    """ Process song_data json files which located in S3
        Create table songs_table and artists_table
        Store the table in parque format in S3
        Return the table to be used in process_log_data function
    
    Args:
      spark                 : Spark Session
      input_data  (string)  : location json files (input)
      output_data (string)  : location parque files (output)
      
    Returns:
      songs_data    (Spark Dataframe) : Song Data tables
    
    """

    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # set schema song data
    songSchema = StructType([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year", "duration") \
                    .where("song_id is not null") \
                    .dropDuplicates(['song_id'])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(
        os.path.join(output_data, 'songs'), 'overwrite')

    # extract columns to create artists table
    artists_table = df.select(col("artist_id"),
                              col("artist_name").alias("name"),
                              col("artist_location").alias("location"),
                              col("artist_latitude").alias("latitude"),
                              col("artist_longitude").alias("longitude")) \
                      .where("artist_id is not null") \
                      .dropDuplicates(['artist_id'])

    # write artists table to parquet files
    artists_table.write.parquet(os.path.join(output_data, 'artists'),
                                'overwrite')

    # return song_data table to be used in process_log_data
    return df
Example #4
def schema_song_data():
    """
    Description:
        Schema design for song datasets.
    """
    try:
        print("schema_song_data fuction is statrting.")
        print("**************************************")

        schema = R([
            Fld("artist_id", Str()),
            Fld("artist_latitude", Dbl()),
            Fld("artist_location", Str()),
            Fld("artist_longitude", Dbl()),
            Fld("artist_name", Str()),
            Fld("duration", Dbl()),
            Fld("num_songs", Int()),
            Fld("song_id", Str()),
            Fld("title", Str()),
            Fld("year", Int()),
        ])

        print("schema_song_data is successfull created")
        print("***************************************")
        return schema

    except Exception as e:
        print("schema_song_data function failed: {}".format(e))
        print("************************************************")
def process_song_data(spark, input_data, output_data):
    """
    Description: This function reads the song_data from S3 into a Spark dataframe,
                 extracts columns from this dataframe to form the "songinf table" and "artist table", and transforms
                 the "songinf table data" and "artist table data" into the format that this project needs.
    Parameters: -spark: spark session
                -input_data: location of song_data json file (in S3 bucket)
                -output_data: location that the final table will be saved (in S3 bucket)
    Return: None
    """
    #--------------------read song data--------------------#
    print('Read song_data...')
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data_*.json")
    
    # define the song data schema for reading
    SongSchema = R([
                    Fld("artist_id",Str()),
                    Fld("artist_latitude",Doub()),
                    Fld("artist_location",Str()),
                    Fld("artist_longitude",Doub()),
                    Fld("artist_name",Str()),
                    Fld("duration",Doub()),
                    Fld("num_songs",Long()),
                    Fld("song_id",Str()),
                    Fld("title",Str()),
                    Fld("year",Long())
                    ])
    
    # read song data file
    song_df = spark.read.json(song_data, schema=SongSchema)
    
    #--------------------deal with song table--------------------#
    # extract columns to create songinf df
    songinf_df = song_df.select(['song_id', 'title', 'artist_id', 'year', 'duration'])
    songinf_df = songinf_df.dropDuplicates(['song_id'])
    songinf_df = songinf_df.dropna(how = "any", subset = ["song_id"])
    songinf_df = songinf_df.filter(songinf_df.song_id != "")
    
    print('Songs table: ')
    print(songinf_df.sort('song_id').show(5))
    
    # write songs table to parquet files partitioned by year and artist
    print('Save Songs table into S3...')
    songinf_df.write.partitionBy("year", "artist_id").parquet("{}/song_table.parquet".format(output_data))

    #--------------------deal with artists table--------------------#
    # extract columns to create artists df
    artist_df = song_df.select(['artist_id', 'artist_name', 'artist_location', 'artist_latitude', 'artist_longitude'])
    artist_df = artist_df.dropDuplicates(['artist_id'])
    artist_df = artist_df.dropna(how = "any", subset = ["artist_id"])
    artist_df = artist_df.filter(artist_df.artist_id != "")
    
    print('artists table: ')
    print(artist_df.sort('artist_id').show(5))
    
    # write artists table to parquet files
    print('Save artists table into S3...')
    artist_df.write.parquet("{}/artist_table.parquet".format(output_data))
Example #6
def process_song_data(spark, input_data, output_data):
    """
    This function processes the song data of sparkify and creates
    facts/dimensions via spark and saves them to our data lake afterwards
    Arguments:
        spark {SparkSession}: Spark session to launch the program
        input_data {str}: location (local/s3) where the (root) input song data resides
        output_data {str}: location (local/s3) where the (root) output files should be written
    """
    # get filepath to song data file
    # song_data = f"{input_data}song_data/A/A/A/*.json"
    song_data = f"{input_data}song_data/*/*/*/*.json"

    # read song data file
    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Str()),
        Fld("artist_longitude", Str()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])

    # since the schema cannot be inferred automatically, we need to specify it beforehand
    df_song = spark.read.json(song_data, schema=songSchema)
    df_song.cache()

    # extract columns to create songs table
    songs_table = df_song.filter(df_song.song_id != '') \
        .select(['song_id',
                 'title',
                 'artist_id',
                 'year',
                 'duration']) \
        .dropDuplicates(['song_id'])

    # write songs table to parquet files partitioned by year and artist
    output_song_data = f"{output_data}song_data/"
    songs_table.write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet(output_song_data)

    # extract columns to create artists table
    artists_table = df_song.filter(df_song.artist_id != '') \
        .selectExpr(['artist_id',
                     'artist_name as name',
                     'artist_location as location',
                     'artist_latitude as latitude',
                     'artist_longitude as longitude']) \
        .dropDuplicates(['artist_id'])

    # write artists table to parquet files
    output_artist_data = f"{output_data}artist_data/"
    artists_table.write.mode('overwrite').parquet(output_artist_data)
Example #7
def get_residence_cities(spark):
    cities = pd.read_csv('residence_city.txt',
                         sep='=',
                         names=['id', 'country'])
    cities['country'] = cities['country'].str.replace("'", '').str.strip()
    cities_data = cities.values.tolist()
    cities_schema = R([Fld('id', Str(), True), Fld('country', Str(), True)])
    cities = spark.createDataFrame(cities_data, cities_schema)
    cities.write.mode('overwrite').parquet('resident_city.parquet')
    return cities
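
get_residence_cities assumes a local residence_city.txt with '='-separated id/country pairs whose country names are quoted (the quotes are stripped after loading). An illustrative excerpt of that assumed format (made-up values, not the real lookup file):

101 = 'ALBANIA'
102 = 'AUSTRIA'
103 = 'BELGIUM'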
Example #8
def process_song_data(spark, input_data, output_data):
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'
    
    songSchema = R([
        Fld("artist_id",Str()),
        Fld("artist_latitude",Dbl()),
        Fld("artist_location",Str()),
        Fld("artist_longitude",Dbl()),
        Fld("artist_name",Str()),
        Fld("duration",Dbl()),
        Fld("num_songs",Int()),
        Fld("title",Str()),
        Fld("year",Int()),
    ])
    
    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    song_fields = ["title", "artist_id","year", "duration"]
    
    # extract columns to create songs table
    songs_table = df.select(song_fields).dropDuplicates().withColumn("song_id", monotonically_increasing_id())
    
    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(output_data + 'songs/')
    
    artists_fields = ["artist_id", "artist_name as name", "artist_location as location", "artist_latitude as latitude", "artist_longitude as longitude"]

    # extract columns to create artists table
    artists_table = df.selectExpr(artists_fields).dropDuplicates()
    
    # write artists table to parquet files
    artists_table.write.parquet(output_data + 'artists/')
Example #9
def process_song_data(spark, input_data, output_data):
    """
    This function loads the songs JSON dataset from S3, 
    then uses the data to create the songs and artists tables
    
    Input:
    spark = SparkSession object
    input_data = Start of path variable for input files
    output_data = Start of path variable for output files
    
    Output: None
    """

    # get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')

    # Define schema
    SongSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, schema=SongSchema)

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year",
                            "duration").dropduplicates()

    # write songs table to parquet files partitioned by year and artist
    output_path = os.path.join(output_data, 'songs_table.parquet')
    songs_table.write.partitionBy("year",
                                  "artist_id").parquet(output_path,
                                                       mode="overwrite")

    # extract columns to create artists table
    artists_table = df.select("artist_id", "artist_name", "artist_location",
                              "artist_latitude",
                              "artist_longitude").dropduplicates()

    # write artists table to parquet files
    output_path = os.path.join(output_data, 'artists_table.parquet')
    artists_table.write.parquet(output_path, mode="overwrite")

    #export whole songs data file to parquet
    output_path = os.path.join(output_data, 'songs_data_table.parquet')
    df.write.parquet(output_path, mode="overwrite")
def process_song_data(spark, input_data, output_data):
    """
    Processes song data and stores them as parquet files

    Loads song data into a spark DataFrame and transforms them into songs
    and artists DataFrames which are subsequently written as parquet files
    to songs and artists folders in the specified output path.

    Parameters:
    spark : SparkSession instance
    input_data (str) : Path of the directory of song_data
    output_data (str) : Path of the directory where the parquet files will be stored

    """

    # specify schema for song data
    songs_schema = Struct([
        Fld('num_songs', Int()),
        Fld('artist_id', Str()),
        Fld('artist_latitude', Double()),
        Fld('artist_longitude', Double()),
        Fld('artist_location', Str()),
        Fld('artist_name', Str()),
        Fld('song_id', Str()),
        Fld('title', Str()),
        Fld('duration', Double()),
        Fld('year', Int())
    ])

    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # read song data file
    df = spark.read.json(song_data, songs_schema)

    # extract columns to create songs table
    songs_table = df[['song_id', 'title', 'artist_id', 'year', 'duration']] \
        .dropDuplicates(['song_id'])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.parquet(os.path.join(output_data, 'songs'),
                              'overwrite',
                              partitionBy=['year', 'artist_id'])

    # extract columns to create artists table
    artists_table = df[[
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]].dropDuplicates(['artist_id'])

    # write artists table to parquet files
    artists_table.write.parquet(os.path.join(output_data, 'artists'),
                                'overwrite')
Example #11
def process_song_data(spark, input_data, output_data):
    """Process song data, transform the data into songs and artists tables
    and store it in parquet files on S3.

    Parameters
    ----------
    spark : SparkSession
        the active Spark session
    input_data : string
        input data prepend path
    output_data : string
        output data prepend path
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")

    song_schema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str(), False),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str(), False),
        Fld("song_id", Str(), False),
        Fld("title", Str(), False),
        Fld("duration", Dbl(), False),
        Fld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, song_schema)

    # extract columns to create songs table
    songs_table = df.select(
        ["song_id", "title", "artist_id", "year", "duration"])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode("overwrite").partitionBy('year', 'artist_id') \
        .parquet(os.path.join(output_data, 'analytics/songs'))

    # extract columns to create artists table
    artists_table = df.select([
        "artist_id", "artist_name", "artist_location", "artist_latitude",
        "artist_longitude"
    ])
    artists_table = artists_table.withColumnRenamed("artist_name", "name") \
        .withColumnRenamed("artist_location", "location") \
        .withColumnRenamed("artist_latitude", "latitude") \
        .withColumnRenamed("artist_longitude", "longitude")

    # write artists table to parquet files
    artists_table.write.mode("overwrite") \
        .parquet(os.path.join(output_data, 'analytics/artists'))
Example #12
def process_song_data(spark, input_data, output_data):
    """import Song dataset extract columns and create songs and artist tables
    write those tables to parquet files
    
    Parameters:
    spark: name of spark session
    input_data: location of the source data s3 bucket 
    output_data: location of the destination data s3 bucket
    
    Returns:
    writes songs table in parquet to output_data location + songs
    writes artists_table in parquet to output_data location + artists
    
    """

    # Setting up the JSON table structure for the Song dataset
    song_dataset_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Str()),
    ])
    """get filepath to song data file 
    use "song_data/*/*/*/*.json" for full dataset
    use "song_data/A/B/C/TRABCEI128F424C983.json" to pull a single record

    """
    song_data = input_data + "song_data/*/*/*/*.json"

    # read song data file with dataset_schema
    df = spark.read.json(song_data, schema=song_dataset_schema)

    # extract columns to create songs table
    songs_table = df.select('song_id', 'artist_id', 'year', 'duration')

    # drop duplicate rows in songs table
    songs_table = songs_table.dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode('append').partitionBy(
        'year', 'artist_id').parquet(output_data + "songs")

    # extract columns to create artists table
    artists_table = df.select('artist_id', 'artist_name', 'artist_location',
                              'artist_latitude', 'artist_longitude')

    # drop duplicate rows in artists table
    artists_table = artists_table.dropDuplicates()

    # write artists table to parquet files
    artists_table.write.mode('append').parquet(output_data + "artists")
Example #13
def process_song_data(spark, input_data, output_data):

    print('%%%%% Starting up the SONG data process')

    # get filepath to song data file
    song_data = 'song_data/A/*/*/*.json'

    # setting up the schema for the data that we're about to pull
    songSchema = ST([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # read song data file - the commented line below reads the JSON without an explicit schema; the following line applies songSchema
    #df = spark.read.json(input_data + song_data)
    raw_song_df = spark.read.json(input_data + song_data, songSchema)

    # extract columns to create songs table
    songs_table = raw_song_df.select(raw_song_df.song_id, \
                                 raw_song_df.title, \
                                 raw_song_df.artist_id, \
                                 raw_song_df.year.cast(Int()), \
                                 raw_song_df.duration.cast(Dbl()))

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode('overwrite').partitionBy(
        'year', 'artist_id').parquet(output_data + 'songs')

    print('%%%%% Songs table has been created and written to the S3 Bucket')

    # extract columns to create artists table
    artists_table = raw_song_df.select(raw_song_df.artist_id , \
                                  raw_song_df.artist_latitude.alias('latitude'), \
                                  raw_song_df.artist_location.alias('location'), \
                                  raw_song_df.artist_longitude.alias('longitude'), \
                                  raw_song_df.artist_name.alias('name')).dropDuplicates(['artist_id','name'])

    # write artists table to parquet files
    artists_table.write.mode('overwrite').parquet(output_data + 'artist')

    print('%%%%% Artists table has been created and written to the S3 Bucket')
    print('%%%%% SONG data has been completed and returning the raw_song_df')
    return raw_song_df
Example #14
def process_song_data(spark, input_data, output_data):
    """
    The function loads data from the song_data dataset, extracts columns
    for the songs and artists tables, and writes the data into parquet
    files which will be loaded to S3.
    
    """
    song_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    # get filepath to song data file
    song_data = 'song_data/*/*/*/*.json'

    # read song data file
    df = spark.read.json(os.path.join(input_data, song_data),
                         schema=song_schema)

    # extract columns to create songs table
    songs_table = df.select('song_id', 'title', 'artist_id', 'year',
                            'duration').dropDuplicates()

    songs_table.createOrReplaceTempView('songs')

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(
        os.path.join(output_data, 'songs/songs.parquet'), 'overwrite')

    # extract columns to create artists table
    artists_table = df.select('artist_id', 'artist_name', 'artist_location',
                              'artist_latitude', 'artist_longitude') \
                      .withColumnRenamed('artist_name', 'name') \
                      .withColumnRenamed('artist_location', 'location') \
                      .withColumnRenamed('artist_latitude', 'latitude') \
                      .withColumnRenamed('artist_longitude', 'longitude') \
                      .dropDuplicates()

    artists_table.createOrReplaceTempView('artists')

    # write artists table to parquet files
    artists_table.write.parquet(
        os.path.join(output_data, 'artists/artists.parquet'), 'overwrite')
def process_song_data(spark, input_data, output_data):
    """
    Method to process song data and create tables: songs, artists
    :param spark: Spark session
    :param input_data: S3 bucket
    :param output_data: S3 bucket
    :return: Data frame of song data
    """
    # get filepath to song data file
    song_data = input_data + '/song-data/A/A/B/*.json'

    songs_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Int())
    ])

    # read song data file
    print('Reading song data.')
    df = spark.read.json(song_data, schema=songs_schema)

    song_columns = ['song_id', 'title', 'artist_id', 'year', 'duration']

    # extract columns to create songs table
    songs_table = df.selectExpr(song_columns).dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    print('Writing songs to parquet.')
    write_parquet(songs_table, output_data, 'songs', 'year', 'artist_id')

    artist_columns = [
        'artist_id', 'artist_name as name', 'artist_location as location',
        'artist_latitude as latitude', 'artist_longitude as longitude'
    ]

    # extract columns to create artists table
    artists_table = df.selectExpr(artist_columns).dropDuplicates()

    # write artists table to parquet files
    print('Writing artists to parquet.')
    write_parquet(artists_table, output_data, 'artists', None, None)

    return df
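
This variant delegates the parquet writes to a write_parquet helper that is not shown. A minimal sketch consistent with how it is called here (dataframe, output root, table folder, two optional partition columns) might be:

def write_parquet(df, output_data, table_name, partition_col1=None, partition_col2=None):
    """Hypothetical helper: write df as parquet under output_data/table_name,
    optionally partitioned by the two given columns."""
    writer = df.write.mode('overwrite')
    if partition_col1 and partition_col2:
        writer = writer.partitionBy(partition_col1, partition_col2)
    writer.parquet(os.path.join(output_data, table_name))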
Example #16
def process_song_data(spark, input_data, output_data):
    '''
    Load song data in JSON format from the S3 bucket, process it by extracting the
    songs table and artists table, and save these tables back to the S3 bucket.
    
    :param spark: spark session
    :param input_data: data location for input data
    :param output_data: data location for output data
    :return: no return value
    '''
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # create songs schema
    songSchema = R([
        Fld('artist_id', Str()),
        Fld('artist_latitude', Dbl()),
        Fld('artist_location', Str()),
        Fld('artist_longitude', Dbl()),
        Fld('artist_name', Str()),
        Fld('duration', Dbl()),
        Fld('num_songs', Int()),
        Fld('title', Str()),
        Fld('year', Int()),
    ])

    # load songs json files from S3
    df_songs = spark.read.json(song_data, schema=songSchema)

    # select columns for songs_table
    songs_attr = ['title', 'artist_id', 'year', 'duration']
    songs_table = df_songs.select(songs_attr)\
    .dropDuplicates()\
    .withColumn('song_id', monotonically_increasing_id())

    # write songs_table to S3
    songs_table.write.partitionBy('year',
                                  'artist_id').parquet(output_data + 'songs/')

    # select artists columns
    artists_attr = [
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]
    artists_table = df_songs.select(artists_attr)\
    .dropDuplicates()

    artists_table = artists_table\
    .withColumnRenamed('artist_name','name')\
    .withColumnRenamed('artist_location','location')\
    .withColumnRenamed('artist_latitude','latitude')\
    .withColumnRenamed('artist_longitude','longitude')

    # write artists_table to S3
    artists_table.write.parquet(output_data + 'artists/')
Example #17
def process_song_data(spark, input_data, output_data):
    """
    process_song_data - Loads the song data files from S3, and saves the song information to a parquet file
    (parititioned by year and artist_id), and then extracts the distinct artists and saves them to a parquet file.
    """

    # Get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')
    #    song_data = os.path.join(input_data,'song_data/A/A/A/TRAAAAK128F9318786.json')

    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # Read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # Extract columns to create songs table
    songs_table = df.select(
        ['song_id', 'title', 'artist_id', 'year', 'duration'])

    # Write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year",
                                  "artist_id").mode('overwrite').parquet(
                                      os.path.join(output_data,
                                                   'songs.parquet'))

    # Extract columns to create artists table, and find the distinct artists
    artists_table = df.select([
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]).withColumnRenamed('artist_name', 'name').withColumnRenamed(
        'artist_location', 'location').withColumnRenamed(
            'artist_latitude',
            'latitude').withColumnRenamed('artist_longitude',
                                          'longitude').distinct()

    # Write artists table to parquet files
    artists_table.write.mode('overwrite').parquet(
        os.path.join(output_data, 'artists.parquet'))
Example #18
def process_song_data(spark, input_data_songs, output_data):
    """
    Read song data by providing an explicit schema.
    Create songs and artists tables.
    """
    # define song data schema to improve performance
    song_schema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Long()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("year", Long())
    ])

    song_data = input_data_songs

    t_start = time()
    dfSongs = spark.read.json(song_data, schema=song_schema)
    t_end = time() - t_start
    print('Read song data in {} secs'.format(t_end))
    dfSongs.printSchema()

    dfSongs.count()
    dfSongs.show(5)

    songs_table = dfSongs.filter(dfSongs.song_id != '')\
                     .select(['song_id', 'title', 'artist_id', 'year', 'duration'])
    songs_table.show(5)
    songs_table.write.partitionBy(
        "year",
        "artist_id").mode('overwrite').parquet(output_data +
                                               'songs/songs_table.parquet')

    artists_table = dfSongs.filter(dfSongs.artist_id !='') \
                        .select(col("artist_id"),col("artist_name").alias("name"), col("artist_location").alias("location"),
                                 col("artist_longitude").alias("longitude"), col("artist_latitude").alias("latitude"))\
                        .dropDuplicates()

    artists_table.show(5)

    artists_table.write.mode('overwrite').parquet(
        output_data + 'artists/artists_table.parquet')
Example #19
def immigration_etl(source="s3://immigration-us-1/sas_data",
                    destination="s3://immigration-us-1/sas_data_ready_to_use",
                    country_dict="s3://immigration-us-1/raw_data/country_dict.csv",
                    visa_dict="s3://immigration-us-1/raw_data/visa_dict.csv"
                   ):
    # Local udfs
    
    spaceDeleteUDF = udf(lambda s: s.replace(" ", ""), Str())
    ampesandDeleteUDF = udf(lambda s: s.replace("'", ""), Str())
    udf_to_datetime_sas = udf(lambda x: date_converter(x), DateType())
   
    Schema_country = R([
        Fld("id",Str()),
        Fld("country",Str())
        ])
    Schema_Visa = R([
        Fld("id",Str()),
        Fld("Visa_Type",Str())
        ])

    df_spark=spark.read.parquet(source)
    # Keep only immigrants who arrived by air (i94mode == 1)
    immigrants=df_spark.where(F.col("i94mode")==1)
    immigrants=immigrants.select("cicid", "i94yr", "i94mon", "i94cit", "i94res", "i94port", "arrdate", "i94visa",  "biryear", "gender", "visatype", "airline")
    country_dict=spark.read.csv(country_dict, header=True, mode="DROPMALFORMED", sep="=",schema=Schema_country)
    
    immigrants = immigrants.join(country_dict, immigrants.i94cit == country_dict.id,how='right') 
    immigrants=immigrants.withColumnRenamed("country", "cit_country")
    immigrants=immigrants.drop('id','i94cit')
    
    immigrants = immigrants.join(country_dict, immigrants.i94res == country_dict.id,how='right') 
    immigrants=immigrants.withColumnRenamed("country", "res_country")
    immigrants=immigrants.drop('id','i94res')
    
    visa_dict=spark.read.csv(visa_dict, header=False, mode="DROPMALFORMED",sep="=", schema=Schema_Visa)
    immigrants = immigrants.join(visa_dict, immigrants.i94visa == visa_dict.id,how='right') 
    immigrants=immigrants.drop('id','i94visa')
    
    immigrants=immigrants.withColumn("arrdate", udf_to_datetime_sas("arrdate"))
    
    immigrants = immigrants.withColumn("cicid", immigrants["cicid"].cast(IntegerType()))
    immigrants = immigrants.withColumn("i94yr", immigrants["i94yr"].cast(IntegerType()))
    immigrants = immigrants.withColumn("biryear", immigrants["biryear"].cast(IntegerType()))
    immigrants = immigrants.withColumn("i94mon", immigrants["i94mon"].cast(IntegerType()))
    
    immigrants.show(10) 
    immigrants.write.parquet(destination)
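
udf_to_datetime_sas wraps a date_converter helper that is not shown here. SAS stores dates as a day count from the 1960-01-01 epoch, so a plausible implementation (an assumption, not the original helper) is:

import datetime

def date_converter(sas_days):
    # hypothetical: convert a SAS numeric date (days since 1960-01-01) to a Python date
    if sas_days is None:
        return None
    return datetime.date(1960, 1, 1) + datetime.timedelta(days=int(sas_days))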
Example #20
def process_song_data(spark, input_data, output_data):
    """
    Reads from song files, 
    transforms them into songs and artists data, 
    and writes them in parquet format.
    
    params:
    - spark: spark session object
    - input_data: input data path
    - output_data: output data path
    """

    # get filepath to song data file
    song_data = input_data + "/song_data/*/*/*/*.json"

    # use schema when read json files
    song_schema = St([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])

    # read song data file
    df = spark.read.json(song_data, schema=song_schema)

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year",
                            "duration").dropDuplicates()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.parquet(output_data + "songs", mode="overwrite", \
                              partitionBy=["year", "artist_id"])

    # extract columns to create artists table
    artists_table = df.selectExpr("artist_id", "artist_name as name", "artist_location as location", \
                                  "artist_latitude as latitude", "artist_longitude as longitude") \
                                  .dropDuplicates()

    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists", mode="overwrite")
Example #21
def process_song_data(spark, input_data, output_data):
    """
    Process the songs data from S3 storage and create the analytical tables, songs table and artists table.
    
    This function reads the data from JSON files in S3 storage, transforms the data into the analytical tables
    (songs and artists), and writes them into partitioned parquet files on S3.
    
    Args:
        spark: the spark session
        input_data: the S3 bucket to read data from
        output_data: the S3 bucket to write analytics tables to
    """
    # get filepath to song data file
    song_data = input_data + "song_data/*/*/*/*.json"

    # defined the song data schema
    song_data_schema = R([
        Fld("artist_id", Str(), False),
        Fld("artist_latitude", Str(), True),
        Fld("artist_longitude", Str(), True),
        Fld("artist_location", Str(), True),
        Fld("artist_name", Str(), False),
        Fld("song_id", Str(), False),
        Fld("title", Str(), False),
        Fld("duration", Dbl(), False),
        Fld("year", Int(), False)
    ])

    # read song data file
    df = spark.read.json(song_data, schema=song_data_schema)

    # extract columns to create songs table
    songs_table = df.select("song_id", "title", "artist_id", "year",
                            "duration").distinct()

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.parquet(output_data + "songs_table.parquet",
                              mode="overwrite",
                              partitionBy=["year", "artist_id"])

    # extract columns to create artists table
    artists_table = df.select(
        "artist_id",
        col("artist_name").alias("name"),
        col("artist_location").alias("location"),
        col("artist_latitude").alias("latitude"),
        col("artist_longitude").alias("longitude"),
    ).distinct()

    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists_table.parquet",
                                mode="overwrite")
Example #22
def get_song_src_schema():
    """
    Get the source spark schema definition
    :return: The schema definition
    """
    return R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int())
    ])
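
A minimal usage sketch for this schema helper (the bucket path is a placeholder):

# hypothetical call site: apply the source schema while reading the raw song JSON
song_df = spark.read.json("s3a://<bucket>/song_data/*/*/*/*.json",
                          schema=get_song_src_schema())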
Example #23
def process_song_data(spark, input_data, output_data):
    """
    Extract data from song_data and write songs and artists table
    
    Arguments:
    - spark : SparkSession object
    - input_data : input data root dir path
    - output_data : output data root dir path
    """
    # schema for song_data 
    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("song_id", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year",Int())
    ])
    
    # get filepath to song data file
    song_data = input_data + "song_data/*/*/*"
    
    # read song data file
    df = spark.read.json(song_data, schema=songSchema).dropDuplicates(["song_id"])

    # extract columns to create songs table
    df.createOrReplaceTempView("song_data")
    songs_table = spark.sql("""
        SELECT song_id, title, artist_id, year, duration FROM song_data
    """)
    
    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year", "artist_id").parquet(output_data + "songs_table", 'overwrite')

    # extract columns to create artists table
    artists_table = spark.sql("""
        SELECT artist_id, artist_name as name, artist_location as location, artist_latitude as latitude, artist_longitude as longitude 
        FROM song_data
    """).dropDuplicates(["artist_id"])
    
    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists_table")
Example #24
def process_song_data(spark, input_data, output_data):
    """
    Loads the song_data from AWS S3 (input_data), extracts the songs and artists tables,
    and then loads the processed data back to S3 (output_data).
    
    :param spark: Spark Session object
    :param input_data: Location (AWS S3 path) of songs metadata (song_data) JSON files
    :param output_data: Location (AWS S3 path) where dimensional tables will be stored in parquet format 
    """

    # Get filepath to song data file
    song_data = input_data + "song_data/*/*/*/*.json"

    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # Read song data file
    print("Reading song_data JSON files from S3")
    df = spark.read.json(song_data, mode='PERMISSIVE', schema=songSchema, \
                         columnNameOfCorruptRecord='corrupt_record').dropDuplicates()
    print("Read completed")

    # Extract columns to create songs table
    songs_table = df.select("title", "artist_id", "year", "duration").dropDuplicates() \
                    .withColumn("song_id", monotonically_increasing_id())

    print("Writing Songs table to S3 after processing")
    # Write songs table to parquet files partitioned by year and artist
    songs_table.write.parquet(output_data + "songs/",
                              mode="overwrite",
                              partitionBy=["year", "artist_id"])
    print("Completed")

    # Extract columns to create artists table
    artists_table = df.select("artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude") \
                        .dropDuplicates()

    print("Writing Artists table to S3 after processing")
    # Write artists table to parquet files
    artists_table.write.parquet(output_data + "artists/", mode="overwrite")
    print("Completed")
Example #25
def process_song_data(spark, input_data, output_data):
    """
    read song data from s3 and then create the songs_table and artists_table. load them back to s3.
    
    parameters:
    spark: spark session
    input_data: path of song data
    output_data: path of output table
    
    """
    # get filepath to song data file
    # song_data = input_data + "song_data/*/*/*/*.json"
    song_data = input_data + "song_data/A/B/C/TRABCEI128F424C983.json"

    # create song table schema
    songSchema = R([
        Fld("num_songs", Int()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_name", Str()),
        Fld("title", Str()),
        Fld("duration", Dbl()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=songSchema)

    # extract columns to create songs table; drop rows where year or artist_id is missing, and exclude year == 0
    song_field = ["title", "duration", "year", "artist_id"]
    songs_table = df.select(song_field).dropDuplicates().withColumn("song_id",F.monotonically_increasing_id())\
    .filter(~col("year").isin([0]) & col("year").isNotNull() & col("artist_id").isNotNull())

    # extract columns to create artists table, drop if artist_id and name containing any null values
    artist_field = [
        "artist_id", "artist_name", "artist_location", "artist_latitude",
        "artist_longitude"
    ]
    artists_table = df.select(artist_field).dropDuplicates().dropna(
        subset=["artist_id", "artist_name"])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year",
                                  "artist_id").parquet(output_data + "songs/",
                                                       mode="overwrite")

    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists/", mode="overwrite")
Example #26
def create_song_schema():
    """
    Schema structure for song data
    :return: StructType
    """
    song_schema = R([
        Fld("num_songs", In()),
        Fld("artist_id", St()),
        Fld("artist_latitude", Fl()),
        Fld("artist_longitude", Fl()),
        Fld("artist_location", St()),
        Fld("artist_name", St()),
        Fld("song_id", St()),
        Fld("title", St()),
        Fld("duration", Fl()),
        Fld("year", SInt())
    ])
    return song_schema
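
This snippet uses a different set of type aliases (In, St, Fl, SInt). The imports it presumably relies on would be something like the following; the mapping of SInt to ShortType in particular is an assumption based on the name:

from pyspark.sql.types import (StructType as R, StructField as Fld,
                               IntegerType as In, StringType as St,
                               FloatType as Fl, ShortType as SInt)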
Example #27
def process_song_data(spark, input_data, output_data):
    """
    Imports and processes data from the song dataset and then writes the data to
    parquet files on Amazon S3.

    Parameters:
        spark: spark session
        input_data: S3 bucket path to read input data from.
        output_data: another S3 bucket path to write the data to.
    """

    # get filepath to song data file
    song_data = input_data + "song-data/*/*/*/*.json"

    SongSchema = ST([
        Fld("song_id", Str()),
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # read song data file
    df = spark.read.json(song_data, schema=SongSchema).dropDuplicates(
        ['song_id', 'artist_id'])

    # extract columns to create songs table
    songs_table = df.select('song_id', 'artist_id', 'year', 'duration')

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year',
                                  'artist_id').parquet(output_data + "songs")

    # extract columns to create artists table
    artists_table = df.select('artist_id', 'artist_name', 'artist_location',
                              'artist_latitude', 'artist_longitude')

    # write artists table to parquet files
    artists_table.write.parquet(output_data + "artists")
Example #28
def process_song_data(spark, input_data, output_data):
    """
        Description: This function fetches song_data from S3 into a staging dataframe, 
        then extracts the songs and artist tables,
        and eventually exports data back to S3
        
        Parameters:
            spark       : object for Spark Session
            input_data  : location of song_data 
            output_data : location of target S3 bucket
            
    """

    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'
    
    # define schema
    songdata_schema = R([
    Fld("artist_id",Str()),
    Fld("artist_latitude",Dbl()),
    Fld("artist_location",Str()),
    Fld("artist_longitude",Dbl()),
    Fld("artist_name",Str()),
    Fld("duration",Dbl()),
    Fld("num_songs",Int()),
    Fld("title",Str()),
    Fld("year",Int()),
    ])
    
    # read song data file
    df = spark.read.json(song_data, schema=songdata_schema)

    # extract columns to create songs table
    songs_table = df.select(['title', 'artist_id', 'year', 'duration'])

    songs_table = songs_table.dropDuplicates().withColumn('song_id', monotonically_increasing_id()).\
    select(['song_id', 'title', 'artist_id', 'year', 'duration'])
    
    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy('year', 'artist_id').parquet(output_data + 'songs/')

    # extract columns to create artists table
    selection = ['artist_id', 'artist_name as name', \
                 'artist_location as location', 'artist_latitude as latitude', \
                 'artist_longitude as longitude']
    artists_table = df.selectExpr(selection).dropDuplicates()
    
    # write artists table to parquet files
    artists_table.write.parquet(output_data + 'artists/')
Example #29
def process_song_data(spark, input_data, output_data):
    """Read song data from source json files ,extract songs and artist tables then store the in parqute files in the target location
    Parameters:
    spark: spark session
    input_data: source of songs json files
    output_data: target to store extracted tables in as parquet files.
    """
    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'
    
    # create song data schema 
    from pyspark.sql.types import StructType as R, StructField as Fld, DoubleType as Dbl, StringType as Str, IntegerType as Int, DateType as Date,TimestampType as Ts
    songSchema = R([
        Fld("song_id",Str()),
        Fld("title",Str()),
        Fld("duration",Dbl()),
        Fld("year",Int()),
        Fld("artist_id",Str()),
        Fld("artist_name",Str()),
        Fld("artist_latitude",Str()),
        Fld("artist_longitude",Dbl()),
        Fld("artist_location",Dbl()),
        Fld("num_songs",Int()),
    ])
    
    # read song data file
    df = spark.read.json(song_data,schema=songSchema)

    # define fields to be created in the extracted songs_table
    songs_table_fields =["song_id","title","artist_id","year","duration"]
    
    # extract columns to create songs table
    songs_table = df.select(songs_table_fields).dropDuplicates()
    
    # write songs table to parquet files partitioned by year and artist
    songs_table.write.partitionBy("year","artist_id").parquet(output_data + 'songs/')

    # define artist table fields
    artists_table_fields = ["artist_id", "artist_name", "artist_location", "artist_latitude", "artist_longitude"]
    
    # extract columns to create artists table
    artists_table = df.select(artists_table_fields).dropDuplicates();
    # write artists table to parquet files
    artists_table.write.parquet(output_data + 'artists/')
Example #30
def process_song_data(spark, input_data, output_data):
    # get filepath to song data file
    song_data = input_data + "./data/song_data/*/*/*/*.json"
    """Creating the song_data file schema that we are going to add to spark"""
    songSchema = R([
        Fld("artist_id", Str()),
        Fld("artist_latitude", Dbl()),
        Fld("artist_location", Str()),
        Fld("artist_longitude", Dbl()),
        Fld("artist_name", Str()),
        Fld("duration", Dbl()),
        Fld("num_songs", Int()),
        Fld("title", Str()),
        Fld("year", Int()),
    ])

    # reading song data file json structure
    df = spark.read.json(song_data, schema=songSchema)
    """Filtering out only the needed columns for the songs table"""
    song_fields = ["title", "artist_id", "year", "duration"]

    print('Creating the songs table and dropping duplicates')
    songs_table = df.select(song_fields).dropDuplicates().withColumn(
        "song_id", monotonically_increasing_id())
    print(
        "--- All duplicate songs have been dropped and the songs table created ---"
    )
    print('Printing some rows from the songs_table')
    songs_table.show(15)
    print('Saving the songs table to the s3 bucket')
    songs_table.write.partitionBy('year',
                                  'artist_id').parquet(output_data + "songs")
    print("--- songs.parquet completed ---")
    """Filtering out only the needed columns for the artists table"""
    artists_data = [
        'artist_id', 'artist_name', 'artist_location', 'artist_latitude',
        'artist_longitude'
    ]

    print("--- Starting to drop duplicate artists....")
    artists_table = df.selectExpr(artists_data).dropDuplicates()
    print("All duplicate artists have been dropped......")

    print('Printing some rows from the artists_table')
    artists_table.show(15)
    """writing the artists table to the parquets file"""
    artists_table.write.parquet(output_data + "artists")
    print("--- artists.parquet completed ---")
    print("*** process_song_data completed ***\n\n")