Example #1
def process_dates_data(spark_context, output_path):
    """
    Function to read in date data from a temporary view,
    create the date dimension table data and output it as parquet
    """

    query = """
           SELECT distinct date_date FROM
           (
           SELECT arr_date AS date_date FROM  temp_date_view
           UNION ALL
           SELECT dept_date AS date_date FROM temp_date_view)
        """
    df_date_data = spark_context.sql(sqlQuery=query)

    #adding more columns to the dates table to enable different analyses
    df_date_data = df_date_data.select("date_date",\
               dayofmonth("date_date").alias("day"),\
               weekofyear("date_date").alias("week"),\
               month("date_date").alias("month"),\
               date_format("date_date", 'MMMM').alias("month_name"),\
               year("date_date").alias("year"),\
               dayofweek("date_date").alias("day_of_week"),\
               date_format("date_Date", 'E').alias("day_of_week_name"))

    #Running quality tests for dates dimension table
    print("\nRunning quality test for immi_dates_dm table")
    check_unique_keys(df_date_data, "immi_dates_dm", "date_date")
    ensure_no_nulls(df_date_data, "date_date")
    check_data_type(df_date_data, "date_date", "TimestampType")
    check_greater_that_zero(df_date_data)

    #outputting the dates data
    df_date_data\
    .write.parquet(output_path + "dates", 'overwrite')
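
The quality-check helpers called throughout these examples (check_unique_keys, ensure_no_nulls, check_data_type and check_greater_that_zero) are defined elsewhere in the project. A minimal sketch of what they might look like, assuming each check simply raises on failure, is shown below; the real implementations may differ.

from pyspark.sql import DataFrame


def check_unique_keys(df: DataFrame, table_name: str, key_column: str):
    #Assumed check: the key column holds no duplicate values
    total = df.count()
    distinct = df.select(key_column).distinct().count()
    if total != distinct:
        raise ValueError(f"{table_name}: {key_column} is not unique ({total} rows, {distinct} distinct)")
    print(f"{table_name}: unique key check passed for {key_column}")


def ensure_no_nulls(df: DataFrame, column: str):
    #Assumed check: the column contains no null values
    nulls = df.filter(df[column].isNull()).count()
    if nulls > 0:
        raise ValueError(f"{column} contains {nulls} null values")
    print(f"Null check passed for {column}")


def check_data_type(df: DataFrame, column: str, expected_type: str):
    #Assumed check: the column's Spark data type matches the expected type name
    actual_type = type(df.schema[column].dataType).__name__
    if actual_type != expected_type:
        raise ValueError(f"{column} has type {actual_type}, expected {expected_type}")
    print(f"Data type check passed for {column}")


def check_greater_that_zero(df: DataFrame):
    #Assumed check: the table contains at least one row
    if df.count() == 0:
        raise ValueError("Table contains no rows")
    print("Row count check passed")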
Example #2
def process_airports_data(spark_context, input_path, output_path):
    """
    Function to read in airports data, process it using Spark,
    and output the airports dimension table data as parquet
    """

    #Reading the airports data and extracting latitude and longitude from the coordinates column
    airports_file_path = input_path + "airport-codes_csv.csv"
    df_airports_data = spark_context.read.format('csv').option(
        "header", True).load(airports_file_path)

    #Dropping the null values for iata_code
    df_airports_data = df_airports_data.dropna(subset=["iata_code"])

    #filtering out iata_code values of "0" and any airports whose type is closed
    df_airports_data = df_airports_data.filter((col("iata_code")!="0") & (col("type") != "closed"))\
                            .dropDuplicates(subset=["iata_code","type"])

    #Removing duplicated IATA codes that don't exist in the i94PORT codes
    df_airports_data = df_airports_data.filter(col("iata_code")\
                            .isin(["MRE","YMX","SGL","RCH","PRM","LHG"]) == False)
    df_airports_data = df_airports_data.withColumn("region_code", \
                                            to_split_extract_string(df_airports_data\
                                                .iso_region,lit("-"),lit(1)))\
                            .withColumn("latitude", \
                                to_split_extract_float(df_airports_data\
                                    .coordinates,lit(","),lit(0),lit("float")))\
                            .withColumn("longitude", \
                                to_split_extract_float(df_airports_data\
                                    .coordinates,lit(","),lit(1),lit("float")))\
                            .drop("coordinates")

    #updating the definition of the airports data
    df_airports_data = df_airports_data.selectExpr("ident as id",\
                                         "type",\
                                         "name",\
                                         "elevation_ft",\
                                         "continent",\
                                         "iso_country",\
                                         "iso_region",\
                                         "municipality",\
                                         "gps_code",\
                                         "iata_code",\
                                         "region_code",\
                                         "latitude",\
                                         "longitude")

    #Running quality tests for airports data
    print("\nRunning quality test for airports_dm table")
    check_unique_keys(df_airports_data, "airports_dm", "id")
    ensure_no_nulls(df_airports_data, "id")
    check_greater_that_zero(df_airports_data)

    #outputting the airports data
    df_airports_data\
    .write.parquet(output_path + "airports", 'overwrite')
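
The helpers to_split_extract_string and to_split_extract_float used above are not defined in these examples. A hedged sketch follows, with the signatures inferred from the calls in process_airports_data (split a string column on a delimiter and return one element, optionally cast to float); the project's actual helpers may differ.

from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType, StringType


@udf(StringType())
def to_split_extract_string(value, delimiter, index):
    #e.g. iso_region "US-CA" split on "-" with index 1 returns "CA"
    if value is None:
        return None
    parts = value.split(delimiter)
    return parts[index] if len(parts) > index else None


@udf(FloatType())
def to_split_extract_float(value, delimiter, index, cast_type):
    #splits the string, takes the requested element and casts it when cast_type is "float"
    if value is None:
        return None
    parts = value.split(delimiter)
    if len(parts) <= index:
        return None
    try:
        return float(parts[index].strip()) if cast_type == "float" else None
    except ValueError:
        return None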
Example #3
def process_other_dimension_tables(spark_context, output_path):
    """
    Function to process various mappings for immigration fact table codes
    and output the visa, ports and mode of transport dimension table data
    in parquet format
    """

    #Reading the dimensions into Spark dataframes from dictionary objects
    df_visa_codes = spark_context.createDataFrame(
        list(map(list, visa_codes.items())), ["code", "travel_purpose"])

    df_ports_codes = spark_context.createDataFrame(
        list(map(list, ports_codes.items())), ["code", "port_name"])

    df_mode_codes = spark_context.createDataFrame(
        list(map(list, mode_codes.items())), ["code", "travel_mode"])

    #Running quality checks for remaining dimensions
    print("\nRunning quality test for visas_dm table")
    check_unique_keys(df_visa_codes, "visas_dm", "code")
    ensure_no_nulls(df_visa_codes, "code")
    check_greater_that_zero(df_visa_codes)

    print("\nRunning quality test for ports_dm table")
    check_unique_keys(df_ports_codes, "ports_dm", "code")
    ensure_no_nulls(df_ports_codes, "code")
    check_greater_that_zero(df_ports_codes)

    print("\nRunning quality test for modes_dm table")
    check_unique_keys(df_mode_codes, "modes_dm", "code")
    ensure_no_nulls(df_mode_codes, "code")
    check_greater_that_zero(df_mode_codes)

    #outputting the visas, ports and mode of transport data
    df_visa_codes\
    .write.parquet(output_path + "visas", 'overwrite')

    df_ports_codes\
    .write.parquet(output_path + "ports", 'overwrite')

    df_mode_codes\
    .write.parquet(output_path + "modes", 'overwrite')
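
The visa_codes, ports_codes and mode_codes dictionaries consumed above come from elsewhere in the project, presumably parsed from the I94 data dictionary. A purely illustrative sample of their assumed shape, showing how createDataFrame turns each (key, value) pair into a two-column row, is sketched below.

#Illustrative sample values only; the real mappings are assumed to come from the I94 data dictionary
visa_codes = {"1": "Business", "2": "Pleasure", "3": "Student"}
mode_codes = {"1": "Air", "2": "Sea", "3": "Land", "9": "Not reported"}
ports_codes = {"NYC": "NEW YORK, NY", "LOS": "LOS ANGELES, CA"}

#list(map(list, visa_codes.items())) produces [["1", "Business"], ["2", "Pleasure"], ...],
#which createDataFrame pairs with the column names ["code", "travel_purpose"]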
Example #4
def process_immigration_data(spark_context, input_path, output_path):
    """
    Function to read and process immigration data and output the
    immigration fact table and airlines dimension table as parquet
    """

    #read the data
    immigration_file_path = input_path + "sas_data"
    df_immigration_data = spark_context.read.parquet(immigration_file_path)

    #cleaning the arrival and departure dates; null departure dates are first assigned a 0 date
    df_immigration_data = df_immigration_data\
                            .withColumn("arrdate", get_datetime(df_immigration_data.arrdate))\
                            .withColumn("depdate", when((df_immigration_data.depdate.isNull()) , 0.0)\
                            .otherwise(df_immigration_data.depdate))

    df_immigration_data = df_immigration_data\
                            .withColumn("depdate",get_datetime(df_immigration_data.depdate))

    #converting double columns to integer
    double_columns = [
        column for column, dtype in df_immigration_data.dtypes if dtype == 'double'
    ]
    df_immigration_data = df_immigration_data.fillna(0, subset=double_columns)
    for column in double_columns:
        df_immigration_data = df_immigration_data.withColumn(
            column, cast_integer(df_immigration_data[column]))

    #extracting unique flights data and creating a flights table with the unique identifier
    df_flights_data = df_immigration_data.selectExpr("airline as airline_code","fltno as flight_number")\
                               .dropDuplicates()\
                               .na\
                               .drop(subset=["airline_code","flight_number"])

    #creating a window to generate a sequential flight_id column
    window = Window.orderBy(col("airline_code"), col("flight_number"))
    df_flights_data = df_flights_data.withColumn("flight_id",
                                                 row_number().over(window))

    #joining the immigration data with flights and only keeping the flight_id
    df_immigration_data = df_flights_data.selectExpr("airline_code as airline",\
                                                     "flight_number as fltno",\
                                                     "flight_id")\
                                          .join(df_immigration_data,\
                                                on=["airline", "fltno"],
                                                how="left")

    df_immigration_data = df_immigration_data.selectExpr("cicid",\
                             "i94cit as city_code",\
                             "i94res as res_code",\
                             "i94port as port",\
                             "arrdate as arr_date",\
                             "i94mode as mode",\
                             "i94addr as addr",\
                             "depdate as dept_date",\
                             "i94bir as age",\
                             "i94visa as visa_code",\
                             "count as counter",\
                             "matflag as matflag",\
                             "biryear as birth_year",\
                             "gender",\
                             "flight_id",\
                             "i94yr as date_year",\
                             "i94mon as date_month",\
                             "visatype as visa_type",\
                             "admnum as admission_number")

    #keeping only 2016 departure dates, as well as rows where the departure date is null
    df_immigration_data = df_immigration_data.filter(
        "year(dept_date) = 2016 or dept_date IS NULL")

    #filtering out rows where the departure date is earlier than the arrival date
    df_immigration_data = df_immigration_data.withColumn("diff", datediff("dept_date","arr_date"))\
                                        .filter("diff >= 0 or dept_date is NULL")\
                                        .drop("diff")

    #creating a temporary view temp_date_view of all unique arrival and departure dates
    df_immigration_data.select("arr_date","dept_date")\
            .filter("dept_date IS NOT NULL")\
            .dropDuplicates(subset=["arr_date","dept_date"])\
            .createOrReplaceTempView("temp_date_view")

    #Running quality tests for immigration and flights data

    print("Running quality test for immigration_fct table")
    check_unique_keys(df_immigration_data, "immigration_fct", "cicid")
    ensure_no_nulls(df_immigration_data, "cicid")
    check_data_type(df_immigration_data, "dept_date", "TimestampType")
    check_data_type(df_immigration_data, "arr_date", "TimestampType")
    check_greater_that_zero(df_immigration_data)

    print("\nRunning quality test for airlines_dm table")
    check_unique_keys(df_flights_data, "airlines_dm", "flight_id")
    ensure_no_nulls(df_flights_data, "flight_id")
    check_greater_that_zero(df_flights_data)

    #outputting the immigration partitioned by year and month and outputting flights data
    df_immigration_data\
        .write.partitionBy("date_year","date_month")\
            .parquet(output_path + "immigration_fact", 'overwrite')

    df_flights_data\
        .write.parquet(output_path + "flights", 'overwrite')
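
The get_datetime and cast_integer helpers used above are defined elsewhere in the project. Below is a hedged sketch under the assumption that get_datetime converts the SAS numeric dates in the I94 data (days elapsed since 1960-01-01) into timestamps, and that cast_integer truncates double values to integers.

from datetime import datetime, timedelta

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, TimestampType


@udf(TimestampType())
def get_datetime(sas_days):
    #SAS stores dates as the number of days elapsed since 1960-01-01
    if sas_days is None:
        return None
    return datetime(1960, 1, 1) + timedelta(days=int(sas_days))


@udf(IntegerType())
def cast_integer(value):
    #truncates a double value to an integer, leaving nulls untouched
    if value is None:
        return None
    return int(float(value))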
Example #5
def process_demographics_data(spark_context, input_path, output_path):
    """
    Function to read in demographics data, produce state-level and
    race-level aggregations, and output the demographics dimension
    table data in parquet format
    """

    demographics_file_path = input_path + "us-cities-demographics.csv"
    df_demographics_data = spark_context.read.format("csv")\
                                .option("sep", ";")\
                                .option("header", True)\
                                .load(demographics_file_path)

    #Subsetting the data without the race breakdown, and aggregating to state level after removing duplicate city rows
    df_demographics_state_city_data = df_demographics_data\
                                                    .dropDuplicates(subset=["City","State","State Code"])\
                                                    .groupby("State Code","State")\
                                                    .agg(mean("Median Age").alias("median_age"),\
                                                         Sum("Male Population").alias("male_population"),\
                                                         Sum("Female Population").alias("female_population"),\
                                                         Sum("Total Population").alias("total_population"),\
                                                         Sum("Number of Veterans").alias("veterans_population"),\
                                                         Sum("Foreign-born").alias("foreign_born_population"),\
                                                         mean("Average Household Size").alias("household_size_avg"))

    #Subsetting out the race data and splitting race into its own columns by doing a pivot
    df_race_state_city_data = df_demographics_data.select("State Code", "Race", "Count")\
                                   .groupby("State Code")\
                                   .pivot("Race")\
                                   .agg(Sum("Count"))

    #Once the data is transformed and normalised, it can be joined back together
    df_demographics_data = df_demographics_state_city_data.join(df_race_state_city_data,\
                                                                on="State Code",\
                                                                how="inner")

    #Updating the definition of demographics to state level so it can join to the immigration fact table
    df_demographics_data = df_demographics_data.select(col("State Code").alias("state_code"),\
                                    col("State").alias("state_name"),\
                                    col("median_age"),\
                                    col("male_population"),\
                                    col("female_population"),\
                                    col("total_population"),\
                                    col("veterans_population"),\
                                    col("foreign_born_population"),\
                                    col("household_size_avg"),\
                                    col("American Indian and Alaska Native")\
                                    .alias("american_indian_and_alaskan_native_population"),\
                                    col("Asian").alias("asian_population"),\
                                    col("Black or African-American")\
                                    .alias("black_or_african_american_populaton"),\
                                    col("Hispanic or Latino")\
                                    .alias("hispanic_latino_population"),\
                                    col("White").alias("white_population")\
                                    )
    #Running quality tests for demographics data
    print("\nRunning quality test for demographics_dm table")
    check_unique_keys(df_demographics_data, "demographics_dm", "state_code")
    ensure_no_nulls(df_demographics_data, "state_code")
    check_greater_that_zero(df_demographics_data)

    #outputting the demographics data
    df_demographics_data\
    .write.parquet(output_path + "demographics", 'overwrite')
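
The capitalised Sum used in the aggregations above is assumed to be pyspark.sql.functions.sum imported under an alias so that it does not shadow Python's built-in sum; a sketch of the assumed module-level import is below.

#Assumed import behind the capitalised Sum; aliasing avoids shadowing Python's built-in sum()
from pyspark.sql.functions import sum as Sum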
Example #6
def process_climate_data(spark_context, input_path, output_path):
    """Function to read in temperature data and regions dimension data
    and output as regions aggregation with monthly averages of temperatures
    as a regions dimension table data in a parquet format
    """

    #read in climate and global temperature data
    climate_file_path = input_path + "GlobalLandTemperaturesByCity.csv"
    df_temperature_data = spark_context.read.format("csv").option(
        "header", True).load(climate_file_path)

    #Filtering the temperature data to dates from 2000 onwards
    df_temperature_data = df_temperature_data.withColumn("dt",to_date("dt",'yyyy-MM-dd'))\
                                         .withColumn("month",date_format("dt","MMMM"))\
                                         .filter(df_temperature_data.dt >= "2000-01-01")

    #Casting the latitude and longitude to float and dropping a few columns
    df_temperature_data = df_temperature_data.withColumn("Latitude",cast_lat_lon(df_temperature_data.Latitude))\
                            .withColumn("Longitude",cast_lat_lon(df_temperature_data.Longitude))\
                            .withColumn("Country", upper(df_temperature_data.Country))\
                            .drop("AverageTemperatureUncertainty","City","dt")

    #Aggregating the data to country and month first, and then to the country level below
    df_temperature_data = df_temperature_data.groupby("Country", "month")\
                            .agg(mean("AverageTemperature").alias("AverageTemperature"),\
                                 mean("Latitude").alias("Latitude"),\
                                 mean("Longitude").alias("Longitude"))

    df_temperature_lat_lon = df_temperature_data.groupby("Country")\
                            .agg(first("Latitude").alias("Latitude"),\
                                 first("Longitude").alias("Longitude"))

    df_temperature_data = df_temperature_data.groupby("Country")\
                            .pivot("Month")\
                            .agg(mean("AverageTemperature").alias("AverageTemperature"))

    #Joining after all the aggregations so the data has lat/lon plus country-level aggregations of monthly temperatures
    df_temperature_data = df_temperature_data.join(df_temperature_lat_lon,
                                                   on="Country",
                                                   how="inner")

    #Getting the regions mapping so it can be used with the immigration fact table
    df_regions_data = spark_context.createDataFrame(
        list(map(list, cit_and_res_codes.items())),
        ["cit_res_code", "cit_res_name"])

    df_regions_data = df_regions_data.withColumn("Country",
                                                 df_regions_data.cit_res_name)

    df_regions_data = df_regions_data.join(df_temperature_data,
                                           on="Country",
                                           how="left")

    #Updating the final definition of regions data
    df_regions_data = df_regions_data.selectExpr("cit_res_code",\
                                                 "cit_res_name",\
                                                 "Latitude as latitude",\
                                                 "Longitude as longitude",\
                                                 "January as temp_january",\
                                                 "February as temp_february",\
                                                 "March as temp_march",\
                                                 "April as temp_april",\
                                                 "May as temp_may",\
                                                 "June as temp_june",\
                                                 "July as temp_july",\
                                                 "August as temp_august",\
                                                 "September as temp_september",\
                                                 "October as temp_october",\
                                                 "November as temp_november",\
                                                 "December as temp_december")

    #Running quality tests for regions dimension table
    print("\nRunning quality test for regions_dm table")
    check_unique_keys(df_regions_data, "regions_dm", "cit_res_code")
    ensure_no_nulls(df_regions_data, "cit_res_code")
    check_greater_that_zero(df_regions_data)

    #outputting the regions data
    df_regions_data\
    .write.parquet(output_path + "regions", 'overwrite')
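
The cast_lat_lon helper used above is defined elsewhere in the project. The Latitude and Longitude columns in GlobalLandTemperaturesByCity are strings such as "57.05N" or "10.33W", so a plausible sketch, offered only as an assumption, strips the hemisphere suffix and returns a signed float.

from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType


@udf(FloatType())
def cast_lat_lon(value):
    #"57.05N" becomes 57.05 and "10.33W" becomes -10.33; nulls pass through unchanged
    if value is None:
        return None
    value = value.strip()
    sign = -1.0 if value[-1] in ("S", "W") else 1.0
    return sign * float(value[:-1])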