def customer_name_mri_match():
    actDf = spark.sql("""
			select dl_file_timestamp,dl_file_prefix,dl_filename,dl_line_no,user_id,title,first_name,last_name,middle_names
			from dl_business.account where trim(title) != '' and trim(first_name) != '' and trim(last_name) != ''
			""")
    actDf = actDf.dropDuplicates([
        'dl_file_prefix', 'dl_file_timestamp', 'dl_line_no', 'title',
        'first_name', 'last_name', 'middle_names'
    ])
    actDf = actDf.withColumn(
        "UPR_TITLE",
        clean_string_udf(F.trim(F.upper(F.col("title"))))).withColumn(
            "UPR_LNAME",
            clean_string_udf(F.trim(F.upper(F.col("last_name"))))).withColumn(
                "UPR_FNAME",
                clean_string_udf(F.trim(F.upper(
                    F.col("first_name"))))).withColumn(
                        "UPR_MNAME",
                        clean_string_udf(F.trim(F.upper(
                            F.col("middle_names")))))
    actDf = actDf.where((F.col("UPR_TITLE").isNotNull())
                        & (F.col("UPR_LNAME").isNotNull())
                        & (F.col("UPR_FNAME").isNotNull()))
    actDf.createOrReplaceTempView("account_clean_data")
    actlocDF = spark.sql("""
			select a.*,concat_ws('',a.UPR_TITLE,a.UPR_FNAME,a.UPR_LNAME,a.UPR_MNAME) as NAME_STR,b.orig_addr_lines,b.std_addr_lines,b.orig_post_code,b.std_post_code 
			from account_clean_data a,location_match_data b 
			where a.dl_file_timestamp = b.dl_file_timestamp and a.dl_file_prefix = b.dl_file_prefix and a.dl_line_no = b.dl_line_no
			""")
    #actlocDFMRI = fuzzy_match_rate_idx(actlocDF.toPandas(),['NAME_STR'])
    actlocDFMRI = actlocDF.withColumn("NAME_STR_MRI",
                                      name_mrc_udf(F.col("NAME_STR")))
    print("Names Match rating codex generation step is done")
    actlocDFMRI.createOrReplaceTempView("account_location_data")
    actlocGrpDF = spark.sql("""
			select NAME_STR_MRI,count(*) 
			from account_location_data 
			group by NAME_STR_MRI
			""")
    #actlocMatchDF = actlocDF.join(actlocGrpDF,(actlocDF.UPR_TITLE == actlocGrpDF.UPR_TITLE) & (actlocDF.UPR_LNAME == actlocGrpDF.UPR_LNAME) & (actlocDF.UPR_FNAME == actlocGrpDF.UPR_FNAME) & (actlocDF.UPR_MNAME == actlocGrpDF.UPR_MNAME)).select(actlocDF['UPR_TITLE'],actlocDF['UPR_FNAME'],actlocDF['UPR_LNAME'],actlocDF['UPR_MNAME'],actlocDF['user_id'],actlocDF['title'],actlocDF['first_name'],actlocDF['last_name'],actlocDF['middle_names'],actlocDF['orig_addr_lines'],actlocDF['std_addr_lines'],actlocDF['orig_post_code'],actlocDF['std_post_code'],actlocDF['dl_file_timestamp'],actlocDF['dl_file_prefix'],actlocDF['dl_filename'],actlocDF['dl_line_no'])
    actlocMatchDF = actlocDFMRI.join(
        actlocGrpDF,
        (actlocDFMRI.NAME_STR_MRI == actlocGrpDF.NAME_STR_MRI)).select(
            actlocDFMRI['UPR_TITLE'], actlocDFMRI['UPR_FNAME'],
            actlocDFMRI['UPR_LNAME'], actlocDFMRI['UPR_MNAME'],
            actlocDFMRI['user_id'], actlocDFMRI['title'],
            actlocDFMRI['first_name'], actlocDFMRI['last_name'],
            actlocDFMRI['middle_names'], actlocDFMRI['NAME_STR_MRI'],
            actlocDFMRI['orig_addr_lines'], actlocDFMRI['std_addr_lines'],
            actlocDFMRI['orig_post_code'], actlocDFMRI['std_post_code'],
            actlocDFMRI['dl_file_timestamp'], actlocDFMRI['dl_file_prefix'],
            actlocDFMRI['dl_filename'], actlocDFMRI['dl_line_no'])
    finalExactMatDF = actlocMatchDF.select(
        'UPR_TITLE', 'UPR_LNAME', 'UPR_FNAME', 'UPR_MNAME', 'std_addr_lines',
        'std_post_code', 'user_id', 'title', 'last_name', 'first_name',
        'middle_names', 'NAME_STR_MRI', 'orig_addr_lines', 'orig_post_code',
        'dl_filename', 'dl_file_timestamp', 'dl_file_prefix', 'dl_line_no')
    finalExactMatDF.persist()
    finalExactMatDF.createOrReplaceTempView("identity_stage_data")
    print("Names string match step based on match rating codex is done")
def etl_us_cities_demographics(spark, input_dir, output_dir):
    """Clean the us cities demograpgics data"""
    # this data set is clean
    # load data
    data_input_full_file_path = f'{input_dir}/us-cities-demographics.csv'
    us_cities_demographics_spark_df = spark.read \
        .format('csv') \
        .options(header='true', inferSchema='true', encoding="ISO-8859-1", sep=';') \
        .load(data_input_full_file_path)

    us_cities_demographics_spark_df = us_cities_demographics_spark_df \
        .withColumnRenamed("City", "city") \
        .withColumnRenamed("State", "state") \
        .withColumnRenamed("Median Age", "median_age") \
        .withColumnRenamed("Male Population", "male_population") \
        .withColumnRenamed("Female Population", "female_population") \
        .withColumnRenamed("Total Population", "total_population") \
        .withColumnRenamed("Number of Veterans", "num_of_veterans") \
        .withColumnRenamed("Foreign-born", "foreign_born") \
        .withColumnRenamed("Average Household Size", "avg_house_size") \
        .withColumnRenamed("State Code", "state_code") \
        .withColumnRenamed("Race", "race") \
        .withColumnRenamed("Count", "count") \
        .withColumn('city_state_code', F.concat_ws(', ', F.upper(F.col('city')), F.upper(F.col('state_code'))))

    data_output_full_file_path = f'{output_dir}/us-cities-demographics.parquet'
    us_cities_demographics_spark_df \
        .write \
        .options(encoding="ISO-8859-1") \
        .mode('overwrite') \
        .parquet(data_output_full_file_path)
Example #3
def filter_fiscal_year_and_month(
        df: DataFrame,
        fiscal_year: str,
        month_abbr: str,
        date_fmt: str = 'MM/dd/yyyy',
        filter_column_year: str = 'voucher_creation_date',
        filter_column_month: str = 'shipment_pickup_date') -> DataFrame:
    filtered = (
        df
        .withColumn(
            '_fiscal_year',
            F.coalesce(
                F.year(F.add_months(
                    F.to_date(filter_column_year, date_fmt), 3)),
                F.year(F.add_months(
                    F.to_date(filter_column_month, date_fmt), 3))
            ))
        .withColumn(
            '_month_abbr',
            F.coalesce(
                F.upper(F.date_format(
                    F.to_date(filter_column_year, date_fmt), 'MMM')),
                F.upper(F.date_format(
                    F.to_date(filter_column_month, date_fmt), 'MMM'))
            ))
        .where(F.col('_fiscal_year') == fiscal_year)
        .where(F.col('_month_abbr') == month_abbr)
    )  # yapf: disable
    return filtered
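A hypothetical call, assuming df carries the two default date columns in MM/dd/yyyy format:

# Keep rows booked in fiscal year 2021 with month abbreviation JUN
# (illustrative values; column names follow the defaults declared above).
filtered_df = filter_fiscal_year_and_month(df, fiscal_year='2021', month_abbr='JUN')
filtered_df.select('voucher_creation_date', '_fiscal_year', '_month_abbr').show(5)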
Example #4
def format_output(df):
    df = df.withColumn("uniqueKey",
                       f.upper(f.concat(f.lit("RY"),
                                        f.substring(f.col('year'), 3, 2),
                                        f.lit("_"),
                                        f.col("channel"),
                                        f.lit("_"),
                                        f.col("division"),
                                        f.lit("_"),
                                        f.col("gender"),
                                        f.lit("_"),
                                        f.col("category"),
                                        ))) \
        .withColumn("channel", f.upper(f.col("channel"))) \
        .withColumn("year", f.concat(f.lit("RY"), f.substring(f.col('year'), 3, 2))) \
        .withColumn("week_1", f.concat(f.lit("W"), f.col("week")))

    output = df.orderBy("week").groupBy('uniqueKey', 'division', 'gender', 'category', 'channel', 'year').agg(
        f.to_json(
            f.collect_list(
                f.create_map('week_1', 'netSales')
            )
        ).alias('Net Sales'),
        f.to_json(
            f.collect_list(
                f.create_map('week_1', 'salesUnits')
            )
        ).alias('Sales Units')
    )

    return output
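For reference, to_json over collect_list(create_map(...)) produces one JSON array of single-key objects per group, so the 'Net Sales' column ends up looking roughly like this (values are made up):

# [{"W1":1200.5},{"W2":980.0},{"W3":1103.25}]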
def customer_name_exact_match(locMatchDF):
    locMatchDF.createOrReplaceTempView("location_match_data")
    actDf = spark.sql("""
						select dl_file_timestamp,dl_file_prefix,dl_filename,dl_line_no,user_id,title,first_name,last_name,middle_names
						from dl_business.account where trim(title) != '' and trim(first_name) != '' and trim(last_name) != ''
						""")
    actDf = actDf.dropDuplicates([
        'dl_file_prefix', 'dl_file_timestamp', 'dl_line_no', 'title',
        'first_name', 'last_name', 'middle_names'
    ])
    actDf = actDf.withColumn(
        "UPR_TITLE",
        clean_string_udf(F.trim(F.upper(F.col("title"))))).withColumn(
            "UPR_LNAME",
            clean_string_udf(F.trim(F.upper(F.col("last_name"))))).withColumn(
                "UPR_FNAME",
                clean_string_udf(F.trim(F.upper(
                    F.col("first_name"))))).withColumn(
                        "UPR_MNAME",
                        clean_string_udf(F.trim(F.upper(
                            F.col("middle_names")))))
    actDf = actDf.where((F.col("UPR_TITLE").isNotNull())
                        & (F.col("UPR_LNAME").isNotNull())
                        & (F.col("UPR_FNAME").isNotNull()))
    actDf.createOrReplaceTempView("account_clean_data")
    actlocDF = spark.sql("""
					select a.*,b.orig_addr_lines,b.std_addr_lines,b.orig_post_code,b.std_post_code 
					from account_clean_data a,location_match_data b 
					where a.dl_file_timestamp = b.dl_file_timestamp and a.dl_file_prefix = b.dl_file_prefix and a.dl_line_no = b.dl_line_no
					""")
    actlocDF.createOrReplaceTempView("account_location_data")
    actlocGrpDF = spark.sql("""
					select UPR_TITLE,UPR_LNAME,UPR_FNAME,UPR_MNAME,count(*) 
					from account_location_data 
					group by UPR_TITLE,UPR_LNAME,UPR_FNAME,UPR_MNAME
					""")
    actlocMatchDF = actlocDF.join(
        actlocGrpDF, (actlocDF.UPR_TITLE == actlocGrpDF.UPR_TITLE) &
        (actlocDF.UPR_LNAME == actlocGrpDF.UPR_LNAME) &
        (actlocDF.UPR_FNAME == actlocGrpDF.UPR_FNAME) &
        (actlocDF.UPR_MNAME == actlocGrpDF.UPR_MNAME)).select(
            actlocDF['UPR_TITLE'], actlocDF['UPR_FNAME'],
            actlocDF['UPR_LNAME'], actlocDF['UPR_MNAME'], actlocDF['user_id'],
            actlocDF['title'], actlocDF['first_name'], actlocDF['last_name'],
            actlocDF['middle_names'], actlocDF['orig_addr_lines'],
            actlocDF['std_addr_lines'], actlocDF['orig_post_code'],
            actlocDF['std_post_code'], actlocDF['dl_file_timestamp'],
            actlocDF['dl_file_prefix'], actlocDF['dl_filename'],
            actlocDF['dl_line_no'])
    nameMatDF = actlocMatchDF.select(
        'UPR_TITLE', 'UPR_LNAME', 'UPR_FNAME', 'UPR_MNAME', 'std_addr_lines',
        'std_post_code', 'user_id', 'title', 'last_name', 'first_name',
        'middle_names', 'orig_addr_lines', 'orig_post_code', 'dl_filename',
        'dl_file_timestamp', 'dl_file_prefix', 'dl_line_no')
    #nameMatDF.persist()
    #nameMatDF.createOrReplaceTempView("identity_stage_data")
    print("Names string match step based on string exact match is done")
    return nameMatDF
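A hypothetical call site, assuming locMatchDF comes from an earlier address-matching step and exposes the dl_* keys plus the address columns referenced above; the commented-out lines suggest the result is then staged for later steps:

nameMatDF = customer_name_exact_match(locMatchDF)
nameMatDF.persist()
nameMatDF.createOrReplaceTempView("identity_stage_data")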
Example #6
def process(df, params=None, log=None):
    df_transformed = (
        df
        .select(
            col('id'),
            upper(col('first_name')).alias("first_name"),
            upper(col('second_name')).alias("second_name"),
            col('floor')))

    return df_transformed
Example #7
def clean_centerline(centerline):
    '''
    PREPROCESSING:
    This function takes the centerline data as input.
    Its role is to clean the centerline file and split compound house numbers into two columns (one pair for each L and R combination).
    Where FULL_STREE and ST_LABEL are identical, ST_LABEL is replaced with '0' to avoid the OR condition during the join.
    We return: the centerline file after preprocessing.
    '''

    centerline = centerline.select('PHYSICALID', 'L_LOW_HN', 'L_HIGH_HN',
                                   'R_LOW_HN', 'R_HIGH_HN', 'FULL_STREE',
                                   'ST_LABEL', 'BOROCODE')
    centerline = centerline.na.drop(subset=[
        'PHYSICALID', 'L_LOW_HN', 'L_HIGH_HN', 'R_LOW_HN', 'R_HIGH_HN',
        'FULL_STREE', 'ST_LABEL', 'BOROCODE'
    ])
    centerline = centerline.withColumn(
        'FULL_STREE',
        F.upper(F.col('FULL_STREE'))).withColumn('ST_LABEL',
                                                 F.upper(F.col('ST_LABEL')))

    house = F.split(centerline['L_LOW_HN'], '-')
    centerline = centerline.withColumn('L_LOW_HN_1',
                                       house.getItem(0).cast('int'))
    centerline = centerline.withColumn('L_LOW_HN_2',
                                       house.getItem(1).cast('int'))

    house = F.split(centerline['L_HIGH_HN'], '-')
    centerline = centerline.withColumn('L_HIGH_HN_1',
                                       house.getItem(0).cast('int'))
    centerline = centerline.withColumn('L_HIGH_HN_2',
                                       house.getItem(1).cast('int'))

    house = F.split(centerline['R_LOW_HN'], '-')
    centerline = centerline.withColumn('R_LOW_HN_1',
                                       house.getItem(0).cast('int'))
    centerline = centerline.withColumn('R_LOW_HN_2',
                                       house.getItem(1).cast('int'))

    house = F.split(centerline['R_HIGH_HN'], '-')
    centerline = centerline.withColumn('R_HIGH_HN_1',
                                       house.getItem(0).cast('int'))
    centerline = centerline.withColumn('R_HIGH_HN_2',
                                       house.getItem(1).cast('int'))

    print(
        "Done performing preprocessing for Centerline, now moving to the conditional joins part"
    )

    centerline = centerline.withColumn(
        'ST_LABEL',
        F.when(centerline['FULL_STREE'] == centerline['ST_LABEL'],
               '0').otherwise(centerline['ST_LABEL']))

    return (centerline)
Example #8
def run_pipeline():
    spark = get_session()
    us_immg_df = process_immigration_data(spark, 'sas_data')
    us_dem_data, race_counts = get_demographics_info(
        spark, "us-cities-demographics.csv")
    us_immg_df=us_immg_df.join(us_dem_data, (upper(us_immg_df.port_city)==upper(us_dem_data.City)) & \
                                           (upper(us_immg_df.port_state)==upper(us_dem_data.State_Code)), how='left')
    us_immg_df.count()
    us_immg_df = us_immg_df.drop("City", "State_Code")

    us_immg_df.drop('arrival_date').write.mode("overwrite").parquet(
        'i94-immigration-data.parquet')
    qa_checks(us_dem_data, race_counts, us_immg_df)
Example #9
def pre_processing(output_uri):

    spark = SparkSession.builder.appName("process sample data").getOrCreate()
    rdd = spark.sparkContext.parallelize([
        (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
        (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
        (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
    ])
    df = spark.createDataFrame(rdd, schema=['a', 'b', 'c', 'd', 'e'])
    spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
    # upper(df.c) is a Column expression, just like df.c and df.c.isNull()
    df_new = df.withColumn('upper_c', upper(df.c))
    df_new.repartition(1).write.option("header", "true").mode("overwrite").csv(output_uri)
Example #10
def getTemperature(input_loc):
    """ Read Global Temperature information
    
    Args:
      input_loc      : input folder in EMR HDFS
      
    Returns:
      df_temperature : dataframe contains Global Temperature
    
    """
    filePath = os.path.join(input_loc, 'GlobalLandTemperaturesByCity.csv')
    df_temperature = spark.read.csv(filePath, header=True, inferSchema=True)

    # Remove all dates prior to 2010
    df_temperature = df_temperature.filter(df_temperature.dt >= '2010-01-01')

    # Aggregate by Country
    df_temperature = df_temperature.groupby(["Country"]).agg(
        avg("AverageTemperature").alias("AverageTemperature"),
        first("Latitude").alias("Latitude"),
        first("Longitude").alias("Longitude"))

    # uppercase Country so later could be joined with Country dimension table
    df_temperature = df_temperature.withColumn('Country',upper(col('Country'))) \
                         .withColumn('AverageTemperature',round('AverageTemperature',2))

    return df_temperature
def etl_airport_code(spark, input_dir, output_dir):
    """Clean the airport code data"""

    # load data
    airport_code_data_input_full_file_path = f'{input_dir}/airport-codes_csv.csv'
    airport_code_spark_df = spark.read \
        .format('csv') \
        .options(header='true', inferSchema='true', encoding="ISO-8859-1") \
        .load(airport_code_data_input_full_file_path)

    airport_code_spark_df = airport_code_spark_df \
        .withColumnRenamed('name', 'airport_name') \
        .filter(F.col('iso_country') == 'US')

    # split iso_region column into Latitude and Longitude
    split_iso_region = F.split(airport_code_spark_df['iso_region'], '-')
    airport_code_spark_df = airport_code_spark_df \
        .withColumn('region', split_iso_region.getItem(1)) \
        .withColumn('municipality_region', F.concat_ws(', ', F.upper(F.col('municipality')), F.upper(F.col('region'))))

    new_airport_code_spark_df = airport_code_spark_df \
        .drop('iso_region') \
        .drop('coordinates')

    data_output_full_file_path = f'{output_dir}/airport-codes.parquet'
    new_airport_code_spark_df \
        .write \
        .options(encoding="ISO-8859-1") \
        .mode('overwrite') \
        .parquet(data_output_full_file_path)
Example #12
def clean_violations(violations):
    
    violations = violations.na.drop(subset=['Street Name','House Number','Violation County','Issue Date'])
    violations = violations.select('House Number','Street Name','Violation County', 'Issue Date')
    violations = violations.withColumn('Street Name', F.upper(F.col('Street Name')))
    violations = violations.withColumn("House Number", F.regexp_replace(F.col("House Number"), "[A-Z]", ""))
    split_year = F.split(violations['Issue Date'],'/')
    violations = violations.withColumn('Year',split_year.getItem(2)).drop('Issue Date')
    split_col = F.split(violations['House Number'],'-')
    violations = violations.withColumn('House_Num1',split_col.getItem(0).cast('int'))
    violations = violations.withColumn('House_Num2',split_col.getItem(1).cast('int'))

    boroughs = {'MAN':'1', 'MH':'1', 'MN':'1', 'NEWY':'1', 'NEW Y':'1', 'NY':'1',
           'BRONX':'2','BX':'2', 'PBX':'2',
           'BK':'3', 'K':'3', 'KING':'3', 'KINGS':'3',
           'Q':'4', 'QN':'4', 'QNS':'4', 'QU':'4','QUEEN':'4',
           'R':'5', 'RICHMOND':'5'}
    violations = violations.replace(boroughs, subset='Violation County')

    violations.createOrReplaceTempView('violations')
    violations = spark.sql('SELECT * FROM violations WHERE Year >= 2015 AND Year <= 2019')
    # violations = violations.groupby('House Number','Street Name','Violation County','Year','House_Num1','House_Num2').count()
    print("Done performing preprocessing for Violations, now moving to Centerline")
    
    return(violations)
Example #13
def clean_centerline(centerline):
    
    centerline = centerline.select('PHYSICALID','L_LOW_HN','L_HIGH_HN', 'R_LOW_HN','R_HIGH_HN','FULL_STREE','ST_LABEL','BOROCODE')
    centerline = centerline.na.drop(subset=['PHYSICALID','L_LOW_HN','L_HIGH_HN', 'R_LOW_HN','R_HIGH_HN','FULL_STREE','ST_LABEL','BOROCODE'])
    centerline = centerline.withColumn('FULL_STREE', F.upper(F.col('FULL_STREE'))).withColumn('ST_LABEL', F.upper(F.col('ST_LABEL')))

    split_col = F.split(centerline['L_LOW_HN'], '-')
    centerline = centerline.withColumn('L_LOW_HN_1', split_col.getItem(0).cast('int'))
    centerline = centerline.withColumn('L_LOW_HN_2', split_col.getItem(1).cast('int'))

    split_col = F.split(centerline['L_HIGH_HN'], '-')
    centerline = centerline.withColumn('L_HIGH_HN_1', split_col.getItem(0).cast('int'))
    centerline = centerline.withColumn('L_HIGH_HN_2', split_col.getItem(1).cast('int'))

    split_col = F.split(centerline['R_LOW_HN'], '-')
    centerline = centerline.withColumn('R_LOW_HN_1', split_col.getItem(0).cast('int'))
    centerline = centerline.withColumn('R_LOW_HN_2', split_col.getItem(1).cast('int'))

    split_col = F.split(centerline['R_HIGH_HN'], '-')
    centerline = centerline.withColumn('R_HIGH_HN_1', split_col.getItem(0).cast('int'))
    centerline = centerline.withColumn('R_HIGH_HN_2', split_col.getItem(1).cast('int'))

    print("Done performing preprocessing for Centerline, now moving to the conditional joins part")
    
    return(centerline)
Example #14
def clean_violations(violations):
    '''
    PREPROCESSING:
    This function takes the violations data (all five files) as input.
    In this function, we clean the violations table and drop the unwanted columns.
    We split the house number into two columns to accommodate compound house numbers.
    We return: the violations file after pivoting on the years to reduce the number of rows.
    '''

    violations = violations.na.drop(subset=['Street Name','House Number','Violation County','Issue Date'])
    violations = violations.select('House Number','Street Name','Violation County', 'Issue Date')
    violations = violations.withColumn('Street Name', F.upper(F.col('Street Name')))
    violations = violations.withColumn("House Number", F.regexp_replace(F.col("House Number"), "[A-Z]", ""))
    split_year = F.split(violations['Issue Date'],'/')
    violations = violations.withColumn('Year',split_year.getItem(2)).drop('Issue Date')
    split_col = F.split(violations['House Number'],'-')
    violations = violations.withColumn('House_Num1',split_col.getItem(0).cast('int'))
    violations = violations.withColumn('House_Num2',split_col.getItem(1).cast('int'))

    boroughs = {'MAN':'1', 'MH':'1', 'MN':'1', 'NEWY':'1', 'NEW Y':'1', 'NY':'1',
           'BRONX':'2','BX':'2', 'PBX':'2',
           'BK':'3', 'K':'3', 'KING':'3', 'KINGS':'3',
           'Q':'4', 'QN':'4', 'QNS':'4', 'QU':'4','QUEEN':'4',
           'R':'5', 'RICHMOND':'5'}
    violations = violations.replace(boroughs, subset='Violation County')

    violations.createOrReplaceTempView('violations')
    violations = spark.sql('SELECT * FROM violations WHERE Year >= 2015 AND Year <= 2019')

    violations_pivot = violations.groupby('Violation County','Street Name','House Number','House_Num1','House_Num2').pivot('Year',["2015","2016","2017","2018","2019"]).count().cache()
    
    print("Done performing preprocessing for Violations, now moving to Centerline")
    
    return(violations_pivot)
Example #15
def transform_read_centerline_data(df: DataFrame) -> DataFrame:
    """Transforming centerline data to make it joinable, below are the things steps in high level

    1. Converted ST_LABEL & FULL_STREE to upper case
    2. Converted L_LOW_HN & L_HIGH_HN  separated by '-' for odd house number
    3. Converted R_LOW_HN & R_HIGH_HN  separated by '-' for even house number
    4. Removed any data having no house number in L_LOW_HN and R_LOW_HN
    """
    df = (df.select("PHYSICALID", "BOROCODE", "FULL_STREE", "ST_NAME",
                    "L_LOW_HN", "L_HIGH_HN", "R_LOW_HN", "R_HIGH_HN").orderBy(
                        "PHYSICALID", "BOROCODE", "FULL_STREE", "ST_NAME",
                        "L_LOW_HN", "L_HIGH_HN", "R_LOW_HN",
                        "R_HIGH_HN").coalesce(200).withColumn(
                            "ST_NAME", F.upper(F.col("ST_NAME"))).withColumn(
                                "FULL_STREE",
                                F.upper(F.col("FULL_STREE"))).filter(
                                    (F.col("L_LOW_HN").isNotNull())
                                    | (F.col("R_LOW_HN").isNotNull())))
    df = df.withColumn("L_TEMP_ODD", F.split("L_LOW_HN", "-")).withColumn(
        "L_LOW_HN",
        F.col("L_TEMP_ODD").getItem(0).cast("int") +
        F.when(F.col("L_TEMP_ODD").getItem(1).isNull(), "0").otherwise(
            F.col("L_TEMP_ODD").getItem(1)).cast("int") / 1000,
    )

    df = df.withColumn("L_TEMP_ODD", F.split("L_HIGH_HN", "-")).withColumn(
        "L_HIGH_HN",
        F.col("L_TEMP_ODD").getItem(0).cast("int") +
        F.when(F.col("L_TEMP_ODD").getItem(1).isNull(), "0").otherwise(
            F.col("L_TEMP_ODD").getItem(1)).cast("int") / 1000,
    )

    df = df.withColumn("L_TEMP_ODD", F.split("R_LOW_HN", "-")).withColumn(
        "R_LOW_HN",
        F.col("L_TEMP_ODD").getItem(0).cast("int") +
        F.when(F.col("L_TEMP_ODD").getItem(1).isNull(), "0").otherwise(
            F.col("L_TEMP_ODD").getItem(1)).cast("int") / 1000,
    )

    df = df.withColumn("L_TEMP_ODD", F.split("R_HIGH_HN", "-")).withColumn(
        "R_HIGH_HN",
        F.col("L_TEMP_ODD").getItem(0).cast("int") +
        F.when(F.col("L_TEMP_ODD").getItem(1).isNull(), "0").otherwise(
            F.col("L_TEMP_ODD").getItem(1)).cast("int") / 1000,
    )

    return df
Example #16
def city(cName):
    if cName == "MAN" or cName == "MH" or cName == "MN" or cName == "NEWY" or cName == "NEW Y" or cName == "NY":
        return "1"
    elif cName == "BX" or cName == "BRONX":
        return "2"
    elif cName == "BK" or cName == "K" or cName == "KING" or cName == "KINGS":
        return "3"
    elif cName == "Q" or cName == "QN" or cName == "QNS" or cName == "QU" or cName == "QUEEN":
        return "4"
    elif cName == "R" or cName == "RICHMOND":
        return "5"
    else:
        return "*!NULL!*"

# register for Spark SQL and wrap as a DataFrame UDF
spark.udf.register("city", city)

city = func.udf(city)

df1 = df1.select(
    func.col("House Number").cast("int").alias("House Number"),
    func.upper(func.col("Street Name")).alias("Street Name"),
    city(func.col("Violation County")).cast("int").alias("BOROCODE")
)

df2 = df2.select(
    func.col("PHYSICALID").cast("int").alias("PHYSICALID"),
    func.upper(func.col("FULL_STREE")).alias("FULL_STREE"),
    func.upper(func.col("ST_LABEL")).alias("ST_LABEL"),
    func.col("BOROCODE").cast("int").alias("BOROCODE"),
    func.col("L_LOW_HN").cast("int").alias("L_LOW_HN"),
    func.col("L_HIGH_HN").cast("int").alias("L_HIGH_HN"),
    func.col("R_LOW_HN").cast("int").alias("R_LOW_HN"),
    func.col("R_HIGH_HN").cast("int").alias("R_HIGH_HN")
)

# join on borough code and street name (ST_LABEL or FULL_STREE), then keep rows
# whose house number falls in the odd (L_*) or even (R_*) range
df1.join(
    df2, (df1["BOROCODE"] == df2["BOROCODE"]) &
    ((df1["Street Name"] == df2["ST_LABEL"]) | (df1["Street Name"] == df2["FULL_STREE"]))).filter(
        (
            (func.col("House Number") % 2 != 0) &
            (func.col("House Number") >= func.col("L_LOW_HN")) &
            (func.col("House Number") <= func.col("L_HIGH_HN"))
        ) | (
            (func.col("House Number") % 2 == 0) &
            (func.col("House Number") >= func.col("R_LOW_HN")) &
            (func.col("House Number") <= func.col("R_HIGH_HN"))
        )).count()
Example #17
def main():
    """ mathod to get TCIA data file list"""
    bktname = "dataengexpspace"
    datalist = "s3://dataengexpspace/data/TCIAData/ACRIN-DSC-MR-Brain.json"

    conf = SparkConf().setAppName("Prepare_TCIAData_list")
    sctx = SparkContext(conf=conf).getOrCreate()
    sqlctx = SQLContext(sctx)
    tciadf = sqlctx.read.json(datalist)
    tmpdf = tciadf.na.fill("NULL")
    tmpdf = tmpdf.withColumn('ShortStudyUID', col('StudyInstanceUID').substr(60, 5))
    tmpdf = tmpdf.withColumn('ShortInstanceUID', col('SeriesInstanceUID').substr(60, 5))
    tmpdf = tmpdf.withColumn('SeriesDescription', upper(col('SeriesDescription')))
    tmpdf = tmpdf.withColumn('SeriesDescription', regexp_replace('SeriesDescription',
                                                                 r'[(]', ''))
    tmpdf = tmpdf.withColumn('SeriesDescription', regexp_replace('SeriesDescription',
                                                                 r'[)]', ''))
    tmpdf = tmpdf.withColumn('SeriesDescription', regexp_replace('SeriesDescription',
                                                                 r'[*]', ''))
    tmpdf = tmpdf.withColumn('SeriesDescription', regexp_replace('SeriesDescription',
                                                                 r'[+]', ''))
    tmpdf = tmpdf.withColumn('SeriesDescription', regexp_replace('SeriesDescription',
                                                                 r'\s+', '_'))
    tmpdf = tmpdf.withColumn('SeriesDescription', regexp_replace('SeriesDescription',
                                                                 r'[?]*[/]', 'over'))
    tmpdf = tmpdf.withColumn('SeriesDescription', regexp_replace('SeriesDescription',
                                                                 r'[/]{1}', ''))
    tmpdf = tmpdf.withColumn('SeriesDescription', regexp_replace('SeriesDescription',
                                                                 r'[?]$', ''))
    tmpdf = tmpdf.withColumn('SeriesDescription', regexp_replace('SeriesDescription',
                                                                 r'^[_]', ''))
    tmpdf = tmpdf.withColumn('SeriesDescription', regexp_replace('SeriesDescription',
                                                                 r'[_]$', ''))

    tmpdf = tmpdf.withColumn('S3objkey', concat(lit('data/TCIAData_p3/'),
                                                col('Collection'), lit('/'),
                                                col('PatientID'), lit('/'),
                                                col('SeriesDate'), lit('-'),
                                                col('ShortStudyUID'), lit('/'),
                                                col('SeriesNumber'), lit('-'),
                                                col('SeriesDescription'), lit('-'),
                                                col('ShortInstanceUID'), lit('/blob.zip')
                                               )
                             )
    tmpdf = tmpdf.select('SeriesInstanceUID', 'S3objkey')
    tmpdf.show(5)
    tmpdf.write.save('file:///tmp/tmpdfjson', format='json', mode='overwrite')
    s3client = boto3.client('s3',
                            aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
                            aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'))

    tmpname = glob.glob('/tmp/tmpdfjson/*.json')[0]
    print(tmpname)
    response = s3client.upload_file(tmpname, bktname,
                                    "data/TCIAData/metadata/filelist_p3.json")
    if response is not None:
        print(">>>>>>> Upload problem for file: ", tmpname)
    print("remove tmp file: {}".format(tmpname))
    os.remove(tmpname)
Example #18
    def upper_column(self, column):
        try:
            self.spark_df = self.spark_df.withColumn('temp', f.upper(f.col(column))).drop(column)\
                .withColumnRenamed('temp', column)
            return self.get_json_df_response()
        except Exception as e:
            print(e)
            return None
Example #19
def MyTransform(glueContext, dfc) -> DynamicFrameCollection:
    import pyspark.sql.functions as F
    # convert dynamic frame to data frame
    df = dfc.select(list(dfc.keys())[0]).toDF()
    df = df.withColumn("title", F.upper(F.col("title")))
    # create dynamic frame from dataframe
    upperDf = DynamicFrame.fromDF(df, glueContext, "filter_votes")
    return (DynamicFrameCollection({"CustomTransform0": upperDf}, glueContext))
def upper_columns(df: DataFrame, cols: list) -> DataFrame:
    new_cols = []
    for field in df.schema.fields:
        if field.dataType == T.StringType() and field.name in cols:
            new_cols.append(F.upper(F.col(field.name)).alias(field.name))
        else:
            new_cols.append(F.col(field.name))
            
    return df.select(*new_cols)
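A hypothetical usage of upper_columns, which uppercases only the string columns named in cols:

# 'city' is a string column and listed, so it is uppercased; the integer 'id' is left alone.
sample = spark.createDataFrame([(1, 'london'), (2, 'paris')], ['id', 'city'])
upper_columns(sample, ['city', 'id']).show()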
def preprocess_names(spark: SparkSession, data_dir = catalog['clean/whitehouse_logs_cleaned']):

    whl_df = spark.read.options(delimiter=',').csv(path=data_dir, header=True, inferSchema=True)

    # created a concatenated Name string for visitor
    whl_df = whl_df.withColumn('VISITOR_NAME', upper(concat_ws(' ',
                                                         whl_df.NAMEFIRST,
                                                         whl_df.NAMEMID,
                                                         whl_df.NAMELAST)))

    # created a concatenated Name string for visitee
    whl_df = whl_df.withColumn('VISITEE_NAME',
                               upper(concat_ws(' ',
                                               whl_df.visitee_namefirst,
                                               whl_df.visitee_namelast)))
    # persist
    whl_df.write.\
        mode('overwrite').\
        csv(catalog['clean/whitehouse_logs_processed'], header=True)
Example #22
def add_fiscal_year_and_month_abbr(
        df,
        date_fmt: str = 'yyyy/MM/dd',
        filter_column_year: str = 'voucher_creation_date',
        filter_column_month: str = 'shipment_pickup_date') -> DataFrame:
    expr_mapping = {
        '_fiscal_year': (F.coalesce(
            F.year(F.add_months(F.to_date(filter_column_year, date_fmt), 3)),
            F.year(F.add_months(F.to_date(filter_column_month, date_fmt),
                                3)))),
        '_month_abbr': (F.coalesce(
            F.upper(
                F.date_format(F.to_date(filter_column_year, date_fmt), 'MMM')),
            F.upper(
                F.date_format(F.to_date(filter_column_month, date_fmt),
                              'MMM'))))
    }
    select_expr = build_col_expr(expr_mapping)
    transformed = df.select(F.expr('*'), *select_expr)
    return transformed
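add_fiscal_year_and_month_abbr relies on a build_col_expr helper defined elsewhere; a minimal sketch under the assumption that it simply aliases each mapped expression:

# Hypothetical helper: turn a {column_name: Column} mapping into a list of
# aliased Columns that can be splatted into select().
def build_col_expr(expr_mapping):
    return [expr.alias(name) for name, expr in expr_mapping.items()]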
Example #23
    def _process_marketplace(self, source_df, target_file):
        """
        Process Marketplace
        :param source_df: Spark DataFrame with source data
        :param target_file: name of the file for storing
        """
        marketplace_df = source_df.select(upper(
            col("marketplace"))).dropDuplicates()
        logging.debug(f"Writing {target_file}")
        marketplace_df.coalesce(1).\
            write.csv(path=target_file, sep=';', header=True, mode="overwrite", quote='"', escape='"')
Example #24
def house_number_extract(df):
    #make address_line_1 all uppercase
    df = df.withColumn('address_line_1', f.upper('address_line_1'))
    
    #extract house number or box number into column housenumber
    df = df.withColumn('housenumber',
                      f.when(
                          f.col('address_line_1').rlike('^[A-Z]{2}'),
                          f.regexp_extract(f.col('address_line_1'),'(BOX\\s)([0-9]+[0-9A-Z.*-]*)', 2))
                       .otherwise(f.regexp_extract(f.col('address_line_1'),'^([A-Z]*[0-9]+[0-9A-Z.*-]*)', 1)))
    return df
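A hypothetical check of the two regex branches in house_number_extract:

# 'PO BOX 123 SPRINGFIELD' starts with two letters, so the BOX branch extracts '123';
# '45A MAIN ST' falls through to the leading-number branch and yields '45A'.
sample = spark.createDataFrame(
    [('PO Box 123 Springfield',), ('45a Main St',)], ['address_line_1'])
house_number_extract(sample).select('address_line_1', 'housenumber').show(truncate=False)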
Example #25
def getHotelCityData(sparkSession, sourcePath):
    hotelDimDF = sparkSession.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "false")\
    .option("delimiter", "|")\
    .load(sourcePath)\
    .select("hotel_id", "city_code")\
    .withColumnRenamed("hotel_id", "mmt_hotel_id")\
    .withColumn("htl_city_code", upper(col("city_code")))\
    .dropDuplicates()

    return hotelDimDF
Example #26
def uppercase_columns(df: DataFrame, col_list: List) -> DataFrame:
    """
    Rewrite the selected columns with upper cases
    :param df: dataframe
    :param col_list: string array of columns to be upper-cased
    :return: dataframe
    """
    for col in col_list:
        df = df.withColumn(col, F.upper(F.col(col)))
        df = df.withColumn(col, F.regexp_replace(F.col(col), 'İ', 'I'))
        df = df.withColumn(col, F.trim(F.col(col)))
    logging.info(f"{col_list} columns are converted to uppercase")
    return df
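A hypothetical call to uppercase_columns, showing why the explicit İ replacement is there:

# Input that already carries the Turkish dotted İ (e.g. 'İstanbul') keeps it after
# upper(), so the regexp_replace folds it to a plain I; ASCII input is just upper-cased and trimmed.
sample = spark.createDataFrame([(' İstanbul ',), ('ankara',)], ['city'])
uppercase_columns(sample, ['city']).show()  # -> ISTANBUL, ANKARA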
Example #27
def getDemographic(input_loc):
    """ Read Demographic information
    
    Args:
      input_loc      : input folder in EMR HDFS
      
    Returns:
      df_demographic : dataframe contains Demographic information
    
    """
    filePath = os.path.join(input_loc, 'us-cities-demographics.csv')
    df_demographic = spark.read.csv(filePath,
                                    sep=";",
                                    header=True,
                                    inferSchema=True)

    # remove rows based on missing values
    df_demographic = df_demographic.na.drop(
        subset=("Male Population", "Female Population", "Number of Veterans",
                "Foreign-born", "Average Household Size"))

    # pivot operation
    groupcol = ('City', 'State', 'Median Age', 'Male Population',
                'Female Population', 'Total Population', 'Number of Veterans',
                'Foreign-born', 'Average Household Size', 'State Code')
    aggrcol = sum('Count')
    df_demographic = df_demographic.groupBy(
        *groupcol).pivot("Race").agg(aggrcol)

    # group by State
    df_demographic = df_demographic.groupBy("State Code","State").agg( \
        avg("Median Age").alias("avg_medianage"),
        sum("Male Population").alias("total_male"),
        sum("Female Population").alias("total_female"),
        sum("Total Population").alias("total_population"),
        sum("Number of Veterans").alias("total_veteran"),
        sum("Foreign-born").alias("total_foreignborn"),
        sum("American Indian and Alaska Native").alias("total_americannative"),
        sum("Asian").alias("total_asian"),
        sum("Black or African-American").alias("total_african"),
        sum("Hispanic or Latino").alias("total_hispanic"),
        sum("White").alias("total_white"),
        min("Average Household Size").alias("min_avghousesize"),
        max("Average Household Size").alias("max_avghousesize")
    )

    # upper case and rounding
    df_demographic = df_demographic.withColumn('State',upper(col('State'))) \
                     .withColumn('avg_medianage',round('avg_medianage',2))

    return df_demographic
Example #28
def transform_parking_violation_data(df: DataFrame,
                                     column: str = "Violation County"
                                     ) -> DataFrame:
    """Transforming parking vialation data to make it joinable, below are the things steps in high level

    1. Added Borocode
    2. Converted house number in case it is separated by '-'
    3. Converted 'Street Name' to upper case
    4. Removed any data having no house number
    """

    df = (df.select(
        "Violation County", "House Number", "Street Name", "Summons Number",
        "Issue Date").distinct().withColumn(
            "year",
            F.year(F.to_date(F.col("Issue Date"), "MM/dd/yyyy"))).orderBy(
                "Violation County", "House Number",
                "Street Name", "year").coalesce(100).groupBy(
                    "Violation County", "House Number", "Street Name",
                    "year").agg({
                        "Summons Number": "count"
                    }).withColumnRenamed(
                        "count(Summons Number)", "total_cnt").withColumn(
                            "BOROCODE",
                            F.when(
                                F.col(column).isin([
                                    "MAN", "MH", "MN", "NEWY", "NEW Y", "NY"
                                ]), 1).when(
                                    F.col(column).isin(["BRONX", "BX"]),
                                    2).when(
                                        F.col(column).isin(
                                            ["BK", "K", "KING", "KINGS"]),
                                        3).when(
                                            F.col(column).isin([
                                                "Q", "QN", "QNS", "QU", "QUEEN"
                                            ]), 4).when(
                                                F.col(column).isin(
                                                    ["R", "RICHMOND"]),
                                                5).otherwise(0),
                        ))

    df = (df.filter(F.col("House Number").isNotNull()).withColumn(
        "temp", F.split("House Number", "-")).withColumn(
            "House Number",
            F.col("temp").getItem(0).cast("int") +
            F.when(F.col("temp").getItem(1).isNull(), "0").otherwise(
                F.col("temp").getItem(1)).cast("int") / 1000,
        ).withColumn("temp",
                     F.col("temp").getItem(0).cast("int")).withColumn(
                         "Street Name", F.upper(F.col("Street Name"))))
    return df
Example #29
def processing_airport_data(spark, output_data):
    """
        Description: 
        
            This function reads the airport code file using the
            spark read method and performs data cleaning, processing,
            and finally saves the cleaned data as parquet file to S3.
              
        Arguments:
        
            spark: spark session.
            output_data: the output root directory to the s3 bucket.
           
        Returns:
            None

    """
    airportCode_df = spark.read.csv("Airportgeocode.csv", sep=',', header=True)

    airportCode_df = airportCode_df.withColumn('iso_region_state',F.upper(split("iso_region","-")[1])).drop("iso_region").\
            withColumn('ElevationFt',col("elevation_ft").cast('integer')).drop("elevation_ft","_c0")

    airportCode_df = airportCode_df.na.fill({
        'ElevationFt': 0,
        'municipality': 'Nil',
        'gps_code': 'Nil',
        'iata_code': 'Nil',
        'local_code': 'Nil'
    })

    airportCode_df = airportCode_df.dropDuplicates(['ident'])

    @udf
    def extractCity(line):
        import re
        x = re.search(r"('city':\s*('*\s*\w*(\s*\w*)*\s*'))", line)
        if x:
            x = x.group(2)
            val = x.replace("'", "").strip()
        else:
            val = 'Nil'
        return val

    airportCode_df = airportCode_df.withColumn("City", extractCity('geocode'))

    airportCode_df = airportCode_df.dropDuplicates(['City'])

    print("writing airportCode_df table to s3 bucket")
    airportCode_df.write.mode("overwrite").parquet(
        os.path.join(output_data, "airportData"))
    print("writing airportCode_df table completed")
Example #30
def cad_to_usd_rate(currency_exchange_rates: DataFrame, fiscal_year: str,
                    month_abbr: str) -> float:
    """Currently returns latest exchange rate for the given month."""
    filtered = (
        currency_exchange_rates
        .where(F.col('currency_code_from') == 'CAD')
        .where(F.col('currency_code_to') == 'USD')
        .where(F.year(F.add_months(F.to_date(
            'effective_date', 'yyyyMMdd'), 3)) == fiscal_year)
        .where(F.upper(F.date_format(F.to_date(
            'effective_date', 'yyyyMMdd'), 'MMM')) == month_abbr)
        .sort('effective_date', ascending=False)
    )  # yapf: disable
    return filtered.first().conversion_rate_multiplier
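A hypothetical call, using the same fiscal-year convention as the filters above (calendar date shifted forward by three months):

# Latest CAD -> USD multiplier booked in fiscal year 2021, month JUN
# (illustrative arguments; currency_exchange_rates is loaded elsewhere).
rate = cad_to_usd_rate(currency_exchange_rates, fiscal_year='2021', month_abbr='JUN')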
from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(2)


# COMMAND ----------

from pyspark.sql.functions import initcap
df.select(initcap(col("Description"))).show()


# COMMAND ----------

from pyspark.sql.functions import lower, upper
df.select(col("Description"),
    lower(col("Description")),
    upper(lower(col("Description")))).show(2)


# COMMAND ----------

from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit("    HELLO    ")).alias("ltrim"),
    rtrim(lit("    HELLO    ")).alias("rtrim"),
    trim(lit("    HELLO    ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)


# COMMAND ----------
Example #32

import pyspark.sql.functions as F

df = sqlContext.createDataFrame([('a', 1), ('b', 2), ('a', 3)], ["key", "value"])
df2 = df.withColumn('key', F.upper(df.key))
df2.groupBy('key').agg(F.avg(df.value)).collect()
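For the three rows above, both upper-cased keys average to 2.0, so collect() returns something like (row order may vary):

# [Row(key='A', avg(value)=2.0), Row(key='B', avg(value)=2.0)]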