def customer_name_mri_match():
    actDf = spark.sql("""
        select dl_file_timestamp, dl_file_prefix, dl_filename, dl_line_no,
               user_id, title, first_name, last_name, middle_names
        from dl_business.account
        where trim(title) != '' and trim(first_name) != '' and trim(last_name) != ''
    """)
    actDf = actDf.dropDuplicates([
        'dl_file_prefix', 'dl_file_timestamp', 'dl_line_no', 'title',
        'first_name', 'last_name', 'middle_names'
    ])
    actDf = actDf.withColumn(
        "UPR_TITLE",
        clean_string_udf(F.trim(F.upper(F.col("title"))))).withColumn(
            "UPR_LNAME",
            clean_string_udf(F.trim(F.upper(F.col("last_name"))))).withColumn(
                "UPR_FNAME",
                clean_string_udf(F.trim(F.upper(F.col("first_name"))))).withColumn(
                    "UPR_MNAME",
                    clean_string_udf(F.trim(F.upper(F.col("middle_names")))))
    actDf = actDf.where((F.col("UPR_TITLE").isNotNull())
                        & (F.col("UPR_LNAME").isNotNull())
                        & (F.col("UPR_FNAME").isNotNull()))
    actDf.createOrReplaceTempView("account_clean_data")
    actlocDF = spark.sql("""
        select a.*,
               concat_ws('', a.UPR_TITLE, a.UPR_FNAME, a.UPR_LNAME, a.UPR_MNAME) as NAME_STR,
               b.orig_addr_lines, b.std_addr_lines, b.orig_post_code, b.std_post_code
        from account_clean_data a, location_match_data b
        where a.dl_file_timestamp = b.dl_file_timestamp
          and a.dl_file_prefix = b.dl_file_prefix
          and a.dl_line_no = b.dl_line_no
    """)
    # actlocDFMRI = fuzzy_match_rate_idx(actlocDF.toPandas(), ['NAME_STR'])
    actlocDFMRI = actlocDF.withColumn("NAME_STR_MRI",
                                      name_mrc_udf(F.col("NAME_STR")))
    print("Names Match rating codex generation step is done")
    actlocDFMRI.createOrReplaceTempView("account_location_data")
    actlocGrpDF = spark.sql("""
        select NAME_STR_MRI, count(*)
        from account_location_data
        group by NAME_STR_MRI
    """)
    # actlocMatchDF = actlocDF.join(
    #     actlocGrpDF,
    #     (actlocDF.UPR_TITLE == actlocGrpDF.UPR_TITLE) &
    #     (actlocDF.UPR_LNAME == actlocGrpDF.UPR_LNAME) &
    #     (actlocDF.UPR_FNAME == actlocGrpDF.UPR_FNAME) &
    #     (actlocDF.UPR_MNAME == actlocGrpDF.UPR_MNAME)).select(
    #         actlocDF['UPR_TITLE'], actlocDF['UPR_FNAME'], actlocDF['UPR_LNAME'],
    #         actlocDF['UPR_MNAME'], actlocDF['user_id'], actlocDF['title'],
    #         actlocDF['first_name'], actlocDF['last_name'], actlocDF['middle_names'],
    #         actlocDF['orig_addr_lines'], actlocDF['std_addr_lines'],
    #         actlocDF['orig_post_code'], actlocDF['std_post_code'],
    #         actlocDF['dl_file_timestamp'], actlocDF['dl_file_prefix'],
    #         actlocDF['dl_filename'], actlocDF['dl_line_no'])
    actlocMatchDF = actlocDFMRI.join(
        actlocGrpDF,
        actlocDFMRI.NAME_STR_MRI == actlocGrpDF.NAME_STR_MRI).select(
            actlocDFMRI['UPR_TITLE'], actlocDFMRI['UPR_FNAME'],
            actlocDFMRI['UPR_LNAME'], actlocDFMRI['UPR_MNAME'],
            actlocDFMRI['user_id'], actlocDFMRI['title'],
            actlocDFMRI['first_name'], actlocDFMRI['last_name'],
            actlocDFMRI['middle_names'], actlocDFMRI['NAME_STR_MRI'],
            actlocDFMRI['orig_addr_lines'], actlocDFMRI['std_addr_lines'],
            actlocDFMRI['orig_post_code'], actlocDFMRI['std_post_code'],
            actlocDFMRI['dl_file_timestamp'], actlocDFMRI['dl_file_prefix'],
            actlocDFMRI['dl_filename'], actlocDFMRI['dl_line_no'])
    finalExactMatDF = actlocMatchDF.select(
        'UPR_TITLE', 'UPR_LNAME', 'UPR_FNAME', 'UPR_MNAME', 'std_addr_lines',
        'std_post_code', 'user_id', 'title', 'last_name', 'first_name',
        'middle_names', 'NAME_STR_MRI', 'orig_addr_lines', 'orig_post_code',
        'dl_filename', 'dl_file_timestamp', 'dl_file_prefix', 'dl_line_no')
    finalExactMatDF.persist()
    finalExactMatDF.createOrReplaceTempView("identity_stage_data")
    print("Names string match step based on match rating codex is done")
def etl_us_cities_demographics(spark, input_dir, output_dir):
    """Clean the US cities demographics data"""
    # this data set is clean
    # load data
    data_input_full_file_path = f'{input_dir}/us-cities-demographics.csv'
    us_cities_demographics_spark_df = spark.read \
        .format('csv') \
        .options(header='true', inferSchema='true', encoding="ISO-8859-1", sep=';') \
        .load(data_input_full_file_path)

    us_cities_demographics_spark_df = us_cities_demographics_spark_df \
        .withColumnRenamed("City", "city") \
        .withColumnRenamed("State", "state") \
        .withColumnRenamed("Median Age", "median_age") \
        .withColumnRenamed("Male Population", "male_population") \
        .withColumnRenamed("Female Population", "female_population") \
        .withColumnRenamed("Total Population", "total_population") \
        .withColumnRenamed("Number of Veterans", "num_of_veterans") \
        .withColumnRenamed("Foreign-born", "foreign_born") \
        .withColumnRenamed("Average Household Size", "avg_house_size") \
        .withColumnRenamed("State Code", "state_code") \
        .withColumnRenamed("Race", "race") \
        .withColumnRenamed("Count", "count") \
        .withColumn('city_state_code',
                    F.concat_ws(', ', F.upper(F.col('city')), F.upper(F.col('state_code'))))

    data_output_full_file_path = f'{output_dir}/us-cities-demographics.parquet'
    us_cities_demographics_spark_df \
        .write \
        .options(encoding="ISO-8859-1") \
        .mode('overwrite') \
        .parquet(data_output_full_file_path)
def filter_fiscal_year_and_month(
        df: DataFrame,
        fiscal_year: str,
        month_abbr: str,
        date_fmt: str = 'MM/dd/yyyy',
        filter_column_year: str = 'voucher_creation_date',
        filter_column_month: str = 'shipment_pickup_date') -> DataFrame:
    filtered = (
        df
        .withColumn(
            '_fiscal_year',
            F.coalesce(
                F.year(F.add_months(
                    F.to_date(filter_column_year, date_fmt), 3)),
                F.year(F.add_months(
                    F.to_date(filter_column_month, date_fmt), 3))
            ))
        .withColumn(
            '_month_abbr',
            F.coalesce(
                F.upper(F.date_format(
                    F.to_date(filter_column_year, date_fmt), 'MMM')),
                F.upper(F.date_format(
                    F.to_date(filter_column_month, date_fmt), 'MMM'))
            ))
        .where(F.col('_fiscal_year') == fiscal_year)
        .where(F.col('_month_abbr') == month_abbr)
    )  # yapf: disable
    return filtered
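# Hypothetical usage sketch for filter_fiscal_year_and_month, added for
# illustration (not part of the original source). It assumes an active
# SparkSession named `spark` and the default date columns above. The
# three-month shift means a voucher created on 11/15/2021 falls in fiscal
# year 2022 while its month abbreviation stays 'NOV'.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame(
    [('11/15/2021', None), (None, '04/02/2021')],
    ['voucher_creation_date', 'shipment_pickup_date'])
# keeps only the first row: fiscal year 2022, month abbreviation 'NOV'
filter_fiscal_year_and_month(sample, fiscal_year='2022', month_abbr='NOV').show()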
def format_output(df):
    df = df.withColumn("uniqueKey",
                       f.upper(f.concat(f.lit("RY"),
                                        f.substring(f.col('year'), 3, 2),
                                        f.lit("_"),
                                        f.col("channel"),
                                        f.lit("_"),
                                        f.col("division"),
                                        f.lit("_"),
                                        f.col("gender"),
                                        f.lit("_"),
                                        f.col("category")))) \
        .withColumn("channel", f.upper(f.col("channel"))) \
        .withColumn("year", f.concat(f.lit("RY"), f.substring(f.col('year'), 3, 2))) \
        .withColumn("week_1", f.concat(f.lit("W"), f.col("week")))

    output = df.orderBy("week").groupBy('uniqueKey', 'division', 'gender',
                                        'category', 'channel', 'year').agg(
        f.to_json(
            f.collect_list(
                f.create_map('week_1', 'netSales')
            )
        ).alias('Net Sales'),
        f.to_json(
            f.collect_list(
                f.create_map('week_1', 'salesUnits')
            )
        ).alias('Sales Units')
    )
    return output
def customer_name_exact_match(locMatchDF):
    locMatchDF.createOrReplaceTempView("location_match_data")
    actDf = spark.sql("""
        select dl_file_timestamp, dl_file_prefix, dl_filename, dl_line_no,
               user_id, title, first_name, last_name, middle_names
        from dl_business.account
        where trim(title) != '' and trim(first_name) != '' and trim(last_name) != ''
    """)
    actDf = actDf.dropDuplicates([
        'dl_file_prefix', 'dl_file_timestamp', 'dl_line_no', 'title',
        'first_name', 'last_name', 'middle_names'
    ])
    actDf = actDf.withColumn(
        "UPR_TITLE",
        clean_string_udf(F.trim(F.upper(F.col("title"))))).withColumn(
            "UPR_LNAME",
            clean_string_udf(F.trim(F.upper(F.col("last_name"))))).withColumn(
                "UPR_FNAME",
                clean_string_udf(F.trim(F.upper(F.col("first_name"))))).withColumn(
                    "UPR_MNAME",
                    clean_string_udf(F.trim(F.upper(F.col("middle_names")))))
    actDf = actDf.where((F.col("UPR_TITLE").isNotNull())
                        & (F.col("UPR_LNAME").isNotNull())
                        & (F.col("UPR_FNAME").isNotNull()))
    actDf.createOrReplaceTempView("account_clean_data")
    actlocDF = spark.sql("""
        select a.*, b.orig_addr_lines, b.std_addr_lines, b.orig_post_code, b.std_post_code
        from account_clean_data a, location_match_data b
        where a.dl_file_timestamp = b.dl_file_timestamp
          and a.dl_file_prefix = b.dl_file_prefix
          and a.dl_line_no = b.dl_line_no
    """)
    actlocDF.createOrReplaceTempView("account_location_data")
    actlocGrpDF = spark.sql("""
        select UPR_TITLE, UPR_LNAME, UPR_FNAME, UPR_MNAME, count(*)
        from account_location_data
        group by UPR_TITLE, UPR_LNAME, UPR_FNAME, UPR_MNAME
    """)
    actlocMatchDF = actlocDF.join(
        actlocGrpDF,
        (actlocDF.UPR_TITLE == actlocGrpDF.UPR_TITLE)
        & (actlocDF.UPR_LNAME == actlocGrpDF.UPR_LNAME)
        & (actlocDF.UPR_FNAME == actlocGrpDF.UPR_FNAME)
        & (actlocDF.UPR_MNAME == actlocGrpDF.UPR_MNAME)).select(
            actlocDF['UPR_TITLE'], actlocDF['UPR_FNAME'],
            actlocDF['UPR_LNAME'], actlocDF['UPR_MNAME'],
            actlocDF['user_id'], actlocDF['title'], actlocDF['first_name'],
            actlocDF['last_name'], actlocDF['middle_names'],
            actlocDF['orig_addr_lines'], actlocDF['std_addr_lines'],
            actlocDF['orig_post_code'], actlocDF['std_post_code'],
            actlocDF['dl_file_timestamp'], actlocDF['dl_file_prefix'],
            actlocDF['dl_filename'], actlocDF['dl_line_no'])
    nameMatDF = actlocMatchDF.select(
        'UPR_TITLE', 'UPR_LNAME', 'UPR_FNAME', 'UPR_MNAME', 'std_addr_lines',
        'std_post_code', 'user_id', 'title', 'last_name', 'first_name',
        'middle_names', 'orig_addr_lines', 'orig_post_code', 'dl_filename',
        'dl_file_timestamp', 'dl_file_prefix', 'dl_line_no')
    # nameMatDF.persist()
    # nameMatDF.createOrReplaceTempView("identity_stage_data")
    print("Names string match step based on string exact match is done")
    return nameMatDF
def process(df, params=None, log=None):
    df_transformed = (
        df
        .select(
            col('id'),
            upper(col('first_name')).alias("first_name"),
            upper(col('second_name')).alias("second_name"),
            col('floor')))
    return df_transformed
def clean_centerline(centerline):
    '''
    PREPROCESSING:
    This function takes the centerline as input. Its role is to clean the
    centerline file and split compound house numbers into two columns (for
    each L and R combination).
    We replace ST_LABEL with '0' in rows where FULL_STREE and ST_LABEL are the
    same, to avoid the OR operation during the join.
    We return: the centerline file after preprocessing.
    '''
    centerline = centerline.select('PHYSICALID', 'L_LOW_HN', 'L_HIGH_HN',
                                   'R_LOW_HN', 'R_HIGH_HN', 'FULL_STREE',
                                   'ST_LABEL', 'BOROCODE')
    centerline = centerline.na.drop(subset=[
        'PHYSICALID', 'L_LOW_HN', 'L_HIGH_HN', 'R_LOW_HN', 'R_HIGH_HN',
        'FULL_STREE', 'ST_LABEL', 'BOROCODE'
    ])
    centerline = centerline.withColumn(
        'FULL_STREE',
        F.upper(F.col('FULL_STREE'))).withColumn('ST_LABEL',
                                                 F.upper(F.col('ST_LABEL')))

    house = F.split(centerline['L_LOW_HN'], '-')
    centerline = centerline.withColumn('L_LOW_HN_1',
                                       house.getItem(0).cast('int'))
    centerline = centerline.withColumn('L_LOW_HN_2',
                                       house.getItem(1).cast('int'))
    house = F.split(centerline['L_HIGH_HN'], '-')
    centerline = centerline.withColumn('L_HIGH_HN_1',
                                       house.getItem(0).cast('int'))
    centerline = centerline.withColumn('L_HIGH_HN_2',
                                       house.getItem(1).cast('int'))
    house = F.split(centerline['R_LOW_HN'], '-')
    centerline = centerline.withColumn('R_LOW_HN_1',
                                       house.getItem(0).cast('int'))
    centerline = centerline.withColumn('R_LOW_HN_2',
                                       house.getItem(1).cast('int'))
    house = F.split(centerline['R_HIGH_HN'], '-')
    centerline = centerline.withColumn('R_HIGH_HN_1',
                                       house.getItem(0).cast('int'))
    centerline = centerline.withColumn('R_HIGH_HN_2',
                                       house.getItem(1).cast('int'))
    print("Done performing preprocessing for Centerline, now moving to the conditional joins part")

    centerline = centerline.withColumn(
        'ST_LABEL',
        F.when(centerline['FULL_STREE'] == centerline['ST_LABEL'],
               '0').otherwise(centerline['ST_LABEL']))
    return centerline
def run_pipeline():
    spark = get_session()
    us_immg_df = process_immigration_data(spark, 'sas_data')
    us_dem_data, race_counts = get_demographics_info(
        spark, "us-cities-demographics.csv")
    us_immg_df = us_immg_df.join(
        us_dem_data,
        (upper(us_immg_df.port_city) == upper(us_dem_data.City)) &
        (upper(us_immg_df.port_state) == upper(us_dem_data.State_Code)),
        how='left')
    us_immg_df.count()
    us_immg_df = us_immg_df.drop("City", "State_Code")
    us_immg_df.drop('arrival_date').write.mode("overwrite").parquet(
        'i94-immigration-data.parquet')
    qa_checks(us_dem_data, race_counts, us_immg_df)
def pre_processing(output_uri):
    spark = SparkSession.builder.appName("process sample data").getOrCreate()
    rdd = spark.sparkContext.parallelize([
        (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
        (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
        (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
    ])
    df = spark.createDataFrame(rdd, schema=['a', 'b', 'c', 'd', 'e'])
    spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
    # df.c, upper(df.c) and df.c.isNull() are all Column expressions
    if type(df.c) == type(upper(df.c)) == type(df.c.isNull()):
        df_new = df.withColumn('upper_c', upper(df.c))
        df_new  # with eager evaluation enabled this displays the DataFrame in a notebook
        df_new.repartition(1).write.option("header", "true").mode(
            "overwrite").csv(output_uri)
def getTemperature(input_loc):
    """
    Read Global Temperature information
    Args:
        input_loc : input folder in EMR HDFS
    Returns:
        df_temperature : dataframe containing Global Temperature data
    """
    filePath = os.path.join(input_loc, 'GlobalLandTemperaturesByCity.csv')
    df_temperature = spark.read.csv(filePath, header=True, inferSchema=True)

    # Remove all dates prior to 2010
    df_temperature = df_temperature.filter(df_temperature.dt >= '2010-01-01')

    # Aggregate by Country
    df_temperature = df_temperature.groupby(["Country"]).agg(
        avg("AverageTemperature").alias("AverageTemperature"),
        first("Latitude").alias("Latitude"),
        first("Longitude").alias("Longitude"))

    # uppercase Country so it can later be joined with the Country dimension table
    df_temperature = df_temperature.withColumn('Country', upper(col('Country'))) \
        .withColumn('AverageTemperature', round('AverageTemperature', 2))
    return df_temperature
def etl_airport_code(spark, input_dir, output_dir):
    """Clean the airport code data"""
    # load data
    airport_code_data_input_full_file_path = f'{input_dir}/airport-codes_csv.csv'
    airport_code_spark_df = spark.read \
        .format('csv') \
        .options(header='true', inferSchema='true', encoding="ISO-8859-1") \
        .load(airport_code_data_input_full_file_path)

    airport_code_spark_df = airport_code_spark_df \
        .withColumnRenamed('name', 'airport_name') \
        .filter(F.col('iso_country') == 'US')

    # split iso_region to extract the region (state) code after the hyphen
    split_iso_region = F.split(airport_code_spark_df['iso_region'], '-')
    airport_code_spark_df = airport_code_spark_df \
        .withColumn('region', split_iso_region.getItem(1)) \
        .withColumn('municipality_region',
                    F.concat_ws(', ', F.upper(F.col('municipality')), F.upper(F.col('region'))))

    new_airport_code_spark_df = airport_code_spark_df \
        .drop('iso_region') \
        .drop('coordinates')

    data_output_full_file_path = f'{output_dir}/airport-codes.parquet'
    new_airport_code_spark_df \
        .write \
        .options(encoding="ISO-8859-1") \
        .mode('overwrite') \
        .parquet(data_output_full_file_path)
def clean_violations(violations):
    violations = violations.na.drop(
        subset=['Street Name', 'House Number', 'Violation County', 'Issue Date'])
    violations = violations.select('House Number', 'Street Name',
                                   'Violation County', 'Issue Date')
    violations = violations.withColumn('Street Name',
                                       F.upper(F.col('Street Name')))
    violations = violations.withColumn(
        "House Number", F.regexp_replace(F.col("House Number"), "[A-Z]", ""))
    split_year = F.split(violations['Issue Date'], '/')
    violations = violations.withColumn('Year',
                                       split_year.getItem(2)).drop('Issue Date')
    split_col = F.split(violations['House Number'], '-')
    violations = violations.withColumn('House_Num1',
                                       split_col.getItem(0).cast('int'))
    violations = violations.withColumn('House_Num2',
                                       split_col.getItem(1).cast('int'))
    boroughs = {'MAN': '1', 'MH': '1', 'MN': '1', 'NEWY': '1', 'NEW Y': '1',
                'NY': '1', 'BRONX': '2', 'BX': '2', 'PBX': '2',
                'BK': '3', 'K': '3', 'KING': '3', 'KINGS': '3',
                'Q': '4', 'QN': '4', 'QNS': '4', 'QU': '4', 'QUEEN': '4',
                'R': '5', 'RICHMOND': '5'}
    violations = violations.replace(boroughs, subset='Violation County')
    violations.createOrReplaceTempView('violations')
    violations = spark.sql(
        'SELECT * FROM violations WHERE Year >= 2015 AND Year <= 2019')
    # violations = violations.groupby('House Number', 'Street Name',
    #                                 'Violation County', 'Year',
    #                                 'House_Num1', 'House_Num2').count()
    print("Done performing preprocessing for Violations, now moving to Centerline")
    return violations
def clean_centerline(centerline):
    centerline = centerline.select('PHYSICALID', 'L_LOW_HN', 'L_HIGH_HN',
                                   'R_LOW_HN', 'R_HIGH_HN', 'FULL_STREE',
                                   'ST_LABEL', 'BOROCODE')
    centerline = centerline.na.drop(subset=[
        'PHYSICALID', 'L_LOW_HN', 'L_HIGH_HN', 'R_LOW_HN', 'R_HIGH_HN',
        'FULL_STREE', 'ST_LABEL', 'BOROCODE'
    ])
    centerline = centerline.withColumn(
        'FULL_STREE',
        F.upper(F.col('FULL_STREE'))).withColumn('ST_LABEL',
                                                 F.upper(F.col('ST_LABEL')))

    split_col = F.split(centerline['L_LOW_HN'], '-')
    centerline = centerline.withColumn('L_LOW_HN_1',
                                       split_col.getItem(0).cast('int'))
    centerline = centerline.withColumn('L_LOW_HN_2',
                                       split_col.getItem(1).cast('int'))
    split_col = F.split(centerline['L_HIGH_HN'], '-')
    centerline = centerline.withColumn('L_HIGH_HN_1',
                                       split_col.getItem(0).cast('int'))
    centerline = centerline.withColumn('L_HIGH_HN_2',
                                       split_col.getItem(1).cast('int'))
    split_col = F.split(centerline['R_LOW_HN'], '-')
    centerline = centerline.withColumn('R_LOW_HN_1',
                                       split_col.getItem(0).cast('int'))
    centerline = centerline.withColumn('R_LOW_HN_2',
                                       split_col.getItem(1).cast('int'))
    split_col = F.split(centerline['R_HIGH_HN'], '-')
    centerline = centerline.withColumn('R_HIGH_HN_1',
                                       split_col.getItem(0).cast('int'))
    centerline = centerline.withColumn('R_HIGH_HN_2',
                                       split_col.getItem(1).cast('int'))
    print("Done performing preprocessing for Centerline, now moving to the conditional joins part")
    return centerline
def clean_violations(violations):
    '''
    PREPROCESSING:
    This function takes the violations (all five files) as input.
    In this function, we clean the violations table and drop the unwanted
    columns. We split the house number into two columns to accommodate
    compound house numbers.
    We return: the violations file after pivoting on the years to reduce the
    number of rows.
    '''
    violations = violations.na.drop(
        subset=['Street Name', 'House Number', 'Violation County', 'Issue Date'])
    violations = violations.select('House Number', 'Street Name',
                                   'Violation County', 'Issue Date')
    violations = violations.withColumn('Street Name',
                                       F.upper(F.col('Street Name')))
    violations = violations.withColumn(
        "House Number", F.regexp_replace(F.col("House Number"), "[A-Z]", ""))
    split_year = F.split(violations['Issue Date'], '/')
    violations = violations.withColumn('Year',
                                       split_year.getItem(2)).drop('Issue Date')
    split_col = F.split(violations['House Number'], '-')
    violations = violations.withColumn('House_Num1',
                                       split_col.getItem(0).cast('int'))
    violations = violations.withColumn('House_Num2',
                                       split_col.getItem(1).cast('int'))
    boroughs = {'MAN': '1', 'MH': '1', 'MN': '1', 'NEWY': '1', 'NEW Y': '1',
                'NY': '1', 'BRONX': '2', 'BX': '2', 'PBX': '2',
                'BK': '3', 'K': '3', 'KING': '3', 'KINGS': '3',
                'Q': '4', 'QN': '4', 'QNS': '4', 'QU': '4', 'QUEEN': '4',
                'R': '5', 'RICHMOND': '5'}
    violations = violations.replace(boroughs, subset='Violation County')
    violations.createOrReplaceTempView('violations')
    violations = spark.sql(
        'SELECT * FROM violations WHERE Year >= 2015 AND Year <= 2019')
    violations_pivot = violations.groupby(
        'Violation County', 'Street Name', 'House Number', 'House_Num1',
        'House_Num2').pivot('Year',
                            ["2015", "2016", "2017", "2018", "2019"]).count().cache()
    print("Done performing preprocessing for Violations, now moving to Centerline")
    return violations_pivot
def transform_read_centerline_data(df: DataFrame) -> DataFrame:
    """Transform centerline data to make it joinable; at a high level the steps are:

    1. Convert ST_LABEL & FULL_STREE to upper case
    2. Convert L_LOW_HN & L_HIGH_HN separated by '-' for odd house numbers
    3. Convert R_LOW_HN & R_HIGH_HN separated by '-' for even house numbers
    4. Remove any rows having no house number in L_LOW_HN and R_LOW_HN
    """
    df = (df.select("PHYSICALID", "BOROCODE", "FULL_STREE", "ST_NAME",
                    "L_LOW_HN", "L_HIGH_HN", "R_LOW_HN", "R_HIGH_HN")
          .orderBy("PHYSICALID", "BOROCODE", "FULL_STREE", "ST_NAME",
                   "L_LOW_HN", "L_HIGH_HN", "R_LOW_HN", "R_HIGH_HN")
          .coalesce(200)
          .withColumn("ST_NAME", F.upper(F.col("ST_NAME")))
          .withColumn("FULL_STREE", F.upper(F.col("FULL_STREE")))
          .filter((F.col("L_LOW_HN").isNotNull())
                  | (F.col("R_LOW_HN").isNotNull())))
    df = df.withColumn("L_TEMP_ODD", F.split("L_LOW_HN", "-")).withColumn(
        "L_LOW_HN",
        F.col("L_TEMP_ODD").getItem(0).cast("int") +
        F.when(F.col("L_TEMP_ODD").getItem(1).isNull(), "0").otherwise(
            F.col("L_TEMP_ODD").getItem(1)).cast("int") / 1000,
    )
    df = df.withColumn("L_TEMP_ODD", F.split("L_HIGH_HN", "-")).withColumn(
        "L_HIGH_HN",
        F.col("L_TEMP_ODD").getItem(0).cast("int") +
        F.when(F.col("L_TEMP_ODD").getItem(1).isNull(), "0").otherwise(
            F.col("L_TEMP_ODD").getItem(1)).cast("int") / 1000,
    )
    df = df.withColumn("L_TEMP_ODD", F.split("R_LOW_HN", "-")).withColumn(
        "R_LOW_HN",
        F.col("L_TEMP_ODD").getItem(0).cast("int") +
        F.when(F.col("L_TEMP_ODD").getItem(1).isNull(), "0").otherwise(
            F.col("L_TEMP_ODD").getItem(1)).cast("int") / 1000,
    )
    df = df.withColumn("L_TEMP_ODD", F.split("R_HIGH_HN", "-")).withColumn(
        "R_HIGH_HN",
        F.col("L_TEMP_ODD").getItem(0).cast("int") +
        F.when(F.col("L_TEMP_ODD").getItem(1).isNull(), "0").otherwise(
            F.col("L_TEMP_ODD").getItem(1)).cast("int") / 1000,
    )
    return df
def city(cName):
    if cName == "MAN" or cName == "MH" or cName == "MN" or cName == "NEWY" \
            or cName == "NEW Y" or cName == "NY":
        return "1"
    elif cName == "BX" or cName == "BRONX":
        return "2"
    elif cName == "BK" or cName == "K" or cName == "KING" or cName == "KINGS":
        return "3"
    elif cName == "Q" or cName == "QN" or cName == "QNS" or cName == "QU" \
            or cName == "QUEEN":
        return "4"
    elif cName == "R" or cName == "RICHMOND":
        return "5"
    else:
        return "*!NULL!*"


spark.udf.register("city", city)
city = func.udf(city)

df1 = df1.select(
    func.col("House Number").cast("int").alias("House Number"),
    func.upper(func.col("Street Name")).alias("Street Name"),
    city(func.col("Violation County")).cast("int").alias("BOROCODE")
)
df2 = df2.select(
    func.col("PHYSICALID").cast("int").alias("PHYSICALID"),
    func.upper(func.col("FULL_STREE")).alias("FULL_STREE"),
    func.upper(func.col("ST_LABEL")).alias("ST_LABEL"),
    func.col("BOROCODE").cast("int").alias("BOROCODE"),
    func.col("L_LOW_HN").cast("int").alias("L_LOW_HN"),
    func.col("L_HIGH_HN").cast("int").alias("L_HIGH_HN"),
    func.col("R_LOW_HN").cast("int").alias("R_LOW_HN"),
    func.col("R_HIGH_HN").cast("int").alias("R_HIGH_HN")
)

df1.join(
    df2,
    (df1["BOROCODE"] == df2["BOROCODE"]) &            # join condition 1: same borough
    ((df1["Street Name"] == df2["ST_LABEL"]) |        # join condition 2: street name matches
     (df1["Street Name"] == df2["FULL_STREE"]))       # either street label
).filter(
    (  # odd house numbers checked against the L_* range
        (func.col("House Number") % 2 != 0) &
        (func.col("House Number") >= func.col("L_LOW_HN")) &
        (func.col("House Number") <= func.col("L_HIGH_HN"))
    ) | (  # even house numbers checked against the R_* range
        (func.col("House Number") % 2 == 0) &
        (func.col("House Number") >= func.col("R_LOW_HN")) &
        (func.col("House Number") <= func.col("R_HIGH_HN"))
    )
).count()
def main():
    """method to get the TCIA data file list"""
    bktname = "dataengexpspace"
    datalist = "s3://dataengexpspace/data/TCIAData/ACRIN-DSC-MR-Brain.json"
    conf = SparkConf().setAppName("Prepare_TCIAData_list")
    sctx = SparkContext(conf=conf).getOrCreate()
    sqlctx = SQLContext(sctx)
    tciadf = sqlctx.read.json(datalist)
    tmpdf = tciadf.na.fill("NULL")
    tmpdf = tmpdf.withColumn('ShortStudyUID',
                             col('StudyInstanceUID').substr(60, 5))
    tmpdf = tmpdf.withColumn('ShortInstanceUID',
                             col('SeriesInstanceUID').substr(60, 5))
    tmpdf = tmpdf.withColumn('SeriesDescription',
                             upper(col('SeriesDescription')))
    tmpdf = tmpdf.withColumn('SeriesDescription',
                             regexp_replace('SeriesDescription', r'[(]', ''))
    tmpdf = tmpdf.withColumn('SeriesDescription',
                             regexp_replace('SeriesDescription', r'[)]', ''))
    tmpdf = tmpdf.withColumn('SeriesDescription',
                             regexp_replace('SeriesDescription', r'[*]', ''))
    tmpdf = tmpdf.withColumn('SeriesDescription',
                             regexp_replace('SeriesDescription', r'[+]', ''))
    tmpdf = tmpdf.withColumn('SeriesDescription',
                             regexp_replace('SeriesDescription', r'\s+', '_'))
    tmpdf = tmpdf.withColumn('SeriesDescription',
                             regexp_replace('SeriesDescription', r'[?]*[/]', 'over'))
    tmpdf = tmpdf.withColumn('SeriesDescription',
                             regexp_replace('SeriesDescription', r'[/]{1}', ''))
    tmpdf = tmpdf.withColumn('SeriesDescription',
                             regexp_replace('SeriesDescription', r'[?]$', ''))
    tmpdf = tmpdf.withColumn('SeriesDescription',
                             regexp_replace('SeriesDescription', r'^[_]', ''))
    tmpdf = tmpdf.withColumn('SeriesDescription',
                             regexp_replace('SeriesDescription', r'[_]$', ''))
    tmpdf = tmpdf.withColumn(
        'S3objkey',
        concat(lit('data/TCIAData_p3/'), col('Collection'), lit('/'),
               col('PatientID'), lit('/'), col('SeriesDate'), lit('-'),
               col('ShortStudyUID'), lit('/'), col('SeriesNumber'), lit('-'),
               col('SeriesDescription'), lit('-'), col('ShortInstanceUID'),
               lit('/blob.zip')))
    tmpdf = tmpdf.select('SeriesInstanceUID', 'S3objkey')
    tmpdf.show(5)
    tmpdf.write.save('file:///tmp/tmpdfjson', format='json', mode='overwrite')

    s3client = boto3.client(
        's3',
        aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
        aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'))
    tmpname = glob.glob('/tmp/tmpdfjson/*.json')[0]
    print(tmpname)
    response = s3client.upload_file(tmpname, bktname,
                                    "data/TCIAData/metadata/filelist_p3.json")
    if response is not None:
        print(">>>>>>> Upload problem for file: ", tmpname)
    print("remove tmp file: {}".format(tmpname))
    os.remove(tmpname)
def upper_column(self, column):
    try:
        self.spark_df = self.spark_df.withColumn('temp', f.upper(f.col(column))).drop(column) \
            .withColumnRenamed('temp', column)
        return self.get_json_df_response()
    except Exception as e:
        print(e)
        return None
def MyTransform(glueContext, dfc) -> DynamicFrameCollection:
    import pyspark.sql.functions as F

    # convert dynamic frame to data frame
    df = dfc.select(list(dfc.keys())[0]).toDF()
    df = df.withColumn("title", F.upper(F.col("title")))

    # create dynamic frame from dataframe
    upperDf = DynamicFrame.fromDF(df, glueContext, "filter_votes")
    return DynamicFrameCollection({"CustomTransform0": upperDf}, glueContext)
def upper_columns(df: DataFrame, cols: list) -> DataFrame:
    new_cols = []
    for field in df.schema.fields:
        if field.dataType == T.StringType() and field.name in cols:
            new_cols.append(F.upper(F.col(field.name)).alias(field.name))
        else:
            new_cols.append(F.col(field.name))
    return df.select(*new_cols)
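# Hypothetical usage sketch for upper_columns, added for illustration (not part
# of the original source); assumes an active SparkSession named `spark` and the
# same imports the function relies on (pyspark.sql.types as T, functions as F).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
cities = spark.createDataFrame([('york', 'uk', 210000)],
                               ['city', 'country', 'population'])
# only 'city' is upper-cased: 'country' is not listed and 'population' is not a
# StringType column, so both pass through unchanged
upper_columns(cities, ['city', 'population']).show()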
def preprocess_names(spark: SparkSession,
                     data_dir=catalog['clean/whitehouse_logs_cleaned']):
    whl_df = spark.read.options(delimiter=',').csv(path=data_dir,
                                                   header=True,
                                                   inferSchema=True)

    # create a concatenated name string for the visitor
    whl_df = whl_df.withColumn(
        'VISITOR_NAME',
        upper(concat_ws(' ', whl_df.NAMEFIRST, whl_df.NAMEMID, whl_df.NAMELAST)))

    # create a concatenated name string for the visitee
    whl_df = whl_df.withColumn(
        'VISITEE_NAME',
        upper(concat_ws(' ', whl_df.visitee_namefirst, whl_df.visitee_namelast)))

    # persist
    whl_df.write. \
        mode('overwrite'). \
        csv(catalog['clean/whitehouse_logs_processed'], header=True)
def add_fiscal_year_and_month_abbr(
        df,
        date_fmt: str = 'yyyy/MM/dd',
        filter_column_year: str = 'voucher_creation_date',
        filter_column_month: str = 'shipment_pickup_date') -> DataFrame:
    expr_mapping = {
        '_fiscal_year': (F.coalesce(
            F.year(F.add_months(F.to_date(filter_column_year, date_fmt), 3)),
            F.year(F.add_months(F.to_date(filter_column_month, date_fmt), 3)))),
        '_month_abbr': (F.coalesce(
            F.upper(
                F.date_format(F.to_date(filter_column_year, date_fmt), 'MMM')),
            F.upper(
                F.date_format(F.to_date(filter_column_month, date_fmt), 'MMM'))))
    }
    select_expr = build_col_expr(expr_mapping)
    transformed = df.select(F.expr('*'), *select_expr)
    return transformed
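# Assumption, not from the original source: build_col_expr is not shown in this
# snippet. A minimal helper consistent with how it is used above could simply
# alias each mapped Column expression with its key, e.g.:
def build_col_expr(expr_mapping):
    # turn {'name': column_expression, ...} into [column_expression.alias('name'), ...]
    return [expr.alias(name) for name, expr in expr_mapping.items()]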
def _process_marketplace(self, source_df, target_file):
    """
    Process Marketplace
    :param source_df: Spark DataFrame with source data
    :param target_file: name of the file for storing
    """
    marketplace_df = source_df.select(upper(col("marketplace"))).dropDuplicates()

    logging.debug(f"Writing {target_file}")
    marketplace_df.coalesce(1). \
        write.csv(path=target_file,
                  sep=';',
                  header=True,
                  mode="overwrite",
                  quote='"',
                  escape='"')
def house_number_extract(df):
    # make address_line_1 all uppercase
    df = df.withColumn('address_line_1', f.upper('address_line_1'))

    # extract house number or box number into column housenumber
    df = df.withColumn(
        'housenumber',
        f.when(
            f.col('address_line_1').rlike('^[A-Z]{2}'),
            f.regexp_extract(f.col('address_line_1'),
                             '(BOX\\s)([0-9]+[0-9A-Z.*-]*)', 2))
        .otherwise(
            f.regexp_extract(f.col('address_line_1'),
                             '^([A-Z]*[0-9]+[0-9A-Z.*-]*)', 1)))
    return df
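# Hypothetical usage sketch for house_number_extract, added for illustration
# (not part of the original source); assumes an active SparkSession named
# `spark` and pyspark.sql.functions imported as `f`.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
addresses = spark.createDataFrame(
    [('PO Box 123',), ('221B Baker Street',), ('45-47 Main St',)],
    ['address_line_1'])
# extracted housenumber values: '123' (box number), '221B', '45-47'
house_number_extract(addresses).show(truncate=False)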
def getHotelCityData(sparkSession, sourcePath):
    hotelDimDF = sparkSession.read.format("csv") \
        .option("header", "true") \
        .option("inferSchema", "false") \
        .option("delimiter", "|") \
        .load(sourcePath) \
        .select("hotel_id", "city_code") \
        .withColumnRenamed("hotel_id", "mmt_hotel_id") \
        .withColumn("htl_city_code", upper(col("city_code"))) \
        .dropDuplicates()
    return hotelDimDF
def uppercase_columns(df: DataFrame, col_list: List) -> DataFrame:
    """
    Rewrite the selected columns with upper cases
    :param df: dataframe
    :param col_list: string array of columns to be upper-cased
    :return: dataframe
    """
    for col in col_list:
        df = df.withColumn(col, F.upper(F.col(col)))
        df = df.withColumn(col, F.regexp_replace(F.col(col), 'İ', 'I'))
        df = df.withColumn(col, F.trim(F.col(col)))
    logging.info(f"{col_list} columns are converted to uppercase")
    return df
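# Hypothetical usage sketch for uppercase_columns, added for illustration (not
# part of the original source); assumes an active SparkSession named `spark`
# plus the function's own imports (pyspark.sql.functions as F, logging, List).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
names = spark.createDataFrame([('  İstanbul ',), ('ankara',)], ['city'])
# '  İstanbul ' -> 'ISTANBUL': upper-cased, the dotted capital İ mapped to I,
# then trimmed; 'ankara' -> 'ANKARA'
uppercase_columns(names, ['city']).show()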
def getDemographic(input_loc):
    """
    Read Demographic information
    Args:
        input_loc : input folder in EMR HDFS
    Returns:
        df_demographic : dataframe containing Demographic information
    """
    filePath = os.path.join(input_loc, 'us-cities-demographics.csv')
    df_demographic = spark.read.csv(filePath, sep=";", header=True,
                                    inferSchema=True)

    # remove rows based on missing values
    df_demographic = df_demographic.na.drop(
        subset=("Male Population", "Female Population", "Number of Veterans",
                "Foreign-born", "Average Household Size"))

    # pivot operation
    groupcol = ('City', 'State', 'Median Age', 'Male Population',
                'Female Population', 'Total Population', 'Number of Veterans',
                'Foreign-born', 'Average Household Size', 'State Code')
    aggrcol = sum('Count')
    df_demographic = df_demographic.groupBy(*groupcol).pivot("Race").agg(aggrcol)

    # group by State
    df_demographic = df_demographic.groupBy("State Code", "State").agg(
        avg("Median Age").alias("avg_medianage"),
        sum("Male Population").alias("total_male"),
        sum("Female Population").alias("total_female"),
        sum("Total Population").alias("total_population"),
        sum("Number of Veterans").alias("total_veteran"),
        sum("Foreign-born").alias("total_foreignborn"),
        sum("American Indian and Alaska Native").alias("total_americannative"),
        sum("Asian").alias("total_asian"),
        sum("Black or African-American").alias("total_african"),
        sum("Hispanic or Latino").alias("total_hispanic"),
        sum("White").alias("total_white"),
        min("Average Household Size").alias("min_avghousesize"),
        max("Average Household Size").alias("max_avghousesize")
    )

    # upper case and rounding
    df_demographic = df_demographic.withColumn('State', upper(col('State'))) \
        .withColumn('avg_medianage', round('avg_medianage', 2))
    return df_demographic
def transform_parking_violation_data(df: DataFrame,
                                     column: str = "Violation County"
                                     ) -> DataFrame:
    """Transform parking violation data to make it joinable; at a high level the steps are:

    1. Add BOROCODE
    2. Convert the house number in case it is separated by '-'
    3. Convert 'Street Name' to upper case
    4. Remove any rows having no house number
    """
    df = (df.select("Violation County", "House Number", "Street Name",
                    "Summons Number", "Issue Date")
          .distinct()
          .withColumn("year",
                      F.year(F.to_date(F.col("Issue Date"), "MM/dd/yyyy")))
          .orderBy("Violation County", "House Number", "Street Name", "year")
          .coalesce(100)
          .groupBy("Violation County", "House Number", "Street Name", "year")
          .agg({"Summons Number": "count"})
          .withColumnRenamed("count(Summons Number)", "total_cnt")
          .withColumn(
              "BOROCODE",
              F.when(F.col(column).isin(["MAN", "MH", "MN", "NEWY", "NEW Y", "NY"]), 1)
              .when(F.col(column).isin(["BRONX", "BX"]), 2)
              .when(F.col(column).isin(["BK", "K", "KING", "KINGS"]), 3)
              .when(F.col(column).isin(["Q", "QN", "QNS", "QU", "QUEEN"]), 4)
              .when(F.col(column).isin(["R", "RICHMOND"]), 5)
              .otherwise(0)))
    df = (df.filter(F.col("House Number").isNotNull())
          .withColumn("temp", F.split("House Number", "-"))
          .withColumn(
              "House Number",
              F.col("temp").getItem(0).cast("int") +
              F.when(F.col("temp").getItem(1).isNull(), "0").otherwise(
                  F.col("temp").getItem(1)).cast("int") / 1000,
          )
          .withColumn("temp", F.col("temp").getItem(0).cast("int"))
          .withColumn("Street Name", F.upper(F.col("Street Name"))))
    return df
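# Added note (not in the original source): the split/cast above turns a
# compound house number such as "12-34" into 12 + 34/1000 = 12.034, while a
# plain "12" becomes 12.0, so hyphenated house numbers still compare and
# range-check numerically against the centerline address ranges.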
def processing_airport_data(spark, output_data):
    """
    Description: This function reads the airport code file using the spark
                 read method and performs data cleaning and processing, and
                 finally saves the cleaned data as a parquet file to S3.

    Arguments:
        spark: spark session.
        output_data: the output root directory to the s3 bucket.

    Returns:
        None
    """
    airportCode_df = spark.read.csv("Airportgeocode.csv", sep=',', header=True)
    airportCode_df = airportCode_df.withColumn(
        'iso_region_state',
        F.upper(split("iso_region", "-")[1])).drop("iso_region"). \
        withColumn('ElevationFt', col("elevation_ft").cast('integer')). \
        drop("elevation_ft", "_c0")
    airportCode_df = airportCode_df.na.fill({
        'ElevationFt': 0,
        'municipality': 'Nil',
        'gps_code': 'Nil',
        'iata_code': 'Nil',
        'local_code': 'Nil'
    })
    airportCode_df = airportCode_df.dropDuplicates(['ident'])

    @udf
    def extractCity(line):
        import re
        x = re.search(r"('city':\s*('*\s*\w*(\s*\w*)*\s*'))", line)
        if x:
            x = x.group(2)
            val = x.replace("'", "").strip()
        else:
            val = 'Nil'
        return val

    airportCode_df = airportCode_df.withColumn("City", extractCity('geocode'))
    airportCode_df = airportCode_df.dropDuplicates(['City'])

    print("writing airportCode_df table to s3 bucket")
    airportCode_df.write.mode("overwrite").parquet(
        os.path.join(output_data, "airportData"))
    print("writing airportCode_df table completed")
def cad_to_usd_rate(currency_exchange_rates: DataFrame, fiscal_year: str,
                    month_abbr: str) -> float:
    """Currently returns the latest exchange rate for the given month."""
    filtered = (
        currency_exchange_rates
        .where(F.col('currency_code_from') == 'CAD')
        .where(F.col('currency_code_to') == 'USD')
        .where(F.year(F.add_months(F.to_date(
            'effective_date', 'yyyyMMdd'), 3)) == fiscal_year)
        .where(F.upper(F.date_format(F.to_date(
            'effective_date', 'yyyyMMdd'), 'MMM')) == month_abbr)
        .sort('effective_date', ascending=False)
    )  # yapf: disable
    return filtered.first().conversion_rate_multiplier
from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(2)


# COMMAND ----------

from pyspark.sql.functions import initcap
df.select(initcap(col("Description"))).show()


# COMMAND ----------

from pyspark.sql.functions import lower, upper
df.select(col("Description"),
          lower(col("Description")),
          upper(lower(col("Description")))).show(2)


# COMMAND ----------

from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit(" HELLO ")).alias("ltrim"),
    rtrim(lit(" HELLO ")).alias("rtrim"),
    trim(lit(" HELLO ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)


# COMMAND ----------
import pyspark.sql.functions as F

df = sqlContext.createDataFrame([('a', 1), ('b', 2), ('a', 3)],
                                ["key", "value"])
df2 = df.withColumn('key', F.upper(df.key))
df2.groupBy('key').agg(F.avg(df.value)).collect()
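# Added note (not in the original source): with the three rows above, the keys
# upper-case to 'A', 'B', 'A', so collect() returns one row per key, e.g.
# Row(key='A', avg(value)=2.0) and Row(key='B', avg(value)=2.0), in no
# guaranteed order.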