def fix_country(col: Column) -> Column:
    """
    Use built-in Spark functions to clean up the "Country" column,
    e.g. " cAnAdA " -> "Canada".
    Don't forget about those annoying leading/trailing spaces.
    """
    fixed_country = F.initcap(F.trim(col))
    return fixed_country
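A minimal usage sketch (assuming a local SparkSession and a toy DataFrame with a "Country" column; `F` is `pyspark.sql.functions` as in the snippet above):

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(" cAnAdA ",), ("uniTED StateS",)], ["Country"])

# Trim then initcap each value; expected cleaned values: "Canada", "United States"
df.select(fix_country(F.col("Country")).alias("Country")).show()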
def test_initcap():
    # Because we don't use the same unicode version we need to limit
    # the character set to something more reasonable.
    # Upper and lower should cover the corner cases; this is mostly to
    # see if there are issues with spaces.
    gen = mk_str_gen('([aAbB]{0,5}[ \r\n\t]{1,2}){1,5}')
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, gen).select(f.initcap(f.col('a'))))
def load_police_districts(spark, path, output_path):
    """
    Get police districts from CSV and write them out as ORC files.
    """
    print("-- police districts --")
    df_sfpd_districts = spark.read.csv(path + "sfpd_districts.csv", header='true') \
        .withColumn("district_id", monotonically_increasing_id()) \
        .select("district_id",
                col("COMPANY").alias("company"),
                initcap(col("DISTRICT")).alias("district"))
    df_sfpd_districts.show()
    df_sfpd_districts.write.mode("overwrite").orc(output_path + "police_districts")
def get_cleaned_message(self, flat_message_df):
    cleaned_message_df = flat_message_df.select(
        F.col("id"),
        F.when(F.col("first_name").isNull(), 'NULL').otherwise(F.col("first_name")).alias("first_name"),
        F.col("last_name"),
        F.when(F.col("email").isNull(), 'NULL').otherwise(F.col("email")).alias("email"),
        F.col("gender"),
        F.col("ip_address"),
        F.date_format(F.col("date"), 'dd/MM/yyyy').alias("date"),
        F.initcap(F.col("country")).alias("country"),
        F.col("timestamp")
    ).withColumn(
        "ip_addr",
        F.when(F.col("ip_address").rlike(IP_REGEX), F.col("ip_address")).otherwise(DEFAULT_IP)
    ).drop(F.col("ip_address"))
    return cleaned_message_df
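IP_REGEX and DEFAULT_IP are module-level constants that are not shown in this snippet; plausible placeholder definitions (assumptions for illustration, not the original values) might look like:

# Illustrative values only -- adjust to your own validation rules.
IP_REGEX = r"^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$"  # assumed IPv4 pattern
DEFAULT_IP = "0.0.0.0"                                      # assumed fallback address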
def run():
    # Build session
    sparkql = SparkSession.builder.master('local[1]').getOrCreate()

    # Load config information
    people_path = config['defaults']['ch3']['ep1']['passenger_input'].get(str)
    save_path = config['defaults']['ch3']['ep1']['passenger_output'].get(str)
    bq_table = config['defaults']['ch3']['ep1']['passenger_table'].get(str)

    logger.info(f"Loading passenger info from {people_path}")
    # Read csv file into a spark dataframe
    passengers_df = sparkql.read.csv(people_path, header=True)
    logger.info(f"There are {passengers_df.count()} rows")

    # Load the passenger data and make sure the names have initial capitalization
    logger.info("Cleaning names and creating full name")
    passengers_df = passengers_df.withColumn('first_name', initcap(col('first_name')))\
                                 .withColumn('middle_name', initcap(col('middle_name')))\
                                 .withColumn('last_name', initcap(col('last_name')))

    # Create full_name column
    passengers_df = passengers_df.withColumn(
        'full_name',
        concat_ws(" ", col('first_name'), col('middle_name'), col('last_name')))

    logger.info("Creating sha2 uid from email")
    # Create a sha2 uid based on the email
    passengers_df = passengers_df.withColumn('uid', sha2(col('email'), 256))

    logger.info(f"Saving file to {save_path}")
    # Save dataframe as a parquet file
    passengers_df.write.parquet(save_path)

    logger.info("Uploading file to BigQuery")
    # Upload the file as an external table in BigQuery
    gbq_load(bq_table, save_path)


if __name__ == '__main__':
    run()
def load_passengers(self, passenger_filename, passenger_output):
    """
    Function to load the passenger data from csv in GCS, clean, add UID,
    and upload to BigQuery.

    :param passenger_filename: str input file name
    :param passenger_output: str of project.dataset.table to save passenger data
    """
    self.passenger_filename = passenger_filename
    self.passenger_output = passenger_output
    people_path = 'gs://{}/{}'.format(self.bucket, passenger_filename)

    logger.info(f"Loading passenger info from {self.bucket}.{passenger_filename}")
    passengers_df = self.sparkql.read.csv(people_path, header=True)

    # Use withColumn and initcap to standardize the names
    passengers_df = passengers_df.withColumn('first_name', initcap(col('first_name')))\
                                 .withColumn('middle_name', initcap(col('middle_name')))\
                                 .withColumn('last_name', initcap(col('last_name')))

    # Create full_name column
    passengers_df = passengers_df.withColumn(
        'full_name',
        concat_ws(" ", col('first_name'), col('middle_name'), col('last_name')))

    passengers_df = passengers_df.withColumn('uid', sha2(col('email'), 256))

    # Write to BigQuery
    logger.info(f"Writing file to {passenger_output}")
    passengers_df.write.format('bigquery') \
        .option('table', passenger_output) \
        .save()

    self.passengers_df = passengers_df
from pyspark.sql.functions import desc

df.orderBy(desc("salary")).show(5)
df.sort(desc("salary")).show(5)

"""Working with numbers"""
# 1. Raise to a power via pow()
from pyspark.sql.functions import expr, pow
expected_salary = df.select(expr("name"), pow(col("salary"), 2) - 10000)
# 2. Rounding: round()
df.describe().show()

"""Working with strings"""
# 1. Capitalize the first letter of each word
from pyspark.sql.functions import initcap, lower, upper
df.select(initcap(col("name"))).show()
df.select(lower(col("name"))).show()
df.select(upper(col("name"))).show()
# 2. Removing whitespace:
from pyspark.sql.functions import lpad, ltrim, rtrim, rpad, trim
# 3. Pattern matching (LIKE):
df.filter(col('name').like('%nd%')).show()
# 4. Dropping duplicates:
df.select('name').dropDuplicates().show()
# 5. Splitting strings:
from pyspark.sql.functions import split
df.withColumn("splited_name", split(df_webpages["name"], ",")[1]).show()
def test_initcap_space():
    # we see a lot more space delimiters than other whitespace
    gen = StringGen('([aAbB]{0,5}[ ]{1,2}){1,5}')
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, gen).select(f.initcap(f.col('a'))))
Person("Sidhartha", "Ray", 32, None, "Programmer"), Person("Pratik", "Solanki", 22, 176.7, None), Person("Ashok ", "Pradhan", 62, None, None), Person(" ashok", "Pradhan", 42, 125.3, "Chemical Engineer"), Person("Pratik", "Solanki", 22, 222.2, "Teacher") ]) people_df.show() people_df.groupBy("firstName").agg(first("weightInLbs")).show() people_df.groupBy(trim(lower(col('firstName')))).agg(first("weightInLbs")).show() people_df.groupBy(trim(lower(col("firstName")))).agg(first("weightInLbs", True)).show() people_df.sort(col("weightInLbs").desc()).groupBy(trim(lower(col("firstName")))).agg(first("weightInLbs", True)).show() people_df.sort(col("weightInLbs").asc_nulls_last()).groupBy(trim(lower(col("firstName")))).agg(first("weightInLbs", True)).show() corrected_people_df = people_df\ .withColumn("firstName", initcap("firstName"))\ .withColumn("firstName", ltrim(initcap("firstName")))\ .withColumn("firstName", trim(initcap("firstName")))\ corrected_people_df.groupBy("firstName").agg(first("weightInLbs")).show() corrected_people_df = corrected_people_df\ .withColumn("fullName", format_string("%s %s", "firstName", "lastName"))\ corrected_people_df.show() corrected_people_df = corrected_people_df\ .withColumn("weightInLbs", coalesce("weightInLbs", lit(0)))\ corrected_people_df.show()
df.stat.approxQuantile("UnitPrice", quantileProbs, relError) df.stat.crosstab("StockCode", "Quantity").show() df.stat.freqItems(["StockCode", "Quantity"]).show() ##creating a unique sequence id from pyspark.sql.functions import monotonically_increasing_id df.select(monotonically_increasing_id()).show(10) #working with strings #perform case converstions from pyspark.sql.functions import initcap, lower, upper, ltrim, rtrim, trim, lpad, rpad df.select(initcap(col("Description"))).show(5) df.select(col("Description"), initcap(col("Description")), lower(col("Description")), upper(col("Description"))).show(5) string_with_space = " hello " df.select(ltrim(lit(string_with_space)), rtrim(lit(string_with_space)), trim(lit(string_with_space))).show() #regular expressions #working with dates, timestamps from pyspark.sql.functions import current_date, current_timestamp, date_add, date_sub, datediff, months_between, to_date, to_timestamp dateDF = spark.range(10).withColumn("today", current_date()).withColumn( "now", current_timestamp())
# Check the structure of the WorldCupMatches dataframe
df_wcm.printSchema()
print('The df_wcm dataframe has ' + str(df_wcm.count()) + ' records.')
print('The df_wcm dataframe has ' + str(df_wcm.distinct().count()) + ' distinct records.')

# COMMAND ----------

from pyspark.sql.functions import desc, asc, col, column, expr, instr, length, substring, regexp_replace, trim, lit, initcap, sum, concat

# Transformations on the WorldCupPlayers dataframe
# The Event field stores values like "G43' G87'", meaning the player scored two goals,
# so we need to count the number of "G"s.
# In the same way we can get the number of penalties and cards.
df_wcp1 = df_wcp.withColumn('POSICION_JUGADOR', expr("case when position='C' THEN 'Captain' WHEN position='GK' THEN 'Goalkeeper' ELSE 'Other' end "))\
    .withColumn('NOMBRE_JUGADOR', initcap(regexp_replace('Player Name', '�', 'u')))\
    .withColumn('NUMERO_GOLES', length('Event') - length(trim(regexp_replace('Event', 'G', ''))))\
    .withColumn('NUMERO_PENALES', length('Event') - length(trim(regexp_replace('Event', 'P', ''))))\
    .withColumn('NUMERO_PENALES_FALLADOS', length('Event') - length(trim(regexp_replace('Event', 'MP', ''))))\
    .withColumn('NUMERO_TARJETAS_ROJAS', length('Event') - length(trim(regexp_replace('Event', 'R', ''))))

# Replace null values with 0
df_wcp1 = df_wcp1.withColumn('NUMERO_GOLES', expr("case when NUMERO_GOLES is null then 0 else NUMERO_GOLES end "))\
    .withColumn('NUMERO_PENALES', expr("case when NUMERO_PENALES is null then 0 else NUMERO_PENALES end "))\
    .withColumn('NUMERO_PENALES_FALLADOS', expr("case when NUMERO_PENALES_FALLADOS is null then 0 else NUMERO_PENALES_FALLADOS end "))\
    .withColumn('NUMERO_TARJETAS_ROJAS', expr("case when NUMERO_TARJETAS_ROJAS is null then 0 else NUMERO_TARJETAS_ROJAS end "))\
    .withColumnRenamed('Team Initials', 'INICIALES_PAIS')\
    .drop('Player Name', 'Position', 'Shirt Number')

# Aggregate by player name, position and country initials
df_wcp_rep = df_wcp1.select('NOMBRE_JUGADOR', 'POSICION_JUGADOR', 'INICIALES_PAIS', 'NUMERO_GOLES', 'NUMERO_PENALES', 'NUMERO_PENALES_FALLADOS', 'NUMERO_TARJETAS_ROJAS')\
    .groupby('NOMBRE_JUGADOR', 'POSICION_JUGADOR', 'INICIALES_PAIS')\
df = spark.read.csv(path="transactions.csv", header=True, inferSchema=False, sep=";") df1 = lower_header(data_frame=df) df2 = df1.select( df1.id, df1.created_at, df1.merchant_id, df1.valor, df1.n_parcelas, functions.round(df1.valor / df1.n_parcelas, 2), functions.initcap(df1.nome_no_cartao), functions.when( functions.lower(df1.status) == "refunded", "refused" ).when( functions.lower(df1.status) == "in process", "processing" ).otherwise(functions.lower(df1.status)), df1.card_id, df1.iso_id, df1.card_brand, functions.regexp_replace(functions.lower(df1.documento), pattern=r"(\D+)", replacement="") ) df3 = casting_fields(data_frame=df2, json_schema=get_json_schema_mapping("types_mapping.json")) df3.write.csv(path="output/sanitize_transactions", header=True, sep=";")
.format("jdbc") \ .option("url", "jdbc:postgresql://10.0.0.8:5432/my_db") \ .option("dbtable", "airbnb") \ .option("user", "test") \ .option("password", "test") \ .option("driver", "org.postgresql.Driver") \ .load() df = df.withColumn('bedrooms', F.round(df['bedrooms'], 0)) df = df.filter(~F.col('city').contains("/")) df = df.filter(~F.col('city').contains(",")) df = df.filter(~F.col('city').contains("-")) df = df.filter(~F.col('city').contains("^[0-9]*$")) df = df.filter(~F.col('city').contains("*")) df = df.withColumn('city', F.ltrim(df.city)) df = df.withColumn("city", F.initcap(F.col("city"))) df = df.filter(~df.city.rlike("[ ,;{}()\n\t=]")) df = df.filter(~df.city.rlike("[^0-9A-Za-z]")) df = df.filter(~F.col('city').contains("(")) df = df.groupBy('city', 'bedrooms').agg( F.avg('average').alias('average'), F.first('state')) df = df.withColumnRenamed('first(state)', 'state') df = df.sort('city') df1 = df.withColumn('average', F.round(df['average'], 0)) df1.write \ .format("jdbc") \ .option("url", "jdbc:postgresql://10.0.0.8:5432/my_db") \ .option("dbtable", "bnbclean") \ .option("user", "test") \ .option("password", "test") \
def init_cap_and_trim_all(column_name: str) -> Column:
    return initcap(trim_all(column_name))
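trim_all is a helper defined elsewhere in that project; a hypothetical stand-in (an assumption about its behavior, not the original implementation) that squeezes runs of whitespace and trims the ends could look like:

from pyspark.sql import Column
from pyspark.sql.functions import col, initcap, regexp_replace, trim

def trim_all(column_name: str) -> Column:
    # Assumed behavior: collapse inner whitespace to single spaces, then trim the ends
    return trim(regexp_replace(col(column_name), r"\s+", " "))

# With this stand-in, init_cap_and_trim_all("name") would turn "  jane   DOE " into "Jane Doe".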
from pyspark.sql.functions import initcap, concat_ws, col, sha2

# Build session
sparkql = SparkSession.builder.master('yarn').getOrCreate()

# Load passenger data
bucket = <your bucket>
sparkql.conf.set('temporaryGcsBucket', bucket)  # this gives our job a temporary bucket to use when writing
bucket_path = 'gs://{}/'.format(bucket)
people_path = bucket_path + 'passengers_1k.csv'
passengers_df = sparkql.read.csv(people_path, header=True)

# Use withColumn and initcap to standardize the names
passengers_df = passengers_df.withColumn('first_name', initcap(col('first_name')))\
    .withColumn('middle_name', initcap(col('middle_name')))\
    .withColumn('last_name', initcap(col('last_name')))

# Create full_name column
passengers_df = passengers_df.withColumn(
    'full_name',
    concat_ws(" ", col('first_name'), col('middle_name'), col('last_name')))

passengers_df = passengers_df.withColumn('uid', sha2(col('email'), 256))

bq_dataset = <your dataset>
bq_table = 'passengers'

passengers_df.write.format('bigquery') \
    .option('table', '{}.{}'.format(bq_dataset, bq_table)) \
def test_initcap_special_chars():
    gen = mk_str_gen('ʼn([aAbB13ȺéŸ]{0,5}){1,5}')
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, gen).select(
            f.initcap(f.col('a'))))
df.selectExpr('round(2.5)', 'bround(2.5)').show(2)

# Unique ID
# We can add a unique ID to each row by using the function monotonically_increasing_id.
# The function generates a unique value for each row, starting with 0.
from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(2)

# Working with strings
from pyspark.sql.functions import initcap, lower, upper
df.select(initcap(col('Description'))).show(2)
# SELECT initcap(Description) FROM dfTable

df.select(col('Description'),
          lower(col('Description')),
          upper(col('Description'))).show(2)
df.selectExpr(
    'Description',
    'lower(Description)',
    'upper(lower(Description))').show(2)
# SELECT Description, lower(Description), upper(lower(Description)) FROM dfTable

from pyspark.sql.functions import ltrim, rtrim, rpad, lpad, trim
def process_biomarkers(
    self,
    biomarkers_df: DataFrame,
    source_df: DataFrame,
    disease_df: DataFrame,
    drugs_df: DataFrame
) -> DataFrame:
    """The diverse steps to prepare and enrich the input table"""

    biomarkers_enriched = (
        biomarkers_df
        .select(
            'Biomarker', 'IndividualMutation',
            array_distinct(split(col('Alteration'), ';')).alias('alterations'),
            array_distinct(split(col('Gene'), ';')).alias('gene'),
            split(col('AlterationType'), ';').alias('alteration_types'),
            array_distinct(split(col("PrimaryTumorTypeFullName"), ";")).alias('tumor_type_full_name'),
            array_distinct(split(col('Drug'), ';|,')).alias('drug'),
            'DrugFullName', 'Association', 'gDNA',
            array_distinct(split(col('EvidenceLevel'), ',')).alias('confidence'),
            array_distinct(split(col('Source'), ';')).alias('source')
        )
        .withColumn('confidence', explode(col('confidence')))
        .withColumn('tumor_type_full_name', explode(col('tumor_type_full_name')))
        .withColumn('tumor_type', translate(col('tumor_type_full_name'), ' -', ''))
        .withColumn('drug', explode(col('drug')))
        .withColumn('drug', translate(col('drug'), '[]', ''))
        .withColumn('gene', explode(col('gene')))
        .replace(to_replace=GENENAMESOVERRIDE, subset=['gene'])
        .withColumn('gene', upper(col('gene')))
        # At this stage alterations and alteration_types are both arrays
        # Disambiguation when the biomarker consists of multiple alterations is needed
        # This is solved by:
        # 1. Zipping both fields - tmp consists of a list of alteration/type tuples
        # 2. tmp is exploded - tmp consists of the alteration/type tuple
        # 3. alteration & alteration_type columns are overwritten with the elements in the tuple
        .withColumn(
            'tmp',
            self.zip_alterations_with_type_udf(col('alterations'), col('alteration_types')))
        .withColumn('tmp', explode(col('tmp')))
        .withColumn('alteration_type', element_at(col('tmp'), 2))
        .withColumn(
            'alteration',
            when(
                ~col('IndividualMutation').isNull(),
                col('IndividualMutation')
            )
            .otherwise(element_at(col('tmp'), 1))
        )
        .drop('tmp')
        # Clean special cases on the alteration string
        .withColumn(
            'alteration',
            when(
                col('alteration') == 'NRAS:.12.,.13.,.59.,.61.,.117.,.146.',
                col('Biomarker')  # 'NRAS (12,13,59,61,117,146)'
            )
            .when(
                # Cleans strings like 'ARAF:.'
                col('alteration').contains(':.'),
                translate(col('alteration'), ':.', '')
            )
            .when(
                # Fusion genes are described with '__'
                # biomarker is a cleaner representation when there's one alteration
                (col('alteration').contains('__')) & (~col('Biomarker').contains('+')),
                col('Biomarker')
            )
            .otherwise(col('alteration'))
        )
        # Split source into literature and urls
        # literature contains PMIDs
        # urls are enriched from the source table if not a CT
        .withColumn('source', explode(col('source')))
        .withColumn('source', trim(regexp_extract(col('source'), r'(PMID:\d+)|([\w ]+)', 0).alias('source')))
        .join(source_df, on='source', how='left')
        .withColumn(
            'literature',
            when(col('source').startswith('PMID'), regexp_extract(col('source'), r'(PMID:)(\d+)', 2))
        )
        .withColumn(
            'urls',
            when(
                col('source').startswith('NCT'),
                struct(
                    lit('Clinical Trials').alias('niceName'),
                    concat(lit('https://clinicaltrials.gov/ct2/show/'), col('source')).alias('url')
                )
            )
            .when(
                (~col('source').startswith('PMID')) | (~col('source').startswith('NCIT')),
                struct(col('niceName'), col('url'))
            )
        )
        # The previous conditional clause creates a struct regardless of
        # whether any condition is met. The empty struct is replaced with null
        .withColumn('urls', when(~col('urls.niceName').isNull(), col('urls')))
        # Enrich data
        .withColumn('functionalConsequenceId', col('alteration_type'))
        .replace(to_replace=ALTERATIONTYPE2FUNCTIONCSQ, subset=['functionalConsequenceId'])
        .replace(to_replace=DRUGRESPONSE2EFO, subset=['Association'])
        .join(disease_df, on='tumor_type', how='left')
        .withColumn('drug', upper(col('drug')))
        .withColumn(
            # drug class is coalesced when the precise name of the medicine is not provided
            'drug',
            when(col('drug') == '', col('DrugFullName')).otherwise(col('drug')))
        .join(drugs_df, on='drug', how='left')
        .withColumn('drug', initcap(col('drug')))
        # Translate variantId
        .withColumn(
            'variantId',
            when(~col('gDNA').isNull(), self.get_variantId_udf(col('gDNA')))
        )
        # Assign a GO ID when gene expression data is reported
        .withColumn(
            'geneExpressionId',
            when(
                (col('alteration_type') == 'EXPR') & (col('alteration').contains('over')),
                'GO_0010628'
            )
            .when(
                (col('alteration_type') == 'EXPR') & (col('alteration').contains('under')),
                'GO_0010629'
            )
            .when(
                (col('alteration_type') == 'EXPR') & (col('alteration').contains('norm')),
                'GO_0010467'
            )
        )
        # Create variant struct
        .withColumn(
            'variant',
            when(
                col('alteration_type') != 'EXPR',
                struct(
                    col('alteration').alias('name'),
                    col('variantId').alias('id'),
                    col('functionalConsequenceId')
                )
            )
        )
        # Create geneExpression struct
        .withColumn(
            'geneExpression',
            when(
                col('alteration_type') == 'EXPR',
                struct(
                    col('alteration').alias('name'),
                    col('geneExpressionId').alias('id'))
            )
        )
    )

    pre_evidence = (
        biomarkers_enriched
        .withColumn('datasourceId', lit('cancer_biomarkers'))
        .withColumn('datatypeId', lit('affected_pathway'))
        .withColumnRenamed('tumor_type_full_name', 'diseaseFromSource')
        .withColumnRenamed('drug', 'drugFromSource')
        # diseaseFromSourceMappedId, drugId populated above
        .withColumnRenamed('Association', 'drugResponse')
        # confidence, literature and urls populated above
        .withColumnRenamed('gene', 'targetFromSourceId')
        .withColumnRenamed('Biomarker', 'biomarkerName')
        # variant, geneExpression populated above
        .drop(
            'tumor_type', 'source', 'alteration', 'alteration_type', 'IndividualMutation',
            'geneExpressionId', 'gDNA', 'functionalConsequenceId', 'variantId',
            'DrugFullName', 'niceName', 'url')
    )

    # Group evidence
    self.evidence = (
        pre_evidence
        .groupBy('datasourceId', 'datatypeId', 'drugFromSource', 'drugId',
                 'drugResponse', 'targetFromSourceId', 'diseaseFromSource',
                 'diseaseFromSourceMappedId', 'confidence', 'biomarkerName')
        .agg(
            collect_set('literature').alias('literature'),
            collect_set('urls').alias('urls'),
            collect_set('variant').alias('variant'),
            collect_set('geneExpression').alias('geneExpression'),
        )
        # Replace empty lists with null values
        .withColumn('literature', when(size(col('literature')) == 0, lit(None)).otherwise(col('literature')))
        .withColumn('urls', when(size(col('urls')) == 0, lit(None)).otherwise(col('urls')))
        .withColumn('variant', when(size(col('variant')) == 0, lit(None)).otherwise(col('variant')))
        .withColumn(
            'geneExpression',
            when(size(col('geneExpression')) == 0, lit(None))
            .otherwise(col('geneExpression')))
        # Collect variant info into biomarkers struct
        .withColumn(
            'biomarkers',
            struct(
                'variant',
                'geneExpression'
            ))
        .drop('variant', 'geneExpression')
        .distinct()
    )

    return self.evidence
# Databricks notebook source

## String data types
from pyspark.sql.functions import initcap, col, lower, upper

# Load data
df = spark.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .load('/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-01.csv')

# Check the data schema
df.printSchema()

# initcap: splits the given string on whitespace and uppercases the first letter of each word
df.select(initcap(col("Description"))).show(2, False)

# lower // upper
df.select(lower(col("StockCode"))).show(2)

# Adding and removing whitespace (lit, ltrim, rtrim, rpad, lpad, trim)
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit(" HELLO ")).alias("ltrim"),
    rtrim(lit(" HELLO ")).alias("rtrim"),
    trim(lit(" HELLO ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lpad"),
    rpad(lit("HELLO"), 10, " ").alias("rpad")).show(2)

## Regular expressions
# Replace color values in the Description column with the word COLOR
from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
    col("Description")).show(2)
def etl_world_temperature(spark, input_dir, output_dir):
    """Clean the temperature data"""
    # load data
    data_input_full_file_path = f'{input_dir}/GlobalLandTemperaturesByCity.csv'
    world_temperature_spark_df = spark.read \
        .format('csv') \
        .options(header='true', inferSchema='true', encoding="ISO-8859-1") \
        .load(data_input_full_file_path)

    # just take temperature data after 2003-01-01 and only keep the US data
    world_temperature_spark_df = world_temperature_spark_df \
        .filter(F.col('dt') >= datetime(2003, 1, 1)) \
        .filter(F.col('Country') == 'United States')

    # parse month and day
    us_temperature_spark_df = world_temperature_spark_df \
        .withColumn('month', F.month(F.col('dt'))) \
        .withColumn('day', F.dayofmonth(F.col('dt'))) \
        .drop(F.col('dt'))

    # group by columns and get the new avg temperature
    avg_us_temperature_spark_df = us_temperature_spark_df \
        .groupBy(['month', 'day', 'City', 'Country', 'Latitude', 'Longitude']) \
        .agg(F.mean('AverageTemperature')) \
        .withColumnRenamed('avg(AverageTemperature)', 'AverageTemperature') \
        .withColumn('month', F.col('month').cast('integer')) \
        .withColumn('day', F.col('day').cast('integer'))

    # convert DMS Lat and Lon to numeric format to get state info with a udf func
    avg_us_temperature_spark_df = avg_us_temperature_spark_df \
        .withColumn('Latitude',
                    F.when(F.col('Latitude').rlike('N'),
                           F.regexp_replace('Latitude', 'N', '').cast('double'))
                    .otherwise(-1 * F.when(F.col('Latitude').rlike('N'),
                                           F.regexp_replace('Latitude', 'N', '').cast('double')))) \
        .withColumn('Longitude',
                    F.when(F.col('Longitude').rlike('W'),
                           -1 * F.regexp_replace('Longitude', 'W', '').cast('double'))
                    .otherwise(F.when(F.col('Longitude').rlike('W'),
                                      F.regexp_replace('Longitude', 'W', '').cast('double'))))

    # define a udf function to get state based on lat and lon by using the reverse_geocoder library
    # https://github.com/thampiman/reverse-geocoder
    def _helper_get_state_(lat, lon):
        coor = (lat, lon)
        result = rg.search(coor)
        return result[0].get('admin1')

    _helper_get_state_udf = F.udf(lambda x, y: _helper_get_state_(x, y), StringType())

    avg_us_temperature_spark_df = avg_us_temperature_spark_df\
        .withColumn('state', _helper_get_state_udf(F.col('Latitude'), F.col('Longitude')))

    # load i94addr dictionary - map the i94addr values
    i94addr_dictionary_input_full_file_path = f'{input_dir}/dictionary_data/i94addr_dictionary.csv'
    i94addr_dictionary_spark_df = spark \
        .read \
        .format('csv') \
        .options(header='true', inferSchema='true', encoding="ISO-8859-1") \
        .load(i94addr_dictionary_input_full_file_path)
    i94addr_dictionary_spark_df = i94addr_dictionary_spark_df \
        .withColumn('init_cap_value', F.initcap(F.col('value')))

    avg_us_temperature_spark_df = avg_us_temperature_spark_df \
        .join(i94addr_dictionary_spark_df,
              avg_us_temperature_spark_df.state == i94addr_dictionary_spark_df.init_cap_value,
              'left') \
        .drop('init_cap_value') \
        .drop('value') \
        .withColumnRenamed('key', 'state_code')

    avg_us_temperature_spark_df = avg_us_temperature_spark_df \
        .withColumnRenamed("Country", "country") \
        .withColumnRenamed("City", "city") \
        .withColumnRenamed("Latitude", "latitude") \
        .withColumnRenamed("Longitude", "longitude") \
        .withColumnRenamed("AverageTemperature", "avg_temperature")

    avg_us_temperature_spark_df = avg_us_temperature_spark_df \
        .withColumn('city_state_code',
                    F.concat_ws(', ', F.upper(F.col('city')), F.upper(F.col('state_code'))))

    avg_us_temperature_spark_df = avg_us_temperature_spark_df.select(
        'month', 'day', 'city', 'state', 'state_code', 'city_state_code',
        'avg_temperature').distinct()

    # output clean data
    data_output_full_file_path = f'{output_dir}/USCitiesTemperaturesByMonth.parquet'
    avg_us_temperature_spark_df \
        .write \
        .options(encoding="ISO-8859-1") \
        .mode('overwrite') \
        .parquet(data_output_full_file_path)
def process_immigration(spark, input_bucket, output_bucket):
    '''
    Processing Immigration Data

    Arguments:
        spark = Spark Cluster
        input_bucket = S3 Bucket name - Input
        output_bucket = S3 Bucket name - Output
    '''
    ## Loading Data
    df = spark.read.parquet('{}data/*.parquet'.format(input_bucket))

    ## Loading Dicts from Bucket
    # Load Port Dict
    df_port = spark.read.json(
        '{}data/i94prtl.json'.format(input_bucket)).toPandas()
    df_port = spark.createDataFrame(df_port.transpose().reset_index()
                                    .rename(columns={'index': 'i94port', 0: 'Port'}))
    # Load Origin Dict
    df_country = spark.read.json(
        '{}data/i94cntyl.json'.format(input_bucket)).toPandas()
    df_country = spark.createDataFrame(df_country.transpose().reset_index()
                                       .rename(columns={'index': 'i94cit', 0: 'Origin_Country'}))
    # Load Travelmode Dict
    df_mode = spark.read.json(
        '{}data/i94model.json'.format(input_bucket)).toPandas()
    df_mode = spark.createDataFrame(df_mode.transpose().reset_index()
                                    .rename(columns={'index': 'i94mode', 0: 'Travelmode'}))
    # Load Destination Dict
    df_dest = spark.read.json(
        '{}data/i94addrl.json'.format(input_bucket)).toPandas()
    df_dest = spark.createDataFrame(df_dest.transpose().reset_index()
                                    .rename(columns={'index': 'i94addr', 0: 'Destination'}))
    # Load Visa Dict
    df_visa = spark.read.json(
        '{}data/i94visa.json'.format(input_bucket)).toPandas()
    df_visa = spark.createDataFrame(df_visa.transpose().reset_index()
                                    .rename(columns={'index': 'i94visa', 0: 'Visa'}))

    ## Joining Data
    # Join Port
    df = df.join(df_port, df.i94port == df_port.i94port, how='left').drop(df_port.i94port)
    # Join Country
    df = df.join(df_country, df.i94cit == df_country.i94cit, how='left').drop(df_country.i94cit)
    # Join Travelmode
    df = df.join(df_mode, df.i94mode == df_mode.i94mode, how='left').drop(df_mode.i94mode)
    # Join Destination
    df = df.join(df_dest, df.i94addr == df_dest.i94addr, how='left').drop(df_dest.i94addr)
    # Join Visa Data
    df = df.join(df_visa, df.i94visa == df_visa.i94visa, how='left').drop(df_visa.i94visa)

    ## Cleaning Data
    # Dropping Columns
    col_to_drop = [
        'visapost', 'occup', 'entdepu', 'count', 'admnum', 'matflag',
        'dtaddto', 'insnum', 'fltno', 'i94cit', 'i94mode', 'i94addr', 'i94visa'
    ]
    df = df.drop(*col_to_drop)

    # Strip Whitespaces
    df_clean = df.withColumn('Port', trim(col('Port')))

    # Clean Invalid Ports
    df_port_clean = df_clean.filter(~col('Port').contains('No PORT Code'))\
        .filter(~col('Port').contains('Collapsed'))\
        .filter(col('Port').isNotNull())

    # Invalid Country Codes
    df_country_clean = df_port_clean.filter(~col('Origin_Country').contains('INVALID'))\
        .filter(~col('Origin_Country').contains('Collapsed'))\
        .filter(col('Origin_Country').isNotNull())

    # Remove Null-Values in Travelmode
    df_travelmode_clean = df_country_clean.filter(
        col('Travelmode').isNotNull())

    # Remove Null-Values in Destination
    df_destination_clean = df_travelmode_clean.filter(
        col('Destination').isNotNull())

    # Get City Names
    df_city = df_destination_clean.withColumn(
        "City", initcap(split(col("Port"), ',')[0])).drop('Port')

    # Quality Check
    if df_city.count() < 1:
        raise AssertionError('Error in ETL; Immigration ROW-Count is too low!')
    else:
        logging.info('Quality Check succeeded')

    ## Writing Data
    df_city.write.parquet(
        '{}processed/cleaned_immigration'.format(output_bucket), mode='overwrite')
    print('Data cleaned and saved')
# COMMAND ----------

df.stat.freqItems(["StockCode", "Quantity"]).show()

# COMMAND ----------

from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(2)

# COMMAND ----------

from pyspark.sql.functions import initcap
df.select(initcap(col("Description"))).show()

# COMMAND ----------

from pyspark.sql.functions import lower, upper
df.select(col("Description"),
          lower(col("Description")),
          upper(lower(col("Description")))).show(2)

# COMMAND ----------

from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit(" HELLO ")).alias("ltrim"),
"frequent items" print(retail_df.stat.freqItems(['Quantity', 'UnitPrice']).show()) "approx quantile" colName = "UnitPrice" quantileProbs = [0.5] relError = 0.05 print(retail_df.stat.approxQuantile("UnitPrice", quantileProbs, relError)) #%% " string manipulations " "changing column case" from pyspark.sql.functions import initcap, upper, lower, col print( retail_df.select(initcap(col("Description")), lower(col('Description')), upper(col('Description'))).show(5, False)) #%% "regex replace" from pyspark.sql.functions import regexp_replace regex_string = "BLACK|WHITE|RED|GREEN|BLUE" print( retail_df.select( regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"), col("Description")).show(2)) #%% "regex translate" from pyspark.sql.functions import translate, lit print( retail_df.select(translate(lit("Descreeption"), "leet", "1627"),
def compile_capitalize(t, expr, scope, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    return F.initcap(src_column)
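This compile rule just translates the expression's argument column and wraps it in Spark's built-in initcap. For reference, a standalone equivalent in plain PySpark (a minimal sketch outside any compiler framework, with assumed toy data) is:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("hello spark",)], ["s"])

# initcap uppercases the first letter of each whitespace-separated word: "Hello Spark"
df.select(F.initcap(F.col("s")).alias("s")).show()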
def process_i94_cit_res_data(spark: SparkSession, df_right: DataFrame) -> DataFrame:
    """
    Processes mapping file between countries_of_the_world.csv and i94_data.parquet
    """
    # Filename of input file
    filename = 's3://data-eng-capstone-cf/staging/i94_cit_res_data.csv'

    # Read into a spark dataframe
    df = spark.read.csv(filename, header=True)

    # Cast country_id as an IntegerType()
    df = df.withColumn('country_id', df.country_id.cast(IntegerType()))

    # Make country unique, by appending '(<country_id>)' to string name
    # Only do when country equals INVALID: STATELESS or INVALID: UNITED STATES
    df = df.withColumn(
        'country',
        F.when(df.country.isin('INVALID: STATELESS', 'INVALID: UNITED STATES'),
               F.concat(df.country, F.lit(' ('), df.country_id, F.lit(')'))).otherwise(df.country))

    # Add foreign key column so can join to df_cow
    df = df.withColumn('country_join', F.initcap('country'))

    # Manual adjustments
    df = df.withColumn(
        'country_join',
        F.when(df.country == 'MEXICO Air Sea, and Not Reported (I-94, no land arrivals)', 'Mexico')
        .when(df.country == 'ANTIGUA-BARBUDA', 'Antigua & Barbuda')
        .when(df.country == 'BAHAMAS', 'Bahamas, The')
        .when(df.country == 'BOSNIA-HERZEGOVINA', 'Bosnia & Herzegovina')
        .when(df.country == 'BRITISH VIRGIN ISLANDS', 'British Virgin Is.')
        .when(df.country == 'CENTRAL AFRICAN REPUBLIC', 'Central African Rep.')
        .when(df.country == 'GAMBIA', 'Gambia, The')
        .when(df.country == 'GUINEA-BISSAU', 'Guinea-Bissau')
        .when(df.country == 'MAYOTTE (AFRICA - FRENCH)', 'Mayotte')
        .when(df.country == 'MICRONESIA, FED. STATES OF', 'Micronesia, Fed. St.')
        .when(df.country == 'NORTH KOREA', 'Korea, North')
        .when(df.country == 'SOUTH KOREA', 'Korea, South')
        .when(df.country == 'ST. HELENA', 'Saint Helena')
        .when(df.country == 'ST. KITTS-NEVIS', 'Saint Kitts & Nevis')
        .when(df.country == 'ST. LUCIA', 'Saint Lucia')
        .when(df.country == 'ST. PIERRE AND MIQUELON', 'St Pierre & Miquelon')
        .when(df.country == 'ST. VINCENT-GRENADINES', 'Saint Vincent and the Grenadines')
        .when(df.country == 'TRINIDAD AND TOBAGO', 'Trinidad & Tobago')
        .when(df.country == 'TURKS AND CAICOS ISLANDS', 'Turks & Caicos Is')
        .when(df.country == 'WALLIS AND FUTUNA ISLANDS', 'Wallis and Futuna')
        .when(df.country == 'CHINA, PRC', 'China')
        .otherwise(df.country_join))

    # Define country_fk via left outer join
    df = df.join(df_right, df.country_join == df_right.country, how='left') \
        .select('country_id', df.country, df_right.country.alias('country_fk'))

    # Check schema and count
    df.printSchema()
    df.count()

    # Return transformed dataframe
    return df