def time_series_dataframe(dataframe, time_field):
    # Parameters:
    #   dataframe: must be a Spark DataFrame
    #   time_field: string value; it must match a single column name
    # About null values: this function does not consider null values in its calculations
    # Returns a pandas DataFrame indexed and sorted by timestamp.

    # Importing libraries and modules
    import pyspark.sql.functions as F

    # Drop null values, create a 'timestamp' column, drop the original time_field,
    # convert the Spark DataFrame to pandas, set 'timestamp' as the index and sort it.
    df = (dataframe.dropna(how='any', subset=time_field)
          .withColumn('timestamp', F.to_utc_timestamp(time_field, tz='UTC'))
          .drop(time_field)
          .toPandas()
          .set_index('timestamp')
          .sort_index())
    return df
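# A minimal usage sketch for time_series_dataframe, assuming a local SparkSession;
# the column name 'event_time' and the sample rows are illustrative assumptions,
# not taken from the original source.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
events_df = spark.createDataFrame(
    [('2021-01-02 10:00:00',), ('2021-01-01 09:30:00',), (None,)],
    ['event_time'])
# Returns a pandas DataFrame indexed by the new 'timestamp' column, with null rows dropped.
pandas_df = time_series_dataframe(events_df, 'event_time')
print(pandas_df.index)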
def to_timestamp(obj, column, tzone='UTC'):
    f = F.col(column)
    datecol_type = obj.select(column).dtypes[0][1]
    if datecol_type not in ['timestamp', 'date']:
        f = F.to_timestamp(f)
    if tzone != 'UTC':
        f = F.to_utc_timestamp(f, tzone)
    return f
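# Hedged usage sketch for the to_timestamp helper above: the DataFrame 'orders_df' and its
# string column 'order_date' are illustrative assumptions. The helper returns a Column
# expression, so it is typically used inside withColumn; it expects
# 'import pyspark.sql.functions as F' to be in scope where it is defined.
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
orders_df = spark.createDataFrame([('2021-03-01 12:00:00',)], ['order_date'])
orders_df = orders_df.withColumn(
    'order_ts_utc', to_timestamp(orders_df, 'order_date', tzone='Europe/Vienna'))
orders_df.printSchema()  # order_ts_utc is TimestampType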
def clean_IOT_data(iot_df: DataFrame) -> DataFrame:
    '''Clean the IOT data'''
    # convert unix timestamp (milliseconds) to timestamp
    iot_df = iot_df.withColumn(
        'time',
        f.to_utc_timestamp(
            f.from_unixtime(f.col('timestamp') / 1000, 'yyyy-MM-dd HH:mm:ss'),
            'Europe/Amsterdam'))
    iot_df = iot_df.drop(*['cca2', 'cn', 'timestamp', 'scale'])
    iot_df = (iot_df.withColumn('Date', f.to_date('time'))
              .withColumn('Month', f.month('Date'))
              .withColumn('Year', f.year('Date')))
    return iot_df
def local_to_utc(self, localTimeZone, subset=None):
    if subset is not None:
        tsCol = (x for x in self.inputSchema
                 if str(x.dataType) == "TimestampType" and x.name in subset)
    else:
        tsCol = (x for x in self.inputSchema
                 if str(x.dataType) == "TimestampType")
    for x in tsCol:
        self.inputDf = self.inputDf.withColumn(
            x.name, to_utc_timestamp(col(x.name), localTimeZone))
    return self.inputDf
def changeTimezone(self, fromTimezone, toTimezone, subset=None):
    if subset is not None:
        tsCol = (x for x in self.inputSchema
                 if str(x.dataType) == "TimestampType" and x.name in subset)
    else:
        tsCol = (x for x in self.inputSchema
                 if str(x.dataType) == "TimestampType")
    for x in tsCol:
        self.inputDf = self.inputDf.withColumn(
            x.name, to_utc_timestamp(col(x.name), fromTimezone))
        self.inputDf = self.inputDf.withColumn(
            x.name, from_utc_timestamp(col(x.name), toTimezone))
    return self.inputDf
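# Standalone sketch of the same two-step conversion pattern used by changeTimezone above
# (normalize to UTC from the source zone, then shift to the target zone); the DataFrame
# and column names here are illustrative assumptions, independent of the class this
# method belongs to.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, to_utc_timestamp, from_utc_timestamp

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([('2021-01-01 12:00:00',)], ['event_time'])
df = df.withColumn('event_time', to_timestamp('event_time'))
df = df.withColumn('event_time', to_utc_timestamp(col('event_time'), 'Europe/Amsterdam'))
df = df.withColumn('event_time', from_utc_timestamp(col('event_time'), 'Asia/Tokyo'))
df.show(truncate=False)  # 12:00 Amsterdam wall-clock time rendered as Tokyo wall-clock time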
def clean_data(self, conversions, attribution):
    # Handle missing values
    attribution = attribution.dropna(how='any')
    conversions = conversions.dropna(how='any')
    Data = attribution.join(conversions, on=['Conv_ID'], how='inner')

    # Change Conv_Date to datetime format
    timeFmt = "yyyy-MM-dd"
    Data = Data.withColumn(
        'Conv_Date',
        to_utc_timestamp(
            unix_timestamp(col('Conv_Date'), timeFmt).cast('timestamp'), 'UTC'))
    return Data
def test_generate_select_expression_for_TimestampMonth_spark2(
        self, input_value, value, spark_session, spark_context):
    source_key, name = "day_of_birth", "birthday"
    input_df = get_input_df(spark_session, spark_context, source_key, input_value)
    input_df = input_df.withColumn(
        source_key,
        F.to_utc_timestamp(input_df["attributes"]["data"][source_key], "yyyy-MM-dd"))
    result_column = custom_types._generate_select_expression_for_TimestampMonth(
        source_column=input_df[source_key], name=name)
    output_df = input_df.select(result_column)

    assert output_df.schema.fieldNames() == [name], "Renaming of column"
    assert output_df.schema[name].dataType.typeName() == "timestamp", "Casting of column"

    output_value = output_df.first()[name]
    if output_value:
        output_value = datetime.date.strftime(output_value, "%Y-%m-%d")
    else:
        output_value = None
    assert output_value == value, "Processing of column value"
df_nycflights.minute))
df_nycflights.show()

df_nycflights = df_nycflights. \
    withColumn('timestamp',
               concat_ws(' ', df_nycflights.date, df_nycflights.time))
df_nycflights.show()

'''Next we will convert the StringType to TimestampType'''
df_nycflights = df_nycflights. \
    withColumn('timestamp', to_utc_timestamp(df_nycflights.timestamp, 'GMT'))
df_nycflights.show()

'''Next we make the timestamp column the first column'''
'''Note: columns will return all column names'''
df_nycflights = df_nycflights. \
    select(df_nycflights.columns[-1:] + df_nycflights.columns[0:-1])
df_nycflights.show()

'''Now we drop the year, month, day, hour, minute, date and time columns, as we will
again try to create these from the timestamp column that we created'''
df_nycflights = df_nycflights. \
    drop('year'). \
    drop('month'). \
from random import randint

from pyspark.sql import functions as F

SCHEMA_VERSION = "1"
INPUT_FILE_NAME = "schema_v{nr}.json".format(nr=SCHEMA_VERSION)
DATE_COLUMNS_TO_CONVERT = [("attributes.birthday", "birthday")]

rdd_text = spark.sparkContext.textFile(INPUT_FILE_NAME)
rdd_text.coalesce(2).saveAsTextFile(
    "schema_v{nr}/textFiles".format(nr=SCHEMA_VERSION))

rdd_seq = rdd_text.map(lambda x: (randint(1000, 100000), bytearray(x, "utf-8")))
rdd_seq.coalesce(2).saveAsSequenceFile(
    "schema_v{nr}/sequenceFiles".format(nr=SCHEMA_VERSION))

df = spark.read.json(rdd_text)
for input_col, output_col in DATE_COLUMNS_TO_CONVERT:
    df = df.withColumn(
        output_col,
        F.to_utc_timestamp(timestamp=F.to_timestamp(df[input_col]), tz="Europe/Vienna"))
df.coalesce(2).write.parquet(
    "schema_v{nr}/parquetFiles".format(nr=SCHEMA_VERSION))
def main():
    """
    Get observations near locations from the SmartMet Server.

    Data start and end time and timestep are fetched from the data.
    The dataset is assumed to be coherent in terms of time and locations,
    i.e. the timestep is assumed to be constant between start and end time.
    """
    log1 = logging.getLogger("driver")

    output_directory = 'gs://{}/hadoop/tmp/bigquery/pyspark_output'.format(bucket)
    output_files = output_directory + '/part-*'

    # The train stations data is in stations.json
    # The train types and their delays are in gratu_a_b_2010-14.csv
    JSON_PATH = "gs://trains-data/data/stations.json"
    CSV_PATH = "gs://trains-data/data/full/gratu_a_b_2010-14.csv"

    train_stations_df = spark.read \
        .json(JSON_PATH)

    # parameters for weather data to be fetched from the SmartMet server
    params, names = read_parameters('parameters_shorten.txt')

    # base URL for the surface data
    baseurl = 'http://data.fmi.fi/fmi-apikey/9fdf9977-5d8f-4a1f-9800-d80a007579c9/timeseries?format=ascii&separator=,&producer=fmi&tz=local&timeformat=xml&timestep=60&numberofstations=5&maxdistance=100000&param={params}'.format(params=','.join(params))

    urlist = train_stations_df.rdd.flatMap(
        lambda x: ['%s#%s&latlons=%s,%s' % (x.stationShortCode, baseurl, x.latitude, x.longitude)]).repartition(16)

    data = urlist.map(read_from_URL)\
        .filter(lambda x: x != -1)\
        .flatMap(lambda x: x.splitlines())\
        .map(lambda x: x.split(','))

    newColumns = names + ["trainstation"]
    schemaString = ' '.join(str(x) for x in newColumns)
    fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
    schema = StructType(fields)

    # Apply the schema to the RDD.
    station_weather_df = spark.createDataFrame(data, schema)
    station_weather_df = station_weather_df.withColumn(
        "time", to_utc_timestamp(station_weather_df.time, "%Y-%m-%dT%H"))

    # calculate max_precipitation3h and max_precipitation6h
    col = "max_precipitation1h"
    # change the "no precipitation" value -1.0 to 0.0
    station_weather_df = station_weather_df.withColumn(
        col, f.when(station_weather_df[col] == -1.0, 0.0).otherwise(station_weather_df[col]))

    # use window functions to calculate the precipitation for the
    # previous 3 hours and 6 hours
    w3 = w.partitionBy("trainstation")\
        .orderBy(station_weather_df["time"])\
        .rowsBetween(-2, 0)
    station_weather_df = station_weather_df.withColumn(
        "max_precipitation3h", f.sum("max_precipitation1h").over(w3))

    w6 = w.partitionBy("trainstation")\
        .orderBy(station_weather_df["time"])\
        .rowsBetween(-5, 0)
    station_weather_df = station_weather_df.withColumn(
        "max_precipitation6h", f.sum("max_precipitation1h").over(w6))

    # making the surface observation dataframe
    cols = station_weather_df.columns  # list of all columns
    for col in cols:
        station_weather_df = station_weather_df.fillna({col: "-99"})
        station_weather_df = station_weather_df.withColumn(
            col,
            f.when(station_weather_df[col].isin("null", "nan", "NaN", "NULL"), "-99")
            .otherwise(station_weather_df[col]))
    log1.info("Retrieved surface data")

    ## Get flash data
    baseurl = 'http://data.fmi.fi/fmi-apikey/9fdf9977-5d8f-4a1f-9800-d80a007579c9/timeseries?param=time,peak_current&producer=flash&tz=local&timeformat=xml&format=ascii&separator=,'
    urlist = train_stations_df.rdd.flatMap(
        lambda x: ['%s#%s&latlon=%s,%s:30' % (x.stationShortCode, baseurl, x.latitude, x.longitude)])

    data = urlist.map(getFlash)\
        .filter(lambda x: x != -1)\
        .flatMap(lambda x: x.splitlines())\
        .map(lambda x: x.split(','))

    schemaString = 'time peakcurrent trainstation'
    fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
    schema = StructType(fields)
    flash_df = spark.createDataFrame(data, schema)
    flash_df = flash_df.withColumn("time", to_utc_timestamp(flash_df.time, "%Y%m%dT%HMS"))

    # find the count of flashes in each hour
    extended = (flash_df
                .withColumn("date", f.col("time").cast("date"))
                .withColumn("hour", f.hour(f.col("time"))))
    flash_aggs = extended.groupBy("trainstation", "date", "hour").count()
    flash_aggs = flash_aggs.withColumn(
        'time', f.concat(f.col("date"), f.lit("T"), f.col("hour")))
    flash = flash_aggs.withColumn(
        'time', to_utc_timestamp(flash_aggs.time, "%Y-%m-%dT%H")).select(
            "time", f.col("count").alias("flashcount"), "trainstation")
    log1.info("Retrieved flash data")

    # Combining surface and flash data
    cond = [flash.time == station_weather_df.time,
            flash.trainstation == station_weather_df.trainstation]
    station_weather_flash_df = station_weather_df.alias('a').join(
        flash.alias('b'), cond, 'outer').select('a.*', 'b.flashcount').fillna({'flashcount': '0'})

    # Reading the train type and delay data
    df = spark.read \
        .csv(CSV_PATH)

    # combining the date and time columns and selecting the relevant columns
    df = df.withColumn('t', f.concat(f.col("_c0"), f.lit("T"), f.col("_c1"))).select(
        "t", "_c3", "_c4", "_c9", "_c7", "_c5")

    # converting the time to utc timestamp and adding 1 hour
    df = df.withColumn('t', to_utc_timestamp(df.t, "%Y-%m-%dT%H") + f.expr('INTERVAL 1 HOUR'))
    trains_df = df.select(f.col("t").alias("time"),
                          f.col("_c3").alias("trainstation"),
                          f.col("_c4").alias("train_type"),
                          f.col("_c9").alias("train_count"),
                          f.col("_c7").alias("total_delay"),
                          f.col("_c5").alias("delay"))

    # Combining the weather data, both surface and flash, with
    # the train delay and type data
    cond = [trains_df.time == station_weather_flash_df.time,
            trains_df.trainstation == station_weather_flash_df.trainstation]
    trains_station_weather_flash_delay_df = trains_df.join(
        station_weather_flash_df, cond).drop(station_weather_flash_df.time).drop(
            station_weather_flash_df.trainstation)
    log1.info("Created the dataframe with train delay and weather observations. Finished!\n")

    # Saving the data to BigQuery
    (trains_station_weather_flash_delay_df
     .write.format('json').save(output_directory))

    # Shell out to the bq CLI to perform the BigQuery import.
    subprocess.check_call(
        'bq load --source_format NEWLINE_DELIMITED_JSON '
        '--replace '
        '--autodetect '
        '{dataset}.{table} {files}'.format(
            dataset=output_dataset,
            table=output_table,
            files=output_files
        ).split())

    # Manually clean up the staging directories, otherwise BigQuery
    # files will remain indefinitely.
    output_path = spark._jvm.org.apache.hadoop.fs.Path(output_directory)
    output_path.getFileSystem(spark._jsc.hadoopConfiguration()).delete(
        output_path, True)

    elapsed_time = time.time() - start_time
    log1.info("Elapsed time to retrieve train delay and observation data and save to bq {:10.3f}".format(elapsed_time))
# create spark session
my_spark = SparkSession\
    .builder\
    .appName("SpaceBar")\
    .config("spark.mongodb.input.uri", input_uri)\
    .config("spark.mongodb.output.uri", output_uri)\
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:2.4.2')\
    .getOrCreate()

# read data from mongodb
df = my_spark.read.format('com.mongodb.spark.sql.DefaultSource').load()
df = df.withColumn('id', monotonically_increasing_id())
df = df.select('id', 'lang', 'timestamp_ms', 'source')

# convert timestamp
df2 = df.withColumn(
    'time',
    F.to_utc_timestamp(
        F.from_unixtime(F.col("timestamp_ms") / 1000, 'yyyy-MM-dd HH:mm:ss'), 'UTC'))

# select tool from source
df3 = df2.withColumn(
    'tool',
    when(col('source').contains('tweetdeck'), 'Tweetdeck')
    .when(col("source").contains("mobile"), "mobile")
    .when(col("source").contains("android"), "android")
    .when(col("source").contains("iPhone"), "iPhone")
    .when(col("source").contains("Android"), "android")
    .when(col("source").contains("tweetbot"), "Tweetbot")
    .when(col("source").contains("iPad"), "iPad")
    .otherwise("Unknown"))

df4 = df3.select('time', 'timestamp_ms', 'lang', 'tool')
df4.show()

# export to csv
df4.toPandas().to_csv('data.csv')
savings_accountsDF = spark.read.json('data/savings_accounts/*.json')
savings_accountsDF.createOrReplaceTempView("savings_accountsView")
spark.sql("select * from savings_accountsView").show()

# create denormalized joined table view
denormalized = spark.sql(
    "SELECT A.ts as A_ts, A.id as A_id, A.op as A_op, A.data as A_data, A.set as A_set, "
    "B.ts as B_ts, B.id as B_id, B.op as B_op, B.data as B_data, B.set as B_set, "
    "C.ts as C_ts, C.id as C_id, C.op as C_op, C.data as C_data, C.set as C_set "
    "FROM accountsView A "
    "FULL JOIN cardsView B ON B.ts = A.ts "
    "FULL JOIN savings_accountsView C ON C.ts = A.ts"
).cache()
denormalized.createOrReplaceTempView("denormalizedView")
spark.sql("select * from denormalizedView").show(truncate=False)

# convert millis to datetime
denormalized = denormalized.withColumn(
    "A_datetime",
    functions.to_utc_timestamp(
        functions.from_unixtime(functions.col("A_ts") / 1000, 'yyyy-MM-dd HH:mm:ss'), 'UTC'))
denormalized = denormalized.withColumn(
    "B_datetime",
    functions.to_utc_timestamp(
        functions.from_unixtime(functions.col("B_ts") / 1000, 'yyyy-MM-dd HH:mm:ss'), 'UTC'))
denormalized = denormalized.withColumn(
    "C_datetime",
    functions.to_utc_timestamp(
        functions.from_unixtime(functions.col("C_ts") / 1000, 'yyyy-MM-dd HH:mm:ss'), 'UTC'))

# get transactions
denormalized.select(
    "B_datetime", "B_set.credit_used", "C_datetime",
def to_gmt(df):
    return df.withColumn("timestamp", F.to_utc_timestamp("timestamp", df.time_zone))
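# Minimal sketch of calling to_gmt, assuming a DataFrame with a 'timestamp' column and a
# per-row 'time_zone' column (passing a Column as the timezone argument requires Spark 2.4+);
# the sample values and column contents are illustrative assumptions.
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
local_df = spark.createDataFrame(
    [('2021-06-01 12:00:00', 'Europe/Amsterdam'),
     ('2021-06-01 12:00:00', 'America/New_York')],
    ['timestamp', 'time_zone'])
local_df = local_df.withColumn('timestamp', F.to_timestamp('timestamp'))
to_gmt(local_df).show(truncate=False)  # the same wall-clock time maps to different UTC instants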
def my_count(ds):
    ds.agg(*[count(c).alias(c) for c in ds.columns]).show()

my_count(df)

df = df.dropna(how='any')
my_count(df)

# deal with date
from pyspark.sql.functions import to_utc_timestamp, unix_timestamp, lit, col

timeFmt = "MM/dd/yy HH:mm"
df = df.withColumn(
    'NewInvoiceDate',
    to_utc_timestamp(unix_timestamp(col('InvoiceDate'), timeFmt).cast('timestamp'), 'utc'))

# calculate total price
from pyspark.sql.functions import round
df = df.withColumn('TotalPrice', round(df.Quantity * df.UnitPrice, 2))

# calculate the time difference
from pyspark.sql.functions import min, max, sum, datediff
date_max = df.select(max('NewInvoiceDate')).toPandas()
current = to_utc_timestamp(
    unix_timestamp(lit(str(date_max.iloc[0][0])), 'yy-MM-dd HH:mm').cast('timestamp'), 'UTC')

# calculate duration
df = df.withColumn('Duration', datediff(lit(current), 'NewInvoiceDate'))
df.printSchema()

# Looking at the shape of the data

# In[17]:

df.show(5, truncate=False)

# Converting the [dropoff_datetime, pickup_datetime] columns from String to Timestamp.

# In[18]:

# Converted DataFrame (dfc)
dfc = df.withColumn(
    'dropoff_datetime',
    F.to_utc_timestamp('dropoff_datetime', "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")).withColumn(
    'pickup_datetime',
    F.to_utc_timestamp('pickup_datetime', "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"))
dfc.printSchema()

# Looking at the shape of the data, especially the dateTime fields

# In[19]:

dfc.show(5, False)

# Creating a view called trip.

# In[20]:
        return word
    else:
        return "nonTag"

## Converting date string format
def getDate(x):
    if x is not None:
        return str(
            datetime.strptime(x, '%a %b %d %H:%M:%S +0000 %Y').replace(
                tzinfo=pytz.UTC).strftime("%Y-%m-%d %H:%M:%S"))

date_fn = udf(getDate, StringType())

## Converting datatype in spark dataframe
df = df.withColumn("created_at", to_utc_timestamp(date_fn("created_at"), "UTC"))

words = df.select(
    explode(split(df.text, " ")).alias("word"),
    df.created_at.alias("timestamp"))

extract_tags_udf = udf(extract_tags, StringType())
resultDF = words.withColumn("tags", extract_tags_udf(words.word))\
    .withColumn("timestamp", words.timestamp)

windowedCounts = resultDF.where(resultDF.tags != "nonTag").groupBy(
    window(words.timestamp, window_length, sliding_interval),
    words.word).count().orderBy('window')

query = windowedCounts.writeStream\
    .outputMode("complete")\
    .format("console")\
        data = data.withColumn('label', col(labelCol))
        return data.select('features', 'label')
    elif indexCol and not labelCol:
        # for unsupervised learning
        return data.select(indexCol, 'features')
    elif not indexCol and not labelCol:
        # for unsupervised learning
        return data.select('features')

# deal with the InvoiceDate
from pyspark.sql.functions import to_utc_timestamp, unix_timestamp, lit, datediff, col

timeFmt = "MM/dd/yy HH:mm"
ds = ds.withColumn(
    'NewInvoiceDate',
    to_utc_timestamp(unix_timestamp(col('InvoiceDate'), timeFmt).cast('timestamp'), 'utc'))

# analyze missing value counts
import pyspark.sql.functions as F
df.select([F.count(F.when(F.isnull(c) | F.isnan(c), c)).alias(c) for c in df.columns])

# remove blank strings
df = df.filter("colName != ''")

# calculate outliers, drop outliers
cols = numeric_features
bounds = {}
for col in cols:
    quantiles = df.approxQuantile(col, [0.25, 0.75], 0.05)
    IQR = quantiles[1] - quantiles[0]
df_history = df_history.join(
    organizations_dropcol,
    df_history.encounters_provider == organizations_dropcol.organizations_id, how='left')
df_history = df_history.join(
    providers_dropcol,
    df_history.organizations_id == providers_dropcol.providers_organization, how='left')

## Calculate percentage of null values
print('Null values percentage in each column:')
print(null_perc(df_history))

## Drop duplicate value columns
drop_column_list = ['procedures_patient', 'medications_patient', 'immunizations_encounter',
                    'conditions_encounter', 'procedures_encounter', 'medications_encounter',
                    'imaging_studies_encounter', 'providers_organization', 'medications_payer_coverage',
                    'conditions_patient', 'conditions_code', 'procedures_reasondescription',
                    'encounters_provider', 'organizations_revenue', 'medications_base_cost',
                    'medications_totalcost', 'procedures_reasoncode']
df_history = df_history.select(
    [column for column in df_history.columns if column not in drop_column_list])

df_history = df_history.filter(df_history.encounters_encounterclass.isin(
    'inpatient', 'ambulatory', 'urgentcare', 'outpatient', 'emergency'))

## Convert data type from string to timestamp
df_history = df_history.withColumn("encounters_stop", to_utc_timestamp("encounters_stop", "GMT"))
df_history = df_history.withColumn("encounters_start", to_utc_timestamp("encounters_start", "GMT"))
print('encounters time duration values')
df_history.show(truncate=100)

## Subtract encounter start from encounter stop to calculate the admission duration for the patient
df_history = df_history.withColumn(
    "admission_duration", unix_timestamp("encounters_stop") - unix_timestamp("encounters_start"))
# df_history = df_history.withColumn("admission_duration", col("encounters_stop") - col("encounters_start"))
df_history.printSchema()
print('admission duration values')
df_history.show(truncate=100)

## Replace empty strings with required values
df_history = df_history.replace('', 'NM', 'patient_marital')
iot_df.filter(f.col('cca3').isNull() | f.col('device_id').isNull()).show()
# no nulls for country or id

# COMMAND ----------

iot_df[['device_id']].distinct().count()
# the number of distinct devices equals the number of rows, so we only have one observation per device

# COMMAND ----------

# MAGIC %md
# MAGIC Note the many different ways in which we can create the timestamp column. Although they look different, their essence is the same, namely, they specify an expression. **A column in Spark is just the specification of an expression for a computation to be performed for each row when an action is triggered**.

# COMMAND ----------

# for instance, the following expression specifies a column
f.to_utc_timestamp(f.from_unixtime(iot_df['timestamp']/1000, 'yyyy-MM-dd HH:mm:ss'), 'Europe/Amsterdam')

# COMMAND ----------

iot_df = iot_df.withColumn('time', f.to_utc_timestamp(f.from_unixtime(f.col('timestamp')/1000, 'yyyy-MM-dd HH:mm:ss'), 'Europe/Amsterdam'))

# these also work, and are wholly equivalent:
# iot_df.select(f.expr('*'), f.to_utc_timestamp(f.from_unixtime(iot_df['timestamp']/1000, 'yyyy-MM-dd HH:mm:ss'), 'Europe/Amsterdam').alias('time'))
# iot_df.select(f.expr('*'), f.expr("to_utc_timestamp(from_unixtime(timestamp/1000, 'yyyy-MM-dd HH:mm:ss'), 'Europe/Amsterdam') AS time"))
# iot_df.selectExpr('*', "to_utc_timestamp(from_unixtime(timestamp/1000, 'yyyy-MM-dd HH:mm:ss'), 'Europe/Amsterdam') AS time")

# HOWEVER, this does not work, as dataframes are immutable. While the above specify a (lazy) transformation,
# the command below tries to make an assignment:
# iot_df['timestamp'] = f.to_utc_timestamp(f.from_unixtime(iot_df['timestamp']/1000, 'yyyy-MM-dd HH:mm:ss'), 'Europe/Amsterdam')
# the right-hand side of the assignment is fine; it's the assignment that's the problem
ax.set_xticklabels(df.columns, rotation='vertical', fontsize=7)
plt.title('Missing Value Occurrence')
plt.show()

## Delete rows with null values
from pyspark.sql.functions import count

def data_count(inp):
    inp.agg(*[count(i).alias(i) for i in inp.columns]).show()

data_count(df)      # raw data
df_new = df.dropna(how='any')
data_count(df_new)  # after we drop rows with null values

## Change datetime format
from pyspark.sql.functions import to_utc_timestamp, unix_timestamp, lit, datediff, col

time_format = "MM/dd/yy HH:mm"
df_new = df_new.withColumn(
    'NewInvoiceDate',
    to_utc_timestamp(unix_timestamp(col('InvoiceDate'), time_format).cast('timestamp'), 'UTC'))
df_new.show(5)

## Calculate total price and create the column
from pyspark.sql.functions import round
df_new = df_new.withColumn('TotalPrice', round(df.Quantity * df.UnitPrice, 2))

## Calculate time diff
from pyspark.sql.functions import mean, min, max, sum
date_max = df_new.select(max('NewInvoiceDate')).toPandas()

## Calculate duration
df_new = df_new.withColumn('Duration', datediff(lit(date_max.iloc[0][0]), 'NewInvoiceDate'))
df_new.show(5)

## Calculate rfm
def _add_times(self, iot_df: DataFrame) -> DataFrame:
    iot_df = iot_df.withColumn(
        'time',
        f.to_utc_timestamp(
            f.from_unixtime(f.col('timestamp') / 1000, 'yyyy-MM-dd HH:mm:ss'),
            'Europe/Amsterdam'))
    return iot_df
                   trim(col("wind_direction")).cast("int").alias("wind_direction"))\
    .withColumn("ID", regexp_replace("ID", "^0+", ""))\
    .repartition(30, "ob_time")

if debug:
    wx_df.printSchema()

# Observation time in the rainfall data is recorded for rainfall in the last 1 hour.
# To offset the hour difference, we use UTC+1 as the timezone so that the time in the dataframe is one hour later.
rain_df = rainhrly_df.select(trim(col("src_id")).alias("SRC_ID"),
                             trim(col("id_type")).alias("ID_T"),
                             trim(col("met_domain_name")).alias("MET_DOMA"),
                             trim(col("id")).alias("ID"),
                             to_utc_timestamp(trim(col("ob_end_time")), "GMT+1").alias("ob_time"),
                             trim(col("prcp_amt")).cast("float").alias("prcp_amt"),
                             trim(col("prcp_dur")).cast("float").alias("prcp_dur"))\
    .withColumn("ID", regexp_replace("ID", "^0+", ""))\
    .filter("MET_DOMA = 'SREW'")\
    .repartition(30, "ob_time")

if debug:
    rain_df.printSchema()
    # rain_df.show()

combined_weather_df = wx_df.join(rain_df, ["SRC_ID", "ob_time"], "inner")\
    .select(["ob_time", "air_temperature", "wind_speed", "wind_direction",
             "prcp_amt", "prcp_dur",
             year("ob_time").alias("year"),