def recordstream(df, epoch_id):
    # First split the raw 'value' column to get the timestamp and temperature readings,
    # then parse the timestamp into a proper datetime so a window can be applied on it.
    split_col = F.split(df.value, ',')
    df = df.withColumn(
        'TimeStamp',
        F.to_timestamp(F.regexp_replace(split_col.getItem(0), '"', ''),
                       'yyyy-MM-dd HH:mm:ss'))
    df = df.withColumn('RT_Temp', split_col.getItem(1).cast(tp.DoubleType()))
    df = df.withColumn(
        'Nu_Temp',
        F.regexp_replace(split_col.getItem(2), '"', '').cast(tp.DoubleType()))
    df = df.drop('value')

    # Save the input stream to the master data set.
    dfw = df.selectExpr('TimeStamp as ts', 'RT_Temp', 'Nu_Temp')
    dfw.write.saveAsTable(name='tsa.turbine_master',
                          format='hive',
                          mode='append')

    dfp = df.select('TimeStamp', 'RT_Temp', 'Nu_Temp')
    if len(dfp.take(1)) != 0:
        # print('Calling Predictions & Model path is', g_model)
        df_final = fitVar(2, dfp, g_model)
        df_final.show(5)
        df_final.write.saveAsTable(name='tsa.batch_predictions',
                                   format='hive',
                                   mode='append')
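# Illustrative only (not part of the source): recordstream() is written as a
# foreachBatch sink, so it would be attached to a streaming DataFrame roughly like
# this. The Kafka broker address, source topic and checkpoint path are hypothetical
# placeholders.
def start_record_stream(spark):
    stream_df = (spark.readStream
                 .format("kafka")
                 .option("kafka.bootstrap.servers", "localhost:9092")
                 .option("subscribe", "turbine_raw")
                 .load()
                 .selectExpr("CAST(value AS STRING) AS value"))
    return (stream_df.writeStream
            .foreachBatch(recordstream)
            .option("checkpointLocation", "/tmp/tsa_master_checkpoint")
            .start())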
def preprocessing(spark: SparkSession, pppath: Path, datadir: str):
    print("--- preprocessing -----------------------")
    schema = T.StructType([
        T.StructField('year', T.IntegerType(), True),
        T.StructField('month', T.IntegerType(), True),
        T.StructField('dn', T.IntegerType(), True),
        T.StructField('wday', T.IntegerType(), True),
        T.StructField('snap', T.IntegerType(), True),
        T.StructField('dept_id', T.StringType(), True),
        T.StructField('item_id', T.StringType(), True),
        T.StructField('store_id', T.StringType(), True),
        T.StructField('sales', T.DoubleType(), True),
        T.StructField('flag_ram', T.IntegerType(), True),
        T.StructField('Sales_Pred', T.DoubleType(), True)
    ])
    csv_path = str(Path(datadir, "Sales5_Ab2011_InklPred.csv"))
    print(f"--- Reading: '{csv_path}'")
    sales5: DataFrame = spark.read.csv(csv_path, header='true', schema=schema) \
        .withColumn("label", F.col('sales'))
    ppdf = prepro(sales5)
    print(f"--- Writing: '{pppath}'")
    ppdf.write \
        .format("parquet") \
        .mode("overwrite") \
        .save(str(pppath))
def process_song_data(spark, input_bucket, output_data):
    """Reads the songs dataset and transforms it, creating the artists and songs
    tables as parquet files."""
    song_data = get_files_paths_s3(input_bucket, "song_data")
    # song_data = get_local_song_data()

    # Specify the schema explicitly to improve read speed.
    song_schema = T.StructType()\
        .add("num_songs", T.IntegerType())\
        .add("artist_id", T.StringType())\
        .add("artist_latitude", T.DoubleType())\
        .add("artist_longitude", T.DoubleType())\
        .add("artist_location", T.StringType())\
        .add("artist_name", T.StringType())\
        .add("song_id", T.StringType())\
        .add("title", T.StringType())\
        .add("duration", T.DoubleType())\
        .add("year", T.IntegerType())

    df = spark.read.schema(song_schema).json(song_data)

    songs_table = df.select(
        ["song_id", "title", "artist_id", "year", "duration"])
    songs_table.write.partitionBy("year", "artist_id")\
        .parquet("{}/songs_table.parquet".format(output_data), mode="overwrite")

    # Since the data is song based, there can be duplicated artists.
    artists_table = df.selectExpr(["artist_id",
                                   "artist_name as name",
                                   "artist_location as location",
                                   "artist_latitude as latitude",
                                   "artist_longitude as longitude"])\
        .dropDuplicates(["artist_id"])
    artists_table.write.parquet("{}artists_table.parquet".format(output_data),
                                mode="overwrite")
def import_twitter_data(spark_session, tweets_file_path):
    """Imports the twitter data and returns the resulting DataFrame.

    Args:
        spark_session -- An active SparkSession.
        tweets_file_path -- A file path.
    """
    tweets_schema = types.StructType([
        types.StructField('id', types.LongType()),
        types.StructField('timestamp', types.LongType(), nullable=False),
        types.StructField('postalCode', types.StringType()),
        types.StructField('lon', types.DoubleType(), nullable=False),
        types.StructField('lat', types.DoubleType(), nullable=False),
        types.StructField('tweet', types.StringType(), nullable=False),
        types.StructField('user_id', types.LongType()),
        types.StructField('application', types.StringType()),
        types.StructField('source', types.StringType())
    ])
    tweets_df = spark_session.read.csv(tweets_file_path,
                                       escape='"',
                                       header='true',
                                       schema=tweets_schema,
                                       mode='DROPMALFORMED')
    tweets_df = tweets_df.select(['timestamp', 'lon', 'lat', 'tweet'])
    return tweets_df
def predict(df, epoch_id):
    split_col = F.split(df.value, ',')
    # df = df.withColumn('TimeStamp',
    #                    F.to_timestamp(F.regexp_replace(split_col.getItem(0), '"', ''),
    #                                   'yyyy-mm-dd HH:mm:ss.SSS'))
    df = df.withColumn(
        'TimeStamp',
        F.regexp_replace(split_col.getItem(0), '"', '').cast(tp.TimestampType()))
    df = df.withColumn('RT_Temp', split_col.getItem(1).cast(tp.DoubleType()))
    df = df.withColumn(
        'Nu_Temp',
        F.regexp_replace(split_col.getItem(2), '"', '').cast(tp.DoubleType()))
    df = df.drop('value')

    dfw = df.select('TimeStamp', 'RT_Temp', 'Nu_Temp')
    if len(dfw.take(1)) != 0:
        df_final = fitVar(2, dfw)
        # Collapse the selected prediction columns into a single comma-separated
        # 'value' column so each row can be sent as one Kafka message.
        df_final = df_final.withColumn(
            'value',
            (F.concat(col("TS"), lit(","),
                      col("RT_Temp"), lit(","),
                      col("RT_Temp_Predict"), lit(","),
                      col("Nu_Temp"), lit(","),
                      col("Nu_Temp_Predict"), lit(","),
                      col("RMSE_Score"))).cast(tp.StringType()))
        ds = df_final.select('value')
        # ds.show(5)

        # Send each row of the DataFrame as a Kafka message.
        print('Now sending Message on Kafka topic', sink_topic)
        ds.selectExpr("CAST(value AS STRING)")\
            .write\
            .format("kafka")\
            .option("kafka.bootstrap.servers", broker)\
            .option("topic", sink_topic)\
            .save()
def spark():
    try:
        from pyspark import SparkContext, SparkConf
        from pyspark.sql import SparkSession
        from pyspark.sql import types

        conf = SparkConf()
        conf.set("spark.jars.ivy", "/home/jovyan/.ivy2/")
        conf.set("spark.driver.extraClassPath",
                 "jars/scala-udf-similarity-0.0.7.jar")
        conf.set("spark.jars", "jars/scala-udf-similarity-0.0.7.jar")
        conf.set("spark.driver.memory", "4g")
        conf.set("spark.sql.shuffle.partitions", "12")

        sc = SparkContext.getOrCreate(conf=conf)
        spark = SparkSession(sc)

        udfs = [
            ("jaro_winkler_sim", "JaroWinklerSimilarity", types.DoubleType()),
            ("jaccard_sim", "JaccardSimilarity", types.DoubleType()),
            ("cosine_distance", "CosineDistance", types.DoubleType()),
            ("Dmetaphone", "DoubleMetaphone", types.StringType()),
            ("QgramTokeniser", "QgramTokeniser", types.StringType()),
            ("Q3gramTokeniser", "Q3gramTokeniser", types.StringType()),
            ("Q4gramTokeniser", "Q4gramTokeniser", types.StringType()),
            ("Q5gramTokeniser", "Q5gramTokeniser", types.StringType()),
            ("DmetaphoneAlt", "DoubleMetaphoneAlt", types.StringType()),
        ]
        for a, b, c in udfs:
            spark.udf.registerJavaFunction(a, "uk.gov.moj.dash.linkage." + b, c)

        rt = types.ArrayType(
            types.StructType([
                types.StructField("_1", types.StringType()),
                types.StructField("_2", types.StringType()),
            ]))
        spark.udf.registerJavaFunction(
            name="DualArrayExplode",
            javaClassName="uk.gov.moj.dash.linkage.DualArrayExplode",
            returnType=rt,
        )
        SPARK_EXISTS = True
    except Exception:
        SPARK_EXISTS = False

    if SPARK_EXISTS:
        print("Spark exists, running spark tests")
        yield spark
    else:
        spark = None
        logger.error("Spark not available")
        print("Spark not available")
        yield spark
def main(link, key):
    crime_schema = schema_def()

    # An API request is made to retrieve data from Vancouver Open Data.
    urllib.request.urlretrieve(link, "Vancouver.zip")
    compressed_file = zipfile.ZipFile('Vancouver.zip')
    csv_file = compressed_file.open('crimedata_csv_all_years.csv')
    pd_crimes = pd.read_csv(csv_file)

    # Creation of the Spark DataFrame.
    df_crime_init = spark.createDataFrame(pd_crimes, schema=crime_schema).cache()

    # Tag the city as Vancouver.
    df_crime_init = df_crime_init.withColumn("city", functions.lit("Vancouver"))

    # UDF to apply a UUID to the entire dataframe.
    genuuid = functions.udf(lambda: str(uuid.uuid4()))
    df_crime_init = df_crime_init.withColumn("uuid", genuuid()).cache()

    # Change NaN values to 0 for numeric columns and filter out rows that have no
    # latitude and longitude.
    df_crime_init = df_crime_init.na.fill(0)
    df_crime = df_crime_init.where((df_crime_init["X"] > 0)
                                   | (df_crime_init["Y"] > 0))

    # Conversion of UTM coordinates to latitude and longitude.
    utm_udf_x = functions.udf(
        lambda x, y: utm.to_latlon(x, y, 10, 'U')[0].item(), types.DoubleType())
    utm_udf_y = functions.udf(
        lambda x, y: utm.to_latlon(x, y, 10, 'U')[1].item(), types.DoubleType())
    df_crime = df_crime.withColumn(
        'lat', utm_udf_x(functions.col('X'), functions.col('Y')))
    df_crime = df_crime.withColumn(
        'long', utm_udf_y(functions.col('X'), functions.col('Y')))

    # Create a new dataframe to store the records that do not have coordinates;
    # we need these for further study.
    df_crime_nan = df_crime_init.where(df_crime_init["X"] == 0.0)
    df_crime_nan = df_crime_nan.withColumn("lat", functions.lit(0.0))
    df_crime_nan = df_crime_nan.withColumn("long", functions.lit(0.0))

    # Union of both dataframes.
    df_crime_full = df_crime.union(df_crime_nan)

    # Read the Cassandra crime_type table to map subtypes to one common type.
    crimepred = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='crime_type', keyspace=key).load()
    crimepred.registerTempTable("crimetype")
    df_crime_full.registerTempTable("crime")
    df_table = spark.sql(
        "select c.city, t.type as crime_type, c.uuid, c.subtype as crimesub_type, "
        "c.hour, c.lat, c.long, c.month, c.neighbourhood, c.year "
        "from crime c left join crimetype t on c.subtype = t.sub_type")
    df_table = df_table.withColumn('count', functions.lit(1))
    df_na = df_table.where(df_table["crime_type"].isNull()).show()
    # df_crime_nan.show()
    print(df_table.count())

    # Loading of data into base_table.
    df_table.write.format("org.apache.spark.sql.cassandra").options(
        table='base_table', keyspace=key).mode('overwrite').save()
def setUp(self):
    sc = SparkContext.getOrCreate()
    sql_context = SQLContext(sc)
    struct_feat = [T.StructField('f1', T.FloatType())]
    struct_lab = [T.StructField('l1', T.StringType())]
    self.default_param_dict = {
        'algorithm': 'LogisticRegression',
        'elasticNetParam': (0.0, 0.5),
        'fitIntercept': True,
        'labelCol': 'label',
        'maxIter': (100, 150),
        'predictionCol': 'prediction',
        'probabilityCol': 'probability',
        'rawPredictionCol': 'rawPrediction',
        'regParam': (0.0, 0.5),
        'threshold': (0.0, 0.5),
        'tol': (1e-06, 0.01)
    }
    self.default_features = [
        T.StructField('AarsVaerk_1', T.DoubleType(), True),
        T.StructField('AarsVaerk_2', T.DoubleType(), True),
        T.StructField('AarsVaerk_3', T.DoubleType(), True)
    ]
    self.default_standard = True
    self.workflow = ExecuteWorkflowClassification(self.default_param_dict,
                                                  self.default_standard,
                                                  self.default_features)
def main():
    schema = types.StructType([
        types.StructField('station', types.StringType(), True),
        types.StructField('date', types.StringType(), True),
        types.StructField('element', types.StringType(), True),
        types.StructField('value1', types.DoubleType(), True),
        types.StructField('mflag', types.StringType(), True),
        types.StructField('qflag', types.StringType(), True),
    ])
    df = spark.read.csv(inputs, schema)
    getRange = functions.udf(get_dif, types.DoubleType())
    df_by_date = df.select('station', 'date', 'value1')\
        .where((df.element == 'TMAX') | (df.element == 'TMIN'))\
        .groupBy('station', 'date') \
        .agg(functions.collect_list('value1').alias('range'))\
        .withColumn('range', getRange('range'))
    df_by_date = df_by_date.where(df_by_date.range > 1).sort('date',
                                                             ascending=True)
    df_max = df_by_date.groupBy('date').max('range').select(
        'date', functions.col('max(range)').alias('range'))
    joined_df = df_max.join(df_by_date, ["date", "range"], 'inner')
    joined_df = joined_df.select('date', 'station', 'range')
    joined_df.show()
    if not os.path.exists(output):
        os.makedirs(output)
    joined_df.write.csv(output, sep=' ', mode='overwrite')
def sensor_schema():
    sen_schema = types.StructType([
        types.StructField('timestamp', types.StringType()),
        types.StructField('X', types.DoubleType()),
        types.StructField('Y', types.DoubleType()),
        types.StructField('Z', types.DoubleType()),
    ])
    return sen_schema
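# Illustrative usage only (not from the source): read sensor CSV files with the schema
# above; the path "sensor_data/" is a hypothetical placeholder.
def read_sensor_data(spark, path="sensor_data/"):
    return spark.read.csv(path, schema=sensor_schema(), header=True)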
def clean_input(dataframe, start, end):
    input_columns = [
        "client_id",
        "timestamp",
        "is_default_browser",
        "search_counts",
        "country",
        "profile_creation_date",
        "channel",
        "os",
        "hours",
    ]
    columns = {col: F.col(col) for col in input_columns}

    # normalize countries against a whitelist
    columns["country"] = (
        F.when(F.col("country").isin(countries), F.col("country"))
        .otherwise("Other")
        .alias("country")
    )

    # clean operating system based on CEP naming scheme
    pattern = {
        "Windows": ["Windows%", "WINNT%"],
        "Mac": ["Darwin%"],
        "Linux": ["%Linux%", "%BSD%", "%SunOS%"],
    }
    columns["os"] = column_like("os", pattern, "Other")

    # rename normalized_channel to channel
    columns["channel"] = F.col("normalized_channel")

    # convert profile creation date into seconds (days -> seconds)
    columns["profile_creation_date"] = (
        F.when(F.col("profile_creation_date") >= 0,
               F.col("profile_creation_date") * seconds_per_day)
        .otherwise(0.0)
        .cast(types.DoubleType())
    )

    # generate hours of usage from subsession length (seconds -> hours)
    columns["hours"] = (
        F.when((F.col("subsession_length") >= 0)
               & (F.col("subsession_length") < 180 * seconds_per_day),
               F.col("subsession_length") / seconds_per_hour)
        .otherwise(0.0)
        .cast(types.DoubleType())
    )

    # clean the dataset
    clean = (
        dataframe
        .where(F.col("submission_date_s3") >= start)
        .where(F.col("submission_date_s3") < end)
        .select([expr.alias(name) for name, expr in columns.items()])
    )
    return clean
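# column_like() is referenced in clean_input() but not shown in the source; this is a
# hypothetical sketch, assuming it maps a column value to the label whose SQL LIKE
# patterns match it and falls back to a default label otherwise.
from functools import reduce

def column_like(name, patterns, default):
    expr = F.lit(default)
    for label, likes in patterns.items():
        matched = reduce(lambda a, b: a | b,
                         [F.col(name).like(p) for p in likes])
        expr = F.when(matched, F.lit(label)).otherwise(expr)
    return expr.alias(name)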
def get_schema():
    return t.StructType([
        t.StructField("ReportAsOfEOD", t.StringType(), True),
        t.StructField("LoanID", t.StringType(), True),
        t.StructField("Date", t.StringType(), True),
        t.StructField("PrincipalRepayment", t.DoubleType(), True),
        t.StructField("InterestRepayment", t.DoubleType(), True),
        t.StructField("LateFeesRepayment", t.DoubleType(), True),
    ])
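# Illustrative usage only (not from the source): apply the schema above when reading the
# raw repayments CSV; the file name "Repayments.csv" is a hypothetical placeholder.
def read_repayments(spark, path="Repayments.csv"):
    return spark.read.csv(path, schema=get_schema(), header=True)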
def get_schema():
    return TableSchema(
        [
            t.StructField("UserName", t.StringType(), True),
            t.StructField("Loans", t.LongType(), False),
            t.StructField("TotalInterestRepayment", t.DoubleType(), True),
            t.StructField("TotalLateFeesRepayment", t.DoubleType(), True),
        ],
        primary_key="UserName",
    )
def process_columns(self, data_frame: DataFrame) -> DataFrame:
    # The raw conversion rates are fixed-point strings: the first 8 characters hold the
    # integer part and the last 7 characters hold the fractional part (scaled by 1e-7).
    return (data_frame
            .withColumn(
                "conversion_rate_multiplier",
                F.col("conversion_rate_multiplier").substr(1, 8).cast(T.IntegerType())
                + F.col("conversion_rate_multiplier").substr(9, 7).cast(T.DoubleType())
                * .0000001)
            .withColumn(
                "conversion_rate_divisor",
                F.col("conversion_rate_divisor").substr(1, 8).cast(T.IntegerType())
                + F.col("conversion_rate_divisor").substr(9, 7).cast(T.DoubleType())
                * .0000001)
            .drop("value"))
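# A small worked example (assumed illustration, not from the source) of the fixed-point
# decoding performed above: 8 integer digits followed by 7 fractional digits.
def _fixed_point_example():
    raw = "000000011234567"
    decoded = int(raw[:8]) + int(raw[8:15]) * 0.0000001
    assert abs(decoded - 1.1234567) < 1e-12
    return decoded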
def getSchema():  # noqa: N802
    return t.StructType(
        [
            t.StructField("COUNTYFP", t.IntegerType(), True),
            t.StructField("NEVER", t.DoubleType(), True),
            t.StructField("RARELY", t.DoubleType(), True),
            t.StructField("SOMETIMES", t.DoubleType(), True),
            t.StructField("FREQUENTLY", t.DoubleType(), True),
            t.StructField("ALWAYS", t.DoubleType(), True),
            t.StructField("INSERT_TS", t.TimestampType(), True),
        ]
    )
def pivot(trades, prices):
    """
    Pivot and fill the columns on the event id so that each row contains a column for
    each id + column combination, where the value is the most recent non-null value for
    that id. For example, given the above input tables the expected output is:

    +---+-------------+-----+-----+-----+--------+------+------+--------+-----------+------+------+--------+-----------+
    | id|    timestamp|  bid|  ask|price|quantity|10_bid|10_ask|10_price|10_quantity|20_bid|20_ask|20_price|20_quantity|
    +---+-------------+-----+-----+-----+--------+------+------+--------+-----------+------+------+--------+-----------+
    | 10|1546300799000| 37.5|37.51| null|    null|  37.5| 37.51|    null|       null|  null|  null|    null|       null|
    | 10|1546300800000| null| null| 37.5|   100.0|  37.5| 37.51|    37.5|      100.0|  null|  null|    null|       null|
    | 10|1546300801000| null| null|37.51|   100.0|  37.5| 37.51|   37.51|      100.0|  null|  null|    null|       null|
    | 10|1546300802000|37.51|37.52| null|    null| 37.51| 37.52|   37.51|      100.0|  null|  null|    null|       null|
    | 20|1546300804000| null| null|12.67|   300.0| 37.51| 37.52|   37.51|      100.0|  null|  null|   12.67|      300.0|
    | 10|1546300806000| 37.5|37.51| null|    null|  37.5| 37.51|   37.51|      100.0|  null|  null|   12.67|      300.0|
    | 10|1546300807000| null| null| 37.5|   200.0|  37.5| 37.51|    37.5|      200.0|  null|  null|   12.67|      300.0|
    +---+-------------+-----+-----+-----+--------+------+------+--------+-----------+------+------+--------+-----------+

    :param trades: DataFrame of trade events
    :param prices: DataFrame of price events
    :return: A DataFrame of the combined events and pivoted columns.
    """
    trades_prices = trades. \
        join(prices, ['id', 'timestamp'], 'outer'). \
        select('id', 'timestamp', 'bid', 'ask', 'price', 'quantity'). \
        orderBy(asc("timestamp"))

    unique_ids = trades_prices.select('id').distinct().collect()
    result = None
    for row in unique_ids:
        id = str(row.id)
        # Blank out the raw columns for other ids, then forward-fill the most recent
        # non-null value per id with a last() window ordered by timestamp.
        dyn_columns = trades_prices. \
            withColumn("bid", when(col("id") != row.id, lit(None).cast(T.DoubleType()))
                       .otherwise(lit(col('bid')).cast(T.DoubleType()))). \
            withColumn("ask", when(col("id") != row.id, lit(None).cast(T.DoubleType()))
                       .otherwise(lit(col('ask')).cast(T.DoubleType()))). \
            withColumn("price", when(col("id") != row.id, lit(None).cast(T.DoubleType()))
                       .otherwise(lit(col('price')).cast(T.DoubleType()))). \
            withColumn("quantity", when(col("id") != row.id, lit(None).cast(T.DoubleType()))
                       .otherwise(lit(col('quantity')).cast(T.DoubleType()))). \
            withColumn(id + "_id", when(col("id") == row.id, lit(id).cast(T.IntegerType()))
                       .otherwise(lit(id).cast(T.IntegerType()))). \
            withColumn(id + "_bid", func.last('bid', True).over(
                Window.partitionBy(id + "_id").orderBy('timestamp')
                .rowsBetween(-sys.maxsize, 0))). \
            withColumn(id + "_ask", func.last('ask', True).over(
                Window.partitionBy(id + "_id").orderBy('timestamp')
                .rowsBetween(-sys.maxsize, 0))). \
            withColumn(id + "_price", func.last('price', True).over(
                Window.partitionBy(id + "_id").orderBy('timestamp')
                .rowsBetween(-sys.maxsize, 0))). \
            withColumn(id + "_quantity", func.last('quantity', True).over(
                Window.partitionBy(id + "_id").orderBy('timestamp')
                .rowsBetween(-sys.maxsize, 0))). \
            drop('bid', 'ask', 'price', 'quantity', id + "_id")

        if result is None:
            result = trades_prices.join(dyn_columns, ['id', 'timestamp'],
                                        how='outer')
        else:
            result = result.join(dyn_columns, ['id', 'timestamp'], how='outer')

    return result.orderBy('timestamp')
def get_schema():
    return TableSchema(
        [
            t.StructField("ReportAsOfEOD", t.DateType(), True),
            t.StructField("LoanID", t.StringType(), True),
            t.StructField("Date", t.DateType(), True),
            t.StructField("PrincipalRepayment", t.DoubleType(), True),
            t.StructField("InterestRepayment", t.DoubleType(), True),
            t.StructField("LateFeesRepayment", t.DoubleType(), True),
        ],
        primary_key=["LoanID", "Date"],
        # partition_by="Date"  # takes a very long time
    )
def prepro(spark: SparkSession, datadir: Path, nam: str):
    def pp(s5: DataFrame) -> DataFrame:
        stages = []
        catvars = ['dept_id', 'item_id', 'store_id', 'wday']
        for v in catvars:
            stages += [StringIndexer(inputCol=v, outputCol=f"i{v}")]
        stages += [
            OneHotEncoderEstimator(inputCols=[f"i{v}" for v in catvars],
                                   outputCols=[f"v{v}" for v in catvars])
        ]
        stages += [
            VectorAssembler(inputCols=[
                'vwday', 'vitem_id', 'vdept_id', 'vstore_id', 'flag_ram',
                'snap', 'dn', 'month', 'year'
            ],
                            outputCol='features')
        ]
        pip: Pipeline = Pipeline(stages=stages)
        pipm = pip.fit(s5)
        df: DataFrame = pipm.transform(s5)
        return df.drop('idept_id', 'iitem_id', 'istore_id', 'iwday',
                       'vdept_id', 'vitem_id', 'vstore_id', 'vwday')

    print("--- preprocessing -----------------------")
    schema = stype.StructType([
        stype.StructField('year', stype.IntegerType(), True),
        stype.StructField('month', stype.IntegerType(), True),
        stype.StructField('dn', stype.IntegerType(), True),
        stype.StructField('wday', stype.IntegerType(), True),
        stype.StructField('snap', stype.IntegerType(), True),
        stype.StructField('dept_id', stype.StringType(), True),
        stype.StructField('item_id', stype.StringType(), True),
        stype.StructField('store_id', stype.StringType(), True),
        stype.StructField('sales', stype.DoubleType(), True),
        stype.StructField('flag_ram', stype.IntegerType(), True),
        stype.StructField('Sales_Pred', stype.DoubleType(), True),
    ])
    csv_path = datadir / "Sales5_Ab2011_InklPred.csv"
    print(f"--- Reading: '{csv_path}'")
    sales5: DataFrame = spark.read.csv(str(csv_path), header='true', schema=schema) \
        .withColumn("label", sfunc.col('sales'))
    ppdf = pp(sales5)
    print(f"--- Writing: '{nam}'")
    hlp.writeToDatadirParquet(ppdf, nam)
def load_prices(spark):
    data = [
        (10, 1546300799000, 37.50, 37.51),
        (10, 1546300802000, 37.51, 37.52),
        (10, 1546300806000, 37.50, 37.51),
    ]
    schema = T.StructType([
        T.StructField("id", T.LongType()),
        T.StructField("timestamp", T.LongType()),
        T.StructField("bid", T.DoubleType()),
        T.StructField("ask", T.DoubleType()),
    ])
    return spark.createDataFrame(data, schema)
def load_trades(spark):
    data = [
        (10, 1546300800000, 37.50, 100.000),
        (10, 1546300801000, 37.51, 100.000),
        (20, 1546300804000, 12.67, 300.000),
        (10, 1546300807000, 37.50, 200.000),
    ]
    schema = T.StructType([
        T.StructField("id", T.LongType()),
        T.StructField("timestamp", T.LongType()),
        T.StructField("price", T.DoubleType()),
        T.StructField("quantity", T.DoubleType()),
    ])
    return spark.createDataFrame(data, schema)
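# Illustrative usage only (not from the source): build the sample trade and price
# DataFrames above and run them through pivot(); `spark` is assumed to be an active
# SparkSession.
def run_pivot_example(spark):
    trades = load_trades(spark)
    prices = load_prices(spark)
    pivot(trades, prices).show(truncate=False)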
def weighted_predict(df, epoch_id):
    split_col = F.split(df.value, ',')
    # df = df.withColumn('TimeStamp',
    #                    F.to_timestamp(F.regexp_replace(split_col.getItem(0), '"', ''),
    #                                   'yyyy-mm-dd HH:mm:ss.SSS'))
    df = df.withColumn(
        'TimeStamp',
        F.regexp_replace(split_col.getItem(0), '"', '').cast(tp.TimestampType()))
    df = df.withColumn('RT_Temp', split_col.getItem(1).cast(tp.DoubleType()))
    df = df.withColumn('RT_Temp_Predict',
                       split_col.getItem(2).cast(tp.DoubleType()))
    df = df.withColumn('Nu_Temp', split_col.getItem(3).cast(tp.DoubleType()))
    df = df.withColumn('Nu_Temp_Predict',
                       split_col.getItem(4).cast(tp.DoubleType()))
    # RMSE_Score is the sixth comma-separated field (index 5) of the message
    # produced by predict().
    df = df.withColumn(
        'RMSE_Score',
        F.regexp_replace(split_col.getItem(5), '"', '').cast(tp.DoubleType()))
    df = df.drop('value')
    # df.show()

    sp_df = df.select('TimeStamp', 'RT_Temp', 'RT_Temp_Predict', 'Nu_Temp',
                      'Nu_Temp_Predict', 'RMSE_Score')\
        .where("topic='{}'".format(str(sp_topic)))
    bt_df = df.select('TimeStamp', 'RT_Temp', 'RT_Temp_Predict', 'Nu_Temp',
                      'Nu_Temp_Predict', 'RMSE_Score')\
        .where("topic='{}'".format(str(bl_topic)))
    # print("Speed Layer Predictions....")
    # sp_df.show(5)
    # print("Batch Layer Predictions....")
    # bt_df.show(5)

    df_final = (sp_df.alias('sp').join(
        bt_df.alias('bt'),
        on=sp_df['TimeStamp'] == bt_df['TimeStamp'],
        how='inner'
    ).selectExpr(
        'sp.TimeStamp as TS',
        'round(sp.RT_Temp,3) as RT_Temp',
        'round(sp.RT_Temp_Predict,3) as Speed_RT_Temp',
        'round(bt.RT_Temp_Predict,3) as Batch_RT_Temp',
        'round(({}*sp.RT_Temp_Predict + {}*bt.RT_Temp_Predict),3) as Wt_RT_Temp'
        .format(str(s_wt), str(b_wt)),
        'round(sp.Nu_Temp,3) as Nu_Temp',
        'round(sp.Nu_Temp_Predict,3) as Speed_Nu_Temp',
        'round(bt.Nu_Temp_Predict,3) as Batch_Nu_Temp',
        'round(({}*sp.Nu_Temp_Predict + {}*bt.Nu_Temp_Predict),3) as Wt_Nu_Temp'
        .format(str(s_wt), str(b_wt)),
        'round(sp.RMSE_Score,3) as Speed_RMSE',
        'round(bt.RMSE_Score,3) as Batch_RMSE'))
    df_final.show(5)
    # df = spark.sql("select * FROM default.turbine")
    df_final.write.saveAsTable(name='tsa.serving_predictions',
                               format='hive',
                               mode='append')
def verification(self, candDF, threshold):
    """
    Input: $candDF is the output DataFrame from the 'filtering' function.
           $threshold is a float value between (0, 1]

    Output: Return a new DataFrame $resultDF that represents the ER result.
            It has five columns: id1, joinKey1, id2, joinKey2, jaccard

    Comments: There are two differences between $candDF and $resultDF
              (1) $resultDF adds a new column, called jaccard, which stores the jaccard
                  similarity between $joinKey1 and $joinKey2
              (2) $resultDF removes the rows whose jaccard similarity is smaller than
                  $threshold
    """
    def get_jaccard_similarity(set_1, set_2):
        set_1 = set(set_1)
        set_2 = set(set_2)
        return len(set_1 & set_2) * 1.00 / len(set_1 | set_2) * 1.00

    calculate_jaccard = functions.udf(get_jaccard_similarity, types.DoubleType())
    candDF = candDF.withColumn(
        'jaccard', calculate_jaccard(candDF['joinKey1'], candDF['joinKey2']))
    candDF = candDF.filter(candDF.jaccard >= threshold)
    return candDF
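# A minimal sketch (assumed, not from the source) of the Jaccard computation used by
# verification(), shown on a toy pair of token lists.
def jaccard_example():
    key1 = ['apple', 'inc']
    key2 = ['apple', 'incorporated']
    return len(set(key1) & set(key2)) / len(set(key1) | set(key2))  # 1/3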
def _generate_select_expression_for_extended_string_to_double(source_column, name):
    """
    More robust conversion from StringType to DoubleType.
    Is able to additionally handle (compared to implicit Spark conversion):

    * Preceding whitespace
    * Trailing whitespace
    * Preceding and trailing whitespace
    * Underscores as thousand separators

    Hint
    ----
    Please have a look at the tests to get a better feeling for how it behaves under
    tests/unit/transformer/test_mapper_custom_data_types.py::TestExtendedStringConversions and
    tests/data/test_fixtures/mapper_custom_data_types_fixtures.py

    Example
    -------
    >>> from spooq.transformer import Mapper
    >>>
    >>> input_df.head(3)
    [Row(input_string=" 21474838464.70 "), Row(input_string="Hello"), Row(input_string="21_474_838_464.70")]
    >>> mapping = [("output_value", "input_string", "extended_string_to_double")]
    >>> output_df = Mapper(mapping).transform(input_df)
    >>> output_df.head(3)
    [Row(input_string=21474838464.7), Row(input_string=None), Row(input_string=21474838464.70)]
    """
    return F.regexp_replace(F.trim(source_column), "_", "").cast(T.DoubleType()).alias(name)
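# Illustrative usage only (not from the source): the generated select expression can be
# used directly in a select(); the column name "input_string" is a hypothetical
# placeholder.
def to_double_example(df):
    return df.select(
        _generate_select_expression_for_extended_string_to_double(
            F.col("input_string"), "output_value"))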
def test_fit_model_multiclass(self):
    model = create_mnist_model()
    optimizer = tf.keras.optimizers.Adadelta(1.0)
    loss = tf.keras.losses.categorical_crossentropy

    for num_cores in [2, constants.TOTAL_BUFFER_MEMORY_CAP_GIB + 1]:
        with spark_session('test_fit_model_multiclass', cores=num_cores) as spark:
            df = create_mnist_data(spark)

            with local_store() as store:
                keras_estimator = hvd.KerasEstimator(
                    num_proc=num_cores,
                    store=store,
                    model=model,
                    optimizer=optimizer,
                    loss=loss,
                    metrics=['accuracy'],
                    feature_cols=['features'],
                    label_cols=['label_vec'],
                    batch_size=2,
                    epochs=2,
                    verbose=2)

                keras_model = keras_estimator.fit(df).setOutputCols(['label_prob'])
                pred_df = keras_model.transform(df)

                argmax = udf(lambda v: float(np.argmax(v)),
                             returnType=T.DoubleType())
                pred_df = pred_df.withColumn('label_pred',
                                             argmax(pred_df.label_prob))
                preds = pred_df.collect()
                assert len(preds) == df.count()

                row = preds[0]
                label_prob = row.label_prob.toArray().tolist()
                assert label_prob[int(row.label_pred)] == max(label_prob)
def extract_embedding(spark, glove_model_path, output_folder):
    glove = Glove.load(glove_model_path)

    dictionary_schema = T.StructType([
        T.StructField('index', T.IntegerType(), True),
        T.StructField('standard_concept_id', T.IntegerType(), True)
    ])
    dictionary_df = spark.createDataFrame([
        Row(index=k, standard_concept_id=int(v))
        for k, v in glove.inverse_dictionary.items()
    ], dictionary_schema)

    vector_schema = T.StructType([
        T.StructField('index', T.IntegerType(), True),
        T.StructField('vector', T.ArrayType(T.DoubleType()), True)
    ])
    vector_df = spark.createDataFrame([
        Row(index=idx, vector=vector.tolist())
        for idx, vector in enumerate(glove.word_vectors)
    ], vector_schema)

    dictionary_df.join(vector_df, 'index').select(
        'standard_concept_id',
        'vector').write.mode('overwrite').parquet(output_folder)
def get_coefficients(
        split_urls_and_word_frequency_orders: DataFrame,
        s: float,
        additional_weight_function: Callable[[int], float] = lambda e: 1
) -> DataFrame:
    """
    :param split_urls_and_word_frequency_orders: A DataFrame of split URLs and word
        frequency orders with columns: id, url, split_url, word_frequency_orders.
    :param s: s parameter of the Zipf distribution.
    :param additional_weight_function: additional weight function applied on top of the
        Zipf weight for each position in the word vector.
    :return: A DataFrame of split URLs and the coefficient of each term with columns:
        id, url, split_url, coefficients
    """
    def calculate_coefficients(word_frequency_orders):
        coefficients = []
        for i in range(len(word_frequency_orders)):
            coefficients.append(
                additional_weight_function(i) *
                URLVectorCalculator.get_zipf_coefficient(
                    word_frequency_orders[i], s))
        return coefficients

    get_coefficients_udf = F.udf(calculate_coefficients,
                                 T.ArrayType(T.DoubleType()))
    split_urls_and_coefficients = split_urls_and_word_frequency_orders \
        .select("id", "url", "split_url",
                get_coefficients_udf("word_frequency_orders").alias("coefficients"))
    return split_urls_and_coefficients
def sum_word_vectors(urls_and_weighted_word_vectors: DataFrame) -> DataFrame:
    """
    Sums weighted word vectors and their corresponding coefficients for each URL.

    :param urls_and_weighted_word_vectors: A DataFrame of URLs and weighted word vectors
        with columns: id, url, pos, word, weighted_word_vector, coefficient.
    :return: A DataFrame of URLs and their corresponding sum of word vectors and sum of
        coefficients with columns: id, url, split_url, coefficients, summed_vectors,
        summed_coefficients.
    """
    word_array_sorter_udf = F.udf(
        URLVectorCalculator.sort_list_of_2_tuples_by_0th_item,
        T.ArrayType(T.StringType()))
    coefficient_array_sorter_udf = F.udf(
        URLVectorCalculator.sort_list_of_2_tuples_by_0th_item,
        T.ArrayType(T.DoubleType()))
    vector_size = len(
        urls_and_weighted_word_vectors.select('weighted_word_vector').first()[0])
    return urls_and_weighted_word_vectors \
        .groupBy("id", "url") \
        .agg(F.collect_list(F.struct("pos", "word")).alias("positions_and_words"),
             F.collect_list(F.struct("pos", "coefficient")).alias("positions_and_coefficients"),
             F.sum("coefficient").alias("summed_coefficients"),
             F.array(*[F.sum(F.col("weighted_word_vector")[i])
                       for i in range(vector_size)]).alias("summed_vectors")) \
        .select("id", "url", "summed_coefficients", "summed_vectors",
                word_array_sorter_udf("positions_and_words").alias("split_url"),
                coefficient_array_sorter_udf("positions_and_coefficients").alias("coefficients"))
def cond_pandas(pyData):
    groupby_columns = ['grp', 'subgrp']
    agg_columns = ['mean_of_C', 'max_of_D', 'cond_var_of_E', 'cond_var_of_E2']
    df = spark.createDataFrame(pyData)
    postAggSchema = DataTypes.StructType(
        [x for x in DataPointSchema.fields if x.name in groupby_columns] + [
            DataTypes.StructField(name, DataTypes.DoubleType(), False)
            for name in agg_columns
        ])

    # The GROUPED_MAP pandas UDF decorator is required for groupby().apply() below.
    @pandas_udf(postAggSchema, PandasUDFType.GROUPED_MAP)
    def inner_agg_method(dfPartition):
        group_key = dfPartition['grp'].iloc[0]
        subgroup_key = dfPartition['subgrp'].iloc[0]
        C = dfPartition['C']
        D = dfPartition['D']
        # Note: despite the name, this selects the E values that are negative (E < 0).
        posE = dfPartition[dfPartition.E < 0]['E']
        return pd.DataFrame([[
            group_key,
            subgroup_key,
            C.mean(),
            D.max(),
            posE.var(),
            posE
            .agg(lambda E:
                 ((E * E).sum() - E.sum()**2 / E.count()) / (E.count() - 1))
            .mean(),
        ]], columns=groupby_columns + agg_columns)

    aggregates = df \
        .groupby(df.grp, df.subgrp).apply(inner_agg_method) \
        .orderBy('grp', 'subgrp')
    return aggregates, None
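# A quick sanity check (assumed illustration, not from the source) that the explicit
# sample-variance formula used for cond_var_of_E2 matches pandas' Series.var().
import pandas as pd

def _variance_formula_check():
    E = pd.Series([-1.0, -2.5, -0.3])
    explicit = ((E * E).sum() - E.sum() ** 2 / E.count()) / (E.count() - 1)
    assert abs(explicit - E.var()) < 1e-12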
def calc(df):
    # Function to calculate the approximating function and its derivative.
    def foo(x, y):
        y_arr = np.array(y)
        gy = g(y_arr)
        gp = gprime(y_arr)
        x_arr = np.array(x)
        res = np.outer(gy, x_arr)
        return [res.flatten().tolist(), gp.tolist()]

    udf_foo = f.udf(foo, t.ArrayType(t.ArrayType(t.DoubleType())))

    df2 = df.withColumn("vals", udf_foo("features", "Y"))
    df2 = df2.select("id",
                     f.col("vals").getItem(0).alias("gy"),
                     f.col("vals").getItem(1).alias("gy_"))
    GY_ = np.array(
        df2.agg(f.array([f.sum(f.col("gy")[i])
                         for i in range(n_comp**2)])).collect()[0][0]
    ).reshape(n_comp, n_comp) / num_rows
    GY_AVG_V = np.array(
        df2.agg(f.array([f.avg(f.col("gy_")[i])
                         for i in range(n_comp)])).collect()[0][0]
    ).reshape(n_comp, 1) * V
    return GY_, GY_AVG_V
def get_spark():
    conf = SparkConf()

    # Load a jar that provides extended string comparison functions such as
    # Jaro-Winkler (used by Splink).
    # No longer needed in Spark 3.0?
    # conf.set("spark.driver.extraClassPath", "jars/scala-udf-similarity-0.0.7.jar")
    conf.set("spark.jars", "jars/scala-udf-similarity-0.0.7.jar")
    conf.set("spark.jars.packages",
             "graphframes:graphframes:0.8.0-spark3.0-s_2.12")

    # WARNING: these config options are appropriate only if you're running Spark locally!
    conf.set("spark.driver.memory", "4g")
    conf.set("spark.sql.shuffle.partitions", "8")

    sc = SparkContext.getOrCreate(conf=conf)
    sc.setCheckpointDir("temp_graphframes/")
    spark = SparkSession(sc)

    # Register UDFs
    from pyspark.sql import types

    spark.udf.registerJavaFunction(
        "jaro_winkler_sim",
        "uk.gov.moj.dash.linkage.JaroWinklerSimilarity",
        types.DoubleType(),
    )
    spark.udf.registerJavaFunction("Dmetaphone",
                                   "uk.gov.moj.dash.linkage.DoubleMetaphone",
                                   types.StringType())
    return spark
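# Illustrative usage only (not from the source): once get_spark() has registered the
# Java UDFs, they are available from Spark SQL expressions.
def demo_jaro_winkler():
    spark = get_spark()
    spark.sql("SELECT jaro_winkler_sim('Robert', 'Rupert') AS sim").show()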