def save_authors():
    connectionProperties = {
        # "customSchema": 'id INT, co_list STRING, last_page INT, saved INT',
        "customSchema": 'id LONG, full_name STRING, subject_areas STRING, document_count INT,cited_by_count INT,citations_count INT,h_index INT,coauthors_count INT,affiliation_current STRING,cat STRING,country STRING,docs_fetched INT,last_page INT,is_sample INT,citations INT',
        "driver": 'org.sqlite.JDBC'
    }
    df = sqlContext.read.jdbc(url='jdbc:sqlite:../iconic.db', table='author',
                              properties=connectionProperties)

    full_name = F.concat_ws(' ',
                            F.get_json_object(df.full_name, '$[*].surname'),
                            F.get_json_object(df.full_name, '$[*].given-name'))
    country = F.get_json_object(df.affiliation_current, '$.affiliation-country')
    city = F.get_json_object(df.affiliation_current, '$.affiliation-city')
    university = F.get_json_object(df.affiliation_current, '$.affiliation-name')
    country_name = F.udf(lambda c: get_country_name(c), StringType())

    df = df.withColumn('full_name', full_name)
    df = df.withColumn('country', country_name(country))
    df = df.withColumn('city', city)
    df = df.withColumn('university', university)
    df = df.drop('document_count', 'h_index', 'citations_count', 'coauthors_count',
                 'affiliation_current', 'docs_fetched', 'last_page', 'is_sample', 'citations')
    df.write.mode('overwrite').parquet(os.path.abspath('./data/authors'))
def load_evaluation(events, start_date, end_date):
    content_items = events \
        .filter(events['event'] == 'ContentItem_Received'.lower()) \
        .filter(events['year'] >= start_date.year).filter(events['year'] <= end_date.year) \
        .filter(events['time'] >= start_date).filter(events['time'] <= end_date) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['IsTest']".lower()) == 'false') \
        .filter(events['MessageId'.lower()].isNotNull()) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['ModelId']".lower()) == 'WaterfallTopRank'.lower()) \
        .drop_duplicates(subset=['MessageId']) \
        .select(
            events['time'],
            events['CarrierName'.lower()].alias('carrier_name'),
            events['DaysFromEulaApproval'.lower()].astype('float').alias('days_from_eula'),
            events['ProgramName'.lower()].alias('program_name'),
            events['MessageId'.lower()].alias('message_id'),
            events['MessageType'.lower()].alias('message_type'),
            events['DeviceVendor'.lower()].alias('device_vendor'),
            F.get_json_object(
                events['properties'],
                "$['properties']['InstallationSource']".lower()).alias('installation_source'),
            F.get_json_object(
                events['properties'],
                "$['properties']['ModelRecommendationProbability']".lower()
            ).cast('float').alias('waterfall_p')
        ) \
        .fillna(value='unset')  # FIXME: empty string value
    print('received notification loaded')
    return content_items
def transform_stream(kinesis_stream):
    kinesis_stream \
        .selectExpr("CAST(sequenceNumber AS STRING)", "CAST(data AS STRING)") \
        .withColumn("value", col("data").cast(StringType())) \
        .withColumn("measure_id", col("sequenceNumber").cast(StringType())) \
        .select(
            col("measure_id"),
            get_json_object("value", "$.sensorId").alias("sensor_id"),
            get_json_object("value", "$.currentTemperature").alias("current_temperature"),
            get_json_object("value", "$.status").alias("status")
        ) \
        .writeStream \
        .foreach(process_row) \
        .start()
        # Structured Streaming's foreach sink takes a row-level function directly;
        # the original wrapped it in a per-RDD lambda (DStream style), which would
        # fail at runtime because each element passed in is a Row, not an RDD.
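# A hypothetical `process_row` handler for the foreach sink in transform_stream above
# (the real handler is defined elsewhere in that project, so this is only a sketch).
# Structured Streaming's DataStreamWriter.foreach() calls the function once per output Row:
def process_row(row):
    # Each row carries the columns selected above; printing is purely illustrative.
    print(row["measure_id"], row["sensor_id"], row["current_temperature"], row["status"])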
def load_time_on_page(events, start_date, end_date):
    time_on_page = events \
        .filter(events['event'] == 'ContentItem_TimeOnPage'.lower()) \
        .filter(events['year'] >= start_date.year).filter(events['year'] <= end_date.year) \
        .filter(events['time'] >= start_date).filter(events['time'] <= end_date) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['IsTest']".lower()) == 'false') \
        .filter(events['MessageId'.lower()].isNotNull()) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['TimeOnPage']".lower()).isNotNull()) \
        .groupBy(F.col('MessageId'.lower()).alias('message_id')) \
        .agg(F.max(F.get_json_object(
            events['properties'],
            "$['properties']['TimeOnPage']".lower()).astype('float')
        ).alias('time_on_page'))
    print('TimeOnPage events loaded')
    return time_on_page
def filter_raw_data(df):
    """Extract only relevant columns of data we require for topic modelling.

    NOTE: The unix timestamp from MongoDB is divided by 1000 here because of
    the extra 3 zeros at the end (we don't need milliseconds).
    """
    dataDF = df.select('_id', 'publishedAt', 'outlet', 'url', 'title', 'body',
                       'peopleFemaleCount', 'peopleMaleCount',
                       'sourcesFemaleCount', 'sourcesMaleCount') \
        .withColumn('id', get_ids(f.col('_id'))) \
        .withColumn('unix_timestamp', f.get_json_object(df.publishedAt, "$.$date") / 1000) \
        .withColumn('string_timestamp', f.from_unixtime(f.col('unix_timestamp'))) \
        .withColumn('timestamp', f.col('string_timestamp').cast(t.TimestampType())) \
        .drop('_id', 'publishedAt', 'unix_timestamp', 'string_timestamp')
    return dataDF
def kafkaCast(stream: DataFrame, args: dict) -> DataFrame:
    """Return a new dataframe with normalized attrs from `value`.

    Args:
        stream: streaming DataFrame whose string `value` column holds the JSON payload.
        args: additional options (not used in this snippet).

    Returns:
        DataFrame: stream with `uuid`, `capability`, `timestamp` and `value` extracted.
    """
    # Note: the input/output are streaming DataFrames rather than DataStreamReader
    # objects, since only DataFrames expose .value and .select().
    json_objects = []
    for u in ["uuid", "capability", "timestamp", "value"]:
        json_objects.append(get_json_object(stream.value, '$.' + u).alias(u))
    return stream.select(json_objects)
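# Usage sketch for kafkaCast, assuming a SparkSession `spark`; the broker address and
# topic name below are illustrative placeholders, not taken from the original code:
raw = (spark.readStream.format("kafka")
       .option("kafka.bootstrap.servers", "localhost:9092")
       .option("subscribe", "sensors")
       .load()
       .selectExpr("CAST(value AS STRING) AS value"))
normalized = kafkaCast(raw, args={})  # columns: uuid, capability, timestamp, value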
def transform_data_with_udf(clickstream_data, purchase_data):
    window1 = Window.partitionBy('userId').orderBy('eventTime')
    window2 = Window.orderBy('sessionId')
    clickstream_data = (
        clickstream_data
        .withColumn('appOpenFlag', app_open_flag_udf(clickstream_data['eventType']))
        .withColumn('sessionId', sum(col('appOpenFlag')).over(window1))
        .withColumn('attr', attributes_udf(clickstream_data['eventType'],
                                           clickstream_data['attributes']))
        .withColumn('campaign_id',
                    when(get_json_object('attr', '$.campaign_id').isNotNull(),
                         get_json_object('attr', '$.campaign_id')).otherwise(None))
        .withColumn('channel_id',
                    when(get_json_object('attr', '$.channel_id').isNotNull(),
                         get_json_object('attr', '$.channel_id')).otherwise(None))
        .withColumn('purchase_id',
                    when(get_json_object('attr', '$.purchase_id').isNotNull(),
                         get_json_object('attr', '$.purchase_id')).otherwise(None))
        .withColumn('campaignId',
                    last(col('campaign_id'), ignorenulls=True).over(
                        window2.rowsBetween(Window.unboundedPreceding, 0)))
        .withColumn('channelId',
                    last(col('channel_id'), ignorenulls=True).over(
                        window2.rowsBetween(Window.unboundedPreceding, 0)))
    )
    target_df = clickstream_data.join(
        purchase_data,
        clickstream_data['purchase_id'] == purchase_data['purchaseId'],
        JOIN_TYPE.LEFT)
    return target_df.select(col('purchaseId'), col('purchaseTime'), col('billingCost'),
                            col('isConfirmed'), col('sessionId'), col('campaignId'),
                            col('channelId'))
def load_received_notifications(events, start_date, end_date):
    # datetime objects are serializable only from spark 2.2.1
    content_items = events \
        .filter(events['event'] == 'ContentItem_Received'.lower()) \
        .filter(events['year'] >= start_date.year).filter(events['year'] <= end_date.year) \
        .filter(events['time'] >= start_date).filter(events['time'] <= end_date) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['IsTest']".lower()) == 'false') \
        .filter(events['MessageId'.lower()].isNotNull()) \
        .drop_duplicates(subset=['MessageId']) \
        .select(
            events['time'],
            events['CarrierName'.lower()].alias('carrier_name'),
            events['DaysFromEulaApproval'.lower()].astype('float').alias('days_from_eula'),
            events['MessageId'.lower()].alias('message_id'),
            events['MessageType'.lower()].alias('message_type'),
            events['DeviceVendor'.lower()].alias('device_vendor'),
        )
    print('received notification loaded')
    return content_items
def save_collaborations():
    auths_struct = StructType([StructField("list", ArrayType(StringType()))])
    connectionProperties = {
        "customSchema": 'abs_id LONG, authors STRING, published STRING, cited_by INT, keywords STRING, coll_count INT, h_index INT, message STRING, saved INT',
        "driver": 'org.sqlite.JDBC'
    }
    df = sqlContext.read.jdbc(url='jdbc:sqlite:../iconic.db', table='collaboration',
                              properties=connectionProperties)

    df = df.withColumn('authors', F.get_json_object(df.authors, '$[*].authid'))
    map_auths = F.udf(lambda s: '{{"list": {}}}'.format(s), StringType())
    df = df.withColumn('auths', map_auths(df.authors))
    df = df.withColumn('authors', F.from_json(df.auths, auths_struct))
    df = df.withColumn("authors_size", F.size(F.col('authors.list')))

    new_df = df.select('abs_id', 'published', 'cited_by', 'keywords',
                       F.explode(df.authors.list).alias('author_id'))
    new_df.write.parquet(os.path.abspath('./data/collaborations'))
def spark_process(collection):
    start_timer = time.perf_counter()
    adg_hive_select_query = "select * from %s" % collection.staging_hive_table
    df = get_dataframe_from_staging(adg_hive_select_query)
    raw_df = df.select(
        df.data,
        F.get_json_object(df.data, "$.message.encryption.encryptedEncryptionKey").alias("encryptedKey"),
        F.get_json_object(df.data, "$.message.encryption.keyEncryptionKeyId").alias("keyEncryptionKeyId"),
        F.get_json_object(df.data, "$.message.encryption.initialisationVector").alias("iv"),
        # The line below is commented out and worked around because it truncates the db object
        # F.get_json_object(df.data, "$.message.dbObject").alias("dbObject"),
        F.get_json_object(df.data, "$.message.db").alias("db_name"),
        F.get_json_object(df.data, "$.message.collection").alias("collection_name"),
        F.get_json_object(df.data, "$.message._id").alias("id"))
    key_df = raw_df.withColumn(
        "key", get_plain_key(raw_df["encryptedKey"], raw_df["keyEncryptionKeyId"]))
    decrypted_df = key_df.withColumn(
        "decrypted_db_object", decryption(key_df["key"], key_df["iv"], key_df["data"]))
    validated_df = decrypted_df.withColumn(
        "validated_db_object", validation(decrypted_df["decrypted_db_object"]))
    sanitised_df = validated_df.withColumn(
        "sanitised_db_object",
        sanitise(validated_df["validated_db_object"], validated_df["db_name"],
                 validated_df["collection_name"]))
    clean_df = sanitised_df.withColumnRenamed("sanitised_db_object", "val")
    values = clean_df.select("val")
    parquet_location = persist_parquet(collection.collection_name, values)
    prefix = ("${file_location}/" + collection.collection_name + "/" +
              collection.collection_name + ".parquet")
    tag_objects(prefix, tag_value=collection.tag_value)
    create_hive_on_published(parquet_location, collection.collection_name)
    end_timer = time.perf_counter()
    time_taken = round(end_timer - start_timer)
    time_taken = str(datetime.timedelta(seconds=time_taken))
    the_logger.info(
        f"time taken for {collection.collection_name}: {time_taken}")
# COMMAND ----------

from pyspark.sql.functions import col, create_map

df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .selectExpr("explode(complex_map)").show(2)


# COMMAND ----------

jsonDF = spark.range(1).selectExpr("""
  '{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")


# COMMAND ----------

from pyspark.sql.functions import get_json_object, json_tuple

jsonDF.select(
    get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias("column"),
    json_tuple(col("jsonString"), "myJSONKey")).show(2)


# COMMAND ----------

from pyspark.sql.functions import to_json
df.selectExpr("(InvoiceNo, Description) as myStruct")\
  .select(to_json(col("myStruct")))


# COMMAND ----------

from pyspark.sql.functions import from_json
from pyspark.sql.types import *
parseSchema = StructType((
  StructField("InvoiceNo", StringType(), True),
  StructField("Description", StringType(), True)))
    .getOrCreate()

spark.conf.set("spark.sql.shuffle.partitions", 2)
spark.sparkContext.setLogLevel("ERROR")

msft_df = spark\
    .readStream\
    .format("kafka")\
    .option("kafka.bootstrap.servers", "localhost:9092")\
    .option("subscribe", "capstone14")\
    .load()

msft_df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

msft_df2 = msft_df.select(
    func.get_json_object(func.col("value").cast("string"), '$[*].open').alias('open'),
    func.get_json_object(func.col("value").cast("string"), '$[*].high').alias('high'),
    func.get_json_object(func.col("value").cast("string"), '$[*].low').alias('low'),
    func.get_json_object(func.col("value").cast("string"), '$[*].close').alias('close'),
    func.get_json_object(func.col("value").cast("string"), '$[*].volume').alias('volume'),
    func.get_json_object(func.col("value").cast("string"), '$[*].date').alias('date'))

msft_df2.printSchema()

msft_df3 = msft_df2 \
    .writeStream \
config("spark.ui.port", "0"). \ config('spark.cassandra.connection.host', '127.0.0.1'). \ getOrCreate() clicks_df = spark. \ read. \ format("org.apache.spark.sql.cassandra"). \ options(table = "clickevents", keyspace = "webanalytics"). \ load() clicks_df.show(truncate=False) from pyspark.sql.functions import col, get_json_object clicks_df = clicks_df. \ withColumn("x", get_json_object(col("value"), "$.x").cast("long")). \ withColumn("y", get_json_object(col("value"), "$.y").cast("long")). \ drop("value") clicks_df.printSchema() pandas_df = clicks_df.toPandas() import plotly.express as px fig = px.scatter(x=pandas_df["x"], y=pandas_df["y"]) fig.update_yaxes(autorange="reversed") from PIL import Image img = Image.open(
df = get_router_data(source_schema, last_loaded_date, max_createdmsec)
cdlhosts_data_cnt = df.count()
print("Incremental Data to be loaded to TDM Table =" + str(cdlhosts_data_cnt) + " records")

if cdlhosts_data_cnt > 0:
    df1 = df.withColumn('eventdate', regexp_replace(col("datum"), "-", ""))
    hst = df1.filter("host.interfacetype = 'WIFI'")\
        .select(col("eventid").alias("id"),
                col("assetid").alias("networkdeviceid"),
                col("host.hostmac").alias("MacAddress"),
                col("host.hostname").alias("HostName"),
                "eventtime",
                "eventdate",
                f.get_json_object("host.params", "$.opaquedata.band").alias("band"))
    dup = hst.dropDuplicates()
    hst_cnt = dup.count()
    print("Total source count is : " + str(hst_cnt))

####### cdl_acscoll_wifiparam_prq table data ########
Wifi_params_max_createdmsec = get_last_loadValue("audit_value", audit_schema,
                                                 WifiParams_source_table, tdm_table)
print("Last loaded data has Wifi_params_max_createdmsec = " + str(Wifi_params_max_createdmsec))
df_transactions = df_transactions.join(
    df_cotizaciones, df_transactions['dt'] == df_cotizaciones['fecha'], 'inner')
df_transactions = df_transactions.withColumn(
    'amount_usd', F.col("amount") / F.col("venta_uala"))
df_transactions = df_transactions.join(
    df_authorizations,
    df_transactions["authorization_id"] == df_authorizations["authorization_id"],
    "left")
df_transactions = df_transactions.select(
    "dt", "account_from", "amount_usd", "transaction_type",
    F.get_json_object(df_transactions.metadata, '$.mcc').alias('mcc'))
# 'yyyy-MM' (calendar year) rather than 'YYYY-MM', which is the week-based year pattern
df_transactions = df_transactions.withColumn(
    'year_month', F.date_format(df_transactions.dt, 'yyyy-MM'))
df_transactions = df_transactions.drop("dt")
df_transactions = df_transactions.join(
    df_param_mcc, df_transactions["mcc"] == df_param_mcc["mcc_param"], "left")  # .na.fill('otros')
# fillna returns a new DataFrame, so the result must be reassigned
df_transactions = df_transactions.fillna('otros', subset=['category'])

##### BUILD PURCHASES BY CATEGORY USING MCC #######
df_transactions = df_transactions.withColumn(
    'new_transaction_type',
def extractJsonFromString(df, attrs):
    json_objects = []
    for u in attrs:
        json_objects.append(get_json_object(df.value, '$.' + u).alias(u))
    return json_objects
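# Usage sketch for extractJsonFromString, assuming `df` has a JSON string column named
# `value` and get_json_object is imported from pyspark.sql.functions; the attribute names
# below are illustrative only. The helper returns Column expressions, so splice it into select():
parsed = df.select(*extractJsonFromString(df, ["uuid", "capability", "timestamp"]))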
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, get_json_object, json_tuple

spark = SparkSession.builder.appName("Pyspark example").getOrCreate()

jsonDF = spark.range(1).selectExpr("""
    '{"key": {"value":[1,2,3]}}' as jsonString
""")

jsonDF.select(
    get_json_object(col("jsonString"), "$.key.value[0]").alias("column"),
    json_tuple(col("jsonString"), "key")).show(2)
spark = SparkSession \
    .builder \
    .appName("Kafka2Files") \
    .getOrCreate()

schema = StructType()\
    .add("field1", IntegerType())\
    .add("field2", StringType())\
    .add("field3", StringType())

lines = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", brokers) \
    .option("subscribe", topic) \
    .load()\
    .select(get_json_object(col("value").cast("string"), "$.field1").alias('user_id'),
            get_json_object(col("value").cast("string"), "$.field2").alias('type'))\
    .where(col("type").startswith("#"))

query = lines \
    .writeStream \
    .format("csv") \
    .option("format", "append") \
    .option("path", save2file) \
    .option("checkpointLocation", path2Checkpoint) \
    .outputMode("append") \
    .start()

query.awaitTermination()
## Read in data by pointing to its table name in the Glue Data Catalog
schema = StructType() \
    .add('source', StringType()) \
    .add('type', StringType()) \
    .add('data', StringType()) \
    .add('ts', StringType())

src = spark.read.load('s3://demoanalyticsapp-output/*/*/*/*/',
                      format='parquet', schema=schema)

## Build out some new columns with data extracted from the JSON string
df = src \
    .withColumn('n_ts', F.unix_timestamp('ts', "yyyy-MM-dd'T'hh:mm:ss").cast('timestamp')) \
    .withColumn('agerange',
                F.from_json(
                    F.get_json_object('data', '$.facedetails[*].agerange'),
                    StructType().add('high', IntegerType()).add('low', IntegerType())
                )) \
    .withColumn('smile',
                F.from_json(
                    F.get_json_object('data', '$.facedetails[*].smile'),
                    StructType().add('confidence', DoubleType()).add('value', BooleanType())
                )) \
    .withColumn('eyeglasses',
                F.from_json(
                    F.get_json_object('data', '$.facedetails[*].eyeglasses'),
                    StructType().add('confidence', DoubleType()).add('value', BooleanType())
                )) \
"Append").format("console").option( "checkpointLocation", "./checkpoint/").start()) # output to console with json parse schema = (StructType().add("timestamp", TimestampType()).add( "open", IntegerType()).add("high", IntegerType()).add( "low", IntegerType()).add("close", IntegerType()).add("volume", IntegerType())) df.select( df["key"].cast("string"), from_json( df["value"].cast("string"), schema)).writeStream.outputMode("Append").format("console").start() # output to csv ds = (df.selectExpr( df["key"].cast("string"), from_json(df["value"].cast("string"), schema)).writeStream.outputMode("Append").format("csv").option( "path", "./csv_output/").option("checkpointLocation", "./checkpoint/").start()) # output with window function ds = (df.select( get_json_object(df.value.cast("string"), "$.timestamp").cast("timestamp").alias("timestamp"), get_json_object(df.value.cast("string"), "$.close").cast("float").alias("close"), ).groupby(window("timestamp", "5 seconds")).avg( "close").writeStream.outputMode("Update").format("console").start())
def transform_data(clickstream_data, user_purchases_data):
    window1 = Window.partitionBy('userId').orderBy('eventTime')
    window2 = Window.orderBy('sessionId')
    appOpenFlagDefaultVal = 0
    clickstream_data = (
        clickstream_data
        .withColumn('appOpenFlag',
                    when(col('eventType') == Event_Type.OPEN,
                         monotonically_increasing_id()).otherwise(appOpenFlagDefaultVal))
        .withColumn('sessionId', sum(col('appOpenFlag')).over(window1))
        .withColumn('attrs',
                    when(col('eventType') == Event_Type.PURCHASE,
                         clickstream_data['attributes'].substr(
                             lit(2), length('attributes') - lit(2)))
                    .otherwise(col('attributes')))
        .withColumn('attr',
                    when(col('eventType') == Event_Type.PURCHASE,
                         regexp_replace(col('attrs'), '""', "'"))
                    .otherwise(col('attrs')))
        .withColumn('campaign_id',
                    when(get_json_object('attr', '$.campaign_id').isNotNull(),
                         get_json_object('attr', '$.campaign_id')).otherwise(None))
        .withColumn('channel_id',
                    when(get_json_object('attr', '$.channel_id').isNotNull(),
                         get_json_object('attr', '$.channel_id')).otherwise(None))
        .withColumn('purchase_id',
                    when(get_json_object('attr', '$.purchase_id').isNotNull(),
                         get_json_object('attr', '$.purchase_id')).otherwise(None))
        .withColumn('campaignId',
                    last(col('campaign_id'), ignorenulls=True).over(
                        window2.rowsBetween(Window.unboundedPreceding, 0)))
        .withColumn('channelId',
                    last(col('channel_id'), ignorenulls=True).over(
                        window2.rowsBetween(Window.unboundedPreceding, 0)))
    )
    target_df = clickstream_data.join(
        user_purchases_data,
        clickstream_data['purchase_id'] == user_purchases_data['purchaseId'],
        JOIN_TYPE.LEFT)
    return target_df.select(col('purchaseId'), col('purchaseTime'), col('billingCost'),
                            col('isConfirmed'), col('sessionId'), col('campaignId'),
                            col('channelId'))
def get_sentiment(text):
    from textblob import TextBlob
    try:
        tweet = TextBlob(text)
        return tweet.sentiment.polarity
    except:
        return None

# Define your function
getSentiment = UserDefinedFunction(get_sentiment, StringType())

# Apply the UDF using withColumn
outDF = (kinesisDF
    .selectExpr("cast(data as string)")
    .withColumn('id', get_json_object(col("data"), "$[0].id"))
    .withColumn('ts', get_json_object(col("data"), "$[0].ts"))
    .withColumn('tweet', get_json_object(col("data"), "$[0].tweet"))
    .withColumn('tag', get_json_object(col("data"), "$[0].tag"))
    .withColumn('sentiment', getSentiment(col("tweet")).cast(FloatType()))
    .withColumn('datetime', to_timestamp(regexp_extract(
        col('ts'), '\\w\\w\\w \\d\\d? (\\d+):(\\d+):(\\d+)', 0), "MMM dd HH:mm:ss"))
    .select(col('sentiment'), col('datetime'), col('tag'), col('tweet'))
    .withColumn('sentiment_cnt', col("sentiment"))
    .withColumn('tweet_dup', col("tweet"))
    .groupBy(col("tag"), window(col('datetime'), "5 seconds").alias("timewindow"))
    .agg({'sentiment_cnt': 'count', 'sentiment': 'avg', 'tweet': 'first', 'tweet_dup': 'last'})
    .withColumnRenamed("count(sentiment_cnt)", "notweets")
    .withColumnRenamed("avg(sentiment)", "avgsentiment")
    .withColumnRenamed('first(tweet)', "tweet1")
    .withColumnRenamed('last(tweet_dup)', "tweet2")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--output_path',
        help='output path in s3 including bucket without file name')
    args = parser.parse_args()
    if args.output_path:
        output_path = args.output_path
    else:
        raise ValueError('missing argument - output_path')

    model_name = 'photography'
    spark_config = SparkConf().setAppName(model_name)
    dependencies = get_dependencies()
    spark_context = SparkContext(conf=spark_config, pyFiles=dependencies)
    hive_context = HiveContext(spark_context)

    is_photo_related = F.udf(
        lambda s: True if ('camera' in s) or ('video' in s) else False,
        types.BooleanType())
    get_event_score = F.udf(score_single_event, types.FloatType())
    # received notifications are at least as many as viewed ones
    fix_received = F.udf(lambda received, view: max(received, view),
                         types.FloatType())

    # TODO: switch to l1 home_events_uuid
    events = hive_context.table('l2_sprint.mixpanel_home')

    # choose photography related content interactions from notifications
    # (devicefeatures_attribute exists only for notification items, not future cards)
    # relevant content started approximately '2017-10-31'
    content_items = events \
        .filter(events['event'].isin(
            [x.lower() for x in ['ContentItem_Received', 'ContentItem_View',
                                 'ContentItem_Click', 'ContentItem_TimeOnPage',
                                 'ContentItem_PageScroll']])) \
        .filter(events['Time'.lower()] > '2017-10-31') \
        .filter(events['CarrierName'.lower()].isin('sprint', 'verizon')) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['IsTest']".lower()) == 'false') \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['DeviceId']".lower()).isNotNull()) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['MessageType']".lower()).isNotNull()) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['devicefeatures_attribute']").isNotNull()) \
        .filter(is_photo_related(F.get_json_object(
            events['properties'], "$['properties']['devicefeatures_attribute']")))

    # assign a score to each interaction
    content_items = content_items \
        .withColumn(
            'score',
            get_event_score(
                events['event'],
                F.get_json_object(events['properties'], "$['properties']"))
        )

    # aggregate score per user, item, event, action (action to differentiate clicks).
    # use max on score because page scroll, for example, sends intermediate states.
    # use max on properties in case it's null or empty string in one of the events
    content_items = content_items \
        .groupBy(
            F.get_json_object(
                events['properties'],
                "$['properties']['DeviceId']".lower()).alias('device_id'),
            events['event'],
            F.get_json_object(
                events['properties'],
                "$['properties']['MessageType']".lower()).alias('topic'),
            F.get_json_object(
                events['properties'],
                "$['properties']['ActionId']".lower()).alias('action')
        ) \
        .agg(
            F.max(F.get_json_object(
                events['properties'],
                "$['properties']['AtlasUniqueUserId']".lower())).alias('user_id'),
            F.max('CarrierName'.lower()).alias('carrier_name'),
            F.max('DeviceModel'.lower()).alias('device_model'),
            F.max('DeviceModelName'.lower()).alias('device_model_name'),
            F.max('DeviceOsType'.lower()).alias('device_os_type'),
            F.max('DeviceVendor'.lower()).alias('device_vendor'),
            F.max('score').alias('score')
        )

    # FIXME fix view according action events
    received_content_items = content_items \
        .groupBy('device_id') \
        .pivot('event', ['ContentItem_Received'.lower(), 'ContentItem_View'.lower()]) \
        .sum('score') \
        .fillna(0.0) \
        .select(
            'device_id',
            fix_received(F.col('contentitem_received'),
                         F.col('contentitem_view')).alias('receive'))

    # calculate the final score per user
    content_items = content_items \
        .filter(events['event'] != 'ContentItem_Received'.lower()) \
        .groupBy('device_id') \
        .agg(
            F.max('user_id').alias('user_id'),
            F.max('carrier_name').alias('carrier_name'),
            F.max('device_model').alias('device_model'),
            F.max('device_model_name').alias('device_model_name'),
            F.max('device_os_type').alias('device_os_type'),
            F.max('device_vendor').alias('device_vendor'),
            F.sum('score').alias('total_score')
        ) \
        .join(received_content_items, 'device_id', 'left') \
        .withColumn('score', F.round(F.col('total_score') / F.col('receive'))) \
        .drop('total_score', 'receive') \
        .withColumn('photography_interest', F.lit(None))

    # choose users who completed the user interest questionnaire
    interests = events \
        .filter(events['event'] == 'Timeline_OnboardingMessage_Click'.lower()) \
        .filter(events['CarrierName'.lower()].isin('sprint', 'verizon')) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['IsTest']".lower()) == 'false') \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['DeviceId']".lower()).isNotNull()) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['ActionId']".lower()) == 'done')

    # assign a score for photography interest
    interests = interests \
        .withColumn(
            'score',
            get_event_score(
                events['event'],
                F.get_json_object(events['properties'], "$['properties']"))
        )

    # subset relevant properties and drop duplicated devices
    # (assuming each user answers the questionnaire only once)
    interests = interests \
        .select(
            F.get_json_object(
                events['properties'],
                "$['properties']['DeviceId']".lower()).alias('device_id'),
            F.get_json_object(
                events['properties'],
                "$['properties']['AtlasUniqueUserId']".lower()).alias('user_id'),
            events['CarrierName'.lower()].alias('carrier_name'),
            events['DeviceModel'.lower()].alias('device_model'),
            events['DeviceModelName'.lower()].alias('device_model_name'),
            events['DeviceOsType'.lower()].alias('device_os_type'),
            events['DeviceVendor'.lower()].alias('device_vendor'),
            'score'
        ) \
        .drop_duplicates(['device_id']) \
        .withColumn('photography_interest',
                    F.when(F.col('score') > 0, 1.0).otherwise(0.0))

    # aggregate content and interest scores
    # use max on properties in case it's null or empty string in one of the events
    photography_user = content_items.union(interests) \
        .groupBy('device_id') \
        .agg(
            F.max('user_id').alias('user_id'),
            F.max('carrier_name').alias('carrier_name'),
            F.max('device_model').alias('device_model'),
            F.max('device_model_name').alias('device_model_name'),
            F.max('device_os_type').alias('device_os_type'),
            F.max('device_vendor').alias('device_vendor'),
            F.sum('score').alias('score'),
            F.max('photography_interest').alias('photography_interest')
        )

    dgx = hive_context.table('l2_asurion.demographics_dbo_source_dgx')
    mobileid = hive_context.table('l3_sprint.mobileid')

    # FIXME: decrypt ethnicityrollup, dob, ethnicity
    photography_user_augmented = photography_user \
        .join(mobileid.select('mobileuid', 'subid'),
              photography_user['user_id'] == mobileid['mobileuid'], 'left') \
        .join(dgx.select('source_dfx_id', 'nameprefix', 'state', 'age_range',
                         'income_range_vds', 'gender', 'marital_status', 'dwelling_type',
                         'home_ownership', 'length_of_residence', 'presence_of_children',
                         'mail_public_responder_indicator', 'mail_responsive_buyer_indicator',
                         'home_value_range', 'networthindicator_rollup', 'wealth_decile',
                         'homeandlandvalue', 'first_mortgage_amount', 'level_of_education',
                         'head_of_household', 'professionalrollup', 'premover',
                         'active_fitness_interest', 'golf_interest', 'traveler',
                         'green_advocate'),
              mobileid['subid'] == dgx['source_dfx_id'], 'left')

    apps = hive_context.read.parquet(APPS_PATH)
    photography_user_augmented = photography_user_augmented \
        .join(apps, photography_user_augmented['device_id'] == apps['deviceId'], 'left')

    photography_user_augmented.write.csv('s3://' + output_path, mode='overwrite',
                                         compression='gzip', header=True)
    .master('spark://master:7077') \
    .getOrCreate()

# Get the json-formatted data from the Kafka stream
kafka_movies = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "147.46.216.122:9092") \
    .option("subscribe", "movie") \
    .load()

# Define the relational schema
schema = StructType().add("title", StringType()).add("genre", StringType()).add("year", LongType())

# Change the JSON events into relational tuples
relational_movies = kafka_movies.select(
    [get_json_object(col("value").cast("string"), "$.{}".format(c)).alias(c)
     for c in ["title", "genre", "year"]])

# Change the type of year from string to integer
relational_movies = relational_movies.select(
    col("title"), col("genre"),
    relational_movies.year.cast('integer').alias('year'))

# Select the movie titles with year < 2000
results = relational_movies.select("title").where("year < 2000")

query = results \
    .writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

query.awaitTermination(60)
.appName("ReadFromKafkaStoreToMysql") \ .getOrCreate() # readStream for reading data from kafka topic test df = spark \ .readStream \ .format("kafka") \ .option("kafka.bootstrap.servers", "localhost:9094") \ .option("subscribe", "test") \ .load() # Cast Value of dataframe in string and aliasing it as json data = df.selectExpr("CAST(value AS STRING) as json") # Create schema for streaming json objects new_data = data.withColumn("id",F.get_json_object("json",'$.id'))\ .withColumn("Product",F.get_json_object("json",'$.Product')) \ .withColumn("Price",F.get_json_object("json",'$.Price')) \ .withColumn("Payment_Type",F.get_json_object("json","$.Payment_Type")) \ .withColumn("Name",F.get_json_object("json","$.Name")).drop("json","$.json") # Function to write eachbatch of kafka to mysql def writeToSQL(df, epochId): df.write.format("jdbc")\ .option("url","jdbc:mysql://localhost:3306/praveen")\ .option("driver","com.mysql.cj.jdbc.Driver")\ .option("dbtable","product")\ .option("user","root")\ .option("password","root")\ .mode("append")\
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, struct, split, explode, expr, create_map, get_json_object, json_tuple

if __name__ == '__main__':
    spark = SparkSession.builder.appName("learning").master("local").getOrCreate()

    df = spark.range(1).selectExpr("""
        '{"myJsonKey": {"myJsonValue": [1, 2, 3]}}' as jsonString
    """)
    df.select(get_json_object(col('jsonString'), "$.myJsonKey.myJsonValue[0]")).show()
    df.select(json_tuple(col('jsonString'), "myJsonKey")).show()
from pyspark.sql.functions import col, create_map

df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .selectExpr("explode(complex_map)").show(2)


# COMMAND ----------

jsonDF = spark.range(1).selectExpr("""
  '{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")


# COMMAND ----------

from pyspark.sql.functions import get_json_object, json_tuple

jsonDF.select(
    get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias("column"),
    json_tuple(col("jsonString"), "myJSONKey")).show(2)


# COMMAND ----------

from pyspark.sql.functions import to_json
df.selectExpr("(InvoiceNo, Description) as myStruct")\
  .select(to_json(col("myStruct")))


# COMMAND ----------

from pyspark.sql.functions import from_json
from pyspark.sql.types import *
parseSchema = StructType((
def transform_data(clickstream_df: DataFrame, purchases_df: DataFrame,
                   logger: Log4jWrapper) -> DataFrame:
    """Build a purchases attribution dataset from mobile app clickstream and user purchases data.

    :param clickstream_df: mobile app clickstream events.
    :param purchases_df: user purchases.
    :param logger: logger wrapper.
    :return: transformed DataFrame.
    """
    user_window = Window.partitionBy("userId").orderBy("eventTime")

    logger.warning("build leading events dataset ...")
    leading_events_df = (
        clickstream_df
        .filter(F.col("eventType").isin({"app_open", "app_close"}))
        .withColumn("nextEventId", F.lead(F.col("eventId"), 1).over(user_window))
        .select(F.col("userId"),
                F.col("eventTime").alias("startSessionEventTime"),
                F.col("eventType").alias("startSessionEventType"),
                F.col("attributes"),
                F.col("nextEventId"))
        .filter(F.col("startSessionEventType") == "app_open")
        .withColumn("campaignId", F.get_json_object(F.col("attributes"), "$.campaign_id"))
        .withColumn("channelId", F.get_json_object(F.col("attributes"), "$.channel_id"))
        .drop("attributes"))
    leading_events_df.show()

    logger.warning("calculate user sessions ...")
    sessions_df = (
        leading_events_df.alias("leading_events")
        .join(clickstream_df.alias("all_events"),
              on=F.col("leading_events.nextEventId") == F.col("all_events.eventId"),
              how="left")
        .select(F.col("leading_events.userId"),
                F.col("leading_events.startSessionEventTime"),
                F.col("leading_events.campaignId"),
                F.col("leading_events.channelId"),
                F.col("all_events.eventTime").alias("endSessionEventTime"))
        .withColumn("sessionId", F.monotonically_increasing_id()))
    sessions_df.show()

    logger.warning("append session to each event ...")
    sessioned_purchases_df = (
        clickstream_df.alias("c")
        .filter(F.col("c.eventType") == "purchase")
        .join(sessions_df.alias("s"),
              on=[F.col("c.userId") == F.col("s.userId"),
                  F.col("c.eventTime") >= F.col("s.startSessionEventTime"),
                  (F.col("c.eventTime") <= F.col("s.endSessionEventTime"))
                  | F.col("s.endSessionEventTime").isNull()])
        .select(F.col("s.userId"),
                F.col("s.sessionId"),
                F.col("s.campaignId"),
                F.col("s.channelId"),
                F.get_json_object(F.col("c.attributes"), "$.purchase_id").alias("purchaseId"))
        .orderBy(F.col("userId"), F.col("eventTime")))
    sessioned_purchases_df.show()

    logger.warning("build purchases attribution ...")
    projection_df = (
        sessioned_purchases_df.alias("s")
        .join(purchases_df.alias("p"),
              on=F.col("p.purchaseId") == F.col("s.purchaseId"))
        .select(F.col("p.purchaseId"), F.col("p.purchaseTime"), F.col("p.billingCost"),
                F.col("p.isConfirmed"), F.col("s.sessionId"), F.col("s.campaignId"),
                F.col("s.channelId")))
    projection_df.show()

    return projection_df
def explode_json_column(df: DataFrame, column: str,
                        json_schema: StructType) -> DataFrame:
    """Create new columns extracting properties from a JSON column.

    Example:

    >>> from pyspark import SparkContext
    >>> from pyspark.sql import session
    >>> from butterfree.testing.dataframe import create_df_from_collection
    >>> from butterfree.core.extract.pre_processing import explode_json_column
    >>> from pyspark.sql.types import (
    ...     ArrayType,
    ...     IntegerType,
    ...     StringType,
    ...     StructField,
    ...     StructType,
    ... )
    >>> spark_context = SparkContext.getOrCreate()
    >>> spark_session = session.SparkSession(spark_context)
    >>> data = [{"json_column": '{"a": 123, "b": "abc", "c": "123", "d": [1, 2, 3]}'}]
    >>> df = create_df_from_collection(data, spark_context, spark_session)
    >>> df.collect()
    [Row(json_column='{"a": 123, "b": "abc", "c": "123", "d": [1, 2, 3]}')]
    >>> json_column_schema = StructType(
    ...     [
    ...         StructField("a", IntegerType()),
    ...         StructField("b", StringType()),
    ...         StructField("c", IntegerType()),
    ...         StructField("d", ArrayType(IntegerType())),
    ...     ]
    ... )
    >>> explode_json_column(
    ...     df, column='json_column', json_schema=json_column_schema
    ... ).collect()
    [
        Row(
            json_column='{"a": 123, "b": "abc", "c": "123", "d": [1, 2, 3]}',
            a=123,
            b='abc',
            c=123,
            d=[1, 2, 3]
        )
    ]

    Args:
        df: input dataframe with the target JSON column.
        column: column name that is going to be exploded.
        json_schema: expected schema from that JSON column.
            Not all "first layer" fields need to be mapped in the json_schema,
            just the desired columns. If any JSON field needs to be cast to a
            struct, the declared schema (a StructType) must match the presented
            record exactly; if it doesn't, the value in the resulting column
            will be null.

    Returns:
        dataframe with the new extracted columns from the JSON column.

    """
    for field in json_schema:
        if field.dataType.typeName() in JSON_TYPE_NAMES:
            # collection types need from_json with an explicit schema
            df = df.withColumn(
                field.name,
                from_json(
                    get_json_object(df[column], "$.{}".format(field.name)),
                    schema=field.dataType,
                ),
            )
        else:  # non-collection data types can be extracted and cast directly
            df = df.withColumn(
                field.name,
                get_json_object(df[column], "$.{}".format(field.name)).cast(
                    field.dataType),
            )
    return df
# COMMAND ----------

# MAGIC %md ###### Extract and flatten some fields

# COMMAND ----------

attrDF = bizDF.select(
    col("business_id"),
    col("name").alias("BusinessName"),
    col("categories"),
    col("address"),
    col("city"),
    col("state"),
    col("postal_code"),
    col("is_open"),
    col("review_count"),
    col("stars").alias("Rating"),
    col("attributes.AcceptsInsurance"),
    col("attributes.AgesAllowed"),
    regexp_replace(regexp_replace(col("attributes.Alcohol"), 'u\'', ''), '\'', '').alias("Alcohol"),
    get_json_object("Ambience", "$.touristy").alias("HasTouristyAmbience"),
    get_json_object("Ambience", "$.hipster").alias("HasHipsterAmbience"),
    get_json_object("Ambience", "$.romantic").alias("HasRomanticAmbience"),
    get_json_object("Ambience", "$.intimate").alias("HasIntimateAmbience"),
    get_json_object("Ambience", "$.trendy").alias("HasTrendyAmbience"),
    get_json_object("Ambience", "$.upscale").alias("HasUpscaleAmbience"),
    get_json_object("Ambience", "$.classy").alias("HasClassyAmbience"),
    get_json_object("Ambience", "$.casual").alias("HasCasualAmbience"),
    get_json_object("DietaryRestrictions", "$.dairy-free").alias("IsDairyFree"),
    get_json_object("DietaryRestrictions", "$.gluten-free").alias("IsGlutenFree"),
    get_json_object("DietaryRestrictions", "$.vegan").alias("IsVegan"),
    get_json_object("DietaryRestrictions", "$.kosher").alias("IsKosher"),
    get_json_object("DietaryRestrictions", "$.halal").alias("IsHalal"),
    get_json_object("DietaryRestrictions", "$.soy-free").alias("IsSoyFree"),