Example #1
def save_authors():
    connectionProperties = {
        #"customSchema": 'id INT, co_list STRING, last_page INT, saved INT',
        "customSchema":
        'id LONG, full_name STRING, subject_areas STRING, document_count INT, cited_by_count INT, citations_count INT, h_index INT, coauthors_count INT, affiliation_current STRING, cat STRING, country STRING, docs_fetched INT, last_page INT, is_sample INT, citations INT',
        "driver": 'org.sqlite.JDBC'
    }

    df = sqlContext.read.jdbc(url='jdbc:sqlite:../iconic.db',
                              table='author',
                              properties=connectionProperties)
    full_name = F.concat_ws(' ', F.get_json_object(df.full_name,
                                                   '$[*].surname'),
                            F.get_json_object(df.full_name, '$[*].given-name'))
    country = F.get_json_object(df.affiliation_current,
                                '$.affiliation-country')
    city = F.get_json_object(df.affiliation_current, '$.affiliation-city')
    university = F.get_json_object(df.affiliation_current,
                                   '$.affiliation-name')

    country_name = F.udf(lambda c: get_country_name(c), StringType())

    df = df.withColumn('full_name', full_name)
    df = df.withColumn('country', country_name(country))
    df = df.withColumn('city', city)
    df = df.withColumn('university', university)
    df = df.drop('document_count', 'h_index', 'citations_count',
                 'coauthors_count', 'affiliation_current', 'docs_fetched',
                 'last_page', 'is_sample', 'citations')

    df.write.mode('overwrite').parquet(os.path.abspath('./data/authors'))
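
The full_name and affiliation_current fields above are JSON strings, and the '$[*]' wildcard path pulls every match at once. A minimal, hypothetical illustration of that behaviour on toy data (not part of the original example):

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Toy author list shaped like the full_name column assumed above
authors = spark.createDataFrame(
    [('[{"surname": "Curie", "given-name": "Marie"}, '
      '{"surname": "Curie", "given-name": "Pierre"}]',)],
    ['full_name'])

authors.select(
    F.get_json_object('full_name', '$[*].surname').alias('surnames')
).show(truncate=False)
# With several matches the result is a JSON-encoded array string, e.g. ["Curie","Curie"]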
Example #2
def load_evaluation(events, start_date, end_date):
    content_items = events \
        .filter(events['event'] == 'ContentItem_Received'.lower()) \
        .filter(events['year'] >= start_date.year).filter(events['year'] <= end_date.year) \
        .filter(events['time'] >= start_date).filter(events['time'] <= end_date) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['IsTest']".lower()) == 'false') \
        .filter(events['MessageId'.lower()].isNotNull()) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['ModelId']".lower()) == 'WaterfallTopRank'.lower()) \
        .drop_duplicates(subset=['MessageId']) \
        .select(
            events['time'],
            events['CarrierName'.lower()].alias('carrier_name'),
            events['DaysFromEulaApproval'.lower()].astype('float').alias('days_from_eula'),
            events['ProgramName'.lower()].alias('program_name'),
            events['MessageId'.lower()].alias('message_id'),
            events['MessageType'.lower()].alias('message_type'),
            events['DeviceVendor'.lower()].alias('device_vendor'),
            F.get_json_object(
                events['properties'], "$['properties']['InstallationSource']".lower()
            ).alias('installation_source'),
            F.get_json_object(
                events['properties'], "$['properties']['ModelRecommendationProbability']".lower()
            ).cast('float').alias('waterfall_p')
        ) \
        .fillna(value='unset')      # FIXME: empty string value

    print('received notification loaded')
    return content_items
Example #3
def transform_stream(kinesis_stream):
    kinesis_stream \
        .selectExpr("CAST(sequenceNumber AS STRING)", "CAST(data AS STRING)") \
        .withColumn("value", col("data").cast(StringType())) \
        .withColumn("measure_id", col("sequenceNumber").cast(StringType())) \
        .select(
            col("measure_id"),
            get_json_object("value", "$.sensorId").alias("sensor_id"),
            get_json_object("value", "$.currentTemperature").alias("current_temperature"),
            get_json_object("value", "$.status").alias("status")
        ) \
        .writeStream \
        .foreach(process_row) \
        .start()
def load_time_on_page(events, start_date, end_date):
    time_on_page = events \
        .filter(events['event'] == 'ContentItem_TimeOnPage'.lower()) \
        .filter(events['year'] >= start_date.year).filter(events['year'] <= end_date.year) \
        .filter(events['time'] >= start_date).filter(events['time'] <= end_date) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['IsTest']".lower()) == 'false') \
        .filter(events['MessageId'.lower()].isNotNull()) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['TimeOnPage']".lower()).isNotNull()) \
        .groupBy(F.col('MessageId'.lower()).alias('message_id')) \
        .agg(F.max(F.get_json_object(
            events['properties'], "$['properties']['TimeOnPage']".lower()).astype('float')
            ).alias('time_on_page'))

    print('TimeOnPage events loaded')
    return time_on_page
Example #5
def filter_raw_data(df):
    """Extract only relevant columns of data we require for topic modelling.
       NOTE: The unix timestamp from MongoDB is divided by 1000 here because of the
       extra 3 zeros at the end (we don't need milliseconds).
    """
    dataDF = df.select('_id', 'publishedAt', 'outlet', 'url', 'title', 'body', 'peopleFemaleCount',
                       'peopleMaleCount', 'sourcesFemaleCount', 'sourcesMaleCount') \
        .withColumn('id', get_ids(f.col('_id'))) \
        .withColumn('unix_timestamp', f.get_json_object(df.publishedAt, "$.$date") / 1000) \
        .withColumn('string_timestamp', f.from_unixtime(f.col('unix_timestamp'))) \
        .withColumn('timestamp', f.col('string_timestamp').cast(t.TimestampType())) \
        .drop('_id', 'publishedAt', 'unix_timestamp', 'string_timestamp')
    return dataDF
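
The NOTE in the docstring is easy to check in isolation; a minimal sketch of the millisecond-to-second conversion on made-up data (the column value here is hypothetical, not from the original pipeline):

from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pyspark.sql.types as t

spark = SparkSession.builder.master("local[1]").getOrCreate()

# MongoDB-style export: publishedAt holds {"$date": <epoch milliseconds>}
sample = spark.createDataFrame([('{"$date": 1609459200000}',)], ['publishedAt'])
(sample
 .withColumn('unix_timestamp', f.get_json_object('publishedAt', '$.$date') / 1000)
 .withColumn('timestamp', f.from_unixtime(f.col('unix_timestamp')).cast(t.TimestampType()))
 .show(truncate=False))
# timestamp -> 2021-01-01 00:00:00 with a UTC session timezone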
Example #6
def kafkaCast(stream: DataStreamReader, args: dict) -> DataStreamReader:
    """Return a new dataframe with normalized attrs from `value`

    Args:
        stream (Stream): processed stream.

    Returns:
        Stream: casted stream.
    """
    json_objects = []
    for u in ["uuid", "capability", "timestamp", "value"]:
        json_objects.append(get_json_object(stream.value, '$.' + u).alias(u))
    return stream.select(json_objects)
Example #7
def transform_data_with_udf(clickstream_data, purchase_data):
    window1 = Window.partitionBy('userId').orderBy('eventTime')
    window2 = Window.orderBy('sessionId')

    clickstream_data = (clickstream_data.withColumn(
        'appOpenFlag',
        app_open_flag_udf(clickstream_data['eventType'])).withColumn(
            'sessionId',
            sum(col('appOpenFlag')).over(window1)).withColumn(
                'attr',
                attributes_udf(
                    clickstream_data['eventType'],
                    clickstream_data['attributes'])).withColumn(
                        'campaign_id',
                        when(
                            get_json_object('attr',
                                            '$.campaign_id').isNotNull(),
                            get_json_object('attr',
                                            '$.campaign_id')).otherwise(None)
                    ).withColumn(
                        'channel_id',
                        when(
                            get_json_object('attr',
                                            '$.channel_id').isNotNull(),
                            get_json_object(
                                'attr',
                                '$.channel_id')).otherwise(None)).withColumn(
                                    'purchase_id',
                                    when(
                                        get_json_object(
                                            'attr',
                                            '$.purchase_id').isNotNull(),
                                        get_json_object(
                                            'attr',
                                            '$.purchase_id')).otherwise(None)).
                        withColumn(
                            'campaignId',
                            last(col('campaign_id'), ignorenulls=True).over(
                                window2.rowsBetween(
                                    Window.unboundedPreceding, 0))).withColumn(
                                        'channelId',
                                        last(col('channel_id'),
                                             ignorenulls=True).over(
                                                 window2.rowsBetween(
                                                     Window.unboundedPreceding,
                                                     0))))

    target_df = clickstream_data.join(
        purchase_data,
        clickstream_data['purchase_id'] == purchase_data['purchaseId'],
        JOIN_TYPE.LEFT)

    return target_df.select(col('purchaseId'), col('purchaseTime'),
                            col('billingCost'), col('isConfirmed'),
                            col('sessionId'), col('campaignId'),
                            col('channelId'))
def load_received_notifications(events, start_date, end_date):
    # datetime objects are serializable only from spark 2.2.1
    content_items = events \
        .filter(events['event'] == 'ContentItem_Received'.lower()) \
        .filter(events['year'] >= start_date.year).filter(events['year'] <= end_date.year) \
        .filter(events['time'] >= start_date).filter(events['time'] <= end_date) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['IsTest']".lower()) == 'false') \
        .filter(events['MessageId'.lower()].isNotNull()) \
        .drop_duplicates(subset=['MessageId']) \
        .select(
            events['time'],
            events['CarrierName'.lower()].alias('carrier_name'),
            events['DaysFromEulaApproval'.lower()].astype('float').alias('days_from_eula'),
            events['MessageId'.lower()].alias('message_id'),
            events['MessageType'.lower()].alias('message_type'),
            events['DeviceVendor'.lower()].alias('device_vendor'),
        )

    print('received notification loaded')
    return content_items
Example #9
def save_collaborations():
    auths_struct = StructType([StructField("list", ArrayType(StringType()))])

    connectionProperties = {
        "customSchema":
        'abs_id LONG, authors STRING, published STRING, cited_by INT, keywords STRING, coll_count INT, h_index INT, message STRING, saved INT',
        "driver": 'org.sqlite.JDBC'
    }

    df = sqlContext.read.jdbc(url='jdbc:sqlite:../iconic.db',
                              table='collaboration',
                              properties=connectionProperties)

    df = df.withColumn('authors', F.get_json_object(df.authors, '$[*].authid'))

    map_auths = F.udf(lambda s: '{{"list": {}}}'.format(s), StringType())
    df = df.withColumn('auths', map_auths(df.authors))
    df = df.withColumn('authors', F.from_json(df.auths, auths_struct))
    df = df.withColumn("authors_size", F.size(F.col('authors.list')))
    new_df = df.select('abs_id', 'published', 'cited_by', 'keywords',
                       F.explode(df.authors.list).alias('author_id'))

    new_df.write.parquet(os.path.abspath('./data/collaborations'))
Example #10
def spark_process(collection):
    start_timer = time.perf_counter()
    adg_hive_select_query = "select * from %s" % collection.staging_hive_table
    df = get_dataframe_from_staging(adg_hive_select_query)
    raw_df = df.select(
        df.data,
        F.get_json_object(df.data,
                          "$.message.encryption.encryptedEncryptionKey").alias(
                              "encryptedKey"),
        F.get_json_object(df.data,
                          "$.message.encryption.keyEncryptionKeyId").alias(
                              "keyEncryptionKeyId"),
        F.get_json_object(
            df.data, "$.message.encryption.initialisationVector").alias("iv"),
        # The line below is commented out and worked around because it truncates the db object
        #F.get_json_object(df.data, "$.message.dbObject").alias("dbObject"),
        F.get_json_object(df.data, "$.message.db").alias("db_name"),
        F.get_json_object(df.data,
                          "$.message.collection").alias("collection_name"),
        F.get_json_object(df.data, "$.message._id").alias("id"))
    key_df = raw_df.withColumn(
        "key",
        get_plain_key(raw_df["encryptedKey"], raw_df["keyEncryptionKeyId"]))
    decrypted_df = key_df.withColumn(
        "decrypted_db_object",
        decryption(key_df["key"], key_df["iv"], key_df["data"]))
    validated_df = decrypted_df.withColumn(
        "validated_db_object", validation(decrypted_df["decrypted_db_object"]))
    sanitised_df = validated_df.withColumn(
        "sanitised_db_object",
        sanitise(validated_df["validated_db_object"], validated_df["db_name"],
                 validated_df["collection_name"]))
    clean_df = sanitised_df.withColumnRenamed("sanitised_db_object", "val")
    values = clean_df.select("val")
    parquet_location = persist_parquet(collection.collection_name, values)
    prefix = ("${file_location}/" + collection.collection_name + "/" +
              collection.collection_name + ".parquet")
    tag_objects(prefix, tag_value=collection.tag_value)
    create_hive_on_published(parquet_location, collection.collection_name)
    end_timer = time.perf_counter()
    time_taken = round(end_timer - start_timer)
    time_taken = str(datetime.timedelta(seconds=time_taken))
    the_logger.info(
        f"time taken for {collection.collection_name}: {time_taken}")
# COMMAND ----------

from pyspark.sql.functions import create_map
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .selectExpr("explode(complex_map)").show(2)

# COMMAND ----------

jsonDF = spark.range(1).selectExpr("""
  '{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")

# COMMAND ----------

from pyspark.sql.functions import get_json_object, json_tuple

jsonDF.select(
    get_json_object(col("jsonString"),
                    "$.myJSONKey.myJSONValue[1]").alias("column"),
    json_tuple(col("jsonString"), "myJSONKey")).show(2)

# COMMAND ----------

from pyspark.sql.functions import to_json
df.selectExpr("(InvoiceNo, Description) as myStruct")\
  .select(to_json(col("myStruct")))

# COMMAND ----------

from pyspark.sql.functions import from_json
from pyspark.sql.types import *
parseSchema = StructType(
    (StructField("InvoiceNo", StringType(),
                 True), StructField("Description", StringType(), True)))
Example #12
        .getOrCreate()

    spark.conf.set("spark.sql.shuffle.partitions", 2)
    spark.sparkContext.setLogLevel("ERROR")

    msft_df = spark\
            .readStream\
            .format("kafka")\
            .option("kafka.bootstrap.servers","localhost:9092")\
            .option("subscribe","capstone14")\
            .load()

    msft_df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

    msft_df2 = msft_df.select(
        func.get_json_object(func.col("value").cast("string"),
                             '$[*].open').alias('open'),
        func.get_json_object(func.col("value").cast("string"),
                             '$[*].high').alias('high'),
        func.get_json_object(func.col("value").cast("string"),
                             '$[*].low').alias('low'),
        func.get_json_object(func.col("value").cast("string"),
                             '$[*].close').alias('close'),
        func.get_json_object(func.col("value").cast("string"),
                             '$[*].volume').alias('volume'),
        func.get_json_object(func.col("value").cast("string"),
                             '$[*].date').alias('date'))

    msft_df2.printSchema()

    msft_df3 = msft_df2 \
        .writeStream \
Example #13
    config("spark.ui.port", "0"). \
    config('spark.cassandra.connection.host', '127.0.0.1'). \
    getOrCreate()

clicks_df = spark. \
    read. \
    format("org.apache.spark.sql.cassandra"). \
    options(table = "clickevents", keyspace = "webanalytics"). \
    load()

clicks_df.show(truncate=False)

from pyspark.sql.functions import col, get_json_object

clicks_df = clicks_df. \
    withColumn("x", get_json_object(col("value"), "$.x").cast("long")). \
    withColumn("y", get_json_object(col("value"), "$.y").cast("long")). \
    drop("value")

clicks_df.printSchema()

pandas_df = clicks_df.toPandas()

import plotly.express as px

fig = px.scatter(x=pandas_df["x"], y=pandas_df["y"])

fig.update_yaxes(autorange="reversed")

from PIL import Image
img = Image.open(
Example #14
  .add('source', StringType()) \
  .add('type', StringType()) \
  .add('data', StringType()) \
  .add('ts', StringType())

src = spark.read.load('s3://demoanalyticsapp-output/*/*/*/*/',
                      format='parquet',
                      schema=schema)

## Build out some new columns with data extracted from the JSON string

df = src \
  .withColumn('n_ts', F.unix_timestamp('ts', "yyyy-MM-dd'T'hh:mm:ss").cast('timestamp')) \
  .withColumn('agerange', \
    F.from_json( \
      F.get_json_object('data', '$.facedetails[*].agerange'), \
      StructType().add('high', IntegerType()).add('low', IntegerType()) \
    ) \
  ) \
  .withColumn('smile', \
    F.from_json( \
      F.get_json_object('data', '$.facedetails[*].smile'), \
      StructType().add('confidence', DoubleType()).add('value', BooleanType()) \
    ) \
  ) \
  .withColumn('eyeglasses', \
    F.from_json( \
      F.get_json_object('data', '$.facedetails[*].eyeglasses'), \
      StructType().add('confidence', DoubleType()).add('value', BooleanType()) \
    ) \
  ) \
    df = get_router_data(source_schema, last_loaded_date, max_createdmsec)

    cdlhosts_data_cnt = df.count()

    print("Incremental Data to be loaded to TDM Table =" +
          str(cdlhosts_data_cnt) + " records")
if (cdlhosts_data_cnt > 0):
    df1 = df.withColumn('eventdate', regexp_replace(col("datum"), "-", ""))
    hst = df1.filter("host.interfacetype = 'WIFI'")\
                                    .select(col("eventid").alias("id"),\
                                    col("assetid").alias("networkdeviceid"),\
                                    col("host.hostmac").alias("MacAddress"),\
                                    col("host.hostname").alias("HostName"),\
                                    "eventtime",\
                                    "eventdate",\
                                    f.get_json_object("host.params", "$.opaquedata.band").alias("band")\
                                    )
    dup = hst.dropDuplicates()
    hst_cnt = dup.count()
    print("Total source count is : " + str(hst_cnt))

    #######cdl_acscoll_wifiparam_prq table data ########

    Wifi_params_max_createdmsec = get_last_loadValue("audit_value",
                                                     audit_schema,
                                                     WifiParams_source_table,
                                                     tdm_table)

    print("Last loaded data has Wifi_params_max_createdmsec = " +
          str(Wifi_params_max_createdmsec))
Example #16
df_transactions = df_transactions.join(
    df_cotizaciones, df_transactions['dt'] == df_cotizaciones['fecha'],
    'inner')

df_transactions = df_transactions.withColumn(
    'amount_usd',
    F.col("amount") / F.col("venta_uala"))

df_transactions = df_transactions.join(
    df_authorizations, df_transactions["authorization_id"] ==
    df_authorizations["authorization_id"], "left")

df_transactions = df_transactions.select(
    "dt", "account_from", "amount_usd", "transaction_type",
    F.get_json_object(df_transactions.metadata, '$.mcc').alias('mcc'))

df_transactions = df_transactions.withColumn(
    'year_month', F.date_format(df_transactions.dt, 'YYYY-MM'))

df_transactions = df_transactions.drop("dt")

df_transactions = df_transactions.join(
    df_param_mcc, df_transactions["mcc"] == df_param_mcc["mcc_param"],
    "left")  #.na.fill('otros')

df_transactions = df_transactions.fillna('otros', subset=['category'])

##### BUILD PURCHASES BY CATEGORY USING MCC #######
df_transactions = df_transactions.withColumn(
    'new_transaction_type',
Example #17
def extractJsonFromString(df, attrs):
    json_objects = []
    for u in attrs:
        json_objects.append(get_json_object(df.value, '$.' + u).alias(u))
    return json_objects
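
A hypothetical usage sketch for the helper above; the JSON value and the extracted keys are made up, and df.select accepts the list of columns the helper returns:

from pyspark.sql import SparkSession
from pyspark.sql.functions import get_json_object  # needed by the helper above

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame(
    [('{"uuid": "abc-123", "capability": "temperature", "value": "21.5"}',)],
    ['value'])

df.select(extractJsonFromString(df, ['uuid', 'capability'])).show(truncate=False)
# uuid -> abc-123, capability -> temperature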
Example #18
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,get_json_object,json_tuple
spark = SparkSession.builder.appName("Pyspark example").getOrCreate()

jsonDF = spark.range(1).selectExpr("""
    '{"key": {"value": [1, 2, 3]}}' as jsonString""")

jsonDF.select(
    get_json_object(col("jsonString"), "$.key.value[0]").alias("column"),
    json_tuple(col("jsonString"), "key")).show(2)
Example #19
    spark = SparkSession \
        .builder \
        .appName("Kafka2Files") \
        .getOrCreate()

    schema = StructType()\
        .add("field1", IntegerType())\
        .add("field2", StringType())\
        .add("field3", StringType())

    lines = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", brokers) \
        .option("subscribe", topic) \
        .load()\
        .select(get_json_object(col("value").cast("string"),"$.field1").alias('user_id'),\
                get_json_object(col("value").cast("string"),"$.field2").alias('type'))\
        .where(col("type").startswith("#"))

    query = lines \
     .writeStream \
     .format("csv") \
     .option("format", "append") \
     .option("path", save2file) \
     .option("checkpointLocation", path2Checkpoint) \
     .outputMode("append") \
     .start()

    query.awaitTermination()
## Read in data by pointing to its table name in Glue Data Catalog
schema = StructType() \
  .add('source', StringType()) \
  .add('type', StringType()) \
  .add('data', StringType()) \
  .add('ts', StringType())

src = spark.read.load('s3://demoanalyticsapp-output/*/*/*/*/', format='parquet', schema=schema)

## Build out some new columns with data extracted from the JSON string

df = src \
  .withColumn('n_ts', F.unix_timestamp('ts', "yyyy-MM-dd'T'hh:mm:ss").cast('timestamp')) \
  .withColumn('agerange', \
    F.from_json( \
      F.get_json_object('data', '$.facedetails[*].agerange'), \
      StructType().add('high', IntegerType()).add('low', IntegerType()) \
    ) \
  ) \
  .withColumn('smile', \
    F.from_json( \
      F.get_json_object('data', '$.facedetails[*].smile'), \
      StructType().add('confidence', DoubleType()).add('value', BooleanType()) \
    ) \
  ) \
  .withColumn('eyeglasses', \
    F.from_json( \
      F.get_json_object('data', '$.facedetails[*].eyeglasses'), \
      StructType().add('confidence', DoubleType()).add('value', BooleanType()) \
    ) \
  ) \
                        "Append").format("console").option(
                            "checkpointLocation", "./checkpoint/").start())

# output to console with json parse
schema = (StructType().add("timestamp", TimestampType()).add(
    "open", IntegerType()).add("high", IntegerType()).add(
        "low", IntegerType()).add("close",
                                  IntegerType()).add("volume", IntegerType()))

df.select(
    df["key"].cast("string"), from_json(
        df["value"].cast("string"),
        schema)).writeStream.outputMode("Append").format("console").start()

# output to csv
ds = (df.select(
    df["key"].cast("string"),
    from_json(df["value"].cast("string"),
              schema)).writeStream.outputMode("Append").format("csv").option(
                  "path", "./csv_output/").option("checkpointLocation",
                                                  "./checkpoint/").start())

# output with window function
ds = (df.select(
    get_json_object(df.value.cast("string"),
                    "$.timestamp").cast("timestamp").alias("timestamp"),
    get_json_object(df.value.cast("string"),
                    "$.close").cast("float").alias("close"),
).groupby(window("timestamp", "5 seconds")).avg(
    "close").writeStream.outputMode("Update").format("console").start())
Example #22
def transform_data(clickstream_data, user_purchases_data):
    window1 = Window.partitionBy('userId').orderBy('eventTime')
    window2 = Window.orderBy('sessionId')
    appOpenFlagDefaultVal = 0

    clickstream_data = (
        clickstream_data.withColumn(
            'appOpenFlag',
            when(
                (col('eventType') == Event_Type.OPEN),
                monotonically_increasing_id()).otherwise(appOpenFlagDefaultVal)
        ).withColumn(
            'sessionId',
            sum(col('appOpenFlag')).over(window1)).withColumn(
                'attrs',
                when(
                    col('eventType') == Event_Type.PURCHASE,
                    clickstream_data['attributes'].substr(
                        lit(2),
                        length('attributes') -
                        lit(2))).otherwise(col('attributes'))).withColumn(
                            'attr',
                            when(
                                col('eventType') == Event_Type.PURCHASE,
                                regexp_replace(col('attrs'), '""',
                                               "'")).otherwise(col('attrs'))).
        withColumn(
            'campaign_id',
            when(
                get_json_object(
                    'attr',
                    '$.campaign_id').isNotNull(),
                get_json_object(
                    'attr', '$.campaign_id')).otherwise(None)).withColumn(
                        'channel_id',
                        when(
                            get_json_object(
                                'attr', '$.channel_id').isNotNull(),
                            get_json_object(
                                'attr',
                                '$.channel_id')).otherwise(None)).withColumn(
                                    'purchase_id',
                                    when(
                                        get_json_object(
                                            'attr',
                                            '$.purchase_id').isNotNull(),
                                        get_json_object(
                                            'attr',
                                            '$.purchase_id')).otherwise(None)).
        withColumn(
            'campaignId',
            last(col('campaign_id'), ignorenulls=True).over(
                window2.rowsBetween(Window.unboundedPreceding, 0))).withColumn(
                    'channelId',
                    last(col('channel_id'), ignorenulls=True).over(
                        window2.rowsBetween(Window.unboundedPreceding, 0))))

    target_df = clickstream_data.join(
        user_purchases_data,
        clickstream_data['purchase_id'] == user_purchases_data['purchaseId'],
        JOIN_TYPE.LEFT)
    return target_df.select(col('purchaseId'), col('purchaseTime'),
                            col('billingCost'), col('isConfirmed'),
                            col('sessionId'), col('campaignId'),
                            col('channelId'))
Example #23
def get_sentiment(text):
    from textblob import TextBlob
    try:
      tweet = TextBlob(text)
      return tweet.sentiment.polarity
    except:
      return None

# Define your function
getSentiment = UserDefinedFunction(get_sentiment, StringType())
# Apply the UDF using withColumn

outDF = (kinesisDF
       .selectExpr("cast(data as string)")
       .withColumn('id', get_json_object(col("data"),"$[0].id"))
       .withColumn('ts', get_json_object(col("data"),"$[0].ts"))
       .withColumn('tweet', get_json_object(col("data"),"$[0].tweet"))
       .withColumn('tag', get_json_object(col("data"),"$[0].tag"))
       .withColumn('sentiment', getSentiment(col("tweet")).cast(FloatType()))
       .withColumn('datetime',
                 to_timestamp(regexp_extract( col('ts'), '\\w\\w\\w \\d\\d? (\\d+):(\\d+):(\\d+)',0), "MMM dd HH:mm:ss"))
       .select(col('sentiment'),col('datetime'), col('tag'), col('tweet'))
       .withColumn('sentiment_cnt', col("sentiment"))
       .withColumn('tweet_dup', col("tweet"))
       .groupBy(col("tag"), window(col('datetime'), "5 seconds").alias("timewindow"))
       .agg({'sentiment_cnt': 'count','sentiment':'avg','tweet':'first','tweet_dup':'last'})
       .withColumnRenamed("count(sentiment_cnt)","notweets")
       .withColumnRenamed("avg(sentiment)","avgsentiment")
       .withColumnRenamed('first(tweet)',"tweet1")
       .withColumnRenamed('last(tweet_dup)',"tweet2")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--output_path',
        help='output path in s3 including bucket without file name')
    args = parser.parse_args()
    if args.output_path:
        output_path = args.output_path
    else:
        raise ValueError('missing argument - output_path')

    model_name = 'photography'
    spark_config = SparkConf().setAppName(model_name)

    dependencies = get_dependencies()
    spark_context = SparkContext(conf=spark_config, pyFiles=dependencies)
    hive_context = HiveContext(spark_context)

    is_photo_related = F.udf(
        lambda s: True if ('camera' in s) or ('video' in s) else False,
        types.BooleanType())

    get_event_score = F.udf(score_single_event, types.FloatType())

    # received notifications should be at least as many as viewed
    fix_received = F.udf(lambda received, view: max(received, view),
                         types.FloatType())

    # TODO: switch to l1 home_events_uuid
    events = hive_context.table('l2_sprint.mixpanel_home')

    # choose photography related content interactions from notifications
    # (devicefeatures_attribute exists only for notification items, not future cards)
    # relevant content started approximately '2017-10-31'
    content_items = events \
        .filter(events['event'].isin(
            [x.lower() for x in ['ContentItem_Received', 'ContentItem_View', 'ContentItem_Click',
                                 'ContentItem_TimeOnPage', 'ContentItem_PageScroll']])
        ) \
        .filter(events['Time'.lower()] > '2017-10-31') \
        .filter(events['CarrierName'.lower()].isin('sprint', 'verizon')) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['IsTest']".lower()) == 'false') \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['DeviceId']".lower()).isNotNull()) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['MessageType']".lower()).isNotNull()) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['devicefeatures_attribute']").isNotNull()) \
        .filter(is_photo_related(F.get_json_object(
            events['properties'], "$['properties']['devicefeatures_attribute']")))

    # assign score for each interactions
    content_items = content_items \
        .withColumn(
            'score',
            get_event_score(
                events['event'], F.get_json_object(events['properties'], "$['properties']"))
        )

    # aggregate score per user, item, event, action (action to differentiate clicks).
    # use max on properties for score because, for example, page scroll sends intermediate states.
    # use max on properties in case it's null or empty string in one of the events
    content_items = content_items \
        .groupBy(
            F.get_json_object(
                events['properties'], "$['properties']['DeviceId']".lower()).alias('device_id'),
            events['event'],
            F.get_json_object(
                events['properties'], "$['properties']['MessageType']".lower()).alias('topic'),
            F.get_json_object(
                events['properties'], "$['properties']['ActionId']".lower()).alias('action')
        ) \
        .agg(
            F.max(F.get_json_object(
                events['properties'], "$['properties']['AtlasUniqueUserId']".lower())).alias('user_id'),
            F.max('CarrierName'.lower()).alias('carrier_name'),
            F.max('DeviceModel'.lower()).alias('device_model'),
            F.max('DeviceModelName'.lower()).alias('device_model_name'),
            F.max('DeviceOsType'.lower()).alias('device_os_type'),
            F.max('DeviceVendor'.lower()).alias('device_vendor'),
            F.max('score').alias('score')
        )

    # FIXME: fix view according to action events
    received_content_items = content_items \
        .groupBy('device_id') \
        .pivot('event', ['ContentItem_Received'.lower(), 'ContentItem_View'.lower()]).sum('score') \
        .fillna(0.0) \
        .select(
            'device_id',
            fix_received(F.col('contentitem_received'), F.col('contentitem_view')).alias('receive'))

    # calculate final score for user.
    content_items = content_items \
        .filter(events['event'] != 'ContentItem_Received'.lower()) \
        .groupBy('device_id') \
        .agg(
            F.max('user_id').alias('user_id'),
            F.max('carrier_name').alias('carrier_name'),
            F.max('device_model').alias('device_model'),
            F.max('device_model_name').alias('device_model_name'),
            F.max('device_os_type').alias('device_os_type'),
            F.max('device_vendor').alias('device_vendor'),
            F.sum('score').alias('total_score')
        ) \
        .join(received_content_items, 'device_id', 'left') \
        .withColumn('score', F.round(F.col('total_score') / F.col('receive'))) \
        .drop('total_score', 'receive') \
        .withColumn('photography_interest', F.lit(None))

    # choose users who completed user interest questionnaire
    interests = events \
        .filter(events['event'] == 'Timeline_OnboardingMessage_Click'.lower()) \
        .filter(events['CarrierName'.lower()].isin('sprint', 'verizon')) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['IsTest']".lower()) == 'false') \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['DeviceId']".lower()).isNotNull()) \
        .filter(F.get_json_object(
            events['properties'], "$['properties']['ActionId']".lower()) == 'done')

    # assign score for photography interest
    interests = interests \
        .withColumn(
            'score',
            get_event_score(
                events['event'], F.get_json_object(events['properties'], "$['properties']"))
        )

    # subset relevant properties and drop duplicated devices
    # (assuming each user should answer the questionnaire once)
    interests = interests \
        .select(
            F.get_json_object(
                events['properties'], "$['properties']['DeviceId']".lower()).alias('device_id'),
            F.get_json_object(
                events['properties'], "$['properties']['AtlasUniqueUserId']".lower()).alias('user_id'),
            events['CarrierName'.lower()].alias('carrier_name'),
            events['DeviceModel'.lower()].alias('device_model'),
            events['DeviceModelName'.lower()].alias('device_model_name'),
            events['DeviceOsType'.lower()].alias('device_os_type'),
            events['DeviceVendor'.lower()].alias('device_vendor'),
            'score'
        ) \
        .drop_duplicates(['device_id']) \
        .withColumn('photography_interest', F.when(F.col('score') > 0, 1.0).otherwise(0.0))

    # aggregate content and interest scores
    # use max on properties in case it's null or empty string in one of the events
    photography_user = content_items.union(interests) \
        .groupBy('device_id') \
        .agg(
            F.max('user_id').alias('user_id'),
            F.max('carrier_name').alias('carrier_name'),
            F.max('device_model').alias('device_model'),
            F.max('device_model_name').alias('device_model_name'),
            F.max('device_os_type').alias('device_os_type'),
            F.max('device_vendor').alias('device_vendor'),
            F.sum('score').alias('score'),
            F.max('photography_interest').alias('photography_interest')
        )

    dgx = hive_context.table('l2_asurion.demographics_dbo_source_dgx')
    mobileid = hive_context.table('l3_sprint.mobileid')

    # FIXME: decrypt ethnicityrollup, dob, ethnicity
    photography_user_augmented = photography_user \
        .join(mobileid.select('mobileuid', 'subid'),
              photography_user['user_id'] == mobileid['mobileuid'],
              'left') \
        .join(dgx.select('source_dfx_id', 'nameprefix', 'state', 'age_range', 'income_range_vds',
                         'gender', 'marital_status', 'dwelling_type', 'home_ownership',
                         'length_of_residence', 'presence_of_children',
                         'mail_public_responder_indicator', 'mail_responsive_buyer_indicator',
                         'home_value_range', 'networthindicator_rollup', 'wealth_decile',
                         'homeandlandvalue', 'first_mortgage_amount', 'level_of_education',
                         'head_of_household', 'professionalrollup', 'premover',
                         'active_fitness_interest', 'golf_interest', 'traveler', 'green_advocate'),
              mobileid['subid'] == dgx['source_dfx_id'],
              'left')

    apps = hive_context.read.parquet(APPS_PATH)

    photography_user_augmented = photography_user_augmented \
        .join(apps, photography_user_augmented['device_id'] == apps['deviceId'], 'left')

    photography_user_augmented.write.csv('s3://' + output_path,
                                         mode='overwrite',
                                         compression='gzip',
                                         header=True)
Example #25
    .master('spark://master:7077') \
    .getOrCreate()
    
# Get the json-formatted data from Kafka stream
kafka_movies = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "147.46.216.122:9092") \
    .option("subscribe", "movie") \
    .load()
    
# Define the relational schema
schema = StructType().add("title", StringType()).add("genre", StringType()).add("year", LongType())

# Change the JSON events into relational tuples
relational_movies = kafka_movies.select([get_json_object(col("value").cast("string"), "$.{}".format(c)).alias(c)
    for c in ["title", "genre", "year"]])

# Change the type of year from string to integer
relational_movies = relational_movies.select(col("title"), col("genre"), relational_movies.year.cast('integer').alias('year'))
# Select the movie titles with year < 2000
results = relational_movies.select("title").where("year < 2000")

query = results \
    .writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

query.awaitTermination(60)
Example #26
    .appName("ReadFromKafkaStoreToMysql") \
    .getOrCreate()

# readStream for reading data from kafka topic test
df = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "localhost:9094") \
  .option("subscribe", "test") \
  .load()

# Cast the value column of the dataframe to string and alias it as json
data = df.selectExpr("CAST(value AS STRING) as json")

# Extract fields from the streaming json objects
new_data = data.withColumn("id",F.get_json_object("json",'$.id'))\
    .withColumn("Product",F.get_json_object("json",'$.Product'))  \
    .withColumn("Price",F.get_json_object("json",'$.Price'))  \
    .withColumn("Payment_Type",F.get_json_object("json","$.Payment_Type"))  \
    .withColumn("Name",F.get_json_object("json","$.Name")).drop("json","$.json")


# Function to write each batch of kafka data to mysql
def writeToSQL(df, epochId):
    df.write.format("jdbc")\
        .option("url", "jdbc:mysql://localhost:3306/praveen")\
        .option("driver", "com.mysql.cj.jdbc.Driver")\
        .option("dbtable", "product")\
        .option("user", "root")\
        .option("password", "root")\
        .mode("append")\
        .save()  # .save() assumed; the original snippet is truncated after mode("append")
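
A function with this (df, epochId) signature is typically attached to the stream with foreachBatch. A hypothetical wiring for the pieces above (not part of the original snippet):

# Hypothetical: run writeToSQL on every micro-batch of new_data and keep the query alive
query = new_data.writeStream \
    .foreachBatch(writeToSQL) \
    .outputMode("append") \
    .start()

query.awaitTermination()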
Example #27
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, struct, split, explode, expr, create_map, get_json_object, json_tuple

if __name__ == '__main__':
    spark = SparkSession.builder.appName("learning").master(
        "local").getOrCreate()
    df = spark.range(1).selectExpr("""
        '{"myJsonKey": {"myJsonValue": [1, 2, 3]}}' as jsonString
    """)

    df.select(get_json_object(col('jsonString'),
                              "$.myJsonKey.myJsonValue[0]")).show()
    df.select(json_tuple(col('jsonString'), "myJsonKey")).show()
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .selectExpr("explode(complex_map)").show(2)


# COMMAND ----------

jsonDF = spark.range(1).selectExpr("""
  '{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")


# COMMAND ----------

from pyspark.sql.functions import get_json_object, json_tuple

jsonDF.select(
    get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias("column"),
    json_tuple(col("jsonString"), "myJSONKey")).show(2)


# COMMAND ----------

from pyspark.sql.functions import to_json
df.selectExpr("(InvoiceNo, Description) as myStruct")\
  .select(to_json(col("myStruct")))


# COMMAND ----------

from pyspark.sql.functions import from_json
from pyspark.sql.types import *
parseSchema = StructType((
def transform_data(clickstream_df: DataFrame, purchases_df: DataFrame,
                   logger: Log4jWrapper) -> DataFrame:
    """
    Builds purchases attribution dataset from mobile app clickstream
    and user purchases data.

    :param clickstream_df: mobile app clickstream DataFrame.
    :param purchases_df: user purchases DataFrame.
    :param logger: Log4j wrapper used for progress messages.
    :return: transformed DataFrame.
    """
    user_window: Window = Window.partitionBy("userId").orderBy("eventTime")

    logger.warning("build leading events dataset ...")
    leading_events_df = (clickstream_df.filter(
        F.col("eventType").isin({"app_open", "app_close"})).withColumn(
            "nextEventId",
            F.lead(F.col("eventId"), 1).over(user_window)).select(
                F.col("userId"),
                F.col("eventTime").alias("startSessionEventTime"),
                F.col("eventType").alias("startSessionEventType"),
                F.col("attributes"), F.col("nextEventId")).filter(
                    F.col("startSessionEventType") == "app_open").withColumn(
                        "campaignId",
                        F.get_json_object(
                            F.col("attributes"), "$.campaign_id")).withColumn(
                                "channelId",
                                F.get_json_object(
                                    F.col("attributes"),
                                    "$.channel_id")).drop("attributes"))

    leading_events_df.show()

    logger.warning("calculate user sessions ...")
    sessions_df = (leading_events_df.alias("leading_events").join(
        clickstream_df.alias("all_events"),
        on=F.col("leading_events.nextEventId") == F.col("all_events.eventId"),
        how="left").select(
            F.col("leading_events.userId"),
            F.col("leading_events.startSessionEventTime"),
            F.col("leading_events.campaignId"),
            F.col("leading_events.channelId"),
            F.col("all_events.eventTime").alias("endSessionEventTime"),
        ).withColumn("sessionId", F.monotonically_increasing_id()))

    sessions_df.show()

    logger.warning("append session to each event ...")
    sessioned_purchases_df = (clickstream_df.alias("c").filter(
        F.col("c.eventType") == "purchase").join(
            sessions_df.alias("s"),
            on=[
                F.col("c.userId") == F.col("s.userId"),
                F.col("c.eventTime") >= F.col("s.startSessionEventTime"),
                (F.col("c.eventTime") <= F.col("s.endSessionEventTime"))
                | F.col("s.endSessionEventTime").isNull(),
            ]).select(
                F.col("s.userId"),
                F.col("s.sessionId"),
                F.col("s.campaignId"),
                F.col("s.channelId"),
                F.get_json_object(F.col("c.attributes"),
                                  "$.purchase_id").alias("purchaseId"),
            ).orderBy(F.col("userId"), F.col("eventTime")))

    sessioned_purchases_df.show()

    logger.warning("build purchases attribution ...")
    projection_df = (sessioned_purchases_df.alias("s").join(
        purchases_df.alias("p"),
        on=F.col("p.purchaseId") == F.col("s.purchaseId")).select(
            F.col("p.purchaseId"), F.col("p.purchaseTime"),
            F.col("p.billingCost"), F.col("p.isConfirmed"),
            F.col("s.sessionId"), F.col("s.campaignId"), F.col("s.channelId")))

    projection_df.show()

    return projection_df
Example #30
def explode_json_column(df: DataFrame, column: str,
                        json_schema: StructType) -> DataFrame:
    """Create new columns extracting properties from a JSON column.

    Example:
    >>> from pyspark import SparkContext
    >>> from pyspark.sql import session
    >>> from butterfree.testing.dataframe import create_df_from_collection
    >>> from butterfree.core.extract.pre_processing import explode_json_column
    >>> from pyspark.sql.types import (
    ...     ArrayType,
    ...     IntegerType,
    ...     StringType,
    ...     StructField,
    ...     StructType,
    ... )
    >>> spark_context = SparkContext.getOrCreate()
    >>> spark_session = session.SparkSession(spark_context)
    >>> data = [{"json_column": '{"a": 123, "b": "abc", "c": "123", "d": [1, 2, 3]}'}]
    >>> df = create_df_from_collection(data, spark_context, spark_session)
    >>> df.collect()
    [Row(json_column='{"a": 123, "b": "abc", "c": "123", "d": [1, 2, 3]}')]
    >>> json_column_schema = StructType(
    ... [
    ...    StructField("a", IntegerType()),
    ...    StructField("b", StringType()),
    ...    StructField("c", IntegerType()),
    ...    StructField("d", ArrayType(IntegerType())),
    ... ])
    >>> explode_json_column(
    ...     df, column='json_column', json_schema=json_column_schema
    ... ).collect()
    [
        Row(
            json_column='{"a": 123, "b": "abc", "c": "123", "d": [1, 2, 3]}',
            a=123,
            b='abc',
            c=123,
            d=[1, 2, 3]
        )
    ]

    Args:
        df: input dataframe with the target JSON column.
        column: column name that is going to be exploded.
        json_schema: expected schema from that JSON column.
            Not all "first layer" fields need to be mapped in the json_schema,
            just the desired columns. If there is any JSON field that is needed
            to be cast to a struct, the declared expected schema (a StructType)
            need to have the exact same schema as the presented record, if don't,
            the value in the resulting column will be null.

    Returns:
        dataframe with the new extracted columns from the JSON column.

    """
    for field in json_schema:
        if field.dataType.typeName() in JSON_TYPE_NAMES:
            df = df.withColumn(
                field.name,
                from_json(
                    get_json_object(df[column], "$.{}".format(field.name)),
                    schema=field.dataType,
                ),
            )
        else:  # non-collection data types
            df = df.withColumn(
                field.name,
                get_json_object(df[column], "$.{}".format(field.name)).cast(
                    field.dataType),
            )
    return df
Example #31
# COMMAND ----------

# MAGIC %md ###### Extract and flatten some fields

# COMMAND ----------

attrDF = bizDF.select(
    col("business_id"),
    col("name").alias("BusinessName"), col("categories"), col("address"),
    col("city"), col("state"), col("postal_code"), col("is_open"),
    col("review_count"),
    col("stars").alias("Rating"), col("attributes.AcceptsInsurance"),
    col("attributes.AgesAllowed"),
    regexp_replace(regexp_replace(col("attributes.Alcohol"), 'u\'', ''), '\'',
                   '').alias("Alcohol"),
    get_json_object("Ambience", "$.touristy").alias("HasTouristyAmbience"),
    get_json_object("Ambience", "$.hipster").alias("HasHipsterAmbience"),
    get_json_object("Ambience", "$.romantic").alias("HasRomanticAmbience"),
    get_json_object("Ambience", "$.intimate").alias("HasIntimateAmbience"),
    get_json_object("Ambience", "$.trendy").alias("HasTrendyAmbience"),
    get_json_object("Ambience", "$.upscale").alias("HasUpscaleAmbience"),
    get_json_object("Ambience", "$.classy").alias("HasClassyAmbience"),
    get_json_object("Ambience", "$.casual").alias("HasCasualAmbience"),
    get_json_object("DietaryRestrictions",
                    "$.dairy-free").alias("IsDairyFree"),
    get_json_object("DietaryRestrictions",
                    "$.gluten-free").alias("IsGlutenFree"),
    get_json_object("DietaryRestrictions", "$.vegan").alias("IsVegan"),
    get_json_object("DietaryRestrictions", "$.kosher").alias("IsKosher"),
    get_json_object("DietaryRestrictions", "$.halal").alias("IsHalal"),
    get_json_object("DietaryRestrictions", "$.soy-free").alias("IsSoyFree"),